AWQ Quantization of Qwen3 with AutoAWQ

Published: 2025-09-11

AWQ quantization costs roughly 6 points of accuracy, while per-request inference latency drops from 0.447 s to 0.40 s.
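
The latency comparison can be reproduced with a simple client-side timer. A minimal sketch, assuming the model is already served behind the vLLM OpenAI-compatible endpoint from the inference script below; the base URL, model name, and prompt are placeholders:

import time
from openai import OpenAI

# Assumed endpoint and served model name; adjust to match your launch script.
client = OpenAI(base_url="http://localhost:9005/v1", api_key="EMPTY")

def one_request(prompt):
    # Time a single chat completion end to end.
    start = time.perf_counter()
    client.chat.completions.create(
        model="qwen3_4b",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=64,
    )
    return time.perf_counter() - start

one_request("warmup")  # first call pays warm-up costs, exclude it
times = [one_request("Tell me who you are.") for _ in range(10)]
print(f"mean latency: {sum(times) / len(times):.3f}s")

Run this once against the FP16 server and once against the AWQ server to compare.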

Install AutoAWQ in the LLaMA-Factory environment:

pip install autoawq

Quantization code:

def qu_awq():
    from awq import AutoAWQForCausalLM
    from transformers import AutoTokenizer
    import json

    model_path = "model_path"
    quant_path = "awq_model_path"
    calib_data = "_quantize.json"
    # 4-bit weights, group size 128, zero-point quantization; the GEMM kernel
    # version is the common choice for batched inference.
    quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

    # Load model and tokenizer. device_map/safetensors are model-loading
    # options, not tokenizer options.
    model = AutoAWQForCausalLM.from_pretrained(
        model_path, device_map="auto", safetensors=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Expected calibration format: one JSON object per line, each carrying a
    # "messages" list in chat format, e.g.
    # {"messages": [
    #     {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    #     {"role": "user", "content": "Tell me who you are."},
    #     {"role": "assistant", "content": "I am a large language model named Qwen..."}
    # ]}
    # !!!!!!!!!      Customize the code here for calib_data processing    !!!!!!!!!!!!!!
    def data_gen():
        data = []
        with open(calib_data, "r", encoding="utf-8") as file:
            for line in file:
                msg = json.loads(line)["messages"]
                # Render each conversation through the model's chat template so
                # calibration text matches the inference-time input format.
                text = tokenizer.apply_chat_template(
                    msg, tokenize=False, add_generation_prompt=False
                )
                data.append(text.strip())
        return data
    # If the calibration file is instead a single JSON array of objects with a
    # "text" field, replace data_gen() with:
    #   with open(calib_data, "r", encoding="utf-8") as f:
    #       texts = [each["text"] for each in json.load(f)]

    # Quantize on the calibration texts.
    model.quantize(
        tokenizer,
        quant_config=quant_config,
        calib_data=data_gen(),
        n_parallel_calib_samples=1,
        max_calib_samples=256,
        max_calib_seq_len=1024,
    )

    # Save quantized weights and tokenizer.
    model.save_quantized(quant_path)
    tokenizer.save_pretrained(quant_path)

    print(f'Model is quantized and saved at "{quant_path}"')

qu_awq()
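
Before serving, it is worth a quick sanity check that the quantized checkpoint loads and generates. A minimal sketch using AutoAWQ's from_quantized; the prompt is just an example:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "awq_model_path"  # path written by the script above
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

# Build a chat-formatted prompt and generate a short reply.
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Tell me who you are."}],
    tokenize=False,
    add_generation_prompt=True,
)
tokens = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
output = model.generate(tokens, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))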

Inference (serving the quantized model with vLLM):

#!/bin/bash
# XFORMERS is ~10 ms faster per request than FLASH_ATTN here
#export VLLM_ATTENTION_BACKEND=XFORMERS  # use on older machines
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
source /opt/conda/etc/profile.d/conda.sh
conda activate /opt/conda/envs/vllm085
Model_path="/llm/models/general_knowledge_agent_router/general_knowledge_agent_202250820_v21_01_awq5"
#Model_path="/llm/models/Qwen3-4B-Instruct-2507"

CUDA_VISIBLE_DEVICES=0 nohup  python -m vllm.entrypoints.openai.api_server \
  --model ${Model_path} \
  --served-model-name 'qwen3_4b' \
  --host 0.0.0.0 \
  --port 9005 \
  --max-model-len 9000 \
  --trust-remote-code \
  --device cuda \
  --tensor-parallel-size 1 \
  --swap-space 0 \
  --quantization awq \
  --dtype float16 \
  --gpu-memory-utilization 0.7 \
  --max-num-seqs 1  > eval_qwen3_quant.log 2>&1 &
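
Once the server is up, a quick smoke test against the OpenAI-compatible endpoint. A sketch in which the port and served model name match the script above and the prompt is a placeholder:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:9005/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="qwen3_4b",
    messages=[{"role": "user", "content": "Tell me who you are."}],
    max_tokens=64,
)
print(resp.choices[0].message.content)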

