【Engineering】Qwen2.5-VL-32B-Instruct Fine-Tuning (Part 3)

Published: 2025-04-05

【Testing the Performance】

Image Testing

The SCNet supercomputing internet platform already provides a ready-made test environment: create an instance with a couple of extra accelerator cards, download the model into it, and you can start testing. So far I have only exercised image understanding with the test code.
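Judging from the imports in the code below, the environment needs at least the following packages (an assumed one-line install inferred from the imports; exact package names and versions on your platform may differ):

pip install torch transformers accelerate qwen-vl-utils ipywidgets pillow requests packaging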

Code

The code is as follows:

# Import required dependencies
b_env_ready = True
try:
    # Core dependencies
    import re
    import os
    import sys
    import subprocess
    import argparse
    import io
    import time
    import logging
    import warnings
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
    from qwen_vl_utils import process_vision_info
    
    # Get the root logger and set its level to ERROR
    logger = logging.getLogger()
    logger.setLevel(logging.ERROR)
    
    # Suppress all warnings
    warnings.filterwarnings("ignore")
    warnings.filterwarnings('ignore', category=UserWarning, message='1Torch was not compiled.*')
    warnings.filterwarnings('ignore', category=UserWarning, message='Using a slow image processor*')

    import ipywidgets as widgets
    from IPython.display import display, HTML
    from PIL import Image as PILImage
    from datetime import datetime

    # Model dependencies
    from PIL import Image
    import requests
    import torch
except ModuleNotFoundError as e:
    missing_module = str(e).split("'")[1]
    print(f"\033[31mModule not found: no module named '{missing_module}'\033[0m")
    print("\033[31mPlease make sure the dependency-installation step has been executed before running this app:\033[0m")
    print("\033[31mIf modules are still missing, create a new cell in the notebook and run the following command:\033[0m")
    print(f"\033[32mpip install {missing_module}\033[0m")

    print(f"\033[31mThe missing module will now be installed automatically...\033[0m")
    # Optionally, install the module automatically
    subprocess.check_call([sys.executable, "-m", "pip", "install", missing_module])
    print(f"\033[32mInstall command for '{missing_module}' finished...\033[0m")
    b_env_ready = False

try:
    if not b_env_ready:
        # Abort execution
        raise SystemExit(f"An install of '{missing_module}' was attempted; please restart the notebook kernel and rerun this step")
except SystemExit as e:
    print(e)
    sys.exit()

# Environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ************************************************
# Implementation
# ************************************************

print(f"\033[32mEnvironment dependencies imported...\033[0m\n")

g_input_img_url = ""
g_input_text = ""

# File-upload widget
upload = widgets.FileUpload(
    description='Select local image',
    accept='image/*',  # accept any image file
    multiple=False  # disallow multi-file upload
)

# Text input box
text_box = widgets.Text(
    placeholder='Enter a prompt',
    disabled=True  # disabled until an image is uploaded
)

# Button widgets
upload_button = widgets.Button(
    description='Upload image',
    disabled=True,  # disabled initially
    button_style='primary'
)
process_button = widgets.Button(
    description='Run inference',
    disabled=True,  # disabled initially
    button_style='success'
)

# Output widget
output = widgets.Output()

# Handler for file-upload changes
def on_upload_change(change):
    global g_input_img_url
    # Check whether a file has been selected
    if upload.value:
        # A file was selected: enable the upload button
        upload_button.disabled = False
        check_all_inputs()  # check whether inference can be run
    else:
        # No file selected: keep the upload button disabled
        upload_button.disabled = True

# Handler for the upload button
def on_upload_button_click(b):
    global g_input_img_url
    with output:
        output.clear_output()  # clear previous output

        if not upload.value:
            print(f"Please select an image to upload first!")
            return

        uploaded_file = upload.value[0]
        file_name = uploaded_file['name']
        content = uploaded_file['content']

        # Convert the raw bytes into a PIL image
        image = PILImage.open(io.BytesIO(content))

        # Save under a timestamped name, as PNG
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        input_filename = f"input_{file_name}_{timestamp}.png"
        image.save(input_filename, format='PNG')
        g_input_img_url = input_filename

        print(f"\033[32mYour image {file_name} has been saved as {input_filename}; enter a prompt and click 'Run inference'\033[0m\n")
        text_box.disabled = False
        # Enable the process button if all inputs are ready
        check_all_inputs()

# Handler for text-box changes
def on_text_change(change):
    global g_input_text
    g_input_text = change['new']
    check_all_inputs()  # check whether inference can be run

# Handler for the process button
def on_process_button_click(b):
    global g_input_img_url, g_input_text
    with output:
        output.clear_output()  # clear previous output
        if g_input_img_url and g_input_text:

            upload.disabled = True
            #upload_button.disabled = True
            text_box.disabled = True
            process_button.disabled = True

            # Run the inference task
            run_model(g_input_img_url, g_input_text, "")

            upload.disabled = False
            text_box.value = ""
        else:
            print("Make sure an image has been uploaded and a prompt entered.")

# Check whether all inputs are ready
def check_all_inputs():
    if g_input_img_url and g_input_text:
        process_button.disabled = False
    else:
        process_button.disabled = True

# Wire up the widget event handlers
upload.observe(on_upload_change, names='value')
upload_button.on_click(on_upload_button_click)
text_box.observe(on_text_change, names='value')
process_button.on_click(on_process_button_click)


# Model and pipeline setup
model_id = "./"

pipe = None  # (processor, model) tuple, populated by load_model

def load_model(torch_dtype, device):
    global pipe
    num_gpus = torch.cuda.device_count()
    if num_gpus < 2:
        print("\033[31mRunning inference with this model requires at least 2 accelerator cards; only {} detected. Please recreate the instance with more cards.\033[0m".format(num_gpus))
        sys.exit()
    # Free cached GPU memory
    torch.cuda.empty_cache()
    print(f"\033[31mThis model is large; loading takes roughly 3-5 minutes, please be patient...\033[0m")
    # Load the model
    try:
        print(f"Loading the model with dtype {torch_dtype}...")
        print(f"Starting to load model {model_id}...")

        # Load the model and processor via AutoProcessor
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_id, torch_dtype=torch_dtype, device_map="auto"
        )
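        # Each 28x28 pixel patch corresponds to one visual token, so the bounds
        # below keep every input image between 256 and 1280 visual tokens.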
        min_pixels = 256 * 28 * 28
        max_pixels = 1280 * 28 * 28
        processor = AutoProcessor.from_pretrained(
            model_id, min_pixels=min_pixels, max_pixels=max_pixels
        )
        pipe = processor, model  # keep the processor and model together
        
        # Additional GPU-memory / model-status output could be added here
        gpu_memory_allocated = torch.cuda.memory_allocated(device)
        print(f"\033[32mModel {model_id} loaded. GPU memory allocated: {gpu_memory_allocated / 1024 ** 3:.2f} GB, free: {(g_gpu_memory - gpu_memory_allocated) / (1024 ** 3):.2f} GB\033[0m\n")
        print(f"\033[32mModel loading complete...\033[0m\n")

        print(f"\033[31mPlease follow these steps to try the model:\033[0m")
        print(f"\033[31m1. Click 'Select local image' and choose an input image\033[0m")
        print(f"\033[31m2. Click 'Upload image' to upload the selected image\033[0m")
        print(f"\033[31m3. Type your prompt into the prompt box\033[0m")
        print(f"\033[31m4. Click 'Run inference' to execute\033[0m\n")
        
        # Display the upload widget, buttons, text box and output area
        display(upload)
        display(upload_button)
        display(text_box)
        display(process_button)
        display(output)
    except torch.cuda.OutOfMemoryError as e:
        print(f"\033[31mOut of GPU memory; cannot continue. Please restart the kernel to free memory...\033[0m")
        print(f"\033[31mError message:\033[0m")
        print(f"{str(e)}\n")
        # Free cached GPU memory
        torch.cuda.empty_cache()
        return False

def run_model(image_url, prompt, negative_prompt):
    global pipe
    processor, model = pipe  # unpack the processor and model

    try:
        input_image = Image.open(image_url).convert("RGB")
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": input_image ,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Run generation
        print(f"Loading image...")
        print(f"Your input image:")
        display(input_image)

        print(f"Your prompt: {prompt}")
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")

        generated_ids = model.generate(**inputs, max_new_tokens=1280)
        generated_ids_trimmed = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        ) 
        # The trimmed, decoded output already excludes the prompt tokens,
        # so it can be printed directly
        print(output_text[0])

        print(f"\n\033[31m如需再次体验,请重新上传图片,输入提示词,点击'执行推理任务'按钮执行...\033[0m\n")
    except torch.cuda.OutOfMemoryError as e:
        print(f"\033[31m显存不足,无法继续运行模型,请重启内核释放显存...\033[0m")
        print(f"\033[31m错误信息: \033[0m")
        print(f"{str(e)}\n")
        # 释放显存
        torch.cuda.empty_cache()
        return False

# Check whether a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        g_gpu_memory = props.total_memory
        print(f"GPU{i} memory: {props.total_memory / (1024 ** 3):.2f} GB")
else:
    print(f"No GPU detected; the model will be loaded on the CPU and inference will be slow")

# Load the model and run inference
prompt = ""
negative_prompt = ""
image_url = ""

parser = argparse.ArgumentParser(description="Inference prompt arguments")
parser.add_argument("image", type=str, nargs='?', default="", help='input image for inference')
parser.add_argument("prompt", type=str, nargs='?', default="", help='prompt for inference')
# parse_known_args() so the extra arguments a notebook kernel passes are ignored
args, _ = parser.parse_known_args()

image_url = args.image
prompt = args.prompt

load_model(torch.bfloat16, device)
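Because this runs in a notebook, the positional argparse arguments normally stay empty and the widgets drive the whole flow. For a quick non-interactive check, run_model can also be called directly in a new cell once load_model has finished (the file name below is a hypothetical placeholder):

run_model("input_example.png", "Describe this image.", "")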

Run Results

Video Inference

Code

There was no ready-made code for video inference, so I adapted the following reference:

【Engineering】Qwen2.5-VL-32B-Instruct Fine-Tuning (Part 1) - CSDN Blog

The model-loading code needed a few changes (loading from a local path, FlashAttention-2, the fast image processor); the final version is as follows:

import os
import sys
import torch
import argparse
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import logging
import warnings
# Optional version check at class initialization
from packaging import version
import qwen_vl_utils

# if version.parse(qwen_vl_utils.__version__) < version.parse("0.2.0"):
#     raise ImportError("qwen_vl_utils >= 0.2.0 required, current version: " + qwen_vl_utils.__version__)

# Disable unnecessary logging and warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

class VideoInferenceSystem:
    def __init__(self, model_name="/root/private_data/SothisAI/model/Aihub/Qwen2.5-VL-32B-Instruct/main/Qwen2.5-VL-32B-Instruct"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None
        self.processor = None
        self.load_model(model_name)

    def load_model(self, model_name):
        """Load the model and processor, reusing the configuration from the image script above."""
        print("Loading model and processor...")

        # Visual-token budget to keep GPU memory in check
        min_pixels = 256 * 28 * 28
        max_pixels = 1280 * 28 * 28

        try:
            # Load the model (with FlashAttention-2 enabled)
            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                model_name,
                local_files_only=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                attn_implementation="flash_attention_2"  # key change vs. the image script
            )

            # Load the processor (fast image processor enabled, token budget applied)
            self.processor = AutoProcessor.from_pretrained(
                model_name,
                local_files_only=True,
                min_pixels=min_pixels,
                max_pixels=max_pixels,
                use_fast=True  # key change
            )

            print(f"Successfully loaded model {model_name}")

        except Exception as e:
            print(f"Model loading failed: {str(e)}")
            sys.exit(1)

    def process_video(self, video_path, prompt, fps=1.0):
        """Run inference on a video."""
        try:
            # Build the message (works with both local files and URLs)
            messages = [{
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": f"file://{os.path.abspath(video_path)}",
                        "max_pixels": 360 * 420,  # per-frame pixel cap
                        "fps": fps
                    },
                    {"type": "text", "text": prompt}
                ]
            }]
            text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                fps=fps,
                padding=True,
                return_tensors="pt"
            )
            inputs = inputs.to("cuda")

            # Generate
            generated_ids = self.model.generate(**inputs, max_new_tokens=128)
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            return output_text

        except Exception as e:
            print(f"Inference error: {str(e)}")
            return None

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Video inference system")
    parser.add_argument("--video", type=str, required=True, help="input video path")
    parser.add_argument("--prompt", type=str, required=True, help="inference prompt")
    parser.add_argument("--fps", type=float, default=1.0, help="video sampling rate (frames per second)")
    args = parser.parse_args()

    # Initialize the system
    vis = VideoInferenceSystem()
    # Run inference at the requested sampling rate
    result = vis.process_video(args.video, args.prompt, fps=args.fps)
    print("\n=== Inference result ===")
    print(result)
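A typical invocation looks like this (the script name and video file are placeholders for whatever you saved the code as):

python video_infer.py --video demo.mp4 --prompt "Describe what happens in this video." --fps 1.0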

Run Results

Loading the model and running inference both take an unusually long time. Is this caused by the GPU?
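One way to narrow that down is to time the two phases separately: if loading dominates, the bottleneck is usually storage bandwidth, since a 32B bfloat16 checkpoint is roughly 64 GB of weights; if generation dominates, it points at GPU throughput or cross-card transfers under device_map="auto". A minimal sketch (the video file name is a placeholder):

import time

t0 = time.time()
vis = VideoInferenceSystem()  # phase 1: reading and sharding the weights
print(f"Model load took {time.time() - t0:.1f}s")

t0 = time.time()
result = vis.process_video("demo.mp4", "Describe the video.", fps=1.0)  # phase 2: inference
print(f"Inference took {time.time() - t0:.1f}s")
print(result)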