【Testing the performance】
Image test
The Supercomputing Internet (超算互联网) platform already provides a suitable test environment: create an instance with several accelerator cards, download the model into it, and you can start testing. So far I have used the test code below to exercise the model's image understanding.
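Before loading a 32B model it is worth confirming that the instance actually exposes enough accelerator cards and memory. A minimal sanity check, assuming only that PyTorch is installed:

import torch

# List every visible CUDA device with its total memory; this model
# needs multiple cards, so fewer than 2 here means recreating the instance.
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"GPU{i}: {props.name}, {props.total_memory / 1024**3:.2f} GB")
else:
    print("No CUDA device detected; inference on CPU will be very slow.")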
Code
The code is as follows:
# TODO: import required dependencies
b_env_ready = True
try:
    # Standard-library dependencies
    import re
    import os
    import sys
    import subprocess
    import argparse
    import io
    import time
    import logging
    import warnings
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
    from qwen_vl_utils import process_vision_info
    # Get the root logger and set its level to ERROR
    logger = logging.getLogger()
    logger.setLevel(logging.ERROR)
    # Suppress all warnings ("1Torch" below matches a known typo in PyTorch's own warning text)
    warnings.filterwarnings("ignore")
    warnings.filterwarnings('ignore', category=UserWarning, message='1Torch was not compiled.*')
    warnings.filterwarnings('ignore', category=UserWarning, message='Using a slow image processor.*')
    import ipywidgets as widgets
    from IPython.display import display, HTML
    from PIL import Image as PILImage
    from datetime import datetime
    # Model dependencies
    from PIL import Image
    import requests
    import torch
    # from diffusers.utils import load_image
except ModuleNotFoundError as e:
    missing_module = str(e).split("'")[1]
    print(f"\033[31mModule not found: no module named '{missing_module}'\033[0m")
    print("\033[31mPlease make sure step 1 (dependency installation) has been completed before running this app.\033[0m")
    print("\033[31mIf modules are still missing, create a new Notebook cell and run the following command:\033[0m")
    print(f"\033[32mpip install {missing_module}\033[0m")
    print(f"\033[31mThe missing module will now be installed automatically...\033[0m")
    # Automatically install the missing module
    subprocess.check_call([sys.executable, "-m", "pip", "install", missing_module])
    print(f"\033[32mInstall command for '{missing_module}' finished...\033[0m")
    b_env_ready = False
try:
    if not b_env_ready:
        # Abort execution
        raise SystemExit(f"Attempted to install '{missing_module}'; please restart the Notebook kernel and run this step again")
except SystemExit as e:
    print(e)
    sys.exit()
# Environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# ************************************************
# Implementation
# ************************************************
print(f"\033[32mEnvironment dependencies imported...\033[0m\n")
g_input_img_url = ""
g_input_text = ""
# Create the file-upload widget
upload = widgets.FileUpload(
    description='Select local image',
    accept='image/*',  # accept any image file
    multiple=False     # disallow multiple files
)
# Create the text input box
text_box = widgets.Text(
    placeholder='Enter a prompt',
    disabled=True  # disabled until an image has been uploaded
)
# Create the button widgets
upload_button = widgets.Button(
    description='Upload image',
    disabled=True,  # disabled initially
    button_style='primary'
)
process_button = widgets.Button(
    description='Run inference',
    disabled=True,  # disabled initially
    button_style='success'
)
# Create the output widget
output = widgets.Output()
# File-selection change handler
def on_upload_change(change):
    global g_input_img_url
    # Check whether a file has been selected
    if upload.value:
        # A file is selected: enable the upload button
        upload_button.disabled = False
        check_all_inputs()  # check whether inference can be enabled
    else:
        # No file selected: keep the upload button disabled
        upload_button.disabled = True
# Upload-button click handler
def on_upload_button_click(b):
    global g_input_img_url
    with output:
        output.clear_output()  # clear previous output
        if not upload.value:
            print("Please select an image to upload first!")
            return
        uploaded_file = upload.value[0]
        file_name = uploaded_file['name']
        content = uploaded_file['content']
        # Convert the raw bytes into a PIL image
        image = PILImage.open(io.BytesIO(content))
        # Save it as PNG under a timestamped file name
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        input_filename = f"input_{file_name}_{timestamp}.png"
        image.save(input_filename, format='PNG')
        g_input_img_url = input_filename
        print(f"\033[32mYour uploaded image {file_name} was saved as {input_filename}; enter a prompt and click 'Run inference'\033[0m\n")
        text_box.disabled = False
        # Enable the process button if all inputs are ready
        check_all_inputs()
# Text-box change handler
def on_text_change(change):
    global g_input_text
    g_input_text = change['new']
    check_all_inputs()  # check whether inference can be enabled
# Process-button click handler
def on_process_button_click(b):
    global g_input_img_url, g_input_text
    with output:
        output.clear_output()  # clear previous output
        if g_input_img_url and g_input_text:
            upload.disabled = True
            #upload_button.disabled = True
            text_box.disabled = True
            process_button.disabled = True
            # Run the inference task
            run_model(g_input_img_url, g_input_text, "")
            upload.disabled = False
            text_box.value = ""
        else:
            print("Please make sure an image has been uploaded and a prompt entered.")
# Check whether all inputs are ready
def check_all_inputs():
    if g_input_img_url and g_input_text:
        process_button.disabled = False
    else:
        process_button.disabled = True
# Wire up the widget event handlers
upload.observe(on_upload_change, names='value')
upload_button.on_click(on_upload_button_click)
text_box.observe(on_text_change, names='value')
process_button.on_click(on_process_button_click)
# TODO: set up the model and pipe
model_id = "./"  # model weights are expected in the current directory
pipe = None  # (processor, model) pair, populated by load_model
def load_model(torch_dtype, device):
    global pipe
    num_gpus = torch.cuda.device_count()
    if num_gpus < 2:
        print("\033[31mThis model needs at least 2 heterogeneous accelerator cards for inference; only {} detected. Please recreate the instance.\033[0m".format(num_gpus))
        sys.exit()
    # Release cached GPU memory
    torch.cuda.empty_cache()
    print(f"\033[31mThis model is large; loading takes roughly 3-5 minutes, please be patient...\033[0m")
    # Load the model
    try:
        print(f"Loading the model as {torch_dtype}...")
        print(f"Loading model {model_id}...")
        # Load via AutoProcessor and the Qwen2.5-VL model class
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_id, torch_dtype=torch_dtype, device_map="auto"
        )
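        # Each visual token corresponds to a 28x28 pixel patch, so these bounds
        # cap the number of visual tokens per image (roughly 256 to 1280 tokens here).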
        min_pixels = 256 * 28 * 28
        max_pixels = 1280 * 28 * 28
        processor = AutoProcessor.from_pretrained(
            model_id, min_pixels=min_pixels, max_pixels=max_pixels
        )
        # processor = AutoProcessor.from_pretrained(model_id)
        pipe = processor, model  # stash the processor and model as a pair
        # return processor, model
        gpu_memory_allocated = torch.cuda.memory_allocated(device)
        print(f"\033[32mModel {model_id} loaded; GPU memory used: {gpu_memory_allocated / 1024 ** 3:.2f} GB, free: {(g_gpu_memory - gpu_memory_allocated) / (1024 ** 3):.2f} GB\033[0m\n")
        print(f"\033[32mModel loading complete...\033[0m\n")
        print(f"\033[31mFollow these steps to try the model:\033[0m")
        print(f"\033[31m1. Click 'Select local image' and pick an input image\033[0m")
        print(f"\033[31m2. Click 'Upload image' to upload your selection\033[0m")
        print(f"\033[31m3. Type your prompt into the prompt box\033[0m")
        print(f"\033[31m4. Click 'Run inference' to run\033[0m\n")
        # Display the upload widget, buttons, text box and output area
        display(upload)
        display(upload_button)
        display(text_box)
        display(process_button)
        display(output)
    except torch.cuda.OutOfMemoryError as e:
        print(f"\033[31mOut of GPU memory; cannot continue. Please restart the kernel to free memory...\033[0m")
        print(f"\033[31mError message:\033[0m")
        print(f"{str(e)}\n")
        # Release cached GPU memory
        torch.cuda.empty_cache()
        return False
def run_model(image_url, prompt, negative_prompt):
    global pipe
    processor, model = pipe  # unpack the processor and model
    try:
        input_image = Image.open(image_url).convert("RGB")
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": input_image,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        # Show the inputs
        print(f"Loading image...")
        print(f"Your input image:")
        display(input_image)
        print(f"Your prompt: {prompt}")
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)
        generated_ids = model.generate(**inputs, max_new_tokens=1280)
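        # Slice off the echoed prompt tokens so only the newly generated answer is decoded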
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        # # Alternative: generate and decode without trimming
        # with torch.no_grad():
        #     output = model.generate(**inputs, max_new_tokens=1024)
        # decoded_output = processor.batch_decode(
        #     output,
        #     skip_special_tokens=True
        # )
        # Strip any echoed chat scaffolding before printing the answer
        filtered_text = re.sub(r'user\n.*?\nmodel', '', output_text[0], flags=re.DOTALL)
        print(filtered_text)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        print(f"\n\033[31mTo try again, upload a new image, enter a prompt, and click 'Run inference'...\033[0m\n")
    except torch.cuda.OutOfMemoryError as e:
        print(f"\033[31mOut of GPU memory; cannot continue. Please restart the kernel to free memory...\033[0m")
        print(f"\033[31mError message:\033[0m")
        print(f"{str(e)}\n")
        # Release cached GPU memory
        torch.cuda.empty_cache()
        return False
# Check for available GPUs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        g_gpu_memory = props.total_memory
        print(f"GPU{i} memory: {props.total_memory / (1024 ** 3):.2f} GB")
else:
    print(f"No GPU detected; the model will be loaded on the CPU and inference will take a long time")
# Load the model and run inference
prompt = ""
negative_prompt = ""
image_url = ""
parser = argparse.ArgumentParser(description="Model inference prompt arguments")
parser.add_argument("image", type=str, nargs='?', default="", help='input image for model inference')
parser.add_argument("prompt", type=str, nargs='?', default="", help='prompt for model inference')
# parse_known_args tolerates the extra arguments a Notebook kernel passes in
args, _ = parser.parse_known_args()
image_url = args.image
prompt = args.prompt
load_model(torch.bfloat16, device)
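If you would rather skip the widgets, you can call run_model directly once load_model has finished; a minimal sketch, where test.png and the prompt are placeholders:

# Assumes load_model(...) above has already populated `pipe`
run_model("test.png", "Describe this image", "")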
Run results
Video inference
Code
There is no ready-made code for video inference, but I referenced this post:
【工程开发】Qwen2.5-VL-32B-Instruct 微调(一)-CSDN博客
The model-loading code needed a few changes; the final version is as follows:
import os
import sys
import torch
import argparse
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import logging
import warnings
# Version check used during class initialization
from packaging import version
import qwen_vl_utils
# if version.parse(qwen_vl_utils.__version__) < version.parse("0.2.0"):
#     raise ImportError("qwen_vl_utils >= 0.2.0 required, current version: " + qwen_vl_utils.__version__)
# Silence unnecessary logs and warnings
logging.getLogger().setLevel(logging.ERROR)
warnings.filterwarnings("ignore")
class VideoInferenceSystem:
    def __init__(self, model_name="/root/private_data/SothisAI/model/Aihub/Qwen2.5-VL-32B-Instruct/main/Qwen2.5-VL-32B-Instruct"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None
        self.processor = None
        self.load_model(model_name)
    def load_model(self, model_name):
        """Load the model and processor, reusing the configuration from code snippet 1."""
        print("Loading model and processor...")
        # Memory-friendly bounds on the visual-token budget
        min_pixels = 256 * 28 * 28
        max_pixels = 1280 * 28 * 28
        try:
            # Load the model with FlashAttention-2 enabled
            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                model_name,
                local_files_only=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                attn_implementation="flash_attention_2"  # key change
            )
            # Load the processor (fast image processor enabled, pixel bounds applied)
            self.processor = AutoProcessor.from_pretrained(
                model_name,
                local_files_only=True,
                min_pixels=min_pixels,
                max_pixels=max_pixels,
                use_fast=True  # key change
            )
            print(f"Model {model_name} loaded successfully")
        except Exception as e:
            print(f"Model loading failed: {str(e)}")
            sys.exit(1)
    def process_video(self, video_path, prompt, fps=1.0):
        """Run inference on a video."""
        try:
            # Build the message structure (works for local files and URLs)
            messages = [{
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": f"file://{os.path.abspath(video_path)}",
                        "max_pixels": 360 * 420,
                        "fps": fps
                    },
                    {"type": "text", "text": prompt}
                ]
            }]
            text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                fps=fps,
                padding=True,
                return_tensors="pt"
            )
            inputs = inputs.to(self.model.device)
            # Generate
            generated_ids = self.model.generate(**inputs, max_new_tokens=128)
            # Trim the echoed prompt tokens before decoding
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            return output_text
        except Exception as e:
            print(f"Inference error: {str(e)}")
            return None
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Video inference system")
    parser.add_argument("--video", type=str, required=True, help="input video path")
    parser.add_argument("--prompt", type=str, required=True, help="inference prompt")
    parser.add_argument("--fps", type=float, default=1.0, help="video sampling rate (frames per second)")
    args = parser.parse_args()
    # Initialize the system
    vis = VideoInferenceSystem()
    # Run inference (pass the parsed fps through instead of ignoring it)
    result = vis.process_video(args.video, args.prompt, fps=args.fps)
    print("\n=== Inference result ===")
    print(result)
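Assuming the script is saved as video_infer.py (the script name and video file are placeholders), it can be invoked like this:

python video_infer.py --video demo.mp4 --prompt "Describe what happens in this video" --fps 1.0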
Run results
Loading the model and running inference both take a very long time. Is that because of the GPU?
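One way to narrow this down is to time the loading and generation phases separately: if loading dominates, disk or host-to-device transfer speed is the likely bottleneck, while slow generation points at the cards themselves. A minimal sketch, where the video path and prompt are placeholders:

import time

t0 = time.time()
vis = VideoInferenceSystem()  # covers weight loading and dispatch to the GPUs
t1 = time.time()
result = vis.process_video("demo.mp4", "Describe this video", fps=1.0)
t2 = time.time()
print(f"load: {t1 - t0:.1f}s, inference: {t2 - t1:.1f}s")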