【大模型实战】微调Qwen2.5 VL模型，增强目标检测任务。-EW帮帮网

制作数据集

本章节将详细解析一个把Labelme标注数据集转换为Qwen2.5-VL模型训练格式的Python脚本。该脚本实现了图像尺寸调整、边界框坐标同步缩放和数据格式标准化等功能，最终生成适用于Qwen2.5-VL微调的JSONL数据集。

核心功能概述

图像处理：将图像调整为固定尺寸
坐标转换：同步调整边界框坐标
格式转换：生成Qwen2.5-VL兼容的JSONL格式
错误处理：记录处理失败的文件

import os
import json
import numpy as np
from PIL import Image
from tqdm import tqdm


def direct_resize(image, target_size=(1024, 1024)):
    """
    Stretch an image to exactly ``target_size``, ignoring its aspect ratio.

    Args:
        image: PIL.Image object holding the source picture.
        target_size: (width, height) of the output image.

    Returns:
        A ``(resized_image, (scale_x, scale_y))`` tuple. Each scale factor is
        the target dimension divided by the matching source dimension, so the
        two factors differ whenever the aspect ratio changes.
    """
    source_w, source_h = image.size
    out_w, out_h = target_size

    # Per-axis factors that callers reuse to move annotations with the pixels.
    factors = (out_w / source_w, out_h / source_h)

    return image.resize(target_size, Image.Resampling.LANCZOS), factors


def direct_resize_bbox(original_size, target_size, bbox, scale=None):
    """
    Scale a bounding box into the coordinate space of the resized image.

    The result is expressed in pixels of the resized image; it is NOT
    normalized to the 0-1 range and it is not clamped to the image bounds.

    Args:
        original_size: (width, height) of the source image. Only used to
            derive the scale factors when ``scale`` is not given.
        target_size: (width, height) of the resized image. Only used to
            derive the scale factors when ``scale`` is not given.
        bbox: [x_min, y_min, x_max, y_max] in source-image pixels.
        scale: optional (scale_x, scale_y) factors. When omitted they are
            computed as target dimension / original dimension.

    Returns:
        [x1, y1, x2, y2] in resized-image pixels, each rounded to 4 decimals.
    """
    if scale is None:
        orig_w, orig_h = original_size
        target_w, target_h = target_size
        scale = (target_w / orig_w, target_h / orig_h)
    scale_x, scale_y = scale

    x_min, y_min, x_max, y_max = bbox

    return [
        round(x_min * scale_x, 4),
        round(y_min * scale_y, 4),
        round(x_max * scale_x, 4),
        round(y_max * scale_y, 4),
    ]


def labelme_to_qwenvl(labelme_dir, output_file, target_size=(1024, 1024), default_description="请定位图像中的物体"):
    """
    Convert a Labelme dataset into a Qwen2.5-VL style JSONL training file.

    For every ``*.json`` annotation found directly in ``labelme_dir`` the
    referenced image is stretched to ``target_size`` and saved into a
    ``resized_images`` directory created next to ``labelme_dir``. Rectangle
    shapes are scaled with the same factors and emitted as one chat sample
    per image; shapes of any other type are skipped. A failure on one
    annotation file is printed and counted, and the run continues.

    Args:
        labelme_dir: directory holding the Labelme JSON files and images.
        output_file: path of the JSONL file to write (overwritten).
        target_size: (width, height) every image is resized to.
        default_description: prompt text placed after ``<image>`` in the
            user turn of every sample.
    """
    # Resized images go into a sibling directory of the dataset.
    output_dir = os.path.join(os.path.dirname(labelme_dir), "resized_images")
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir order is arbitrary; sort so the JSONL is reproducible.
    json_files = sorted(f for f in os.listdir(labelme_dir) if f.endswith('.json'))

    error_count = 0
    processed_count = 0

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for json_file in tqdm(json_files, desc="转换数据集中"):
            json_path = os.path.join(labelme_dir, json_file)

            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    labelme_data = json.load(f)

                # imagePath may be a relative path and may use Windows
                # separators; normalise it so it resolves on any platform.
                img_rel_path = labelme_data['imagePath'].replace('\\', '/')
                img_path = os.path.join(labelme_dir, img_rel_path)
                original_size = (labelme_data['imageWidth'], labelme_data['imageHeight'])

                # Only the file name goes into the output name; embedding a
                # directory component would point outside/below output_dir.
                new_img_name = f"resized_{os.path.basename(img_rel_path)}"
                new_img_path = os.path.join(output_dir, new_img_name)

                # Keep the source file open only while resizing and saving.
                with Image.open(img_path) as img:
                    resized_img, scale = direct_resize(img, target_size)
                    resized_img.save(new_img_path)

                # Collect the scaled box and label of every rectangle shape.
                objects = []
                for shape in labelme_data['shapes']:
                    if shape['shape_type'] != 'rectangle':
                        continue

                    points = np.array(shape['points'])
                    x_coords = points[:, 0]
                    y_coords = points[:, 1]

                    # The two corners may be stored in any order, so take
                    # min/max. Plain floats keep JSON serialisation
                    # independent of the numpy scalar type.
                    bbox = [
                        float(x_coords.min()),
                        float(y_coords.min()),
                        float(x_coords.max()),
                        float(y_coords.max()),
                    ]

                    scaled_bbox = direct_resize_bbox(
                        original_size,
                        target_size,
                        bbox,
                        scale
                    )

                    objects.append({
                        "bbox_2d": scaled_bbox,
                        "label": shape['label']
                    })

                # The assistant turn is a fenced JSON list of the objects.
                assistant_content = "```json\n" + json.dumps(objects, ensure_ascii=False) + "\n```"

                sample = {
                    "messages": [
                        {
                            "role": "user",
                            "content": f"<image>{default_description}"
                        },
                        {
                            "role": "assistant",
                            "content": assistant_content
                        }
                    ],
                    "images": [new_img_path]  # path of the resized copy
                }

                out_f.write(json.dumps(sample, ensure_ascii=False) + '\n')
                processed_count += 1

            except Exception as e:
                # Per-file boundary: report and keep converting the rest.
                print(f"处理文件 {json_file} 时出错: {str(e)}")
                error_count += 1

    print(f"\n转换完成! 输出文件: {output_file}")
    print(f"成功处理: {processed_count} 个文件")
    print(f"失败: {error_count} 个文件")
    print(f"调整后的图像已保存到: {output_dir}")


if __name__ == "__main__":
    # ===== Configuration =====
    LABELME_DIR = "../labelme-car-618"  # path to your Labelme dataset directory
    OUTPUT_FILE = "qwen_vg_dataset.jsonl"  # name of the generated JSONL file
    TARGET_SIZE = (512, 512)  # (width, height) every image is stretched to

    # Task prompt placed in the user turn of every sample; surrounding
    # whitespace is stripped, the inner indentation is kept as written.
    # NOTE(review): the text asks for four integers and speaks only of taxis
    # in items 3-4, while the generated labels cover four classes and the
    # boxes are floats rounded to 4 decimals - confirm this is intended.
    task_prompt = """
    请仔细标注图像中每辆出租车、每辆私家车、每辆卡车、每辆公交车的精确边界框。对于每辆出租车，提供一个JSON对象包含：
    - 'bbox_2d': 由四个整数组成的数组 [x1, y1, x2, y2]，分别表示左上角和右下角坐标
    - 'label': 出租车字符串值 'taxi',私家车字符串值 'car',卡车字符串值 'truck',公交车字符串值 'bus'

    确保：
    1. 边界框紧密贴合整个车辆（包括车轮和车顶）
    2. 坐标是相对于图像尺寸的绝对像素值
    3. 只标注完全可见的出租车
    4. 仅输出有效的JSON对象，每辆出租车一个对象，不要添加额外文本或解释
    """.strip()

    # Run the conversion.
    labelme_to_qwenvl(LABELME_DIR, OUTPUT_FILE, TARGET_SIZE, task_prompt)

使用微调的模型制作数据集

import glob
import json
import os
import re
import ast  # 新增用于解析非标准JSON
import cv2
from qwen_vl_utils import process_vision_info
from transformers import AutoModelForVision2Seq, AutoProcessor

# Merged fine-tuned checkpoint; the model and its processor load from the
# same directory, so the path is spelled out once.
_CHECKPOINT_DIR = "output/Qwen2.5-VL-7B-Instruct/v2-20250625-112537/checkpoint-47-merged"

# Load the model, leaving dtype and device placement to the library.
model = AutoModelForVision2Seq.from_pretrained(_CHECKPOINT_DIR, torch_dtype='auto', device_map="auto")
processor = AutoProcessor.from_pretrained(_CHECKPOINT_DIR)

# Instruction sent together with every image that is auto-annotated.
prompt = """
请仔细标注图像中每辆出租车的精确边界框。对于每辆出租车，提供一个JSON对象包含：
- 'bbox_2d': 由四个整数组成的数组 [x1, y1, x2, y2]，分别表示左上角和右下角坐标
- 'label': 字符串值 'taxi'

确保：
1. 边界框紧密贴合整个车辆（包括车轮和车顶）
2. 坐标是相对于图像尺寸的绝对像素值
3. 只标注完全可见的出租车（忽略部分遮挡的车辆）
4. 仅输出有效的JSON对象，每辆出租车一个对象，不要添加额外文本或解释
"""


# Fenced ```json block; whitespace around the payload is optional so that
# "```json[...]```" and CRLF line endings are accepted as well.
_JSON_FENCE_RE = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)


def _candidate_payloads(response):
    """Yield substrings of ``response`` that may hold the JSON payload, best first."""
    fenced = _JSON_FENCE_RE.search(response)
    if fenced:
        yield fenced.group(1)
    # Fall back to the widest array span, then the widest object span.
    for opener, closer in (("[", "]"), ("{", "}")):
        start = response.find(opener)
        end = response.rfind(closer)
        if start != -1 and end > start:
            yield response[start:end + 1]


def extract_taxi_data(response):
    """
    Extract the list of detection objects from a model response.

    The payload is looked up in a fenced ```json block first, then in the
    widest ``[...]`` span, then in the widest ``{...}`` span. Each candidate
    is parsed as strict JSON and, failing that, as a Python literal (which
    covers single-quoted output). A single object is wrapped into a
    one-element list, and entries that are not dicts are dropped, so callers
    can always iterate over dicts.

    Args:
        response: raw text produced by the model.

    Returns:
        A list of dicts; empty when nothing usable was found. Problems are
        reported with ``print`` rather than raised.
    """
    if not isinstance(response, str):
        print("未找到有效的JSON数据")
        return []

    found_candidate = False
    last_error = None

    for payload in _candidate_payloads(response):
        payload = payload.strip()
        if not payload:
            continue
        found_candidate = True

        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            try:
                # literal_eval only evaluates literals, never arbitrary code.
                parsed = ast.literal_eval(payload)
            except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError) as e:
                last_error = e
                continue

        if isinstance(parsed, dict):
            parsed = [parsed]
        if isinstance(parsed, (list, tuple)):
            records = [item for item in parsed if isinstance(item, dict)]
            if records:
                return records

    if not found_candidate:
        print("未找到有效的JSON数据")
    elif last_error is not None:
        print(f"JSON解析错误: {last_error}")
    return []


def _bbox_to_pixels(bbox, width, height, scale_x=1.0, scale_y=1.0):
    """Turn one model bbox into clamped integer pixel corners, or None if unusable."""
    try:
        x1, y1, x2, y2 = (float(v) for v in bbox)
        if all(0 <= v <= 1 for v in (x1, y1, x2, y2)):
            # Values all within 0-1 are treated as fractions of the image.
            corners = (x1 * width, y1 * height, x2 * width, y2 * height)
        else:
            # Absolute values refer to the image as the model saw it; map
            # them back onto the original image.
            corners = (x1 * scale_x, y1 * scale_y, x2 * scale_x, y2 * scale_y)
        x1_abs, y1_abs, x2_abs, y2_abs = (int(c) for c in corners)
    except (TypeError, ValueError, OverflowError):
        # Wrong length, non-numeric entries, NaN or infinity.
        return None

    # Keep every corner inside the image.
    x1_abs = max(0, min(x1_abs, width - 1))
    y1_abs = max(0, min(y1_abs, height - 1))
    x2_abs = max(0, min(x2_abs, width - 1))
    y2_abs = max(0, min(y2_abs, height - 1))
    return [x1_abs, y1_abs, x2_abs, y2_abs]


def auto_annotate(image_path, output_json):
    """
    Run the model on one image and save its detections as a Labelme JSON file.

    Args:
        image_path: path of the image to annotate.
        output_json: path of the Labelme annotation file to write.

    Returns:
        The Labelme annotation dict that was written, or None when the image
        could not be read (nothing is written in that case).
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"无法读取图像: {image_path}")
        return

    height, width = img.shape[:2]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Build the model inputs.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was actually placed on instead of
    # assuming "cuda".
    inputs = inputs.to(model.device)

    # The processor may resize the image before the model sees it, and the
    # absolute coordinates the model emits refer to that resized input.
    # Recover its size from the patch grid (rows/cols of patches times the
    # patch size) to map boxes back onto the original image.
    scale_x = scale_y = 1.0
    grid_thw = inputs.get("image_grid_thw")
    if grid_thw is not None and len(grid_thw) > 0:
        patch = getattr(getattr(processor, "image_processor", None), "patch_size", 14)
        if not isinstance(patch, int) or patch <= 0:
            patch = 14
        input_h = int(grid_thw[0][1]) * patch
        input_w = int(grid_thw[0][2]) * patch
        if input_h > 0 and input_w > 0:
            scale_x = width / input_w
            scale_y = height / input_h

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=2048
    )
    # generate() returns prompt + completion. Drop the prompt tokens so the
    # echoed instructions (which contain "[x1, y1, x2, y2]") cannot be
    # mistaken for model output when parsing.
    trimmed_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(trimmed_ids, skip_special_tokens=True)[0]
    print("模型原始响应:\n", response)

    # Parse the model output.
    json_data = extract_taxi_data(response)
    print("解析后的JSON数据:", json_data)

    shapes = []
    for item in json_data:
        bbox = item.get('bbox_2d') if isinstance(item, dict) else None
        box = None
        if isinstance(bbox, (list, tuple)) and len(bbox) == 4:
            box = _bbox_to_pixels(bbox, width, height, scale_x, scale_y)
        if box is None:
            print(f"跳过无效的bbox数据: {item}")
            continue

        x1_abs, y1_abs, x2_abs, y2_abs = box

        # Reject degenerate or inverted rectangles.
        if x1_abs >= x2_abs or y1_abs >= y2_abs:
            print(f"跳过无效的矩形: [{x1_abs}, {y1_abs}, {x2_abs}, {y2_abs}]")
            continue

        label = item.get('label', 'taxi')
        label = 'taxi' if label is None else str(label).strip()

        shapes.append({
            "label": label,
            "points": [[x1_abs, y1_abs], [x2_abs, y2_abs]],
            "group_id": None,
            "shape_type": "rectangle",
            "flags": {}
        })

    # Assemble the Labelme document.
    labelme_data = {
        "version": "5.1.1",
        "flags": {},
        "shapes": shapes,
        "imagePath": os.path.basename(image_path),
        "imageData": None,
        "imageHeight": height,
        "imageWidth": width
    }

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(labelme_data, f, ensure_ascii=False, indent=2)

    print(f"标注已保存至: {output_json}, 检测到 {len(shapes)} 辆出租车")
    return labelme_data


if __name__ == "__main__":
    root_labelme = "../Labelme_Taxi"

    # Remove annotations left over from a previous run.
    # NOTE: this deletes every *.json in the directory, including any
    # hand-made Labelme files stored there.
    for json_file in glob.glob(os.path.join(root_labelme, "*.json")):
        os.remove(json_file)

    # Annotate every JPG image, in a stable order.
    for img_path in sorted(glob.glob(os.path.join(root_labelme, "*.jpg"))):
        print(f"\n处理图像: {img_path}")
        # splitext swaps only the file extension; str.replace would also
        # rewrite a ".jpg" that happens to appear in a directory name.
        auto_annotate(img_path, os.path.splitext(img_path)[0] + '.json')

【大模型实战】微调Qwen2.5 VL模型，增强目标检测任务。

制作数据集

使用微调的模型制作数据集

网站公告

今日签到

热门文章

最新发布