VLM MiniCPM: hands-on learning and deployment

Published: 2025-07-25

Contents

Open-source links:

Model repo download:

Single-image demo:

Multi-image inference demo:

Video understanding:

Paper study notes:

Complete deployment tutorial:

Fine-tuning tutorial:

Deployment & fine-tuning tutorials, video test

BitCPM4 technical report

Key idea: folding quantization into training


Open-source links:

https://github.com/OpenBMB/MiniCPM

openbmb/MiniCPM4-8B-Eagle-vLLM

Model size: 2.29 GB

Model repo download:

modelscope download --model=OpenBMB/MiniCPM-V-2_6 --local_dir ./MiniCPM-V-2_6  
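The same download can be scripted with the ModelScope Python SDK. A minimal sketch (local_dir is supported in recent modelscope versions; older ones only take cache_dir):

from modelscope import snapshot_download

# Downloads the full model repo and returns the local path.
model_dir = snapshot_download('OpenBMB/MiniCPM-V-2_6', local_dir='./MiniCPM-V-2_6')
print(model_dir)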

Single-file GGUF download:

modelscope download --model=OpenBMB/MiniCPM-V-2_6-gguf ggml-model-Q4_K_M.gguf --local_dir ./
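To actually run the GGUF you also need the vision projector file (mmproj-model-f16.gguf) from the same repo. A minimal sketch using llama-cpp-python's MiniCPM-V 2.6 chat handler; the handler name and the file:// image URL are assumptions that depend on your llama-cpp-python version, so check its docs:

from llama_cpp import Llama
from llama_cpp.llama_chat_format import MiniCPMv26ChatHandler  # assumes a recent llama-cpp-python

# The mmproj file encodes images; download it from the same GGUF repo.
chat_handler = MiniCPMv26ChatHandler(clip_model_path="mmproj-model-f16.gguf")
llm = Llama(
    model_path="ggml-model-Q4_K_M.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,  # leave room for the image embeddings
)
resp = llm.create_chat_completion(messages=[
    {"role": "user", "content": [
        {"type": "image_url", "image_url": {"url": "file:///path/to/image.jpg"}},
        {"type": "text", "text": "What is in the image?"},
    ]}
])
print(resp["choices"][0]["message"]["content"])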

Single-image demo:

The model is downloaded automatically to:

C:\Users\xxx\.cache\modelscope\hub\models\OpenBMB\MiniCPM-V-2_6

Requires about 20 GB of GPU memory; the model files total about 15 GB.



# test.py
import torch
from PIL import Image
from modelscope import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True,
    attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True)

image = Image.open(r"B:\360MoveData\Users\Administrator\Pictures\liuying\IMG_20150903_123711.jpg").convert('RGB')
question = 'What is in the image?'
msgs = [{'role': 'user', 'content': [image, question]}]

res = model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print(res)

## if you want to use streaming, please make sure sampling=True and stream=True
## the model.chat will return a generator
res = model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    stream=True
)

generated_text = ""
for new_text in res:
    generated_text += new_text
    print(new_text, flush=True, end='')
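If 20 GB of VRAM is not available, OpenBMB also publishes an int4-quantized build, OpenBMB/MiniCPM-V-2_6-int4 (roughly 7 GB of GPU memory according to its model card). A minimal loading sketch; the quantized weights are placed on the GPU during from_pretrained, so there is no explicit .cuda() call:

from modelscope import AutoModel, AutoTokenizer

# int4 build: bitsandbytes-quantized weights land on the GPU at load time.
model = AutoModel.from_pretrained('OpenBMB/MiniCPM-V-2_6-int4', trust_remote_code=True)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6-int4', trust_remote_code=True)

The model.chat calls above then work unchanged.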

Multi-image inference demo:

import torch  
from PIL import Image  
from modelscope import AutoModel, AutoTokenizer  

model = AutoModel.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True,  
    attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager  
model = model.eval().cuda()  
tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True)  

image1 = Image.open('image1.jpg').convert('RGB')  
image2 = Image.open('image2.jpg').convert('RGB')  
question = 'Compare image 1 and image 2, tell me about the differences between image 1 and image 2.'  

msgs = [{'role': 'user', 'content': [image1, image2, question]}]  

answer = model.chat(  
    image=None,  
    msgs=msgs,  
    tokenizer=tokenizer  
)  
print(answer)  
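The msgs list doubles as conversation history: append the model's reply as an assistant turn and then ask a follow-up. A short sketch continuing the comparison above (the follow-up question is just illustrative):

# Multi-turn: feed the previous answer back as an assistant turn.
msgs.append({'role': 'assistant', 'content': [answer]})
msgs.append({'role': 'user', 'content': ['Which of the two images looks older?']})

follow_up = model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print(follow_up)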

Video understanding:

import torch  
from PIL import Image  
from modelscope import AutoModel, AutoTokenizer  
from decord import VideoReader, cpu    # pip install decord  

params={}  

model = AutoModel.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True,  
    attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager  
model = model.eval().cuda()  
tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True)  

MAX_NUM_FRAMES=64  

def encode_video(video_path):  
    def uniform_sample(l, n):  
        gap = len(l) / n  
        idxs = [int(i * gap + gap / 2) for i in range(n)]  
        return [l[i] for i in idxs]  

    vr = VideoReader(video_path, ctx=cpu(0))  
    sample_fps = round(vr.get_avg_fps() / 1)  # sample roughly 1 frame per second (the divisor is the target FPS)  
    frame_idx = [i for i in range(0, len(vr), sample_fps)]  
    if len(frame_idx) > MAX_NUM_FRAMES:  
        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)  
    frames = vr.get_batch(frame_idx).asnumpy()  
    frames = [Image.fromarray(v.astype('uint8')) for v in frames]  
    print('num frames:', len(frames))  
    return frames  

video_path="car.mp4"  
frames = encode_video(video_path)  
question = "Describe the video"  
msgs = [  
    {'role': 'user', 'content': frames + [question]},   
]  

# Set decode params for video  
params["use_image_id"] = False  
params["max_slice_nums"] = 2 # 如果cuda OOM且视频分辨率大于448*448 可设为1  

answer = model.chat(  
    image=None,  
    msgs=msgs,  
    tokenizer=tokenizer,  
    **params  
)  
print(answer)  

Paper study notes:

MiniCPM: what makes it good enough for Stanford to copy? Let's read the paper together! (Tencent Cloud Developer Community)

Complete deployment tutorial:

MiniCPM-V 2.6: exploring the strongest on-device multimodal LLM, a complete inference-practice guide (CSDN blog)

Fine-tuning tutorial:

MiniCPM-o 2.6 multimodal LLM fine-tuning in practice, with complete code (CSDN blog)

Deployment & fine-tuning tutorials, video test

Multi-image and video land on device for the first time! ModelBest's "little cannon" MiniCPM-V 2.6 is out: ModelScope hands-on tutorial for inference, fine-tuning, and deployment (CSDN blog)

BitCPM4 technical report

As for the 43-page technical report just mentioned: I read through it once, and I think it breaks down into the following:

InfLLM v2: the attention layers only look at the important parts (sparse attention)
FR-Spec: the draft stage doesn't score the full vocabulary (see the sketch after this list)
BitCPM4: compression is accounted for already during training
CPM.cu + ArkInfer: a custom inference & deployment stack
Wind Tunnel 2.0: experiment on small models first, then train the large one
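To make the FR-Spec point concrete, here is a toy sketch (my own illustration, not the paper's code): during drafting, the LM head is sliced down to a frequency-ranked subset of the vocabulary, shrinking the softmax cost; the target model still verifies over the full vocabulary, so the output distribution is unchanged.

import torch

vocab_size, hidden = 50_000, 1024
freq_rank = torch.randperm(vocab_size)         # stand-in for a corpus frequency ranking
hot_ids = freq_rank[:8_000]                    # keep only the 8k most frequent tokens

lm_head = torch.nn.Linear(hidden, vocab_size, bias=False)
draft_head_weight = lm_head.weight[hot_ids]    # sliced LM head used only while drafting

h = torch.randn(1, hidden)                     # last hidden state of the draft model
draft_logits = h @ draft_head_weight.T         # softmax over 8k ids instead of 50k
next_token = hot_ids[draft_logits.argmax(-1)]  # map back to real vocabulary ids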

Key idea: folding quantization into training

MiniCPM 4.0 technical report: the surge in on-device speed is the model's self-RAG (woshipm.com, "Everyone Is a Product Manager")
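In its general form, "folding quantization into training" is quantization-aware training: quantize the weights on the forward pass, but let gradients flow through as if they were full precision (a straight-through estimator). A minimal ternary-weight sketch of that general technique; BitCPM4's actual recipe is in the report:

import torch

class TernaryLinear(torch.nn.Linear):
    """Linear layer with BitNet-style ternary weight quantization on the forward pass."""
    def forward(self, x):
        w = self.weight
        scale = w.abs().mean()                                   # per-tensor scale
        w_q = torch.round(torch.clamp(w / (scale + 1e-8), -1, 1)) * scale
        # Straight-through estimator: quantized values go forward,
        # full-precision gradients flow backward.
        w_ste = w + (w_q - w).detach()
        return torch.nn.functional.linear(x, w_ste, self.bias)

layer = TernaryLinear(16, 16)
out = layer(torch.randn(4, 16))
out.sum().backward()            # gradients reach the full-precision weights
print(layer.weight.grad.shape)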

