Vision-Language Model (VLM) Deployment: C++ Code Based on TensorRT and TensorRT-LLM


LLaVA is a typical vision-language model. An earlier post, "VLM deployment: Python code based on TensorRT and TensorRT-LLM", ran the inference in Python; since the deployment side is usually written in C++, this post implements the same pipeline in C++. The overall logic is the same as before. Taking llava-interleave-qwen-0.5b-hf as the example, the model consists of SigLIP (vision encoder) + MLP (projector) + Qwen 0.5B (language model), deployed with TensorRT and TensorRT-LLM: the image goes through the SigLIP and projector TensorRT engines, the projected features are attached to the expanded token ids as a prompt-tuning table, and the TensorRT-LLM executor generates the answer. The engine files and tokenizer assets referenced below are assumed to have been exported as in the earlier Python post.

/*------------------------
Author:    kk
Date:      2025.8.27
Version:   v0.1
function:  vlm llava inference with tensorrt and tensorrt-llm
-------------------------*/
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <vector>
#include <string>
#include <cassert>
#include <iomanip>
#include <algorithm>
#include <cstring>   
#include <cstdint>  

#include <opencv2/opencv.hpp>
#include "tokenizers_cpp.h"

#include <cuda_fp16.h> 

#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <cuda_runtime_api.h>

#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"


// tensorrtllm ns
namespace llm = tensorrt_llm::executor;

// image preprocessing parameters
static constexpr int   kH = 384;
static constexpr int   kW = 384;
static constexpr bool  kDoConvertRGB = true;   
static constexpr bool  kDoResize     = true;
static constexpr bool  kDoRescale    = true;
static constexpr bool  kDoNormalize  = true;
static constexpr float kRescale      = 1.0f / 255.0f;
static constexpr float kMean[3]      = {0.5f, 0.5f, 0.5f};
static constexpr float kStd[3]       = {0.5f, 0.5f, 0.5f};
static constexpr int   kInterp       = cv::INTER_CUBIC; // PIL BICUBIC ≈ OpenCV INTER_CUBIC
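// Note: per-channel mean = std = 0.5 follows the Hugging Face SiglipImageProcessor
// defaults (an assumption based on the standard SigLIP preprocessing config).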


// =====================================================
// Image preprocessing; output: CHW float32, length 3*kH*kW
inline std::vector<float> preprocess_siglip_chw_f32(const cv::Mat& bgr) {
    CV_Assert(!bgr.empty() && bgr.type() == CV_8UC3);
    cv::Mat img = bgr;

    if (kDoConvertRGB) {cv::cvtColor(img, img, cv::COLOR_BGR2RGB);}
    if (kDoResize) {cv::resize(img, img, cv::Size(kW, kH), 0, 0, kInterp); }
    img.convertTo(img, CV_32FC3);
    if (kDoRescale) {img *= kRescale;}
    // Normalize: (x - mean) / std
    if (kDoNormalize) {
        std::vector<cv::Mat> ch(3);
        cv::split(img, ch);
        for (int i = 0; i < 3; ++i) {
            ch[i] = (ch[i] - kMean[i]) / kStd[i];
        }
        cv::merge(ch, img);
    }

    // debug: print the top-left 5x5 patch of the preprocessed image
    // cv::Mat roi = img(cv::Rect(0,0,5,5)).clone();
    // std::cout << cv::format(roi, cv::Formatter::FMT_NUMPY) << std::endl;

    // HWC -> CHW
    std::vector<float> out(3 * kH * kW);
    std::vector<cv::Mat> ch(3);
    cv::split(img, ch);                       
    const size_t plane = static_cast<size_t>(kH) * static_cast<size_t>(kW);
    for (int c = 0; c < 3; ++c) {
        std::memcpy(out.data() + c * plane, ch[c].ptr<float>(), plane * sizeof(float));
    }
    return out;
}
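
// With the constants above the net per-pixel transform is
//   y = (x * (1/255) - 0.5) / 0.5 = x / 127.5 - 1,
// i.e. uint8 values in [0, 255] are mapped to roughly [-1, 1].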


// =====================================================
// ----------------- load tensorrt engine -----------------
class Logger : public nvinfer1::ILogger {
public:
    explicit Logger(Severity s = Severity::kWARNING) : reportableSeverity(s) {}
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= reportableSeverity) {
            std::cerr << "[TRT] " << msg << std::endl;
        }
    }
private:
    Severity reportableSeverity;
};

static std::vector<char> readFile(const std::string& path) {
    std::ifstream f(path, std::ios::binary);
    if (!f) { throw std::runtime_error("Cannot open " + path); }
    f.seekg(0, std::ios::end);
    size_t sz = static_cast<size_t>(f.tellg());
    f.seekg(0, std::ios::beg);
    std::vector<char> buf(sz);
    f.read(buf.data(), sz);
    return buf;
}

static inline void checkCuda(cudaError_t e, const char* file, int line) {
    if (e != cudaSuccess) {
        std::cerr << "CUDA Error " << cudaGetErrorString(e) << " at " << file << ":" << line << std::endl;
        std::exit(1);
    }
}
#define CHECK_CUDA(expr) checkCuda((expr), __FILE__, __LINE__)

static inline size_t trtTypeSize(nvinfer1::DataType t) {
    switch (t) {
        case nvinfer1::DataType::kFLOAT: return 4;
        case nvinfer1::DataType::kHALF:  return 2;
        case nvinfer1::DataType::kINT8:  return 1;
        case nvinfer1::DataType::kINT32: return 4;
        case nvinfer1::DataType::kBOOL:  return 1;
        default:                    return 0;
    }
}

// compute the element count (product of dims)
static inline int64_t volume(const nvinfer1::Dims& d) {
    int64_t v = 1;
    for (int i = 0; i < d.nbDims; ++i) v *= d.d[i];
    return v;
}

template <typename T>
using TrtUniquePtr = std::unique_ptr<T>;

struct EngineIO { std::string inName; std::string outName; };

struct RunResult {
    std::vector<uint8_t> outHost;  // raw bytes of output (CPU)
    nvinfer1::Dims            outShape; // TRT output shape 
    nvinfer1::DataType        outType;  // TRT output dtype
};

RunResult runEngineV3(
    const std::string& enginePath,
    const EngineIO& io,
    const nvinfer1::Dims& inputShape,
    const void* inputDataHost,
    size_t inputBytes,
    Logger& logger
){
    TrtUniquePtr<nvinfer1::IRuntime> runtime{nvinfer1::createInferRuntime(logger)};
    auto blob = readFile(enginePath);
    TrtUniquePtr<nvinfer1::ICudaEngine> engine{runtime->deserializeCudaEngine(blob.data(), blob.size())};
    if (!engine) throw std::runtime_error("Deserialize failed: " + enginePath);

    auto hasByName = [&](const std::string& n){
        return engine->getTensorIOMode(n.c_str()) != nvinfer1::TensorIOMode::kNONE;
    };
    if (!hasByName(io.inName) || !hasByName(io.outName)) {
        throw std::runtime_error("Tensor not found: " + io.inName + " or " + io.outName);
    }

    TrtUniquePtr<nvinfer1::IExecutionContext> context{engine->createExecutionContext()};
    if (!context) throw std::runtime_error("createExecutionContext failed");

    // set the dynamic input shape
    if (!context->setInputShape(io.inName.c_str(), inputShape)) {
        throw std::runtime_error("setInputShape failed for " + io.inName);
    }

    const nvinfer1::DataType outType = engine->getTensorDataType(io.outName.c_str());
    const nvinfer1::Dims outShape    = context->getTensorShape(io.outName.c_str());

    cudaStream_t stream{}; CHECK_CUDA(cudaStreamCreate(&stream));

    // copy the input host -> device
    void* dIn{nullptr}; CHECK_CUDA(cudaMalloc(&dIn, inputBytes));
    CHECK_CUDA(cudaMemcpyAsync(dIn, inputDataHost, inputBytes, cudaMemcpyHostToDevice, stream));

    const int64_t outElems = volume(outShape);
    const size_t outBytes  = static_cast<size_t>(outElems) * trtTypeSize(outType);

    void* dOut{nullptr}; CHECK_CUDA(cudaMalloc(&dOut, outBytes));
    std::vector<uint8_t> hOut(outBytes);

    // bind tensor addresses and run inference
    if (!context->setTensorAddress(io.inName.c_str(),  dIn))  throw std::runtime_error("setTensorAddress(in) failed");
    if (!context->setTensorAddress(io.outName.c_str(), dOut)) throw std::runtime_error("setTensorAddress(out) failed");
    if (!context->enqueueV3(stream))                           throw std::runtime_error("enqueueV3 failed");

    CHECK_CUDA(cudaMemcpyAsync(hOut.data(), dOut, outBytes, cudaMemcpyDeviceToHost, stream));
    CHECK_CUDA(cudaStreamSynchronize(stream));
    cudaFree(dIn); cudaFree(dOut); cudaStreamDestroy(stream);

    RunResult rr;
    rr.outHost  = std::move(hOut);
    rr.outShape = outShape;
    rr.outType  = outType;
    return rr;
}
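
// Note: for simplicity this helper deserializes the engine and builds a fresh
// execution context on every call; a long-running service would normally create
// the IRuntime / ICudaEngine / IExecutionContext once and reuse them per request.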


// =====================================================
// ----------------- load tensorrt-llm engine --------------
static constexpr llm::DataType kPromptDtype = llm::DataType::kFP16;

// fp32 -> fp16
inline std::vector<uint16_t> fp32_to_fp16(const std::vector<float>& x) {
    std::vector<uint16_t> h(x.size());
    for (size_t i=0;i<x.size();++i) {
        uint32_t u; std::memcpy(&u, &x[i], 4);
        uint32_t s=(u>>31)&1; int e=((u>>23)&0xFF)-127+15; uint32_t m=(u>>13)&0x3FF;
        uint16_t out;
        if (e<=0){ if(e<-10) out=(uint16_t)(s<<15); else { m=(m|0x400)>>(1-e); out=(uint16_t)((s<<15)|m);} }
        else if(e>=31){ out=(uint16_t)((s<<15)|(0x1F<<10)); if(u&0x7FFFFF) out|=1; }
        else { out=(uint16_t)((s<<15)|(e<<10)|(m&0x3FF)); }
        h[i]=out;
    }
    return h;
}
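
// Note: the conversion above truncates the mantissa (round toward zero) and handles
// subnormals and Inf/NaN explicitly. An alternative (untested sketch, assuming a
// host-callable __float2half from the already-included <cuda_fp16.h>):
//   __half hv = __float2half(x[i]);
//   std::memcpy(&h[i], &hv, sizeof(uint16_t));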

// // fp16 -> fp32
// inline float fp16_to_fp32_cuda(uint16_t bits){
//     __half h;
//     std::memcpy(&h, &bits, sizeof(uint16_t)); // copy the bit pattern into a __half
//     return __half2float(h);                   // convert to float32 (usable on host)
// }

// copy host data into an llm::Tensor
template <typename T>
llm::Tensor makeCpuTensor2D(const std::vector<T>& flat, size_t M, size_t H) {
    llm::Tensor t = llm::Tensor::cpu(
        std::is_same<T,uint16_t>::value ? llm::DataType::kFP16 : llm::DataType::kFP32,
        llm::Shape{ static_cast<int64_t>(M), static_cast<int64_t>(H) }
    );
    std::memcpy(t.getData(), flat.data(), flat.size()*sizeof(T));
    return t;
}

// build expanded_ids: insert tokens [eng_vocab, eng_vocab+M) at position pos (replacing the <image> token)
static std::vector<int32_t> buildExpandedIds(const std::vector<int32_t>& ids, int pos, int eng_vocab, int M) {
    std::vector<int32_t> out;
    out.reserve(ids.size() - 1 + M);
    out.insert(out.end(), ids.begin(), ids.begin()+pos);           // left
    for (int i=0;i<M;++i) out.push_back(eng_vocab + i);            // fake ids
    out.insert(out.end(), ids.begin()+pos+1, ids.end());           // right
    return out;
}
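
// Example: with ids = {A, B, C, <image>, D}, pos = 3, eng_vocab = 152000 and M = 2,
// the result is {A, B, C, 152000, 152001, D}: the single <image> token is replaced
// by M "virtual" ids that index rows of the prompt-tuning table.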


// =====================================================
// load tokenizer
static std::string LoadBytesFromFile(const std::string& path) {
    std::ifstream ifs(path, std::ios::binary);
    if (!ifs) { throw std::runtime_error("open failed: " + path); }
    return std::string{std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>()};
}


std::string build_chat_prompt(const std::string& user_message = "What are these?")
{
    std::string prompt;
    prompt += "<|im_start|>";
    prompt += "user\n";
    prompt += "<image>\n";
    prompt += user_message;
    prompt += "<|im_end|>\n<|im_start|>";
    prompt += "assistant\n";
    return prompt;
}
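
// With the default argument this produces the Qwen chat-template string:
//   "<|im_start|>user\n<image>\nWhat are these?<|im_end|>\n<|im_start|>assistant\n"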

// =====================================================
// -------------------- main ----------------------
int main(int argc, char** argv) {

    // load img and preprocess
    cv::Mat bgr = cv::imread("/media/wangyl/OS/wyl/program/VLMInference/vlm_cxx_inference/source/000000039769.jpg");
    if (bgr.empty()) { std::cerr << "read image failed\n"; return 1; }
    auto preprocess_img = preprocess_siglip_chw_f32(bgr);

    // tensorrt inference parameters
    Logger logger;
    std::string V_ENGINE = "/media/wangyl/OS/wyl/program/VLMInference/export_llava_qwen05B/vision_siglip.engine";
    std::string P_ENGINE = "/media/wangyl/OS/wyl/program/VLMInference/export_llava_qwen05B/projector.engine";
    const int Batch_size = 1, Channel = 3, H_img = 384, W_img = 384;


    // 1) vision infer
    nvinfer1::Dims inDimsVision{4, {Batch_size, Channel, H_img, W_img}};
    RunResult vis = runEngineV3(
        V_ENGINE,
        EngineIO{ "pixel_values", "feats" },/* io */ 
        inDimsVision,
        preprocess_img.data(), /* input host ptr */ 
        preprocess_img.size() * sizeof(float), /* bytes */
        logger
    );
    std::cout << "[Vision] out dtype=" << static_cast<int>(vis.outType) << " shape=(";
    for (int i=0;i<vis.outShape.nbDims;++i)
        std::cout << vis.outShape.d[i] << (i+1<vis.outShape.nbDims? ",":"");
    std::cout << ")\n";

    
    // 2) projector infer
    const int64_t featsElems = volume(vis.outShape);
    const size_t  featsBytes = static_cast<size_t>(featsElems) * trtTypeSize(vis.outType);

    nvinfer1::Dims inDimsProj = vis.outShape;
    RunResult proj = runEngineV3(
        P_ENGINE,
        EngineIO{ "feats_in", "feats_out" },/* io */ 
        inDimsProj,
        vis.outHost.data(),/* input host ptr */ 
        featsBytes,/* bytes */ 
        logger
    );
    std::cout << "[Projector] out dtype=" << static_cast<int>(proj.outType) << " shape=(";
    for (int i=0;i<proj.outShape.nbDims;++i)
        std::cout << proj.outShape.d[i] << (i+1<proj.outShape.nbDims? ",":"");
    std::cout << ")\n";

    // 3) vision feature for llm
    assert(proj.outType == nvinfer1::DataType::kFLOAT);
    const int nb = proj.outShape.nbDims;
    int64_t hiddenDim = proj.outShape.d[nb - 1];
    int64_t M_rows = 1;
    for (int i = 0; i < nb - 1; ++i) M_rows *= proj.outShape.d[i];

    const int64_t total = M_rows * hiddenDim;
    std::vector<float> mm_features_fp32(static_cast<size_t>(total));
    std::memcpy(mm_features_fp32.data(), proj.outHost.data(), static_cast<size_t>(total) * sizeof(float));

    // std::cout << std::fixed << std::setprecision(6);
    // std::cout << "mm_features: " << M_rows << " x " << hiddenDim << "\n"
    //           << "first row (8 vals): ";
    // for (int j = 0; j < std::min<int64_t>(8, hiddenDim); ++j) {
    //     std::cout << mm_features_fp32[j] << " ";
    // }
    // std::cout << "\n";

    // 4) tensorrt-llm infer
    initTrtLlmPlugins();
    std::string engineDir = "/media/wangyl/OS/wyl/program/VLMInference/export_llava_qwen05B/llm_qwen2_0.5b/llama_trt_engine"; 

    // tokenizer special token ids
    const int EOS_ID        = 151645; // "<|im_end|>"
    const int PAD_ID        = 151643; // "<|endoftext|>"
    const int IMAGE_TOKEN_ID= 151646; // "<image>"
    int eng_vocab = 152000; // vocab_size
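    // Token ids >= vocab_size act as "virtual" prompt-tuning ids: TensorRT-LLM looks
    // them up in the prompt table passed via PromptTuningConfig instead of the word
    // embedding matrix.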

    std::string blob = LoadBytesFromFile("/media/wangyl/OS/wyl/program/VLMInference/export_llava_qwen05B/tokenizer/tokenizer.json");
    auto tok = tokenizers::Tokenizer::FromBlobJSON(blob);
    std::string prompt = build_chat_prompt("What are these?");
    std::vector<int32_t> ids = tok->Encode(prompt);

    // locate the <image> token; its index pos is where the image features are inserted
    auto itImg = std::find(ids.begin(), ids.end(), IMAGE_TOKEN_ID);
    if (itImg == ids.end())
    {
        std::cerr << "[FATAL] prompt ids does not contain <image> (id=" << IMAGE_TOKEN_ID << ")\n";
        return 1;
    }
    int pos = static_cast<int>(std::distance(ids.begin(), itImg)); //pos=3


    // build expanded ids: insert [eng_vocab, eng_vocab+M_rows) at position pos
    std::vector<int32_t> expanded = buildExpandedIds(ids, pos, eng_vocab, static_cast<int>(M_rows));

    // extra token ids marking the image positions (for KV-cache reuse)
    std::vector<long unsigned int> inputTokenExtraIds(expanded.size(), static_cast<long unsigned int>(-1));
    for (int i = 0; i < static_cast<int>(M_rows); ++i) {
        inputTokenExtraIds[pos + i] = i;
    }

    // build the prompt table ([M_rows, hiddenDim])
    llm::Tensor promptTable;
    if (kPromptDtype == llm::DataType::kFP16){
        auto half_bits = fp32_to_fp16(mm_features_fp32);
        promptTable = makeCpuTensor2D<uint16_t>(
            half_bits, static_cast<size_t>(M_rows), static_cast<size_t>(hiddenDim));
    }

    // sampling parameters
    llm::SamplingConfig sampling;
    sampling.setTemperature(0.0f);
    sampling.setTopP(1.0f);
    sampling.setSeed(0);
    sampling.setTopK(0);                 // explicitly disable top-k

    // infer
    llm::PromptTuningConfig pConfig(promptTable, std::make_optional(inputTokenExtraIds));
    llm::VecTokens batchInput;                        
    batchInput.assign(expanded.begin(), expanded.end());
    llm::SizeType32 maxNewTokens = 200;
    llm::Request req(
        batchInput,
        maxNewTokens,
        /*streaming=*/false,
        sampling,
        llm::OutputConfig{},     // default output config
        /*endId=*/EOS_ID,
        /*padId=*/PAD_ID,
        /*positionIds=*/std::nullopt,
        /*badWords=*/std::nullopt,
        /*stopWords=*/std::nullopt,   
        /*embeddingBias=*/std::nullopt,
        /*externalDraftTokens=*/std::nullopt,
        /*pTuningConfig=*/pConfig,    // key step: attach the image prompt table
        /*multimodalInput=*/std::nullopt,
        /*multimodalEmbedding=*/std::nullopt
    );
    llm::SizeType32 beamWidth = 1;
    llm::ExecutorConfig execCfg(beamWidth);
    execCfg.setGpuWeightsPercent(1.0f); 
    llm::Executor executor(std::filesystem::path(engineDir),
                        llm::ModelType::kDECODER_ONLY,
                        execCfg);
    auto rid = executor.enqueueRequest(req);
    auto responses = executor.awaitResponses(rid);

    // decode
    std::vector<int32_t> out_ids;
    if (!responses.empty())
    {
        const auto& res = responses.at(0).getResult();
        if (!res.outputTokenIds.empty())
        {
            // take beam 0
            const auto& beams = res.outputTokenIds;
            out_ids.assign(beams[0].begin(), beams[0].end());
        }
    }

    std::string text = tok->Decode(out_ids);
    std::cout << "decoded: " << text << "\n";

    return 0;
}

CMakeLists.txt:

cmake_minimum_required(VERSION 3.27)

set(TRTLLM_DIR "/home/wangyl/docker_program/trtllm/TensorRT-LLM")
list(APPEND CMAKE_MODULE_PATH "${TRTLLM_DIR}/cpp/cmake/modules")

if(NOT TRTLLM_BUILD_DIR)
  set(TRTLLM_BUILD_DIR "${TRTLLM_DIR}/cpp/build")
endif()
set(TRTLLM_LIB_PATH "${TRTLLM_BUILD_DIR}/tensorrt_llm/libtensorrt_llm.so")
set(TRTLLM_PLUGIN_PATH
    "${TRTLLM_BUILD_DIR}/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so"
)
set(TRTLLM_INCLUDE_DIR "${TRTLLM_DIR}/cpp/include")

# Determine CXX11 ABI compatibility
execute_process(
  COMMAND bash -c "nm -f posix -D ${TRTLLM_LIB_PATH} | grep __cxx11"
  RESULT_VARIABLE GLIB_CXX11_FOUND
  OUTPUT_QUIET)
if(GLIB_CXX11_FOUND EQUAL 0)
  set(USE_CXX11_ABI 1)
else()
  set(USE_CXX11_ABI 0)
endif()
message(STATUS "Use CXX11 ABI: ${USE_CXX11_ABI}")
add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=${USE_CXX11_ABI}")

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
set(CMAKE_VERBOSE_MAKEFILE 1)

# Define project name
project(llava_vlm_Inference)

# Compile options
set(CMAKE_CXX_FLAGS "-Wall -pthread -lstdc++ -DENABLE_MULTI_DEVICE=1")
# set(CMAKE_CXX_FLAGS_RELEASE "-O3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0")
set(CMAKE_BUILD_TYPE Debug)


# find_package(CUDA REQUIRED)
find_package(OpenCV REQUIRED)
find_package(CUDAToolkit REQUIRED COMPONENTS cuda_driver cudart_static nvml)
message(STATUS "CUDA library status:")
message(STATUS "    version: ${CUDAToolkit_VERSION}")
message(STATUS "    libraries: ${CUDAToolkit_LIBRARY_DIR}")
message(STATUS "    include path: ${CUDAToolkit_INCLUDE_DIRS}")

# TRT dependencies
find_package(TensorRT 10 REQUIRED)

if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11")
  add_definitions("-DENABLE_BF16")
  message(
    STATUS
      "CUDA ${CUDAToolkit_VERSION} is greater than or equal to 11.0, enabling the -DENABLE_BF16 flag"
  )
endif()

if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11.8")
  add_definitions("-DENABLE_FP8")
  message(
    STATUS
      "CUDA ${CUDAToolkit_VERSION} is greater than or equal to 11.8, enabling the -DENABLE_FP8 flag"
  )
endif()

set(TOKCPP_ROOT /home/wangyl/docker_program/trtllm/tokenizers-cpp)
set(TOKCPP_BUILD ${TOKCPP_ROOT}/build)


# tensorrt_llm shared lib
add_library(tensorrt_llm SHARED IMPORTED)
set_property(TARGET tensorrt_llm PROPERTY IMPORTED_LOCATION ${TRTLLM_LIB_PATH})
set_property(
  TARGET tensorrt_llm PROPERTY IMPORTED_LINK_INTERFACE_LIBRARIES
                               CUDA::cuda_driver CUDA::cudart_static CUDA::nvml)

# nvinfer_plugin_tensorrt_llm shared lib
add_library(nvinfer_plugin_tensorrt_llm SHARED IMPORTED)
set_property(TARGET nvinfer_plugin_tensorrt_llm PROPERTY IMPORTED_LOCATION
                                                         ${TRTLLM_PLUGIN_PATH})
set_property(TARGET nvinfer_plugin_tensorrt_llm
             PROPERTY IMPORTED_LINK_INTERFACE_LIBRARIES tensorrt_llm)

include_directories(${TRTLLM_INCLUDE_DIR} ${CUDAToolkit_INCLUDE_DIRS})

include_directories(${CUDA_INCLUDE_DIRS} /usr/include/opencv4)

add_executable(llava_qwen_05B_infer ./src/llava_qwen_05B_infer.cpp)
target_include_directories(llava_qwen_05B_infer PRIVATE ${TOKCPP_ROOT}/include)
target_link_directories(llava_qwen_05B_infer PRIVATE  ${TOKCPP_BUILD})
target_link_libraries(llava_qwen_05B_infer 
                        ${CUDA_LIBRARIES} 
                        nvinfer 
                        CUDA::cudart 
                        opencv_core 
                        opencv_imgproc 
                        opencv_highgui 
                        nvinfer_plugin_tensorrt_llm
                        tokenizers_cpp
                        tokenizers_c
                        -lcnpy)



With the same image and prompt, the output matches the Python implementation:

These are two cats, one on the left and one on the right. They are lying on a pink blanket, which is placed on a couch. The cat on the left is sleeping, while the one on the right is resting.

Another example, describing an autonomous-driving simulation scene:

These are images of a virtual environment, likely from a video game or a simulation. The scene depicts a car driving down a road with a cloudy sky, greenery on the sides, and a street lamp in the background. The environment is designed to simulate a realistic driving experience, with realistic textures and lighting effects that give the impression of a real-world setting.
