Single model
llama-server.exe -m "G:\AI-AI\LLM\stablediffusionv2.gguf" --port 8081
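Once the server is running it exposes an OpenAI-compatible HTTP API on the given port. A minimal sketch of calling it from Python with the openai client; the base URL assumes the port 8081 used above, and the API key is a placeholder since llama-server does not check one unless an api key is configured:

from openai import OpenAI

# Point the client at the llama-server instance started above (port 8081).
# The key is a placeholder; llama-server ignores it unless an API key is configured.
client = OpenAI(base_url="http://127.0.0.1:8081/v1", api_key="sk-no-key-required")

response = client.chat.completions.create(
    model="local-model",  # with a single loaded model, the name is not used for routing
    messages=[{"role": "user", "content": "Hello, who are you?"}],
)
print(response.choices[0].message.content)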
Multiple models
Serving several models behind one port via a JSON config file is a feature of the llama-cpp-python OpenAI-compatible server, not the native llama-server.exe binary. It is started with:
python -m llama_cpp.server --config_file <config_file>
Each entry in "models" is exposed to clients under its "model_alias". Example config:
{
  "host": "0.0.0.0",
  "port": 8080,
  "models": [
    {
      "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
      "model_alias": "gpt-3.5-turbo",
      "chat_format": "chatml",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 512,
      "n_ctx": 2048
    },
    {
      "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
      "model_alias": "gpt-4",
      "chat_format": "chatml",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 512,
      "n_ctx": 2048
    },
    {
      "model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf",
      "model_alias": "gpt-4-vision-preview",
      "chat_format": "llava-1-5",
      "clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 512,
      "n_ctx": 2048
    },
    {
      "model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf",
      "model_alias": "text-davinci-003",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 512,
      "n_ctx": 2048
    },
    {
      "model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf",
      "model_alias": "copilot-codex",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 1024,
      "n_ctx": 9216
    }
  ]
}
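Clients pick a model by passing its model_alias as the OpenAI model name. A minimal sketch against the host/port from the config above (the API key is again a placeholder; chat-style aliases go through the chat completions endpoint, while completion-style aliases such as copilot-codex are typically used through the completions endpoint):

from openai import OpenAI

# Talk to the llama-cpp-python server started with the config above (port 8080).
client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

# "model" selects the config entry whose model_alias matches.
chat = client.chat.completions.create(
    model="gpt-3.5-turbo",  # served by openhermes-2.5-mistral-7b.Q4_K_M.gguf
    messages=[{"role": "user", "content": "Summarize what a GGUF file is."}],
)
print(chat.choices[0].message.content)

# Completion-style aliases can be used through the completions endpoint.
code = client.completions.create(
    model="copilot-codex",  # served by replit-code-v1_5-3b.Q4_0.gguf
    prompt="def fibonacci(n):",
    max_tokens=64,
)
print(code.choices[0].text)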