# Docker Compose definition for a single vLLM OpenAI-compatible API server
# that serves a GPTQ-Int4 Qwen2.5-14B checkpoint under the model name
# "qwen2.5-14b-hitvideos".
version: '3'

services:
  moonlit-vllm-qwen-hotvideo:
    image: vllm/vllm-openai:v0.6.2
    container_name: hitvideos_api
    restart: always
    # Arguments handed to the vllm/vllm-openai image. Each flag and its value
    # are separate list items; values stay quoted so they remain strings.
    command:
      - "--served-model-name"
      - "qwen2.5-14b-hitvideos"
      - "--model"
      # NOTE(review): "Insruct" looks like a typo for "Instruct", but this
      # path must match the directory name under the mounted volume, so it is
      # left unchanged -- confirm against the host directory before renaming.
      - "/root/models/Qwen2.5-14B-Insruct-GPTQ-Int4-1113"
      # To require an API key, inject it from the environment rather than
      # committing it here. A literal key was previously stored in this file;
      # treat that key as compromised and rotate it.
      # - "--api-key"
      # - "${VLLM_API_KEY}"
      # - "--max-model-len"
      # - "512"
      - "--tool-call-parser"
      - "hermes"
      - "--enable-auto-tool-choice"
      - "--enforce-eager"
      - "--gpu-memory-utilization"
      - "0.5"
      # - "--max-num-seqs"
      # - "256"
      - "--cpu-offload-gb"
      - "2"
    volumes:
      # Host model directory, visible in the container as /root/models
      # (the parent of the --model path above).
      - /data/preview/base/models/SFT/hitvideos:/root/models
    ports:
      # host:container -- quoted so YAML never reinterprets the mapping.
      - "11110:8000"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve the NVIDIA GPU with device id 0 for this container.
            - driver: nvidia
              capabilities: [gpu]
              device_ids: ['0']
    environment:
      TZ: Asia/Shanghai
    # networks:
    #   - moonlit-vllm

# networks:
#   moonlit-vllm:
#     external: true