# Launch a vLLM server for the DeepSeek-R1-Distill-Qwen-14B checkpoint.
#
# - Exposes GPUs 0-7 to the process and shards the model across all eight
#   of them (--tensor-parallel-size must match the number of visible GPUs).
# - Serves on port 8014 under the model name "DeepSeek-R1-Distill-Qwen-14B".
# - Clients must present the API key given below.
#
# NOTE(review): the API key is a hard-coded placeholder token; replace it with
# a real secret (ideally injected from the environment) before exposing this
# server beyond a trusted network.

# Restrict the process to the eight GPUs used for tensor parallelism.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Each continued line must end in a backslash IMMEDIATELY followed by the
# newline; the last argument carries no backslash so the command ends cleanly.
vllm serve \
  "/data/zeju/DeepSeek-R1-Distill-Qwen-14B" \
  --served-model-name "DeepSeek-R1-Distill-Qwen-14B" \
  --port 8014 \
  --tensor-parallel-size 8 \
  --dtype auto \
  --api-key "token-abc123"

# Optional flags (currently disabled). To enable one, append " \" to the last
# argument above and add the flag as a new line of the command — a "#" comment
# cannot sit in the middle of a backslash-continued command.
#   --gpu_memory_utilization 0.8
#   --enable-prefix-caching