# Set a writable cache directory for Hugging Face Hub
export HF_HOME=/app/.cache
export XDG_CONFIG_HOME=/app/.config
mkdir -p /app/.cache
# Optionally set a USER_AGENT to identify your requests
export USER_AGENT="vllm_huggingface_space"
# Launch the vLLM server with the model tag as a positional argument
vllm serve unsloth/llama-3-8b-Instruct-bnb-4bit \
    --enable-auto-tool-choice \
    --tool-call-parser llama3_json \
    --chat-template examples/tool_chat_template_llama3.1_json.jinja \
    --quantization bitsandbytes \
    --load-format bitsandbytes \
    --dtype half \
    --enforce-eager \
    --max-model-len 8192 &
# Wait to ensure the vLLM server is fully started (adjust if needed)
sleep 10
# Start the Gradio application using python3
python3 app.py
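
For reference, app.py can be little more than a Gradio chat UI that talks to the local vLLM server over its OpenAI-compatible API. The sketch below is illustrative rather than the Space's actual code: it assumes vLLM's default port 8000 and its /health endpoint, reuses the model tag passed to vllm serve, and replaces the fixed sleep with a readiness poll; the function names, UI layout, and the placeholder "EMPTY" API key are assumptions, not taken from the original app.

# app.py -- minimal sketch of a Gradio front end for the vLLM server started above
import time
import urllib.request

import gradio as gr
from openai import OpenAI

BASE_URL = "http://localhost:8000/v1"
MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"  # must match the tag given to `vllm serve`


def wait_for_server(timeout: int = 600) -> None:
    # Poll vLLM's /health endpoint so the UI only starts once the model has loaded;
    # this is more robust than relying on a fixed `sleep 10` in the launch script.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen("http://localhost:8000/health", timeout=5):
                return
        except Exception:
            time.sleep(5)
    raise RuntimeError("vLLM server did not become healthy in time")


# vLLM ignores the API key unless the server was started with --api-key
client = OpenAI(base_url=BASE_URL, api_key="EMPTY")


def chat(message, history):
    # With type="messages" below, history arrives as a list of role/content dicts;
    # strip any extra keys before forwarding the conversation to the server.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})
    response = client.chat.completions.create(model=MODEL, messages=messages)
    return response.choices[0].message.content


if __name__ == "__main__":
    wait_for_server()
    # Spaces expects the app on port 7860 and all interfaces
    gr.ChatInterface(chat, type="messages", title="vLLM on Spaces").launch(
        server_name="0.0.0.0", server_port=7860
    )

If you keep the fixed sleep in the startup script, the in-app readiness poll still matters: loading and quantizing an 8B model can easily take longer than 10 seconds, and the first request would otherwise fail.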