#!/bin/bash

# Set writable cache/config directories for Hugging Face Hub
export HF_HOME=/app/.cache
export XDG_CONFIG_HOME=/app/.config
mkdir -p /app/.cache /app/.config

# Optionally set a USER_AGENT to identify your requests
export USER_AGENT="vllm_huggingface_space"

# Launch the vLLM server in the background, with the model tag as a positional argument
vllm serve unsloth/llama-3-8b-Instruct-bnb-4bit \
  --enable-auto-tool-choice \
  --tool-call-parser llama3_json \
  --chat-template examples/tool_chat_template_llama3.1_json.jinja \
  --quantization bitsandbytes \
  --load-format bitsandbytes \
  --dtype half \
  --enforce-eager \
  --max-model-len 8192 &

# Wait to ensure the vLLM server is fully started (adjust if needed)
sleep 10

# Start the Gradio application using python3
python3 app.py
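
# Note: a fixed `sleep 10` can be flaky on slow cold starts (large model
# downloads, quantized weight loading). A more robust alternative is to poll
# vLLM's /health endpoint, which returns 200 once the server is ready. The
# sketch below assumes the default port 8000 and that curl is available in
# the image; if those hold, it can replace the `sleep 10` line above.
#
# until curl -sf http://localhost:8000/health > /dev/null; do
#   echo "Waiting for vLLM server to become ready..."
#   sleep 2
# done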