import os

import requests
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from llama_cpp import Llama
from pydantic import BaseModel
import uvicorn

# Configuration
MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_DIR = "model"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)

# Create the model directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Download the model if it doesn't exist
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model from {MODEL_URL}...")
    response = requests.get(MODEL_URL, stream=True)
    if response.status_code == 200:
        with open(MODEL_PATH, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Model downloaded successfully!")
    else:
        raise RuntimeError(f"Failed to download model: HTTP {response.status_code}")
else:
    print("Model already exists. Skipping download.")

# Initialize FastAPI
app = FastAPI(
    title="DeepSeek-R1 OpenAI-Compatible API",
    description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
    version="1.0.0",
)

# CORS configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the model
print("Loading model...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=0,
        verbose=False,
    )
    print("Model loaded successfully!")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")


# Root endpoint with documentation
@app.get("/", response_class=HTMLResponse)
async def root():
    return f"""

    <html>
        <head>
            <title>DeepSeek-R1 OpenAI-Compatible API</title>
        </head>
        <body>
            <h1>DeepSeek-R1 OpenAI-Compatible API</h1>
            <p><a href="/docs">API Documentation</a></p>
        </body>
    </html>

""" # OpenAI-Compatible Request Schema class ChatCompletionRequest(BaseModel): prompt: str max_tokens: int = 300 # temperature: float = 0.7 # top_p: float = 0.9 # stream: bool = False # OpenAI-Compatible Response Schema class ChatCompletionResponse(BaseModel): model: str = MODEL_NAME choices: list[dict] usage: dict @app.post("/v1/chat/completions") async def chat_completion(request: ChatCompletionRequest): try: prompt = request.prompt response = llm( prompt=prompt, max_tokens=request.max_tokens, temperature=0.7, top_p=0.9, stop=[""], ) return ChatCompletionResponse( choices=[ { "index": 0, "message": { "role": "assistant", "content": response["choices"][0]["text"].strip(), }, "finish_reason": "stop", } ], usage={ "prompt_tokens": len(prompt), "completion_tokens": len(response["choices"][0]["text"]), "total_tokens": len(prompt) + len(response["choices"][0]["text"]), }, ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @app.get("/health") def health_check(): return {"status": "healthy"} if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=7860)