import os
import requests
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from llama_cpp import Llama
from pydantic import BaseModel
import uvicorn

# Configuration
MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_DIR = "model"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)

# Create the model directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Download the model if it doesn't exist
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model from {MODEL_URL}...")
    response = requests.get(MODEL_URL, stream=True)
    if response.status_code == 200:
        with open(MODEL_PATH, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Model downloaded successfully!")
    else:
        raise RuntimeError(f"Failed to download model: HTTP {response.status_code}")
else:
    print("Model already exists. Skipping download.")

# Initialize FastAPI
app = FastAPI(
    title="DeepSeek-R1 OpenAI-Compatible API",
    description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
    version="1.0.0",
)

# CORS configuration: allow requests from any origin
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the model (CPU-only inference: n_gpu_layers=0, 4 threads, 2048-token context)
print("Loading model...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=0,
        verbose=False,
    )
    print("Model loaded successfully!")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")

# Root endpoint serving a minimal HTML page that links to the docs
@app.get("/", response_class=HTMLResponse)
async def root():
    return """
    <html>
        <head>
        </head>
        <body>
            <h2>API Documentation</h2>
            <ul>
                <li><a href="/docs">Interactive Swagger Documentation</a></li>
                <li><a href="/redoc">ReDoc Documentation</a></li>
            </ul>
        </body>
    </html>
    """

# OpenAI-compatible request schema
class ChatCompletionRequest(BaseModel):
    prompt: str
    max_tokens: int = 300
    # temperature: float = 0.7
    # top_p: float = 0.9
    # stream: bool = False


# OpenAI-compatible response schema
class ChatCompletionResponse(BaseModel):
    model: str = MODEL_NAME
    choices: list[dict]
    usage: dict

# Chat completion endpoint (OpenAI-style route)
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completion(request: ChatCompletionRequest):
    try:
        prompt = request.prompt
        response = llm(
            prompt=prompt,
            max_tokens=request.max_tokens,
            temperature=0.7,
            top_p=0.9,
            stop=["</s>"],
        )
        return ChatCompletionResponse(
            choices=[
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": response["choices"][0]["text"].strip(),
                    },
                    "finish_reason": "stop",
                }
            ],
            # Use the token counts reported by llama.cpp rather than character lengths
            usage=response["usage"],
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Health check endpoint
@app.get("/health")
def health_check():
    return {"status": "healthy"}

if __name__ == "__main__":
    # Port 7860 is the default port exposed by Hugging Face Spaces
    uvicorn.run(app, host="0.0.0.0", port=7860)
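
Once the server is running, the endpoint can be exercised with any HTTP client. What follows is a minimal client sketch, assuming the API is reachable at http://localhost:7860 (replace the base URL with the Space's public URL in practice); the route and payload fields match the schema defined above, and the example prompt is arbitrary.

import requests

BASE_URL = "http://localhost:7860"  # assumption: local run; use your Space URL otherwise

payload = {
    "prompt": "Explain the difference between a list and a tuple in Python.",
    "max_tokens": 200,
}

# Call the chat completion route defined in app.py above
resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=120)
resp.raise_for_status()

data = resp.json()
print(data["choices"][0]["message"]["content"])
print(data["usage"])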