Deepseek-R1-1.5b-API

Sleeping

App Files Files Community

Deepseek-R1-1.5b-API / app.py

harin-khakhi

added link to docs

a5bd21f about 1 month ago

raw

history blame contribute delete

3.66 kB

	import os
	import requests
	from fastapi import FastAPI, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import HTMLResponse
	from llama_cpp import Llama
	from pydantic import BaseModel
	import uvicorn

	# Model location settings
	# Local directory (relative to the working directory) that holds the model file.
	MODEL_DIR = "model"
	# File name of the GGUF model file, both remotely and on disk.
	MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
	# Remote address the model file is fetched from when it is not on disk yet.
	MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
	# Full local path of the model file.
	MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)

	# Make sure the target directory is present; a pre-existing one is fine.
	os.makedirs(MODEL_DIR, exist_ok=True)

	# Download the model if it doesn't exist
	if not os.path.exists(MODEL_PATH):
	    print(f"Downloading model from {MODEL_URL}...")
	    # Stream into a temporary sibling file and rename it only once the whole
	    # body has been written. Writing straight to MODEL_PATH would leave a
	    # truncated file behind after an interrupted download, and the existence
	    # check above would then treat it as a complete model on the next start.
	    partial_path = MODEL_PATH + ".part"
	    try:
	        # (connect, read) timeouts keep startup from hanging forever on a
	        # stalled connection; the context manager releases the connection
	        # even when an error is raised part-way through.
	        with requests.get(MODEL_URL, stream=True, timeout=(10, 60)) as response:
	            if response.status_code != 200:
	                raise RuntimeError(
	                    f"Failed to download model: HTTP {response.status_code}"
	                )
	            with open(partial_path, "wb") as f:
	                for chunk in response.iter_content(chunk_size=8192):
	                    f.write(chunk)
	        # Same-directory rename: the final path appears only when complete.
	        os.replace(partial_path, MODEL_PATH)
	    finally:
	        # After a successful rename the partial file no longer exists; on any
	        # failure this discards the incomplete data.
	        if os.path.exists(partial_path):
	            os.remove(partial_path)
	    print("Model downloaded successfully!")
	else:
	    print("Model already exists. Skipping download.")

	# Application object, created with its title, description and version.
	_APP_INFO = {
	    "title": "DeepSeek-R1 OpenAI-Compatible API",
	    "description": "OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
	    "version": "1.0.0",
	}
	app = FastAPI(**_APP_INFO)

	# Cross-origin policy: a wildcard is configured for origins, methods and headers.
	_CORS_POLICY = {
	    "allow_origins": ["*"],
	    "allow_methods": ["*"],
	    "allow_headers": ["*"],
	}
	app.add_middleware(CORSMiddleware, **_CORS_POLICY)

	# Load the model
	# This runs once at import time; the resulting `llm` object is the one the
	# completion endpoint calls for every request.
	print("Loading model...")
	try:
	    # Only the constructor is guarded, so the RuntimeError below always
	    # refers to an actual load failure.
	    llm = Llama(
	        model_path=MODEL_PATH, n_ctx=2048, n_threads=4, n_gpu_layers=0, verbose=False
	    )
	except Exception as e:
	    # Chain explicitly so the traceback shows the underlying error as the
	    # direct cause of the startup failure.
	    raise RuntimeError(f"Failed to load model: {e}") from e
	else:
	    print("Model loaded successfully!")


	# Root endpoint with documentation
	@app.get("/", response_class=HTMLResponse)
	async def root():
	    """Return a small HTML page linking to the /docs and /redoc documentation."""
	    # Plain string on purpose: nothing is interpolated, and an f-string would
	    # parse any literal "{" or "}" added to the markup later as a
	    # replacement field.
	    return """
	<html>
	<head>
	</head>
	<body>
	<h2>API Documentation</h2>
	<ul>
	<li><a href="/docs">Interactive Swagger Documentation</a></li>
	<li><a href="/redoc">ReDoc Documentation</a></li>
	</ul>
	</body>
	</html>
	"""


	# OpenAI-Compatible Request Schema
	# Request body accepted by POST /v1/chat/completions.
	# NOTE(review): the body carries a single `prompt` string rather than a list
	# of chat messages -- confirm this is the shape clients are expected to send.
	class ChatCompletionRequest(BaseModel):
	# Text handed to the model as the prompt.
	prompt: str
	# Forwarded to the model call as `max_tokens`; defaults to 300.
	max_tokens: int = 300
	# Disabled fields: the endpoint currently passes fixed temperature=0.7 and
	# top_p=0.9 to the model instead of reading them from the request.
	# temperature: float = 0.7
	# top_p: float = 0.9
	# stream: bool = False


	# OpenAI-Compatible Response Schema
	# Response body returned by POST /v1/chat/completions.
	class ChatCompletionResponse(BaseModel):
	# Model identifier; defaults to the model file name (MODEL_NAME).
	model: str = MODEL_NAME
	# Generated results; the endpoint fills in one dict with the keys
	# "index", "message" and "finish_reason".
	choices: list[dict]
	# Usage counters under the keys "prompt_tokens", "completion_tokens"
	# and "total_tokens".
	usage: dict


	@app.post("/v1/chat/completions")
	async def chat_completion(request: ChatCompletionRequest):
	    """Generate a completion for ``request.prompt`` and return it as one assistant message.

	    Args:
	        request: Prompt text and the maximum number of tokens to generate.

	    Returns:
	        A ``ChatCompletionResponse`` holding a single choice, the finish
	        reason reported by the model, and the token usage counts.

	    Raises:
	        HTTPException: Status 500 carrying the error text if generation fails.
	    """
	    # NOTE(review): the model call below is synchronous inside an async
	    # handler, so other requests presumably wait while it runs -- confirm
	    # whether that is acceptable before changing the handler type.
	    try:
	        completion = llm(
	            prompt=request.prompt,
	            max_tokens=request.max_tokens,
	            temperature=0.7,
	            top_p=0.9,
	            stop=["</s>"],
	        )
	        first_choice = completion["choices"][0]
	        # Use the counts measured by the model runtime: len() of the prompt
	        # or of the generated text counts characters, not tokens.
	        token_usage = completion["usage"]

	        return ChatCompletionResponse(
	            choices=[
	                {
	                    "index": 0,
	                    "message": {
	                        "role": "assistant",
	                        "content": first_choice["text"].strip(),
	                    },
	                    # Pass through the real reason so callers can tell a
	                    # natural stop from a cut-off at max_tokens; fall back
	                    # to "stop" when the runtime reports none.
	                    "finish_reason": first_choice.get("finish_reason") or "stop",
	                }
	            ],
	            usage={
	                "prompt_tokens": token_usage["prompt_tokens"],
	                "completion_tokens": token_usage["completion_tokens"],
	                "total_tokens": token_usage["total_tokens"],
	            },
	        )
	    except Exception as e:
	        # Boundary handler: surface any failure as an HTTP 500 while keeping
	        # the original exception attached as the cause.
	        raise HTTPException(status_code=500, detail=str(e)) from e


	# Health endpoint: always answers with the same fixed status payload.
	@app.get("/health")
	def health_check():
	    status_report = {"status": "healthy"}
	    return status_report


	# Start the uvicorn server only when this file is executed as a script.
	if __name__ == "__main__":
	    bind_host, bind_port = "0.0.0.0", 7860
	    uvicorn.run(app, host=bind_host, port=bind_port)