# csm-1b-tts-demo/app.py
import os
import logging
from io import BytesIO
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import torch
import torchaudio
from huggingface_hub import hf_hub_download
# Set watermark key to avoid errors in the model's watermarking (use the public key from the CSM GitHub repo)
os.environ["WATERMARK_KEY"] = os.environ.get("WATERMARK_KEY", "212 211 146 56 201")
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("csm_app")
# Initialize FastAPI
app = FastAPI()
# Request model for input payload
class SynthesisRequest(BaseModel):
    text: str
# Load model at startup
device = "cuda" if torch.cuda.is_available() else "cpu"
try:
    logger.info("Downloading CSM-1B model from Hugging Face...")
    model_path = hf_hub_download(repo_id="sesame/csm-1b", filename="ckpt.pt")
    # Import the model loader from the CSM repository (requires generator.py from the model's codebase)
    from generator import load_csm_1b
    generator = load_csm_1b(model_path, device)
    logger.info(f"CSM-1B model loaded on {device}.")
except Exception as e:
    logger.error("Failed to load the CSM-1B model", exc_info=True)
    raise RuntimeError("Model loading failed") from e
# Optional lock for thread-safe generation (ensure one generation at a time)
from threading import Lock
_generate_lock = Lock()
@app.post("/synthesize")
def synthesize(request: SynthesisRequest):
    """Synthesize speech from text and return WAV audio."""
    text = request.text
    if not text or not text.strip():
        logger.error("Received empty text input")
        raise HTTPException(status_code=400, detail="Text input is empty.")
    logger.info(f"Received synthesis request (text length={len(text)} chars).")
    try:
        with _generate_lock:
            # Generate audio tensor from text
            audio = generator.generate(
                text=text,
                speaker=0,
                context=[],
                max_audio_length_ms=10000
            )
        # Move to CPU (if on GPU) and prepare WAV bytes
        audio = audio.cpu()
        sample_rate = getattr(generator, "sample_rate", 24000)  # model's output rate; CSM-1B's Mimi codec produces 24 kHz audio
        wav_bytes = BytesIO()
        torchaudio.save(wav_bytes, audio.unsqueeze(0), sample_rate, format="wav")
        wav_bytes.seek(0)
        logger.info("Audio generated successfully, returning WAV file.")
    except Exception:
        logger.error("Error during audio generation", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error during synthesis.")
    # Stream the WAV audio back to the client
    return StreamingResponse(wav_bytes, media_type="audio/wav")
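
# A minimal local entrypoint and example request, assuming the container's CMD does not
# already launch uvicorn and that port 8000 is free; the port and the curl invocation
# below are illustrative, not part of the deployed configuration.
#
#   curl -X POST http://localhost:8000/synthesize \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Hello from CSM-1B"}' \
#        --output speech.wav
if __name__ == "__main__":
    import uvicorn
    # Bind to all interfaces so the server is reachable from outside a container
    uvicorn.run(app, host="0.0.0.0", port=8000)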