Spaces:

Hameed13
/

my_news_podcast

Runtime error

App Files Files Community

Hameed13 committed 24 days ago

Commit

97d03bb

1 Parent(s): d985a8f

first commit

Browse files

Files changed (4) hide show

.gitignore +55 -0
client.py +280 -0
main.py +220 -0
requirements.txt +13 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,55 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# Model files (these will be downloaded at runtime)
+models/*.ckpt
+models/*.yaml
+models/*.pt
+models/*.bin
+# Audio files (generated content)
+audio_files/
+*.wav
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+# Virtual environments
+venv/
+env/
+ENV/
+.env
+.venv
+# IDE specific files
+.idea/
+.vscode/
+*.swp
+*.swo
+# Jupyter Notebook
+.ipynb_checkpoints
+# OS specific files
+.DS_Store
+Thumbs.db
+# Logs
+*.log
+logs/
+# Local configuration
+config.local.py
+.env.local
+# Temporary files
+tmp/
+temp/
+# Cache directories
+.cache/
+.pytest_cache/

client.py ADDED Viewed

	@@ -0,0 +1,280 @@

+import streamlit as st
+import requests
+import base64
+from io import BytesIO
+import pandas as pd
+# Set page config (browser tab title, icon, and wide layout)
+st.set_page_config(
+    page_title="Nigerian Text-to-Speech",
+    page_icon="🎙️",
+    layout="wide"
+)
+# Define the available voices and languages.
+# Voices are grouped under the same "Female"/"Male" labels that the gender
+# radio button in the Basic TTS tab uses to index this dict.
+AVAILABLE_VOICES = {
+    "Female": ["zainab", "idera", "regina", "chinenye", "joke", "remi"],
+    "Male": ["jude", "tayo", "umar", "osagie", "onye", "emma"]
+}
+AVAILABLE_LANGUAGES = ["english", "yoruba", "igbo", "hausa"]
+# The backend base URL is not hard-coded: the user types (or pastes) the ngrok
+# URL shown in the Colab notebook into this text box at runtime.
+# Example value: https://a1b2-34-56-78-90.ngrok.io
+# The field starts empty, so API_BASE_URL is "" until the user provides one.
+API_BASE_URL = st.text_input(
+    "Enter the ngrok URL from Colab (e.g., https://a1b2-34-56-78-90.ngrok.io)",
+    value="",
+    key="api_url"
+)
+# Derive the TTS endpoint from the base URL
+if API_BASE_URL:
+    API_TTS_ENDPOINT = f"{API_BASE_URL}/tts"
+    # Test connection to backend
+    try:
+        health_check = requests.get(f"{API_BASE_URL}")
+        if health_check.status_code == 200:
+            st.success(f"✅ Connected to backend API successfully!")
+        else:
+            st.warning(f"⚠️ Backend API returned status code {health_check.status_code}")
+    except Exception as e:
+        st.error(f"❌ Cannot connect to backend API: {str(e)}")
+else:
+    st.warning("⚠️ Please enter the ngrok URL from your Colab notebook to continue")
+# App title and description shown above the tabs
+st.title("Nigerian Text-to-Speech")
+st.markdown("""
+Convert text to speech with authentic Nigerian accents. This app uses YarnGPT, a text-to-speech model
+that generates natural Nigerian-accented speech in English, Yoruba, Igbo, and Hausa.
+""")
+# Create tabs for different functions.
+# tab1, tab2 and tab3 are entered with `with` by the three sections that
+# follow (single conversion, CSV batch conversion, and the About page).
+tab1, tab2, tab3 = st.tabs(["Basic TTS", "Batch Processing", "About"])
+# Tab 1: Basic TTS
+with tab1:
+    col1, col2 = st.columns([3, 1])
+    with col1:
+        # Text input
+        text_input = st.text_area(
+            "Enter text to convert to speech",
+            "Welcome to Nigeria, the giant of Africa. Our diverse cultures and languages make us unique.",
+            height=150
+        )
+        # Generate button
+        generate_button = st.button("Generate Audio", type="primary", disabled=not API_BASE_URL)
+    with col2:
+        # Options
+        language = st.selectbox("Language", AVAILABLE_LANGUAGES)
+        gender = st.radio("Gender", ["Female", "Male"])
+        voice = st.selectbox("Voice", AVAILABLE_VOICES[gender])
+        st.info(f"Selected voice: **{voice}** ({gender.lower()})")
+    # Generate audio when button is clicked
+    if generate_button and text_input and API_BASE_URL:
+        with st.spinner("Generating audio... (This may take a minute as the audio is processed through Colab)"):
+            try:
+                # Call the API with timeout increased
+                response = requests.post(
+                    API_TTS_ENDPOINT,
+                    json={"text": text_input, "language": language, "voice": voice},
+                    timeout=100000  # Increase timeout to 2 minutes
+                )
+                if response.status_code == 200:
+                    # Get response data
+                    audio_data = response.json()
+                    # Save info in session state
+                    st.session_state.last_text = text_input
+                    st.session_state.last_voice = voice
+                    st.session_state.last_language = language
+                    # Display success and audio player
+                    st.success("Audio generated successfully!")
+                    st.markdown(f"Voice: **{voice}** | Language: **{language}**")
+                    # Handle base64-encoded audio
+                    if "audio_base64" in audio_data:
+                        audio_bytes = base64.b64decode(audio_data["audio_base64"])
+                        audio_stream = BytesIO(audio_bytes)
+                        # Play audio directly from the stream
+                        st.audio(audio_stream, format="audio/wav")
+                    else:
+                        # Fall back to URL method (legacy support)
+                        audio_url = f"{API_BASE_URL}{audio_data['audio_url']}"
+                        st.warning("Using legacy URL-based audio (may not work)")
+                        st.code(audio_url, language="text")
+                        st.audio(audio_url, format="audio/wav")
+                else:
+                    st.error(f"Error: {response.status_code} - {response.text}")
+            except Exception as e:
+                st.error(f"Error generating audio: {str(e)}")
+                st.info(f"Make sure the backend API is running and accessible at {API_BASE_URL}")
+# Tab 2: Batch Processing
+with tab2:
+    st.header("Batch Text-to-Speech Conversion")
+    st.markdown("""
+    Process multiple text entries at once. Upload a CSV file with the following columns:
+    - `text`: The text to convert to speech
+    - `language` (optional): Language for the text (english, yoruba, igbo, hausa)
+    - `voice` (optional): Voice name to use
+    """)
+    # File uploader
+    uploaded_file = st.file_uploader("Upload CSV file", type="csv")
+    if uploaded_file and API_BASE_URL:
+        # Process the file
+        try:
+            df = pd.read_csv(uploaded_file)
+            if "text" not in df.columns:
+                st.error("CSV file must contain a 'text' column")
+            else:
+                st.dataframe(df.head())
+                # Default values
+                default_language = st.selectbox("Default language", AVAILABLE_LANGUAGES)
+                default_voice = st.selectbox("Default voice", AVAILABLE_VOICES["Female"] + AVAILABLE_VOICES["Male"])
+                if st.button("Process Batch", disabled=not API_BASE_URL):
+                    # Create a container for audio files
+                    audio_container = st.container()
+                    progress_bar = st.progress(0)
+                    status_text = st.empty()
+                    # Process each row
+                    results = []
+                    audio_files = []  # Store audio data for playback
+                    for i, row in enumerate(df.itertuples()):
+                        # Update progress
+                        progress = int((i + 1) / len(df) * 100)
+                        progress_bar.progress(progress)
+                        status_text.text(f"Processing item {i+1} of {len(df)}...")
+                        # Get text and parameters
+                        text = row.text
+                        lang = getattr(row, 'language', default_language) if hasattr(row, 'language') else default_language
+                        voice_name = getattr(row, 'voice', default_voice) if hasattr(row, 'voice') else default_voice
+                        try:
+                            # Make API call with increased timeout
+                            response = requests.post(
+                                API_TTS_ENDPOINT,
+                                json={"text": text, "language": lang, "voice": voice_name},
+                                timeout=120  # Increase timeout to 2 minutes
+                            )
+                            if response.status_code == 200:
+                                audio_data = response.json()
+                                # Handle base64-encoded audio
+                                if "audio_base64" in audio_data:
+                                    audio_bytes = base64.b64decode(audio_data["audio_base64"])
+                                    audio_files.append({
+                                        "index": i,
+                                        "bytes": audio_bytes,
+                                        "text": text,
+                                        "voice": voice_name,
+                                        "language": lang
+                                    })
+                                    status = "Success"
+                                else:
+                                    # Fall back to URL method (legacy support)
+                                    audio_url = f"{API_BASE_URL}{audio_data['audio_url']}"
+                                    status = "Success (URL mode)"
+                                # Add to results
+                                results.append({
+                                    "text": text[:50] + "..." if len(text) > 50 else text,
+                                    "language": lang,
+                                    "voice": voice_name,
+                                    "status": status
+                                })
+                            else:
+                                results.append({
+                                    "text": text[:50] + "..." if len(text) > 50 else text,
+                                    "language": lang,
+                                    "voice": voice_name,
+                                    "status": f"Error: {response.status_code}"
+                                })
+                        except Exception as e:
+                            results.append({
+                                "text": text[:50] + "..." if len(text) > 50 else text,
+                                "language": lang,
+                                "voice": voice_name,
+                                "status": f"Error: {str(e)}"
+                            })
+                    # Show results
+                    st.success("Batch processing completed!")
+                    results_df = pd.DataFrame(results)
+                    st.dataframe(results_df)
+                    # Display audio players for successful generations
+                    with audio_container:
+                        st.subheader("Generated Audio Files")
+                        for audio_item in audio_files:
+                            st.markdown(f"**{audio_item['index']+1}. {audio_item['text'][:50]}...** ({audio_item['voice']}, {audio_item['language']})")
+                            audio_stream = BytesIO(audio_item["bytes"])
+                            st.audio(audio_stream, format="audio/wav")
+                            st.markdown("---")
+        except Exception as e:
+            st.error(f"Error processing file: {str(e)}")
+    elif not API_BASE_URL:
+        st.warning("Please enter the ngrok URL first to enable batch processing")
+# Tab 3: About
+# Static informational content only; nothing in this tab calls the backend.
+with tab3:
+    st.header("About YarnGPT")
+    # Two side-by-side columns of equal width
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        st.markdown("""
+        ### Features
+        - 🗣️ 12 preset voices (6 male, 6 female)
+        - 🎯 Trained on 2000+ hours of Nigerian audio
+        - 🔊 24kHz high-quality audio output
+        - 📝 Support for long-form text
+        ### Model Details
+        - Base: HuggingFaceTB/SmolLM2-360M
+        - Training: 5 epochs on A100 GPU
+        - Data: Nigerian movies, podcasts, and open-source audio
+        """)
+    with col2:
+        # NOTE: this voice list is written out by hand and duplicates
+        # AVAILABLE_VOICES; keep the two in sync when voices change.
+        st.markdown("""
+        ### Available Voices
+        - **Female**: zainab, idera, regina, chinenye, joke, remi
+        - **Male**: jude, tayo, umar, osagie, onye, emma
+        ### Limitations
+        - English to Nigerian-accented English primarily
+        - May not capture all Nigerian accent variations
+        - Training data includes auto-generated content
+        """)
+    st.markdown("""
+    ### Credits
+    - YarnGPT was created by Saheed Abdulrahman, a Unilag student
+    - Model is available as open source on [GitHub](https://github.com/saheedniyi02/yarngpt)
+    - Web demo: [https://yarngpt.co/](https://yarngpt.co/)
+    """)
+# Footer (rendered below the tabs, on every tab)
+st.markdown("---")
+st.markdown("Developed for a Nigerian News App Podcaster API | Powered by YarnGPT")

main.py ADDED Viewed

	@@ -0,0 +1,220 @@

+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi.responses import StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+import os
+import uuid
+import torch
+import torchaudio
+import base64
+from io import BytesIO
+from transformers import AutoModelForCausalLM
+import sys
+import subprocess
+from datetime import datetime, timedelta
+app = FastAPI(title="Nigerian TTS API")
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, set this to your Next.js domain
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Initialize necessary directories
+os.makedirs("audio_files", exist_ok=True)
+os.makedirs("models", exist_ok=True)
+# Check if YarnGPT is installed, if not install it
+try:
+    import yarngpt
+    from yarngpt.audiotokenizer import AudioTokenizerV2
+except ImportError:
+    print("Installing YarnGPT and dependencies...")
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/saheedniyi02/yarngpt.git"])
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "outetts", "uroman", "transformers", "torchaudio"])
+    from yarngpt.audiotokenizer import AudioTokenizerV2
+# Model configuration
+tokenizer_path = "saheedniyi/YarnGPT2"
+# Check if model files exist, if not download them
+wav_tokenizer_config_path = "./models/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
+wav_tokenizer_model_path = "./models/wavtokenizer_large_speech_320_24k.ckpt"
+if not os.path.exists(wav_tokenizer_config_path):
+    print("Downloading model config file...")
+    subprocess.check_call([
+        "wget", "-O", wav_tokenizer_config_path,
+        "https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
+    ])
+if not os.path.exists(wav_tokenizer_model_path):
+    print("Downloading model checkpoint file...")
+    subprocess.check_call([
+        "wget", "-O", wav_tokenizer_model_path,
+        "https://drive.google.com/uc?id=1-ASeEkrn4HY49yZWHTASgfGFNXdVnLTt&export=download"
+    ])
+print("Loading YarnGPT model and tokenizer...")
+audio_tokenizer = AudioTokenizerV2(
+    tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path
+)
+model = AutoModelForCausalLM.from_pretrained(tokenizer_path, torch_dtype="auto").to(audio_tokenizer.device)
+print("Model loaded successfully!")
+# Available voices and languages.
+# Both are returned by the health-check endpoint and used by /tts to validate
+# incoming requests. The gender keys are lowercase here.
+AVAILABLE_VOICES = {
+    "female": ["zainab", "idera", "regina", "chinenye", "joke", "remi"],
+    "male": ["jude", "tayo", "umar", "osagie", "onye", "emma"]
+}
+AVAILABLE_LANGUAGES = ["english", "yoruba", "igbo", "hausa"]
+# Input validation model
+class TTSRequest(BaseModel):
+    """Request body accepted by the /tts and /stream-audio endpoints."""
+    # Text to be spoken (required)
+    text: str
+    # Language name; defaults to "english" when omitted
+    language: str = "english"
+    # Voice (speaker) name; defaults to "idera" when omitted
+    voice: str = "idera"
+# Output model with base64-encoded audio
+class TTSResponse(BaseModel):
+    """Response body of /tts: the generated audio plus the echoed request fields."""
+    audio_base64: str  # Base64-encoded audio data
+    audio_url: str     # Keep for backward compatibility
+    # The three fields below repeat the values from the request
+    text: str
+    voice: str
+    language: str
+@app.get("/")
+async def root():
+    """API health check and info"""
+    return {
+        "status": "ok",
+        "message": "Nigerian TTS API is running",
+        "available_languages": AVAILABLE_LANGUAGES,
+        "available_voices": AVAILABLE_VOICES
+    }
+@app.post("/tts", response_model=TTSResponse)
+async def text_to_speech(request: TTSRequest, background_tasks: BackgroundTasks):
+    """Convert text to Nigerian-accented speech"""
+    # Validate inputs
+    if request.language not in AVAILABLE_LANGUAGES:
+        raise HTTPException(status_code=400, detail=f"Language must be one of {AVAILABLE_LANGUAGES}")
+    all_voices = AVAILABLE_VOICES["female"] + AVAILABLE_VOICES["male"]
+    if request.voice not in all_voices:
+        raise HTTPException(status_code=400, detail=f"Voice must be one of {all_voices}")
+    # Generate unique filename
+    audio_id = str(uuid.uuid4())
+    output_path = f"audio_files/{audio_id}.wav"
+    try:
+        # Create prompt and generate audio
+        prompt = audio_tokenizer.create_prompt(request.text, lang=request.language, speaker_name=request.voice)
+        input_ids = audio_tokenizer.tokenize_prompt(prompt)
+        output = model.generate(
+            input_ids=input_ids,
+            temperature=0.1,
+            repetition_penalty=1.1,
+            max_length=4000,
+        )
+        codes = audio_tokenizer.get_codes(output)
+        audio = audio_tokenizer.get_audio(codes)
+        # Save audio file
+        torchaudio.save(output_path, audio, sample_rate=24000)
+        # Read the file and encode as base64
+        with open(output_path, "rb") as audio_file:
+            audio_bytes = audio_file.read()
+            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+        # Clean up old files after a while
+        background_tasks.add_task(cleanup_old_files)
+        return TTSResponse(
+            audio_base64=audio_base64,
+            audio_url=f"/audio/{audio_id}.wav",  # Keep for compatibility
+            text=request.text,
+            voice=request.voice,
+            language=request.language
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error generating audio: {str(e)}")
+# File serving endpoint for direct audio access
+@app.get("/audio/{filename}")
+async def get_audio(filename: str):
+    file_path = f"audio_files/{filename}"
+    if not os.path.exists(file_path):
+        raise HTTPException(status_code=404, detail="Audio file not found")
+    def iterfile():
+        with open(file_path, "rb") as audio_file:
+            yield from audio_file
+    return StreamingResponse(iterfile(), media_type="audio/wav")
+# Endpoint to stream audio directly from base64 (useful for debugging)
+@app.post("/stream-audio")
+async def stream_audio(request: TTSRequest):
+    """Stream audio directly without saving to disk"""
+    try:
+        # Create prompt and generate audio
+        prompt = audio_tokenizer.create_prompt(request.text, lang=request.language, speaker_name=request.voice)
+        input_ids = audio_tokenizer.tokenize_prompt(prompt)
+        output = model.generate(
+            input_ids=input_ids,
+            temperature=0.1,
+            repetition_penalty=1.1,
+            max_length=4000,
+        )
+        codes = audio_tokenizer.get_codes(output)
+        audio = audio_tokenizer.get_audio(codes)
+        # Create BytesIO object
+        buffer = BytesIO()
+        torchaudio.save(buffer, audio, sample_rate=24000, format="wav")
+        buffer.seek(0)
+        return StreamingResponse(buffer, media_type="audio/wav")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error generating audio: {str(e)}")
+# Cleanup function to remove old files
+def cleanup_old_files():
+    """Delete audio files older than 6 hours to manage disk space"""
+    try:
+        now = datetime.now()
+        audio_dir = "audio_files"
+        for filename in os.listdir(audio_dir):
+            if not filename.endswith(".wav"):
+                continue
+            file_path = os.path.join(audio_dir, filename)
+            file_mod_time = datetime.fromtimestamp(os.path.getmtime(file_path))
+            # Delete files older than 6 hours
+            if now - file_mod_time > timedelta(hours=6):
+                os.remove(file_path)
+                print(f"Deleted old audio file: {filename}")
+    except Exception as e:
+        print(f"Error cleaning up old files: {e}")
+# For running locally with uvicorn
+if __name__ == "__main__":
+    import uvicorn
+    port = int(os.environ.get("PORT", 8000))
+    uvicorn.run(app, host="0.0.0.0", port=port)

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+fastapi==0.104.1
+uvicorn==0.24.0
+torch==2.1.0
+torchaudio==2.1.0
+transformers==4.35.0
+pydantic==2.4.2
+python-multipart==0.0.6
+wget
+gdown
+numpy>=1.20.0
+requests>=2.27.1
+outetts
+uroman