Spaces:

Lesterchia174
/

fpoce_Multilingual_Translator_with_Speech_Support

Running

fpoce_Multilingual_Translator_with_Speech_Support

File size: 11,385 Bytes

import gradio as gr
import os
import tempfile
import whisper
import re
from groq import Groq
from gtts import gTTS

# Load the local Whisper model for speech-to-text (done once, at import time)
whisper_model = whisper.load_model("base")

# Instantiate the Groq client with the API key taken from the environment.
# SECURITY: a literal API key used to be committed here as the os.getenv()
# fallback. Credentials must never live in source control; that key should be
# treated as compromised and revoked. GROQ_API_KEY must now be set in the
# environment (e.g. as a deployment secret).
# NOTE(review): with no key available the Groq SDK is expected to raise when
# the client is constructed -- confirm against the installed SDK version.
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Display name -> language code handed to gTTS for speech synthesis.
# Malay is deliberately split into its Malaysian and Indonesian variants.
LANGUAGE_CODES = {
    "English": "en",
    "Chinese": "zh",
    "Thai": "th",
    "Malaysian Malay": "ms",  # Bahasa Malaysia (ms)
    "Indonesian Malay": "id",  # Bahasa Indonesia (id)
    "Korean": "ko",
    "Japanese": "ja",
    "Spanish": "es",
    "German": "de",
    "Hindi": "hi",
    "Urdu": "ur",
    "French": "fr",
    "Russian": "ru",
    "Tagalog": "tl",
    "Arabic": "ar",
    "Myanmar": "my",
    "Vietnamese": "vi",
}

# Supported languages, in display order (the insertion order of the mapping
# above), so the two collections cannot drift apart.
SUPPORTED_LANGUAGES = list(LANGUAGE_CODES)

# Available LLM models: label shown in the model dropdown -> model ID sent to
# the Groq chat-completions API.
AVAILABLE_MODELS = {
    "DeepSeek-R1 llama 70B": "deepseek-r1-distill-llama-70b",
    "Qwen 32B": "qwen-qwq-32b",
    "Llama-3.3 70B": "llama-3.3-70b-versatile",
    "Llama-4 Scout 17B": "meta-llama/llama-4-scout-17b-16e-instruct",
    "Llama-4 Maverick 17B": "meta-llama/llama-4-maverick-17b-128e-instruct",
}

def transcribe_audio_locally(audio):
    """Transcribe an audio input with the module-level Whisper model.

    Args:
        audio: Either a value that ``whisper_model.transcribe`` accepts
            directly, or a dict carrying that value under the ``"name"``
            key. ``None`` means no audio was supplied.

    Returns:
        The ``"text"`` entry of the Whisper result; ``""`` when ``audio`` is
        ``None``; or a message starting with ``"Error transcribing audio:"``
        when anything raises (the error is also printed).
    """
    if audio is None:
        return ""

    try:
        # Dict-shaped inputs carry the file location under "name"
        if isinstance(audio, dict) and "name" in audio:
            audio_path = audio["name"]
        else:
            audio_path = audio
        return whisper_model.transcribe(audio_path)["text"]
    except Exception as e:
        print(f"Error transcribing audio locally: {e}")
        return f"Error transcribing audio: {str(e)}"

# Lines that open with a typical "thinking out loud" phrase; such lines are
# blanked out of the model response. Compiled once instead of on every call.
_THINKING_LINE_RE = re.compile(
    r"^\s*(?:Let me think|I need to|First,|Okay, so|Hmm,|Let's break this down).*$",
    re.MULTILINE,
)


def translate_text(input_text, input_lang, output_langs, model_name):
    """Translate text using Groq's API with the selected model.

    Args:
        input_text: The text to translate.
        input_lang: Name of the language ``input_text`` is written in.
        output_langs: Names of the target languages.
        model_name: Key into ``AVAILABLE_MODELS``; unknown names fall back to
            the Llama-4 Maverick model ID.

    Returns:
        The model's response as a string, with ``<think>`` blocks, ``**``
        markers and thinking-style lines removed and surrounding whitespace
        stripped. The prompts ask for one ``Language: translation`` line per
        target language. Returns ``""`` when there is nothing to translate,
        and a message starting with ``"Error:"`` when the request fails (the
        error is also printed).
    """
    if not input_text or not output_langs:
        # Always a string, so every return path has the same type
        return ""

    try:
        # Get the actual model ID from our dictionary
        model_id = AVAILABLE_MODELS.get(model_name, "meta-llama/llama-4-maverick-17b-128e-instruct")

        # The system prompt must agree with the user prompt below: both ask
        # for language-name prefixes, which the response parser relies on.
        system_prompt = (
            "You are a translation assistant that provides direct, accurate translations.\n"
            "Do NOT include any thinking, reasoning, or explanations in your response.\n"
            "Put each translation on its own line, prefixed with the target language name and a colon.\n"
            "Do NOT use any special formatting like asterisks (**) or other markdown.\n"
            "Respond with ONLY those translation lines."
        )

        user_prompt = f"Translate this {input_lang} text: '{input_text}' into the following languages: {', '.join(output_langs)}. Provide each translation on a separate line with the language name as a prefix. Do not use any special formatting or markdown."

        response = groq_client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )

        # Guard against a response that carries no text content
        translation_text = response.choices[0].message.content or ""

        # Remove text between <think> tags if the model leaked its reasoning
        translation_text = re.sub(r'<think>.*?</think>', '', translation_text, flags=re.DOTALL)

        # Remove bold markers
        translation_text = translation_text.replace('**', '')

        # Blank out lines starting with common thinking patterns
        translation_text = _THINKING_LINE_RE.sub('', translation_text)

        # Strip last, so whitespace left behind by the removals is dropped too
        return translation_text.strip()
    except Exception as e:
        print(f"Error translating text: {e}")
        return f"Error: {str(e)}"

def synthesize_speech(text, lang):
    """Generate an MP3 file with gTTS speech for ``text``.

    Args:
        text: The text to speak. Falsy values produce no audio.
        lang: A key of ``LANGUAGE_CODES``; unknown names fall back to
            English (``"en"``).

    Returns:
        The path of a newly created temporary ``.mp3`` file, or ``None`` when
        ``text`` is empty or synthesis fails (the error is printed). The file
        is not deleted automatically; the caller owns it.
    """
    if not text:
        return None

    output_path = None
    try:
        lang_code = LANGUAGE_CODES.get(lang, "en")
        tts = gTTS(text=text, lang=lang_code)

        # Reserve a unique file name, then close the handle before gTTS
        # reopens the file by name: writing to a still-open
        # NamedTemporaryFile through its name is not portable (it fails on
        # Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            output_path = fp.name

        tts.save(output_path)
        return output_path
    except Exception as e:
        print(f"Error synthesizing speech: {e}")
        # Do not leave an empty/partial temp file behind when saving failed
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None

def clear_memory():
    """Return the reset values for all seven output fields.

    Returns:
        A 7-tuple: four empty strings followed by three ``None`` values.
    """
    blank_texts = ("",) * 4
    no_audio = (None,) * 3
    return blank_texts + no_audio

def process_speech_to_text(audio):
    """Return the transcription of ``audio``, or ``""`` when it is falsy.

    Transcription is delegated to ``transcribe_audio_locally``.
    """
    return transcribe_audio_locally(audio) if audio else ""

def clean_translation_output(text):
    """Strip reasoning and markup from raw model output.

    ``<think>...</think>`` blocks and all asterisks are removed, then the
    text is filtered line by line:

    * lines that start with a reasoning opener ("Let me", "First", "Okay",
      ...) are dropped;
    * lines containing a colon and the name of a supported language are kept;
    * other non-blank lines are kept unless they contain one of the words
      "thinking", "translating", "understand" or "process".

    Args:
        text: Raw model output; falsy values yield ``""``.

    Returns:
        The surviving lines joined with newlines.
    """
    if not text:
        return ""

    # Remove any meta-content or thinking, then every asterisk
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    text = text.replace('**', '').replace('*', '')

    def keep_line(line):
        # Lines that open like reasoning are never kept
        if re.search(r'(^I need to|^Let me|^First|^Okay|^Hmm|^I will|^I am thinking|^I should)', line, re.IGNORECASE):
            return False

        # "Language: translation" lines are always kept
        lowered = line.lower()
        if ':' in line and any(lang.lower() in lowered for lang in SUPPORTED_LANGUAGES):
            return True

        # Unlabelled lines survive only if non-blank and free of meta words
        if not line.strip():
            return False
        return re.search(r'(thinking|translating|understand|process)', line, re.IGNORECASE) is None

    return '\n'.join(line for line in text.split('\n') if keep_line(line))

def extract_translations(translations_text, output_langs):
    """Extract clean translations from the model output.

    The text is first passed through ``clean_translation_output``. Labelled
    lines (``Language: translation``) are then looked up once per requested
    language. Results stay aligned with ``output_langs``: a language whose
    label is not found gets ``""`` in its own slot instead of letting later
    translations shift forward, because callers pair result ``i`` with
    language ``i`` for display and speech synthesis.

    If no label is found at all, the cleaned text is split into non-empty
    lines (dropping anything before the first colon on a line), or used as a
    whole when it has no newline.

    Args:
        translations_text: Raw model output.
        output_langs: Requested target language names, in display order.

    Returns:
        A list of exactly three strings, padded with ``""``.
    """
    if not translations_text or not output_langs:
        return [""] * 3

    # Clean the translations text first
    clean_text = clean_translation_output(translations_text)

    # First try to find language-labelled translations, one slot per language
    labeled_results = []
    found_label = False
    for lang in output_langs:
        # re.escape keeps the language name a literal inside the pattern
        pattern = rf'{re.escape(lang)}[\s]*:[\s]*(.*?)(?=\n\s*[A-Z]|$)'
        match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
        if match:
            found_label = True
            labeled_results.append(match.group(1).strip())
        else:
            # Placeholder keeps later translations in their own slots
            labeled_results.append("")

    if found_label:
        translation_results = labeled_results
    elif '\n' in clean_text:
        # No labels at all: fall back to one translation per non-empty line
        translation_results = []
        for line in clean_text.split('\n'):
            line = line.strip()
            if not line:
                continue
            if ':' in line:
                # Drop whatever prefix precedes the first colon
                translation_results.append(line.split(':', 1)[1].strip())
            else:
                translation_results.append(line)
    else:
        # If no newlines, just use the whole text
        translation_results = [clean_text]

    # Ensure we have exactly 3 results
    while len(translation_results) < 3:
        translation_results.append("")

    return translation_results[:3]

def perform_translation(audio, typed_text, input_lang, output_langs, model_name):
    """Run the full pipeline: resolve input, translate, synthesize speech.

    Args:
        audio: Audio input, used only when ``typed_text`` is empty.
        typed_text: Text typed by the user; takes precedence over ``audio``.
        input_lang: Name of the source language.
        output_langs: Selected target languages; only the first three are
            used.
        model_name: Label of the translation model to use.

    Returns:
        Seven values: the input text, three translation strings, and three
        audio file paths (``None`` where there is no audio). Early exits
        return a tuple; the normal path returns a list.
    """
    # Nothing selected: echo the typed text and leave everything else empty
    if not output_langs:
        return typed_text, "", "", "", None, None, None

    # Limit to 3 output languages
    selected_langs = output_langs[:3]

    # Typed text wins; audio is transcribed only as a fallback
    input_text = typed_text
    if not input_text and audio:
        input_text = transcribe_audio_locally(audio)

    if not input_text:
        return "", "", "", "", None, None, None

    # Translate with the selected model, then pull out the clean translations
    translations_text = translate_text(input_text, input_lang, selected_langs, model_name)
    translation_results = extract_translations(translations_text, selected_langs)

    # One audio slot per selected language; empty translations get no audio
    audio_paths = [
        synthesize_speech(trans, lang) if trans else None
        for trans, lang in zip(translation_results, selected_langs)
    ]

    # Pad to exactly 3 audio slots
    audio_paths.extend([None] * (3 - len(audio_paths)))

    # Return results in the expected format
    return [input_text] + translation_results + audio_paths
with gr.Blocks() as demo:
    # Components are created top to bottom in the order they appear on the page
    gr.Markdown("## 🌍 Multilingual Translator with Speech Support")

    with gr.Row():
        input_lang = gr.Dropdown(
            choices=SUPPORTED_LANGUAGES,
            value="English",
            label="Input Language",
        )
        output_langs = gr.CheckboxGroup(
            choices=SUPPORTED_LANGUAGES,
            label="Output Languages (select up to 3)",
        )

    with gr.Row():
        model_selector = gr.Dropdown(
            choices=list(AVAILABLE_MODELS.keys()),
            value="DeepSeek-R1 llama 70B",
            label="Translation Model",
        )

    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Speak Your Input (upload or record)")
        text_input = gr.Textbox(label="Or Type Text", elem_id="text_input")

    transcribed_text = gr.Textbox(label="Transcribed Text (from audio)", interactive=False)
    translated_outputs = [gr.Textbox(label=f"Translation {i+1}", interactive=False) for i in range(3)]
    audio_outputs = [gr.Audio(label=f"Speech Output {i+1}") for i in range(3)]

    with gr.Row():
        translate_btn = gr.Button("Translate", elem_id="translate_btn")
        clear_btn = gr.Button("Clear Memory")

    # Both translation triggers share the same inputs; the translation
    # handlers and the clear handler all write to the same seven outputs.
    translation_inputs = [audio_input, text_input, input_lang, output_langs, model_selector]
    result_outputs = [transcribed_text] + translated_outputs + audio_outputs

    # Handle audio input separately: transcribe new audio into the text box
    def on_audio_change(audio):
        return "" if audio is None else process_speech_to_text(audio)

    audio_input.change(on_audio_change, inputs=[audio_input], outputs=[text_input])

    # Pressing Enter in the text box and clicking the button do the same thing
    text_input.submit(perform_translation, inputs=translation_inputs, outputs=result_outputs)
    translate_btn.click(perform_translation, inputs=translation_inputs, outputs=result_outputs)

    clear_btn.click(clear_memory, inputs=[], outputs=result_outputs)

demo.launch()