Spaces:

Lesterchia174
/

fpoce_Multilingual_Translator_with_Speech_Support

Running

App Files Files Community

Lesterchia174 committed on Apr 12

Commit

6a6baf6

verified ·

1 Parent(s): 5de3c7d

Upload 3 files

Browse files

Files changed (3) hide show

app.py +283 -0
apt.txt +1 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,283 @@

+# -*- coding: utf-8 -*-
+"""app
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1hDCBaCrOX0FZx8VUT9_cUfWWg7y97yrx
+"""
+import gradio as gr
+import os
+import tempfile
+import whisper
+import re
+from groq import Groq
+from gtts import gTTS
# Load the local Whisper model for speech-to-text
whisper_model = whisper.load_model("base")

# Instantiate the Groq client with an API key taken from the environment only.
# A key must never be embedded in source as a fallback: anything committed to
# a public repository has to be treated as leaked and revoked.
_groq_api_key = os.getenv("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; configure it as an environment variable "
        "(or Space secret) before starting the app."
    )
groq_client = Groq(api_key=_groq_api_key)
# Display name -> language code passed to gTTS.
# This mapping is the single source of truth for the languages the app offers;
# insertion order is the order shown in the UI.
LANGUAGE_CODES = {
    "English": "en", "Chinese": "zh", "Thai": "th", "Malay": "ms", "Korean": "ko",
    "Japanese": "ja", "Spanish": "es", "German": "de", "Hindi": "hi",
    "French": "fr", "Russian": "ru", "Tagalog": "tl", "Arabic": "ar"
}
# Supported languages, derived from LANGUAGE_CODES so the two can never drift
# apart (a name without a code would silently be spoken with the "en" voice).
SUPPORTED_LANGUAGES = list(LANGUAGE_CODES)
def transcribe_audio_locally(audio):
    """Run the module-level Whisper model on *audio* and return its text.

    *audio* may be a path-like value or a dict carrying the path under the
    ``"name"`` key. ``None`` yields an empty string. Any failure is printed
    and reported back as an ``"Error transcribing audio: ..."`` string rather
    than raised.
    """
    if audio is None:
        return ""
    try:
        if isinstance(audio, dict) and "name" in audio:
            source_path = audio["name"]
        else:
            source_path = audio
        transcription = whisper_model.transcribe(source_path)
        return transcription["text"]
    except Exception as err:
        print(f"Error transcribing audio locally: {err}")
        return f"Error transcribing audio: {str(err)}"
# A complete <think>...</think> block emitted by a reasoning model.
_THINK_BLOCK_RE = re.compile(r'<think>.*?</think>', re.DOTALL)
# Whole lines that open with a typical chain-of-thought phrase.
_REASONING_LINE_RE = re.compile(
    r"^\s*(?:Let me think|I need to|First,|Okay, so|Hmm,|Let's break this down).*$",
    re.MULTILINE,
)


def _strip_reasoning(text):
    """Remove leaked chain-of-thought from a model reply and trim the result."""
    text = _THINK_BLOCK_RE.sub('', text)
    # A closing tag with no opening tag left: whatever precedes it is still
    # reasoning, so keep only the part after the last closing tag.
    if '</think>' in text:
        text = text.rsplit('</think>', 1)[1]
    text = _REASONING_LINE_RE.sub('', text)
    return text.strip()


def translate_text(input_text, input_lang, output_langs):
    """Translate text with Groq's chat API and return the raw labelled reply.

    Args:
        input_text: Text to translate.
        input_lang: Display name of the source language.
        output_langs: Display names of the target languages.

    Returns:
        A string with one ``<Language>: <translation>`` line per target
        language (as requested from the model), with reasoning text removed.
        Returns ``""`` when there is nothing to translate, and an
        ``"Error: ..."`` string if the request fails; this function does not
        raise.
    """
    if not input_text or not output_langs:
        # Always a string, like every other return path.
        return ""
    try:
        # The system prompt must agree with the user prompt: the caller's
        # parser relies on the "<Language>:" prefix, so it is required here
        # rather than forbidden.
        system_prompt = (
            "You are a translation assistant that provides direct, accurate translations. "
            "Do NOT include any thinking, reasoning, or explanations in your response. "
            "Output exactly one line per requested language, formatted as "
            "'<Language>: <translation>', and nothing else."
        )
        user_prompt = f"Translate this {input_lang} text: '{input_text}' into the following languages: {', '.join(output_langs)}. Provide each translation on a separate line with the language name as a prefix."
        response = groq_client.chat.completions.create(
            model="deepseek-r1-distill-Qwen-32b",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        # The content field may be None; treat that as an empty reply.
        translation_text = response.choices[0].message.content or ""
        return _strip_reasoning(translation_text)
    except Exception as e:
        print(f"Error translating text: {e}")
        return f"Error: {str(e)}"
def synthesize_speech(text, lang):
    """Generate speech for *text* and return the path of an MP3 file.

    Args:
        text: Text to speak. Falsy text produces no audio.
        lang: Display name of the language; looked up in ``LANGUAGE_CODES``
            and falling back to ``"en"`` when unknown.

    Returns:
        Path to a temporary ``.mp3`` file, or ``None`` when *text* is empty or
        synthesis fails. The file is not deleted here; the caller owns it.
    """
    if not text:
        return None
    mp3_path = None
    try:
        lang_code = LANGUAGE_CODES.get(lang, "en")
        tts = gTTS(text=text, lang=lang_code)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            mp3_path = fp.name
            # Write through the handle we already hold instead of reopening
            # the file by name while it is still open (which fails on Windows).
            tts.write_to_fp(fp)
        return mp3_path
    except Exception as e:
        print(f"Error synthesizing speech: {e}")
        # Do not leave an empty/partial temp file behind on failure.
        if mp3_path is not None:
            try:
                os.remove(mp3_path)
            except OSError:
                pass
        return None
def clear_memory():
    """Return blank values for the four text fields and three audio fields."""
    blank_texts = ("",) * 4
    blank_audio = (None,) * 3
    return blank_texts + blank_audio
def process_speech_to_text(audio):
    """Transcribe *audio* and return the text; falsy input gives an empty string."""
    return transcribe_audio_locally(audio) if audio else ""
def clean_translation_output(text):
    """Drop reasoning/meta lines from model output, keeping translation lines.

    ``<think>...</think>`` blocks are removed first. Each remaining line is
    then kept when it either carries a ``:`` together with a supported
    language name, or is non-blank and contains none of the meta keywords.
    Lines opening with a reasoning phrase are always discarded.
    """
    if not text:
        return ""

    def is_translation_line(candidate):
        # Lines that start like chain-of-thought are never kept.
        if re.search(r'(^I need to|^Let me|^First|^Okay|^Hmm|^I will|^I am thinking|^I should)', candidate, re.IGNORECASE):
            return False
        # "<Language>: ..." style lines are kept as they are.
        if ':' in candidate and any(lang.lower() in candidate.lower() for lang in SUPPORTED_LANGUAGES):
            return True
        # Unlabelled lines survive only if non-blank and free of meta keywords.
        if not candidate.strip():
            return False
        return not re.search(r'(thinking|translating|understand|process)', candidate, re.IGNORECASE)

    without_think = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    return '\n'.join(
        candidate for candidate in without_think.split('\n')
        if is_translation_line(candidate)
    )
def extract_translations(translations_text, output_langs):
    """Extract clean translations from the model output.

    Args:
        translations_text: Raw model reply, ideally one
            ``<Language>: <translation>`` line per requested language.
        output_langs: Requested language names, in display order.

    Returns:
        A list of exactly three strings. When labelled translations are found,
        entry *i* belongs to ``output_langs[i]`` and is ``""`` if that language
        has no labelled match, so results never shift onto another language.
        Otherwise the non-empty lines of the cleaned text are used in order.
        Unused slots are ``""``.
    """
    if not translations_text or not output_langs:
        return [""] * 3
    # Clean the translations text first
    clean_text = clean_translation_output(translations_text)

    # First try to find language-labelled translations, one slot per language.
    labeled = []
    found_label = False
    for lang in output_langs:
        # The capture runs until a following line that starts with a letter
        # (IGNORECASE makes [A-Z] match either case) or the end of the text.
        pattern = rf'{re.escape(lang)}\s*:\s*(.*?)(?=\n\s*[A-Z]|$)'
        match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
        if match:
            found_label = True
            labeled.append(match.group(1).strip())
        else:
            # Keep a placeholder so later languages stay in their own slot.
            labeled.append("")

    if found_label:
        translation_results = labeled
    elif '\n' in clean_text:
        # No labels matched: fall back to one translation per non-empty line,
        # dropping whatever precedes the first colon on a line.
        translation_results = []
        for line in (raw.strip() for raw in clean_text.split('\n')):
            if not line:
                continue
            if ':' in line:
                translation_results.append(line.split(':', 1)[1].strip())
            else:
                translation_results.append(line)
    else:
        # If no newlines, just use the whole text
        translation_results = [clean_text]

    # Ensure we have exactly 3 results
    translation_results += [""] * (3 - len(translation_results))
    return translation_results[:3]
def perform_translation(audio, typed_text, input_lang, output_langs):
    """Main function to handle the translation process.

    Args:
        audio: Recorded/uploaded audio, used only when no text was typed.
        typed_text: Text typed by the user; takes priority over *audio*.
        input_lang: Display name of the source language.
        output_langs: Selected target languages; only the first three are used.

    Returns:
        Seven values in UI order: the input text, three translations, and
        three audio file paths (``None`` where no speech was produced).
    """
    # Without target languages there is nothing to do; echo the typed text.
    if not output_langs:
        return typed_text, "", "", "", None, None, None
    # Limit to 3 output languages
    selected_langs = output_langs[:3]
    # Typed text wins; whitespace-only text counts as "nothing typed" so the
    # audio input is still used and blank text is never sent for translation.
    input_text = (typed_text or "").strip()
    if not input_text and audio:
        input_text = transcribe_audio_locally(audio)
    if not input_text:
        return "", "", "", "", None, None, None
    # Get translations
    translations_text = translate_text(input_text, input_lang, selected_langs)
    # Extract clean translations (always three entries)
    translation_results = extract_translations(translations_text, selected_langs)
    # Generate speech for each non-empty translation
    audio_paths = [
        synthesize_speech(trans, lang) if trans else None
        for trans, lang in zip(translation_results, selected_langs)
    ]
    # Ensure we have exactly 3 audio paths
    audio_paths += [None] * (3 - len(audio_paths))
    # Return results in the expected format
    return [input_text] + translation_results + audio_paths
with gr.Blocks() as demo:
    gr.Markdown("## 🌍 Multilingual Translator with Speech Support")

    # Language selection
    with gr.Row():
        input_lang = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="English", label="Input Language")
        output_langs = gr.CheckboxGroup(choices=SUPPORTED_LANGUAGES, label="Output Languages (select up to 3)")

    # Input: audio or typed text
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Speak Your Input (upload or record)")
        text_input = gr.Textbox(label="Or Type Text", elem_id="text_input")

    # Output: transcription, three translations, three audio players
    transcribed_text = gr.Textbox(label="Transcribed Text (from audio)", interactive=False)
    translated_outputs = []
    for slot in range(3):
        translated_outputs.append(gr.Textbox(label=f"Translation {slot+1}", interactive=False))
    audio_outputs = []
    for slot in range(3):
        audio_outputs.append(gr.Audio(label=f"Speech Output {slot+1}"))

    with gr.Row():
        translate_btn = gr.Button("Translate", elem_id="translate_btn")
        clear_btn = gr.Button("Clear Memory")

    # Component lists shared by the event handlers below
    translation_inputs = [audio_input, text_input, input_lang, output_langs]
    result_components = [transcribed_text] + translated_outputs + audio_outputs

    def on_audio_change(audio):
        """Transcribe new audio; an emptied audio widget gives an empty string."""
        if audio is None:
            return ""
        return process_speech_to_text(audio)

    # Show the transcription whenever the audio input changes
    audio_input.change(on_audio_change, inputs=[audio_input], outputs=[transcribed_text])

    # Pressing Enter in the text box and clicking the button both translate
    text_input.submit(perform_translation, inputs=translation_inputs, outputs=result_components)
    translate_btn.click(perform_translation, inputs=translation_inputs, outputs=result_components)

    # Reset every output component
    clear_btn.click(clear_memory, inputs=[], outputs=result_components)

demo.launch()

apt.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ espeak

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio
+torch
+groq
+soundfile
+transformers
+openai-whisper
+gTTS