File size: 11,385 Bytes
6f6d4d2
 
 
 
 
 
 
 
 
 
 
 
 
 
46e98e9
6f6d4d2
46e98e9
 
 
 
 
6f6d4d2
 
 
46e98e9
 
 
 
 
 
 
6f6d4d2
 
 
 
13bfba5
c4fe867
13bfba5
dd83cd4
13bfba5
 
6f6d4d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e0cf29
6f6d4d2
 
 
 
 
afe42b5
6f6d4d2
 
afe42b5
6f6d4d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
afe42b5
 
 
6f6d4d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
afe42b5
 
 
 
6f6d4d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd83cd4
6f6d4d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
import gradio as gr
import os
import tempfile
import whisper
import re
from groq import Groq
from gtts import gTTS

# Load the local Whisper model for speech-to-text once at import time; the
# transcription helper below reuses this module-level instance.
whisper_model = whisper.load_model("base")

# Instantiate the Groq client with the API key taken from the GROQ_API_KEY
# environment variable only. A literal key must never be used as a fallback
# here: anything committed to source control is a leaked credential.
# NOTE(review): with the variable unset the Groq SDK is expected to raise at
# construction time -- confirm against the installed SDK version.
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Language table: (display name, language code) pairs, in the order the
# names are offered to the user. Malay is deliberately split into two
# entries: Malaysian Malay ("ms") and Indonesian Malay ("id").
_LANGUAGE_TABLE = (
    ("English", "en"),
    ("Chinese", "zh"),
    ("Thai", "th"),
    ("Malaysian Malay", "ms"),   # Bahasa Malaysia
    ("Indonesian Malay", "id"),  # Bahasa Indonesia
    ("Korean", "ko"),
    ("Japanese", "ja"),
    ("Spanish", "es"),
    ("German", "de"),
    ("Hindi", "hi"),
    ("Urdu", "ur"),
    ("French", "fr"),
    ("Russian", "ru"),
    ("Tagalog", "tl"),
    ("Arabic", "ar"),
    ("Myanmar", "my"),
    ("Vietnamese", "vi"),
)

# Display names used as the choices of the language selectors.
SUPPORTED_LANGUAGES = [name for name, _code in _LANGUAGE_TABLE]

# Display name -> language code handed to the text-to-speech step.
LANGUAGE_CODES = dict(_LANGUAGE_TABLE)

# Available LLM models: label shown in the UI -> model ID sent to the API.
AVAILABLE_MODELS = {
    "DeepSeek-R1 llama 70B": "deepseek-r1-distill-llama-70b",
    "Qwen 32B": "qwen-qwq-32b",
    "Llama-3.3 70B": "llama-3.3-70b-versatile",
    "Llama-4 Scout 17B": "meta-llama/llama-4-scout-17b-16e-instruct",
    "Llama-4 Maverick 17B": "meta-llama/llama-4-maverick-17b-128e-instruct",
}

def transcribe_audio_locally(audio):
    """Run the module-level Whisper model on an audio input.

    Args:
        audio: Either a value passed straight to ``whisper_model.transcribe``
            (e.g. a file path) or a dict whose ``"name"`` entry holds that
            value. ``None`` means "no audio".

    Returns:
        The transcribed text, ``""`` when ``audio`` is ``None``, or a string
        starting with ``"Error transcribing audio: "`` if anything fails.
    """
    if audio is None:
        return ""

    try:
        # Dict-shaped inputs carry the path under "name"; anything else is
        # used as-is.
        if isinstance(audio, dict) and "name" in audio:
            source = audio["name"]
        else:
            source = audio
        return whisper_model.transcribe(source)["text"]
    except Exception as exc:
        # Failures are reported in-band as text rather than raised.
        print(f"Error transcribing audio locally: {exc}")
        return f"Error transcribing audio: {str(exc)}"

def translate_text(input_text, input_lang, output_langs, model_name):
    """Ask the selected Groq-hosted model to translate ``input_text``.

    Args:
        input_text: Text to translate.
        input_lang: Name of the source language, inserted into the prompt.
        output_langs: Iterable of target language names.
        model_name: Key into ``AVAILABLE_MODELS``; unknown keys fall back to
            the Llama-4 Maverick model ID.

    Returns:
        ``[]`` when there is no text or no target language. Otherwise the
        model's reply as a string, with ``<think>`` sections, ``**`` markers
        and common "thinking out loud" lines removed. On any failure a string
        of the form ``"Error: <message>"`` is returned instead of raising.
    """
    if not (input_text and output_langs):
        return []

    try:
        # Resolve the UI label to the model ID expected by the API.
        model_id = AVAILABLE_MODELS.get(model_name, "meta-llama/llama-4-maverick-17b-128e-instruct")

        # The continuation lines of this prompt carry eight leading spaces;
        # they are spelled out so the exact text sent to the model is visible.
        system_prompt = (
            "You are a translation assistant that provides direct, accurate translations. \n"
            "        Do NOT include any thinking, reasoning, or explanations in your response.\n"
            "        Do NOT use phrases like 'In [language]:', 'Translation:' or similar prefixes.\n"
            "        Do NOT use any special formatting like asterisks (**) or other markdown.\n"
            "        Always respond with ONLY the exact translation text itself."
        )

        user_prompt = f"Translate this {input_lang} text: '{input_text}' into the following languages: {', '.join(output_langs)}. Provide each translation on a separate line with the language name as a prefix. Do not use any special formatting or markdown."

        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        completion = groq_client.chat.completions.create(
            model=model_id,
            messages=chat_messages,
        )

        reply = completion.choices[0].message.content.strip()

        # Drop any <think>...</think> section that leaked into the reply.
        reply = re.sub(r'<think>.*?</think>', '', reply, flags=re.DOTALL)

        # Drop bold markers.
        reply = reply.replace('**', '')

        # Blank out lines that open with a typical reasoning phrase. The
        # patterns are applied one after another, in this order.
        reasoning_line_patterns = (
            r'^\s*Let me think.*$',
            r'^\s*I need to.*$',
            r'^\s*First,.*$',
            r'^\s*Okay, so.*$',
            r'^\s*Hmm,.*$',
            r'^\s*Let\'s break this down.*$',
        )
        for line_pattern in reasoning_line_patterns:
            reply = re.sub(line_pattern, '', reply, flags=re.MULTILINE)

        return reply
    except Exception as exc:
        print(f"Error translating text: {exc}")
        return f"Error: {str(exc)}"

def synthesize_speech(text, lang):
    """Render ``text`` to an MP3 file with gTTS.

    Args:
        text: Text to speak; falsy values produce no audio.
        lang: Language display name, mapped through ``LANGUAGE_CODES``
            (unknown names fall back to ``"en"``).

    Returns:
        Path of a newly created ``.mp3`` temporary file, or ``None`` when
        there is no text or synthesis fails. The file is created with
        ``delete=False``, so it is not removed by this function.
    """
    if not text:
        return None

    try:
        code = LANGUAGE_CODES.get(lang, "en")
        speech = gTTS(text=text, lang=code)

        # The temporary file only reserves a unique name; gTTS writes the
        # audio to that path itself.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as mp3_file:
            mp3_path = mp3_file.name
            speech.save(mp3_path)
            return mp3_path
    except Exception as exc:
        print(f"Error synthesizing speech: {exc}")
        return None

def clear_memory():
    """Return reset values for the seven output components.

    The tuple holds four empty strings (the text outputs) followed by three
    ``None`` values (the audio outputs).
    """
    blank_texts = ("",) * 4
    blank_audio = (None,) * 3
    return blank_texts + blank_audio

def process_speech_to_text(audio):
    """Transcribe ``audio``, or return ``""`` when no audio was supplied."""
    return transcribe_audio_locally(audio) if audio else ""

def clean_translation_output(text):
    """Filter model output down to the lines that look like translations.

    ``<think>`` sections and all asterisks are removed first. Each remaining
    line is then dropped if it starts with a typical reasoning phrase, kept
    if it contains a colon together with a supported language name, and
    otherwise kept only when it is non-blank and free of meta words such as
    "thinking" or "process".

    Args:
        text: Raw model output; falsy values yield ``""``.

    Returns:
        The surviving lines joined with newlines.
    """
    if not text:
        return ""

    # Strip meta-content and markdown emphasis before looking at lines.
    stripped = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    stripped = stripped.replace('**', '').replace('*', '')

    reasoning_prefix = re.compile(
        r'(^I need to|^Let me|^First|^Okay|^Hmm|^I will|^I am thinking|^I should)',
        re.IGNORECASE,
    )
    meta_words = re.compile(r'(thinking|translating|understand|process)', re.IGNORECASE)

    def _keep(line):
        # Lines that open like reasoning are always dropped.
        if reasoning_prefix.search(line):
            return False
        # "<Language>: ..." style lines are always kept.
        if ':' in line and any(lang.lower() in line.lower() for lang in SUPPORTED_LANGUAGES):
            return True
        # Unlabelled lines survive only if non-blank and not meta commentary.
        return bool(line.strip()) and not meta_words.search(line)

    return '\n'.join(line for line in stripped.split('\n') if _keep(line))

def extract_translations(translations_text, output_langs):
    """Pull up to three translations out of the model output.

    The text is first passed through ``clean_translation_output``. Then, for
    each requested language, a ``"<language>: <text>"`` label is searched
    for. The labelled results keep one slot per requested language (``""``
    when a label is missing), so position ``i`` of the result always belongs
    to ``output_langs[i]`` and a missing label can no longer shift a later
    translation into an earlier language's slot.

    If no label is found at all, the cleaned text is split into non-blank
    lines; a line containing a colon contributes the part after its first
    colon, any other line is used whole. Text without newlines is used as a
    single result.

    Args:
        translations_text: Raw model output.
        output_langs: Requested target language names, in display order.

    Returns:
        A list of exactly three strings, padded with ``""`` or truncated.
    """
    if not translations_text or not output_langs:
        return [""] * 3

    # Clean the translations text first
    clean_text = clean_translation_output(translations_text)

    # First try to find language-labelled translations, one slot per language.
    labeled = []
    found_label = False
    for lang in output_langs:
        # The language name is escaped so it is always matched literally.
        # The captured text runs lazily up to a newline followed by a letter
        # (with IGNORECASE, [A-Z] matches either case) or the end of the text.
        pattern = rf'{re.escape(lang)}[\s]*:[\s]*(.*?)(?=\n\s*[A-Z]|$)'
        match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
        if match:
            found_label = True
            labeled.append(match.group(1).strip())
        else:
            labeled.append("")

    if found_label:
        translation_results = labeled
    elif '\n' in clean_text:
        # No labels at all: fall back to one translation per non-blank line.
        translation_results = []
        for line in (raw.strip() for raw in clean_text.split('\n')):
            if not line:
                continue
            _prefix, colon, remainder = line.partition(':')
            # Drop whatever precedes the first colon; keep colon-free lines whole.
            translation_results.append(remainder.strip() if colon else line)
    else:
        # If no newlines, just use the whole text
        translation_results = [clean_text]

    # Ensure we have exactly 3 results
    translation_results.extend([""] * (3 - len(translation_results)))
    return translation_results[:3]

def perform_translation(audio, typed_text, input_lang, output_langs, model_name):
    """Run the full pipeline: input text -> translations -> speech.

    Args:
        audio: Audio input; only transcribed when no text was typed.
        typed_text: Text typed by the user; takes precedence over audio.
        input_lang: Source language name.
        output_langs: Selected target languages; only the first three are used.
        model_name: Label of the translation model to use.

    Returns:
        Seven values in order: the input text, three translations, three
        audio file paths (``None`` where no audio exists). The early exits
        return a tuple, the full path returns a list.
    """
    # No target language: echo the typed text and blank everything else.
    if not output_langs:
        return typed_text, "", "", "", None, None, None

    targets = output_langs[:3]

    # Typed text wins; audio is transcribed only as a fallback.
    source_text = typed_text
    if not source_text and audio:
        source_text = transcribe_audio_locally(audio)

    # Still nothing to translate.
    if not source_text:
        return "", "", "", "", None, None, None

    raw_output = translate_text(source_text, input_lang, targets, model_name)
    translations = extract_translations(raw_output, targets)

    # One audio path per target language; empty translations get no audio.
    speech_paths = [
        synthesize_speech(translated, lang) if translated else None
        for translated, lang in zip(translations, targets)
    ]
    # Fewer than three targets: pad so the audio outputs always number three.
    speech_paths.extend([None] * (3 - len(speech_paths)))

    return [source_text, *translations, *speech_paths]

# Build the Gradio UI. Components are rendered in the order they are created
# inside the Blocks/Row contexts, so statement order here is the page layout.
with gr.Blocks() as demo:
    gr.Markdown("## 🌍 Multilingual Translator with Speech Support")

    # Source language (single choice) and target languages (multi choice).
    # perform_translation only uses the first three selected targets.
    with gr.Row():
        input_lang = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="English", label="Input Language")
        output_langs = gr.CheckboxGroup(choices=SUPPORTED_LANGUAGES, label="Output Languages (select up to 3)")

    # Model choice; the labels are the keys of AVAILABLE_MODELS.
    with gr.Row():
        model_selector = gr.Dropdown(
            choices=list(AVAILABLE_MODELS.keys()), 
            value="DeepSeek-R1 llama 70B", 
            label="Translation Model"
        )

    # Two alternative inputs: recorded/uploaded audio or typed text.
    # NOTE(review): type="filepath" presumably hands handlers a path string --
    # confirm against the Gradio Audio documentation.
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Speak Your Input (upload or record)")
        text_input = gr.Textbox(label="Or Type Text", elem_id="text_input")

    # Read-only outputs: the text that was translated, then three
    # translation boxes and three matching audio players.
    transcribed_text = gr.Textbox(label="Transcribed Text (from audio)", interactive=False)
    translated_outputs = [gr.Textbox(label=f"Translation {i+1}", interactive=False) for i in range(3)]
    audio_outputs = [gr.Audio(label=f"Speech Output {i+1}") for i in range(3)]

    with gr.Row():
        translate_btn = gr.Button("Translate", elem_id="translate_btn")
        clear_btn = gr.Button("Clear Memory")

    # Handle audio input separately
    def on_audio_change(audio):
        """Transcribe newly supplied audio; return "" when the audio is cleared."""
        if audio is None:
            return ""
        transcribed = process_speech_to_text(audio)
        return transcribed
    
    # Update text input when audio is processed: the transcription is written
    # into the editable text_input box (not into transcribed_text).
    audio_input.change(
        on_audio_change,
        inputs=[audio_input],
        outputs=[text_input]
    )
    
    # Enable Enter key to submit: same handler and wiring as the Translate button.
    text_input.submit(
        perform_translation,
        inputs=[audio_input, text_input, input_lang, output_langs, model_selector],
        outputs=[transcribed_text] + translated_outputs + audio_outputs
    )
    
    # The seven outputs line up with perform_translation's seven return values:
    # input text, three translations, three audio paths.
    translate_btn.click(
        perform_translation,
        inputs=[audio_input, text_input, input_lang, output_langs, model_selector],
        outputs=[transcribed_text] + translated_outputs + audio_outputs
    )

    # Reset the seven output components only; text_input and audio_input are
    # not in the outputs list and therefore keep their current values.
    clear_btn.click( 
        clear_memory,
        inputs=[],
        outputs=[transcribed_text] + translated_outputs + audio_outputs
    )

# Start the app when the module is executed.
demo.launch()