Spaces:

Luigi
/

dinercall-intent-demo

Sleeping

App Files Files Community

Luigi committed on Apr 12

Commit

d236f33

1 Parent(s): b824b83

Also use file mode to send audio from the mic to Whisper

Browse files

Files changed (1) hide show

app.py +9 -16

app.py CHANGED Viewed

@@ -122,13 +122,11 @@ def transcribe_audio(audio_input):
     if isinstance(audio_input, str):
         result = whisper_pipe(audio_input)
         return result["text"]
-    # For microphone input, Gradio returns a tuple (sample_rate, audio_array).
     elif isinstance(audio_input, tuple):
-        audio_array = audio_input[1]
-        if audio_array.ndim > 1:
-            audio_array = np.mean(audio_array, axis=-1)
-        result = whisper_pipe(audio_array)
-        return result["text"]
     else:
         return ""
@@ -137,6 +135,7 @@ def transcribe_audio(audio_input):
 def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
     # Determine input based on selected mode.
     if mode == "Microphone" and mic_audio is not None:
         transcription = transcribe_audio(mic_audio)
     elif mode == "Text" and text_input:
         transcription = text_input
@@ -161,28 +160,22 @@ with gr.Blocks() as demo:
     gr.Markdown("錄音、上傳語音檔案或輸入文字，自動判斷是否具有訂位意圖。")
     with gr.Row():
-        # Input Mode Selector with three options.
         mode = gr.Radio(choices=["Microphone", "Text", "File"], label="選擇輸入模式", value="Microphone")
     with gr.Row():
-        # Three input components: microphone, text, and file upload.
-        mic_audio = gr.Audio(sources=["microphone"], type="numpy", label="語音輸入 (點擊錄音)")
         text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
-        # For file input, use 'filepath' so Whisper pipeline handles conversion.
         file_audio = gr.Audio(sources=["upload"], type="filepath", label="上傳語音檔案")
-    # Initially, only the microphone input is visible.
-    text_input.visible = False
-    file_audio.visible = False
-    # Change event for mode selection to toggle visibility.
     def update_visibility(selected_mode):
         if selected_mode == "Microphone":
             return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
         elif selected_mode == "Text":
             return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
         else:  # File
-            return gr.update(visible=False), gr.update(visible(False)), gr.update(visible=True)
     mode.change(fn=update_visibility, inputs=mode, outputs=[mic_audio, text_input, file_audio])
     with gr.Row():

     if isinstance(audio_input, str):
         result = whisper_pipe(audio_input)
         return result["text"]
+    # For microphone input, we now also use file_path.
     elif isinstance(audio_input, tuple):
+        # In our updated configuration, microphone input should be provided as a file path,
+        # so this branch may not be reached.
+        return ""
     else:
         return ""
 def classify_intent(mode, mic_audio, text_input, file_audio, model_choice):
     # Determine input based on selected mode.
     if mode == "Microphone" and mic_audio is not None:
+        # mic_audio is a file path.
         transcription = transcribe_audio(mic_audio)
     elif mode == "Text" and text_input:
         transcription = text_input
     gr.Markdown("錄音、上傳語音檔案或輸入文字，自動判斷是否具有訂位意圖。")
     with gr.Row():
         mode = gr.Radio(choices=["Microphone", "Text", "File"], label="選擇輸入模式", value="Microphone")
     with gr.Row():
+        # For microphone input, set type="filepath" so that we always get a file path.
+        mic_audio = gr.Audio(sources=["microphone"], type="filepath", label="語音輸入 (點擊錄音)")
         text_input = gr.Textbox(lines=2, placeholder="請輸入文字", label="文字輸入")
         file_audio = gr.Audio(sources=["upload"], type="filepath", label="上傳語音檔案")
+    # Set visibility based on selected mode.
     def update_visibility(selected_mode):
         if selected_mode == "Microphone":
             return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
         elif selected_mode == "Text":
             return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
         else:  # File
+            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
     mode.change(fn=update_visibility, inputs=mode, outputs=[mic_audio, text_input, file_audio])
     with gr.Row():