Emmanuel08 committed on
Commit
826b8e9
·
verified ·
1 Parent(s): 4fb7b6c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -6
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import torch
2
- import torchaudio # ✅ Added torchaudio to handle audio resampling
3
  import gradio as gr
4
  import time
5
  import numpy as np
@@ -11,9 +11,9 @@ device = "cpu"
11
  torch_dtype = torch.float32 # Use CPU-friendly float type
12
  MODEL_NAME = "openai/whisper-tiny" # ✅ Switched to smallest model for fastest performance
13
 
14
- # ✅ 2️⃣ Load Whisper Tiny Model on CPU (Removed `low_cpu_mem_usage=True`)
15
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
16
- MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True # ✅ Removed low_cpu_mem_usage
17
  )
18
  model.to(device)
19
 
@@ -28,7 +28,7 @@ pipe = pipeline(
28
  chunk_length_s=2, # ✅ Process in 2-second chunks for ultra-low latency
29
  torch_dtype=torch_dtype,
30
  device=device,
31
- sampling_rate=16000, # ✅ Explicitly set sampling rate to avoid resampling issues
32
  )
33
 
34
  # ✅ 4️⃣ Real-Time Streaming Transcription (Microphone)
@@ -40,8 +40,99 @@ def stream_transcribe(stream, new_chunk):
40
  # ✅ Convert stereo to mono
41
  if y.ndim > 1:
42
  y = y.mean(axis=1)
43
-
44
  y = y.astype(np.float32)
45
  y /= np.max(np.abs(y))
46
 
47
- # ✅ Resample audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
+ import torchaudio # ✅ Added torchaudio for resampling
3
  import gradio as gr
4
  import time
5
  import numpy as np
 
11
  torch_dtype = torch.float32 # Use CPU-friendly float type
12
  MODEL_NAME = "openai/whisper-tiny" # ✅ Switched to smallest model for fastest performance
13
 
14
+ # ✅ 2️⃣ Load Whisper Tiny Model on CPU
15
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
16
+ MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
17
  )
18
  model.to(device)
19
 
 
28
  chunk_length_s=2, # ✅ Process in 2-second chunks for ultra-low latency
29
  torch_dtype=torch_dtype,
30
  device=device,
31
+ sampling_rate=16000, # ✅ Explicitly set sampling rate
32
  )
33
 
34
  # ✅ 4️⃣ Real-Time Streaming Transcription (Microphone)
 
40
  # ✅ Convert stereo to mono
41
  if y.ndim > 1:
42
  y = y.mean(axis=1)
43
+
44
  y = y.astype(np.float32)
45
  y /= np.max(np.abs(y))
46
 
47
+ # ✅ Resample audio to 16kHz using torchaudio
48
+ y_tensor = torch.tensor(y)
49
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
50
+ y_resampled = resampler(y_tensor).numpy()
51
+
52
+ # ✅ Append to Stream
53
+ if stream is not None:
54
+ stream = np.concatenate([stream, y_resampled])
55
+ else:
56
+ stream = y_resampled
57
+
58
+ # ✅ Run Transcription
59
+ transcription = pipe({"sampling_rate": 16000, "raw": stream})["text"]
60
+ latency = time.time() - start_time
61
+
62
+ return stream, transcription, f"{latency:.2f} sec"
63
+
64
+ except Exception as e:
65
+ print(f"Error: {e}")
66
+ return stream, str(e), "Error"
67
+
68
+ # ✅ 5️⃣ Transcription for File Upload
69
+ def transcribe(inputs, previous_transcription):
70
+ start_time = time.time()
71
+ try:
72
+ # ✅ Convert file input to correct format
73
+ sample_rate, audio_data = inputs
74
+
75
+ # ✅ Resample to 16kHz using torchaudio
76
+ audio_tensor = torch.tensor(audio_data)
77
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
78
+ resampled_audio = resampler(audio_tensor).numpy()
79
+
80
+ transcription = pipe({"sampling_rate": 16000, "raw": resampled_audio})["text"]
81
+
82
+ previous_transcription += transcription
83
+ latency = time.time() - start_time
84
+
85
+ return previous_transcription, f"{latency:.2f} sec"
86
+
87
+ except Exception as e:
88
+ print(f"Error: {e}")
89
+ return previous_transcription, "Error"
90
+
91
+ # ✅ 6️⃣ Clear Function
92
+ def clear():
93
+ return ""
94
+
95
+ # ✅ 7️⃣ Gradio Interface (Microphone Streaming)
96
+ with gr.Blocks() as microphone:
97
+ gr.Markdown(f"# Whisper Tiny - Real-Time Transcription (CPU) 🎙️")
98
+ gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text.")
99
+
100
+ with gr.Row():
101
+ input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
102
+ output = gr.Textbox(label="Live Transcription", value="")
103
+ latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")
104
+
105
+ with gr.Row():
106
+ clear_button = gr.Button("Clear Output")
107
+
108
+ state = gr.State()
109
+ input_audio_microphone.stream(
110
+ stream_transcribe, [state, input_audio_microphone],
111
+ [state, output, latency_textbox], time_limit=30, stream_every=1
112
+ )
113
+ clear_button.click(clear, outputs=[output])
114
+
115
+ # ✅ 8️⃣ Gradio Interface (File Upload)
116
+ with gr.Blocks() as file:
117
+ gr.Markdown(f"# Upload Audio File for Transcription 🎵")
118
+ gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for speech-to-text.")
119
+
120
+ with gr.Row():
121
+ input_audio = gr.Audio(sources=["upload"], type="numpy")
122
+ output = gr.Textbox(label="Transcription", value="")
123
+ latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")
124
+
125
+ with gr.Row():
126
+ submit_button = gr.Button("Submit")
127
+ clear_button = gr.Button("Clear Output")
128
+
129
+ submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
130
+ clear_button.click(clear, outputs=[output])
131
+
132
+ # ✅ 9️⃣ Final Gradio App (Supports Microphone & File Upload)
133
+ with gr.Blocks(theme=gr.themes.Ocean()) as demo:
134
+ gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])
135
+
136
+ # ✅ 1️⃣0️⃣ Run Gradio Locally
137
+ if __name__ == "__main__":
138
+ demo.launch()