Emmanuel08 committed on
Commit
c229ede
·
verified ·
1 Parent(s): 0e843c5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -0
app.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ import gradio as gr
4
+ import time
5
+ import numpy as np
6
+ import scipy.io.wavfile
7
+
8
+ # ✅ 1️⃣ Load Silero STT Model for CPU
9
# Run everything on the CPU; inference never assumes a GPU.
device = torch.device("cpu")
torch_dtype = torch.float32

# Entry-point name inside the snakers4/silero-models torch.hub repository.
MODEL_NAME = "silero_stt"

# Let PyTorch use several intra-op threads for CPU inference.
torch.set_num_threads(4)

# Load the English Silero speech-to-text model together with its decoder and
# helper utilities. MODEL_NAME is passed instead of repeating the literal so
# the constant above is the single place the model name is configured.
model, decoder, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-models",
    model=MODEL_NAME,
    language="en",
    device=device,
)
(read_batch, split_into_batches, read_audio, prepare_model_input) = utils
21
+
22
+ # ✅ 3️⃣ Real-Time Streaming Transcription (Microphone)
23
def stream_transcribe(stream, new_chunk):
    """Append a microphone chunk to the running buffer and transcribe it all.

    Args:
        stream: 1-D NumPy array holding the 16 kHz audio accumulated from
            earlier chunks, or ``None`` when no audio has been buffered yet.
        new_chunk: ``(sample_rate, samples)`` tuple; ``samples`` is a NumPy
            array, either 1-D or ``(samples, channels)``.

    Returns:
        A ``(stream, text, latency)`` tuple: the updated buffer, the
        transcription of the whole buffer, and the elapsed time formatted as
        ``"<seconds> sec"``. On failure the text is the exception message and
        the latency field is ``"Error"``.
    """
    start_time = time.time()
    try:
        sr, y = new_chunk

        # Down-mix multi-channel audio to mono by averaging the channels.
        if y.ndim > 1:
            y = y.mean(axis=1)

        y = y.astype(np.float32)

        # Peak-normalize, but skip silent or empty chunks: dividing by a zero
        # peak would fill the chunk (and then the whole buffer) with NaN.
        peak = float(np.max(np.abs(y))) if y.size else 0.0
        if peak > 0.0:
            y /= peak

        # The model is fed 16 kHz audio, so resample from the device rate.
        y_resampled = torchaudio.functional.resample(
            torch.from_numpy(y), orig_freq=sr, new_freq=16000
        ).numpy()

        # Grow the running buffer; the full buffer is re-transcribed each call.
        if stream is not None:
            stream = np.concatenate([stream, y_resampled])
        else:
            stream = y_resampled

        # Add a batch dimension and let the Silero helper prepare the input.
        input_tensor = torch.from_numpy(stream).unsqueeze(0)
        input_tensor = prepare_model_input(input_tensor, device=device)

        # Inference only: no autograd bookkeeping is needed.
        with torch.no_grad():
            transcription = model(input_tensor)
        text = decoder(transcription[0].cpu())

        latency = time.time() - start_time
        return stream, text, f"{latency:.2f} sec"

    except Exception as e:
        # UI boundary: report the failure in the textbox instead of crashing
        # the streaming session.
        print(f"Error: {e}")
        return stream, str(e), "Error"
59
+
60
+ # ✅ 4️⃣ Transcription for File Upload
61
def transcribe(inputs, previous_transcription):
    """Transcribe an uploaded audio clip and append it to the prior text.

    Args:
        inputs: ``(sample_rate, samples)`` tuple, or ``None`` when no file
            has been provided. ``samples`` is array-like, either 1-D or
            ``(samples, channels)``.
        previous_transcription: Text already shown in the output box; the new
            transcription is appended to it, separated by a single space.

    Returns:
        A ``(text, latency)`` tuple: the combined transcription and the
        elapsed time formatted as ``"<seconds> sec"``. On failure the previous
        text is returned unchanged and the latency field is ``"Error"``.
    """
    start_time = time.time()
    try:
        if inputs is None:
            raise ValueError("No audio provided")
        sample_rate, audio_data = inputs

        # Same preprocessing as the streaming path: mono, float32, and
        # peak-normalized so integer PCM ends up in [-1, 1].
        audio_data = np.asarray(audio_data)
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        audio_data = audio_data.astype(np.float32)

        # Leave silent or empty audio untouched to avoid dividing by zero.
        peak = float(np.max(np.abs(audio_data))) if audio_data.size else 0.0
        if peak > 0.0:
            audio_data /= peak

        # The model is fed 16 kHz audio, so resample from the file's rate.
        resampled_audio = torchaudio.functional.resample(
            torch.from_numpy(audio_data), orig_freq=sample_rate, new_freq=16000
        )

        # Add a batch dimension and let the Silero helper prepare the input.
        input_tensor = prepare_model_input(resampled_audio.unsqueeze(0), device=device)

        # Inference only: no autograd bookkeeping is needed.
        with torch.no_grad():
            transcription = model(input_tensor)
        text = decoder(transcription[0].cpu())

        # Keep successive transcriptions from running into each other.
        previous_transcription = previous_transcription or ""
        if previous_transcription and text:
            previous_transcription = f"{previous_transcription} {text}"
        else:
            previous_transcription += text
        latency = time.time() - start_time

        return previous_transcription, f"{latency:.2f} sec"

    except Exception as e:
        # UI boundary: keep the existing text and flag the failure.
        print(f"Error: {e}")
        return previous_transcription, "Error"
87
+
88
+ # ✅ 5️⃣ Clear Function
89
def clear():
    """Return an empty string, used to blank a text output component."""
    blank = ""
    return blank
91
+
92
+ # ✅ 6️⃣ Gradio Interface (Microphone Streaming)
93
# Microphone tab: streams audio chunks to stream_transcribe and shows the
# running transcription together with the per-call latency.
with gr.Blocks() as microphone:
    gr.Markdown("# Silero STT - Real-Time Transcription (Optimized CPU) πŸŽ™οΈ")
    gr.Markdown("Using `Silero STT` for lightweight, accurate speech-to-text.")

    with gr.Row():
        input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
        output = gr.Textbox(label="Live Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

    with gr.Row():
        clear_button = gr.Button("Clear Output")

    # Per-session audio buffer handed back and forth with stream_transcribe.
    state = gr.State()
    input_audio_microphone.stream(
        stream_transcribe,
        [state, input_audio_microphone],
        [state, output, latency_textbox],
        time_limit=30,
        stream_every=1,
    )
    # Clearing must also drop the buffered audio held in `state`; otherwise
    # the next chunk re-transcribes everything recorded before the click and
    # the "cleared" text immediately reappears.
    clear_button.click(lambda: (None, clear()), outputs=[state, output])
111
+
112
+ # ✅ 7️⃣ Gradio Interface (File Upload)
113
# Upload tab: transcribes a whole audio file on demand and appends the result
# to whatever is already in the output box.
with gr.Blocks() as file:
    gr.Markdown("# Upload Audio File for Transcription 🎡")
    gr.Markdown("Using `Silero STT` for offline, high-accuracy transcription.")

    with gr.Row():
        input_audio = gr.Audio(sources=["upload"], type="numpy")
        output = gr.Textbox(label="Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Output")

    # The current output text is fed back in so transcribe can append to it.
    submit_button.click(
        fn=transcribe,
        inputs=[input_audio, output],
        outputs=[output, latency_textbox],
    )
    clear_button.click(fn=clear, outputs=[output])
128
+
129
+ # ✅ 8️⃣ Final Gradio App
130
# Top-level app: the two interfaces above are exposed as tabs.
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface(
        interface_list=[microphone, file],
        tab_names=["Microphone", "Upload Audio"],
    )

# Start the Gradio server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()