Emmanuel08 committed on
Commit f33e6ad · verified · 1 Parent(s): 0c444f9

Update app.py

Files changed (1): app.py (+31 -28)
app.py CHANGED
@@ -1,37 +1,42 @@
 import torch
-import torchaudio  # ✅ Added torchaudio for resampling
+import torchaudio
 import gradio as gr
-import time
+import time
 import numpy as np
 import scipy.io.wavfile
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, BitsAndBytesConfig
 
-# ✅ 1️⃣ Force Model to Run on CPU
+# ✅ 1️⃣ Optimize Model Selection
 device = "cpu"
 torch_dtype = torch.float32  # Use CPU-friendly float type
-MODEL_NAME = "openai/whisper-tiny"  # ✅ Switched to smallest model for fastest performance
+MODEL_NAME = "openai/whisper-small"  # ✅ Switched to "small" for better accuracy
 
-# ✅ 2️⃣ Load Whisper Tiny Model on CPU
+# ✅ 2️⃣ Enable Quantization (Reduces Memory Usage, Speeds Up Inference)
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+
+# ✅ 3️⃣ Load Whisper Model on CPU with Optimized Settings
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
+    MODEL_NAME, quantization_config=quantization_config, torch_dtype=torch_dtype, use_safetensors=True
 )
 model.to(device)
 
-# ✅ 3️⃣ Load Processor & Pipeline
+# ✅ 4️⃣ Load Processor & Set Default Sampling Rate
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
-processor.feature_extractor.sampling_rate = 16000  # ✅ Set correct sampling rate here
+processor.feature_extractor.sampling_rate = 16000  # ✅ Set correct sampling rate
 
+# ✅ 5️⃣ Optimized Pipeline with Beam Search for Better Accuracy
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
-    chunk_length_s=2,  # ✅ Process in 2-second chunks for ultra-low latency
+    chunk_length_s=5,  # ✅ Increase chunk size for better performance
     torch_dtype=torch_dtype,
     device=device,
+    generate_kwargs={"num_beams": 5, "language": "en"},  # ✅ Beam search for better accuracy
 )
 
-# ✅ 4️⃣ Real-Time Streaming Transcription (Microphone)
+# ✅ 6️⃣ Real-Time Streaming Transcription (Microphone)
 def stream_transcribe(stream, new_chunk):
     start_time = time.time()
     try:
@@ -44,18 +49,17 @@ def stream_transcribe(stream, new_chunk):
         y = y.astype(np.float32)
         y /= np.max(np.abs(y))
 
-        # ✅ Resample audio to 16kHz using torchaudio
+        # ✅ Resample audio to 16kHz using optimized torchaudio method
         y_tensor = torch.tensor(y)
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
-        y_resampled = resampler(y_tensor).numpy()
-
+        y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
+
         # ✅ Append to Stream
         if stream is not None:
             stream = np.concatenate([stream, y_resampled])
         else:
             stream = y_resampled
 
-        # ✅ Run Transcription
+        # ✅ Run Transcription with Optimized Parameters
         transcription = pipe({"sampling_rate": 16000, "raw": stream})["text"]
         latency = time.time() - start_time
 
@@ -65,17 +69,16 @@ def stream_transcribe(stream, new_chunk):
         print(f"Error: {e}")
         return stream, str(e), "Error"
 
-# ✅ 5️⃣ Transcription for File Upload
+# ✅ 7️⃣ Transcription for File Upload
 def transcribe(inputs, previous_transcription):
     start_time = time.time()
     try:
         # ✅ Convert file input to correct format
         sample_rate, audio_data = inputs
 
-        # ✅ Resample to 16kHz using torchaudio
+        # ✅ Resample using torchaudio (optimized)
         audio_tensor = torch.tensor(audio_data)
-        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-        resampled_audio = resampler(audio_tensor).numpy()
+        resampled_audio = torchaudio.functional.resample(audio_tensor, orig_freq=sample_rate, new_freq=16000).numpy()
 
         transcription = pipe({"sampling_rate": 16000, "raw": resampled_audio})["text"]
 
@@ -88,14 +91,14 @@ def transcribe(inputs, previous_transcription):
         print(f"Error: {e}")
         return previous_transcription, "Error"
 
-# ✅ 6️⃣ Clear Function
+# ✅ 8️⃣ Clear Function
 def clear():
     return ""
 
-# ✅ 7️⃣ Gradio Interface (Microphone Streaming)
+# ✅ 9️⃣ Gradio Interface (Microphone Streaming)
 with gr.Blocks() as microphone:
-    gr.Markdown(f"# Whisper Tiny - Real-Time Transcription (CPU) 🎙️")
-    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text.")
+    gr.Markdown(f"# Whisper Small - Real-Time Transcription (Optimized CPU) 🎙️")
+    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text with better accuracy.")
 
     with gr.Row():
         input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
@@ -112,10 +115,10 @@ with gr.Blocks() as microphone:
     )
     clear_button.click(clear, outputs=[output])
 
-# ✅ 8️⃣ Gradio Interface (File Upload)
+# ✅ 🔟 Gradio Interface (File Upload)
 with gr.Blocks() as file:
     gr.Markdown(f"# Upload Audio File for Transcription 🎵")
-    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for speech-to-text.")
+    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")
 
     with gr.Row():
         input_audio = gr.Audio(sources=["upload"], type="numpy")
@@ -129,10 +132,10 @@ with gr.Blocks() as file:
     submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
     clear_button.click(clear, outputs=[output])
 
-# ✅ 9️⃣ Final Gradio App (Supports Microphone & File Upload)
+# ✅ 1️⃣1️⃣ Final Gradio App
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])
 
-# ✅ 1️⃣0️⃣ Run Gradio Locally
+# ✅ 1️⃣2️⃣ Run Gradio Locally
 if __name__ == "__main__":
     demo.launch()
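
A few reviewer notes on this change, with sketches; none of the code below is part of the commit.

First, the 8-bit path: `BitsAndBytesConfig(load_in_8bit=True)` is backed by the bitsandbytes library, which targets CUDA GPUs, so on a CPU-only Space the load will likely fail, and recent transformers versions also reject `model.to(device)` on an 8-bit-quantized model. If int8 on CPU is the goal, PyTorch's dynamic quantization is the usual route; a minimal sketch, assuming the same model as the diff (the `quantize_dynamic` swap is my suggestion, not the commit's):

```python
import torch
from transformers import AutoModelForSpeechSeq2Seq

# Load in full fp32 on CPU first...
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-small", torch_dtype=torch.float32, use_safetensors=True
)
# ...then quantize the Linear layers to int8 weights (activations stay fp32).
# Dynamic quantization runs on CPU, which bitsandbytes 8-bit loading does not.
model = torch.ao.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
model.eval()
```

Whether the quantized model still plays well with the `pipeline` wrapper and beam search should be spot-checked; the decoder's Linear layers are where both the savings and any accuracy cost concentrate.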
 
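On the resampling change: `torchaudio.functional.resample` and `torchaudio.transforms.Resample` share the same implementation, so their outputs are identical; the module form caches its kernel at construction, which only pays off if the object is reused. Since the old code rebuilt the `Resample` object on every chunk anyway, switching to the functional call is the equivalent, tidier choice. A quick check:

```python
import torch
import torchaudio

sr_in, sr_out = 48_000, 16_000
wave = torch.randn(sr_in)  # one second of dummy audio at 48 kHz

# One-shot functional call: the kernel is recomputed on each invocation.
a = torchaudio.functional.resample(wave, orig_freq=sr_in, new_freq=sr_out)

# Module form: the kernel is built once, so this is the better fit for a
# streaming loop where the input rate never changes between chunks.
resampler = torchaudio.transforms.Resample(orig_freq=sr_in, new_freq=sr_out)
b = resampler(wave)

print(torch.allclose(a, b))  # True: same kernel either way
```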
 
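Present in both versions rather than introduced here: `y /= np.max(np.abs(y))` in `stream_transcribe` divides by zero on an all-silent chunk, turning the whole buffer into NaNs. A cheap guard (sketch, not in the commit):

```python
import numpy as np

def peak_normalize(y: np.ndarray) -> np.ndarray:
    """Peak-normalize a chunk; pass silence through instead of emitting NaNs."""
    peak = np.max(np.abs(y))
    return y / peak if peak > 0 else y
```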
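
On `generate_kwargs={"num_beams": 5, "language": "en"}`: beam search roughly quintuples decoder compute per chunk, which works against the latency the streaming tab is chasing. The ASR pipeline also accepts `generate_kwargs` per call, so the file tab could keep beam search while the microphone tab falls back to greedy decoding. A sketch of the call inside `stream_transcribe`, reusing the diff's names:

```python
# Greedy decoding for the latency-sensitive streaming path; the pipeline's
# construction-time default (num_beams=5) still applies to file uploads.
transcription = pipe(
    {"sampling_rate": 16000, "raw": stream},
    generate_kwargs={"num_beams": 1},
)["text"]
```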
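
Last, `stream = np.concatenate([stream, y_resampled])` grows without bound, and every new chunk re-transcribes the entire history, so latency climbs over a long session regardless of model choice. Capping the buffer to a trailing window is one common mitigation, at the cost of dropping old audio from each pass (sketch; `MAX_SECONDS` is an invented knob):

```python
import numpy as np

TARGET_SR = 16000
MAX_SECONDS = 30  # invented knob: how much history each transcription sees

def append_chunk(stream, chunk: np.ndarray) -> np.ndarray:
    """Append a resampled chunk, keeping only the trailing window."""
    stream = chunk if stream is None else np.concatenate([stream, chunk])
    return stream[-TARGET_SR * MAX_SECONDS:]
```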