Emmanuel08 committed on
Commit
d63bba0
·
verified ·
1 Parent(s): f33e6ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -20
app.py CHANGED
@@ -4,39 +4,38 @@ import gradio as gr
4
  import time
5
  import numpy as np
6
  import scipy.io.wavfile
7
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, BitsAndBytesConfig
8
 
9
- # ✅ 1️⃣ Optimize Model Selection
10
  device = "cpu"
11
- torch_dtype = torch.float32 # Use CPU-friendly float type
12
- MODEL_NAME = "openai/whisper-small" # ✅ Switched to "small" for better accuracy
13
 
14
- # ✅ 2️⃣ Enable Quantization (Reduces Memory Usage, Speeds Up Inference)
15
- quantization_config = BitsAndBytesConfig(load_in_8bit=True)
16
-
17
- # ✅ 3️⃣ Load Whisper Model on CPU with Optimized Settings
18
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
19
- MODEL_NAME, quantization_config=quantization_config, torch_dtype=torch_dtype, use_safetensors=True
20
  )
21
  model.to(device)
22
 
23
- # ✅ 4️⃣ Load Processor & Set Default Sampling Rate
 
 
 
24
  processor = AutoProcessor.from_pretrained(MODEL_NAME)
25
  processor.feature_extractor.sampling_rate = 16000 # ✅ Set correct sampling rate
26
 
27
- # ✅ 5️⃣ Optimized Pipeline with Beam Search for Better Accuracy
28
  pipe = pipeline(
29
  task="automatic-speech-recognition",
30
  model=model,
31
  tokenizer=processor.tokenizer,
32
  feature_extractor=processor.feature_extractor,
33
- chunk_length_s=5, # ✅ Increase chunk size for better performance
34
  torch_dtype=torch_dtype,
35
  device=device,
36
  generate_kwargs={"num_beams": 5, "language": "en"}, # ✅ Beam search for better accuracy
37
  )
38
 
39
- # ✅ 6️⃣ Real-Time Streaming Transcription (Microphone)
40
  def stream_transcribe(stream, new_chunk):
41
  start_time = time.time()
42
  try:
@@ -49,7 +48,7 @@ def stream_transcribe(stream, new_chunk):
49
  y = y.astype(np.float32)
50
  y /= np.max(np.abs(y))
51
 
52
- # ✅ Resample audio to 16kHz using optimized torchaudio method
53
  y_tensor = torch.tensor(y)
54
  y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
55
 
@@ -69,7 +68,7 @@ def stream_transcribe(stream, new_chunk):
69
  print(f"Error: {e}")
70
  return stream, str(e), "Error"
71
 
72
- # ✅ 7️⃣ Transcription for File Upload
73
  def transcribe(inputs, previous_transcription):
74
  start_time = time.time()
75
  try:
@@ -91,11 +90,11 @@ def transcribe(inputs, previous_transcription):
91
  print(f"Error: {e}")
92
  return previous_transcription, "Error"
93
 
94
- # ✅ 8️⃣ Clear Function
95
  def clear():
96
  return ""
97
 
98
- # ✅ 9️⃣ Gradio Interface (Microphone Streaming)
99
  with gr.Blocks() as microphone:
100
  gr.Markdown(f"# Whisper Small - Real-Time Transcription (Optimized CPU) 🎙️")
101
  gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text with better accuracy.")
@@ -115,7 +114,7 @@ with gr.Blocks() as microphone:
115
  )
116
  clear_button.click(clear, outputs=[output])
117
 
118
- # ✅ 🔟 Gradio Interface (File Upload)
119
  with gr.Blocks() as file:
120
  gr.Markdown(f"# Upload Audio File for Transcription 🎡")
121
  gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")
@@ -132,10 +131,10 @@ with gr.Blocks() as file:
132
  submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
133
  clear_button.click(clear, outputs=[output])
134
 
135
- # ✅ 1️⃣1️⃣ Final Gradio App
136
  with gr.Blocks(theme=gr.themes.Ocean()) as demo:
137
  gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])
138
 
139
- # ✅ 1️⃣2️⃣ Run Gradio Locally
140
  if __name__ == "__main__":
141
  demo.launch()
 
4
  import time
5
  import numpy as np
6
  import scipy.io.wavfile
7
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
8
 
9
+ # ✅ 1️⃣ Use "whisper-small" for better accuracy
10
  device = "cpu"
11
+ torch_dtype = torch.float32
12
+ MODEL_NAME = "openai/whisper-small"
13
 
14
+ # ✅ 2️⃣ Load Whisper Model on CPU (Removed bitsandbytes)
 
 
 
15
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
16
+ MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
17
  )
18
  model.to(device)
19
 
20
+ # ✅ 3️⃣ Speed up execution with torch.compile()
21
+ model = torch.compile(model) # ✅ Faster inference on CPU
22
+
23
+ # ✅ 4️⃣ Load Processor & Pipeline
24
  processor = AutoProcessor.from_pretrained(MODEL_NAME)
25
  processor.feature_extractor.sampling_rate = 16000 # ✅ Set correct sampling rate
26
 
 
27
  pipe = pipeline(
28
  task="automatic-speech-recognition",
29
  model=model,
30
  tokenizer=processor.tokenizer,
31
  feature_extractor=processor.feature_extractor,
32
+ chunk_length_s=5, # ✅ Better balance between speed & accuracy
33
  torch_dtype=torch_dtype,
34
  device=device,
35
  generate_kwargs={"num_beams": 5, "language": "en"}, # ✅ Beam search for better accuracy
36
  )
37
 
38
+ # ✅ 5️⃣ Real-Time Streaming Transcription (Microphone)
39
  def stream_transcribe(stream, new_chunk):
40
  start_time = time.time()
41
  try:
 
48
  y = y.astype(np.float32)
49
  y /= np.max(np.abs(y))
50
 
51
+ # ✅ Resample audio using optimized torchaudio method
52
  y_tensor = torch.tensor(y)
53
  y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
54
 
 
68
  print(f"Error: {e}")
69
  return stream, str(e), "Error"
70
 
71
+ # ✅ 6️⃣ Transcription for File Upload
72
  def transcribe(inputs, previous_transcription):
73
  start_time = time.time()
74
  try:
 
90
  print(f"Error: {e}")
91
  return previous_transcription, "Error"
92
 
93
+ # ✅ 7️⃣ Clear Function
94
  def clear():
95
  return ""
96
 
97
+ # ✅ 8️⃣ Gradio Interface (Microphone Streaming)
98
  with gr.Blocks() as microphone:
99
  gr.Markdown(f"# Whisper Small - Real-Time Transcription (Optimized CPU) 🎙️")
100
  gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text with better accuracy.")
 
114
  )
115
  clear_button.click(clear, outputs=[output])
116
 
117
+ # ✅ 9️⃣ Gradio Interface (File Upload)
118
  with gr.Blocks() as file:
119
  gr.Markdown(f"# Upload Audio File for Transcription 🎡")
120
  gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")
 
131
  submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
132
  clear_button.click(clear, outputs=[output])
133
 
134
+ # ✅ 🔟 Final Gradio App
135
  with gr.Blocks(theme=gr.themes.Ocean()) as demo:
136
  gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])
137
 
138
+ # ✅ 1️⃣1️⃣ Run Gradio Locally
139
  if __name__ == "__main__":
140
  demo.launch()