Emmanuel08 committed on
Commit
a612649
·
verified ·
1 Parent(s): e8b3dee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -8
app.py CHANGED
@@ -6,12 +6,12 @@ import numpy as np
6
  import scipy.io.wavfile
7
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
8
 
9
- # ✅ 1️⃣ Use "whisper-small" for better accuracy
10
  device = "cpu"
11
  torch_dtype = torch.float32
12
- MODEL_NAME = "openai/whisper-small"
13
 
14
- # ✅ 2️⃣ Load Whisper Model on CPU (Removed bitsandbytes)
15
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
16
  MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
17
  )
@@ -29,10 +29,10 @@ pipe = pipeline(
29
  model=model,
30
  tokenizer=processor.tokenizer,
31
  feature_extractor=processor.feature_extractor,
32
- chunk_length_s=5, # βœ… Better balance between speed & accuracy
33
  torch_dtype=torch_dtype,
34
  device=device,
35
- generate_kwargs={"num_beams": 5, "language": "en"}, # βœ… Beam search for better accuracy
36
  )
37
 
38
  # ✅ 5️⃣ Real-Time Streaming Transcription (Microphone)
@@ -48,7 +48,7 @@ def stream_transcribe(stream, new_chunk):
48
  y = y.astype(np.float32)
49
  y /= np.max(np.abs(y))
50
 
51
- # ✅ Resample audio using optimized torchaudio method
52
  y_tensor = torch.tensor(y)
53
  y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
54
 
@@ -96,8 +96,8 @@ def clear():
96
 
97
  # ✅ 8️⃣ Gradio Interface (Microphone Streaming)
98
  with gr.Blocks() as microphone:
99
- gr.Markdown(f"# Whisper Small - Real-Time Transcription (Optimized CPU) πŸŽ™οΈ")
100
- gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text with better accuracy.")
101
 
102
  with gr.Row():
103
  input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
 
6
  import scipy.io.wavfile
7
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
8
 
9
+ # ✅ 1️⃣ Use "whisper-medium" for the best balance of speed & accuracy
10
  device = "cpu"
11
  torch_dtype = torch.float32
12
+ MODEL_NAME = "openai/whisper-medium"
13
 
14
+ # ✅ 2️⃣ Load Whisper Model on CPU
15
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
16
  MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
17
  )
 
29
  model=model,
30
  tokenizer=processor.tokenizer,
31
  feature_extractor=processor.feature_extractor,
32
+ chunk_length_s=10, # βœ… Longer chunks for better accuracy
33
  torch_dtype=torch_dtype,
34
  device=device,
35
+ generate_kwargs={"num_beams": 5, "language": "en", "temperature": 0.1}, # βœ… Beam search + English
36
  )
37
 
38
  # ✅ 5️⃣ Real-Time Streaming Transcription (Microphone)
 
48
  y = y.astype(np.float32)
49
  y /= np.max(np.abs(y))
50
 
51
+ # ✅ Resample audio to 16kHz using torchaudio
52
  y_tensor = torch.tensor(y)
53
  y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()
54
 
 
96
 
97
  # ✅ 8️⃣ Gradio Interface (Microphone Streaming)
98
  with gr.Blocks() as microphone:
99
+ gr.Markdown(f"# Whisper Medium - High Accuracy Transcription (Optimized CPU) πŸŽ™οΈ")
100
+ gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for best speech-to-text performance.")
101
 
102
  with gr.Row():
103
  input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)