File size: 4,889 Bytes
32b6530 f33e6ad 32b6530 f33e6ad 32b6530 d63bba0 32b6530 a612649 32b6530 d63bba0 a612649 32b6530 a612649 32b6530 d63bba0 32b6530 d63bba0 32b6530 f33e6ad 32b6530 a612649 32b6530 a612649 32b6530 d63bba0 32b6530 826b8e9 32b6530 a612649 826b8e9 f33e6ad 826b8e9 f33e6ad 826b8e9 d63bba0 826b8e9 f33e6ad 826b8e9 f33e6ad 826b8e9 d63bba0 826b8e9 d63bba0 826b8e9 a612649 826b8e9 d63bba0 826b8e9 f33e6ad 826b8e9 d63bba0 826b8e9 d63bba0 826b8e9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import torch
import torchaudio
import gradio as gr
import time
import numpy as np
import scipy.io.wavfile
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# ✅ 1️⃣ Use "whisper-medium" for the best balance of speed & accuracy
# Everything below runs on the CPU in full (float32) precision.
device = "cpu"
torch_dtype = torch.float32
MODEL_NAME = "openai/whisper-medium"

# ✅ 2️⃣ Load Whisper Model on CPU
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

# ✅ 3️⃣ Speed up execution with torch.compile()
# NOTE(review): the compiled wrapper (not the raw model) is what gets handed to
# pipeline() below — confirm the installed transformers version accepts it and
# that compilation actually pays off on this CPU.
model = torch.compile(model)  # ✅ Faster inference on CPU

# ✅ 4️⃣ Load Processor & Pipeline
processor = AutoProcessor.from_pretrained(MODEL_NAME)
# Pin the feature extractor to 16 kHz, the rate both transcription functions
# resample to before calling the pipeline.
processor.feature_extractor.sampling_rate = 16000  # ✅ Set correct sampling rate

# Module-level speech-recognition pipeline shared by stream_transcribe() and
# transcribe(); it reuses the tokenizer and feature extractor loaded above.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,  # ✅ Longer chunks for better accuracy
    torch_dtype=torch_dtype,
    device=device,
    # NOTE(review): temperature is set together with num_beams. In recent
    # transformers releases Whisper's generate() appears to treat
    # temperature > 0 as sampling, which may bypass beam search — confirm
    # against the installed version.
    generate_kwargs={"num_beams": 5, "language": "en", "temperature": 0.1},  # ✅ Beam search + English
)
# ✅ 5️⃣ Real-Time Streaming Transcription (Microphone)
def stream_transcribe(stream, new_chunk):
    """Transcribe the microphone audio accumulated so far, including ``new_chunk``.

    Args:
        stream: Mono float32 audio at 16 kHz accumulated by earlier calls, or
            ``None`` on the first call.
        new_chunk: ``(sample_rate, samples)`` tuple for the newest recording
            chunk; ``samples`` is a NumPy array, 1-D or with channels on axis 1.

    Returns:
        A ``(stream, transcription, latency)`` tuple: the updated audio buffer,
        the text recognised for the whole buffer, and the elapsed time
        formatted as ``"<seconds> sec"``. On failure the buffer is returned as
        it stood when the error occurred, the transcription slot carries the
        error message, and the latency slot is ``"Error"``.
    """
    start_time = time.time()
    try:
        sr, y = new_chunk

        # ✅ Convert stereo to mono
        if y.ndim > 1:
            y = y.mean(axis=1)
        y = y.astype(np.float32)

        # Peak-normalise, but leave empty or all-zero (silent) chunks alone:
        # dividing by a zero peak would write NaNs into the accumulated buffer
        # for good, and np.max raises on an empty array.
        peak = float(np.max(np.abs(y))) if y.size else 0.0
        if peak > 0.0:
            y /= peak

        # ✅ Resample audio to 16kHz using torchaudio
        y_tensor = torch.tensor(y)
        y_resampled = torchaudio.functional.resample(
            y_tensor, orig_freq=sr, new_freq=16000
        ).numpy()

        # ✅ Append to Stream
        if stream is not None:
            stream = np.concatenate([stream, y_resampled])
        else:
            stream = y_resampled

        # ✅ Run Transcription with Optimized Parameters
        # The whole buffer is re-transcribed on every call, so the returned
        # text always covers everything recorded so far.
        transcription = pipe({"sampling_rate": 16000, "raw": stream})["text"]
        latency = time.time() - start_time
        return stream, transcription, f"{latency:.2f} sec"
    except Exception as e:
        # UI boundary: report the problem in the textbox instead of crashing
        # the streaming event.
        print(f"Error: {e}")
        return stream, str(e), "Error"
# ✅ 6️⃣ Transcription for File Upload
def transcribe(inputs, previous_transcription):
    """Transcribe an uploaded clip and append the text to the running transcript.

    Args:
        inputs: ``(sample_rate, samples)`` tuple; ``samples`` is a NumPy array,
            1-D or with channels on axis 1.
        previous_transcription: Text already shown in the output box; the new
            transcription is appended to it.

    Returns:
        A ``(transcription, latency)`` tuple. On failure the previous
        transcription is returned unchanged and the latency slot is
        ``"Error"``.
    """
    start_time = time.time()
    try:
        # ✅ Convert file input to correct format
        sample_rate, audio_data = inputs

        # Down-mix to mono and convert to float32 before resampling: the
        # resampler needs a floating-point waveform with time on the last
        # axis, and the pipeline expects single-channel audio.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        audio_data = audio_data.astype(np.float32)

        # Peak-normalise; skip empty or silent input so a zero peak cannot
        # turn the samples into NaNs.
        peak = float(np.max(np.abs(audio_data))) if audio_data.size else 0.0
        if peak > 0.0:
            audio_data /= peak

        # ✅ Resample using torchaudio (optimized)
        audio_tensor = torch.tensor(audio_data)
        resampled_audio = torchaudio.functional.resample(
            audio_tensor, orig_freq=sample_rate, new_freq=16000
        ).numpy()

        transcription = pipe({"sampling_rate": 16000, "raw": resampled_audio})["text"]
        previous_transcription += transcription
        latency = time.time() - start_time
        return previous_transcription, f"{latency:.2f} sec"
    except Exception as e:
        # UI boundary: keep the existing transcript and flag the failure in
        # the latency box.
        print(f"Error: {e}")
        return previous_transcription, "Error"
# ✅ 7️⃣ Clear Function
def clear():
    """Return an empty string, suitable for blanking a text output."""
    cleared_text = ""
    return cleared_text
# ✅ 8️⃣ Gradio Interface (Microphone Streaming)
# Microphone tab: streams recorded chunks into stream_transcribe() and shows
# the running transcript plus the time the last call took.
with gr.Blocks() as microphone:
    gr.Markdown("# Whisper Medium - High Accuracy Transcription (Optimized CPU) 🎙️")
    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for best speech-to-text performance.")
    with gr.Row():
        input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
        output = gr.Textbox(label="Live Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")
    with gr.Row():
        clear_button = gr.Button("Clear Output")

    # Holds the audio buffer that stream_transcribe() returns and receives
    # back on the next chunk.
    state = gr.State()
    input_audio_microphone.stream(
        stream_transcribe,
        [state, input_audio_microphone],
        [state, output, latency_textbox],
        time_limit=30,
        stream_every=1,
    )

    # Reset the accumulated audio together with the textbox; clearing only the
    # text would let the next chunk re-transcribe the old audio and bring the
    # cleared transcript straight back.
    clear_button.click(lambda: (None, ""), outputs=[state, output])
# ✅ 9️⃣ Gradio Interface (File Upload)
# Upload tab: sends an uploaded clip through transcribe() and appends the
# result to whatever text is already in the output box.
with gr.Blocks() as file:
    gr.Markdown("# Upload Audio File for Transcription 🎵")
    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")
    with gr.Row():
        input_audio = gr.Audio(sources=["upload"], type="numpy")
        output = gr.Textbox(label="Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")
    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Output")

    # The current textbox content is passed in as the previous transcription,
    # so repeated submissions accumulate text until it is cleared.
    submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
    clear_button.click(clear, outputs=[output])
# ✅ 🔟 Final Gradio App
# Top-level app: the two interfaces above, shown as tabs under the Ocean theme.
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface(
        interface_list=[microphone, file],
        tab_names=["Microphone", "Upload Audio"],
    )

# ✅ 1️⃣1️⃣ Run Gradio Locally
if __name__ == "__main__":
    demo.launch()
|