Spaces:

Emmanuel08
/

CCI_Realtime_Transcribing_model

Sleeping

App Files Files Community

CCI_Realtime_Transcribing_model / app.py

Emmanuel08

Update app.py

a612649 verified 2 months ago

raw

history blame contribute delete

4.89 kB

	import torch
	import torchaudio
	import gradio as gr
	import time
	import numpy as np
	import scipy.io.wavfile
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

	# ✅ 1️⃣ Use "whisper-medium" for the best balance of speed & accuracy
	device = "cpu"
	torch_dtype = torch.float32
	MODEL_NAME = "openai/whisper-medium"

	# ✅ 2️⃣ Load Whisper Model on CPU
	model = AutoModelForSpeechSeq2Seq.from_pretrained(
	MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
	)
	model.to(device)

	# ✅ 3️⃣ Speed up execution with torch.compile()
	model = torch.compile(model) # ✅ Faster inference on CPU

	# ✅ 4️⃣ Load Processor & Pipeline
	processor = AutoProcessor.from_pretrained(MODEL_NAME)
	processor.feature_extractor.sampling_rate = 16000 # ✅ Set correct sampling rate

	pipe = pipeline(
	task="automatic-speech-recognition",
	model=model,
	tokenizer=processor.tokenizer,
	feature_extractor=processor.feature_extractor,
	chunk_length_s=10, # ✅ Longer chunks for better accuracy
	torch_dtype=torch_dtype,
	device=device,
	generate_kwargs={"num_beams": 5, "language": "en", "temperature": 0.1}, # ✅ Beam search + English
	)

	# ✅ 5️⃣ Real-Time Streaming Transcription (Microphone)
	def stream_transcribe(stream, new_chunk):
	start_time = time.time()
	try:
	sr, y = new_chunk

	# ✅ Convert stereo to mono
	if y.ndim > 1:
	y = y.mean(axis=1)

	y = y.astype(np.float32)
	y /= np.max(np.abs(y))

	# ✅ Resample audio to 16kHz using torchaudio
	y_tensor = torch.tensor(y)
	y_resampled = torchaudio.functional.resample(y_tensor, orig_freq=sr, new_freq=16000).numpy()

	# ✅ Append to Stream
	if stream is not None:
	stream = np.concatenate([stream, y_resampled])
	else:
	stream = y_resampled

	# ✅ Run Transcription with Optimized Parameters
	transcription = pipe({"sampling_rate": 16000, "raw": stream})["text"]
	latency = time.time() - start_time

	return stream, transcription, f"{latency:.2f} sec"

	except Exception as e:
	print(f"Error: {e}")
	return stream, str(e), "Error"

	# ✅ 6️⃣ Transcription for File Upload
	def transcribe(inputs, previous_transcription):
	start_time = time.time()
	try:
	# ✅ Convert file input to correct format
	sample_rate, audio_data = inputs

	# ✅ Resample using torchaudio (optimized)
	audio_tensor = torch.tensor(audio_data)
	resampled_audio = torchaudio.functional.resample(audio_tensor, orig_freq=sample_rate, new_freq=16000).numpy()

	transcription = pipe({"sampling_rate": 16000, "raw": resampled_audio})["text"]

	previous_transcription += transcription
	latency = time.time() - start_time

	return previous_transcription, f"{latency:.2f} sec"

	except Exception as e:
	print(f"Error: {e}")
	return previous_transcription, "Error"

	# ✅ 7️⃣ Clear Function
	def clear():
	return ""

	# ✅ 8️⃣ Gradio Interface (Microphone Streaming)
	with gr.Blocks() as microphone:
	gr.Markdown(f"# Whisper Medium - High Accuracy Transcription (Optimized CPU) 🎙️")
	gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for best speech-to-text performance.")

	with gr.Row():
	input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
	output = gr.Textbox(label="Live Transcription", value="")
	latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

	with gr.Row():
	clear_button = gr.Button("Clear Output")

	state = gr.State()
	input_audio_microphone.stream(
	stream_transcribe, [state, input_audio_microphone],
	[state, output, latency_textbox], time_limit=30, stream_every=1
	)
	clear_button.click(clear, outputs=[output])

	# ✅ 9️⃣ Gradio Interface (File Upload)
	with gr.Blocks() as file:
	gr.Markdown(f"# Upload Audio File for Transcription 🎵")
	gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for better transcription accuracy.")

	with gr.Row():
	input_audio = gr.Audio(sources=["upload"], type="numpy")
	output = gr.Textbox(label="Transcription", value="")
	latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

	with gr.Row():
	submit_button = gr.Button("Submit")
	clear_button = gr.Button("Clear Output")

	submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
	clear_button.click(clear, outputs=[output])

	# ✅ 🔟 Final Gradio App
	with gr.Blocks(theme=gr.themes.Ocean()) as demo:
	gr.TabbedInterface([microphone, file], ["Microphone", "Upload Audio"])

	# ✅ 1️⃣1️⃣ Run Gradio Locally
	if __name__ == "__main__":
	demo.launch()