import os

import gradio as gr
import librosa
import numpy as np
import torch
from huggingface_hub import login
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Authenticate with the Hugging Face Hub using the "token" Space secret
hf_token = os.getenv("token")
if hf_token is None:
    raise ValueError('The "token" environment variable (Space secret) is not set.')
login(token=hf_token)
# Load the fine-tuned Whisper processor and model
processor = AutoProcessor.from_pretrained("ragunath-ravi/whisper-mini-ta")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ragunath-ravi/whisper-mini-ta")

# Move the model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
def transcribe_audio(audio):
    """Transcribe audio using the fine-tuned Whisper model."""
    if audio is None:
        return "No audio provided"

    # Gradio delivers either a file path or a (sample_rate, audio_array) tuple
    if isinstance(audio, str):  # Path to an audio file
        audio_array, sampling_rate = librosa.load(audio, sr=16000)
    else:  # Tuple of (sample_rate, audio_array) from the microphone
        sampling_rate, audio_array = audio

        # Mix stereo down to mono; Whisper expects a single channel
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Scale integer PCM (e.g. int16 from the microphone) to floats in [-1.0, 1.0],
        # using the dtype's own range rather than assuming 16-bit audio
        if np.issubdtype(audio_array.dtype, np.integer):
            audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
        else:
            audio_array = audio_array.astype(np.float32)

        # Resample to the 16 kHz rate the model was trained on
        if sampling_rate != 16000:
            audio_array = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=16000)
            sampling_rate = 16000

    # Extract log-mel input features and run generation on the selected device
    inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = model.generate(input_features=inputs.input_features)

    # Decode the generated token ids into text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription
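
# A minimal smoke test for the tuple input path (a sketch, not part of the app:
# silence carries no speech, so the transcription itself is meaningless; this
# only verifies that preprocessing and generation run end to end):
#
#   one_second_of_silence = (16000, np.zeros(16000, dtype=np.int16))
#   print(transcribe_audio(one_second_of_silence))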
# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Tamil Speech Transcription")
    gr.Markdown(
        "This app transcribes Tamil speech using a fine-tuned Whisper model. "
        "Upload an audio file or record using the microphone."
    )
    with gr.Row():
        with gr.Column():
            # max_length is measured in seconds; recordings are capped at 10 s
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="Audio Input (Record up to 10 seconds)",
                max_length=10,
            )
            transcribe_btn = gr.Button("Transcribe")
        with gr.Column():
            output_text = gr.Textbox(label="Transcription Output", lines=5)
    # Connect components: transcribe on button click and whenever the audio changes
    transcribe_btn.click(fn=transcribe_audio, inputs=audio_input, outputs=output_text)
    audio_input.change(fn=transcribe_audio, inputs=audio_input, outputs=output_text)
    gr.Markdown("## Instructions")
    gr.Markdown("1. Record audio using the microphone or upload an audio file")
    gr.Markdown("2. Click 'Transcribe' to get the Tamil transcription")
    gr.Markdown("3. For best results, speak clearly in Tamil")
# Launch the app (share=True creates a public link when run locally; Spaces ignores it)
if __name__ == "__main__":
    demo.launch(share=True)
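
# Dependency sketch for the Space, inferred from the imports above; the exact
# versions are assumptions and should be pinned to whatever the Space was
# actually built with:
#
#   requirements.txt
#     gradio
#     torch
#     transformers
#     librosa
#     numpy
#     huggingface_hub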