ashantharosary committed on
Commit e8d2580 · verified · 1 Parent(s): 2ec3f86

Update app.py

Files changed (1): app.py +180 -120
app.py CHANGED
@@ -3,144 +3,204 @@ import torch
  import librosa
  import numpy as np
  import tempfile
- from transformers import AutoModelForCTC, Wav2Vec2Processor, WhisperForConditionalGeneration, WhisperProcessor
- from pyctcdecode import build_ctcdecoder
- from huggingface_hub import hf_hub_download
  from jiwer import wer
- import json
- import gzip
- import shutil
  import os
  from pydub import AudioSegment
- import torchaudio
- import re
  import time

- st.set_page_config(page_title="Rojak STT", layout="centered")
- st.title("🎙️ Bahasa Rojak Malaysia Speech-to-Text")
-
- # Sidebar: Model selector
- model_choice = st.sidebar.selectbox("Choose Model", ["wav2vec2", "whisper"])
-
- # Session State
- for key in ["audio_bytes", "audio_path", "ground_truth", "wer_value", "predicted_text"]:
-     if key not in st.session_state:
-         st.session_state[key] = None if key in ["audio_bytes", "audio_path", "wer_value"] else ""

- # Tabs
  tab1, tab2 = st.tabs(["📁 Upload Audio", "🎤 Record Audio"])

- # Tab 1: Upload
  with tab1:
-     uploaded_file = st.file_uploader("Upload .wav or .mp3", type=["wav", "mp3", "flac", "m4a", "ogg"])
      if uploaded_file:
-         st.session_state.audio_bytes = uploaded_file.read()
-         ext = uploaded_file.name.split(".")[-1]
-         with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as tmp:
-             tmp.write(st.session_state.audio_bytes)
-             st.session_state.audio_path = tmp.name
-
-         if ext == "mp3":
-             audio = AudioSegment.from_mp3(st.session_state.audio_path)
-             wav_path = st.session_state.audio_path.replace(".mp3", ".wav")
-             audio.export(wav_path, format="wav")
-             st.session_state.audio_path = wav_path
-
-         librosa.load(st.session_state.audio_path, sr=16000)
-         st.audio(st.session_state.audio_bytes, format="audio/wav")
-
- # Tab 2: Record
  with tab2:
-     audio_input = st.audio_input("🎤 Record your audio")
      if audio_input:
-         st.session_state.audio_bytes = audio_input.getvalue()
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-             tmp.write(st.session_state.audio_bytes)
-             st.session_state.audio_path = tmp.name
-         librosa.load(st.session_state.audio_path, sr=16000)
-         st.audio(st.session_state.audio_bytes, format="audio/wav")
-
- # Clear state if no audio
- if not st.session_state.audio_bytes:
-     st.session_state["ground_truth"] = ""
-     st.session_state["predicted_text"] = ""
-     st.session_state["wer_value"] = None
-
- # Ground truth input
- st.session_state["ground_truth"] = st.text_input("Optional: Enter ground truth", value=st.session_state["ground_truth"])
-
- # ---- Loaders ----

  @st.cache_resource
- def load_wav2vec2_model():
-     processor = Wav2Vec2Processor.from_pretrained("mesolitica/wav2vec2-xls-r-300m-mixed")
-     model = AutoModelForCTC.from_pretrained("mesolitica/wav2vec2-xls-r-300m-mixed")
-     model.eval()
-     return processor, model
-
- @st.cache_resource
- def load_decoder():
-     vocab_path = hf_hub_download("ashantharosary/wav2vec2-ngram-finetuned", "vocab.json", repo_type="model")
-     with open(vocab_path, "r") as f:
-         vocab_dict = json.load(f)
-     vocab_list = [k.lower() for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])]
-     arpa_gz_path = hf_hub_download("ashantharosary/wav2vec2-ngram-finetuned", "4gram.arpa.gz", repo_type="model")
-     arpa_path = "4gram.arpa"
-     if not os.path.exists(arpa_path):
-         with gzip.open(arpa_gz_path, 'rb') as f_in, open(arpa_path, 'wb') as f_out:
-             shutil.copyfileobj(f_in, f_out)
-     return build_ctcdecoder(vocab_list, kenlm_model_path=arpa_path, alpha=0.2, beta=1.0)

  @st.cache_resource
- def load_whisper_model():
-     model = WhisperForConditionalGeneration.from_pretrained("wy0909/Whisper-MixedLanguageModel")
-     processor = WhisperProcessor.from_pretrained("wy0909/Whisper-MixedLanguageModel")
      model.config.forced_decoder_ids = None
      model.generation_config.forced_decoder_ids = None
      model.config.suppress_tokens = []
      return model, processor

- # ---- Transcription ----
-
- def capitalize_sentences(text):
-     sentences = re.split(r'(?<=[.!?]) +', text)
-     return ' '.join([s.strip().capitalize() for s in sentences])
-
- if st.button("📝 Transcribe", disabled=not st.session_state.audio_bytes):
-     start_time = time.time()
-     try:
-         if model_choice == "wav2vec2":
-             processor, model = load_wav2vec2_model()
-             decoder = load_decoder()
-             audio, _ = librosa.load(st.session_state.audio_path, sr=16000)
-             input_values = processor(audio, return_tensors="pt", sampling_rate=16000).input_values
-             with torch.no_grad():
-                 logits = model(input_values).logits[0].cpu().numpy()
-             decoded_ngram = decoder.decode_beams(logits, prune_history=True)
-             text = decoded_ngram[0][0]
-             st.markdown("### 🧠 Transcription (Wav2Vec2 + LM)")
-             st.success(text)
-
-         else:  # whisper
-             model, processor = load_whisper_model()
-             waveform, sr = torchaudio.load(st.session_state.audio_path)
-             waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)
-             inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
-             with torch.no_grad():
-                 predicted_ids = model.generate(inputs["input_features"])
-             text = capitalize_sentences(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])
-             st.markdown("### 🎧 Transcription (Whisper)")
-             st.success(text)
-
-         st.session_state["predicted_text"] = text
-
-         if st.session_state["ground_truth"]:
-             error = wer(st.session_state["ground_truth"].lower(), text.lower())
-             st.session_state["wer_value"] = error
-             st.markdown("### 🧮 Word Error Rate (WER)")
-             st.write(f"WER: `{error:.2f}`")
-
-     except Exception as e:
-         st.error(f"❌ Transcription failed: {str(e)}")
-
-     st.caption(f"🕒 Time taken: {time.time() - start_time:.2f}s")

  import librosa
  import numpy as np
  import tempfile
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
  from jiwer import wer
  import os
  from pydub import AudioSegment
  import time
+ import re
+
+ # Constants
+ WHISPER_FINETUNED = "wy0909/whisper-medium_mixedLanguageModel"
+ WHISPER_PRETRAINED = "openai/whisper-medium"
+ MAX_RECORDING_SECONDS = 12
+
+ def capitalize_sentences(text):
+     sentences = re.split(r'(?<=[.!?]) +', text)
+     capitalized = [s.strip().capitalize() for s in sentences]
+     return ' '.join(capitalized)
+
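+ # e.g. capitalize_sentences("hello there. how are you?") -> "Hello there. How are you?"
+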
+ # Main title
+ st.title("🎙️ Speech-to-Text with Whisper")
+
+ # Session state initialization
+ if "audio_bytes" not in st.session_state:
+     st.session_state.audio_bytes = None
+ if "audio_path" not in st.session_state:
+     st.session_state.audio_path = None
+ if "ground_truth" not in st.session_state:
+     st.session_state.ground_truth = ""
+ if "predicted_text" not in st.session_state:
+     st.session_state.predicted_text = ""
+ if "wer_value" not in st.session_state:
+     st.session_state.wer_value = None
+ if "selected_tab" not in st.session_state:
+     st.session_state.selected_tab = "📁 Upload Audio"
+ if "previous_tab" not in st.session_state:
+     st.session_state.previous_tab = "📁 Upload Audio"
+
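+ # Streamlit reruns the whole script on every interaction, so anything that
+ # must survive a rerun is kept in st.session_state.
+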
+ # Tab Selection
  tab1, tab2 = st.tabs(["📁 Upload Audio", "🎤 Record Audio"])

+ # Reset state if tab is changed
+ if st.session_state.selected_tab != st.session_state.previous_tab:
+     st.session_state.audio_bytes = None
+     st.session_state.audio_path = None
+     st.session_state.ground_truth = ""
+     st.session_state.predicted_text = ""
+     st.session_state.wer_value = None
+     st.session_state.previous_tab = st.session_state.selected_tab
+
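+ # NOTE: st.tabs renders both tab bodies on every rerun, and selected_tab is
+ # only written inside tab2 below, so this reset is best-effort rather than a
+ # true "active tab changed" signal.
+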
+ # Tab 1: Upload Audio
  with tab1:
+     uploaded_file = st.file_uploader("Upload a .wav or .mp3 file", type=["wav", "mp3"])
      if uploaded_file:
+         try:
+             st.session_state.audio_bytes = uploaded_file.read()
+             with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as tmp:
+                 tmp.write(st.session_state.audio_bytes)
+                 st.session_state.audio_path = tmp.name
+
+             if uploaded_file.name.endswith(".mp3"):
+                 audio = AudioSegment.from_mp3(st.session_state.audio_path)
+                 wav_path = st.session_state.audio_path.replace(".mp3", ".wav")
+                 audio.export(wav_path, format="wav")
+                 os.unlink(st.session_state.audio_path)
+                 st.session_state.audio_path = wav_path
+
+             librosa.load(st.session_state.audio_path, sr=16000)
+             st.audio(st.session_state.audio_bytes, format="audio/wav")
+         except Exception as e:
+             st.error(f"❌ Failed to read audio file: {str(e)}")
+             if st.session_state.audio_path and os.path.exists(st.session_state.audio_path):
+                 os.unlink(st.session_state.audio_path)
+             st.session_state.audio_bytes = None
+
+ # Tab 2: Record Audio
  with tab2:
+     st.session_state.selected_tab = "🎤 Record Audio"
+     st.caption(f"Click the microphone below to start recording (max {MAX_RECORDING_SECONDS} seconds)")
+
+     audio_input = st.audio_input("🎙️ Record Audio")
+
      if audio_input:
+         try:
+             # Get the audio bytes in the correct format
+             audio_bytes = audio_input.read() if hasattr(audio_input, 'read') else audio_input.getvalue()
+
+             # Save to a temporary file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+                 tmp.write(audio_bytes)
+                 temp_path = tmp.name
+
+             # Check duration
+             audio_segment = AudioSegment.from_file(temp_path)
+             duration_seconds = len(audio_segment) / 1000
+
+             if duration_seconds > MAX_RECORDING_SECONDS:
+                 st.error(f"❌ Recording too long! Please keep it under {MAX_RECORDING_SECONDS} seconds.")
+                 os.unlink(temp_path)
+             else:
+                 # Store in session state
+                 st.session_state.audio_bytes = audio_bytes
+                 st.session_state.audio_path = temp_path
+
+                 # Validate that the recording is decodable
+                 librosa.load(st.session_state.audio_path, sr=16000)
+
+         except Exception as e:
+             st.error(f"❌ Failed to process recorded audio: {str(e)}")
+             if 'temp_path' in locals() and os.path.exists(temp_path):
+                 os.unlink(temp_path)
+             st.session_state.audio_bytes = None
+             st.session_state.audio_path = None
+
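+ # NOTE: len(audio_segment) is in milliseconds, hence the /1000 in the
+ # duration check above.
+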
+ # Input ground truth for WER
+ st.session_state.ground_truth = st.text_input(
+     "Enter ground truth for WER calculation (Optional)",
+     value=st.session_state.ground_truth,
+     key="ground_truth_input"
+ )
+
+ # Whisper configuration
+ model_choice = st.selectbox(
+     "Select Whisper Model",
+     options=["Fine-tuned Model", "Pretrained Whisper-Medium Model"],
+     help="Choose the Whisper model to transcribe the audio"
+ )

  @st.cache_resource
+ def load_finetuned_model_and_processor():
+     model = WhisperForConditionalGeneration.from_pretrained(
+         WHISPER_FINETUNED,
+         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+         attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
+     )
+     processor = WhisperProcessor.from_pretrained(WHISPER_FINETUNED)
+     model.config.forced_decoder_ids = None
+     model.generation_config.forced_decoder_ids = None
+     model.config.use_cache = None
+     model.config.suppress_tokens = []
+     if torch.cuda.is_available():
+         model = model.to("cuda")
+     return model, processor

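+ # NOTE: "flash_attention_2" assumes the flash-attn package is installed in the
+ # GPU environment; on CPU-only hardware the None branch keeps the default
+ # attention implementation.
+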
  @st.cache_resource
+ def load_pretrained_model_and_processor():
+     model = WhisperForConditionalGeneration.from_pretrained(WHISPER_PRETRAINED)
+     processor = WhisperProcessor.from_pretrained(WHISPER_PRETRAINED)
      model.config.forced_decoder_ids = None
      model.generation_config.forced_decoder_ids = None
+     model.config.use_cache = None
      model.config.suppress_tokens = []
      return model, processor

+ if model_choice == "Fine-tuned Model":
+     model, processor = load_finetuned_model_and_processor()
+ else:
+     model, processor = load_pretrained_model_and_processor()
+
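+ # Both loaders are wrapped in @st.cache_resource, so each model is downloaded
+ # and instantiated once per process and then reused across reruns.
+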
+ # Transcription Button
+ if st.button("📝 Transcribe"):
+     if not st.session_state.audio_bytes:
+         st.error("❌ Please upload or record an audio file first.")
+     else:
+         start_time = time.time()
+         try:
+             audio_input_data, _ = librosa.load(st.session_state.audio_path, sr=16000)
+             input_features = processor(
+                 audio_input_data, sampling_rate=16000, return_tensors="pt"
+             ).input_features
+             # Match the model's device and dtype (the fine-tuned model may be
+             # fp16 on CUDA, while the processor returns fp32 CPU tensors)
+             input_features = input_features.to(model.device, dtype=model.dtype)
+
+             predicted_ids = model.generate(input_features)
+             transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+             transcription = capitalize_sentences(transcription)
+             st.session_state.predicted_text = transcription
+             st.markdown("### 🔊 Predicted Transcription")
+             st.success(st.session_state.predicted_text)
+
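+             # jiwer computes WER = (substitutions + deletions + insertions) /
+             # words in the reference; lowercasing both strings makes the
+             # comparison case-insensitive.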
+             if st.session_state.ground_truth:
+                 st.session_state.wer_value = wer(
+                     st.session_state.ground_truth.lower(),
+                     st.session_state.predicted_text.lower()
+                 )
+                 st.markdown("### 🧮 Word Error Rate (WER)")
+                 st.write(f"WER: `{st.session_state.wer_value * 100:.2f}%`")
+
+         except Exception as e:
+             st.error(f"❌ Transcription failed: {str(e)}")
+
+         finally:
+             # Clean up temporary files and reset state for the next run
+             if st.session_state.audio_path and os.path.exists(st.session_state.audio_path):
+                 os.unlink(st.session_state.audio_path)
+             st.session_state.audio_bytes = None
+             st.session_state.audio_path = None
+             st.session_state.predicted_text = ""
+             st.session_state.ground_truth = ""
+             st.session_state.wer_value = None
+
+         end_time = time.time()
+         duration = end_time - start_time
+         st.caption(f"🕒 Time taken: {duration:.2f}s")
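+
+ # Example: wer("makan apa tu", "makan apa itu") == 1/3, shown as "WER: `33.33%`".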