Spaces:
Sleeping
Sleeping
from pprint import pformat | |
import gradio as gr | |
import librosa | |
from huggingface_hub import hf_hub_download | |
from pipeline import PreTrainedPipeline | |
# Hub repository id: used both as the model path for the pipeline and as the
# repository the language model file is downloaded from.
HF_HUB_URL = "ales/wav2vec2-cv-be"

# Location of the language model file inside the repository above.
LM_HUB_FP = "language_model/cv8be_5gram.bin"

# Sampling rate (Hz) that audio is resampled to before recognition: 16kHz.
MODEL_SAMPLING_RATE = 16_000

# Fetch the language model file from the HF Hub.
lm_fp = hf_hub_download(filename=LM_HUB_FP, repo_id=HF_HUB_URL)

# Build the speech recognition pipeline once, at import time, so that every
# request handled by `main` reuses the same instance.
pipeline = PreTrainedPipeline(language_model_fp=lm_fp, model_path=HF_HUB_URL)
def main(recorded_audio_fp: str | None, uploaded_audio_fp: str | None): | |
audio_fp = None | |
if recorded_audio_fp is not None: | |
audio_fp = recorded_audio_fp | |
used_audiofile = "recorded" | |
elif uploaded_audio_fp is not None: | |
audio_fp = uploaded_audio_fp | |
used_audiofile = "uploaded" | |
else: | |
return ( | |
"Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.", | |
"Error! You have to either record or upload an audiofile.", | |
) | |
# read audio file | |
inputs = librosa.load(audio_fp, sr=MODEL_SAMPLING_RATE, mono=True)[0] | |
# recognize speech | |
pipeline_res = pipeline(inputs=inputs) | |
text = pipeline_res["text"][0] # unpack batch of size 1 | |
# add technical information to the output | |
tech_data = pipeline_res | |
del tech_data["text"] | |
tech_data["used_audiofile"] = used_audiofile | |
tech_data["recorded_file_present"] = recorded_audio_fp is not None | |
tech_data["uploaded_file_present"] = uploaded_audio_fp is not None | |
tech_data["audiofile_path"] = audio_fp | |
tech_data["model_sampling_rate"] = MODEL_SAMPLING_RATE | |
tech_data["inputs_shape"] = inputs.shape | |
tech_data["inputs_max"] = inputs.max().item() | |
tech_data["inputs_min"] = inputs.min().item() | |
tech_data_str = pformat(tech_data) | |
return text, tech_data_str | |
# Markdown passed to the interface as its `article`.
article = """
The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/ales/wav2vec2-cv-be)

"""

# Two alternative audio sources. Both hand a file path to `main`, in the same
# order as its parameters: the microphone recording first, the upload second.
audio_inputs = [
    gr.Audio(
        sources=["microphone"],
        type="filepath",
        label="Запішыце аўдыяфайл, каб распазнаць маўленьне",
    ),
    gr.Audio(
        sources=["upload"],
        type="filepath",
        label="Альбо загрузіце ўжо запісаны аўдыяфайл сюды",
    ),
]

# One textbox per element of the tuple returned by `main`.
text_outputs = [
    gr.Textbox(label="Распазнаны тэкст"),
    gr.Textbox(label="Тэхнічная інфармацыя"),
]

iface = gr.Interface(
    fn=main,
    inputs=audio_inputs,
    outputs=text_outputs,
    title="wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model",
    description=(
        "Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n"
        "Акустычная мадэль + моўная мадэль."
    ),
    article=article,
)

# Start serving the interface.
iface.launch()