open_universal_arabic_asr_leaderboard_all

Running

open_universal_arabic_asr_leaderboard_all / app.py

Mahmoud Salhab

updating last update msg

c71f75e 10 days ago

8.45 kB

	import gradio as gr
	import pandas as pd

	# Location of the banner image and the HTML snippet that centers it at the
	# top of the page.
	banner_url = (
	    "https://huggingface.co/spaces/elmresearchcenter/"
	    "open_universal_arabic_asr_leaderboard/resolve/main/banner.png"
	)
	BANNER = (
	    '<div style="display: flex; justify-content: space-around;">'
	    f'<img src="{banner_url}" alt="Banner" '
	    'style="width: 20vw; min-width: 300px; max-width: 600px;"> '
	    "</div>"
	)

	# Markdown introduction rendered with gr.Markdown directly below the banner.
	# The literal is not a raw string, so each leading "\n" escape adds an extra
	# newline in front of its sentence (a blank line in the resulting text).
	INTRODUCTION_TEXT = """
	📖Open Universal Arabic ASR Leaderboard📖 benchmarks multi-dialect Arabic ASR models on various multi-dialect datasets.
	\nApart from the WER%/CER% for each test set, we also report the Average WER%/CER% and rank the models based on the Average WER, from lowest to highest.
	\nTo reproduce the benchmark numbers and request a model that is not listed, you can launch an issue/PR in our [GitHub repo](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard)😊.
	\nFor more detailed analysis such as models' robustness, speaker adaption, model efficiency and memory usage, please check our [paper](https://arxiv.org/pdf/2412.13788).
	"""

	# BibTeX entry for the leaderboard paper; used as the value of the
	# copyable citation textbox at the bottom of the page.
	CITATION_BUTTON_TEXT = """
	@article{wang2024open,
	title={Open Universal Arabic ASR Leaderboard},
	author={Wang, Yingzhi and Alhmoud, Anas and Alqurishi, Muhammad},
	journal={arXiv preprint arXiv:2412.13788},
	year={2024}
	}
	"""

	# Markdown body of the "Metrics" tab: the reported metrics, how to reproduce
	# the numbers, the benchmark dataset table and a pointer to the paper.
	# The table rows use plain "|" column separators: a backslash-escaped pipe is
	# not a valid Python string escape and, in Markdown, is a literal pipe rather
	# than a cell delimiter, so the table would not render.
	# The leading "\n" escapes are intentional; they insert an extra newline in
	# front of the sentence they prefix.
	METRICS_TAB_TEXT = """
	## Metrics
	We report both the Word Error Rate (WER) and Character Error Rate (CER) metrics.
	## Reproduction
	The Open Universal Arabic ASR Leaderboard will be a continuous benchmark project.
	\nWe open-source the evaluation scripts at our [GitHub repo](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard).
	\nPlease launch a discussion in our GitHub repo to let us know if you want to learn about the performance of a new model.

	## Benchmark datasets
	| Test Set | Num Dialects | Test (h) |
	|-------------------------------------------------------------------------------------------------|----------------|-------------|
	| [SADA](https://www.kaggle.com/datasets/sdaiancai/sada2022) | 10 | 10.7 |
	| [Common Voice 18.0](https://commonvoice.mozilla.org/en/datasets) | 25 | 12.6 |
	| [MASC (Clean-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus) | 7 | 10.5 |
	| [MASC (Noisy-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus) | 8 | 14.9 |
	| [MGB-2](http://www.mgb-challenge.org/MGB-2.html) | Unspecified | 9.6 |
	| [Casablanca](https://huggingface.co/datasets/UBC-NLP/Casablanca) | 8 | 7.7 |

	## In-depth Analysis
	We also provide a comprehensive analysis of the models' robustness, speaker adaptation, inference efficiency and memory consumption.
	\nPlease check our [paper](https://arxiv.org/pdf/2412.13788) to learn more.
	"""


	def styled_message(message):
	    """Return *message* wrapped in a green, centered, 20px HTML ``<p>`` element.

	    The message is interpolated into the markup as-is (no HTML escaping).
	    """
	    paragraph_css = "color: green; font-size: 20px; text-align: center;"
	    return f"<p style='{paragraph_css}'>{message}</p>"

	# Date and summary of the most recent leaderboard refresh; interpolated into
	# the "Last updated on ..." line of the page.
	LAST_UPDATED = (
	    "Apr 24th 2025:[New models included: cntxt-ai-munsit-1, elevenlabs-scribe-v1, "
	    "microsoft-azure-stt, openai-gpt-4o-transcribe]"
	)


	# Benchmark scores in column-oriented form: position i of every metric list
	# belongs to the model at position i of "Model".
	results = {
	    "Model": [
	        "cntxt-ai-munsit-1",
	        "elevenlabs-scribe-v1",
	        "microsoft-azure-stt",
	        "openai-gpt-4o-transcribe",
	        "nvidia-conformer-ctc-large-arabic (lm)",
	        "nvidia-conformer-ctc-large-arabic (greedy)",
	        "openai/whisper-large-v3",
	        "facebook/seamless-m4t-v2-large",
	        "openai/whisper-large-v3-turbo",
	        "openai/whisper-large-v2",
	        "openai/whisper-large",
	        "asafaya/hubert-large-arabic-transcribe",
	        "openai/whisper-medium",
	        "nvidia-Parakeet-ctc-1.1b-concat",
	        "nvidia-Parakeet-ctc-1.1b-universal",
	        "facebook/mms-1b-all",
	        "openai/whisper-small",
	        "whitefox123/w2v-bert-2.0-arabic-4",
	        "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
	        "speechbrain/asr-wav2vec2-commonvoice-14-ar",
	    ],
	    "Average WER⬇️": [26.68, 40.05, 45.72, 44.97, 32.91, 34.74, 36.86, 38.16, 40.05, 40.20, 42.57, 45.50, 45.57, 46.54, 51.96, 54.54, 55.13, 58.13, 60.98, 65.74],
	    "Average CER": [10.05, 14.75, 19.45, 24.31, 13.84, 13.37, 17.21, 17.03, 18.87, 19.55, 20.49, 17.35, 22.27, 23.88, 25.19, 21.45, 21.68, 27.62, 25.61, 30.93],
	    "SADA WER": [27.71, 49.44, 58.5, 66.47, 44.52, 47.26, 55.96, 62.52, 60.36, 57.46, 63.24, 67.82, 67.71, 70.70, 73.58, 77.48, 78.02, 87.34, 86.82, 88.54],
	    "SADA CER": [11.65, 23.33, 35.39, 49.57, 23.76, 22.54, 34.62, 37.61, 37.67, 36.59, 40.16, 31.83, 43.83, 46.70, 49.48, 37.50, 33.17, 56.75, 44.20, 50.28],
	    "Common Voice WER": [10.42, 28.27, 33.775, 28.19, 8.80, 10.60, 17.83, 21.70, 25.73, 21.77, 26.04, 8.01, 28.07, 26.34, 40.01, 26.52, 24.18, 41.79, 23.00, 29.17],
	    "Common Voice CER": [3.21, 7.33, 9.29, 8.14, 2.77, 3.05, 5.74, 6.24, 10.89, 7.44, 9.61, 2.37, 10.38, 9.82, 14.64, 7.21, 6.79, 15.75, 6.64, 9.85],
	    "MASC(clean-test) WER": [21.74, 31.93, 40.66, 31.53, 23.74, 24.12, 24.66, 25.04, 25.51, 27.25, 28.89, 32.94, 29.99, 30.49, 36.16, 38.82, 35.93, 37.82, 42.75, 49.10],
	    "MASC(clean-test) CER": [5.8, 8.23, 14.735, 8.85, 5.63, 5.63, 7.24, 7.19, 7.55, 8.28, 9.05, 7.15, 8.98, 8.41, 10.29, 10.36, 9.01, 11.92, 11.87, 16.37],
	    "MASC(noisy-test) WER": [28.08, 41.23, 45.645, 43.29, 34.29, 35.64, 34.63, 33.24, 37.16, 38.55, 40.79, 50.16, 42.91, 45.95, 50.03, 57.33, 56.36, 53.28, 64.27, 69.57],
	    "MASC(noisy-test) CER": [8.88, 13.14, 15.77, 18.81, 11.07, 11.02, 12.89, 11.92, 13.93, 15.49, 16.31, 15.62, 17.49, 18.72, 20.09, 19.76, 19.43, 21.93, 24.17, 30.17],
	    "MGB-2 WER": [12.1, 25.68, 30.91, 29.62, 17.20, 19.69, 16.26, 20.23, 17.75, 25.17, 24.28, 37.51, 29.32, 24.94, 30.68, 39.16, 48.64, 40.66, 56.29, 64.37],
	    "MGB-2 CER": [5.27, 9.27, 13.7, 17.34, 6.87, 7.46, 7.74, 9.37, 8.34, 13.48, 12.10, 11.07, 14.82, 9.87, 11.36, 13.48, 15.56, 19.39, 20.44, 26.56],
	    "Casablanca WER": [60.04, 63.77, 64.84, 70.72, 68.90, 71.13, 71.81, 66.25, 73.79, 71.01, 72.18, 76.53, 75.44, 80.80, 81.30, 87.95, 87.64, 87.88, 92.72, 93.68],
	    "Casablanca CER": [25.51, 27.17, 27.84, 43.15, 32.97, 30.50, 35.04, 29.85, 34.83, 36.00, 35.71, 36.03, 38.12, 49.77, 45.31, 40.41, 46.12, 39.99, 46.33, 52.36],
	}

	# Leaderboard table ranked by ascending average WER (best model first).
	# Row labels are left as assigned at construction time.
	original_df = pd.DataFrame(results).sort_values(by="Average WER⬇️")

	# Per-column datatypes handed to gr.Dataframe: the model name is text and
	# every remaining column is numeric.
	TYPES = ["str"] + ["number"] * (len(results) - 1)

	# Custom CSS passed to gr.Blocks(css=...): keeps the header cells of the
	# element with id "leaderboard-table" on a single line (no wrapping).
	LEADERBOARD_CSS = """
	#leaderboard-table th .header-content {
	white-space: nowrap;
	}
	"""

	def request_model(model_text):
	    """Return the styled notice shown after the "Request" button is clicked.

	    ``model_text`` receives the textbox content wired to the click handler,
	    but it is not used: the reply is always the same pointer to GitHub.
	    """
	    notice = "🤗 Please launch a discussion in our GitHub repo, thank you. 🤗"
	    return styled_message(notice)

	# Assemble the Gradio page and start the app.
	# NOTE(review): the indentation of this block was lost in this copy of the
	# file, so the nesting of the `with` contexts below cannot be read off the
	# page; restore it from the repository before running.
	with gr.Blocks(css=LEADERBOARD_CSS) as demo:
	# Page header: banner HTML followed by the Markdown introduction.
	gr.HTML(BANNER, elem_id="banner")
	gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

	# NOTE(review): all three TabItems below are given the same
	# elem_id="od-benchmark-tab-table"; confirm that the duplicate id is intended.
	with gr.Tabs(elem_classes="tab-buttons") as tabs:
	# Tab 0: read-only results table; its elem_id is the one targeted by LEADERBOARD_CSS.
	with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
	leaderboard_table = gr.Dataframe(
	value=original_df,
	datatype=TYPES,
	elem_id="leaderboard-table",
	interactive=False,
	visible=True,
	)

	# Tab 1: static Markdown describing metrics, reproduction and datasets.
	with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
	gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

	# Tab 2: model request form.
	with gr.TabItem("✉️✨ Request a model here!", elem_id="od-benchmark-tab-table", id=2):
	with gr.Column():
	gr.Markdown("# ✉️✨ Request results for a new model here!", elem_classes="markdown-text")
	model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
	mdw_submission_result = gr.Markdown()
	btn_submit = gr.Button(value="🚀 Request")
	# On click, request_model is called with the textbox as input and its
	# return value is written to the Markdown component above.
	btn_submit.click(request_model, [model_name_textbox], mdw_submission_result)

	# Line reporting the latest leaderboard refresh.
	gr.Markdown(f"Last updated on {LAST_UPDATED}", elem_classes="markdown-text")

	# Citation accordion (created with open=False) holding the BibTeX snippet
	# in a textbox with a copy button.
	with gr.Row():
	with gr.Accordion("📙 Citation", open=False):
	gr.Textbox(
	value=CITATION_BUTTON_TEXT, lines=7,
	label="Copy the BibTeX snippet to cite this source",
	elem_id="citation-button",
	show_copy_button=True,
	)

	# Start the app; "banner.png" is passed as an allowed path and ssr_mode is turned off.
	demo.launch(allowed_paths=["banner.png"], ssr_mode=False)