AudioBench-Leaderboard-Extend

Running

App Files Community

binwang committed 6 days ago

Commit

32e2641

verified ·

1 Parent(s): 8a4a728

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

app/content.py +0 -1
app/draw_diagram.py +25 -83
app/pages.py +232 -235
app/summarization.py +24 -14

app/content.py CHANGED Viewed

@@ -151,7 +151,6 @@ dataset_diaplay_information = {
 metrics_info = {
     'wer'                    : 'Word Error Rate (WER) - The Lower, the better.',
-    'llama3_70b_judge_binary': 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
     'llama3_70b_judge'       : 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
     'meteor'                 : 'METEOR Score. The higher, the better.',
     'bleu'                   : 'BLEU Score. The higher, the better.',

 metrics_info = {
     'wer'                    : 'Word Error Rate (WER) - The Lower, the better.',
     'llama3_70b_judge'       : 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
     'meteor'                 : 'METEOR Score. The higher, the better.',
     'bleu'                   : 'BLEU Score. The higher, the better.',

app/draw_diagram.py CHANGED Viewed

@@ -15,56 +15,14 @@ info_df = get_dataframe()
 def draw_table(dataset_displayname, metrics):
-    dataset_nickname = displayname2datasetname[dataset_displayname]
     with open('organize_model_results.json', 'r') as f:
         organize_model_results = json.load(f)
     model_results      = organize_model_results[dataset_nickname][metrics]
     model_name_mapping = {key.strip(): val for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
     model_results      = {model_name_mapping.get(key, key): val for key, val in model_results.items()}
-    # folder = f"./results_organized/{metrics}/"
-    # # Load the results from CSV
-    # data_path = f'{folder}/{category_name.lower()}.csv'
-    # chart_data = pd.read_csv(data_path).round(3)
-    # dataset_name = displayname2datasetname[displayname]
-    # chart_data = chart_data[['Model', dataset_name]]
-    # # Rename to proper display name
-    # chart_data = chart_data.rename(columns=datasetname2diaplayname)
-    # st.markdown("""
-    #             <style>
-    #             .stMultiSelect [data-baseweb=select] span {
-    #                 max-width: 800px;
-    #                 font-size: 0.9rem;
-    #                 background-color: #3C6478 !important; /* Background color for selected items */
-    #                 color: white; /* Change text color */
-    #                 back
-    #             }
-    #             </style>
-    #             """, unsafe_allow_html=True)
-    # # remap model names
-    # display_model_names = {key.strip() :val.strip() for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
-    # chart_data['model_show'] = chart_data['Model'].map(lambda x: display_model_names.get(x, x))
-    # models = st.multiselect("Please choose the model",
-    #                         sorted(chart_data['model_show'].tolist()),
-    #                         default = sorted(chart_data['model_show'].tolist()),
-    #                         )
-    # chart_data = chart_data[chart_data['model_show'].isin(models)]
-    # chart_data = chart_data.sort_values(by=[displayname], ascending=cus_sort).dropna(axis=0)
-    # if len(chart_data) == 0: return
     # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
     '''
@@ -73,23 +31,13 @@ def draw_table(dataset_displayname, metrics):
     with st.container():
         st.markdown('##### TABLE')
-        model_link_mapping = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
-        chart_data_table = pd.DataFrame(list(model_results.items()), columns=["model_show", dataset_displayname])
         chart_data_table["model_link"] = chart_data_table["model_show"].map(model_link_mapping)
-        # chart_data['model_link'] = chart_data['model_show'].map(model_link)
-        # chart_data_table = chart_data[['model_show', chart_data.columns[1], chart_data.columns[3]]]
-        # Format numeric columns to 2 decimal places
-        #chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
-        # dataset_name = chart_data_table.columns[1]
         def highlight_first_element(x):
                 # Create a DataFrame with the same shape as the input
-                df_style = pd.DataFrame('', index=x.index, columns=x.columns)
                 df_style.iloc[0, 1] = 'background-color: #b0c1d7'
                 return df_style
@@ -126,40 +74,39 @@ def draw_table(dataset_displayname, metrics):
                             ]:
             chart_data_table = chart_data_table.sort_values(
-                    by=chart_data_table.columns[1],
-                    ascending=True
-                ).reset_index(drop=True)
         else:
             chart_data_table = chart_data_table.sort_values(
-                    by=chart_data_table.columns[1],
-                    ascending=False
-                ).reset_index(drop=True)
         styled_df = chart_data_table.style.format(
-            {chart_data_table.columns[1]: "{:.3f}"}
-        ).apply(
-            highlight_first_element, axis=None
-        )
         st.dataframe(
-                styled_df,
-                column_config={
-                    'model_show'               : 'Model',
-                    chart_data_table.columns[1]: {'alignment': 'left'},
-                    "model_link"               : st.column_config.LinkColumn("Model Link"),
-                },
-                hide_index=True,
-                use_container_width=True
-            )
     # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
     '''
     Show Chart
     '''
     # Initialize a session state variable for toggling the chart visibility
     if "show_chart" not in st.session_state:
         st.session_state.show_chart = False
@@ -232,15 +179,10 @@ def draw_table(dataset_displayname, metrics):
             value = st_echarts(options=options, events=events, height="500px")
     # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
     '''
     Show Examples
     '''
     # Initialize a session state variable for toggling the examples visibility
     if "show_examples" not in st.session_state:
         st.session_state.show_examples = False

 def draw_table(dataset_displayname, metrics):
     with open('organize_model_results.json', 'r') as f:
         organize_model_results = json.load(f)
+    dataset_nickname   = displayname2datasetname[dataset_displayname]
     model_results      = organize_model_results[dataset_nickname][metrics]
     model_name_mapping = {key.strip(): val for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
     model_results      = {model_name_mapping.get(key, key): val for key, val in model_results.items()}
     # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
     '''
     with st.container():
         st.markdown('##### TABLE')
+        model_link_mapping             = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
+        chart_data_table               = pd.DataFrame(list(model_results.items()), columns=["model_show", dataset_displayname])
         chart_data_table["model_link"] = chart_data_table["model_show"].map(model_link_mapping)
         def highlight_first_element(x):
                 # Create a DataFrame with the same shape as the input
+                df_style            = pd.DataFrame('', index=x.index, columns=x.columns)
                 df_style.iloc[0, 1] = 'background-color: #b0c1d7'
                 return df_style
                             ]:
             chart_data_table = chart_data_table.sort_values(
+                                    by        = chart_data_table.columns[1],
+                                    ascending = True
+                                ).reset_index(drop=True)
         else:
             chart_data_table = chart_data_table.sort_values(
+                                    by        = chart_data_table.columns[1],
+                                    ascending = False
+                                ).reset_index(drop=True)
         styled_df = chart_data_table.style.format(
+                                    {chart_data_table.columns[1]: "{:.3f}"}
+                                ).apply(
+                                    highlight_first_element, axis=None
+                                )
         st.dataframe(
+                        styled_df,
+                        column_config={
+                            'model_show'               : 'Model',
+                            chart_data_table.columns[1]: {'alignment': 'left'},
+                            "model_link"               : st.column_config.LinkColumn("Model Link"),
+                        },
+                        hide_index=True,
+                        use_container_width=True
+                    )
     # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
     '''
     Show Chart
     '''
     # Initialize a session state variable for toggling the chart visibility
     if "show_chart" not in st.session_state:
         st.session_state.show_chart = False
             value = st_echarts(options=options, events=events, height="500px")
     # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
     '''
     Show Examples
     '''
     # Initialize a session state variable for toggling the examples visibility
     if "show_examples" not in st.session_state:
         st.session_state.show_examples = False

app/pages.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import streamlit as st
-from app.draw_diagram import *
 from app.content import *
-from app.summarization import *
 def dataset_contents(dataset, metrics):
     custom_css = """
@@ -115,7 +115,7 @@ def asr_english():
     st.title("Task: Automatic Speech Recognition - English")
     sum = ['Overall']
-    dataset_lists = [
                     'LibriSpeech-Clean',
                     'LibriSpeech-Other',
                     'CommonVoice-15-EN',
@@ -126,32 +126,29 @@ def asr_english():
                     'TED-LIUM-3',
                     'TED-LIUM-3-LongForm',
                     ]
-    filters_1_list = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
         tab_section = st.selectbox('Dataset', filters_1_list)
-    with right:
-        metric = st.selectbox('Metric', ['wer'])
     if tab_section:
         if tab_section in sum:
-            sum_table_mulit_metrix('asr_english', ['wer'])
         else:
             dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
             draw_table(tab_section, metric)
 def asr_singlish():
     st.title("Task: Automatic Speech Recognition - Singlish")
     sum = ['Overall']
-    dataset_lists = [
                     'MNSC-PART1-ASR',
                     'MNSC-PART2-ASR',
                     'MNSC-PART3-ASR',
@@ -161,20 +158,22 @@ def asr_singlish():
                     'SEAME-Dev-Man',
                     'SEAME-Dev-Sge',
                     ]
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('asr_singlish', ['wer'])
         else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
-            draw('su', 'asr_singlish', filter_1, 'wer')
@@ -183,52 +182,56 @@ def asr_mandarin():
     st.title("Task: Automatic Speech Recognition - Mandarin")
     sum = ['Overall']
-    dataset_lists = [
                     'AISHELL-ASR-ZH',
                     ]
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('asr_mandarin', ['wer'])
         else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
-            draw('su', 'asr_mandarin', filter_1, 'wer')
 def speech_translation():
     st.title("Task: Speech Translation")
     sum = ['Overall']
-    dataset_lists = [
                         'CoVoST2-EN-ID',
                         'CoVoST2-EN-ZH',
                         'CoVoST2-EN-TA',
                         'CoVoST2-ID-EN',
                         'CoVoST2-ZH-EN',
                         'CoVoST2-TA-EN']
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('st', ['bleu'])
         else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['bleu'])
-            draw('su', 'ST', filter_1, 'bleu')
@@ -237,93 +240,85 @@ def speech_question_answering_english():
     st.title("Task: Spoken Question Answering - English")
     sum = ['Overall']
-    dataset_lists = [
                     'CN-College-Listen-MCQ',
                     'DREAM-TTS-MCQ',
                     'SLUE-P2-SQA5',
                     'Public-SG-Speech-QA',
                     'Spoken-SQuAD',
                      ]
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('sqa_english', ['llama3_70b_judge'])
-        #elif filter_1 in dataset_lists:
-        #    dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
-        #    draw('su', 'SQA', filter_1, 'llama3_70b_judge')
         else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-            draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
 def speech_question_answering_singlish():
     st.title("Task: Spoken Question Answering - Singlish")
     sum = ['Overall']
-    dataset_lists = [
               'MNSC-PART3-SQA',
               'MNSC-PART4-SQA',
               'MNSC-PART5-SQA',
               'MNSC-PART6-SQA',
               ]
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('sqa_singlish', ['llama3_70b_judge'])
         else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-            draw('su', 'sqa_singlish', filter_1, 'llama3_70b_judge')
 def spoken_dialogue_summarization_singlish():
     st.title("Task: Spoken Dialogue Summarization - Singlish")
     sum = ['Overall']
-    dataset_lists = [
               'MNSC-PART3-SDS',
               'MNSC-PART4-SDS',
               'MNSC-PART5-SDS',
               'MNSC-PART6-SDS',
               ]
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('sds_singlish', ['llama3_70b_judge'])
         else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-            draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
@@ -332,100 +327,72 @@ def speech_instruction():
     st.title("Task: Speech Instruction")
     sum = ['Overall']
-    dataset_lists = ['OpenHermes-Audio',
                      'ALPACA-Audio',
                      ]
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('speech_instruction', ['llama3_70b_judge'])
         else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-            draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
 def audio_captioning():
     st.title("Task: Audio Captioning")
-    filters_levelone = ['WavCaps',
                         'AudioCaps',
                         ]
-    filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    with middle:
-        metric = st.selectbox('Metric', filters_leveltwo)
-    if filter_1 or metric:
-        dataset_contents(dataset_diaplay_information[filter_1], metrics_info[metric.lower().replace('-', '_')])
-        draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
 def audio_scene_question_answering():
     st.title("Task: Audio Scene Question Answering")
     sum = ['Overall']
-    dataset_lists = ['Clotho-AQA',
                     'WavCaps-QA',
                     'AudioCaps-QA']
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('audio_scene_question_answering', ['llama3_70b_judge'])
-        else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-            draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
-def emotion_recognition():
-    st.title("Task: Emotion Recognition")
-    sum = ['Overall']
-    dataset_lists = [
-                    'IEMOCAP-Emotion',
-                    'MELD-Sentiment',
-                    'MELD-Emotion',
-                    ]
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('emotion_recognition', ['llama3_70b_judge'])
         else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-            draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
@@ -434,28 +401,27 @@ def accent_recognition():
     st.title("Task: Accent Recognition")
     sum = ['Overall']
-    dataset_lists = [
         'VoxCeleb-Accent',
         'MNSC-AR-Sentence',
         'MNSC-AR-Dialogue',
         ]
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('accent_recognition', ['llama3_70b_judge'])
         else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-            draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
@@ -463,25 +429,56 @@ def gender_recognition():
     st.title("Task: Gender Recognition")
     sum = ['Overall']
-    dataset_lists =  [
                         'VoxCeleb-Gender',
                         'IEMOCAP-Gender'
                         ]
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('gender_recognition', ['llama3_70b_judge'])
         else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-            draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
@@ -491,25 +488,25 @@ def music_understanding():
     sum = ['Overall']
-    dataset_lists =  ['MuChoMusic',
                       ]
-    filters_levelone = sum + dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    if filter_1:
-        if filter_1 in sum:
-            sum_table_mulit_metrix('music_understanding', ['llama3_70b_judge'])
-        else:
-            dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-            draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
@@ -520,8 +517,7 @@ def music_understanding():
 def under_development():
     st.title("Task: Under Development")
-    dataset_lists =  [
                       'CNA',
                       'IDPC',
                       'Parliament',
@@ -536,43 +532,44 @@ def under_development():
                       'YTB-SQA-Batch1',
                       'YTB-SDS-Batch1',
                       'YTB-PQA-Batch1',
                       ]
-    filters_levelone = dataset_lists
-    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
-    with left:
-        filter_1 = st.selectbox('Dataset', filters_levelone)
-    dataset_contents(dataset_diaplay_information[filter_1], 'under_development')
-    if filter_1 in [
-                      'CNA',
-                      'IDPC',
-                      'Parliament',
-                      'UKUS-News',
-                      'Mediacorp',
-                      'IDPC-Short',
-                      'Parliament-Short',
-                      'UKUS-News-Short',
-                      'Mediacorp-Short',
-                      'YTB-ASR-Batch1',
-                      'YTB-ASR-Batch2',
-                      'SEAME-Dev-Man',
-                      'SEAME-Dev-Sge',
-                      ]:
-        draw('vu', 'under_development_wer', filter_1, 'wer')
-    elif filter_1 in [
-        'YTB-SQA-Batch1',
-        'YTB-SDS-Batch1',
-        'YTB-PQA-Batch1',
-        ]:
-        draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
 def mmau_evaluation():

 import streamlit as st
+from app.draw_diagram import draw_table
 from app.content import *
+from app.summarization import sum_table_mulit_metrix
 def dataset_contents(dataset, metrics):
     custom_css = """
     st.title("Task: Automatic Speech Recognition - English")
     sum = ['Overall']
+    dataset_list = [
                     'LibriSpeech-Clean',
                     'LibriSpeech-Other',
                     'CommonVoice-15-EN',
                     'TED-LIUM-3',
                     'TED-LIUM-3-LongForm',
                     ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
         tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['WER'])
+        metric = metric.lower()
     if tab_section:
         if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
             dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
             draw_table(tab_section, metric)
 def asr_singlish():
     st.title("Task: Automatic Speech Recognition - Singlish")
     sum = ['Overall']
+    dataset_list = [
                     'MNSC-PART1-ASR',
                     'MNSC-PART2-ASR',
                     'MNSC-PART3-ASR',
                     'SEAME-Dev-Man',
                     'SEAME-Dev-Sge',
                     ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['WER'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
     st.title("Task: Automatic Speech Recognition - Mandarin")
     sum = ['Overall']
+    dataset_list = [
                     'AISHELL-ASR-ZH',
                     ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['WER'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
 def speech_translation():
     st.title("Task: Speech Translation")
     sum = ['Overall']
+    dataset_list = [
                         'CoVoST2-EN-ID',
                         'CoVoST2-EN-ZH',
                         'CoVoST2-EN-TA',
                         'CoVoST2-ID-EN',
                         'CoVoST2-ZH-EN',
                         'CoVoST2-TA-EN']
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['BLEU'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
     st.title("Task: Spoken Question Answering - English")
     sum = ['Overall']
+    dataset_list = [
                     'CN-College-Listen-MCQ',
                     'DREAM-TTS-MCQ',
                     'SLUE-P2-SQA5',
                     'Public-SG-Speech-QA',
                     'Spoken-SQuAD',
                      ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
 def speech_question_answering_singlish():
     st.title("Task: Spoken Question Answering - Singlish")
     sum = ['Overall']
+    dataset_list = [
               'MNSC-PART3-SQA',
               'MNSC-PART4-SQA',
               'MNSC-PART5-SQA',
               'MNSC-PART6-SQA',
               ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
 def spoken_dialogue_summarization_singlish():
     st.title("Task: Spoken Dialogue Summarization - Singlish")
     sum = ['Overall']
+    dataset_list = [
               'MNSC-PART3-SDS',
               'MNSC-PART4-SDS',
               'MNSC-PART5-SDS',
               'MNSC-PART6-SDS',
               ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
     st.title("Task: Speech Instruction")
     sum = ['Overall']
+    dataset_list = ['OpenHermes-Audio',
                      'ALPACA-Audio',
                      ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
 def audio_captioning():
     st.title("Task: Audio Captioning")
+    dataset_list = [    'WavCaps',
                         'AudioCaps',
                         ]
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', dataset_list)
+    with space2:
+        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'METEOR'])
+        metric = metric.lower()
+    if tab_section:
+        dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+        draw_table(tab_section, metric)
 def audio_scene_question_answering():
     st.title("Task: Audio Scene Question Answering")
     sum = ['Overall']
+    dataset_list = ['Clotho-AQA',
                     'WavCaps-QA',
                     'AudioCaps-QA']
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
     st.title("Task: Accent Recognition")
     sum = ['Overall']
+    dataset_list = [
         'VoxCeleb-Accent',
         'MNSC-AR-Sentence',
         'MNSC-AR-Dialogue',
         ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
     st.title("Task: Gender Recognition")
     sum = ['Overall']
+    dataset_list =  [
                         'VoxCeleb-Gender',
                         'IEMOCAP-Gender'
                         ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
+        else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
+def emotion_recognition():
+    st.title("Task: Emotion Recognition")
+    sum = ['Overall']
+    dataset_list = [
+                    'IEMOCAP-Emotion',
+                    'MELD-Sentiment',
+                    'MELD-Emotion',
+                    ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
         else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
     sum = ['Overall']
+    dataset_list =  ['MuChoMusic',
                       ]
+    filters_1_list = sum + dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
+        metric = metric.lower()
+    if tab_section:
+        if tab_section in sum:
+            sum_table_mulit_metrix(dataset_list, metric)
+        else:
+            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+            draw_table(tab_section, metric)
 def under_development():
     st.title("Task: Under Development")
+    dataset_list =  [
                       'CNA',
                       'IDPC',
                       'Parliament',
                       'YTB-SQA-Batch1',
                       'YTB-SDS-Batch1',
                       'YTB-PQA-Batch1',
                       ]
+    filters_1_list = dataset_list
+    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
+    with space1:
+        tab_section = st.selectbox('Dataset', filters_1_list)
+    with space2:
+        if tab_section in [
+                'CNA',
+                'IDPC',
+                'Parliament',
+                'UKUS-News',
+                'Mediacorp',
+                'IDPC-Short',
+                'Parliament-Short',
+                'UKUS-News-Short',
+                'Mediacorp-Short',
+                'YTB-ASR-Batch1',
+                'YTB-ASR-Batch2',
+                ]:
+            metric = st.selectbox('Metric', ['WER'])
+            metric = metric.lower()
+        elif tab_section in [
+                'YTB-SQA-Batch1',
+                'YTB-SDS-Batch1',
+                'YTB-PQA-Batch1',
+                ]:
+            metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
+            metric = metric.lower()
+        else:
+            raise ValueError('Invalid dataset')
+    if tab_section:
+        dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
+        draw_table(tab_section, metric)
 def mmau_evaluation():

app/summarization.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
 from streamlit_echarts import st_echarts
 from streamlit.components.v1 import html
 # from PIL import Image
@@ -14,20 +17,27 @@ from model_information import get_dataframe
 info_df = get_dataframe()
-def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
-    # combine chart data from multiple sources
-    chart_data = pd.DataFrame()
-    for metrics in metrics_lists:
-        folder = f"./results_organized/{metrics}"
-        data_path = f'{folder}/{task_name.lower()}.csv'
-        one_chart_data = pd.read_csv(data_path).round(3)
-        if len(chart_data) == 0:
-            chart_data = one_chart_data
-        else:
-            chart_data = pd.merge(chart_data, one_chart_data, on='Model', how='outer')
     selected_columns = [i for i in chart_data.columns if i != 'Model']
     chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
@@ -81,7 +91,7 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
         # Format numeric columns to 3 decimal places
         chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
-        if metrics in ['wer']:
             ascend = True
         else:
             ascend= False
@@ -124,4 +134,4 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
             )
     # Only report the last metrics
-    st.markdown(f'###### Metric: {metrics_info[metrics]}')

 import streamlit as st
 import pandas as pd
 import numpy as np
+import json
 from streamlit_echarts import st_echarts
 from streamlit.components.v1 import html
 # from PIL import Image
 info_df = get_dataframe()
+def sum_table_mulit_metrix(dataset_displayname_list, metric):
+    with open('organize_model_results.json', 'r') as f:
+        organize_model_results = json.load(f)
+    dataset_results = {}
+    for dataset_displayname in dataset_displayname_list:
+        dataset_nickname = displayname2datasetname[dataset_displayname]
+        model_results = organize_model_results[dataset_nickname][metric]
+        model_name_mapping = {key.strip(): val for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
+        model_results      = {model_name_mapping.get(key, key): val for key, val in model_results.items()}
+        dataset_results[dataset_displayname] = model_results
+    df_results = pd.DataFrame(dataset_results)
+    # Reset index to have models as a column
+    df_results.reset_index(inplace=True)
+    df_results.rename(columns={"index": "Model"}, inplace=True)
+    chart_data = df_results
     selected_columns = [i for i in chart_data.columns if i != 'Model']
     chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
         # Format numeric columns to 3 decimal places
         chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
+        if metric == 'wer':
             ascend = True
         else:
             ascend= False
             )
     # Only report the last metrics
+    st.markdown(f'###### Metric: {metrics_info[metric]}')