binwang committed on
Commit
32e2641
·
verified ·
1 Parent(s): 8a4a728

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. app/content.py +0 -1
  2. app/draw_diagram.py +25 -83
  3. app/pages.py +232 -235
  4. app/summarization.py +24 -14
app/content.py CHANGED
@@ -151,7 +151,6 @@ dataset_diaplay_information = {
151
 
152
  metrics_info = {
153
  'wer' : 'Word Error Rate (WER) - The Lower, the better.',
154
- 'llama3_70b_judge_binary': 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
155
  'llama3_70b_judge' : 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
156
  'meteor' : 'METEOR Score. The higher, the better.',
157
  'bleu' : 'BLEU Score. The higher, the better.',
 
151
 
152
  metrics_info = {
153
  'wer' : 'Word Error Rate (WER) - The Lower, the better.',
 
154
  'llama3_70b_judge' : 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
155
  'meteor' : 'METEOR Score. The higher, the better.',
156
  'bleu' : 'BLEU Score. The higher, the better.',
app/draw_diagram.py CHANGED
@@ -15,56 +15,14 @@ info_df = get_dataframe()
15
 
16
  def draw_table(dataset_displayname, metrics):
17
 
18
- dataset_nickname = displayname2datasetname[dataset_displayname]
19
-
20
  with open('organize_model_results.json', 'r') as f:
21
  organize_model_results = json.load(f)
22
 
 
23
  model_results = organize_model_results[dataset_nickname][metrics]
24
  model_name_mapping = {key.strip(): val for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
25
  model_results = {model_name_mapping.get(key, key): val for key, val in model_results.items()}
26
 
27
-
28
- # folder = f"./results_organized/{metrics}/"
29
-
30
- # # Load the results from CSV
31
- # data_path = f'{folder}/{category_name.lower()}.csv'
32
- # chart_data = pd.read_csv(data_path).round(3)
33
-
34
- # dataset_name = displayname2datasetname[displayname]
35
- # chart_data = chart_data[['Model', dataset_name]]
36
-
37
- # # Rename to proper display name
38
- # chart_data = chart_data.rename(columns=datasetname2diaplayname)
39
-
40
- # st.markdown("""
41
- # <style>
42
- # .stMultiSelect [data-baseweb=select] span {
43
- # max-width: 800px;
44
- # font-size: 0.9rem;
45
- # background-color: #3C6478 !important; /* Background color for selected items */
46
- # color: white; /* Change text color */
47
- # back
48
- # }
49
- # </style>
50
- # """, unsafe_allow_html=True)
51
-
52
- # # remap model names
53
- # display_model_names = {key.strip() :val.strip() for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
54
- # chart_data['model_show'] = chart_data['Model'].map(lambda x: display_model_names.get(x, x))
55
-
56
-
57
- # models = st.multiselect("Please choose the model",
58
- # sorted(chart_data['model_show'].tolist()),
59
- # default = sorted(chart_data['model_show'].tolist()),
60
- # )
61
-
62
- # chart_data = chart_data[chart_data['model_show'].isin(models)]
63
- # chart_data = chart_data.sort_values(by=[displayname], ascending=cus_sort).dropna(axis=0)
64
-
65
- # if len(chart_data) == 0: return
66
-
67
-
68
 
69
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
70
  '''
@@ -73,23 +31,13 @@ def draw_table(dataset_displayname, metrics):
73
  with st.container():
74
  st.markdown('##### TABLE')
75
 
76
- model_link_mapping = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
77
-
78
- chart_data_table = pd.DataFrame(list(model_results.items()), columns=["model_show", dataset_displayname])
79
  chart_data_table["model_link"] = chart_data_table["model_show"].map(model_link_mapping)
80
 
81
- # chart_data['model_link'] = chart_data['model_show'].map(model_link)
82
-
83
- # chart_data_table = chart_data[['model_show', chart_data.columns[1], chart_data.columns[3]]]
84
-
85
- # Format numeric columns to 2 decimal places
86
- #chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
87
- # dataset_name = chart_data_table.columns[1]
88
-
89
-
90
  def highlight_first_element(x):
91
  # Create a DataFrame with the same shape as the input
92
- df_style = pd.DataFrame('', index=x.index, columns=x.columns)
93
  df_style.iloc[0, 1] = 'background-color: #b0c1d7'
94
  return df_style
95
 
@@ -126,40 +74,39 @@ def draw_table(dataset_displayname, metrics):
126
  ]:
127
 
128
  chart_data_table = chart_data_table.sort_values(
129
- by=chart_data_table.columns[1],
130
- ascending=True
131
- ).reset_index(drop=True)
132
  else:
133
  chart_data_table = chart_data_table.sort_values(
134
- by=chart_data_table.columns[1],
135
- ascending=False
136
- ).reset_index(drop=True)
137
-
138
 
139
  styled_df = chart_data_table.style.format(
140
- {chart_data_table.columns[1]: "{:.3f}"}
141
- ).apply(
142
- highlight_first_element, axis=None
143
- )
144
 
145
 
146
  st.dataframe(
147
- styled_df,
148
- column_config={
149
- 'model_show' : 'Model',
150
- chart_data_table.columns[1]: {'alignment': 'left'},
151
- "model_link" : st.column_config.LinkColumn("Model Link"),
152
- },
153
- hide_index=True,
154
- use_container_width=True
155
- )
156
-
157
 
158
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
159
  '''
160
  Show Chart
161
  '''
162
-
163
  # Initialize a session state variable for toggling the chart visibility
164
  if "show_chart" not in st.session_state:
165
  st.session_state.show_chart = False
@@ -232,15 +179,10 @@ def draw_table(dataset_displayname, metrics):
232
  value = st_echarts(options=options, events=events, height="500px")
233
 
234
 
235
-
236
-
237
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
238
-
239
  '''
240
  Show Examples
241
  '''
242
-
243
-
244
  # Initialize a session state variable for toggling the examples visibility
245
  if "show_examples" not in st.session_state:
246
  st.session_state.show_examples = False
 
15
 
16
  def draw_table(dataset_displayname, metrics):
17
 
 
 
18
  with open('organize_model_results.json', 'r') as f:
19
  organize_model_results = json.load(f)
20
 
21
+ dataset_nickname = displayname2datasetname[dataset_displayname]
22
  model_results = organize_model_results[dataset_nickname][metrics]
23
  model_name_mapping = {key.strip(): val for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
24
  model_results = {model_name_mapping.get(key, key): val for key, val in model_results.items()}
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
28
  '''
 
31
  with st.container():
32
  st.markdown('##### TABLE')
33
 
34
+ model_link_mapping = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
35
+ chart_data_table = pd.DataFrame(list(model_results.items()), columns=["model_show", dataset_displayname])
 
36
  chart_data_table["model_link"] = chart_data_table["model_show"].map(model_link_mapping)
37
 
 
 
 
 
 
 
 
 
 
38
  def highlight_first_element(x):
39
  # Create a DataFrame with the same shape as the input
40
+ df_style = pd.DataFrame('', index=x.index, columns=x.columns)
41
  df_style.iloc[0, 1] = 'background-color: #b0c1d7'
42
  return df_style
43
 
 
74
  ]:
75
 
76
  chart_data_table = chart_data_table.sort_values(
77
+ by = chart_data_table.columns[1],
78
+ ascending = True
79
+ ).reset_index(drop=True)
80
  else:
81
  chart_data_table = chart_data_table.sort_values(
82
+ by = chart_data_table.columns[1],
83
+ ascending = False
84
+ ).reset_index(drop=True)
85
+
86
 
87
  styled_df = chart_data_table.style.format(
88
+ {chart_data_table.columns[1]: "{:.3f}"}
89
+ ).apply(
90
+ highlight_first_element, axis=None
91
+ )
92
 
93
 
94
  st.dataframe(
95
+ styled_df,
96
+ column_config={
97
+ 'model_show' : 'Model',
98
+ chart_data_table.columns[1]: {'alignment': 'left'},
99
+ "model_link" : st.column_config.LinkColumn("Model Link"),
100
+ },
101
+ hide_index=True,
102
+ use_container_width=True
103
+ )
104
+
105
 
106
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
107
  '''
108
  Show Chart
109
  '''
 
110
  # Initialize a session state variable for toggling the chart visibility
111
  if "show_chart" not in st.session_state:
112
  st.session_state.show_chart = False
 
179
  value = st_echarts(options=options, events=events, height="500px")
180
 
181
 
 
 
182
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 
183
  '''
184
  Show Examples
185
  '''
 
 
186
  # Initialize a session state variable for toggling the examples visibility
187
  if "show_examples" not in st.session_state:
188
  st.session_state.show_examples = False
app/pages.py CHANGED
@@ -1,7 +1,7 @@
1
  import streamlit as st
2
- from app.draw_diagram import *
3
  from app.content import *
4
- from app.summarization import *
5
 
6
  def dataset_contents(dataset, metrics):
7
  custom_css = """
@@ -115,7 +115,7 @@ def asr_english():
115
  st.title("Task: Automatic Speech Recognition - English")
116
 
117
  sum = ['Overall']
118
- dataset_lists = [
119
  'LibriSpeech-Clean',
120
  'LibriSpeech-Other',
121
  'CommonVoice-15-EN',
@@ -126,32 +126,29 @@ def asr_english():
126
  'TED-LIUM-3',
127
  'TED-LIUM-3-LongForm',
128
  ]
129
-
130
- filters_1_list = sum + dataset_lists
131
 
132
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
133
 
134
- with left:
135
  tab_section = st.selectbox('Dataset', filters_1_list)
136
- with right:
137
- metric = st.selectbox('Metric', ['wer'])
 
138
 
139
  if tab_section:
140
  if tab_section in sum:
141
- sum_table_mulit_metrix('asr_english', ['wer'])
142
-
143
  else:
144
  dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
145
  draw_table(tab_section, metric)
146
 
147
 
148
-
149
-
150
  def asr_singlish():
151
  st.title("Task: Automatic Speech Recognition - Singlish")
152
 
153
  sum = ['Overall']
154
- dataset_lists = [
155
  'MNSC-PART1-ASR',
156
  'MNSC-PART2-ASR',
157
  'MNSC-PART3-ASR',
@@ -161,20 +158,22 @@ def asr_singlish():
161
  'SEAME-Dev-Man',
162
  'SEAME-Dev-Sge',
163
  ]
164
-
165
- filters_levelone = sum + dataset_lists
166
 
167
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
168
 
169
- with left:
170
- filter_1 = st.selectbox('Dataset', filters_levelone)
 
 
 
171
 
172
- if filter_1:
173
- if filter_1 in sum:
174
- sum_table_mulit_metrix('asr_singlish', ['wer'])
175
  else:
176
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
177
- draw('su', 'asr_singlish', filter_1, 'wer')
178
 
179
 
180
 
@@ -183,52 +182,56 @@ def asr_mandarin():
183
  st.title("Task: Automatic Speech Recognition - Mandarin")
184
 
185
  sum = ['Overall']
186
- dataset_lists = [
187
  'AISHELL-ASR-ZH',
188
  ]
189
-
190
- filters_levelone = sum + dataset_lists
191
 
192
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
193
 
194
- with left:
195
- filter_1 = st.selectbox('Dataset', filters_levelone)
196
-
197
- if filter_1:
198
- if filter_1 in sum:
199
- sum_table_mulit_metrix('asr_mandarin', ['wer'])
 
 
 
200
  else:
201
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
202
- draw('su', 'asr_mandarin', filter_1, 'wer')
203
 
204
-
205
 
 
206
 
207
  def speech_translation():
208
  st.title("Task: Speech Translation")
209
 
210
  sum = ['Overall']
211
- dataset_lists = [
212
  'CoVoST2-EN-ID',
213
  'CoVoST2-EN-ZH',
214
  'CoVoST2-EN-TA',
215
  'CoVoST2-ID-EN',
216
  'CoVoST2-ZH-EN',
217
  'CoVoST2-TA-EN']
218
-
219
- filters_levelone = sum + dataset_lists
220
 
221
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
222
 
223
- with left:
224
- filter_1 = st.selectbox('Dataset', filters_levelone)
225
-
226
- if filter_1:
227
- if filter_1 in sum:
228
- sum_table_mulit_metrix('st', ['bleu'])
 
 
 
229
  else:
230
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['bleu'])
231
- draw('su', 'ST', filter_1, 'bleu')
232
 
233
 
234
 
@@ -237,93 +240,85 @@ def speech_question_answering_english():
237
  st.title("Task: Spoken Question Answering - English")
238
 
239
  sum = ['Overall']
240
-
241
- dataset_lists = [
242
  'CN-College-Listen-MCQ',
243
  'DREAM-TTS-MCQ',
244
  'SLUE-P2-SQA5',
245
  'Public-SG-Speech-QA',
246
  'Spoken-SQuAD',
247
  ]
248
-
249
- filters_levelone = sum + dataset_lists
250
 
251
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
252
 
253
- with left:
254
- filter_1 = st.selectbox('Dataset', filters_levelone)
255
-
256
- if filter_1:
257
- if filter_1 in sum:
258
- sum_table_mulit_metrix('sqa_english', ['llama3_70b_judge'])
259
 
260
- #elif filter_1 in dataset_lists:
261
- # dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
262
- # draw('su', 'SQA', filter_1, 'llama3_70b_judge')
263
-
264
  else:
265
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
266
- draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
267
-
268
-
269
 
270
 
271
  def speech_question_answering_singlish():
272
  st.title("Task: Spoken Question Answering - Singlish")
273
 
274
  sum = ['Overall']
275
-
276
- dataset_lists = [
277
  'MNSC-PART3-SQA',
278
  'MNSC-PART4-SQA',
279
  'MNSC-PART5-SQA',
280
  'MNSC-PART6-SQA',
281
  ]
282
-
283
-
284
- filters_levelone = sum + dataset_lists
285
 
286
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
287
 
288
- with left:
289
- filter_1 = st.selectbox('Dataset', filters_levelone)
 
 
 
290
 
291
- if filter_1:
292
- if filter_1 in sum:
293
- sum_table_mulit_metrix('sqa_singlish', ['llama3_70b_judge'])
294
-
295
  else:
296
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
297
- draw('su', 'sqa_singlish', filter_1, 'llama3_70b_judge')
298
 
299
 
300
  def spoken_dialogue_summarization_singlish():
301
  st.title("Task: Spoken Dialogue Summarization - Singlish")
302
 
303
  sum = ['Overall']
304
-
305
- dataset_lists = [
306
  'MNSC-PART3-SDS',
307
  'MNSC-PART4-SDS',
308
  'MNSC-PART5-SDS',
309
  'MNSC-PART6-SDS',
310
  ]
 
311
 
312
-
313
- filters_levelone = sum + dataset_lists
314
 
315
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
316
-
317
- with left:
318
- filter_1 = st.selectbox('Dataset', filters_levelone)
 
319
 
320
- if filter_1:
321
- if filter_1 in sum:
322
- sum_table_mulit_metrix('sds_singlish', ['llama3_70b_judge'])
323
-
324
  else:
325
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
326
- draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
327
 
328
 
329
 
@@ -332,100 +327,72 @@ def speech_instruction():
332
  st.title("Task: Speech Instruction")
333
 
334
  sum = ['Overall']
335
-
336
- dataset_lists = ['OpenHermes-Audio',
337
  'ALPACA-Audio',
338
  ]
 
 
339
 
340
- filters_levelone = sum + dataset_lists
341
-
342
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
343
-
344
- with left:
345
- filter_1 = st.selectbox('Dataset', filters_levelone)
346
 
347
- if filter_1:
348
- if filter_1 in sum:
349
- sum_table_mulit_metrix('speech_instruction', ['llama3_70b_judge'])
350
  else:
351
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
352
- draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
353
-
354
 
355
 
356
 
357
  def audio_captioning():
358
  st.title("Task: Audio Captioning")
359
 
360
- filters_levelone = ['WavCaps',
361
  'AudioCaps',
362
  ]
363
- filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
364
 
365
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
366
 
367
- with left:
368
- filter_1 = st.selectbox('Dataset', filters_levelone)
369
- with middle:
370
- metric = st.selectbox('Metric', filters_leveltwo)
371
-
372
- if filter_1 or metric:
373
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info[metric.lower().replace('-', '_')])
374
- draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
375
-
376
 
 
 
 
377
 
378
 
379
  def audio_scene_question_answering():
380
  st.title("Task: Audio Scene Question Answering")
381
 
382
  sum = ['Overall']
383
-
384
- dataset_lists = ['Clotho-AQA',
385
  'WavCaps-QA',
386
  'AudioCaps-QA']
387
 
388
- filters_levelone = sum + dataset_lists
389
-
390
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
391
-
392
- with left:
393
- filter_1 = st.selectbox('Dataset', filters_levelone)
394
-
395
- if filter_1:
396
- if filter_1 in sum:
397
- sum_table_mulit_metrix('audio_scene_question_answering', ['llama3_70b_judge'])
398
- else:
399
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
400
- draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
401
-
402
-
403
-
404
-
405
- def emotion_recognition():
406
- st.title("Task: Emotion Recognition")
407
-
408
- sum = ['Overall']
409
 
410
- dataset_lists = [
411
- 'IEMOCAP-Emotion',
412
- 'MELD-Sentiment',
413
- 'MELD-Emotion',
414
- ]
415
-
416
- filters_levelone = sum + dataset_lists
417
-
418
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
419
 
420
- with left:
421
- filter_1 = st.selectbox('Dataset', filters_levelone)
 
 
 
422
 
423
- if filter_1:
424
- if filter_1 in sum:
425
- sum_table_mulit_metrix('emotion_recognition', ['llama3_70b_judge'])
426
  else:
427
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
428
- draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
 
429
 
430
 
431
 
@@ -434,28 +401,27 @@ def accent_recognition():
434
  st.title("Task: Accent Recognition")
435
 
436
  sum = ['Overall']
437
- dataset_lists = [
438
  'VoxCeleb-Accent',
439
  'MNSC-AR-Sentence',
440
  'MNSC-AR-Dialogue',
441
  ]
442
-
443
-
444
- filters_levelone = sum + dataset_lists
445
 
446
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
447
 
448
- with left:
449
- filter_1 = st.selectbox('Dataset', filters_levelone)
450
-
 
 
451
 
452
- if filter_1:
453
- if filter_1 in sum:
454
- sum_table_mulit_metrix('accent_recognition', ['llama3_70b_judge'])
455
  else:
456
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
457
- draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
458
-
459
 
460
 
461
 
@@ -463,25 +429,56 @@ def gender_recognition():
463
  st.title("Task: Gender Recognition")
464
 
465
  sum = ['Overall']
466
-
467
- dataset_lists = [
468
  'VoxCeleb-Gender',
469
  'IEMOCAP-Gender'
470
  ]
471
-
472
- filters_levelone = sum + dataset_lists
473
 
474
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
475
 
476
- with left:
477
- filter_1 = st.selectbox('Dataset', filters_levelone)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
 
479
- if filter_1:
480
- if filter_1 in sum:
481
- sum_table_mulit_metrix('gender_recognition', ['llama3_70b_judge'])
 
 
 
 
 
 
482
  else:
483
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
484
- draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
485
 
486
 
487
 
@@ -491,25 +488,25 @@ def music_understanding():
491
 
492
  sum = ['Overall']
493
 
494
- dataset_lists = ['MuChoMusic',
495
  ]
496
 
497
- filters_levelone = sum + dataset_lists
498
-
499
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
500
 
501
- with left:
502
- filter_1 = st.selectbox('Dataset', filters_levelone)
503
 
504
- if filter_1:
505
- if filter_1 in sum:
506
- sum_table_mulit_metrix('music_understanding', ['llama3_70b_judge'])
507
- else:
508
- dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
509
- draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
510
-
511
-
512
 
 
 
 
 
 
 
513
 
514
 
515
 
@@ -520,8 +517,7 @@ def music_understanding():
520
  def under_development():
521
  st.title("Task: Under Development")
522
 
523
-
524
- dataset_lists = [
525
  'CNA',
526
  'IDPC',
527
  'Parliament',
@@ -536,43 +532,44 @@ def under_development():
536
  'YTB-SQA-Batch1',
537
  'YTB-SDS-Batch1',
538
  'YTB-PQA-Batch1',
539
-
540
  ]
541
 
542
- filters_levelone = dataset_lists
543
-
544
- left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
545
-
546
- with left:
547
- filter_1 = st.selectbox('Dataset', filters_levelone)
548
 
549
- dataset_contents(dataset_diaplay_information[filter_1], 'under_development')
550
 
551
- if filter_1 in [
552
- 'CNA',
553
- 'IDPC',
554
- 'Parliament',
555
- 'UKUS-News',
556
- 'Mediacorp',
557
- 'IDPC-Short',
558
- 'Parliament-Short',
559
- 'UKUS-News-Short',
560
- 'Mediacorp-Short',
561
- 'YTB-ASR-Batch1',
562
- 'YTB-ASR-Batch2',
563
- 'SEAME-Dev-Man',
564
- 'SEAME-Dev-Sge',
565
- ]:
566
-
567
- draw('vu', 'under_development_wer', filter_1, 'wer')
568
-
569
- elif filter_1 in [
570
- 'YTB-SQA-Batch1',
571
- 'YTB-SDS-Batch1',
572
- 'YTB-PQA-Batch1',
573
- ]:
574
- draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
 
 
 
575
 
 
 
 
 
576
 
577
 
578
  def mmau_evaluation():
 
1
  import streamlit as st
2
+ from app.draw_diagram import draw_table
3
  from app.content import *
4
+ from app.summarization import sum_table_mulit_metrix
5
 
6
  def dataset_contents(dataset, metrics):
7
  custom_css = """
 
115
  st.title("Task: Automatic Speech Recognition - English")
116
 
117
  sum = ['Overall']
118
+ dataset_list = [
119
  'LibriSpeech-Clean',
120
  'LibriSpeech-Other',
121
  'CommonVoice-15-EN',
 
126
  'TED-LIUM-3',
127
  'TED-LIUM-3-LongForm',
128
  ]
129
+ filters_1_list = sum + dataset_list
 
130
 
131
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
132
 
133
+ with space1:
134
  tab_section = st.selectbox('Dataset', filters_1_list)
135
+ with space2:
136
+ metric = st.selectbox('Metric', ['WER'])
137
+ metric = metric.lower()
138
 
139
  if tab_section:
140
  if tab_section in sum:
141
+ sum_table_mulit_metrix(dataset_list, metric)
 
142
  else:
143
  dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
144
  draw_table(tab_section, metric)
145
 
146
 
 
 
147
  def asr_singlish():
148
  st.title("Task: Automatic Speech Recognition - Singlish")
149
 
150
  sum = ['Overall']
151
+ dataset_list = [
152
  'MNSC-PART1-ASR',
153
  'MNSC-PART2-ASR',
154
  'MNSC-PART3-ASR',
 
158
  'SEAME-Dev-Man',
159
  'SEAME-Dev-Sge',
160
  ]
161
+ filters_1_list = sum + dataset_list
 
162
 
163
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
164
 
165
+ with space1:
166
+ tab_section = st.selectbox('Dataset', filters_1_list)
167
+ with space2:
168
+ metric = st.selectbox('Metric', ['WER'])
169
+ metric = metric.lower()
170
 
171
+ if tab_section:
172
+ if tab_section in sum:
173
+ sum_table_mulit_metrix(dataset_list, metric)
174
  else:
175
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
176
+ draw_table(tab_section, metric)
177
 
178
 
179
 
 
182
  st.title("Task: Automatic Speech Recognition - Mandarin")
183
 
184
  sum = ['Overall']
185
+ dataset_list = [
186
  'AISHELL-ASR-ZH',
187
  ]
188
+ filters_1_list = sum + dataset_list
 
189
 
190
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
191
 
192
+ with space1:
193
+ tab_section = st.selectbox('Dataset', filters_1_list)
194
+ with space2:
195
+ metric = st.selectbox('Metric', ['WER'])
196
+ metric = metric.lower()
197
+
198
+ if tab_section:
199
+ if tab_section in sum:
200
+ sum_table_mulit_metrix(dataset_list, metric)
201
  else:
202
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
203
+ draw_table(tab_section, metric)
204
 
 
205
 
206
+
207
 
208
  def speech_translation():
209
  st.title("Task: Speech Translation")
210
 
211
  sum = ['Overall']
212
+ dataset_list = [
213
  'CoVoST2-EN-ID',
214
  'CoVoST2-EN-ZH',
215
  'CoVoST2-EN-TA',
216
  'CoVoST2-ID-EN',
217
  'CoVoST2-ZH-EN',
218
  'CoVoST2-TA-EN']
219
+ filters_1_list = sum + dataset_list
 
220
 
221
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
222
 
223
+ with space1:
224
+ tab_section = st.selectbox('Dataset', filters_1_list)
225
+ with space2:
226
+ metric = st.selectbox('Metric', ['BLEU'])
227
+ metric = metric.lower()
228
+
229
+ if tab_section:
230
+ if tab_section in sum:
231
+ sum_table_mulit_metrix(dataset_list, metric)
232
  else:
233
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
234
+ draw_table(tab_section, metric)
235
 
236
 
237
 
 
240
  st.title("Task: Spoken Question Answering - English")
241
 
242
  sum = ['Overall']
243
+ dataset_list = [
 
244
  'CN-College-Listen-MCQ',
245
  'DREAM-TTS-MCQ',
246
  'SLUE-P2-SQA5',
247
  'Public-SG-Speech-QA',
248
  'Spoken-SQuAD',
249
  ]
250
+ filters_1_list = sum + dataset_list
 
251
 
252
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
253
 
254
+ with space1:
255
+ tab_section = st.selectbox('Dataset', filters_1_list)
256
+ with space2:
257
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
258
+ metric = metric.lower()
 
259
 
260
+ if tab_section:
261
+ if tab_section in sum:
262
+ sum_table_mulit_metrix(dataset_list, metric)
 
263
  else:
264
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
265
+ draw_table(tab_section, metric)
 
 
266
 
267
 
268
  def speech_question_answering_singlish():
269
  st.title("Task: Spoken Question Answering - Singlish")
270
 
271
  sum = ['Overall']
272
+ dataset_list = [
 
273
  'MNSC-PART3-SQA',
274
  'MNSC-PART4-SQA',
275
  'MNSC-PART5-SQA',
276
  'MNSC-PART6-SQA',
277
  ]
278
+ filters_1_list = sum + dataset_list
 
 
279
 
280
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
281
 
282
+ with space1:
283
+ tab_section = st.selectbox('Dataset', filters_1_list)
284
+ with space2:
285
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
286
+ metric = metric.lower()
287
 
288
+ if tab_section:
289
+ if tab_section in sum:
290
+ sum_table_mulit_metrix(dataset_list, metric)
 
291
  else:
292
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
293
+ draw_table(tab_section, metric)
294
 
295
 
296
  def spoken_dialogue_summarization_singlish():
297
  st.title("Task: Spoken Dialogue Summarization - Singlish")
298
 
299
  sum = ['Overall']
300
+ dataset_list = [
 
301
  'MNSC-PART3-SDS',
302
  'MNSC-PART4-SDS',
303
  'MNSC-PART5-SDS',
304
  'MNSC-PART6-SDS',
305
  ]
306
+ filters_1_list = sum + dataset_list
307
 
308
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
 
309
 
310
+ with space1:
311
+ tab_section = st.selectbox('Dataset', filters_1_list)
312
+ with space2:
313
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
314
+ metric = metric.lower()
315
 
316
+ if tab_section:
317
+ if tab_section in sum:
318
+ sum_table_mulit_metrix(dataset_list, metric)
 
319
  else:
320
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
321
+ draw_table(tab_section, metric)
322
 
323
 
324
 
 
327
  st.title("Task: Speech Instruction")
328
 
329
  sum = ['Overall']
330
+ dataset_list = ['OpenHermes-Audio',
 
331
  'ALPACA-Audio',
332
  ]
333
+ filters_1_list = sum + dataset_list
334
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
335
 
336
+ with space1:
337
+ tab_section = st.selectbox('Dataset', filters_1_list)
338
+ with space2:
339
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
340
+ metric = metric.lower()
 
341
 
342
+ if tab_section:
343
+ if tab_section in sum:
344
+ sum_table_mulit_metrix(dataset_list, metric)
345
  else:
346
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
347
+ draw_table(tab_section, metric)
 
348
 
349
 
350
 
351
  def audio_captioning():
352
  st.title("Task: Audio Captioning")
353
 
354
+ dataset_list = [ 'WavCaps',
355
  'AudioCaps',
356
  ]
 
357
 
358
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
359
 
360
+ with space1:
361
+ tab_section = st.selectbox('Dataset', dataset_list)
362
+ with space2:
363
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'METEOR'])
364
+ metric = metric.lower()
 
 
 
 
365
 
366
+ if tab_section:
367
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
368
+ draw_table(tab_section, metric)
369
 
370
 
371
  def audio_scene_question_answering():
372
  st.title("Task: Audio Scene Question Answering")
373
 
374
  sum = ['Overall']
375
+ dataset_list = ['Clotho-AQA',
 
376
  'WavCaps-QA',
377
  'AudioCaps-QA']
378
 
379
+ filters_1_list = sum + dataset_list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
 
381
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
 
 
 
 
 
 
 
 
382
 
383
+ with space1:
384
+ tab_section = st.selectbox('Dataset', filters_1_list)
385
+ with space2:
386
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
387
+ metric = metric.lower()
388
 
389
+ if tab_section:
390
+ if tab_section in sum:
391
+ sum_table_mulit_metrix(dataset_list, metric)
392
  else:
393
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
394
+ draw_table(tab_section, metric)
395
+
396
 
397
 
398
 
 
401
  st.title("Task: Accent Recognition")
402
 
403
  sum = ['Overall']
404
+ dataset_list = [
405
  'VoxCeleb-Accent',
406
  'MNSC-AR-Sentence',
407
  'MNSC-AR-Dialogue',
408
  ]
409
+ filters_1_list = sum + dataset_list
 
 
410
 
411
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
412
 
413
+ with space1:
414
+ tab_section = st.selectbox('Dataset', filters_1_list)
415
+ with space2:
416
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
417
+ metric = metric.lower()
418
 
419
+ if tab_section:
420
+ if tab_section in sum:
421
+ sum_table_mulit_metrix(dataset_list, metric)
422
  else:
423
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
424
+ draw_table(tab_section, metric)
 
425
 
426
 
427
 
 
429
  st.title("Task: Gender Recognition")
430
 
431
  sum = ['Overall']
432
+ dataset_list = [
 
433
  'VoxCeleb-Gender',
434
  'IEMOCAP-Gender'
435
  ]
436
+ filters_1_list = sum + dataset_list
 
437
 
438
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
439
 
440
+ with space1:
441
+ tab_section = st.selectbox('Dataset', filters_1_list)
442
+ with space2:
443
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
444
+ metric = metric.lower()
445
+
446
+ if tab_section:
447
+ if tab_section in sum:
448
+ sum_table_mulit_metrix(dataset_list, metric)
449
+ else:
450
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
451
+ draw_table(tab_section, metric)
452
+
453
+
454
+
455
+
456
+
457
+ def emotion_recognition():
458
+ st.title("Task: Emotion Recognition")
459
+
460
+ sum = ['Overall']
461
+ dataset_list = [
462
+ 'IEMOCAP-Emotion',
463
+ 'MELD-Sentiment',
464
+ 'MELD-Emotion',
465
+ ]
466
+ filters_1_list = sum + dataset_list
467
+
468
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
469
 
470
+ with space1:
471
+ tab_section = st.selectbox('Dataset', filters_1_list)
472
+ with space2:
473
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
474
+ metric = metric.lower()
475
+
476
+ if tab_section:
477
+ if tab_section in sum:
478
+ sum_table_mulit_metrix(dataset_list, metric)
479
  else:
480
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
481
+ draw_table(tab_section, metric)
482
 
483
 
484
 
 
488
 
489
  sum = ['Overall']
490
 
491
+ dataset_list = ['MuChoMusic',
492
  ]
493
 
494
+ filters_1_list = sum + dataset_list
 
 
495
 
496
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
 
497
 
498
+ with space1:
499
+ tab_section = st.selectbox('Dataset', filters_1_list)
500
+ with space2:
501
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
502
+ metric = metric.lower()
 
 
 
503
 
504
+ if tab_section:
505
+ if tab_section in sum:
506
+ sum_table_mulit_metrix(dataset_list, metric)
507
+ else:
508
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
509
+ draw_table(tab_section, metric)
510
 
511
 
512
 
 
517
  def under_development():
518
  st.title("Task: Under Development")
519
 
520
+ dataset_list = [
 
521
  'CNA',
522
  'IDPC',
523
  'Parliament',
 
532
  'YTB-SQA-Batch1',
533
  'YTB-SDS-Batch1',
534
  'YTB-PQA-Batch1',
 
535
  ]
536
 
537
+ filters_1_list = dataset_list
 
 
 
 
 
538
 
539
+ space1, space2, _, _ = st.columns([0.4, 0.4, 0.2 ,0.2])
540
 
541
+ with space1:
542
+ tab_section = st.selectbox('Dataset', filters_1_list)
543
+ with space2:
544
+ if tab_section in [
545
+ 'CNA',
546
+ 'IDPC',
547
+ 'Parliament',
548
+ 'UKUS-News',
549
+ 'Mediacorp',
550
+ 'IDPC-Short',
551
+ 'Parliament-Short',
552
+ 'UKUS-News-Short',
553
+ 'Mediacorp-Short',
554
+ 'YTB-ASR-Batch1',
555
+ 'YTB-ASR-Batch2',
556
+ ]:
557
+ metric = st.selectbox('Metric', ['WER'])
558
+ metric = metric.lower()
559
+ elif tab_section in [
560
+ 'YTB-SQA-Batch1',
561
+ 'YTB-SDS-Batch1',
562
+ 'YTB-PQA-Batch1',
563
+ ]:
564
+ metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE'])
565
+ metric = metric.lower()
566
+ else:
567
+ raise ValueError('Invalid dataset')
568
 
569
+
570
+ if tab_section:
571
+ dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
572
+ draw_table(tab_section, metric)
573
 
574
 
575
  def mmau_evaluation():
app/summarization.py CHANGED
@@ -1,6 +1,9 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
 
 
 
4
  from streamlit_echarts import st_echarts
5
  from streamlit.components.v1 import html
6
  # from PIL import Image
@@ -14,20 +17,27 @@ from model_information import get_dataframe
14
 
15
  info_df = get_dataframe()
16
 
 
17
 
18
- def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
 
19
 
20
- # combine chart data from multiple sources
21
- chart_data = pd.DataFrame()
22
- for metrics in metrics_lists:
23
- folder = f"./results_organized/{metrics}"
24
- data_path = f'{folder}/{task_name.lower()}.csv'
25
- one_chart_data = pd.read_csv(data_path).round(3)
26
- if len(chart_data) == 0:
27
- chart_data = one_chart_data
28
- else:
29
- chart_data = pd.merge(chart_data, one_chart_data, on='Model', how='outer')
30
-
 
 
 
 
 
31
 
32
  selected_columns = [i for i in chart_data.columns if i != 'Model']
33
  chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
@@ -81,7 +91,7 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
81
  # Format numeric columns to 2 decimal places
82
  chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
83
 
84
- if metrics in ['wer']:
85
  ascend = True
86
  else:
87
  ascend= False
@@ -124,4 +134,4 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
124
  )
125
 
126
  # Only report the last metrics
127
- st.markdown(f'###### Metric: {metrics_info[metrics]}')
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
+
5
+ import json
6
+
7
  from streamlit_echarts import st_echarts
8
  from streamlit.components.v1 import html
9
  # from PIL import Image
 
17
 
18
  info_df = get_dataframe()
19
 
20
+ def sum_table_mulit_metrix(dataset_displayname_list, metric):
21
 
22
+ with open('organize_model_results.json', 'r') as f:
23
+ organize_model_results = json.load(f)
24
 
25
+ dataset_results = {}
26
+
27
+ for dataset_displayname in dataset_displayname_list:
28
+ dataset_nickname = displayname2datasetname[dataset_displayname]
29
+ model_results = organize_model_results[dataset_nickname][metric]
30
+ model_name_mapping = {key.strip(): val for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
31
+ model_results = {model_name_mapping.get(key, key): val for key, val in model_results.items()}
32
+
33
+ dataset_results[dataset_displayname] = model_results
34
+
35
+ df_results = pd.DataFrame(dataset_results)
36
+
37
+ # Reset index to have models as a column
38
+ df_results.reset_index(inplace=True)
39
+ df_results.rename(columns={"index": "Model"}, inplace=True)
40
+ chart_data = df_results
41
 
42
  selected_columns = [i for i in chart_data.columns if i != 'Model']
43
  chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
 
91
  # Format numeric columns to 2 decimal places
92
  chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
93
 
94
+ if metric == 'wer':
95
  ascend = True
96
  else:
97
  ascend= False
 
134
  )
135
 
136
  # Only report the last metrics
137
+ st.markdown(f'###### Metric: {metrics_info[metric]}')