Upload organize_model_results.json with huggingface_hub
Browse files- organize_model_results.json +121 -41
organize_model_results.json
CHANGED
@@ -5,6 +5,7 @@
|
|
5 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 47.01682396389003,
|
6 |
"Qwen2-Audio-7B-Instruct": 29.187525646286417,
|
7 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.640951990151827,
|
|
|
8 |
"WavLLM_fairseq": 39.96717275338531,
|
9 |
"SALMONN_7B": 34.222404595814524,
|
10 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 39.32704144439885
|
@@ -13,12 +14,22 @@
|
|
13 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 39.462453836684446
|
14 |
}
|
15 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
"wavcaps_test": {
|
17 |
"meteor": {
|
18 |
"Qwen-Audio-Chat": 0.2355106805560457,
|
19 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.3175511907248581,
|
20 |
"Qwen2-Audio-7B-Instruct": 0.21342294856199182,
|
21 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.120421856260385,
|
|
|
22 |
"WavLLM_fairseq": 0.06399522524688675,
|
23 |
"SALMONN_7B": 0.17175112770658157,
|
24 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.1388630786594543
|
@@ -28,6 +39,7 @@
|
|
28 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 33.97687861271676,
|
29 |
"Qwen2-Audio-7B-Instruct": 33.78034682080925,
|
30 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 6.3468208092485545,
|
|
|
31 |
"WavLLM_fairseq": 6.901734104046243,
|
32 |
"SALMONN_7B": 23.76878612716763,
|
33 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 3.445086705202312
|
@@ -43,6 +55,7 @@
|
|
43 |
"Qwen2-Audio-7B-Instruct": 16.466557744958333,
|
44 |
"whisper_large_v3": 14.673689493155793,
|
45 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.209998552437538,
|
|
|
46 |
"WavLLM_fairseq": 2.368659001743569,
|
47 |
"SALMONN_7B": 5.296039450108202,
|
48 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 14.154700735606419
|
@@ -54,6 +67,7 @@
|
|
54 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 67.2,
|
55 |
"Qwen2-Audio-7B-Instruct": 53.6,
|
56 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 71.6,
|
|
|
57 |
"WavLLM_fairseq": 62.199999999999996,
|
58 |
"SALMONN_7B": 46.8,
|
59 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.0
|
@@ -92,6 +106,7 @@
|
|
92 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 57.7927548441449,
|
93 |
"Qwen2-Audio-7B-Instruct": 71.60909856781802,
|
94 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 51.727042965459134,
|
|
|
95 |
"WavLLM_fairseq": 44.3133951137321,
|
96 |
"SALMONN_7B": 50.88458298230834,
|
97 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 56.44481887110362
|
@@ -116,6 +131,7 @@
|
|
116 |
"Qwen2-Audio-7B-Instruct": 0.1905689473257041,
|
117 |
"whisper_large_v3": 0.3171008846684522,
|
118 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.32988393799204613,
|
|
|
119 |
"WavLLM_fairseq": 0.4463923382842302,
|
120 |
"SALMONN_7B": 0.42346400454508565,
|
121 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.31912994075156237
|
@@ -128,6 +144,7 @@
|
|
128 |
"Qwen2-Audio-7B-Instruct": 0.18872219319407232,
|
129 |
"whisper_large_v3": 0.11863959266711877,
|
130 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.11416493424197618,
|
|
|
131 |
"WavLLM_fairseq": 0.6447482518259942,
|
132 |
"SALMONN_7B": 0.2577708974886327,
|
133 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.11773910240019567
|
@@ -152,6 +169,7 @@
|
|
152 |
"Qwen2-Audio-7B-Instruct": 0.060415760304159495,
|
153 |
"whisper_large_v3": 0.03660128246354058,
|
154 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05307658841999735,
|
|
|
155 |
"WavLLM_fairseq": 0.04798834811886432,
|
156 |
"SALMONN_7B": 0.09671439650443565,
|
157 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.03714982881570734
|
@@ -164,6 +182,7 @@
|
|
164 |
"Qwen2-Audio-7B-Instruct": 0.035141660693401744,
|
165 |
"whisper_large_v3": 0.01878749009695552,
|
166 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.032349945297468596,
|
|
|
167 |
"WavLLM_fairseq": 0.02103218017882069,
|
168 |
"SALMONN_7B": 0.10270871845172973,
|
169 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.018334779492209605
|
@@ -176,6 +195,7 @@
|
|
176 |
"Qwen2-Audio-7B-Instruct": 0.2245352799625317,
|
177 |
"whisper_large_v3": 0.1698509342851144,
|
178 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1789273082575623,
|
|
|
179 |
"WavLLM_fairseq": 0.42541061709652933,
|
180 |
"SALMONN_7B": 0.24872817713464365,
|
181 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.17467982364056267
|
@@ -187,6 +207,7 @@
|
|
187 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 65.6,
|
188 |
"Qwen2-Audio-7B-Instruct": 44.800000000000004,
|
189 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 72.2,
|
|
|
190 |
"WavLLM_fairseq": 19.2,
|
191 |
"SALMONN_7B": 15.8,
|
192 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 63.0
|
@@ -201,6 +222,7 @@
|
|
201 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 48.505976095617534,
|
202 |
"Qwen2-Audio-7B-Instruct": 53.98406374501992,
|
203 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.322709163346616,
|
|
|
204 |
"WavLLM_fairseq": 59.76095617529881,
|
205 |
"SALMONN_7B": 23.804780876494025,
|
206 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 46.713147410358566
|
@@ -212,6 +234,7 @@
|
|
212 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 59.7093023255814,
|
213 |
"Qwen2-Audio-7B-Instruct": 58.31395348837209,
|
214 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.11046511627907,
|
|
|
215 |
"WavLLM_fairseq": 58.54651162790698,
|
216 |
"SALMONN_7B": 59.24418604651163,
|
217 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.94186046511628
|
@@ -239,6 +262,7 @@
|
|
239 |
"Qwen2-Audio-7B-Instruct": 0.11438872500819404,
|
240 |
"whisper_large_v3": 0.10001863741235596,
|
241 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.10600831614192711,
|
|
|
242 |
"WavLLM_fairseq": 0.14533325621300636,
|
243 |
"SALMONN_7B": 0.3062255383962828,
|
244 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.09876543209876543
|
@@ -250,6 +274,7 @@
|
|
250 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 73.66473556344609,
|
251 |
"Qwen2-Audio-7B-Instruct": 64.86264249672958,
|
252 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.61894972902262,
|
|
|
253 |
"WavLLM_fairseq": 77.64903756307233,
|
254 |
"SALMONN_7B": 66.39506634273968,
|
255 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 83.81984675761541
|
@@ -266,6 +291,7 @@
|
|
266 |
"whisper_large_v3": 0.5377268970583734,
|
267 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5840399155162387,
|
268 |
"gemini-1.5-flash": 1.1100431601824359,
|
|
|
269 |
"WavLLM_fairseq": 1.2204842511249197,
|
270 |
"SALMONN_7B": 1.0189782362484312,
|
271 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.507882090054792
|
@@ -277,6 +303,7 @@
|
|
277 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 46.206896551724135,
|
278 |
"Qwen2-Audio-7B-Instruct": 53.9463601532567,
|
279 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.59003831417625,
|
|
|
280 |
"WavLLM_fairseq": 51.072796934865906,
|
281 |
"SALMONN_7B": 41.7624521072797,
|
282 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 45.593869731800766
|
@@ -288,6 +315,7 @@
|
|
288 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 53.2,
|
289 |
"Qwen2-Audio-7B-Instruct": 39.6,
|
290 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 66.0,
|
|
|
291 |
"WavLLM_fairseq": 46.6,
|
292 |
"SALMONN_7B": 36.6,
|
293 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 53.8
|
@@ -302,6 +330,7 @@
|
|
302 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 99.75379565038982,
|
303 |
"Qwen2-Audio-7B-Instruct": 99.1177677472302,
|
304 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 34.94050061551087,
|
|
|
305 |
"WavLLM_fairseq": 69.61427985227739,
|
306 |
"SALMONN_7B": 88.79770209273697,
|
307 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 42.921624948707425
|
@@ -313,6 +342,7 @@
|
|
313 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 93.76666666666667,
|
314 |
"Qwen2-Audio-7B-Instruct": 61.56666666666667,
|
315 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 19.6,
|
|
|
316 |
"WavLLM_fairseq": 46.766666666666666,
|
317 |
"SALMONN_7B": 42.733333333333334,
|
318 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 25.433333333333337
|
@@ -337,6 +367,7 @@
|
|
337 |
"Qwen2-Audio-7B-Instruct": 0.27856006770658537,
|
338 |
"whisper_large_v3": 0.2143555471246589,
|
339 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.22881615619208825,
|
|
|
340 |
"WavLLM_fairseq": 0.39796588405247263,
|
341 |
"SALMONN_7B": 0.34868891450584405,
|
342 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.22004640235805695
|
@@ -360,6 +391,7 @@
|
|
360 |
"Qwen2-Audio-7B-Instruct": 0.23542555661330924,
|
361 |
"whisper_large_v3": 0.15887899737116104,
|
362 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1448629161356777,
|
|
|
363 |
"WavLLM_fairseq": 0.6671766188447099,
|
364 |
"SALMONN_7B": 0.3597423676988383,
|
365 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.15611126487402763
|
@@ -396,6 +428,7 @@
|
|
396 |
"Qwen2-Audio-7B-Instruct": 6.326113431899141,
|
397 |
"whisper_large_v3": 46.01512198258627,
|
398 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 46.80524126004861,
|
|
|
399 |
"WavLLM_fairseq": 5.933522277713613,
|
400 |
"SALMONN_7B": 26.89649039333571,
|
401 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 46.79924664837527
|
@@ -432,6 +465,7 @@
|
|
432 |
"Qwen2-Audio-7B-Instruct": 0.11723812890302816,
|
433 |
"whisper_large_v3": 0.09459022434812692,
|
434 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.09948381629977261,
|
|
|
435 |
"WavLLM_fairseq": 0.15491778414546403,
|
436 |
"SALMONN_7B": 0.10765150204693537,
|
437 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.09515429104337297
|
@@ -455,6 +489,7 @@
|
|
455 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 48.4,
|
456 |
"Qwen2-Audio-7B-Instruct": 33.8,
|
457 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 45.4,
|
|
|
458 |
"WavLLM_fairseq": 31.6,
|
459 |
"SALMONN_7B": 9.0,
|
460 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 37.400000000000006
|
@@ -469,6 +504,7 @@
|
|
469 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 77.83333333333333,
|
470 |
"Qwen2-Audio-7B-Instruct": 0.9666666666666667,
|
471 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 7.633333333333334,
|
|
|
472 |
"WavLLM_fairseq": 0.23333333333333336,
|
473 |
"SALMONN_7B": 0.06666666666666667,
|
474 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 9.666666666666666
|
@@ -480,6 +516,7 @@
|
|
480 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 93.48605577689243,
|
481 |
"Qwen2-Audio-7B-Instruct": 92.80876494023903,
|
482 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.737051792828685,
|
|
|
483 |
"WavLLM_fairseq": 51.932270916334666,
|
484 |
"SALMONN_7B": 81.31474103585658,
|
485 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 44.22310756972111
|
@@ -492,6 +529,7 @@
|
|
492 |
"Qwen2-Audio-7B-Instruct": 0.2080008649583739,
|
493 |
"whisper_large_v3": 0.17210509244242622,
|
494 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.2192622950819672,
|
|
|
495 |
"WavLLM_fairseq": 0.48091685587631094,
|
496 |
"SALMONN_7B": 0.3238620391393664,
|
497 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.23561466104443723
|
@@ -516,6 +554,7 @@
|
|
516 |
"Qwen2-Audio-7B-Instruct": 74.7247908410392,
|
517 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 91.85380889476001,
|
518 |
"gemini-1.5-flash": 89.25583443416997,
|
|
|
519 |
"WavLLM_fairseq": 66.31439894319684,
|
520 |
"SALMONN_7B": 50.99075297225891,
|
521 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 85.2928225451343
|
@@ -527,6 +566,7 @@
|
|
527 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 84.31782540512285,
|
528 |
"Qwen2-Audio-7B-Instruct": 66.49242028227914,
|
529 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 89.33612127548353,
|
|
|
530 |
"WavLLM_fairseq": 66.5446941975954,
|
531 |
"SALMONN_7B": 56.455828541557764,
|
532 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 86.4610559330894
|
@@ -538,6 +578,7 @@
|
|
538 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 57.0,
|
539 |
"Qwen2-Audio-7B-Instruct": 40.4,
|
540 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 58.0,
|
|
|
541 |
"WavLLM_fairseq": 45.199999999999996,
|
542 |
"SALMONN_7B": 17.2,
|
543 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
@@ -553,6 +594,7 @@
|
|
553 |
"Qwen2-Audio-7B-Instruct": 0.09260359129694522,
|
554 |
"whisper_large_v3": 0.12359684029221357,
|
555 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.20886539565639167,
|
|
|
556 |
"WavLLM_fairseq": 0.7054601967888183,
|
557 |
"SALMONN_7B": 0.8259290055631446,
|
558 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.12450753301261111
|
@@ -564,6 +606,7 @@
|
|
564 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 51.4,
|
565 |
"Qwen2-Audio-7B-Instruct": 42.0,
|
566 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.0,
|
|
|
567 |
"WavLLM_fairseq": 45.199999999999996,
|
568 |
"SALMONN_7B": 40.599999999999994,
|
569 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
@@ -578,6 +621,7 @@
|
|
578 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 46.4,
|
579 |
"Qwen2-Audio-7B-Instruct": 24.8,
|
580 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.0,
|
|
|
581 |
"WavLLM_fairseq": 31.6,
|
582 |
"SALMONN_7B": 7.0,
|
583 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.0
|
@@ -604,6 +648,7 @@
|
|
604 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 38.00454545454545,
|
605 |
"Qwen2-Audio-7B-Instruct": 40.77727272727273,
|
606 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 3.0954545454545457,
|
|
|
607 |
"WavLLM_fairseq": 5.5,
|
608 |
"SALMONN_7B": 37.445454545454545,
|
609 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 2.4727272727272727
|
@@ -613,6 +658,7 @@
|
|
613 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.24920047034353812,
|
614 |
"Qwen2-Audio-7B-Instruct": 0.19891712076314283,
|
615 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05796819723943051,
|
|
|
616 |
"WavLLM_fairseq": 0.041732965094428545,
|
617 |
"SALMONN_7B": 0.20994052484339956,
|
618 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.07953048457785493
|
@@ -627,6 +673,7 @@
|
|
627 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 7.816666666666666,
|
628 |
"Qwen2-Audio-7B-Instruct": 2.55,
|
629 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.016666666666666,
|
|
|
630 |
"WavLLM_fairseq": 2.6833333333333336,
|
631 |
"SALMONN_7B": 2.5166666666666666,
|
632 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 12.416666666666666
|
@@ -651,6 +698,7 @@
|
|
651 |
"Qwen2-Audio-7B-Instruct": 0.04425838146050298,
|
652 |
"whisper_large_v3": 2.451098639578599,
|
653 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 2.8327095799289337,
|
|
|
654 |
"WavLLM_fairseq": 0.1695522548322915,
|
655 |
"SALMONN_7B": 0.3649023706010388,
|
656 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 2.4245628096245917
|
@@ -663,6 +711,7 @@
|
|
663 |
"Qwen2-Audio-7B-Instruct": 16.325186897428104,
|
664 |
"whisper_large_v3": 1.600581653970121,
|
665 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 27.620150160643625,
|
|
|
666 |
"WavLLM_fairseq": 13.841886973016162,
|
667 |
"SALMONN_7B": 14.102682915273142,
|
668 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 10.930203684508578
|
@@ -674,6 +723,7 @@
|
|
674 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 63.15021876519203,
|
675 |
"Qwen2-Audio-7B-Instruct": 50.919591292758774,
|
676 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.647544968400585,
|
|
|
677 |
"WavLLM_fairseq": 43.01199466903598,
|
678 |
"SALMONN_7B": 57.75401069518716,
|
679 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 29.47134606841404
|
@@ -690,6 +740,7 @@
|
|
690 |
"whisper_large_v3": 0.12226319428439733,
|
691 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1400092187139894,
|
692 |
"gemini-1.5-flash": 0.1089344703080587,
|
|
|
693 |
"WavLLM_fairseq": 0.41876008296842593,
|
694 |
"SALMONN_7B": 0.21487285856956287,
|
695 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.12579703464700007
|
@@ -702,6 +753,7 @@
|
|
702 |
"Qwen2-Audio-7B-Instruct": 0.35076166942732234,
|
703 |
"whisper_large_v3": 0.27026366524560785,
|
704 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.3035544573275043,
|
|
|
705 |
"WavLLM_fairseq": 0.7540934640345399,
|
706 |
"SALMONN_7B": 0.6569229098215983,
|
707 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.29992939962527493
|
@@ -713,6 +765,7 @@
|
|
713 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 74.80000000000001,
|
714 |
"Qwen2-Audio-7B-Instruct": 52.599999999999994,
|
715 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.8,
|
|
|
716 |
"WavLLM_fairseq": 21.6,
|
717 |
"SALMONN_7B": 17.2,
|
718 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 70.8
|
@@ -750,6 +803,7 @@
|
|
750 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 49.77635782747604,
|
751 |
"Qwen2-Audio-7B-Instruct": 45.75079872204473,
|
752 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.466453674121407,
|
|
|
753 |
"WavLLM_fairseq": 29.840255591054312,
|
754 |
"SALMONN_7B": 50.287539936102235,
|
755 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 17.380191693290733
|
@@ -765,6 +819,7 @@
|
|
765 |
"Qwen2-Audio-7B-Instruct": 0.07197717796796138,
|
766 |
"whisper_large_v3": 0.06844171360300393,
|
767 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07041669714480775,
|
|
|
768 |
"WavLLM_fairseq": 0.10077292565771828,
|
769 |
"SALMONN_7B": 0.0925804013361617,
|
770 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.06922195401458074
|
@@ -789,6 +844,7 @@
|
|
789 |
"Qwen2-Audio-7B-Instruct": 0.03245972071872916,
|
790 |
"whisper_large_v3": 0.02107778621423822,
|
791 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 8.433062902024755,
|
|
|
792 |
"WavLLM_fairseq": 0.0033159224040994286,
|
793 |
"SALMONN_7B": 0.00046745670226766583,
|
794 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 1.0368044741318085
|
@@ -800,6 +856,7 @@
|
|
800 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 46.31578947368421,
|
801 |
"Qwen2-Audio-7B-Instruct": 44.473684210526315,
|
802 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.88157894736842,
|
|
|
803 |
"WavLLM_fairseq": 26.25,
|
804 |
"SALMONN_7B": 47.30263157894737,
|
805 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 16.710526315789473
|
@@ -827,11 +884,68 @@
|
|
827 |
"Qwen2-Audio-7B-Instruct": 25.765420247070075,
|
828 |
"whisper_large_v3": 0.16408986541757878,
|
829 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 35.274306071307024,
|
|
|
830 |
"WavLLM_fairseq": 31.96381187282953,
|
831 |
"SALMONN_7B": 33.88941292215531,
|
832 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 5.987143868370054
|
833 |
}
|
834 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
835 |
"imda_part5_30s_sqa_test": {
|
836 |
"llama3_70b_judge": {
|
837 |
"Qwen-Audio-Chat": 61.260000000000005,
|
@@ -850,6 +964,7 @@
|
|
850 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 66.13333333333333,
|
851 |
"Qwen2-Audio-7B-Instruct": 68.38333333333333,
|
852 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.35,
|
|
|
853 |
"WavLLM_fairseq": 49.06666666666666,
|
854 |
"SALMONN_7B": 59.766666666666666,
|
855 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.016666666666666
|
@@ -861,6 +976,7 @@
|
|
861 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 86.76470588235293,
|
862 |
"Qwen2-Audio-7B-Instruct": 80.04901960784315,
|
863 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.57843137254902,
|
|
|
864 |
"WavLLM_fairseq": 83.92156862745098,
|
865 |
"SALMONN_7B": 83.48039215686273,
|
866 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 82.99019607843137
|
@@ -876,6 +992,7 @@
|
|
876 |
"Qwen2-Audio-7B-Instruct": 0.08739585179932637,
|
877 |
"whisper_large_v3": 0.03208650948413402,
|
878 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.04396383619925545,
|
|
|
879 |
"WavLLM_fairseq": 0.4536784258110264,
|
880 |
"SALMONN_7B": 0.14231519234178336,
|
881 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.04754476156709803
|
@@ -887,6 +1004,7 @@
|
|
887 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 36.36015325670498,
|
888 |
"Qwen2-Audio-7B-Instruct": 41.60919540229885,
|
889 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 47.356321839080465,
|
|
|
890 |
"WavLLM_fairseq": 41.57088122605364,
|
891 |
"SALMONN_7B": 30.536398467432953,
|
892 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.81992337164751
|
@@ -899,6 +1017,7 @@
|
|
899 |
"Qwen2-Audio-7B-Instruct": 0.06114048472375004,
|
900 |
"whisper_large_v3": 0.037649480146197796,
|
901 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.04900464852205386,
|
|
|
902 |
"WavLLM_fairseq": 0.06621482559171073,
|
903 |
"SALMONN_7B": 0.0459884319222171,
|
904 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.038146268762641496
|
@@ -912,6 +1031,7 @@
|
|
912 |
"whisper_large_v3": 0.7225930420711975,
|
913 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.7824973031283711,
|
914 |
"gemini-1.5-flash": 0.9690871089536138,
|
|
|
915 |
"WavLLM_fairseq": 1.2913969795037756,
|
916 |
"SALMONN_7B": 1.2721817691477886,
|
917 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.6848705501618123
|
@@ -935,6 +1055,7 @@
|
|
935 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 64.80000000000001,
|
936 |
"Qwen2-Audio-7B-Instruct": 51.6,
|
937 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 74.0,
|
|
|
938 |
"WavLLM_fairseq": 50.8,
|
939 |
"SALMONN_7B": 44.6,
|
940 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.800000000000004
|
@@ -1006,47 +1127,6 @@
|
|
1006 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 63.68000000000001
|
1007 |
}
|
1008 |
},
|
1009 |
-
"imda_part4_30s_asr_test": {
|
1010 |
-
"wer": {
|
1011 |
-
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.0
|
1012 |
-
}
|
1013 |
-
},
|
1014 |
-
"mmau_mini": {
|
1015 |
-
"string_match": {
|
1016 |
-
"MERaLiON-AudioLLM-Whisper-SEA-LION": 60.5
|
1017 |
-
},
|
1018 |
-
"llama3_70b_judge": {
|
1019 |
-
"MERaLiON-AudioLLM-Whisper-SEA-LION": 64.60000000000001,
|
1020 |
-
"phi_4_multimodal_instruct": 59.4
|
1021 |
-
}
|
1022 |
-
},
|
1023 |
-
"mmau_mini_music": {
|
1024 |
-
"string_match": {
|
1025 |
-
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6077844311377245
|
1026 |
-
},
|
1027 |
-
"llama3_70b_judge": {
|
1028 |
-
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6437125748502994,
|
1029 |
-
"phi_4_multimodal_instruct": 0.688622754491018
|
1030 |
-
}
|
1031 |
-
},
|
1032 |
-
"mmau_mini_sound": {
|
1033 |
-
"string_match": {
|
1034 |
-
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6606606606606606
|
1035 |
-
},
|
1036 |
-
"llama3_70b_judge": {
|
1037 |
-
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.7027027027027027,
|
1038 |
-
"phi_4_multimodal_instruct": 0.6456456456456456
|
1039 |
-
}
|
1040 |
-
},
|
1041 |
-
"mmau_mini_speech": {
|
1042 |
-
"string_match": {
|
1043 |
-
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.5465465465465466
|
1044 |
-
},
|
1045 |
-
"llama3_70b_judge": {
|
1046 |
-
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.5915915915915916,
|
1047 |
-
"phi_4_multimodal_instruct": 0.44744744744744747
|
1048 |
-
}
|
1049 |
-
},
|
1050 |
"imda_30s_ar_test": {
|
1051 |
"llama3_70b_judge": {
|
1052 |
"Qwen2-Audio-7B-Instruct": 5.106666666666667,
|
|
|
5 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 47.01682396389003,
|
6 |
"Qwen2-Audio-7B-Instruct": 29.187525646286417,
|
7 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.640951990151827,
|
8 |
+
"phi_4_multimodal_instruct": 26.815757078375054,
|
9 |
"WavLLM_fairseq": 39.96717275338531,
|
10 |
"SALMONN_7B": 34.222404595814524,
|
11 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 39.32704144439885
|
|
|
14 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 39.462453836684446
|
15 |
}
|
16 |
},
|
17 |
+
"imda_part4_30s_asr_test": {
|
18 |
+
"wer": {
|
19 |
+
"Qwen-Audio-Chat": 1.1764312018747907,
|
20 |
+
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.0,
|
21 |
+
"Qwen2-Audio-7B-Instruct": 0.5685405990059489,
|
22 |
+
"whisper_large_v3": 0.8294532718704128,
|
23 |
+
"phi_4_multimodal_instruct": 1.3868687388941825
|
24 |
+
}
|
25 |
+
},
|
26 |
"wavcaps_test": {
|
27 |
"meteor": {
|
28 |
"Qwen-Audio-Chat": 0.2355106805560457,
|
29 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.3175511907248581,
|
30 |
"Qwen2-Audio-7B-Instruct": 0.21342294856199182,
|
31 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.120421856260385,
|
32 |
+
"phi_4_multimodal_instruct": 0.24508284335582894,
|
33 |
"WavLLM_fairseq": 0.06399522524688675,
|
34 |
"SALMONN_7B": 0.17175112770658157,
|
35 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.1388630786594543
|
|
|
39 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 33.97687861271676,
|
40 |
"Qwen2-Audio-7B-Instruct": 33.78034682080925,
|
41 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 6.3468208092485545,
|
42 |
+
"phi_4_multimodal_instruct": 21.884393063583815,
|
43 |
"WavLLM_fairseq": 6.901734104046243,
|
44 |
"SALMONN_7B": 23.76878612716763,
|
45 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 3.445086705202312
|
|
|
55 |
"Qwen2-Audio-7B-Instruct": 16.466557744958333,
|
56 |
"whisper_large_v3": 14.673689493155793,
|
57 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.209998552437538,
|
58 |
+
"phi_4_multimodal_instruct": 22.678131781242936,
|
59 |
"WavLLM_fairseq": 2.368659001743569,
|
60 |
"SALMONN_7B": 5.296039450108202,
|
61 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 14.154700735606419
|
|
|
67 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 67.2,
|
68 |
"Qwen2-Audio-7B-Instruct": 53.6,
|
69 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 71.6,
|
70 |
+
"phi_4_multimodal_instruct": 66.2,
|
71 |
"WavLLM_fairseq": 62.199999999999996,
|
72 |
"SALMONN_7B": 46.8,
|
73 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.0
|
|
|
106 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 57.7927548441449,
|
107 |
"Qwen2-Audio-7B-Instruct": 71.60909856781802,
|
108 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 51.727042965459134,
|
109 |
+
"phi_4_multimodal_instruct": 54.422914911541696,
|
110 |
"WavLLM_fairseq": 44.3133951137321,
|
111 |
"SALMONN_7B": 50.88458298230834,
|
112 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 56.44481887110362
|
|
|
131 |
"Qwen2-Audio-7B-Instruct": 0.1905689473257041,
|
132 |
"whisper_large_v3": 0.3171008846684522,
|
133 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.32988393799204613,
|
134 |
+
"phi_4_multimodal_instruct": 0.3470091713334957,
|
135 |
"WavLLM_fairseq": 0.4463923382842302,
|
136 |
"SALMONN_7B": 0.42346400454508565,
|
137 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.31912994075156237
|
|
|
144 |
"Qwen2-Audio-7B-Instruct": 0.18872219319407232,
|
145 |
"whisper_large_v3": 0.11863959266711877,
|
146 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.11416493424197618,
|
147 |
+
"phi_4_multimodal_instruct": 0.15921168191570967,
|
148 |
"WavLLM_fairseq": 0.6447482518259942,
|
149 |
"SALMONN_7B": 0.2577708974886327,
|
150 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.11773910240019567
|
|
|
169 |
"Qwen2-Audio-7B-Instruct": 0.060415760304159495,
|
170 |
"whisper_large_v3": 0.03660128246354058,
|
171 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05307658841999735,
|
172 |
+
"phi_4_multimodal_instruct": 0.03879546787220762,
|
173 |
"WavLLM_fairseq": 0.04798834811886432,
|
174 |
"SALMONN_7B": 0.09671439650443565,
|
175 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.03714982881570734
|
|
|
182 |
"Qwen2-Audio-7B-Instruct": 0.035141660693401744,
|
183 |
"whisper_large_v3": 0.01878749009695552,
|
184 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.032349945297468596,
|
185 |
+
"phi_4_multimodal_instruct": 0.0167502923755989,
|
186 |
"WavLLM_fairseq": 0.02103218017882069,
|
187 |
"SALMONN_7B": 0.10270871845172973,
|
188 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.018334779492209605
|
|
|
195 |
"Qwen2-Audio-7B-Instruct": 0.2245352799625317,
|
196 |
"whisper_large_v3": 0.1698509342851144,
|
197 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1789273082575623,
|
198 |
+
"phi_4_multimodal_instruct": 0.14552883606001388,
|
199 |
"WavLLM_fairseq": 0.42541061709652933,
|
200 |
"SALMONN_7B": 0.24872817713464365,
|
201 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.17467982364056267
|
|
|
207 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 65.6,
|
208 |
"Qwen2-Audio-7B-Instruct": 44.800000000000004,
|
209 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 72.2,
|
210 |
+
"phi_4_multimodal_instruct": 30.8,
|
211 |
"WavLLM_fairseq": 19.2,
|
212 |
"SALMONN_7B": 15.8,
|
213 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 63.0
|
|
|
222 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 48.505976095617534,
|
223 |
"Qwen2-Audio-7B-Instruct": 53.98406374501992,
|
224 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.322709163346616,
|
225 |
+
"phi_4_multimodal_instruct": 41.03585657370518,
|
226 |
"WavLLM_fairseq": 59.76095617529881,
|
227 |
"SALMONN_7B": 23.804780876494025,
|
228 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 46.713147410358566
|
|
|
234 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 59.7093023255814,
|
235 |
"Qwen2-Audio-7B-Instruct": 58.31395348837209,
|
236 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.11046511627907,
|
237 |
+
"phi_4_multimodal_instruct": 68.40116279069767,
|
238 |
"WavLLM_fairseq": 58.54651162790698,
|
239 |
"SALMONN_7B": 59.24418604651163,
|
240 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.94186046511628
|
|
|
262 |
"Qwen2-Audio-7B-Instruct": 0.11438872500819404,
|
263 |
"whisper_large_v3": 0.10001863741235596,
|
264 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.10600831614192711,
|
265 |
+
"phi_4_multimodal_instruct": 0.08262800367606891,
|
266 |
"WavLLM_fairseq": 0.14533325621300636,
|
267 |
"SALMONN_7B": 0.3062255383962828,
|
268 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.09876543209876543
|
|
|
274 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 73.66473556344609,
|
275 |
"Qwen2-Audio-7B-Instruct": 64.86264249672958,
|
276 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.61894972902262,
|
277 |
+
"phi_4_multimodal_instruct": 77.58549803774996,
|
278 |
"WavLLM_fairseq": 77.64903756307233,
|
279 |
"SALMONN_7B": 66.39506634273968,
|
280 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 83.81984675761541
|
|
|
291 |
"whisper_large_v3": 0.5377268970583734,
|
292 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5840399155162387,
|
293 |
"gemini-1.5-flash": 1.1100431601824359,
|
294 |
+
"phi_4_multimodal_instruct": 0.8529492791331231,
|
295 |
"WavLLM_fairseq": 1.2204842511249197,
|
296 |
"SALMONN_7B": 1.0189782362484312,
|
297 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.507882090054792
|
|
|
303 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 46.206896551724135,
|
304 |
"Qwen2-Audio-7B-Instruct": 53.9463601532567,
|
305 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.59003831417625,
|
306 |
+
"phi_4_multimodal_instruct": 51.609195402298845,
|
307 |
"WavLLM_fairseq": 51.072796934865906,
|
308 |
"SALMONN_7B": 41.7624521072797,
|
309 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 45.593869731800766
|
|
|
315 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 53.2,
|
316 |
"Qwen2-Audio-7B-Instruct": 39.6,
|
317 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 66.0,
|
318 |
+
"phi_4_multimodal_instruct": 43.8,
|
319 |
"WavLLM_fairseq": 46.6,
|
320 |
"SALMONN_7B": 36.6,
|
321 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 53.8
|
|
|
330 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 99.75379565038982,
|
331 |
"Qwen2-Audio-7B-Instruct": 99.1177677472302,
|
332 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 34.94050061551087,
|
333 |
+
"phi_4_multimodal_instruct": 94.54247025030776,
|
334 |
"WavLLM_fairseq": 69.61427985227739,
|
335 |
"SALMONN_7B": 88.79770209273697,
|
336 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 42.921624948707425
|
|
|
342 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 93.76666666666667,
|
343 |
"Qwen2-Audio-7B-Instruct": 61.56666666666667,
|
344 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 19.6,
|
345 |
+
"phi_4_multimodal_instruct": 36.833333333333336,
|
346 |
"WavLLM_fairseq": 46.766666666666666,
|
347 |
"SALMONN_7B": 42.733333333333334,
|
348 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 25.433333333333337
|
|
|
367 |
"Qwen2-Audio-7B-Instruct": 0.27856006770658537,
|
368 |
"whisper_large_v3": 0.2143555471246589,
|
369 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.22881615619208825,
|
370 |
+
"phi_4_multimodal_instruct": 0.22801359968481416,
|
371 |
"WavLLM_fairseq": 0.39796588405247263,
|
372 |
"SALMONN_7B": 0.34868891450584405,
|
373 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.22004640235805695
|
|
|
391 |
"Qwen2-Audio-7B-Instruct": 0.23542555661330924,
|
392 |
"whisper_large_v3": 0.15887899737116104,
|
393 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1448629161356777,
|
394 |
+
"phi_4_multimodal_instruct": 0.24134627375003423,
|
395 |
"WavLLM_fairseq": 0.6671766188447099,
|
396 |
"SALMONN_7B": 0.3597423676988383,
|
397 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.15611126487402763
|
|
|
428 |
"Qwen2-Audio-7B-Instruct": 6.326113431899141,
|
429 |
"whisper_large_v3": 46.01512198258627,
|
430 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 46.80524126004861,
|
431 |
+
"phi_4_multimodal_instruct": 0.36465303013961253,
|
432 |
"WavLLM_fairseq": 5.933522277713613,
|
433 |
"SALMONN_7B": 26.89649039333571,
|
434 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 46.79924664837527
|
|
|
465 |
"Qwen2-Audio-7B-Instruct": 0.11723812890302816,
|
466 |
"whisper_large_v3": 0.09459022434812692,
|
467 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.09948381629977261,
|
468 |
+
"phi_4_multimodal_instruct": 0.09672866386388193,
|
469 |
"WavLLM_fairseq": 0.15491778414546403,
|
470 |
"SALMONN_7B": 0.10765150204693537,
|
471 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.09515429104337297
|
|
|
489 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 48.4,
|
490 |
"Qwen2-Audio-7B-Instruct": 33.8,
|
491 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 45.4,
|
492 |
+
"phi_4_multimodal_instruct": 41.2,
|
493 |
"WavLLM_fairseq": 31.6,
|
494 |
"SALMONN_7B": 9.0,
|
495 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 37.400000000000006
|
|
|
504 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 77.83333333333333,
|
505 |
"Qwen2-Audio-7B-Instruct": 0.9666666666666667,
|
506 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 7.633333333333334,
|
507 |
+
"phi_4_multimodal_instruct": 0.5333333333333333,
|
508 |
"WavLLM_fairseq": 0.23333333333333336,
|
509 |
"SALMONN_7B": 0.06666666666666667,
|
510 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 9.666666666666666
|
|
|
516 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 93.48605577689243,
|
517 |
"Qwen2-Audio-7B-Instruct": 92.80876494023903,
|
518 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.737051792828685,
|
519 |
+
"phi_4_multimodal_instruct": 59.46215139442231,
|
520 |
"WavLLM_fairseq": 51.932270916334666,
|
521 |
"SALMONN_7B": 81.31474103585658,
|
522 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 44.22310756972111
|
|
|
529 |
"Qwen2-Audio-7B-Instruct": 0.2080008649583739,
|
530 |
"whisper_large_v3": 0.17210509244242622,
|
531 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.2192622950819672,
|
532 |
+
"phi_4_multimodal_instruct": 0.23849064763758243,
|
533 |
"WavLLM_fairseq": 0.48091685587631094,
|
534 |
"SALMONN_7B": 0.3238620391393664,
|
535 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.23561466104443723
|
|
|
554 |
"Qwen2-Audio-7B-Instruct": 74.7247908410392,
|
555 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 91.85380889476001,
|
556 |
"gemini-1.5-flash": 89.25583443416997,
|
557 |
+
"phi_4_multimodal_instruct": 73.18361955085865,
|
558 |
"WavLLM_fairseq": 66.31439894319684,
|
559 |
"SALMONN_7B": 50.99075297225891,
|
560 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 85.2928225451343
|
|
|
566 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 84.31782540512285,
|
567 |
"Qwen2-Audio-7B-Instruct": 66.49242028227914,
|
568 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 89.33612127548353,
|
569 |
+
"phi_4_multimodal_instruct": 72.60846837428123,
|
570 |
"WavLLM_fairseq": 66.5446941975954,
|
571 |
"SALMONN_7B": 56.455828541557764,
|
572 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 86.4610559330894
|
|
|
578 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 57.0,
|
579 |
"Qwen2-Audio-7B-Instruct": 40.4,
|
580 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 58.0,
|
581 |
+
"phi_4_multimodal_instruct": 52.199999999999996,
|
582 |
"WavLLM_fairseq": 45.199999999999996,
|
583 |
"SALMONN_7B": 17.2,
|
584 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
|
|
594 |
"Qwen2-Audio-7B-Instruct": 0.09260359129694522,
|
595 |
"whisper_large_v3": 0.12359684029221357,
|
596 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.20886539565639167,
|
597 |
+
"phi_4_multimodal_instruct": 0.07466690423868068,
|
598 |
"WavLLM_fairseq": 0.7054601967888183,
|
599 |
"SALMONN_7B": 0.8259290055631446,
|
600 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.12450753301261111
|
|
|
606 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 51.4,
|
607 |
"Qwen2-Audio-7B-Instruct": 42.0,
|
608 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.0,
|
609 |
+
"phi_4_multimodal_instruct": 43.8,
|
610 |
"WavLLM_fairseq": 45.199999999999996,
|
611 |
"SALMONN_7B": 40.599999999999994,
|
612 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
|
|
621 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 46.4,
|
622 |
"Qwen2-Audio-7B-Instruct": 24.8,
|
623 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.0,
|
624 |
+
"phi_4_multimodal_instruct": 37.0,
|
625 |
"WavLLM_fairseq": 31.6,
|
626 |
"SALMONN_7B": 7.0,
|
627 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.0
|
|
|
648 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 38.00454545454545,
|
649 |
"Qwen2-Audio-7B-Instruct": 40.77727272727273,
|
650 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 3.0954545454545457,
|
651 |
+
"phi_4_multimodal_instruct": 26.386363636363637,
|
652 |
"WavLLM_fairseq": 5.5,
|
653 |
"SALMONN_7B": 37.445454545454545,
|
654 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 2.4727272727272727
|
|
|
658 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.24920047034353812,
|
659 |
"Qwen2-Audio-7B-Instruct": 0.19891712076314283,
|
660 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05796819723943051,
|
661 |
+
"phi_4_multimodal_instruct": 0.1757379026471828,
|
662 |
"WavLLM_fairseq": 0.041732965094428545,
|
663 |
"SALMONN_7B": 0.20994052484339956,
|
664 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.07953048457785493
|
|
|
673 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 7.816666666666666,
|
674 |
"Qwen2-Audio-7B-Instruct": 2.55,
|
675 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.016666666666666,
|
676 |
+
"phi_4_multimodal_instruct": 3.5166666666666666,
|
677 |
"WavLLM_fairseq": 2.6833333333333336,
|
678 |
"SALMONN_7B": 2.5166666666666666,
|
679 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 12.416666666666666
|
|
|
698 |
"Qwen2-Audio-7B-Instruct": 0.04425838146050298,
|
699 |
"whisper_large_v3": 2.451098639578599,
|
700 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 2.8327095799289337,
|
701 |
+
"phi_4_multimodal_instruct": 0.053138495633157125,
|
702 |
"WavLLM_fairseq": 0.1695522548322915,
|
703 |
"SALMONN_7B": 0.3649023706010388,
|
704 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 2.4245628096245917
|
|
|
711 |
"Qwen2-Audio-7B-Instruct": 16.325186897428104,
|
712 |
"whisper_large_v3": 1.600581653970121,
|
713 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 27.620150160643625,
|
714 |
+
"phi_4_multimodal_instruct": 15.012558278964478,
|
715 |
"WavLLM_fairseq": 13.841886973016162,
|
716 |
"SALMONN_7B": 14.102682915273142,
|
717 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 10.930203684508578
|
|
|
723 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 63.15021876519203,
|
724 |
"Qwen2-Audio-7B-Instruct": 50.919591292758774,
|
725 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.647544968400585,
|
726 |
+
"phi_4_multimodal_instruct": 47.86582401555663,
|
727 |
"WavLLM_fairseq": 43.01199466903598,
|
728 |
"SALMONN_7B": 57.75401069518716,
|
729 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 29.47134606841404
|
|
|
740 |
"whisper_large_v3": 0.12226319428439733,
|
741 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1400092187139894,
|
742 |
"gemini-1.5-flash": 0.1089344703080587,
|
743 |
+
"phi_4_multimodal_instruct": 0.16175001920565416,
|
744 |
"WavLLM_fairseq": 0.41876008296842593,
|
745 |
"SALMONN_7B": 0.21487285856956287,
|
746 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.12579703464700007
|
|
|
753 |
"Qwen2-Audio-7B-Instruct": 0.35076166942732234,
|
754 |
"whisper_large_v3": 0.27026366524560785,
|
755 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.3035544573275043,
|
756 |
+
"phi_4_multimodal_instruct": 0.44227061666711925,
|
757 |
"WavLLM_fairseq": 0.7540934640345399,
|
758 |
"SALMONN_7B": 0.6569229098215983,
|
759 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.29992939962527493
|
|
|
765 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 74.80000000000001,
|
766 |
"Qwen2-Audio-7B-Instruct": 52.599999999999994,
|
767 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.8,
|
768 |
+
"phi_4_multimodal_instruct": 25.8,
|
769 |
"WavLLM_fairseq": 21.6,
|
770 |
"SALMONN_7B": 17.2,
|
771 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 70.8
|
|
|
803 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 49.77635782747604,
|
804 |
"Qwen2-Audio-7B-Instruct": 45.75079872204473,
|
805 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.466453674121407,
|
806 |
+
"phi_4_multimodal_instruct": 38.466453674121404,
|
807 |
"WavLLM_fairseq": 29.840255591054312,
|
808 |
"SALMONN_7B": 50.287539936102235,
|
809 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 17.380191693290733
|
|
|
819 |
"Qwen2-Audio-7B-Instruct": 0.07197717796796138,
|
820 |
"whisper_large_v3": 0.06844171360300393,
|
821 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07041669714480775,
|
822 |
+
"phi_4_multimodal_instruct": 0.05739643527661961,
|
823 |
"WavLLM_fairseq": 0.10077292565771828,
|
824 |
"SALMONN_7B": 0.0925804013361617,
|
825 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.06922195401458074
|
|
|
844 |
"Qwen2-Audio-7B-Instruct": 0.03245972071872916,
|
845 |
"whisper_large_v3": 0.02107778621423822,
|
846 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 8.433062902024755,
|
847 |
+
"phi_4_multimodal_instruct": 0.19835914151649442,
|
848 |
"WavLLM_fairseq": 0.0033159224040994286,
|
849 |
"SALMONN_7B": 0.00046745670226766583,
|
850 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 1.0368044741318085
|
|
|
856 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 46.31578947368421,
|
857 |
"Qwen2-Audio-7B-Instruct": 44.473684210526315,
|
858 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.88157894736842,
|
859 |
+
"phi_4_multimodal_instruct": 35.13157894736842,
|
860 |
"WavLLM_fairseq": 26.25,
|
861 |
"SALMONN_7B": 47.30263157894737,
|
862 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 16.710526315789473
|
|
|
884 |
"Qwen2-Audio-7B-Instruct": 25.765420247070075,
|
885 |
"whisper_large_v3": 0.16408986541757878,
|
886 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 35.274306071307024,
|
887 |
+
"phi_4_multimodal_instruct": 45.295964957544776,
|
888 |
"WavLLM_fairseq": 31.96381187282953,
|
889 |
"SALMONN_7B": 33.88941292215531,
|
890 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 5.987143868370054
|
891 |
}
|
892 |
},
|
893 |
+
"mmau_mini": {
|
894 |
+
"string_match": {
|
895 |
+
"Qwen-Audio-Chat": 38.5,
|
896 |
+
"MERaLiON-AudioLLM-Whisper-SEA-LION": 60.5,
|
897 |
+
"Qwen2-Audio-7B-Instruct": 44.4,
|
898 |
+
"phi_4_multimodal_instruct": 54.50000000000001
|
899 |
+
},
|
900 |
+
"llama3_70b_judge": {
|
901 |
+
"Qwen-Audio-Chat": 53.6,
|
902 |
+
"MERaLiON-AudioLLM-Whisper-SEA-LION": 64.60000000000001,
|
903 |
+
"Qwen2-Audio-7B-Instruct": 58.9,
|
904 |
+
"phi_4_multimodal_instruct": 59.4
|
905 |
+
}
|
906 |
+
},
|
907 |
+
"mmau_mini_music": {
|
908 |
+
"string_match": {
|
909 |
+
"Qwen-Audio-Chat": 0.4311377245508982,
|
910 |
+
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6077844311377245,
|
911 |
+
"Qwen2-Audio-7B-Instruct": 0.45808383233532934,
|
912 |
+
"phi_4_multimodal_instruct": 0.6377245508982036
|
913 |
+
},
|
914 |
+
"llama3_70b_judge": {
|
915 |
+
"Qwen-Audio-Chat": 0.5958083832335329,
|
916 |
+
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6437125748502994,
|
917 |
+
"Qwen2-Audio-7B-Instruct": 0.6017964071856288,
|
918 |
+
"phi_4_multimodal_instruct": 0.688622754491018
|
919 |
+
}
|
920 |
+
},
|
921 |
+
"mmau_mini_sound": {
|
922 |
+
"string_match": {
|
923 |
+
"Qwen-Audio-Chat": 0.43543543543543545,
|
924 |
+
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6606606606606606,
|
925 |
+
"Qwen2-Audio-7B-Instruct": 0.4744744744744745,
|
926 |
+
"phi_4_multimodal_instruct": 0.5975975975975976
|
927 |
+
},
|
928 |
+
"llama3_70b_judge": {
|
929 |
+
"Qwen-Audio-Chat": 0.5945945945945946,
|
930 |
+
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.7027027027027027,
|
931 |
+
"Qwen2-Audio-7B-Instruct": 0.6306306306306306,
|
932 |
+
"phi_4_multimodal_instruct": 0.6456456456456456
|
933 |
+
}
|
934 |
+
},
|
935 |
+
"mmau_mini_speech": {
|
936 |
+
"string_match": {
|
937 |
+
"Qwen-Audio-Chat": 0.2882882882882883,
|
938 |
+
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.5465465465465466,
|
939 |
+
"Qwen2-Audio-7B-Instruct": 0.3993993993993994,
|
940 |
+
"phi_4_multimodal_instruct": 0.3993993993993994
|
941 |
+
},
|
942 |
+
"llama3_70b_judge": {
|
943 |
+
"Qwen-Audio-Chat": 0.4174174174174174,
|
944 |
+
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.5915915915915916,
|
945 |
+
"Qwen2-Audio-7B-Instruct": 0.5345345345345346,
|
946 |
+
"phi_4_multimodal_instruct": 0.44744744744744747
|
947 |
+
}
|
948 |
+
},
|
949 |
"imda_part5_30s_sqa_test": {
|
950 |
"llama3_70b_judge": {
|
951 |
"Qwen-Audio-Chat": 61.260000000000005,
|
|
|
964 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 66.13333333333333,
|
965 |
"Qwen2-Audio-7B-Instruct": 68.38333333333333,
|
966 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.35,
|
967 |
+
"phi_4_multimodal_instruct": 51.68333333333334,
|
968 |
"WavLLM_fairseq": 49.06666666666666,
|
969 |
"SALMONN_7B": 59.766666666666666,
|
970 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.016666666666666
|
|
|
976 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 86.76470588235293,
|
977 |
"Qwen2-Audio-7B-Instruct": 80.04901960784315,
|
978 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.57843137254902,
|
979 |
+
"phi_4_multimodal_instruct": 88.33333333333334,
|
980 |
"WavLLM_fairseq": 83.92156862745098,
|
981 |
"SALMONN_7B": 83.48039215686273,
|
982 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 82.99019607843137
|
|
|
992 |
"Qwen2-Audio-7B-Instruct": 0.08739585179932637,
|
993 |
"whisper_large_v3": 0.03208650948413402,
|
994 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.04396383619925545,
|
995 |
+
"phi_4_multimodal_instruct": 0.0381847190214501,
|
996 |
"WavLLM_fairseq": 0.4536784258110264,
|
997 |
"SALMONN_7B": 0.14231519234178336,
|
998 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.04754476156709803
|
|
|
1004 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 36.36015325670498,
|
1005 |
"Qwen2-Audio-7B-Instruct": 41.60919540229885,
|
1006 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 47.356321839080465,
|
1007 |
+
"phi_4_multimodal_instruct": 43.524904214559385,
|
1008 |
"WavLLM_fairseq": 41.57088122605364,
|
1009 |
"SALMONN_7B": 30.536398467432953,
|
1010 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.81992337164751
|
|
|
1017 |
"Qwen2-Audio-7B-Instruct": 0.06114048472375004,
|
1018 |
"whisper_large_v3": 0.037649480146197796,
|
1019 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.04900464852205386,
|
1020 |
+
"phi_4_multimodal_instruct": 0.028494375643163834,
|
1021 |
"WavLLM_fairseq": 0.06621482559171073,
|
1022 |
"SALMONN_7B": 0.0459884319222171,
|
1023 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.038146268762641496
|
|
|
1031 |
"whisper_large_v3": 0.7225930420711975,
|
1032 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.7824973031283711,
|
1033 |
"gemini-1.5-flash": 0.9690871089536138,
|
1034 |
+
"phi_4_multimodal_instruct": 0.7126483279395901,
|
1035 |
"WavLLM_fairseq": 1.2913969795037756,
|
1036 |
"SALMONN_7B": 1.2721817691477886,
|
1037 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.6848705501618123
|
|
|
1055 |
"MERaLiON-AudioLLM-Whisper-SEA-LION": 64.80000000000001,
|
1056 |
"Qwen2-Audio-7B-Instruct": 51.6,
|
1057 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 74.0,
|
1058 |
+
"phi_4_multimodal_instruct": 49.0,
|
1059 |
"WavLLM_fairseq": 50.8,
|
1060 |
"SALMONN_7B": 44.6,
|
1061 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.800000000000004
|
|
|
1127 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 63.68000000000001
|
1128 |
}
|
1129 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1130 |
"imda_30s_ar_test": {
|
1131 |
"llama3_70b_judge": {
|
1132 |
"Qwen2-Audio-7B-Instruct": 5.106666666666667,
|