|
{ |
|
"cn_college_listen_mcq_test": { |
|
"llama3_70b_judge": { |
|
"gpt-4o-audio": 90.88507265521797, |
|
"Qwen-Audio-Chat": 63.232056362835756, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 88.50726552179657, |
|
"Qwen2-Audio-7B-Instruct": 74.7247908410392, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 91.85380889476001, |
|
"gemini-1.5-flash": 89.25583443416997, |
|
"phi_4_multimodal_instruct": 73.18361955085865, |
|
"WavLLM_fairseq": 66.31439894319684, |
|
"SALMONN_7B": 50.99075297225891, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 85.2928225451343 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 88.77146631439894, |
|
"Qwen2-Audio-7B-Instruct": 79.0 |
|
} |
|
}, |
|
"mmau_mini": { |
|
"string_match": { |
|
"gpt-4o-audio": 0.0, |
|
"Qwen-Audio-Chat": 38.5, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 60.5, |
|
"Qwen2-Audio-7B-Instruct": 44.4, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 46.7, |
|
"gemini-1.5-flash": 31.4, |
|
"phi_4_multimodal_instruct": 54.50000000000001, |
|
"seallms_audio_7b": 51.5, |
|
"SALMONN_7B": 40.5, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 51.0 |
|
}, |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 53.6, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 64.60000000000001, |
|
"Qwen2-Audio-7B-Instruct": 58.9, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 52.7, |
|
"gemini-1.5-flash": 58.199999999999996, |
|
"phi_4_multimodal_instruct": 59.4, |
|
"seallms_audio_7b": 60.199999999999996, |
|
"SALMONN_7B": 48.4, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 55.60000000000001 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 63.9, |
|
"Qwen2-Audio-7B-Instruct": 53.0 |
|
} |
|
}, |
|
"mmau_mini_music": { |
|
"string_match": { |
|
"gpt-4o-audio": 0.0, |
|
"Qwen-Audio-Chat": 0.4311377245508982, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6077844311377245, |
|
"Qwen2-Audio-7B-Instruct": 0.45808383233532934, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.39520958083832336, |
|
"gemini-1.5-flash": 0.2904191616766467, |
|
"phi_4_multimodal_instruct": 0.6377245508982036, |
|
"seallms_audio_7b": 0.6047904191616766, |
|
"SALMONN_7B": 0.4820359281437126, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.5 |
|
}, |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 0.5958083832335329, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6437125748502994, |
|
"Qwen2-Audio-7B-Instruct": 0.6017964071856288, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.44011976047904194, |
|
"gemini-1.5-flash": 0.5868263473053892, |
|
"phi_4_multimodal_instruct": 0.688622754491018, |
|
"seallms_audio_7b": 0.6646706586826348, |
|
"SALMONN_7B": 0.5598802395209581, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.5359281437125748 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6347305389221557, |
|
"Qwen2-Audio-7B-Instruct": 0.5473684210526316 |
|
} |
|
}, |
|
"mmau_mini_sound": { |
|
"string_match": { |
|
"gpt-4o-audio": 0.0, |
|
"Qwen-Audio-Chat": 0.43543543543543545, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6606606606606606, |
|
"Qwen2-Audio-7B-Instruct": 0.4744744744744745, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.45045045045045046, |
|
"gemini-1.5-flash": 0.3483483483483483, |
|
"phi_4_multimodal_instruct": 0.5975975975975976, |
|
"seallms_audio_7b": 0.5165165165165165, |
|
"SALMONN_7B": 0.4594594594594595, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.46546546546546547 |
|
}, |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 0.5945945945945946, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.7027027027027027, |
|
"Qwen2-Audio-7B-Instruct": 0.6306306306306306, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5345345345345346, |
|
"gemini-1.5-flash": 0.5885885885885885, |
|
"phi_4_multimodal_instruct": 0.6456456456456456, |
|
"seallms_audio_7b": 0.6486486486486487, |
|
"SALMONN_7B": 0.5105105105105106, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.5105105105105106 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.6996996996996997, |
|
"Qwen2-Audio-7B-Instruct": 0.5980392156862745 |
|
} |
|
}, |
|
"mmau_mini_speech": { |
|
"string_match": { |
|
"gpt-4o-audio": 0.0, |
|
"Qwen-Audio-Chat": 0.2882882882882883, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.5465465465465466, |
|
"Qwen2-Audio-7B-Instruct": 0.3993993993993994, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5555555555555556, |
|
"gemini-1.5-flash": 0.3033033033033033, |
|
"phi_4_multimodal_instruct": 0.3993993993993994, |
|
"seallms_audio_7b": 0.42342342342342343, |
|
"SALMONN_7B": 0.2732732732732733, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.5645645645645646 |
|
}, |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 0.4174174174174174, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.5915915915915916, |
|
"Qwen2-Audio-7B-Instruct": 0.5345345345345346, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.6066066066066066, |
|
"gemini-1.5-flash": 0.5705705705705706, |
|
"phi_4_multimodal_instruct": 0.44744744744744747, |
|
"seallms_audio_7b": 0.4924924924924925, |
|
"SALMONN_7B": 0.3813813813813814, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.6216216216216216 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.5825825825825826, |
|
"Qwen2-Audio-7B-Instruct": 0.44660194174757284 |
|
} |
|
}, |
|
"slue_p2_sqa5_test": { |
|
"llama3_70b_judge": { |
|
"gpt-4o-audio": 89.41176470588235, |
|
"Qwen-Audio-Chat": 79.36274509803921, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 86.76470588235293, |
|
"Qwen2-Audio-7B-Instruct": 80.04901960784315, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.57843137254902, |
|
"phi_4_multimodal_instruct": 88.33333333333334, |
|
"seallms_audio_7b": 83.52941176470588, |
|
"WavLLM_fairseq": 83.92156862745098, |
|
"SALMONN_7B": 83.48039215686273, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 82.99019607843137 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 88.23529411764707, |
|
"Qwen2-Audio-7B-Instruct": 84.86666666666666, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 87.79411764705883 |
|
} |
|
}, |
|
"voxceleb_accent_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 48.05088223225277, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 47.01682396389003, |
|
"Qwen2-Audio-7B-Instruct": 29.187525646286417, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.640951990151827, |
|
"phi_4_multimodal_instruct": 26.815757078375054, |
|
"seallms_audio_7b": 8.658186294624539, |
|
"WavLLM_fairseq": 39.96717275338531, |
|
"SALMONN_7B": 34.222404595814524, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 39.32704144439885 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 14.813295034878948, |
|
"Qwen2-Audio-7B-Instruct": 22.666666666666664, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 39.462453836684446 |
|
} |
|
}, |
|
"imda_part4_30s_asr_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 1.1764312018747907, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.0, |
|
"Qwen2-Audio-7B-Instruct": 0.5685405990059489, |
|
"whisper_large_v3": 0.8294532718704128, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.4757667842702995, |
|
"phi_4_multimodal_instruct": 1.3868687388941825, |
|
"seallms_audio_7b": 1.8960881769720068, |
|
"WavLLM_fairseq": 1.2058793232211378, |
|
"SALMONN_7B": 0.7757204295537071, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.4715562308464886 |
|
} |
|
}, |
|
"wavcaps_test": { |
|
"meteor": { |
|
"Qwen-Audio-Chat": 0.2355106805560457, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.3175511907248581, |
|
"Qwen2-Audio-7B-Instruct": 0.21342294856199182, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.120421856260385, |
|
"phi_4_multimodal_instruct": 0.24508284335582894, |
|
"seallms_audio_7b": 0.1444387454989207, |
|
"WavLLM_fairseq": 0.06399522524688675, |
|
"SALMONN_7B": 0.17175112770658157, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.1388630786594543 |
|
}, |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 32.9364161849711, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 33.97687861271676, |
|
"Qwen2-Audio-7B-Instruct": 33.78034682080925, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 6.3468208092485545, |
|
"phi_4_multimodal_instruct": 21.884393063583815, |
|
"WavLLM_fairseq": 6.901734104046243, |
|
"SALMONN_7B": 23.76878612716763, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 3.445086705202312 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 31.641618497109825, |
|
"Qwen2-Audio-7B-Instruct": 34.86666666666667, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 4.61271676300578 |
|
} |
|
}, |
|
"covost2_zh_en_test": { |
|
"bleu": { |
|
"Qwen-Audio-Chat": 9.898238298955656, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 18.76473995941838, |
|
"Qwen2-Audio-7B-Instruct": 16.466557744958333, |
|
"whisper_large_v3": 14.673689493155793, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.209998552437538, |
|
"phi_4_multimodal_instruct": 22.678131781242936, |
|
"seallms_audio_7b": 18.79451062979056, |
|
"WavLLM_fairseq": 2.368659001743569, |
|
"SALMONN_7B": 5.296039450108202, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 14.154700735606419 |
|
} |
|
}, |
|
"imda_part6_30s_sqa_human_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 51.4, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 67.2, |
|
"Qwen2-Audio-7B-Instruct": 53.6, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 71.6, |
|
"phi_4_multimodal_instruct": 66.2, |
|
"seallms_audio_7b": 58.2, |
|
"WavLLM_fairseq": 62.199999999999996, |
|
"SALMONN_7B": 46.8, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.0 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 71.6, |
|
"Qwen2-Audio-7B-Instruct": 56.0, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 67.0 |
|
} |
|
}, |
|
"ukusnews_short_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.10399586086125925, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.10144869855926132, |
|
"Qwen2-Audio-7B-Instruct": 0.1194380323171217, |
|
"whisper_large_v3": 0.06168908700151238, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.06877338215394412, |
|
"WavLLM_fairseq": 0.2066783411605508, |
|
"SALMONN_7B": 0.09042426172092653, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.0700867627159118 |
|
} |
|
}, |
|
"imda_part6_30s_ds_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 43.84, |
|
"Qwen2-Audio-7B-Instruct": 48.38, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 65.6, |
|
"SALMONN_7B": 27.12, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 59.2 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 67.58 |
|
} |
|
}, |
|
"muchomusic_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 59.0564448188711, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 57.7927548441449, |
|
"Qwen2-Audio-7B-Instruct": 71.60909856781802, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 51.727042965459134, |
|
"phi_4_multimodal_instruct": 54.422914911541696, |
|
"seallms_audio_7b": 63.184498736310026, |
|
"WavLLM_fairseq": 44.3133951137321, |
|
"SALMONN_7B": 50.88458298230834, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 56.44481887110362 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 57.87700084245998, |
|
"Qwen2-Audio-7B-Instruct": 64.66666666666666 |
|
} |
|
}, |
|
"imda_30s_sqa_human_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 42.199999999999996, |
|
"Qwen2-Audio-7B-Instruct": 47.1, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 62.95, |
|
"SALMONN_7B": 42.300000000000004, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 55.7 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 61.550000000000004 |
|
} |
|
}, |
|
"imda_part2_asr_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.45479263046830615, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.048088629169710254, |
|
"Qwen2-Audio-7B-Instruct": 0.1905689473257041, |
|
"whisper_large_v3": 0.3171008846684522, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.32988393799204613, |
|
"phi_4_multimodal_instruct": 0.3470091713334957, |
|
"seallms_audio_7b": 0.290236182128074, |
|
"WavLLM_fairseq": 0.4463923382842302, |
|
"SALMONN_7B": 0.42346400454508565, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.31912994075156237 |
|
} |
|
}, |
|
"earnings21_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.2655529121410546, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.13488732754499672, |
|
"Qwen2-Audio-7B-Instruct": 0.18872219319407232, |
|
"whisper_large_v3": 0.11863959266711877, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.11416493424197618, |
|
"phi_4_multimodal_instruct": 0.15921168191570967, |
|
"seallms_audio_7b": 0.5115646296316884, |
|
"WavLLM_fairseq": 0.6447482518259942, |
|
"SALMONN_7B": 0.2577708974886327, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.11773910240019567 |
|
} |
|
}, |
|
"parliament_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.26279685873781816, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.058922319992430694, |
|
"Qwen2-Audio-7B-Instruct": 0.23270886555019396, |
|
"whisper_large_v3": 0.0753619074652285, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.06282524363705176, |
|
"WavLLM_fairseq": 0.5216434856656259, |
|
"SALMONN_7B": 0.3010928186204939, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.07517267480367111 |
|
} |
|
}, |
|
"librispeech_test_other": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.043467569561352074, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.041576030415949455, |
|
"Qwen2-Audio-7B-Instruct": 0.060415760304159495, |
|
"whisper_large_v3": 0.03660128246354058, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05307658841999735, |
|
"phi_4_multimodal_instruct": 0.03879546787220762, |
|
"seallms_audio_7b": 0.09453912648722265, |
|
"WavLLM_fairseq": 0.04798834811886432, |
|
"SALMONN_7B": 0.09671439650443565, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.03714982881570734 |
|
} |
|
}, |
|
"librispeech_test_clean": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.020258799562379748, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.022918474365262006, |
|
"Qwen2-Audio-7B-Instruct": 0.035141660693401744, |
|
"whisper_large_v3": 0.01878749009695552, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.032349945297468596, |
|
"phi_4_multimodal_instruct": 0.0167502923755989, |
|
"seallms_audio_7b": 0.0509676689176444, |
|
"WavLLM_fairseq": 0.02103218017882069, |
|
"SALMONN_7B": 0.10270871845172973, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.018334779492209605 |
|
} |
|
}, |
|
"imda_part6_30s_asr_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.31394240863063033, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.11292172031202054, |
|
"Qwen2-Audio-7B-Instruct": 0.2245352799625317, |
|
"whisper_large_v3": 0.1698509342851144, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1789273082575623, |
|
"phi_4_multimodal_instruct": 0.14552883606001388, |
|
"seallms_audio_7b": 0.6259629515980555, |
|
"WavLLM_fairseq": 0.42541061709652933, |
|
"SALMONN_7B": 0.24872817713464365, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.17467982364056267 |
|
} |
|
}, |
|
"openhermes_audio_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 10.600000000000001, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 65.6, |
|
"Qwen2-Audio-7B-Instruct": 44.800000000000004, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 72.2, |
|
"phi_4_multimodal_instruct": 30.8, |
|
"seallms_audio_7b": 63.8, |
|
"WavLLM_fairseq": 19.2, |
|
"SALMONN_7B": 15.8, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 63.0 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 74.2, |
|
"Qwen2-Audio-7B-Instruct": 57.199999999999996, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 75.0 |
|
} |
|
}, |
|
"iemocap_emotion_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 29.382470119521916, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 48.505976095617534, |
|
"Qwen2-Audio-7B-Instruct": 53.98406374501992, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.322709163346616, |
|
"phi_4_multimodal_instruct": 41.03585657370518, |
|
"WavLLM_fairseq": 59.76095617529881, |
|
"SALMONN_7B": 23.804780876494025, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 46.713147410358566 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 37.45019920318725, |
|
"Qwen2-Audio-7B-Instruct": 35.333333333333336 |
|
} |
|
}, |
|
"public_sg_speech_qa_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 63.16860465116279, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 59.7093023255814, |
|
"Qwen2-Audio-7B-Instruct": 58.31395348837209, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.11046511627907, |
|
"phi_4_multimodal_instruct": 68.40116279069767, |
|
"seallms_audio_7b": 57.587209302325576, |
|
"WavLLM_fairseq": 58.54651162790698, |
|
"SALMONN_7B": 59.24418604651163, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.94186046511628 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 64.18604651162791, |
|
"Qwen2-Audio-7B-Instruct": 62.733333333333334, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 73.02325581395348 |
|
} |
|
}, |
|
"mediacorp_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.4498529892192094, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.170859196341065, |
|
"Qwen2-Audio-7B-Instruct": 0.18694870957203527, |
|
"whisper_large_v3": 0.12054884024828487, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.12455080039202875, |
|
"WavLLM_fairseq": 0.3595230316889905, |
|
"SALMONN_7B": 0.32089186540346293, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.13598497223129696 |
|
} |
|
}, |
|
"common_voice_15_en_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.11272421128398918, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.07811646454714301, |
|
"Qwen2-Audio-7B-Instruct": 0.11438872500819404, |
|
"whisper_large_v3": 0.10001863741235596, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.10600831614192711, |
|
"phi_4_multimodal_instruct": 0.08262800367606891, |
|
"seallms_audio_7b": 0.14429855849255468, |
|
"WavLLM_fairseq": 0.14533325621300636, |
|
"SALMONN_7B": 0.3062255383962828, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.09876543209876543 |
|
} |
|
}, |
|
"spoken_squad_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 64.8327415436367, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 73.66473556344609, |
|
"Qwen2-Audio-7B-Instruct": 64.86264249672958, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.61894972902262, |
|
"phi_4_multimodal_instruct": 77.58549803774996, |
|
"seallms_audio_7b": 67.73313399364605, |
|
"WavLLM_fairseq": 77.64903756307233, |
|
"SALMONN_7B": 66.39506634273968, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 83.81984675761541 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 74.99159035694262, |
|
"Qwen2-Audio-7B-Instruct": 65.6, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 90.12521024107643 |
|
} |
|
}, |
|
"seame_dev_sge": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 1.05567969634822, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.35550521901496834, |
|
"Qwen2-Audio-7B-Instruct": 0.5486546879304539, |
|
"whisper_large_v3": 0.5377268970583734, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5840399155162387, |
|
"gemini-1.5-flash": 1.1100431601824359, |
|
"phi_4_multimodal_instruct": 0.8529492791331231, |
|
"seallms_audio_7b": 1.7106737273868193, |
|
"WavLLM_fairseq": 1.2204842511249197, |
|
"SALMONN_7B": 1.0189782362484312, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.507882090054792 |
|
} |
|
}, |
|
"meld_sentiment_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 44.90421455938697, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 46.206896551724135, |
|
"Qwen2-Audio-7B-Instruct": 53.9463601532567, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.59003831417625, |
|
"phi_4_multimodal_instruct": 51.609195402298845, |
|
"seallms_audio_7b": 52.1455938697318, |
|
"WavLLM_fairseq": 51.072796934865906, |
|
"SALMONN_7B": 41.7624521072797, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 45.593869731800766 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 47.356321839080465, |
|
"Qwen2-Audio-7B-Instruct": 57.666666666666664 |
|
} |
|
}, |
|
"imda_part4_30s_sqa_human_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 37.8, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 53.2, |
|
"Qwen2-Audio-7B-Instruct": 39.6, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 66.0, |
|
"phi_4_multimodal_instruct": 43.8, |
|
"seallms_audio_7b": 45.0, |
|
"WavLLM_fairseq": 46.6, |
|
"SALMONN_7B": 36.6, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 53.8 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 60.0, |
|
"Qwen2-Audio-7B-Instruct": 43.4, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 61.4 |
|
} |
|
}, |
|
"voxceleb_gender_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 70.5990972507181, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 99.75379565038982, |
|
"Qwen2-Audio-7B-Instruct": 99.1177677472302, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 34.94050061551087, |
|
"phi_4_multimodal_instruct": 94.54247025030776, |
|
"WavLLM_fairseq": 69.61427985227739, |
|
"SALMONN_7B": 88.79770209273697, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 42.921624948707425 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 99.46655724251129, |
|
"Qwen2-Audio-7B-Instruct": 99.66666666666667 |
|
} |
|
}, |
|
"imda_gr_dialogue": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 37.2, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 93.76666666666667, |
|
"Qwen2-Audio-7B-Instruct": 61.56666666666667, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 19.6, |
|
"phi_4_multimodal_instruct": 36.833333333333336, |
|
"seallms_audio_7b": 30.5, |
|
"WavLLM_fairseq": 46.766666666666666, |
|
"SALMONN_7B": 42.733333333333334, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 25.433333333333337 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 93.86666666666666, |
|
"Qwen2-Audio-7B-Instruct": 61.0 |
|
} |
|
}, |
|
"imda_30s_ds_human_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 30.65, |
|
"Qwen2-Audio-7B-Instruct": 37.599999999999994, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 50.15, |
|
"SALMONN_7B": 16.15, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 43.849999999999994 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 54.65 |
|
} |
|
}, |
|
"imda_part5_30s_asr_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.3016882870525747, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.17694182194919086, |
|
"Qwen2-Audio-7B-Instruct": 0.27856006770658537, |
|
"whisper_large_v3": 0.2143555471246589, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.22881615619208825, |
|
"phi_4_multimodal_instruct": 0.22801359968481416, |
|
"seallms_audio_7b": 0.5812260145043848, |
|
"WavLLM_fairseq": 0.39796588405247263, |
|
"SALMONN_7B": 0.34868891450584405, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.22004640235805695 |
|
} |
|
}, |
|
"imda_part4_30s_sqa_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 41.92, |
|
"Qwen2-Audio-7B-Instruct": 50.279999999999994, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 66.34, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 61.980000000000004 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.9 |
|
} |
|
}, |
|
"earnings22_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.3664994875132684, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.1652245056860175, |
|
"Qwen2-Audio-7B-Instruct": 0.23542555661330924, |
|
"whisper_large_v3": 0.15887899737116104, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1448629161356777, |
|
"phi_4_multimodal_instruct": 0.24134627375003423, |
|
"seallms_audio_7b": 0.5738685499413504, |
|
"WavLLM_fairseq": 0.6671766188447099, |
|
"SALMONN_7B": 0.3597423676988383, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.15611126487402763 |
|
} |
|
}, |
|
"idpc_short_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.6008025988916491, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.24918784635964075, |
|
"Qwen2-Audio-7B-Instruct": 0.21326199120963119, |
|
"whisper_large_v3": 0.1662526275558953, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.16931014714313014, |
|
"WavLLM_fairseq": 0.36728454041658704, |
|
"SALMONN_7B": 0.26313777947639977, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.15803554366520162 |
|
} |
|
}, |
|
"cna_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.19753284203780838, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.15924383210509452, |
|
"Qwen2-Audio-7B-Instruct": 0.2067713339741536, |
|
"whisper_large_v3": 0.13841717398269784, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.15171419416853574, |
|
"WavLLM_fairseq": 0.26946491509131687, |
|
"SALMONN_7B": 0.15395706504325538, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.13798996048275125 |
|
} |
|
}, |
|
"covost2_id_en_test": { |
|
"bleu": { |
|
"Qwen-Audio-Chat": 0.45648619714728844, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 44.43289180618449, |
|
"Qwen2-Audio-7B-Instruct": 6.326113431899141, |
|
"whisper_large_v3": 46.01512198258627, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 46.80524126004861, |
|
"phi_4_multimodal_instruct": 0.36465303013961253, |
|
"seallms_audio_7b": 43.98074943006231, |
|
"WavLLM_fairseq": 5.933522277713613, |
|
"SALMONN_7B": 26.89649039333571, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 46.79924664837527 |
|
} |
|
}, |
|
"imda_part3_30s_sqa_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 51.08, |
|
"Qwen2-Audio-7B-Instruct": 60.620000000000005, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 70.17999999999999, |
|
"SALMONN_7B": 50.8, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 70.28 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 73.0 |
|
} |
|
}, |
|
"idpc_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.7710863986313088, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.30008554319931563, |
|
"Qwen2-Audio-7B-Instruct": 0.19093242087254064, |
|
"whisper_large_v3": 0.19880239520958085, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.16766467065868262, |
|
"WavLLM_fairseq": 0.7686911890504705, |
|
"SALMONN_7B": 0.4550898203592814, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.17741659538066723 |
|
} |
|
}, |
|
"gigaspeech_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.13018910022587737, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.14457154747310655, |
|
"Qwen2-Audio-7B-Instruct": 0.11723812890302816, |
|
"whisper_large_v3": 0.09459022434812692, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.09948381629977261, |
|
"phi_4_multimodal_instruct": 0.09672866386388193, |
|
"seallms_audio_7b": 0.13672725996455154, |
|
"WavLLM_fairseq": 0.15491778414546403, |
|
"SALMONN_7B": 0.10765150204693537, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.09515429104337297 |
|
} |
|
}, |
|
"mediacorp_short_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.2548909377108163, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.13301101866426804, |
|
"Qwen2-Audio-7B-Instruct": 0.17180121430177647, |
|
"whisper_large_v3": 0.11715763436024286, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.14571621317742298, |
|
"WavLLM_fairseq": 0.2621992354396222, |
|
"SALMONN_7B": 0.1751742747919946, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.11434675061839443 |
|
} |
|
}, |
|
"imda_part3_30s_ds_human_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 16.4, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 48.4, |
|
"Qwen2-Audio-7B-Instruct": 33.8, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 45.4, |
|
"phi_4_multimodal_instruct": 41.2, |
|
"seallms_audio_7b": 43.0, |
|
"WavLLM_fairseq": 31.6, |
|
"SALMONN_7B": 9.0, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 37.400000000000006 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 59.2, |
|
"Qwen2-Audio-7B-Instruct": 43.2, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 47.400000000000006 |
|
} |
|
}, |
|
"imda_ar_dialogue": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 0.6666666666666667, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 77.83333333333333, |
|
"Qwen2-Audio-7B-Instruct": 0.9666666666666667, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 7.633333333333334, |
|
"phi_4_multimodal_instruct": 0.5333333333333333, |
|
"seallms_audio_7b": 15.633333333333333, |
|
"WavLLM_fairseq": 0.23333333333333336, |
|
"SALMONN_7B": 0.06666666666666667, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 9.666666666666666 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 77.13333333333333, |
|
"Qwen2-Audio-7B-Instruct": 0.33333333333333337 |
|
} |
|
}, |
|
"iemocap_gender_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 50.0996015936255, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 93.48605577689243, |
|
"Qwen2-Audio-7B-Instruct": 92.80876494023903, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.737051792828685, |
|
"phi_4_multimodal_instruct": 59.46215139442231, |
|
"seallms_audio_7b": 66.43426294820716, |
|
"WavLLM_fairseq": 51.932270916334666, |
|
"SALMONN_7B": 81.31474103585658, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 44.22310756972111 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 98.20717131474103, |
|
"Qwen2-Audio-7B-Instruct": 98.33333333333333 |
|
} |
|
}, |
|
"ytb_asr_batch2": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.4315277327278625, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.15162720294085846, |
|
"Qwen2-Audio-7B-Instruct": 0.2080008649583739, |
|
"whisper_large_v3": 0.17210509244242622, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.2192622950819672, |
|
"phi_4_multimodal_instruct": 0.23849064763758243, |
|
"WavLLM_fairseq": 0.48091685587631094, |
|
"SALMONN_7B": 0.3238620391393664, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.23561466104443723 |
|
} |
|
}, |
|
"ytb_pqa_batch1": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 37.16117216117216, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 40.97069597069597, |
|
"Qwen2-Audio-7B-Instruct": 36.97802197802198, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 55.01831501831502, |
|
"gemini-1.5-flash": 49.908424908424905, |
|
"WavLLM_fairseq": 40.95238095238095, |
|
"SALMONN_7B": 32.124542124542124, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 52.252747252747255 |
|
} |
|
}, |
|
"dream_tts_mcq_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 59.749085206481965, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 84.31782540512285, |
|
"Qwen2-Audio-7B-Instruct": 66.49242028227914, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 89.33612127548353, |
|
"phi_4_multimodal_instruct": 72.60846837428123, |
|
"seallms_audio_7b": 75.6926293779404, |
|
"WavLLM_fairseq": 66.5446941975954, |
|
"SALMONN_7B": 56.455828541557764, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 86.4610559330894 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 84.31782540512285, |
|
"Qwen2-Audio-7B-Instruct": 68.66666666666667 |
|
} |
|
}, |
|
"imda_part5_30s_ds_human_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 28.2, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 57.0, |
|
"Qwen2-Audio-7B-Instruct": 40.4, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 58.0, |
|
"phi_4_multimodal_instruct": 52.199999999999996, |
|
"seallms_audio_7b": 49.400000000000006, |
|
"WavLLM_fairseq": 45.199999999999996, |
|
"SALMONN_7B": 17.2, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 64.0, |
|
"Qwen2-Audio-7B-Instruct": 50.8, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 56.8 |
|
} |
|
}, |
|
"aishell_asr_zh_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.9469917443725129, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.13165449110094832, |
|
"Qwen2-Audio-7B-Instruct": 0.09260359129694522, |
|
"whisper_large_v3": 0.12359684029221357, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.20886539565639167, |
|
"phi_4_multimodal_instruct": 0.07466690423868068, |
|
"seallms_audio_7b": 0.11804359446457208, |
|
"WavLLM_fairseq": 0.7054601967888183, |
|
"SALMONN_7B": 0.8259290055631446, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.12450753301261111 |
|
} |
|
}, |
|
"imda_part3_30s_sqa_human_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 32.2, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 51.4, |
|
"Qwen2-Audio-7B-Instruct": 42.0, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.0, |
|
"phi_4_multimodal_instruct": 43.8, |
|
"seallms_audio_7b": 45.599999999999994, |
|
"WavLLM_fairseq": 45.199999999999996, |
|
"SALMONN_7B": 40.599999999999994, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 60.599999999999994, |
|
"Qwen2-Audio-7B-Instruct": 50.199999999999996, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 52.800000000000004 |
|
} |
|
}, |
|
"imda_part4_30s_ds_human_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 16.0, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 46.4, |
|
"Qwen2-Audio-7B-Instruct": 24.8, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.0, |
|
"phi_4_multimodal_instruct": 37.0, |
|
"seallms_audio_7b": 35.4, |
|
"WavLLM_fairseq": 31.6, |
|
"SALMONN_7B": 7.0, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.0 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 55.199999999999996, |
|
"Qwen2-Audio-7B-Instruct": 35.8, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 48.2 |
|
} |
|
}, |
|
"ytb_sds_batch1": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 43.878954607977995, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 53.97524071526823, |
|
"Qwen2-Audio-7B-Instruct": 51.5818431911967, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 64.12654745529574, |
|
"gemini-1.5-flash": 65.9697386519945, |
|
"WavLLM_fairseq": 55.625859697386524, |
|
"SALMONN_7B": 31.279229711141674, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 59.44979367262724 |
|
} |
|
}, |
|
"audiocaps_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 47.04090909090909, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 38.00454545454545, |
|
"Qwen2-Audio-7B-Instruct": 40.77727272727273, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 3.0954545454545457, |
|
"phi_4_multimodal_instruct": 26.386363636363637, |
|
"seallms_audio_7b": 53.20909090909091, |
|
"WavLLM_fairseq": 5.5, |
|
"SALMONN_7B": 37.445454545454545, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 2.4727272727272727 |
|
}, |
|
"meteor": { |
|
"Qwen-Audio-Chat": 0.27553015076950976, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.24920047034353812, |
|
"Qwen2-Audio-7B-Instruct": 0.19891712076314283, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05796819723943051, |
|
"phi_4_multimodal_instruct": 0.1757379026471828, |
|
"seallms_audio_7b": 0.30423899385222564, |
|
"WavLLM_fairseq": 0.041732965094428545, |
|
"SALMONN_7B": 0.20994052484339956, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.07953048457785493 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 39.29545454545455, |
|
"Qwen2-Audio-7B-Instruct": 41.53333333333333, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 4.868181818181818 |
|
} |
|
}, |
|
"imda_ar_sentence": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 3.933333333333333, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 7.816666666666666, |
|
"Qwen2-Audio-7B-Instruct": 2.55, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.016666666666666, |
|
"phi_4_multimodal_instruct": 3.5166666666666666, |
|
"seallms_audio_7b": 3.5833333333333335, |
|
"WavLLM_fairseq": 2.6833333333333336, |
|
"SALMONN_7B": 2.5166666666666666, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 12.416666666666666 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 10.116666666666667, |
|
"Qwen2-Audio-7B-Instruct": 4.666666666666667 |
|
} |
|
}, |
|
"imda_part6_30s_sqa_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 63.040000000000006, |
|
"Qwen2-Audio-7B-Instruct": 69.42, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 83.08, |
|
"SALMONN_7B": 66.86, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 80.60000000000001 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 81.8 |
|
} |
|
}, |
|
"covost2_ta_en_test": { |
|
"bleu": { |
|
"Qwen-Audio-Chat": 0.01699144301093184, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 5.023057608950299, |
|
"Qwen2-Audio-7B-Instruct": 0.04425838146050298, |
|
"whisper_large_v3": 2.451098639578599, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 2.8327095799289337, |
|
"phi_4_multimodal_instruct": 0.053138495633157125, |
|
"seallms_audio_7b": 0.06475917031217593, |
|
"WavLLM_fairseq": 0.1695522548322915, |
|
"SALMONN_7B": 0.3649023706010388, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 2.4245628096245917 |
|
} |
|
}, |
|
"covost2_en_id_test": { |
|
"bleu": { |
|
"Qwen-Audio-Chat": 4.102230932924371, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 37.60224687716629, |
|
"Qwen2-Audio-7B-Instruct": 16.325186897428104, |
|
"whisper_large_v3": 1.600581653970121, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 27.620150160643625, |
|
"phi_4_multimodal_instruct": 15.012558278964478, |
|
"seallms_audio_7b": 27.583542512329426, |
|
"WavLLM_fairseq": 13.841886973016162, |
|
"SALMONN_7B": 14.102682915273142, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 10.930203684508578 |
|
} |
|
}, |
|
"clotho_aqa_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 61.934856587263, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 63.15021876519203, |
|
"Qwen2-Audio-7B-Instruct": 50.919591292758774, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.647544968400585, |
|
"phi_4_multimodal_instruct": 47.86582401555663, |
|
"seallms_audio_7b": 53.03840544482256, |
|
"WavLLM_fairseq": 43.01199466903598, |
|
"SALMONN_7B": 57.75401069518716, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 29.47134606841404 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 64.09333981526495, |
|
"Qwen2-Audio-7B-Instruct": 56.86666666666667, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 28.076410484229232 |
|
} |
|
}, |
|
"ytb_asr_batch1": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.2297764461857571, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.11484981178458939, |
|
"Qwen2-Audio-7B-Instruct": 0.16843358684796805, |
|
"whisper_large_v3": 0.12226319428439733, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1400092187139894, |
|
"gemini-1.5-flash": 0.1089344703080587, |
|
"phi_4_multimodal_instruct": 0.16175001920565416, |
|
"WavLLM_fairseq": 0.41876008296842593, |
|
"SALMONN_7B": 0.21487285856956287, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.12579703464700007 |
|
} |
|
}, |
|
"imda_part3_30s_asr_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.6412550574306894, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.2919053954978684, |
|
"Qwen2-Audio-7B-Instruct": 0.35076166942732234, |
|
"whisper_large_v3": 0.27026366524560785, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.3035544573275043, |
|
"phi_4_multimodal_instruct": 0.44227061666711925, |
|
"seallms_audio_7b": 1.0837293290249002, |
|
"WavLLM_fairseq": 0.7540934640345399, |
|
"SALMONN_7B": 0.6569229098215983, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.29992939962527493 |
|
} |
|
}, |
|
"alpaca_audio_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 9.8, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 74.80000000000001, |
|
"Qwen2-Audio-7B-Instruct": 52.599999999999994, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.8, |
|
"phi_4_multimodal_instruct": 25.8, |
|
"WavLLM_fairseq": 21.6, |
|
"SALMONN_7B": 17.2, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 70.8 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 78.60000000000001, |
|
"Qwen2-Audio-7B-Instruct": 61.6, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 77.8 |
|
} |
|
}, |
|
"imda_30s_sqa_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 54.669999999999995, |
|
"Qwen2-Audio-7B-Instruct": 62.190000000000005, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 75.09, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 72.475 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 75.11999999999999 |
|
} |
|
}, |
|
"ytb_sqa_batch1": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 60.827586206896555, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 64.51231527093596, |
|
"Qwen2-Audio-7B-Instruct": 60.453201970443345, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 70.18719211822659, |
|
"gemini-1.5-flash": 78.06896551724138, |
|
"WavLLM_fairseq": 60.70935960591133, |
|
"SALMONN_7B": 55.665024630541865, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 67.3103448275862 |
|
} |
|
}, |
|
"audiocaps_qa_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 50.22364217252396, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 49.77635782747604, |
|
"Qwen2-Audio-7B-Instruct": 45.75079872204473, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.466453674121407, |
|
"phi_4_multimodal_instruct": 38.466453674121404, |
|
"seallms_audio_7b": 53.73801916932908, |
|
"WavLLM_fairseq": 29.840255591054312, |
|
"SALMONN_7B": 50.287539936102235, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 17.380191693290733 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 50.60702875399361, |
|
"Qwen2-Audio-7B-Instruct": 50.599999999999994, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 14.63258785942492 |
|
} |
|
}, |
|
"imda_part1_asr_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.10550313315290274, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.042254894789457, |
|
"Qwen2-Audio-7B-Instruct": 0.07197717796796138, |
|
"whisper_large_v3": 0.06844171360300393, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07041669714480775, |
|
"phi_4_multimodal_instruct": 0.05739643527661961, |
|
"seallms_audio_7b": 0.17813863896813206, |
|
"WavLLM_fairseq": 0.10077292565771828, |
|
"SALMONN_7B": 0.0925804013361617, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.06922195401458074 |
|
} |
|
}, |
|
"peoples_speech_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.31419144746723354, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.21050407754683692, |
|
"Qwen2-Audio-7B-Instruct": 0.2165498391593041, |
|
"whisper_large_v3": 0.14602420615337386, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.20140159998943682, |
|
"seallms_audio_7b": 0.369768551146351, |
|
"WavLLM_fairseq": 0.3792176325635977, |
|
"SALMONN_7B": 0.23699946689025367, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.14540692118393275 |
|
} |
|
}, |
|
"covost2_en_ta_test": { |
|
"bleu": { |
|
"Qwen-Audio-Chat": 0.03451483807236294, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 14.407399367512914, |
|
"Qwen2-Audio-7B-Instruct": 0.03245972071872916, |
|
"whisper_large_v3": 0.02107778621423822, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 8.433062902024755, |
|
"phi_4_multimodal_instruct": 0.19835914151649442, |
|
"seallms_audio_7b": 0.012334972259958572, |
|
"WavLLM_fairseq": 0.0033159224040994286, |
|
"SALMONN_7B": 0.00046745670226766583, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 1.0368044741318085 |
|
} |
|
}, |
|
"wavcaps_qa_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 42.69736842105263, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 46.31578947368421, |
|
"Qwen2-Audio-7B-Instruct": 44.473684210526315, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.88157894736842, |
|
"phi_4_multimodal_instruct": 35.13157894736842, |
|
"seallms_audio_7b": 42.10526315789473, |
|
"WavLLM_fairseq": 26.25, |
|
"SALMONN_7B": 47.30263157894737, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 16.710526315789473 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 46.31578947368421, |
|
"Qwen2-Audio-7B-Instruct": 48.2, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 14.736842105263158 |
|
} |
|
}, |
|
"parliament_short_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.09347360821020603, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.056935097083623425, |
|
"Qwen2-Audio-7B-Instruct": 0.08416492612361723, |
|
"whisper_large_v3": 0.05543951935226013, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07325752301384698, |
|
"WavLLM_fairseq": 0.09512390087929656, |
|
"SALMONN_7B": 0.08676929424202573, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.05742502771975968 |
|
} |
|
}, |
|
"covost2_en_zh_test": { |
|
"bleu": { |
|
"Qwen-Audio-Chat": 15.330641138043728, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 43.941098854450516, |
|
"Qwen2-Audio-7B-Instruct": 25.765420247070075, |
|
"whisper_large_v3": 0.16408986541757878, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 35.274306071307024, |
|
"phi_4_multimodal_instruct": 45.295964957544776, |
|
"seallms_audio_7b": 36.4496678966979, |
|
"WavLLM_fairseq": 31.96381187282953, |
|
"SALMONN_7B": 33.88941292215531, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 5.987143868370054 |
|
} |
|
}, |
|
"imda_part5_30s_sqa_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 61.260000000000005, |
|
"Qwen2-Audio-7B-Instruct": 68.52000000000001, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 80.34, |
|
"SALMONN_7B": 62.62, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 76.56 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 80.36 |
|
} |
|
}, |
|
"imda_gr_sentence": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 57.550000000000004, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 66.13333333333333, |
|
"Qwen2-Audio-7B-Instruct": 68.38333333333333, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.35, |
|
"phi_4_multimodal_instruct": 51.68333333333334, |
|
"seallms_audio_7b": 50.083333333333336, |
|
"WavLLM_fairseq": 49.06666666666666, |
|
"SALMONN_7B": 59.766666666666666, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.016666666666666 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 66.9, |
|
"Qwen2-Audio-7B-Instruct": 66.33333333333333 |
|
} |
|
}, |
|
"tedlium3_long_form_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.2911540507002305, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.10228682857649353, |
|
"Qwen2-Audio-7B-Instruct": 0.08739585179932637, |
|
"whisper_large_v3": 0.03208650948413402, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.04396383619925545, |
|
"phi_4_multimodal_instruct": 0.0381847190214501, |
|
"seallms_audio_7b": 0.3208650948413402, |
|
"WavLLM_fairseq": 0.4536784258110264, |
|
"SALMONN_7B": 0.14231519234178336, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.04754476156709803 |
|
} |
|
}, |
|
"meld_emotion_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 50.72796934865901, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 36.36015325670498, |
|
"Qwen2-Audio-7B-Instruct": 41.60919540229885, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 47.356321839080465, |
|
"phi_4_multimodal_instruct": 43.524904214559385, |
|
"seallms_audio_7b": 51.11111111111111, |
|
"WavLLM_fairseq": 41.57088122605364, |
|
"SALMONN_7B": 30.536398467432953, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.81992337164751 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 36.206896551724135, |
|
"Qwen2-Audio-7B-Instruct": 39.0 |
|
} |
|
}, |
|
"tedlium3_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.04052375714133636, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.07884745040985061, |
|
"Qwen2-Audio-7B-Instruct": 0.06114048472375004, |
|
"whisper_large_v3": 0.037649480146197796, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.04900464852205386, |
|
"phi_4_multimodal_instruct": 0.028494375643163834, |
|
"seallms_audio_7b": 0.04829495049856286, |
|
"WavLLM_fairseq": 0.06621482559171073, |
|
"SALMONN_7B": 0.0459884319222171, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.038146268762641496 |
|
} |
|
}, |
|
"seame_dev_man": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.8783373786407767, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.388282092772384, |
|
"Qwen2-Audio-7B-Instruct": 0.5522518878101402, |
|
"whisper_large_v3": 0.7225930420711975, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.7824973031283711, |
|
"gemini-1.5-flash": 0.9690871089536138, |
|
"phi_4_multimodal_instruct": 0.7126483279395901, |
|
"seallms_audio_7b": 1.0639495685005393, |
|
"WavLLM_fairseq": 1.2913969795037756, |
|
"SALMONN_7B": 1.2721817691477886, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.6848705501618123 |
|
} |
|
}, |
|
"imda_30s_ds_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 31.295, |
|
"Qwen2-Audio-7B-Instruct": 38.915, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 54.515, |
|
"SALMONN_7B": 18.345, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 48.269999999999996 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.99 |
|
} |
|
}, |
|
"imda_part5_30s_sqa_human_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 47.800000000000004, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 64.80000000000001, |
|
"Qwen2-Audio-7B-Instruct": 51.6, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 74.0, |
|
"phi_4_multimodal_instruct": 49.0, |
|
"seallms_audio_7b": 54.2, |
|
"WavLLM_fairseq": 50.8, |
|
"SALMONN_7B": 44.6, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.800000000000004 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 66.8, |
|
"Qwen2-Audio-7B-Instruct": 58.0, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.80000000000001 |
|
} |
|
}, |
|
"imda_part6_30s_ds_human_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 40.4, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 62.599999999999994, |
|
"Qwen2-Audio-7B-Instruct": 46.2, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 65.4, |
|
"phi_4_multimodal_instruct": 52.599999999999994, |
|
"seallms_audio_7b": 53.0, |
|
"WavLLM_fairseq": 49.400000000000006, |
|
"SALMONN_7B": 24.2, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.199999999999996 |
|
}, |
|
"gpt4o_judge": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 70.0, |
|
"Qwen2-Audio-7B-Instruct": 51.4, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.4 |
|
} |
|
}, |
|
"imda_part4_30s_ds_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 18.060000000000002, |
|
"Qwen2-Audio-7B-Instruct": 25.019999999999996, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 43.4, |
|
"SALMONN_7B": 9.399999999999999, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 37.879999999999995 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 47.74 |
|
} |
|
}, |
|
"ukusnews_test": { |
|
"wer": { |
|
"Qwen-Audio-Chat": 0.3158631121194933, |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.12554358101720553, |
|
"Qwen2-Audio-7B-Instruct": 0.13843826810361126, |
|
"whisper_large_v3": 0.07135564378899603, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07388920400831915, |
|
"WavLLM_fairseq": 0.5911892607298166, |
|
"SALMONN_7B": 0.18918510115333712, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.07642276422764227 |
|
} |
|
}, |
|
"imda_part3_30s_ds_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 25.22, |
|
"Qwen2-Audio-7B-Instruct": 35.54, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 48.339999999999996, |
|
"WavLLM_fairseq": 36.5, |
|
"SALMONN_7B": 12.82, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 42.32 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 52.38 |
|
} |
|
}, |
|
"imda_part5_30s_ds_test": { |
|
"llama3_70b_judge": { |
|
"Qwen-Audio-Chat": 39.14, |
|
"Qwen2-Audio-7B-Instruct": 45.38, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 61.48, |
|
"SALMONN_7B": 24.340000000000003, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 54.379999999999995 |
|
}, |
|
"gpt4o_judge": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 63.68000000000001 |
|
} |
|
}, |
|
"gigaspeech2_viet": { |
|
"wer": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.4960741822016732, |
|
"Qwen2-Audio-7B-Instruct": 1.5011671350211242 |
|
} |
|
}, |
|
"gigaspeech2_thai": { |
|
"wer": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.8901628256099774, |
|
"Qwen2-Audio-7B-Instruct": 1.2449725324578913, |
|
"seallms_audio_7b": 0.3332398502070376 |
|
} |
|
}, |
|
"gigaspeech2_indo": { |
|
"wer": { |
|
"MERaLiON-AudioLLM-Whisper-SEA-LION": 0.5087211232500294, |
|
"Qwen2-Audio-7B-Instruct": 0.8956121130400584 |
|
} |
|
}, |
|
"imda_30s_ar_test": { |
|
"llama3_70b_judge": { |
|
"Qwen2-Audio-7B-Instruct": 5.106666666666667, |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.773333333333333, |
|
"SALMONN_7B": 5.673333333333334, |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 27.186666666666667 |
|
} |
|
}, |
|
"imda_30s_gr_test": { |
|
"llama3_70b_judge": { |
|
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.46666666666667 |
|
} |
|
}, |
|
"nlb_asr_test": { |
|
"wer": { |
|
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.2796380263880551 |
|
} |
|
} |
|
} |