{"all_primary_scores": ["mmlu:mc::olmes: 0.265201", "mmlu:rc::olmes: 0.323072", "mmlu::olmes: 0.323072", "mmlu_abstract_algebra:mc::olmes: 0.32", "mmlu_anatomy:mc::olmes: 0.311111", "mmlu_astronomy:mc::olmes: 0.184211", "mmlu_business_ethics:mc::olmes: 0.25", "mmlu_clinical_knowledge:mc::olmes: 0.233962", "mmlu_college_biology:mc::olmes: 0.263889", "mmlu_college_chemistry:mc::olmes: 0.17", "mmlu_college_computer_science:mc::olmes: 0.33", "mmlu_college_mathematics:mc::olmes: 0.3", "mmlu_college_medicine:mc::olmes: 0.236994", "mmlu_college_physics:mc::olmes: 0.156863", "mmlu_computer_security:mc::olmes: 0.23", "mmlu_conceptual_physics:mc::olmes: 0.208511", "mmlu_econometrics:mc::olmes: 0.254386", "mmlu_electrical_engineering:mc::olmes: 0.268966", "mmlu_elementary_mathematics:mc::olmes: 0.248677", "mmlu_formal_logic:mc::olmes: 0.15873", "mmlu_global_facts:mc::olmes: 0.29", "mmlu_high_school_biology:mc::olmes: 0.23871", "mmlu_high_school_chemistry:mc::olmes: 0.26601", "mmlu_high_school_computer_science:mc::olmes: 0.33", "mmlu_high_school_european_history:mc::olmes: 0.236364", "mmlu_high_school_geography:mc::olmes: 0.267677", "mmlu_high_school_government_and_politics:mc::olmes: 0.305699", "mmlu_high_school_macroeconomics:mc::olmes: 0.25641", "mmlu_high_school_mathematics:mc::olmes: 0.255556", "mmlu_high_school_microeconomics:mc::olmes: 0.226891", "mmlu_high_school_physics:mc::olmes: 0.331126", "mmlu_high_school_psychology:mc::olmes: 0.216514", "mmlu_high_school_statistics:mc::olmes: 0.416667", "mmlu_high_school_us_history:mc::olmes: 0.220588", "mmlu_high_school_world_history:mc::olmes: 0.261603", "mmlu_human_aging:mc::olmes: 0.358744", "mmlu_human_sexuality:mc::olmes: 0.21374", "mmlu_international_law:mc::olmes: 0.363636", "mmlu_jurisprudence:mc::olmes: 0.25", "mmlu_logical_fallacies:mc::olmes: 0.269939", "mmlu_machine_learning:mc::olmes: 0.285714", "mmlu_management:mc::olmes: 0.291262", "mmlu_marketing:mc::olmes: 0.247863", "mmlu_medical_genetics:mc::olmes: 0.27", "mmlu_miscellaneous:mc::olmes: 0.296296", "mmlu_moral_disputes:mc::olmes: 0.245665", "mmlu_moral_scenarios:mc::olmes: 0.243575", "mmlu_nutrition:mc::olmes: 0.218954", "mmlu_philosophy:mc::olmes: 0.308682", "mmlu_prehistory:mc::olmes: 0.29321", "mmlu_professional_accounting:mc::olmes: 0.276596", "mmlu_professional_law:mc::olmes: 0.235984", "mmlu_professional_medicine:mc::olmes: 0.397059", "mmlu_professional_psychology:mc::olmes: 0.25817", "mmlu_public_relations:mc::olmes: 0.345455", "mmlu_security_studies:mc::olmes: 0.167347", "mmlu_sociology:mc::olmes: 0.243781", "mmlu_us_foreign_policy:mc::olmes: 0.23", "mmlu_virology:mc::olmes: 0.307229", "mmlu_world_religions:mc::olmes: 0.251462", "mmlu_abstract_algebra:rc::olmes: 0.19", "mmlu_anatomy:rc::olmes: 0.303704", "mmlu_astronomy:rc::olmes: 0.388158", "mmlu_business_ethics:rc::olmes: 0.45", "mmlu_clinical_knowledge:rc::olmes: 0.381132", "mmlu_college_biology:rc::olmes: 0.381944", "mmlu_college_chemistry:rc::olmes: 0.26", "mmlu_college_computer_science:rc::olmes: 0.27", "mmlu_college_mathematics:rc::olmes: 0.25", "mmlu_college_medicine:rc::olmes: 0.289017", "mmlu_college_physics:rc::olmes: 0.186275", "mmlu_computer_security:rc::olmes: 0.43", "mmlu_conceptual_physics:rc::olmes: 0.387234", "mmlu_econometrics:rc::olmes: 0.27193", "mmlu_electrical_engineering:rc::olmes: 0.331034", "mmlu_elementary_mathematics:rc::olmes: 0.230159", "mmlu_formal_logic:rc::olmes: 0.301587", "mmlu_global_facts:rc::olmes: 0.25", "mmlu_high_school_biology:rc::olmes: 0.406452", "mmlu_high_school_chemistry:rc::olmes: 
0.241379", "mmlu_high_school_computer_science:rc::olmes: 0.31", "mmlu_high_school_european_history:rc::olmes: 0.412121", "mmlu_high_school_geography:rc::olmes: 0.39899", "mmlu_high_school_government_and_politics:rc::olmes: 0.42487", "mmlu_high_school_macroeconomics:rc::olmes: 0.284615", "mmlu_high_school_mathematics:rc::olmes: 0.155556", "mmlu_high_school_microeconomics:rc::olmes: 0.344538", "mmlu_high_school_physics:rc::olmes: 0.251656", "mmlu_high_school_psychology:rc::olmes: 0.436697", "mmlu_high_school_statistics:rc::olmes: 0.305556", "mmlu_high_school_us_history:rc::olmes: 0.372549", "mmlu_high_school_world_history:rc::olmes: 0.295359", "mmlu_human_aging:rc::olmes: 0.390135", "mmlu_human_sexuality:rc::olmes: 0.358779", "mmlu_international_law:rc::olmes: 0.31405", "mmlu_jurisprudence:rc::olmes: 0.268519", "mmlu_logical_fallacies:rc::olmes: 0.325153", "mmlu_machine_learning:rc::olmes: 0.276786", "mmlu_management:rc::olmes: 0.436893", "mmlu_marketing:rc::olmes: 0.495726", "mmlu_medical_genetics:rc::olmes: 0.4", "mmlu_miscellaneous:rc::olmes: 0.464879", "mmlu_moral_disputes:rc::olmes: 0.257225", "mmlu_moral_scenarios:rc::olmes: 0.237989", "mmlu_nutrition:rc::olmes: 0.323529", "mmlu_philosophy:rc::olmes: 0.286174", "mmlu_prehistory:rc::olmes: 0.342593", "mmlu_professional_accounting:rc::olmes: 0.241135", "mmlu_professional_law:rc::olmes: 0.280965", "mmlu_professional_medicine:rc::olmes: 0.3125", "mmlu_professional_psychology:rc::olmes: 0.303922", "mmlu_public_relations:rc::olmes: 0.309091", "mmlu_security_studies:rc::olmes: 0.257143", "mmlu_sociology:rc::olmes: 0.258706", "mmlu_us_foreign_policy:rc::olmes: 0.34", "mmlu_virology:rc::olmes: 0.331325", "mmlu_world_religions:rc::olmes: 0.409357"], "metrics": [{"task": "mmlu:mc::olmes", "acc_per_token_micro": 0.26071784646061813, "acc_per_token_macro": 0.2652013554745296, "correct_loss_per_char_micro": 0.7111146623864845, "correct_loss_per_char_macro": 0.7122749918694099, "incorrect_loss_raw_micro": 1.4371644482479946, "incorrect_loss_raw_macro": 1.4411920306088446, "acc_per_char_micro": 0.26071784646061813, "acc_per_char_macro": 0.2652013554745296, "primary_score_micro": 0.26071784646061813, "primary_score_macro": 0.2652013554745296, "incorrect_loss_per_token_micro": 1.4371644482479946, "incorrect_loss_per_token_macro": 1.4411920306088446, "correct_loss_raw_micro": 1.422229324772969, "correct_loss_raw_macro": 1.4245499837388198, "acc_raw_micro": 0.26071784646061813, "acc_raw_macro": 0.2652013554745296, "incorrect_loss_per_char_micro": 0.7185822241239973, "incorrect_loss_per_char_macro": 0.7205960153044223, "correct_loss_per_token_micro": 1.422229324772969, "correct_loss_per_token_macro": 1.4245499837388198, "primary_score": 0.2652013554745296, "num_instances": 14042, "task_config": {"task_name": "mmlu:mc::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 57, "description": "Aggregate metric", "alias": "mmlu:mc::olmes"}}}, {"task": "mmlu:rc::olmes", "acc_uncond_micro": 0.33314342686227033, "acc_uncond_macro": 0.33435596803300893, "acc_per_token_micro": 0.3257370744908133, "acc_per_token_macro": 0.3275611650446029, 
"correct_loss_per_char_micro": 0.6532332432635897, "correct_loss_per_char_macro": 0.7244786922443343, "incorrect_loss_raw_micro": 22.687225431967185, "incorrect_loss_raw_macro": 21.513673654986444, "acc_per_char_micro": 0.3216778236718416, "acc_per_char_macro": 0.32307164395785354, "correct_loss_uncond_micro": -14.90609547050038, "correct_loss_uncond_macro": -13.414712741701779, "primary_score_micro": 0.3216778236718416, "primary_score_macro": 0.32307164395785354, "incorrect_loss_per_token_micro": 3.0516866685921893, "incorrect_loss_per_token_macro": 3.187838365185059, "incorrect_loss_uncond_micro": -14.05838893042516, "incorrect_loss_uncond_macro": -12.608247113816947, "correct_loss_raw_micro": 22.630186270126003, "correct_loss_raw_macro": 21.551042191209497, "acc_raw_micro": 0.3052983905426577, "acc_raw_macro": 0.3034148546188076, "incorrect_loss_per_char_micro": 0.700118585892161, "incorrect_loss_per_char_macro": 0.7691466871719901, "correct_loss_per_token_micro": 2.7886637249538646, "correct_loss_per_token_macro": 2.9236862303547144, "primary_score": 0.32307164395785354, "num_instances": 14042, "task_config": {"task_name": "mmlu:rc::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 57, "description": "Aggregate metric", "alias": "mmlu:rc::olmes"}}}, {"task": "mmlu::olmes", "primary_score": 0.32307164395785354, "num_instances": 28084, "task_config": {"task_name": "mmlu::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "mmlu::olmes"}}}, {"task": "mmlu_abstract_algebra:mc", "acc_raw": 0.32, "acc_per_token": 0.32, "acc_per_char": 0.32, "correct_loss_raw": 1.4286141949892044, "incorrect_loss_raw": 1.4459923624992368, "correct_loss_per_token": 1.4286141949892044, "incorrect_loss_per_token": 1.4459923624992368, "correct_loss_per_char": 0.7143070974946022, "incorrect_loss_per_char": 0.7229961812496184, "primary_score": 0.32, "num_instances": 100, "task_config": {"task_name": "mmlu_abstract_algebra:mc", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_abstract_algebra:mc::olmes"}}}, {"task": "mmlu_anatomy:mc", "acc_raw": 0.3111111111111111, "acc_per_token": 0.3111111111111111, "acc_per_char": 0.3111111111111111, "correct_loss_raw": 
1.391642408459275, "incorrect_loss_raw": 1.466421911304379, "correct_loss_per_token": 1.391642408459275, "incorrect_loss_per_token": 1.466421911304379, "correct_loss_per_char": 0.6958212042296374, "incorrect_loss_per_char": 0.7332109556521895, "primary_score": 0.3111111111111111, "num_instances": 135, "task_config": {"task_name": "mmlu_anatomy:mc", "task_core": "mmlu_anatomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "anatomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_anatomy:mc::olmes"}}}, {"task": "mmlu_astronomy:mc", "acc_raw": 0.18421052631578946, "acc_per_token": 0.18421052631578946, "acc_per_char": 0.18421052631578946, "correct_loss_raw": 1.5038828645881854, "incorrect_loss_raw": 1.4131246705849962, "correct_loss_per_token": 1.5038828645881854, "incorrect_loss_per_token": 1.4131246705849962, "correct_loss_per_char": 0.7519414322940927, "incorrect_loss_per_char": 0.7065623352924981, "primary_score": 0.18421052631578946, "num_instances": 152, "task_config": {"task_name": "mmlu_astronomy:mc", "task_core": "mmlu_astronomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "astronomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_astronomy:mc::olmes"}}}, {"task": "mmlu_business_ethics:mc", "acc_raw": 0.25, "acc_per_token": 0.25, "acc_per_char": 0.25, "correct_loss_raw": 1.4087046658992768, "incorrect_loss_raw": 1.4307063547770176, "correct_loss_per_token": 1.4087046658992768, "incorrect_loss_per_token": 1.4307063547770176, "correct_loss_per_char": 0.7043523329496384, "incorrect_loss_per_char": 0.7153531773885088, "primary_score": 0.25, "num_instances": 100, "task_config": {"task_name": "mmlu_business_ethics:mc", "task_core": "mmlu_business_ethics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "business_ethics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_business_ethics:mc::olmes"}}}, {"task": "mmlu_clinical_knowledge:mc", "acc_raw": 0.2339622641509434, "acc_per_token": 0.2339622641509434, "acc_per_char": 0.2339622641509434, "correct_loss_raw": 1.4263436362428485, "incorrect_loss_raw": 1.4283468099510153, "correct_loss_per_token": 1.4263436362428485, "incorrect_loss_per_token": 1.4283468099510153, "correct_loss_per_char": 0.7131718181214243, "incorrect_loss_per_char": 0.7141734049755076, "primary_score": 0.2339622641509434, "num_instances": 265, "task_config": {"task_name": "mmlu_clinical_knowledge:mc", "task_core": "mmlu_clinical_knowledge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", 
"random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "clinical_knowledge", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_clinical_knowledge:mc::olmes"}}}, {"task": "mmlu_college_biology:mc", "acc_raw": 0.2638888888888889, "acc_per_token": 0.2638888888888889, "acc_per_char": 0.2638888888888889, "correct_loss_raw": 1.4325314201414585, "incorrect_loss_raw": 1.4315688815657748, "correct_loss_per_token": 1.4325314201414585, "incorrect_loss_per_token": 1.4315688815657748, "correct_loss_per_char": 0.7162657100707293, "incorrect_loss_per_char": 0.7157844407828874, "primary_score": 0.2638888888888889, "num_instances": 144, "task_config": {"task_name": "mmlu_college_biology:mc", "task_core": "mmlu_college_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_biology:mc::olmes"}}}, {"task": "mmlu_college_chemistry:mc", "acc_raw": 0.17, "acc_per_token": 0.17, "acc_per_char": 0.17, "correct_loss_raw": 1.4802032005786896, "incorrect_loss_raw": 1.4110419366757072, "correct_loss_per_token": 1.4802032005786896, "incorrect_loss_per_token": 1.4110419366757072, "correct_loss_per_char": 0.7401016002893448, "incorrect_loss_per_char": 0.7055209683378536, "primary_score": 0.17, "num_instances": 100, "task_config": {"task_name": "mmlu_college_chemistry:mc", "task_core": "mmlu_college_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_chemistry:mc::olmes"}}}, {"task": "mmlu_college_computer_science:mc", "acc_raw": 0.33, "acc_per_token": 0.33, "acc_per_char": 0.33, "correct_loss_raw": 1.4178925317525863, "incorrect_loss_raw": 1.5295472476879763, "correct_loss_per_token": 1.4178925317525863, "incorrect_loss_per_token": 1.5295472476879763, "correct_loss_per_char": 0.7089462658762932, "incorrect_loss_per_char": 0.7647736238439882, "primary_score": 0.33, "num_instances": 100, "task_config": {"task_name": "mmlu_college_computer_science:mc", "task_core": "mmlu_college_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_computer_science:mc::olmes"}}}, {"task": "mmlu_college_mathematics:mc", "acc_raw": 0.3, 
"acc_per_token": 0.3, "acc_per_char": 0.3, "correct_loss_raw": 1.4630403077602387, "incorrect_loss_raw": 1.5132715060313535, "correct_loss_per_token": 1.4630403077602387, "incorrect_loss_per_token": 1.5132715060313535, "correct_loss_per_char": 0.7315201538801194, "incorrect_loss_per_char": 0.7566357530156768, "primary_score": 0.3, "num_instances": 100, "task_config": {"task_name": "mmlu_college_mathematics:mc", "task_core": "mmlu_college_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_mathematics:mc::olmes"}}}, {"task": "mmlu_college_medicine:mc", "acc_raw": 0.23699421965317918, "acc_per_token": 0.23699421965317918, "acc_per_char": 0.23699421965317918, "correct_loss_raw": 1.4322606531870847, "incorrect_loss_raw": 1.428708042597724, "correct_loss_per_token": 1.4322606531870847, "incorrect_loss_per_token": 1.428708042597724, "correct_loss_per_char": 0.7161303265935424, "incorrect_loss_per_char": 0.714354021298862, "primary_score": 0.23699421965317918, "num_instances": 173, "task_config": {"task_name": "mmlu_college_medicine:mc", "task_core": "mmlu_college_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_medicine:mc::olmes"}}}, {"task": "mmlu_college_physics:mc", "acc_raw": 0.1568627450980392, "acc_per_token": 0.1568627450980392, "acc_per_char": 0.1568627450980392, "correct_loss_raw": 1.4933925411280464, "incorrect_loss_raw": 1.4061525699749486, "correct_loss_per_token": 1.4933925411280464, "incorrect_loss_per_token": 1.4061525699749486, "correct_loss_per_char": 0.7466962705640232, "incorrect_loss_per_char": 0.7030762849874743, "primary_score": 0.1568627450980392, "num_instances": 102, "task_config": {"task_name": "mmlu_college_physics:mc", "task_core": "mmlu_college_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_physics:mc::olmes"}}}, {"task": "mmlu_computer_security:mc", "acc_raw": 0.23, "acc_per_token": 0.23, "acc_per_char": 0.23, "correct_loss_raw": 1.4304591560363769, "incorrect_loss_raw": 1.4309185570478433, "correct_loss_per_token": 1.4304591560363769, "incorrect_loss_per_token": 1.4309185570478433, "correct_loss_per_char": 0.7152295780181884, "incorrect_loss_per_char": 0.7154592785239217, "primary_score": 0.23, "num_instances": 100, "task_config": {"task_name": "mmlu_computer_security:mc", "task_core": 
"mmlu_computer_security", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "computer_security", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_computer_security:mc::olmes"}}}, {"task": "mmlu_conceptual_physics:mc", "acc_raw": 0.20851063829787234, "acc_per_token": 0.20851063829787234, "acc_per_char": 0.20851063829787234, "correct_loss_raw": 1.4332183908908924, "incorrect_loss_raw": 1.4322371693367655, "correct_loss_per_token": 1.4332183908908924, "incorrect_loss_per_token": 1.4322371693367655, "correct_loss_per_char": 0.7166091954454462, "incorrect_loss_per_char": 0.7161185846683827, "primary_score": 0.20851063829787234, "num_instances": 235, "task_config": {"task_name": "mmlu_conceptual_physics:mc", "task_core": "mmlu_conceptual_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "conceptual_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_conceptual_physics:mc::olmes"}}}, {"task": "mmlu_econometrics:mc", "acc_raw": 0.2543859649122807, "acc_per_token": 0.2543859649122807, "acc_per_char": 0.2543859649122807, "correct_loss_raw": 1.4873452510750085, "incorrect_loss_raw": 1.4898098088495915, "correct_loss_per_token": 1.4873452510750085, "incorrect_loss_per_token": 1.4898098088495915, "correct_loss_per_char": 0.7436726255375042, "incorrect_loss_per_char": 0.7449049044247957, "primary_score": 0.2543859649122807, "num_instances": 114, "task_config": {"task_name": "mmlu_econometrics:mc", "task_core": "mmlu_econometrics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "econometrics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_econometrics:mc::olmes"}}}, {"task": "mmlu_electrical_engineering:mc", "acc_raw": 0.2689655172413793, "acc_per_token": 0.2689655172413793, "acc_per_char": 0.2689655172413793, "correct_loss_raw": 1.4234175213452043, "incorrect_loss_raw": 1.434983214427685, "correct_loss_per_token": 1.4234175213452043, "incorrect_loss_per_token": 1.434983214427685, "correct_loss_per_char": 0.7117087606726021, "incorrect_loss_per_char": 0.7174916072138425, "primary_score": 0.2689655172413793, "num_instances": 145, "task_config": {"task_name": "mmlu_electrical_engineering:mc", "task_core": "mmlu_electrical_engineering", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": 
"electrical_engineering", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_electrical_engineering:mc::olmes"}}}, {"task": "mmlu_elementary_mathematics:mc", "acc_raw": 0.24867724867724866, "acc_per_token": 0.24867724867724866, "acc_per_char": 0.24867724867724866, "correct_loss_raw": 1.425037144195466, "incorrect_loss_raw": 1.4287045017226445, "correct_loss_per_token": 1.425037144195466, "incorrect_loss_per_token": 1.4287045017226445, "correct_loss_per_char": 0.712518572097733, "incorrect_loss_per_char": 0.7143522508613223, "primary_score": 0.24867724867724866, "num_instances": 378, "task_config": {"task_name": "mmlu_elementary_mathematics:mc", "task_core": "mmlu_elementary_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "elementary_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_elementary_mathematics:mc::olmes"}}}, {"task": "mmlu_formal_logic:mc", "acc_raw": 0.15873015873015872, "acc_per_token": 0.15873015873015872, "acc_per_char": 0.15873015873015872, "correct_loss_raw": 1.5439469222984616, "incorrect_loss_raw": 1.456999213922592, "correct_loss_per_token": 1.5439469222984616, "incorrect_loss_per_token": 1.456999213922592, "correct_loss_per_char": 0.7719734611492308, "incorrect_loss_per_char": 0.728499606961296, "primary_score": 0.15873015873015872, "num_instances": 126, "task_config": {"task_name": "mmlu_formal_logic:mc", "task_core": "mmlu_formal_logic", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "formal_logic", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_formal_logic:mc::olmes"}}}, {"task": "mmlu_global_facts:mc", "acc_raw": 0.29, "acc_per_token": 0.29, "acc_per_char": 0.29, "correct_loss_raw": 1.416217890381813, "incorrect_loss_raw": 1.4884590681393934, "correct_loss_per_token": 1.416217890381813, "incorrect_loss_per_token": 1.4884590681393934, "correct_loss_per_char": 0.7081089451909065, "incorrect_loss_per_char": 0.7442295340696967, "primary_score": 0.29, "num_instances": 100, "task_config": {"task_name": "mmlu_global_facts:mc", "task_core": "mmlu_global_facts", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "global_facts", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_global_facts:mc::olmes"}}}, {"task": "mmlu_high_school_biology:mc", "acc_raw": 0.23870967741935484, "acc_per_token": 0.23870967741935484, "acc_per_char": 0.23870967741935484, "correct_loss_raw": 1.4149069778380856, "incorrect_loss_raw": 1.426428426978408, "correct_loss_per_token": 1.4149069778380856, 
"incorrect_loss_per_token": 1.426428426978408, "correct_loss_per_char": 0.7074534889190428, "incorrect_loss_per_char": 0.713214213489204, "primary_score": 0.23870967741935484, "num_instances": 310, "task_config": {"task_name": "mmlu_high_school_biology:mc", "task_core": "mmlu_high_school_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_biology:mc::olmes"}}}, {"task": "mmlu_high_school_chemistry:mc", "acc_raw": 0.2660098522167488, "acc_per_token": 0.2660098522167488, "acc_per_char": 0.2660098522167488, "correct_loss_raw": 1.4112723845566435, "incorrect_loss_raw": 1.4276176255520538, "correct_loss_per_token": 1.4112723845566435, "incorrect_loss_per_token": 1.4276176255520538, "correct_loss_per_char": 0.7056361922783217, "incorrect_loss_per_char": 0.7138088127760269, "primary_score": 0.2660098522167488, "num_instances": 203, "task_config": {"task_name": "mmlu_high_school_chemistry:mc", "task_core": "mmlu_high_school_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_chemistry:mc::olmes"}}}, {"task": "mmlu_high_school_computer_science:mc", "acc_raw": 0.33, "acc_per_token": 0.33, "acc_per_char": 0.33, "correct_loss_raw": 1.4288902390003204, "incorrect_loss_raw": 1.4646583642562232, "correct_loss_per_token": 1.4288902390003204, "incorrect_loss_per_token": 1.4646583642562232, "correct_loss_per_char": 0.7144451195001602, "incorrect_loss_per_char": 0.7323291821281116, "primary_score": 0.33, "num_instances": 100, "task_config": {"task_name": "mmlu_high_school_computer_science:mc", "task_core": "mmlu_high_school_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_computer_science:mc::olmes"}}}, {"task": "mmlu_high_school_european_history:mc", "acc_raw": 0.23636363636363636, "acc_per_token": 0.23636363636363636, "acc_per_char": 0.23636363636363636, "correct_loss_raw": 1.4495411656119608, "incorrect_loss_raw": 1.4138268783839067, "correct_loss_per_token": 1.4495411656119608, "incorrect_loss_per_token": 1.4138268783839067, "correct_loss_per_char": 0.7247705828059804, "incorrect_loss_per_char": 0.7069134391919534, "primary_score": 0.23636363636363636, "num_instances": 165, "task_config": {"task_name": "mmlu_high_school_european_history:mc", "task_core": "mmlu_high_school_european_history", "limit": 
null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_european_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_european_history:mc::olmes"}}}, {"task": "mmlu_high_school_geography:mc", "acc_raw": 0.2676767676767677, "acc_per_token": 0.2676767676767677, "acc_per_char": 0.2676767676767677, "correct_loss_raw": 1.400579711102476, "incorrect_loss_raw": 1.43751945318999, "correct_loss_per_token": 1.400579711102476, "incorrect_loss_per_token": 1.43751945318999, "correct_loss_per_char": 0.700289855551238, "incorrect_loss_per_char": 0.718759726594995, "primary_score": 0.2676767676767677, "num_instances": 198, "task_config": {"task_name": "mmlu_high_school_geography:mc", "task_core": "mmlu_high_school_geography", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_geography", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_geography:mc::olmes"}}}, {"task": "mmlu_high_school_government_and_politics:mc", "acc_raw": 0.30569948186528495, "acc_per_token": 0.30569948186528495, "acc_per_char": 0.30569948186528495, "correct_loss_raw": 1.3960181246149725, "incorrect_loss_raw": 1.459184393672745, "correct_loss_per_token": 1.3960181246149725, "incorrect_loss_per_token": 1.459184393672745, "correct_loss_per_char": 0.6980090623074863, "incorrect_loss_per_char": 0.7295921968363726, "primary_score": 0.30569948186528495, "num_instances": 193, "task_config": {"task_name": "mmlu_high_school_government_and_politics:mc", "task_core": "mmlu_high_school_government_and_politics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_government_and_politics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_government_and_politics:mc::olmes"}}}, {"task": "mmlu_high_school_macroeconomics:mc", "acc_raw": 0.2564102564102564, "acc_per_token": 0.2564102564102564, "acc_per_char": 0.2564102564102564, "correct_loss_raw": 1.4008340325110997, "incorrect_loss_raw": 1.4511074923042557, "correct_loss_per_token": 1.4008340325110997, "incorrect_loss_per_token": 1.4511074923042557, "correct_loss_per_char": 0.7004170162555499, "incorrect_loss_per_char": 0.7255537461521279, "primary_score": 0.2564102564102564, "num_instances": 390, "task_config": {"task_name": "mmlu_high_school_macroeconomics:mc", "task_core": "mmlu_high_school_macroeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": 
null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_macroeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_macroeconomics:mc::olmes"}}}, {"task": "mmlu_high_school_mathematics:mc", "acc_raw": 0.25555555555555554, "acc_per_token": 0.25555555555555554, "acc_per_char": 0.25555555555555554, "correct_loss_raw": 1.4557758278316921, "incorrect_loss_raw": 1.4901090468153537, "correct_loss_per_token": 1.4557758278316921, "incorrect_loss_per_token": 1.4901090468153537, "correct_loss_per_char": 0.7278879139158461, "incorrect_loss_per_char": 0.7450545234076769, "primary_score": 0.25555555555555554, "num_instances": 270, "task_config": {"task_name": "mmlu_high_school_mathematics:mc", "task_core": "mmlu_high_school_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_mathematics:mc::olmes"}}}, {"task": "mmlu_high_school_microeconomics:mc", "acc_raw": 0.226890756302521, "acc_per_token": 0.226890756302521, "acc_per_char": 0.226890756302521, "correct_loss_raw": 1.426626341683524, "incorrect_loss_raw": 1.4269534432921422, "correct_loss_per_token": 1.426626341683524, "incorrect_loss_per_token": 1.4269534432921422, "correct_loss_per_char": 0.713313170841762, "incorrect_loss_per_char": 0.7134767216460711, "primary_score": 0.226890756302521, "num_instances": 238, "task_config": {"task_name": "mmlu_high_school_microeconomics:mc", "task_core": "mmlu_high_school_microeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_microeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_microeconomics:mc::olmes"}}}, {"task": "mmlu_high_school_physics:mc", "acc_raw": 0.33112582781456956, "acc_per_token": 0.33112582781456956, "acc_per_char": 0.33112582781456956, "correct_loss_raw": 1.3873338265134798, "incorrect_loss_raw": 1.4437498321333198, "correct_loss_per_token": 1.3873338265134798, "incorrect_loss_per_token": 1.4437498321333198, "correct_loss_per_char": 0.6936669132567399, "incorrect_loss_per_char": 0.7218749160666599, "primary_score": 0.33112582781456956, "num_instances": 151, "task_config": {"task_name": "mmlu_high_school_physics:mc", "task_core": "mmlu_high_school_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": 
"mmlu_high_school_physics:mc::olmes"}}}, {"task": "mmlu_high_school_psychology:mc", "acc_raw": 0.21651376146788992, "acc_per_token": 0.21651376146788992, "acc_per_char": 0.21651376146788992, "correct_loss_raw": 1.4213156835748515, "incorrect_loss_raw": 1.4153327799718305, "correct_loss_per_token": 1.4213156835748515, "incorrect_loss_per_token": 1.4153327799718305, "correct_loss_per_char": 0.7106578417874257, "incorrect_loss_per_char": 0.7076663899859152, "primary_score": 0.21651376146788992, "num_instances": 545, "task_config": {"task_name": "mmlu_high_school_psychology:mc", "task_core": "mmlu_high_school_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_psychology:mc::olmes"}}}, {"task": "mmlu_high_school_statistics:mc", "acc_raw": 0.4166666666666667, "acc_per_token": 0.4166666666666667, "acc_per_char": 0.4166666666666667, "correct_loss_raw": 1.313091162454199, "incorrect_loss_raw": 1.5389358467525904, "correct_loss_per_token": 1.313091162454199, "incorrect_loss_per_token": 1.5389358467525904, "correct_loss_per_char": 0.6565455812270995, "incorrect_loss_per_char": 0.7694679233762952, "primary_score": 0.4166666666666667, "num_instances": 216, "task_config": {"task_name": "mmlu_high_school_statistics:mc", "task_core": "mmlu_high_school_statistics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_statistics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_statistics:mc::olmes"}}}, {"task": "mmlu_high_school_us_history:mc", "acc_raw": 0.22058823529411764, "acc_per_token": 0.22058823529411764, "acc_per_char": 0.22058823529411764, "correct_loss_raw": 1.4462888538837433, "incorrect_loss_raw": 1.443970621313924, "correct_loss_per_token": 1.4462888538837433, "incorrect_loss_per_token": 1.443970621313924, "correct_loss_per_char": 0.7231444269418716, "incorrect_loss_per_char": 0.721985310656962, "primary_score": 0.22058823529411764, "num_instances": 204, "task_config": {"task_name": "mmlu_high_school_us_history:mc", "task_core": "mmlu_high_school_us_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_us_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_us_history:mc::olmes"}}}, {"task": "mmlu_high_school_world_history:mc", "acc_raw": 0.2616033755274262, "acc_per_token": 0.2616033755274262, "acc_per_char": 0.2616033755274262, "correct_loss_raw": 1.426479207061011, "incorrect_loss_raw": 1.4354138376843584, 
"correct_loss_per_token": 1.426479207061011, "incorrect_loss_per_token": 1.4354138376843584, "correct_loss_per_char": 0.7132396035305055, "incorrect_loss_per_char": 0.7177069188421792, "primary_score": 0.2616033755274262, "num_instances": 237, "task_config": {"task_name": "mmlu_high_school_world_history:mc", "task_core": "mmlu_high_school_world_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_world_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_world_history:mc::olmes"}}}, {"task": "mmlu_human_aging:mc", "acc_raw": 0.35874439461883406, "acc_per_token": 0.35874439461883406, "acc_per_char": 0.35874439461883406, "correct_loss_raw": 1.3965698876188475, "incorrect_loss_raw": 1.4336071704000641, "correct_loss_per_token": 1.3965698876188475, "incorrect_loss_per_token": 1.4336071704000641, "correct_loss_per_char": 0.6982849438094237, "incorrect_loss_per_char": 0.7168035852000321, "primary_score": 0.35874439461883406, "num_instances": 223, "task_config": {"task_name": "mmlu_human_aging:mc", "task_core": "mmlu_human_aging", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_aging", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_aging:mc::olmes"}}}, {"task": "mmlu_human_sexuality:mc", "acc_raw": 0.21374045801526717, "acc_per_token": 0.21374045801526717, "acc_per_char": 0.21374045801526717, "correct_loss_raw": 1.4585472927748702, "incorrect_loss_raw": 1.428870550548758, "correct_loss_per_token": 1.4585472927748702, "incorrect_loss_per_token": 1.428870550548758, "correct_loss_per_char": 0.7292736463874351, "incorrect_loss_per_char": 0.714435275274379, "primary_score": 0.21374045801526717, "num_instances": 131, "task_config": {"task_name": "mmlu_human_sexuality:mc", "task_core": "mmlu_human_sexuality", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_sexuality", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_sexuality:mc::olmes"}}}, {"task": "mmlu_international_law:mc", "acc_raw": 0.36363636363636365, "acc_per_token": 0.36363636363636365, "acc_per_char": 0.36363636363636365, "correct_loss_raw": 1.365358995997216, "incorrect_loss_raw": 1.4374075474489492, "correct_loss_per_token": 1.365358995997216, "incorrect_loss_per_token": 1.4374075474489492, "correct_loss_per_char": 0.682679497998608, "incorrect_loss_per_char": 0.7187037737244746, "primary_score": 0.36363636363636365, "num_instances": 121, "task_config": {"task_name": "mmlu_international_law:mc", "task_core": "mmlu_international_law", "limit": null, "split": "test", 
"num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "international_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_international_law:mc::olmes"}}}, {"task": "mmlu_jurisprudence:mc", "acc_raw": 0.25, "acc_per_token": 0.25, "acc_per_char": 0.25, "correct_loss_raw": 1.4087109709227528, "incorrect_loss_raw": 1.4130348375550026, "correct_loss_per_token": 1.4087109709227528, "incorrect_loss_per_token": 1.4130348375550026, "correct_loss_per_char": 0.7043554854613764, "incorrect_loss_per_char": 0.7065174187775013, "primary_score": 0.25, "num_instances": 108, "task_config": {"task_name": "mmlu_jurisprudence:mc", "task_core": "mmlu_jurisprudence", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "jurisprudence", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_jurisprudence:mc::olmes"}}}, {"task": "mmlu_logical_fallacies:mc", "acc_raw": 0.26993865030674846, "acc_per_token": 0.26993865030674846, "acc_per_char": 0.26993865030674846, "correct_loss_raw": 1.4300255252539746, "incorrect_loss_raw": 1.4530853404833006, "correct_loss_per_token": 1.4300255252539746, "incorrect_loss_per_token": 1.4530853404833006, "correct_loss_per_char": 0.7150127626269873, "incorrect_loss_per_char": 0.7265426702416503, "primary_score": 0.26993865030674846, "num_instances": 163, "task_config": {"task_name": "mmlu_logical_fallacies:mc", "task_core": "mmlu_logical_fallacies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "logical_fallacies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_logical_fallacies:mc::olmes"}}}, {"task": "mmlu_machine_learning:mc", "acc_raw": 0.2857142857142857, "acc_per_token": 0.2857142857142857, "acc_per_char": 0.2857142857142857, "correct_loss_raw": 1.4112436388220106, "incorrect_loss_raw": 1.4298354834318159, "correct_loss_per_token": 1.4112436388220106, "incorrect_loss_per_token": 1.4298354834318159, "correct_loss_per_char": 0.7056218194110053, "incorrect_loss_per_char": 0.7149177417159079, "primary_score": 0.2857142857142857, "num_instances": 112, "task_config": {"task_name": "mmlu_machine_learning:mc", "task_core": "mmlu_machine_learning", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "machine_learning", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": 
"mmlu_machine_learning:mc::olmes"}}}, {"task": "mmlu_management:mc", "acc_raw": 0.2912621359223301, "acc_per_token": 0.2912621359223301, "acc_per_char": 0.2912621359223301, "correct_loss_raw": 1.3950761058955516, "incorrect_loss_raw": 1.4241073177856147, "correct_loss_per_token": 1.3950761058955516, "incorrect_loss_per_token": 1.4241073177856147, "correct_loss_per_char": 0.6975380529477758, "incorrect_loss_per_char": 0.7120536588928074, "primary_score": 0.2912621359223301, "num_instances": 103, "task_config": {"task_name": "mmlu_management:mc", "task_core": "mmlu_management", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "management", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_management:mc::olmes"}}}, {"task": "mmlu_marketing:mc", "acc_raw": 0.24786324786324787, "acc_per_token": 0.24786324786324787, "acc_per_char": 0.24786324786324787, "correct_loss_raw": 1.4277025889127681, "incorrect_loss_raw": 1.4223103850995034, "correct_loss_per_token": 1.4277025889127681, "incorrect_loss_per_token": 1.4223103850995034, "correct_loss_per_char": 0.7138512944563841, "incorrect_loss_per_char": 0.7111551925497517, "primary_score": 0.24786324786324787, "num_instances": 234, "task_config": {"task_name": "mmlu_marketing:mc", "task_core": "mmlu_marketing", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "marketing", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_marketing:mc::olmes"}}}, {"task": "mmlu_medical_genetics:mc", "acc_raw": 0.27, "acc_per_token": 0.27, "acc_per_char": 0.27, "correct_loss_raw": 1.4142525935173034, "incorrect_loss_raw": 1.4275709124406175, "correct_loss_per_token": 1.4142525935173034, "incorrect_loss_per_token": 1.4275709124406175, "correct_loss_per_char": 0.7071262967586517, "incorrect_loss_per_char": 0.7137854562203088, "primary_score": 0.27, "num_instances": 100, "task_config": {"task_name": "mmlu_medical_genetics:mc", "task_core": "mmlu_medical_genetics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "medical_genetics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_medical_genetics:mc::olmes"}}}, {"task": "mmlu_miscellaneous:mc", "acc_raw": 0.2962962962962963, "acc_per_token": 0.2962962962962963, "acc_per_char": 0.2962962962962963, "correct_loss_raw": 1.4032514787268364, "incorrect_loss_raw": 1.428520736947064, "correct_loss_per_token": 1.4032514787268364, "incorrect_loss_per_token": 1.428520736947064, "correct_loss_per_char": 0.7016257393634182, "incorrect_loss_per_char": 0.714260368473532, "primary_score": 0.2962962962962963, "num_instances": 783, 
"task_config": {"task_name": "mmlu_miscellaneous:mc", "task_core": "mmlu_miscellaneous", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "miscellaneous", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_miscellaneous:mc::olmes"}}}, {"task": "mmlu_moral_disputes:mc", "acc_raw": 0.24566473988439305, "acc_per_token": 0.24566473988439305, "acc_per_char": 0.24566473988439305, "correct_loss_raw": 1.4478095125600783, "incorrect_loss_raw": 1.4461355473494477, "correct_loss_per_token": 1.4478095125600783, "incorrect_loss_per_token": 1.4461355473494477, "correct_loss_per_char": 0.7239047562800391, "incorrect_loss_per_char": 0.7230677736747239, "primary_score": 0.24566473988439305, "num_instances": 346, "task_config": {"task_name": "mmlu_moral_disputes:mc", "task_core": "mmlu_moral_disputes", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_disputes", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_disputes:mc::olmes"}}}, {"task": "mmlu_moral_scenarios:mc", "acc_raw": 0.2435754189944134, "acc_per_token": 0.2435754189944134, "acc_per_char": 0.2435754189944134, "correct_loss_raw": 1.421490123551651, "incorrect_loss_raw": 1.4254808845910725, "correct_loss_per_token": 1.421490123551651, "incorrect_loss_per_token": 1.4254808845910725, "correct_loss_per_char": 0.7107450617758255, "incorrect_loss_per_char": 0.7127404422955362, "primary_score": 0.2435754189944134, "num_instances": 895, "task_config": {"task_name": "mmlu_moral_scenarios:mc", "task_core": "mmlu_moral_scenarios", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_scenarios", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_scenarios:mc::olmes"}}}, {"task": "mmlu_nutrition:mc", "acc_raw": 0.21895424836601307, "acc_per_token": 0.21895424836601307, "acc_per_char": 0.21895424836601307, "correct_loss_raw": 1.4414266320615032, "incorrect_loss_raw": 1.421766403973233, "correct_loss_per_token": 1.4414266320615032, "incorrect_loss_per_token": 1.421766403973233, "correct_loss_per_char": 0.7207133160307516, "incorrect_loss_per_char": 0.7108832019866165, "primary_score": 0.21895424836601307, "num_instances": 306, "task_config": {"task_name": "mmlu_nutrition:mc", "task_core": "mmlu_nutrition", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": 
"nutrition", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_nutrition:mc::olmes"}}}, {"task": "mmlu_philosophy:mc", "acc_raw": 0.3086816720257235, "acc_per_token": 0.3086816720257235, "acc_per_char": 0.3086816720257235, "correct_loss_raw": 1.394637080250829, "incorrect_loss_raw": 1.4242691859555008, "correct_loss_per_token": 1.394637080250829, "incorrect_loss_per_token": 1.4242691859555008, "correct_loss_per_char": 0.6973185401254145, "incorrect_loss_per_char": 0.7121345929777504, "primary_score": 0.3086816720257235, "num_instances": 311, "task_config": {"task_name": "mmlu_philosophy:mc", "task_core": "mmlu_philosophy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "philosophy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_philosophy:mc::olmes"}}}, {"task": "mmlu_prehistory:mc", "acc_raw": 0.2932098765432099, "acc_per_token": 0.2932098765432099, "acc_per_char": 0.2932098765432099, "correct_loss_raw": 1.3984669380717807, "incorrect_loss_raw": 1.4247865308949983, "correct_loss_per_token": 1.3984669380717807, "incorrect_loss_per_token": 1.4247865308949983, "correct_loss_per_char": 0.6992334690358903, "incorrect_loss_per_char": 0.7123932654474991, "primary_score": 0.2932098765432099, "num_instances": 324, "task_config": {"task_name": "mmlu_prehistory:mc", "task_core": "mmlu_prehistory", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "prehistory", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_prehistory:mc::olmes"}}}, {"task": "mmlu_professional_accounting:mc", "acc_raw": 0.2765957446808511, "acc_per_token": 0.2765957446808511, "acc_per_char": 0.2765957446808511, "correct_loss_raw": 1.4184195805103221, "incorrect_loss_raw": 1.41706075186425, "correct_loss_per_token": 1.4184195805103221, "incorrect_loss_per_token": 1.41706075186425, "correct_loss_per_char": 0.7092097902551611, "incorrect_loss_per_char": 0.708530375932125, "primary_score": 0.2765957446808511, "num_instances": 282, "task_config": {"task_name": "mmlu_professional_accounting:mc", "task_core": "mmlu_professional_accounting", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_accounting", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_accounting:mc::olmes"}}}, {"task": "mmlu_professional_law:mc", "acc_raw": 0.23598435462842243, "acc_per_token": 0.23598435462842243, "acc_per_char": 0.23598435462842243, "correct_loss_raw": 1.4360968665629041, "incorrect_loss_raw": 1.4304321384440266, "correct_loss_per_token": 
1.4360968665629041, "incorrect_loss_per_token": 1.4304321384440266, "correct_loss_per_char": 0.7180484332814521, "incorrect_loss_per_char": 0.7152160692220133, "primary_score": 0.23598435462842243, "num_instances": 1534, "task_config": {"task_name": "mmlu_professional_law:mc", "task_core": "mmlu_professional_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_law:mc::olmes"}}}, {"task": "mmlu_professional_medicine:mc", "acc_raw": 0.39705882352941174, "acc_per_token": 0.39705882352941174, "acc_per_char": 0.39705882352941174, "correct_loss_raw": 1.3938986779574085, "incorrect_loss_raw": 1.5369568731562764, "correct_loss_per_token": 1.3938986779574085, "incorrect_loss_per_token": 1.5369568731562764, "correct_loss_per_char": 0.6969493389787043, "incorrect_loss_per_char": 0.7684784365781382, "primary_score": 0.39705882352941174, "num_instances": 272, "task_config": {"task_name": "mmlu_professional_medicine:mc", "task_core": "mmlu_professional_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_medicine:mc::olmes"}}}, {"task": "mmlu_professional_psychology:mc", "acc_raw": 0.2581699346405229, "acc_per_token": 0.2581699346405229, "acc_per_char": 0.2581699346405229, "correct_loss_raw": 1.410348999539232, "incorrect_loss_raw": 1.4249647083430514, "correct_loss_per_token": 1.410348999539232, "incorrect_loss_per_token": 1.4249647083430514, "correct_loss_per_char": 0.705174499769616, "incorrect_loss_per_char": 0.7124823541715257, "primary_score": 0.2581699346405229, "num_instances": 612, "task_config": {"task_name": "mmlu_professional_psychology:mc", "task_core": "mmlu_professional_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_psychology:mc::olmes"}}}, {"task": "mmlu_public_relations:mc", "acc_raw": 0.34545454545454546, "acc_per_token": 0.34545454545454546, "acc_per_char": 0.34545454545454546, "correct_loss_raw": 1.3966401826251638, "incorrect_loss_raw": 1.4288235136956882, "correct_loss_per_token": 1.3966401826251638, "incorrect_loss_per_token": 1.4288235136956882, "correct_loss_per_char": 0.6983200913125819, "incorrect_loss_per_char": 0.7144117568478441, "primary_score": 0.34545454545454546, "num_instances": 110, "task_config": {"task_name": "mmlu_public_relations:mc", "task_core": "mmlu_public_relations", 
"limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "public_relations", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_public_relations:mc::olmes"}}}, {"task": "mmlu_security_studies:mc", "acc_raw": 0.1673469387755102, "acc_per_token": 0.1673469387755102, "acc_per_char": 0.1673469387755102, "correct_loss_raw": 1.4461856803115534, "incorrect_loss_raw": 1.4255701874389128, "correct_loss_per_token": 1.4461856803115534, "incorrect_loss_per_token": 1.4255701874389128, "correct_loss_per_char": 0.7230928401557767, "incorrect_loss_per_char": 0.7127850937194564, "primary_score": 0.1673469387755102, "num_instances": 245, "task_config": {"task_name": "mmlu_security_studies:mc", "task_core": "mmlu_security_studies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "security_studies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_security_studies:mc::olmes"}}}, {"task": "mmlu_sociology:mc", "acc_raw": 0.24378109452736318, "acc_per_token": 0.24378109452736318, "acc_per_char": 0.24378109452736318, "correct_loss_raw": 1.4228800048875572, "incorrect_loss_raw": 1.421098598397984, "correct_loss_per_token": 1.4228800048875572, "incorrect_loss_per_token": 1.421098598397984, "correct_loss_per_char": 0.7114400024437786, "incorrect_loss_per_char": 0.710549299198992, "primary_score": 0.24378109452736318, "num_instances": 201, "task_config": {"task_name": "mmlu_sociology:mc", "task_core": "mmlu_sociology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "sociology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_sociology:mc::olmes"}}}, {"task": "mmlu_us_foreign_policy:mc", "acc_raw": 0.23, "acc_per_token": 0.23, "acc_per_char": 0.23, "correct_loss_raw": 1.434000769853592, "incorrect_loss_raw": 1.4196133653322858, "correct_loss_per_token": 1.434000769853592, "incorrect_loss_per_token": 1.4196133653322858, "correct_loss_per_char": 0.717000384926796, "incorrect_loss_per_char": 0.7098066826661429, "primary_score": 0.23, "num_instances": 100, "task_config": {"task_name": "mmlu_us_foreign_policy:mc", "task_core": "mmlu_us_foreign_policy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "us_foreign_policy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": 
"mmlu_us_foreign_policy:mc::olmes"}}}, {"task": "mmlu_virology:mc", "acc_raw": 0.3072289156626506, "acc_per_token": 0.3072289156626506, "acc_per_char": 0.3072289156626506, "correct_loss_raw": 1.4114296220871339, "incorrect_loss_raw": 1.4285837040847564, "correct_loss_per_token": 1.4114296220871339, "incorrect_loss_per_token": 1.4285837040847564, "correct_loss_per_char": 0.7057148110435669, "incorrect_loss_per_char": 0.7142918520423782, "primary_score": 0.3072289156626506, "num_instances": 166, "task_config": {"task_name": "mmlu_virology:mc", "task_core": "mmlu_virology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "virology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_virology:mc::olmes"}}}, {"task": "mmlu_world_religions:mc", "acc_raw": 0.25146198830409355, "acc_per_token": 0.25146198830409355, "acc_per_char": 0.25146198830409355, "correct_loss_raw": 1.3977750846517016, "incorrect_loss_raw": 1.4322511996442102, "correct_loss_per_token": 1.3977750846517016, "incorrect_loss_per_token": 1.4322511996442102, "correct_loss_per_char": 0.6988875423258508, "incorrect_loss_per_char": 0.7161255998221051, "primary_score": 0.25146198830409355, "num_instances": 171, "task_config": {"task_name": "mmlu_world_religions:mc", "task_core": "mmlu_world_religions", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "world_religions", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_world_religions:mc::olmes"}}}, {"task": "mmlu_abstract_algebra", "acc_raw": 0.16, "acc_per_token": 0.19, "acc_per_char": 0.19, "correct_loss_raw": 6.869614138007164, "incorrect_loss_raw": 5.123212166229885, "correct_loss_per_token": 1.8998175663358217, "incorrect_loss_per_token": 2.0451965660127027, "correct_loss_per_char": 0.7431339020727424, "incorrect_loss_per_char": 0.737700834289516, "acc_uncond": 0.26, "correct_loss_uncond": -9.973355069756508, "incorrect_loss_uncond": -9.873269970814388, "primary_score": 0.19, "num_instances": 100, "task_config": {"task_name": "mmlu_abstract_algebra", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_abstract_algebra:rc::olmes"}}}, {"task": "mmlu_anatomy", "acc_raw": 0.362962962962963, "acc_per_token": 0.34074074074074073, "acc_per_char": 0.3037037037037037, "correct_loss_raw": 18.050097020025607, "incorrect_loss_raw": 18.311192920767233, "correct_loss_per_token": 2.293636003848102, "incorrect_loss_per_token": 2.62485800422084, "correct_loss_per_char": 
0.5213529013542572, "incorrect_loss_per_char": 0.594342243889545, "acc_uncond": 0.3111111111111111, "correct_loss_uncond": -15.044037494394514, "incorrect_loss_uncond": -14.717101418824846, "primary_score": 0.3037037037037037, "num_instances": 135, "task_config": {"task_name": "mmlu_anatomy", "task_core": "mmlu_anatomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "anatomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_anatomy:rc::olmes"}}}, {"task": "mmlu_astronomy", "acc_raw": 0.3092105263157895, "acc_per_token": 0.375, "acc_per_char": 0.3881578947368421, "correct_loss_raw": 25.580662697749702, "incorrect_loss_raw": 25.1740071794443, "correct_loss_per_token": 2.4733851108936378, "incorrect_loss_per_token": 2.85147891723307, "correct_loss_per_char": 0.6145575749730166, "incorrect_loss_per_char": 0.6958959260665767, "acc_uncond": 0.40789473684210525, "correct_loss_uncond": -14.832323157277546, "incorrect_loss_uncond": -13.536962173487014, "primary_score": 0.3881578947368421, "num_instances": 152, "task_config": {"task_name": "mmlu_astronomy", "task_core": "mmlu_astronomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "astronomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_astronomy:rc::olmes"}}}, {"task": "mmlu_business_ethics", "acc_raw": 0.5, "acc_per_token": 0.43, "acc_per_char": 0.45, "correct_loss_raw": 23.453789367675782, "incorrect_loss_raw": 26.393909745216362, "correct_loss_per_token": 3.281713834419038, "incorrect_loss_per_token": 3.63527224490647, "correct_loss_per_char": 0.9186450543984302, "incorrect_loss_per_char": 0.9744966450427746, "acc_uncond": 0.36, "correct_loss_uncond": -12.153233604431152, "incorrect_loss_uncond": -11.242717363039652, "primary_score": 0.45, "num_instances": 100, "task_config": {"task_name": "mmlu_business_ethics", "task_core": "mmlu_business_ethics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "business_ethics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_business_ethics:rc::olmes"}}}, {"task": "mmlu_clinical_knowledge", "acc_raw": 0.2830188679245283, "acc_per_token": 0.38113207547169814, "acc_per_char": 0.38113207547169814, "correct_loss_raw": 22.123643714639375, "incorrect_loss_raw": 20.55967536917273, "correct_loss_per_token": 2.5528685467264762, "incorrect_loss_per_token": 2.8272442358128638, "correct_loss_per_char": 0.6015267454680685, "incorrect_loss_per_char": 0.6873710059581529, "acc_uncond": 0.3584905660377358, "correct_loss_uncond": -13.87818564142821, "incorrect_loss_uncond": 
-12.788745701837843, "primary_score": 0.38113207547169814, "num_instances": 265, "task_config": {"task_name": "mmlu_clinical_knowledge", "task_core": "mmlu_clinical_knowledge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "clinical_knowledge", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_clinical_knowledge:rc::olmes"}}}, {"task": "mmlu_college_biology", "acc_raw": 0.3333333333333333, "acc_per_token": 0.3333333333333333, "acc_per_char": 0.3819444444444444, "correct_loss_raw": 20.15562465869718, "incorrect_loss_raw": 21.83716755774286, "correct_loss_per_token": 2.5882522416235885, "incorrect_loss_per_token": 3.0003426710170906, "correct_loss_per_char": 0.4998466177829353, "incorrect_loss_per_char": 0.58942911369962, "acc_uncond": 0.3819444444444444, "correct_loss_uncond": -15.927579243150022, "incorrect_loss_uncond": -14.359596281139943, "primary_score": 0.3819444444444444, "num_instances": 144, "task_config": {"task_name": "mmlu_college_biology", "task_core": "mmlu_college_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_biology:rc::olmes"}}}, {"task": "mmlu_college_chemistry", "acc_raw": 0.26, "acc_per_token": 0.35, "acc_per_char": 0.26, "correct_loss_raw": 18.376699719429016, "incorrect_loss_raw": 18.637492257754, "correct_loss_per_token": 2.95496215366955, "incorrect_loss_per_token": 3.047901730301063, "correct_loss_per_char": 1.1798262270589766, "incorrect_loss_per_char": 1.1793778047394419, "acc_uncond": 0.25, "correct_loss_uncond": -12.02003538608551, "incorrect_loss_uncond": -11.522469477653503, "primary_score": 0.26, "num_instances": 100, "task_config": {"task_name": "mmlu_college_chemistry", "task_core": "mmlu_college_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_chemistry:rc::olmes"}}}, {"task": "mmlu_college_computer_science", "acc_raw": 0.32, "acc_per_token": 0.28, "acc_per_char": 0.27, "correct_loss_raw": 18.364038631916046, "incorrect_loss_raw": 17.90530202150345, "correct_loss_per_token": 2.6942068554548784, "incorrect_loss_per_token": 2.9695804003074513, "correct_loss_per_char": 0.8683093138928093, "incorrect_loss_per_char": 0.8950832082290698, "acc_uncond": 0.35, "correct_loss_uncond": -12.329349439144135, "incorrect_loss_uncond": -12.21822862386703, "primary_score": 0.27, "num_instances": 100, "task_config": {"task_name": "mmlu_college_computer_science", 
"task_core": "mmlu_college_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_computer_science:rc::olmes"}}}, {"task": "mmlu_college_mathematics", "acc_raw": 0.18, "acc_per_token": 0.25, "acc_per_char": 0.25, "correct_loss_raw": 11.714233223199845, "incorrect_loss_raw": 10.420037207206095, "correct_loss_per_token": 2.9405851893096635, "incorrect_loss_per_token": 2.885563433783357, "correct_loss_per_char": 1.2039596272554867, "incorrect_loss_per_char": 1.1564252400547155, "acc_uncond": 0.3, "correct_loss_uncond": -9.182509263753891, "incorrect_loss_uncond": -8.80835912267367, "primary_score": 0.25, "num_instances": 100, "task_config": {"task_name": "mmlu_college_mathematics", "task_core": "mmlu_college_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_mathematics:rc::olmes"}}}, {"task": "mmlu_college_medicine", "acc_raw": 0.2947976878612717, "acc_per_token": 0.3179190751445087, "acc_per_char": 0.28901734104046245, "correct_loss_raw": 20.329613751069658, "incorrect_loss_raw": 20.08376282901434, "correct_loss_per_token": 2.6962297764270766, "incorrect_loss_per_token": 2.848341829317948, "correct_loss_per_char": 0.6281389185459614, "incorrect_loss_per_char": 0.6688032917664197, "acc_uncond": 0.31213872832369943, "correct_loss_uncond": -13.599645053031127, "incorrect_loss_uncond": -13.151097627044416, "primary_score": 0.28901734104046245, "num_instances": 173, "task_config": {"task_name": "mmlu_college_medicine", "task_core": "mmlu_college_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_medicine:rc::olmes"}}}, {"task": "mmlu_college_physics", "acc_raw": 0.17647058823529413, "acc_per_token": 0.20588235294117646, "acc_per_char": 0.18627450980392157, "correct_loss_raw": 12.276802088700089, "incorrect_loss_raw": 10.65681253110661, "correct_loss_per_token": 2.6965595728334724, "incorrect_loss_per_token": 2.4891722812701556, "correct_loss_per_char": 1.078457387054979, "incorrect_loss_per_char": 0.9895847622831168, "acc_uncond": 0.2549019607843137, "correct_loss_uncond": -11.833522378229627, "incorrect_loss_uncond": -11.821572918907492, "primary_score": 0.18627450980392157, "num_instances": 102, "task_config": {"task_name": "mmlu_college_physics", "task_core": "mmlu_college_physics", "limit": null, 
"split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_physics:rc::olmes"}}}, {"task": "mmlu_computer_security", "acc_raw": 0.37, "acc_per_token": 0.42, "acc_per_char": 0.43, "correct_loss_raw": 22.6862280023098, "incorrect_loss_raw": 22.077150499820704, "correct_loss_per_token": 3.5369763457558765, "incorrect_loss_per_token": 4.166081396972171, "correct_loss_per_char": 0.8252690707760125, "incorrect_loss_per_char": 0.9430687283352381, "acc_uncond": 0.47, "correct_loss_uncond": -12.012285667657853, "incorrect_loss_uncond": -9.246938327948254, "primary_score": 0.43, "num_instances": 100, "task_config": {"task_name": "mmlu_computer_security", "task_core": "mmlu_computer_security", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "computer_security", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_computer_security:rc::olmes"}}}, {"task": "mmlu_conceptual_physics", "acc_raw": 0.44680851063829785, "acc_per_token": 0.39574468085106385, "acc_per_char": 0.3872340425531915, "correct_loss_raw": 9.180902650508475, "incorrect_loss_raw": 11.18278905656321, "correct_loss_per_token": 2.9558468813155776, "incorrect_loss_per_token": 3.725539554479406, "correct_loss_per_char": 0.6056277324516344, "incorrect_loss_per_char": 0.7330539230596994, "acc_uncond": 0.3191489361702128, "correct_loss_uncond": -11.380888761358058, "incorrect_loss_uncond": -9.883389157733163, "primary_score": 0.3872340425531915, "num_instances": 235, "task_config": {"task_name": "mmlu_conceptual_physics", "task_core": "mmlu_conceptual_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "conceptual_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_conceptual_physics:rc::olmes"}}}, {"task": "mmlu_econometrics", "acc_raw": 0.2719298245614035, "acc_per_token": 0.2807017543859649, "acc_per_char": 0.2719298245614035, "correct_loss_raw": 20.309982992055122, "incorrect_loss_raw": 21.030795758927777, "correct_loss_per_token": 2.264997538978219, "incorrect_loss_per_token": 2.1897822494700447, "correct_loss_per_char": 0.5448100617714338, "incorrect_loss_per_char": 0.5515671949345781, "acc_uncond": 0.2894736842105263, "correct_loss_uncond": -15.18037696261155, "incorrect_loss_uncond": -15.323047515244507, "primary_score": 0.2719298245614035, "num_instances": 114, "task_config": {"task_name": "mmlu_econometrics", "task_core": "mmlu_econometrics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", 
"random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "econometrics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_econometrics:rc::olmes"}}}, {"task": "mmlu_electrical_engineering", "acc_raw": 0.27586206896551724, "acc_per_token": 0.33793103448275863, "acc_per_char": 0.3310344827586207, "correct_loss_raw": 13.682885837554931, "incorrect_loss_raw": 13.75903522063945, "correct_loss_per_token": 3.3592418816663727, "incorrect_loss_per_token": 3.7358111697527363, "correct_loss_per_char": 0.9527622174152375, "incorrect_loss_per_char": 0.9747887331770683, "acc_uncond": 0.2689655172413793, "correct_loss_uncond": -9.166813426182188, "incorrect_loss_uncond": -9.70173498186572, "primary_score": 0.3310344827586207, "num_instances": 145, "task_config": {"task_name": "mmlu_electrical_engineering", "task_core": "mmlu_electrical_engineering", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "electrical_engineering", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_electrical_engineering:rc::olmes"}}}, {"task": "mmlu_elementary_mathematics", "acc_raw": 0.22486772486772486, "acc_per_token": 0.23809523809523808, "acc_per_char": 0.23015873015873015, "correct_loss_raw": 12.500810869471737, "incorrect_loss_raw": 12.490979615973412, "correct_loss_per_token": 4.041004668331812, "incorrect_loss_per_token": 4.105295090620753, "correct_loss_per_char": 1.598499868102426, "incorrect_loss_per_char": 1.6004921054290886, "acc_uncond": 0.25132275132275134, "correct_loss_uncond": -8.476502858457112, "incorrect_loss_uncond": -8.33731818367355, "primary_score": 0.23015873015873015, "num_instances": 378, "task_config": {"task_name": "mmlu_elementary_mathematics", "task_core": "mmlu_elementary_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "elementary_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_elementary_mathematics:rc::olmes"}}}, {"task": "mmlu_formal_logic", "acc_raw": 0.30952380952380953, "acc_per_token": 0.2698412698412698, "acc_per_char": 0.30158730158730157, "correct_loss_raw": 24.893998077937535, "incorrect_loss_raw": 26.133295801581532, "correct_loss_per_token": 2.7116675138348123, "incorrect_loss_per_token": 2.7241648390492945, "correct_loss_per_char": 1.1867459102371267, "incorrect_loss_per_char": 1.2454240104438674, "acc_uncond": 0.2222222222222222, "correct_loss_uncond": -25.99389488734896, "incorrect_loss_uncond": -26.22075840468129, "primary_score": 0.30158730158730157, "num_instances": 126, "task_config": {"task_name": "mmlu_formal_logic", "task_core": "mmlu_formal_logic", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, 
"primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "formal_logic", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_formal_logic:rc::olmes"}}}, {"task": "mmlu_global_facts", "acc_raw": 0.27, "acc_per_token": 0.24, "acc_per_char": 0.25, "correct_loss_raw": 8.278837785720825, "incorrect_loss_raw": 8.835856211582822, "correct_loss_per_token": 2.8056958920990214, "incorrect_loss_per_token": 2.8483650020497873, "correct_loss_per_char": 1.1085497451363076, "incorrect_loss_per_char": 1.1334918764757753, "acc_uncond": 0.2, "correct_loss_uncond": -6.920755653381348, "incorrect_loss_uncond": -7.079783718188605, "primary_score": 0.25, "num_instances": 100, "task_config": {"task_name": "mmlu_global_facts", "task_core": "mmlu_global_facts", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "global_facts", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_global_facts:rc::olmes"}}}, {"task": "mmlu_high_school_biology", "acc_raw": 0.35161290322580646, "acc_per_token": 0.4032258064516129, "acc_per_char": 0.4064516129032258, "correct_loss_raw": 21.607249181885873, "incorrect_loss_raw": 22.094821071624768, "correct_loss_per_token": 2.546654726542327, "incorrect_loss_per_token": 2.941898022847176, "correct_loss_per_char": 0.5462745469798623, "incorrect_loss_per_char": 0.5900671546020984, "acc_uncond": 0.3870967741935484, "correct_loss_uncond": -13.925496973145393, "incorrect_loss_uncond": -12.674920809653498, "primary_score": 0.4064516129032258, "num_instances": 310, "task_config": {"task_name": "mmlu_high_school_biology", "task_core": "mmlu_high_school_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_biology:rc::olmes"}}}, {"task": "mmlu_high_school_chemistry", "acc_raw": 0.20689655172413793, "acc_per_token": 0.24630541871921183, "acc_per_char": 0.2413793103448276, "correct_loss_raw": 20.41424568592034, "incorrect_loss_raw": 18.734923248807792, "correct_loss_per_token": 2.638010876369508, "incorrect_loss_per_token": 2.577175435616267, "correct_loss_per_char": 0.893972889317772, "incorrect_loss_per_char": 0.8817604139892572, "acc_uncond": 0.3103448275862069, "correct_loss_uncond": -13.46020593960297, "incorrect_loss_uncond": -13.278387852294497, "primary_score": 0.2413793103448276, "num_instances": 203, "task_config": {"task_name": "mmlu_high_school_chemistry", "task_core": "mmlu_high_school_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, 
"generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_chemistry:rc::olmes"}}}, {"task": "mmlu_high_school_computer_science", "acc_raw": 0.25, "acc_per_token": 0.27, "acc_per_char": 0.31, "correct_loss_raw": 24.24228214800358, "incorrect_loss_raw": 24.508576377630238, "correct_loss_per_token": 2.729215729422765, "incorrect_loss_per_token": 2.921161604473935, "correct_loss_per_char": 0.8552203291511253, "incorrect_loss_per_char": 0.920918109466471, "acc_uncond": 0.27, "correct_loss_uncond": -15.800592300295829, "incorrect_loss_uncond": -15.446470299959188, "primary_score": 0.31, "num_instances": 100, "task_config": {"task_name": "mmlu_high_school_computer_science", "task_core": "mmlu_high_school_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_computer_science:rc::olmes"}}}, {"task": "mmlu_high_school_european_history", "acc_raw": 0.2909090909090909, "acc_per_token": 0.4, "acc_per_char": 0.4121212121212121, "correct_loss_raw": 28.99259257966822, "incorrect_loss_raw": 28.175486922023275, "correct_loss_per_token": 2.664430461905858, "incorrect_loss_per_token": 3.1671600496326557, "correct_loss_per_char": 0.48344800734399823, "incorrect_loss_per_char": 0.5706152578899196, "acc_uncond": 0.4, "correct_loss_uncond": -15.019520569570137, "incorrect_loss_uncond": -13.166462616005328, "primary_score": 0.4121212121212121, "num_instances": 165, "task_config": {"task_name": "mmlu_high_school_european_history", "task_core": "mmlu_high_school_european_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_european_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_european_history:rc::olmes"}}}, {"task": "mmlu_high_school_geography", "acc_raw": 0.35353535353535354, "acc_per_token": 0.41414141414141414, "acc_per_char": 0.398989898989899, "correct_loss_raw": 14.893016030391058, "incorrect_loss_raw": 15.203946114589865, "correct_loss_per_token": 3.2453381852729546, "incorrect_loss_per_token": 3.7009779211436875, "correct_loss_per_char": 0.6177910502396351, "incorrect_loss_per_char": 0.7281685944975731, "acc_uncond": 0.45454545454545453, "correct_loss_uncond": -10.973916836158194, "incorrect_loss_uncond": -9.519376937588463, "primary_score": 0.398989898989899, "num_instances": 198, "task_config": {"task_name": "mmlu_high_school_geography", "task_core": "mmlu_high_school_geography", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, 
"context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_geography", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_geography:rc::olmes"}}}, {"task": "mmlu_high_school_government_and_politics", "acc_raw": 0.37823834196891193, "acc_per_token": 0.41450777202072536, "acc_per_char": 0.42487046632124353, "correct_loss_raw": 22.53443668095559, "incorrect_loss_raw": 23.97239570872978, "correct_loss_per_token": 2.3713786314612118, "incorrect_loss_per_token": 2.839619713869773, "correct_loss_per_char": 0.40028085378000405, "incorrect_loss_per_char": 0.4791517659915154, "acc_uncond": 0.43523316062176165, "correct_loss_uncond": -16.59763565412457, "incorrect_loss_uncond": -13.856588512298767, "primary_score": 0.42487046632124353, "num_instances": 193, "task_config": {"task_name": "mmlu_high_school_government_and_politics", "task_core": "mmlu_high_school_government_and_politics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_government_and_politics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_government_and_politics:rc::olmes"}}}, {"task": "mmlu_high_school_macroeconomics", "acc_raw": 0.2564102564102564, "acc_per_token": 0.3128205128205128, "acc_per_char": 0.2846153846153846, "correct_loss_raw": 23.32332377005846, "incorrect_loss_raw": 23.13647856590075, "correct_loss_per_token": 2.8958814854123323, "incorrect_loss_per_token": 3.039365654909822, "correct_loss_per_char": 0.6132319119590228, "incorrect_loss_per_char": 0.6297969417394456, "acc_uncond": 0.31794871794871793, "correct_loss_uncond": -16.318520267804463, "incorrect_loss_uncond": -15.51478351364787, "primary_score": 0.2846153846153846, "num_instances": 390, "task_config": {"task_name": "mmlu_high_school_macroeconomics", "task_core": "mmlu_high_school_macroeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_macroeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_macroeconomics:rc::olmes"}}}, {"task": "mmlu_high_school_mathematics", "acc_raw": 0.13333333333333333, "acc_per_token": 0.16296296296296298, "acc_per_char": 0.15555555555555556, "correct_loss_raw": 9.204976223133228, "incorrect_loss_raw": 8.08783481268235, "correct_loss_per_token": 4.325949969449563, "incorrect_loss_per_token": 4.017497469068086, "correct_loss_per_char": 1.760739348124492, "incorrect_loss_per_char": 1.6445159907803975, "acc_uncond": 0.25555555555555554, "correct_loss_uncond": -5.782799828494037, "incorrect_loss_uncond": -5.6601883399633754, "primary_score": 0.15555555555555556, "num_instances": 270, "task_config": {"task_name": "mmlu_high_school_mathematics", "task_core": 
"mmlu_high_school_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_mathematics:rc::olmes"}}}, {"task": "mmlu_high_school_microeconomics", "acc_raw": 0.29831932773109243, "acc_per_token": 0.3277310924369748, "acc_per_char": 0.3445378151260504, "correct_loss_raw": 28.13104149573991, "incorrect_loss_raw": 27.048356812875138, "correct_loss_per_token": 3.0403787622264153, "incorrect_loss_per_token": 3.18991426331285, "correct_loss_per_char": 0.6511666080516382, "incorrect_loss_per_char": 0.6694161241463367, "acc_uncond": 0.29411764705882354, "correct_loss_uncond": -16.016987619279814, "incorrect_loss_uncond": -15.557740727058636, "primary_score": 0.3445378151260504, "num_instances": 238, "task_config": {"task_name": "mmlu_high_school_microeconomics", "task_core": "mmlu_high_school_microeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_microeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_microeconomics:rc::olmes"}}}, {"task": "mmlu_high_school_physics", "acc_raw": 0.2185430463576159, "acc_per_token": 0.24503311258278146, "acc_per_char": 0.25165562913907286, "correct_loss_raw": 21.145936244370922, "incorrect_loss_raw": 20.615255395571385, "correct_loss_per_token": 2.4163758161593165, "incorrect_loss_per_token": 2.3898283100977675, "correct_loss_per_char": 0.8514601195880208, "incorrect_loss_per_char": 0.8441124034745007, "acc_uncond": 0.2582781456953642, "correct_loss_uncond": -15.770635069600793, "incorrect_loss_uncond": -15.784137553463449, "primary_score": 0.25165562913907286, "num_instances": 151, "task_config": {"task_name": "mmlu_high_school_physics", "task_core": "mmlu_high_school_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_physics:rc::olmes"}}}, {"task": "mmlu_high_school_psychology", "acc_raw": 0.44770642201834865, "acc_per_token": 0.44220183486238535, "acc_per_char": 0.43669724770642204, "correct_loss_raw": 15.492815904715739, "incorrect_loss_raw": 17.66775804688806, "correct_loss_per_token": 3.2219814424440893, "incorrect_loss_per_token": 4.018716729662848, "correct_loss_per_char": 0.5360055991624847, "incorrect_loss_per_char": 0.6700789690531939, "acc_uncond": 0.41467889908256883, "correct_loss_uncond": -13.453667054646607, "incorrect_loss_uncond": -11.540014688975951, "primary_score": 0.43669724770642204, 
"num_instances": 545, "task_config": {"task_name": "mmlu_high_school_psychology", "task_core": "mmlu_high_school_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_psychology:rc::olmes"}}}, {"task": "mmlu_high_school_statistics", "acc_raw": 0.28703703703703703, "acc_per_token": 0.3287037037037037, "acc_per_char": 0.3055555555555556, "correct_loss_raw": 26.200835422785193, "incorrect_loss_raw": 27.084853073697037, "correct_loss_per_token": 2.6154795940156275, "incorrect_loss_per_token": 2.7054257533621136, "correct_loss_per_char": 0.7976069811799237, "incorrect_loss_per_char": 0.8407188293755185, "acc_uncond": 0.30092592592592593, "correct_loss_uncond": -17.29601466159026, "incorrect_loss_uncond": -16.46438581119348, "primary_score": 0.3055555555555556, "num_instances": 216, "task_config": {"task_name": "mmlu_high_school_statistics", "task_core": "mmlu_high_school_statistics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_statistics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_statistics:rc::olmes"}}}, {"task": "mmlu_high_school_us_history", "acc_raw": 0.31862745098039214, "acc_per_token": 0.31862745098039214, "acc_per_char": 0.37254901960784315, "correct_loss_raw": 26.618980217213725, "incorrect_loss_raw": 27.263960099103404, "correct_loss_per_token": 2.588034789381987, "incorrect_loss_per_token": 2.8024372832035565, "correct_loss_per_char": 0.4853787438826697, "incorrect_loss_per_char": 0.5319232386880215, "acc_uncond": 0.3382352941176471, "correct_loss_uncond": -13.67589058011186, "incorrect_loss_uncond": -12.063057624241877, "primary_score": 0.37254901960784315, "num_instances": 204, "task_config": {"task_name": "mmlu_high_school_us_history", "task_core": "mmlu_high_school_us_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_us_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_us_history:rc::olmes"}}}, {"task": "mmlu_high_school_world_history", "acc_raw": 0.28270042194092826, "acc_per_token": 0.350210970464135, "acc_per_char": 0.29535864978902954, "correct_loss_raw": 31.26888810431404, "incorrect_loss_raw": 30.740597895932098, "correct_loss_per_token": 2.928289248436712, "incorrect_loss_per_token": 3.231146465106264, "correct_loss_per_char": 0.5425519997491743, "incorrect_loss_per_char": 0.5788576879738572, "acc_uncond": 0.3755274261603376, "correct_loss_uncond": -13.914379819033016, 
"incorrect_loss_uncond": -12.460183474249638, "primary_score": 0.29535864978902954, "num_instances": 237, "task_config": {"task_name": "mmlu_high_school_world_history", "task_core": "mmlu_high_school_world_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_world_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_world_history:rc::olmes"}}}, {"task": "mmlu_human_aging", "acc_raw": 0.3901345291479821, "acc_per_token": 0.39461883408071746, "acc_per_char": 0.3901345291479821, "correct_loss_raw": 13.265995577579123, "incorrect_loss_raw": 15.933169311472115, "correct_loss_per_token": 3.1792583123113225, "incorrect_loss_per_token": 3.6824491460763893, "correct_loss_per_char": 0.5931539026499014, "incorrect_loss_per_char": 0.7213349821239929, "acc_uncond": 0.35874439461883406, "correct_loss_uncond": -9.616355861516277, "incorrect_loss_uncond": -8.394420486929164, "primary_score": 0.3901345291479821, "num_instances": 223, "task_config": {"task_name": "mmlu_human_aging", "task_core": "mmlu_human_aging", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_aging", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_aging:rc::olmes"}}}, {"task": "mmlu_human_sexuality", "acc_raw": 0.33587786259541985, "acc_per_token": 0.3969465648854962, "acc_per_char": 0.35877862595419846, "correct_loss_raw": 15.421348831125798, "incorrect_loss_raw": 17.013223619558115, "correct_loss_per_token": 3.232259738158243, "incorrect_loss_per_token": 3.8039631588523775, "correct_loss_per_char": 0.6779892372553639, "incorrect_loss_per_char": 0.7219905285426741, "acc_uncond": 0.2595419847328244, "correct_loss_uncond": -11.125376795084422, "incorrect_loss_uncond": -11.816082447843085, "primary_score": 0.35877862595419846, "num_instances": 131, "task_config": {"task_name": "mmlu_human_sexuality", "task_core": "mmlu_human_sexuality", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_sexuality", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_sexuality:rc::olmes"}}}, {"task": "mmlu_international_law", "acc_raw": 0.2066115702479339, "acc_per_token": 0.32231404958677684, "acc_per_char": 0.3140495867768595, "correct_loss_raw": 49.143299920499814, "incorrect_loss_raw": 35.66231494435924, "correct_loss_per_token": 2.4919706925454244, "incorrect_loss_per_token": 2.6574130665678393, "correct_loss_per_char": 0.45580109453343454, "incorrect_loss_per_char": 0.47563891910474376, "acc_uncond": 0.4132231404958678, "correct_loss_uncond": -25.122074539011177, 
"incorrect_loss_uncond": -22.341709102809265, "primary_score": 0.3140495867768595, "num_instances": 121, "task_config": {"task_name": "mmlu_international_law", "task_core": "mmlu_international_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "international_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_international_law:rc::olmes"}}}, {"task": "mmlu_jurisprudence", "acc_raw": 0.2037037037037037, "acc_per_token": 0.23148148148148148, "acc_per_char": 0.26851851851851855, "correct_loss_raw": 28.849514014191097, "incorrect_loss_raw": 23.601570882914984, "correct_loss_per_token": 3.4083927781709167, "incorrect_loss_per_token": 3.690943605943219, "correct_loss_per_char": 0.6617516235075503, "incorrect_loss_per_char": 0.6789110459237553, "acc_uncond": 0.3148148148148148, "correct_loss_uncond": -13.126620374344013, "incorrect_loss_uncond": -12.157751857498544, "primary_score": 0.26851851851851855, "num_instances": 108, "task_config": {"task_name": "mmlu_jurisprudence", "task_core": "mmlu_jurisprudence", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "jurisprudence", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_jurisprudence:rc::olmes"}}}, {"task": "mmlu_logical_fallacies", "acc_raw": 0.3128834355828221, "acc_per_token": 0.32515337423312884, "acc_per_char": 0.32515337423312884, "correct_loss_raw": 25.324365175574837, "incorrect_loss_raw": 24.82704315205051, "correct_loss_per_token": 3.6489869125370564, "incorrect_loss_per_token": 3.9748963475403993, "correct_loss_per_char": 0.6471265034347871, "incorrect_loss_per_char": 0.727814055247372, "acc_uncond": 0.3496932515337423, "correct_loss_uncond": -12.140092412386934, "incorrect_loss_uncond": -10.665011691657075, "primary_score": 0.32515337423312884, "num_instances": 163, "task_config": {"task_name": "mmlu_logical_fallacies", "task_core": "mmlu_logical_fallacies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "logical_fallacies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_logical_fallacies:rc::olmes"}}}, {"task": "mmlu_machine_learning", "acc_raw": 0.26785714285714285, "acc_per_token": 0.23214285714285715, "acc_per_char": 0.2767857142857143, "correct_loss_raw": 19.73031181735652, "incorrect_loss_raw": 19.962696756635385, "correct_loss_per_token": 3.962941681372349, "incorrect_loss_per_token": 3.890866119612744, "correct_loss_per_char": 1.019208526942912, "incorrect_loss_per_char": 1.0051290247251965, "acc_uncond": 0.25, "correct_loss_uncond": -7.80760141994272, "incorrect_loss_uncond": 
-7.384287642581122, "primary_score": 0.2767857142857143, "num_instances": 112, "task_config": {"task_name": "mmlu_machine_learning", "task_core": "mmlu_machine_learning", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "machine_learning", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_machine_learning:rc::olmes"}}}, {"task": "mmlu_management", "acc_raw": 0.30097087378640774, "acc_per_token": 0.4077669902912621, "acc_per_char": 0.4368932038834951, "correct_loss_raw": 14.372589470692052, "incorrect_loss_raw": 14.791216655842307, "correct_loss_per_token": 3.7546848435269626, "incorrect_loss_per_token": 4.193690584315543, "correct_loss_per_char": 0.6444644352395216, "incorrect_loss_per_char": 0.7096893629869219, "acc_uncond": 0.42718446601941745, "correct_loss_uncond": -9.07513473276953, "incorrect_loss_uncond": -7.9406714099896405, "primary_score": 0.4368932038834951, "num_instances": 103, "task_config": {"task_name": "mmlu_management", "task_core": "mmlu_management", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "management", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_management:rc::olmes"}}}, {"task": "mmlu_marketing", "acc_raw": 0.49572649572649574, "acc_per_token": 0.49572649572649574, "acc_per_char": 0.49572649572649574, "correct_loss_raw": 13.834066818412552, "incorrect_loss_raw": 16.770095258017214, "correct_loss_per_token": 2.8060586235688953, "incorrect_loss_per_token": 3.6331203108692036, "correct_loss_per_char": 0.5785293789824133, "incorrect_loss_per_char": 0.7565314767293014, "acc_uncond": 0.5085470085470085, "correct_loss_uncond": -12.960387743945814, "incorrect_loss_uncond": -10.329098611136104, "primary_score": 0.49572649572649574, "num_instances": 234, "task_config": {"task_name": "mmlu_marketing", "task_core": "mmlu_marketing", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "marketing", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_marketing:rc::olmes"}}}, {"task": "mmlu_medical_genetics", "acc_raw": 0.35, "acc_per_token": 0.38, "acc_per_char": 0.4, "correct_loss_raw": 15.001992144584655, "incorrect_loss_raw": 14.109933145046236, "correct_loss_per_token": 2.5846105495765688, "incorrect_loss_per_token": 2.9764702960370695, "correct_loss_per_char": 0.6665677303876627, "incorrect_loss_per_char": 0.7574390813948088, "acc_uncond": 0.37, "correct_loss_uncond": -13.242882170677184, "incorrect_loss_uncond": -11.95330838123957, "primary_score": 0.4, "num_instances": 100, "task_config": {"task_name": "mmlu_medical_genetics", "task_core": 
"mmlu_medical_genetics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "medical_genetics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_medical_genetics:rc::olmes"}}}, {"task": "mmlu_miscellaneous", "acc_raw": 0.4661558109833972, "acc_per_token": 0.4508301404853129, "acc_per_char": 0.4648786717752235, "correct_loss_raw": 10.563235344767266, "incorrect_loss_raw": 12.620073988900485, "correct_loss_per_token": 3.094379709762898, "incorrect_loss_per_token": 4.042740800844056, "correct_loss_per_char": 0.6668340255124663, "incorrect_loss_per_char": 0.8802967710188774, "acc_uncond": 0.4840357598978289, "correct_loss_uncond": -10.465876538729912, "incorrect_loss_uncond": -8.39672588913833, "primary_score": 0.4648786717752235, "num_instances": 783, "task_config": {"task_name": "mmlu_miscellaneous", "task_core": "mmlu_miscellaneous", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "miscellaneous", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_miscellaneous:rc::olmes"}}}, {"task": "mmlu_moral_disputes", "acc_raw": 0.2745664739884393, "acc_per_token": 0.30346820809248554, "acc_per_char": 0.25722543352601157, "correct_loss_raw": 28.29641576033796, "incorrect_loss_raw": 25.717709578071222, "correct_loss_per_token": 3.1054814251087093, "incorrect_loss_per_token": 3.2459228834051115, "correct_loss_per_char": 0.6142554110442542, "incorrect_loss_per_char": 0.6158678736943969, "acc_uncond": 0.3208092485549133, "correct_loss_uncond": -14.065454878559002, "incorrect_loss_uncond": -13.653906957262524, "primary_score": 0.25722543352601157, "num_instances": 346, "task_config": {"task_name": "mmlu_moral_disputes", "task_core": "mmlu_moral_disputes", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_disputes", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_disputes:rc::olmes"}}}, {"task": "mmlu_moral_scenarios", "acc_raw": 0.23798882681564246, "acc_per_token": 0.23798882681564246, "acc_per_char": 0.23798882681564246, "correct_loss_raw": 1.8738852696378803, "incorrect_loss_raw": 1.8248078697664563, "correct_loss_per_token": 0.4412940012542896, "incorrect_loss_per_token": 0.43430041784754714, "correct_loss_per_char": 0.10404536119186, "incorrect_loss_per_char": 0.10233466985782204, "acc_uncond": 0.25251396648044694, "correct_loss_uncond": -19.073117448131466, "incorrect_loss_uncond": -18.97169191604218, "primary_score": 0.23798882681564246, "num_instances": 895, "task_config": {"task_name": "mmlu_moral_scenarios", "task_core": "mmlu_moral_scenarios", "limit": null, 
"split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_scenarios", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_scenarios:rc::olmes"}}}, {"task": "mmlu_nutrition", "acc_raw": 0.27450980392156865, "acc_per_token": 0.3366013071895425, "acc_per_char": 0.3235294117647059, "correct_loss_raw": 26.88473216458863, "incorrect_loss_raw": 23.947185122369444, "correct_loss_per_token": 2.7123445598352913, "incorrect_loss_per_token": 2.9259598929731725, "correct_loss_per_char": 0.5916553396222153, "incorrect_loss_per_char": 0.6382808079213305, "acc_uncond": 0.34967320261437906, "correct_loss_uncond": -11.458059841511297, "incorrect_loss_uncond": -11.157264869197515, "primary_score": 0.3235294117647059, "num_instances": 306, "task_config": {"task_name": "mmlu_nutrition", "task_core": "mmlu_nutrition", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "nutrition", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_nutrition:rc::olmes"}}}, {"task": "mmlu_philosophy", "acc_raw": 0.2733118971061093, "acc_per_token": 0.2604501607717042, "acc_per_char": 0.2861736334405145, "correct_loss_raw": 24.160437655602237, "incorrect_loss_raw": 22.095235615553445, "correct_loss_per_token": 3.456400314243523, "incorrect_loss_per_token": 3.523076761974967, "correct_loss_per_char": 0.6824404975617542, "incorrect_loss_per_char": 0.6849178703389117, "acc_uncond": 0.3247588424437299, "correct_loss_uncond": -12.740601260945727, "incorrect_loss_uncond": -12.200350506895989, "primary_score": 0.2861736334405145, "num_instances": 311, "task_config": {"task_name": "mmlu_philosophy", "task_core": "mmlu_philosophy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "philosophy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_philosophy:rc::olmes"}}}, {"task": "mmlu_prehistory", "acc_raw": 0.3950617283950617, "acc_per_token": 0.3549382716049383, "acc_per_char": 0.3425925925925926, "correct_loss_raw": 21.544764254840068, "incorrect_loss_raw": 23.408556957671667, "correct_loss_per_token": 2.719362407400458, "incorrect_loss_per_token": 3.061571254592011, "correct_loss_per_char": 0.6246061464464701, "incorrect_loss_per_char": 0.6890525985713941, "acc_uncond": 0.3395061728395062, "correct_loss_uncond": -15.288531370093057, "incorrect_loss_uncond": -14.466170308153329, "primary_score": 0.3425925925925926, "num_instances": 324, "task_config": {"task_name": "mmlu_prehistory", "task_core": "mmlu_prehistory", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", 
"random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "prehistory", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_prehistory:rc::olmes"}}}, {"task": "mmlu_professional_accounting", "acc_raw": 0.25886524822695034, "acc_per_token": 0.24113475177304963, "acc_per_char": 0.24113475177304963, "correct_loss_raw": 26.63527681641545, "incorrect_loss_raw": 26.714282638355925, "correct_loss_per_token": 3.1146707298600944, "incorrect_loss_per_token": 3.1343855241720178, "correct_loss_per_char": 0.8344222590911455, "incorrect_loss_per_char": 0.855965238549494, "acc_uncond": 0.23404255319148937, "correct_loss_uncond": -12.230710915639891, "incorrect_loss_uncond": -11.857481007880352, "primary_score": 0.24113475177304963, "num_instances": 282, "task_config": {"task_name": "mmlu_professional_accounting", "task_core": "mmlu_professional_accounting", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_accounting", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_accounting:rc::olmes"}}}, {"task": "mmlu_professional_law", "acc_raw": 0.2470664928292047, "acc_per_token": 0.27249022164276404, "acc_per_char": 0.28096479791395046, "correct_loss_raw": 42.60308619288453, "incorrect_loss_raw": 40.950438800893174, "correct_loss_per_token": 2.3378126243416184, "incorrect_loss_per_token": 2.344101123782433, "correct_loss_per_char": 0.46450144197903787, "incorrect_loss_per_char": 0.46480584545610665, "acc_uncond": 0.27835723598435463, "correct_loss_uncond": -26.366236305765256, "incorrect_loss_uncond": -25.505998810469052, "primary_score": 0.28096479791395046, "num_instances": 1534, "task_config": {"task_name": "mmlu_professional_law", "task_core": "mmlu_professional_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_law:rc::olmes"}}}, {"task": "mmlu_professional_medicine", "acc_raw": 0.3014705882352941, "acc_per_token": 0.3088235294117647, "acc_per_char": 0.3125, "correct_loss_raw": 14.52855256068356, "incorrect_loss_raw": 15.382910631450951, "correct_loss_per_token": 2.5904497607872914, "incorrect_loss_per_token": 2.7594740984440853, "correct_loss_per_char": 0.5286169810304566, "incorrect_loss_per_char": 0.5743104560037122, "acc_uncond": 0.35294117647058826, "correct_loss_uncond": -11.053912863573608, "incorrect_loss_uncond": -10.347978409598849, "primary_score": 0.3125, "num_instances": 272, "task_config": {"task_name": "mmlu_professional_medicine", "task_core": "mmlu_professional_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": 
"acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_medicine:rc::olmes"}}}, {"task": "mmlu_professional_psychology", "acc_raw": 0.29411764705882354, "acc_per_token": 0.31862745098039214, "acc_per_char": 0.30392156862745096, "correct_loss_raw": 25.987278289216405, "incorrect_loss_raw": 26.76041926300733, "correct_loss_per_token": 3.3580263735069833, "incorrect_loss_per_token": 3.6368981728185092, "correct_loss_per_char": 0.6099669233283038, "incorrect_loss_per_char": 0.6562401783781975, "acc_uncond": 0.315359477124183, "correct_loss_uncond": -15.71813710457554, "incorrect_loss_uncond": -14.971442831659884, "primary_score": 0.30392156862745096, "num_instances": 612, "task_config": {"task_name": "mmlu_professional_psychology", "task_core": "mmlu_professional_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_psychology:rc::olmes"}}}, {"task": "mmlu_public_relations", "acc_raw": 0.45454545454545453, "acc_per_token": 0.36363636363636365, "acc_per_char": 0.3090909090909091, "correct_loss_raw": 14.288992928916757, "incorrect_loss_raw": 16.822460697275222, "correct_loss_per_token": 4.1535559337992884, "incorrect_loss_per_token": 4.676294730493403, "correct_loss_per_char": 0.7359015006541597, "incorrect_loss_per_char": 0.7919844297755388, "acc_uncond": 0.32727272727272727, "correct_loss_uncond": -9.454623821106823, "incorrect_loss_uncond": -8.436705364241742, "primary_score": 0.3090909090909091, "num_instances": 110, "task_config": {"task_name": "mmlu_public_relations", "task_core": "mmlu_public_relations", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "public_relations", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_public_relations:rc::olmes"}}}, {"task": "mmlu_security_studies", "acc_raw": 0.31020408163265306, "acc_per_token": 0.30612244897959184, "acc_per_char": 0.2571428571428571, "correct_loss_raw": 90.24697107392915, "incorrect_loss_raw": 99.85066714189485, "correct_loss_per_token": 3.2406289145919915, "incorrect_loss_per_token": 3.150665943844446, "correct_loss_per_char": 0.6222363331286122, "incorrect_loss_per_char": 0.5721624972013643, "acc_uncond": 0.2693877551020408, "correct_loss_uncond": -16.94638129837659, "incorrect_loss_uncond": -19.379616625130595, "primary_score": 0.2571428571428571, "num_instances": 245, "task_config": {"task_name": "mmlu_security_studies", "task_core": "mmlu_security_studies", "limit": null, "split": "test", "num_shots": 5, 
"fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "security_studies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_security_studies:rc::olmes"}}}, {"task": "mmlu_sociology", "acc_raw": 0.31343283582089554, "acc_per_token": 0.31840796019900497, "acc_per_char": 0.25870646766169153, "correct_loss_raw": 31.317165066353716, "incorrect_loss_raw": 31.61777329365807, "correct_loss_per_token": 3.4413731654544994, "incorrect_loss_per_token": 3.6675033583865777, "correct_loss_per_char": 0.5888368911897749, "incorrect_loss_per_char": 0.6072444322793061, "acc_uncond": 0.43283582089552236, "correct_loss_uncond": -14.812071022109606, "incorrect_loss_uncond": -13.896083878838208, "primary_score": 0.25870646766169153, "num_instances": 201, "task_config": {"task_name": "mmlu_sociology", "task_core": "mmlu_sociology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "sociology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_sociology:rc::olmes"}}}, {"task": "mmlu_us_foreign_policy", "acc_raw": 0.28, "acc_per_token": 0.35, "acc_per_char": 0.34, "correct_loss_raw": 22.961134161949158, "incorrect_loss_raw": 21.269579972426094, "correct_loss_per_token": 2.646917520621951, "incorrect_loss_per_token": 2.967826551998018, "correct_loss_per_char": 0.5221212054711434, "incorrect_loss_per_char": 0.5597690716018958, "acc_uncond": 0.4, "correct_loss_uncond": -13.380189175605773, "incorrect_loss_uncond": -12.273421669801078, "primary_score": 0.34, "num_instances": 100, "task_config": {"task_name": "mmlu_us_foreign_policy", "task_core": "mmlu_us_foreign_policy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "us_foreign_policy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_us_foreign_policy:rc::olmes"}}}, {"task": "mmlu_virology", "acc_raw": 0.24096385542168675, "acc_per_token": 0.3674698795180723, "acc_per_char": 0.3313253012048193, "correct_loss_raw": 18.831141976706952, "incorrect_loss_raw": 18.900941733375614, "correct_loss_per_token": 3.2626452048840506, "incorrect_loss_per_token": 3.6867306698044504, "correct_loss_per_char": 0.6558878277864738, "incorrect_loss_per_char": 0.7173650237459901, "acc_uncond": 0.27710843373493976, "correct_loss_uncond": -9.973504549767597, "incorrect_loss_uncond": -9.532383361973443, "primary_score": 0.3313253012048193, "num_instances": 166, "task_config": {"task_name": "mmlu_virology", "task_core": "mmlu_virology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, 
"metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "virology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_virology:rc::olmes"}}}, {"task": "mmlu_world_religions", "acc_raw": 0.38596491228070173, "acc_per_token": 0.42105263157894735, "acc_per_char": 0.4093567251461988, "correct_loss_raw": 9.84376669627184, "incorrect_loss_raw": 10.777373207359055, "correct_loss_per_token": 2.930561060974347, "incorrect_loss_per_token": 3.6832057014687414, "correct_loss_per_char": 0.7892170156977061, "incorrect_loss_per_char": 0.9457867980879529, "acc_uncond": 0.4678362573099415, "correct_loss_uncond": -9.483134711694996, "incorrect_loss_uncond": -7.820731720380616, "primary_score": 0.4093567251461988, "num_instances": 171, "task_config": {"task_name": "mmlu_world_religions", "task_core": "mmlu_world_religions", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "world_religions", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_world_religions:rc::olmes"}}}], "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000"}}