|
{"task_name": "mmlu:mc::olmes", "task_hash": "f0f05cd4953d75d76242750a66e32adb", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu:mc::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 57, "description": "Aggregate metric", "alias": "mmlu:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 754.1312301158905, "current_date": "2025-01-28 22:35:02 UTC", "num_instances": 14042, "beaker_info": {}, "metrics": {"acc_per_token_micro": 0.26071784646061813, "acc_per_token_macro": 0.2652013554745296, "correct_loss_per_char_micro": 0.7111146623864845, "correct_loss_per_char_macro": 0.7122749918694099, "incorrect_loss_raw_micro": 1.4371644482479946, "incorrect_loss_raw_macro": 1.4411920306088446, "acc_per_char_micro": 0.26071784646061813, "acc_per_char_macro": 0.2652013554745296, "primary_score_micro": 0.26071784646061813, "primary_score_macro": 0.2652013554745296, "incorrect_loss_per_token_micro": 1.4371644482479946, "incorrect_loss_per_token_macro": 1.4411920306088446, "correct_loss_raw_micro": 1.422229324772969, "correct_loss_raw_macro": 1.4245499837388198, "acc_raw_micro": 0.26071784646061813, "acc_raw_macro": 0.2652013554745296, "incorrect_loss_per_char_micro": 0.7185822241239973, "incorrect_loss_per_char_macro": 0.7205960153044223, "correct_loss_per_token_micro": 1.422229324772969, "correct_loss_per_token_macro": 1.4245499837388198, "primary_score": 0.2652013554745296}, "task_idx": null} |
|
{"task_name": "mmlu:rc::olmes", "task_hash": "d3fcbcac54951cec9ca2867583e71aa6", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu:rc::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 57, "description": "Aggregate metric", "alias": "mmlu:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1712.0159969329834, "current_date": "2025-01-28 22:47:37 UTC", "num_instances": 14042, "beaker_info": {}, "metrics": {"acc_uncond_micro": 0.33314342686227033, "acc_uncond_macro": 0.33435596803300893, "acc_per_token_micro": 0.3257370744908133, "acc_per_token_macro": 0.3275611650446029, "correct_loss_per_char_micro": 0.6532332432635897, "correct_loss_per_char_macro": 0.7244786922443343, "incorrect_loss_raw_micro": 22.687225431967185, "incorrect_loss_raw_macro": 21.513673654986444, "acc_per_char_micro": 0.3216778236718416, "acc_per_char_macro": 0.32307164395785354, "correct_loss_uncond_micro": -14.90609547050038, "correct_loss_uncond_macro": -13.414712741701779, "primary_score_micro": 0.3216778236718416, "primary_score_macro": 0.32307164395785354, "incorrect_loss_per_token_micro": 3.0516866685921893, "incorrect_loss_per_token_macro": 3.187838365185059, "incorrect_loss_uncond_micro": -14.05838893042516, "incorrect_loss_uncond_macro": -12.608247113816947, "correct_loss_raw_micro": 22.630186270126003, "correct_loss_raw_macro": 21.551042191209497, "acc_raw_micro": 0.3052983905426577, "acc_raw_macro": 0.3034148546188076, "incorrect_loss_per_char_micro": 0.700118585892161, "incorrect_loss_per_char_macro": 0.7691466871719901, "correct_loss_per_token_micro": 2.7886637249538646, "correct_loss_per_token_macro": 2.9236862303547144, "primary_score": 0.32307164395785354}, "task_idx": null} |
|
{"task_name": "mmlu::olmes", "task_hash": "f5ac6da68d1e2b6ae02dda443aa04648", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "mmlu::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2466.147227048874, "current_date": "2025-01-28 22:35:02 UTC", "num_instances": 28084, "beaker_info": {}, "metrics": {"primary_score": 0.32307164395785354}, "task_idx": null} |
|
{"task_name": "mmlu_abstract_algebra:mc", "task_hash": "bdde3fee40ebc8ddc5786c67975c5b31", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_abstract_algebra:mc", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_abstract_algebra:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.024841785430908, "current_date": "2025-01-28 22:35:02 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.32, "acc_per_token": 0.32, "acc_per_char": 0.32, "correct_loss_raw": 1.4286141949892044, "incorrect_loss_raw": 1.4459923624992368, "correct_loss_per_token": 1.4286141949892044, "incorrect_loss_per_token": 1.4459923624992368, "correct_loss_per_char": 0.7143070974946022, "incorrect_loss_per_char": 0.7229961812496184, "primary_score": 0.32}, "task_idx": 0} |
|
{"task_name": "mmlu_anatomy:mc", "task_hash": "ba9ed92a6ef8f2c40aa5551bfc77b5e7", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_anatomy:mc", "task_core": "mmlu_anatomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "anatomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_anatomy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.623577833175659, "current_date": "2025-01-28 22:35:05 UTC", "num_instances": 135, "beaker_info": {}, "metrics": {"acc_raw": 0.3111111111111111, "acc_per_token": 0.3111111111111111, "acc_per_char": 0.3111111111111111, "correct_loss_raw": 1.391642408459275, "incorrect_loss_raw": 1.466421911304379, "correct_loss_per_token": 1.391642408459275, "incorrect_loss_per_token": 1.466421911304379, "correct_loss_per_char": 0.6958212042296374, "incorrect_loss_per_char": 0.7332109556521895, "primary_score": 0.3111111111111111}, "task_idx": 1} |
|
{"task_name": "mmlu_astronomy:mc", "task_hash": "e7ca8a8921c02622e23c99b7d90379f7", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_astronomy:mc", "task_core": "mmlu_astronomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "astronomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_astronomy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.698110818862915, "current_date": "2025-01-28 22:35:09 UTC", "num_instances": 152, "beaker_info": {}, "metrics": {"acc_raw": 0.18421052631578946, "acc_per_token": 0.18421052631578946, "acc_per_char": 0.18421052631578946, "correct_loss_raw": 1.5038828645881854, "incorrect_loss_raw": 1.4131246705849962, "correct_loss_per_token": 1.5038828645881854, "incorrect_loss_per_token": 1.4131246705849962, "correct_loss_per_char": 0.7519414322940927, "incorrect_loss_per_char": 0.7065623352924981, "primary_score": 0.18421052631578946}, "task_idx": 2} |
|
{"task_name": "mmlu_business_ethics:mc", "task_hash": "7de417726ca2cc155dd1475a38afc381", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_business_ethics:mc", "task_core": "mmlu_business_ethics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "business_ethics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_business_ethics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.375203371047974, "current_date": "2025-01-28 22:35:15 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.25, "acc_per_token": 0.25, "acc_per_char": 0.25, "correct_loss_raw": 1.4087046658992768, "incorrect_loss_raw": 1.4307063547770176, "correct_loss_per_token": 1.4087046658992768, "incorrect_loss_per_token": 1.4307063547770176, "correct_loss_per_char": 0.7043523329496384, "incorrect_loss_per_char": 0.7153531773885088, "primary_score": 0.25}, "task_idx": 3} |
|
{"task_name": "mmlu_clinical_knowledge:mc", "task_hash": "221ee08c4359ce7072b8d66f1c37f500", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_clinical_knowledge:mc", "task_core": "mmlu_clinical_knowledge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "clinical_knowledge", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_clinical_knowledge:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.782487869262695, "current_date": "2025-01-28 22:35:20 UTC", "num_instances": 265, "beaker_info": {}, "metrics": {"acc_raw": 0.2339622641509434, "acc_per_token": 0.2339622641509434, "acc_per_char": 0.2339622641509434, "correct_loss_raw": 1.4263436362428485, "incorrect_loss_raw": 1.4283468099510153, "correct_loss_per_token": 1.4263436362428485, "incorrect_loss_per_token": 1.4283468099510153, "correct_loss_per_char": 0.7131718181214243, "incorrect_loss_per_char": 0.7141734049755076, "primary_score": 0.2339622641509434}, "task_idx": 4} |
|
{"task_name": "mmlu_college_biology:mc", "task_hash": "aaf0bf4441359de8ffba70cefb786807", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_biology:mc", "task_core": "mmlu_college_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_biology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.252874135971069, "current_date": "2025-01-28 22:35:28 UTC", "num_instances": 144, "beaker_info": {}, "metrics": {"acc_raw": 0.2638888888888889, "acc_per_token": 0.2638888888888889, "acc_per_char": 0.2638888888888889, "correct_loss_raw": 1.4325314201414585, "incorrect_loss_raw": 1.4315688815657748, "correct_loss_per_token": 1.4325314201414585, "incorrect_loss_per_token": 1.4315688815657748, "correct_loss_per_char": 0.7162657100707293, "incorrect_loss_per_char": 0.7157844407828874, "primary_score": 0.2638888888888889}, "task_idx": 5} |
|
{"task_name": "mmlu_college_chemistry:mc", "task_hash": "1980c88e607a6dea06d45f27c60e3365", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_chemistry:mc", "task_core": "mmlu_college_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_chemistry:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.108367681503296, "current_date": "2025-01-28 22:35:33 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.17, "acc_per_token": 0.17, "acc_per_char": 0.17, "correct_loss_raw": 1.4802032005786896, "incorrect_loss_raw": 1.4110419366757072, "correct_loss_per_token": 1.4802032005786896, "incorrect_loss_per_token": 1.4110419366757072, "correct_loss_per_char": 0.7401016002893448, "incorrect_loss_per_char": 0.7055209683378536, "primary_score": 0.17}, "task_idx": 6} |
|
{"task_name": "mmlu_college_computer_science:mc", "task_hash": "9d5570c603bbcb33a0727904a22ef997", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_computer_science:mc", "task_core": "mmlu_college_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_computer_science:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.587414979934692, "current_date": "2025-01-28 22:35:37 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.33, "acc_per_token": 0.33, "acc_per_char": 0.33, "correct_loss_raw": 1.4178925317525863, "incorrect_loss_raw": 1.5295472476879763, "correct_loss_per_token": 1.4178925317525863, "incorrect_loss_per_token": 1.5295472476879763, "correct_loss_per_char": 0.7089462658762932, "incorrect_loss_per_char": 0.7647736238439882, "primary_score": 0.33}, "task_idx": 7} |
|
{"task_name": "mmlu_college_mathematics:mc", "task_hash": "264fbafdeceacfd7588ca20ca3546113", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_mathematics:mc", "task_core": "mmlu_college_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_mathematics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.614655017852783, "current_date": "2025-01-28 22:35:44 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.3, "acc_per_token": 0.3, "acc_per_char": 0.3, "correct_loss_raw": 1.4630403077602387, "incorrect_loss_raw": 1.5132715060313535, "correct_loss_per_token": 1.4630403077602387, "incorrect_loss_per_token": 1.5132715060313535, "correct_loss_per_char": 0.7315201538801194, "incorrect_loss_per_char": 0.7566357530156768, "primary_score": 0.3}, "task_idx": 8} |
|
{"task_name": "mmlu_college_medicine:mc", "task_hash": "9b3c95bd3bbac8771701a5abc3ab28ba", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_medicine:mc", "task_core": "mmlu_college_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_medicine:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.311716794967651, "current_date": "2025-01-28 22:35:48 UTC", "num_instances": 173, "beaker_info": {}, "metrics": {"acc_raw": 0.23699421965317918, "acc_per_token": 0.23699421965317918, "acc_per_char": 0.23699421965317918, "correct_loss_raw": 1.4322606531870847, "incorrect_loss_raw": 1.428708042597724, "correct_loss_per_token": 1.4322606531870847, "incorrect_loss_per_token": 1.428708042597724, "correct_loss_per_char": 0.7161303265935424, "incorrect_loss_per_char": 0.714354021298862, "primary_score": 0.23699421965317918}, "task_idx": 9} |
|
{"task_name": "mmlu_college_physics:mc", "task_hash": "2c97b2d8aac8dff8cd2656474c1dfb86", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_physics:mc", "task_core": "mmlu_college_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_physics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.573668956756592, "current_date": "2025-01-28 22:35:56 UTC", "num_instances": 102, "beaker_info": {}, "metrics": {"acc_raw": 0.1568627450980392, "acc_per_token": 0.1568627450980392, "acc_per_char": 0.1568627450980392, "correct_loss_raw": 1.4933925411280464, "incorrect_loss_raw": 1.4061525699749486, "correct_loss_per_token": 1.4933925411280464, "incorrect_loss_per_token": 1.4061525699749486, "correct_loss_per_char": 0.7466962705640232, "incorrect_loss_per_char": 0.7030762849874743, "primary_score": 0.1568627450980392}, "task_idx": 10} |
|
{"task_name": "mmlu_computer_security:mc", "task_hash": "6d7c3f721bf97797f0e660d896f4585b", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_computer_security:mc", "task_core": "mmlu_computer_security", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "computer_security", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_computer_security:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.3057401180267334, "current_date": "2025-01-28 22:35:59 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.23, "acc_per_token": 0.23, "acc_per_char": 0.23, "correct_loss_raw": 1.4304591560363769, "incorrect_loss_raw": 1.4309185570478433, "correct_loss_per_token": 1.4304591560363769, "incorrect_loss_per_token": 1.4309185570478433, "correct_loss_per_char": 0.7152295780181884, "incorrect_loss_per_char": 0.7154592785239217, "primary_score": 0.23}, "task_idx": 11} |
|
{"task_name": "mmlu_conceptual_physics:mc", "task_hash": "ffbb5f78c71ff87a70f5b59d313a380d", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_conceptual_physics:mc", "task_core": "mmlu_conceptual_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "conceptual_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_conceptual_physics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.869374752044678, "current_date": "2025-01-28 22:36:03 UTC", "num_instances": 235, "beaker_info": {}, "metrics": {"acc_raw": 0.20851063829787234, "acc_per_token": 0.20851063829787234, "acc_per_char": 0.20851063829787234, "correct_loss_raw": 1.4332183908908924, "incorrect_loss_raw": 1.4322371693367655, "correct_loss_per_token": 1.4332183908908924, "incorrect_loss_per_token": 1.4322371693367655, "correct_loss_per_char": 0.7166091954454462, "incorrect_loss_per_char": 0.7161185846683827, "primary_score": 0.20851063829787234}, "task_idx": 12} |
|
{"task_name": "mmlu_econometrics:mc", "task_hash": "c69ca4807df1205e806299e8e20218af", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_econometrics:mc", "task_core": "mmlu_econometrics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "econometrics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_econometrics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.335997819900513, "current_date": "2025-01-28 22:36:08 UTC", "num_instances": 114, "beaker_info": {}, "metrics": {"acc_raw": 0.2543859649122807, "acc_per_token": 0.2543859649122807, "acc_per_char": 0.2543859649122807, "correct_loss_raw": 1.4873452510750085, "incorrect_loss_raw": 1.4898098088495915, "correct_loss_per_token": 1.4873452510750085, "incorrect_loss_per_token": 1.4898098088495915, "correct_loss_per_char": 0.7436726255375042, "incorrect_loss_per_char": 0.7449049044247957, "primary_score": 0.2543859649122807}, "task_idx": 13} |
|
{"task_name": "mmlu_electrical_engineering:mc", "task_hash": "c279f61638992683680ca9604e20fa4d", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_electrical_engineering:mc", "task_core": "mmlu_electrical_engineering", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "electrical_engineering", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_electrical_engineering:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.8969244956970215, "current_date": "2025-01-28 22:36:14 UTC", "num_instances": 145, "beaker_info": {}, "metrics": {"acc_raw": 0.2689655172413793, "acc_per_token": 0.2689655172413793, "acc_per_char": 0.2689655172413793, "correct_loss_raw": 1.4234175213452043, "incorrect_loss_raw": 1.434983214427685, "correct_loss_per_token": 1.4234175213452043, "incorrect_loss_per_token": 1.434983214427685, "correct_loss_per_char": 0.7117087606726021, "incorrect_loss_per_char": 0.7174916072138425, "primary_score": 0.2689655172413793}, "task_idx": 14} |
|
{"task_name": "mmlu_elementary_mathematics:mc", "task_hash": "35b6f0933f711770d09fb00b45905c5c", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_elementary_mathematics:mc", "task_core": "mmlu_elementary_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "elementary_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_elementary_mathematics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.077240705490112, "current_date": "2025-01-28 22:36:19 UTC", "num_instances": 378, "beaker_info": {}, "metrics": {"acc_raw": 0.24867724867724866, "acc_per_token": 0.24867724867724866, "acc_per_char": 0.24867724867724866, "correct_loss_raw": 1.425037144195466, "incorrect_loss_raw": 1.4287045017226445, "correct_loss_per_token": 1.425037144195466, "incorrect_loss_per_token": 1.4287045017226445, "correct_loss_per_char": 0.712518572097733, "incorrect_loss_per_char": 0.7143522508613223, "primary_score": 0.24867724867724866}, "task_idx": 15} |
|
{"task_name": "mmlu_formal_logic:mc", "task_hash": "74d8e6a1f297e0274243d2bbb7df4d1b", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_formal_logic:mc", "task_core": "mmlu_formal_logic", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "formal_logic", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_formal_logic:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.191099405288696, "current_date": "2025-01-28 22:36:33 UTC", "num_instances": 126, "beaker_info": {}, "metrics": {"acc_raw": 0.15873015873015872, "acc_per_token": 0.15873015873015872, "acc_per_char": 0.15873015873015872, "correct_loss_raw": 1.5439469222984616, "incorrect_loss_raw": 1.456999213922592, "correct_loss_per_token": 1.5439469222984616, "incorrect_loss_per_token": 1.456999213922592, "correct_loss_per_char": 0.7719734611492308, "incorrect_loss_per_char": 0.728499606961296, "primary_score": 0.15873015873015872}, "task_idx": 16} |
|
{"task_name": "mmlu_global_facts:mc", "task_hash": "4f14cfa253ea56a8d3b0d2c805ccdb28", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_global_facts:mc", "task_core": "mmlu_global_facts", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "global_facts", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_global_facts:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.3018155097961426, "current_date": "2025-01-28 22:36:39 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.29, "acc_per_token": 0.29, "acc_per_char": 0.29, "correct_loss_raw": 1.416217890381813, "incorrect_loss_raw": 1.4884590681393934, "correct_loss_per_token": 1.416217890381813, "incorrect_loss_per_token": 1.4884590681393934, "correct_loss_per_char": 0.7081089451909065, "incorrect_loss_per_char": 0.7442295340696967, "primary_score": 0.29}, "task_idx": 17} |
|
{"task_name": "mmlu_high_school_biology:mc", "task_hash": "055cfa37938a062655e6ce08f80c7765", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_biology:mc", "task_core": "mmlu_high_school_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_biology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.827435493469238, "current_date": "2025-01-28 22:36:42 UTC", "num_instances": 310, "beaker_info": {}, "metrics": {"acc_raw": 0.23870967741935484, "acc_per_token": 0.23870967741935484, "acc_per_char": 0.23870967741935484, "correct_loss_raw": 1.4149069778380856, "incorrect_loss_raw": 1.426428426978408, "correct_loss_per_token": 1.4149069778380856, "incorrect_loss_per_token": 1.426428426978408, "correct_loss_per_char": 0.7074534889190428, "incorrect_loss_per_char": 0.713214213489204, "primary_score": 0.23870967741935484}, "task_idx": 18} |
|
{"task_name": "mmlu_high_school_chemistry:mc", "task_hash": "6cef5e5a35451e467b97a8cf773fb61c", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_chemistry:mc", "task_core": "mmlu_high_school_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_chemistry:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.301903963088989, "current_date": "2025-01-28 22:36:54 UTC", "num_instances": 203, "beaker_info": {}, "metrics": {"acc_raw": 0.2660098522167488, "acc_per_token": 0.2660098522167488, "acc_per_char": 0.2660098522167488, "correct_loss_raw": 1.4112723845566435, "incorrect_loss_raw": 1.4276176255520538, "correct_loss_per_token": 1.4112723845566435, "incorrect_loss_per_token": 1.4276176255520538, "correct_loss_per_char": 0.7056361922783217, "incorrect_loss_per_char": 0.7138088127760269, "primary_score": 0.2660098522167488}, "task_idx": 19} |
|
{"task_name": "mmlu_high_school_computer_science:mc", "task_hash": "31a39a79632638f209cd0a9c599f158d", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_computer_science:mc", "task_core": "mmlu_high_school_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_computer_science:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.064443111419678, "current_date": "2025-01-28 22:37:01 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.33, "acc_per_token": 0.33, "acc_per_char": 0.33, "correct_loss_raw": 1.4288902390003204, "incorrect_loss_raw": 1.4646583642562232, "correct_loss_per_token": 1.4288902390003204, "incorrect_loss_per_token": 1.4646583642562232, "correct_loss_per_char": 0.7144451195001602, "incorrect_loss_per_char": 0.7323291821281116, "primary_score": 0.33}, "task_idx": 20} |
|
{"task_name": "mmlu_high_school_european_history:mc", "task_hash": "e8f2a29738091af55efa8a7194452ac2", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_european_history:mc", "task_core": "mmlu_high_school_european_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_european_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_european_history:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 27.744449853897095, "current_date": "2025-01-28 22:37:09 UTC", "num_instances": 165, "beaker_info": {}, "metrics": {"acc_raw": 0.23636363636363636, "acc_per_token": 0.23636363636363636, "acc_per_char": 0.23636363636363636, "correct_loss_raw": 1.4495411656119608, "incorrect_loss_raw": 1.4138268783839067, "correct_loss_per_token": 1.4495411656119608, "incorrect_loss_per_token": 1.4138268783839067, "correct_loss_per_char": 0.7247705828059804, "incorrect_loss_per_char": 0.7069134391919534, "primary_score": 0.23636363636363636}, "task_idx": 21} |
|
{"task_name": "mmlu_high_school_geography:mc", "task_hash": "6a43a92b543ec77afeeda9d5011e0c36", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_geography:mc", "task_core": "mmlu_high_school_geography", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_geography", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_geography:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.131516456604004, "current_date": "2025-01-28 22:37:36 UTC", "num_instances": 198, "beaker_info": {}, "metrics": {"acc_raw": 0.2676767676767677, "acc_per_token": 0.2676767676767677, "acc_per_char": 0.2676767676767677, "correct_loss_raw": 1.400579711102476, "incorrect_loss_raw": 1.43751945318999, "correct_loss_per_token": 1.400579711102476, "incorrect_loss_per_token": 1.43751945318999, "correct_loss_per_char": 0.700289855551238, "incorrect_loss_per_char": 0.718759726594995, "primary_score": 0.2676767676767677}, "task_idx": 22} |
|
{"task_name": "mmlu_high_school_government_and_politics:mc", "task_hash": "65cdc0b1dc4018c2017fc6023e9bb862", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_government_and_politics:mc", "task_core": "mmlu_high_school_government_and_politics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_government_and_politics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_government_and_politics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.325153350830078, "current_date": "2025-01-28 22:37:43 UTC", "num_instances": 193, "beaker_info": {}, "metrics": {"acc_raw": 0.30569948186528495, "acc_per_token": 0.30569948186528495, "acc_per_char": 0.30569948186528495, "correct_loss_raw": 1.3960181246149725, "incorrect_loss_raw": 1.459184393672745, "correct_loss_per_token": 1.3960181246149725, "incorrect_loss_per_token": 1.459184393672745, "correct_loss_per_char": 0.6980090623074863, "incorrect_loss_per_char": 0.7295921968363726, "primary_score": 0.30569948186528495}, "task_idx": 23} |
|
{"task_name": "mmlu_high_school_macroeconomics:mc", "task_hash": "177b3e0ec28ae90f76d191ba937fb328", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_macroeconomics:mc", "task_core": "mmlu_high_school_macroeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_macroeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_macroeconomics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.836204528808594, "current_date": "2025-01-28 22:37:50 UTC", "num_instances": 390, "beaker_info": {}, "metrics": {"acc_raw": 0.2564102564102564, "acc_per_token": 0.2564102564102564, "acc_per_char": 0.2564102564102564, "correct_loss_raw": 1.4008340325110997, "incorrect_loss_raw": 1.4511074923042557, "correct_loss_per_token": 1.4008340325110997, "incorrect_loss_per_token": 1.4511074923042557, "correct_loss_per_char": 0.7004170162555499, "incorrect_loss_per_char": 0.7255537461521279, "primary_score": 0.2564102564102564}, "task_idx": 24} |
|
{"task_name": "mmlu_high_school_mathematics:mc", "task_hash": "934371e2cf927fc449e77df454d85d2d", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_mathematics:mc", "task_core": "mmlu_high_school_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_mathematics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.988232374191284, "current_date": "2025-01-28 22:38:02 UTC", "num_instances": 270, "beaker_info": {}, "metrics": {"acc_raw": 0.25555555555555554, "acc_per_token": 0.25555555555555554, "acc_per_char": 0.25555555555555554, "correct_loss_raw": 1.4557758278316921, "incorrect_loss_raw": 1.4901090468153537, "correct_loss_per_token": 1.4557758278316921, "incorrect_loss_per_token": 1.4901090468153537, "correct_loss_per_char": 0.7278879139158461, "incorrect_loss_per_char": 0.7450545234076769, "primary_score": 0.25555555555555554}, "task_idx": 25} |
|
{"task_name": "mmlu_high_school_microeconomics:mc", "task_hash": "3738e45ad1235f9f0a4825ae099697cb", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_microeconomics:mc", "task_core": "mmlu_high_school_microeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_microeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_microeconomics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.5144665241241455, "current_date": "2025-01-28 22:38:12 UTC", "num_instances": 238, "beaker_info": {}, "metrics": {"acc_raw": 0.226890756302521, "acc_per_token": 0.226890756302521, "acc_per_char": 0.226890756302521, "correct_loss_raw": 1.426626341683524, "incorrect_loss_raw": 1.4269534432921422, "correct_loss_per_token": 1.426626341683524, "incorrect_loss_per_token": 1.4269534432921422, "correct_loss_per_char": 0.713313170841762, "incorrect_loss_per_char": 0.7134767216460711, "primary_score": 0.226890756302521}, "task_idx": 26} |
|
{"task_name": "mmlu_high_school_physics:mc", "task_hash": "583350c5b48fd28100732ad06943489f", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_physics:mc", "task_core": "mmlu_high_school_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_physics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.418765306472778, "current_date": "2025-01-28 22:38:19 UTC", "num_instances": 151, "beaker_info": {}, "metrics": {"acc_raw": 0.33112582781456956, "acc_per_token": 0.33112582781456956, "acc_per_char": 0.33112582781456956, "correct_loss_raw": 1.3873338265134798, "incorrect_loss_raw": 1.4437498321333198, "correct_loss_per_token": 1.3873338265134798, "incorrect_loss_per_token": 1.4437498321333198, "correct_loss_per_char": 0.6936669132567399, "incorrect_loss_per_char": 0.7218749160666599, "primary_score": 0.33112582781456956}, "task_idx": 27} |
|
{"task_name": "mmlu_high_school_psychology:mc", "task_hash": "accf1559d013b1e7ac36647c1fe9dd67", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_psychology:mc", "task_core": "mmlu_high_school_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_psychology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 20.500592708587646, "current_date": "2025-01-28 22:38:26 UTC", "num_instances": 545, "beaker_info": {}, "metrics": {"acc_raw": 0.21651376146788992, "acc_per_token": 0.21651376146788992, "acc_per_char": 0.21651376146788992, "correct_loss_raw": 1.4213156835748515, "incorrect_loss_raw": 1.4153327799718305, "correct_loss_per_token": 1.4213156835748515, "incorrect_loss_per_token": 1.4153327799718305, "correct_loss_per_char": 0.7106578417874257, "incorrect_loss_per_char": 0.7076663899859152, "primary_score": 0.21651376146788992}, "task_idx": 28} |
|
{"task_name": "mmlu_high_school_statistics:mc", "task_hash": "7bd3b2133806936ee947ebd9c9890647", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_statistics:mc", "task_core": "mmlu_high_school_statistics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_statistics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_statistics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.673909902572632, "current_date": "2025-01-28 22:38:46 UTC", "num_instances": 216, "beaker_info": {}, "metrics": {"acc_raw": 0.4166666666666667, "acc_per_token": 0.4166666666666667, "acc_per_char": 0.4166666666666667, "correct_loss_raw": 1.313091162454199, "incorrect_loss_raw": 1.5389358467525904, "correct_loss_per_token": 1.313091162454199, "incorrect_loss_per_token": 1.5389358467525904, "correct_loss_per_char": 0.6565455812270995, "incorrect_loss_per_char": 0.7694679233762952, "primary_score": 0.4166666666666667}, "task_idx": 29} |
|
{"task_name": "mmlu_high_school_us_history:mc", "task_hash": "8097dc2c4728e3ef312c10bfcc9a0c47", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_us_history:mc", "task_core": "mmlu_high_school_us_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_us_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_us_history:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 32.81372022628784, "current_date": "2025-01-28 22:38:59 UTC", "num_instances": 204, "beaker_info": {}, "metrics": {"acc_raw": 0.22058823529411764, "acc_per_token": 0.22058823529411764, "acc_per_char": 0.22058823529411764, "correct_loss_raw": 1.4462888538837433, "incorrect_loss_raw": 1.443970621313924, "correct_loss_per_token": 1.4462888538837433, "incorrect_loss_per_token": 1.443970621313924, "correct_loss_per_char": 0.7231444269418716, "incorrect_loss_per_char": 0.721985310656962, "primary_score": 0.22058823529411764}, "task_idx": 30} |
|
{"task_name": "mmlu_high_school_world_history:mc", "task_hash": "4c9689dbb0e9effb2991bc98e1364c03", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_world_history:mc", "task_core": "mmlu_high_school_world_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_world_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_world_history:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 26.67262077331543, "current_date": "2025-01-28 22:39:32 UTC", "num_instances": 237, "beaker_info": {}, "metrics": {"acc_raw": 0.2616033755274262, "acc_per_token": 0.2616033755274262, "acc_per_char": 0.2616033755274262, "correct_loss_raw": 1.426479207061011, "incorrect_loss_raw": 1.4354138376843584, "correct_loss_per_token": 1.426479207061011, "incorrect_loss_per_token": 1.4354138376843584, "correct_loss_per_char": 0.7132396035305055, "incorrect_loss_per_char": 0.7177069188421792, "primary_score": 0.2616033755274262}, "task_idx": 31} |
|
{"task_name": "mmlu_human_aging:mc", "task_hash": "aed6dc4e5de4b465852e8add68f1e1c7", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_aging:mc", "task_core": "mmlu_human_aging", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_aging", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_aging:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.6488049030303955, "current_date": "2025-01-28 22:39:58 UTC", "num_instances": 223, "beaker_info": {}, "metrics": {"acc_raw": 0.35874439461883406, "acc_per_token": 0.35874439461883406, "acc_per_char": 0.35874439461883406, "correct_loss_raw": 1.3965698876188475, "incorrect_loss_raw": 1.4336071704000641, "correct_loss_per_token": 1.3965698876188475, "incorrect_loss_per_token": 1.4336071704000641, "correct_loss_per_char": 0.6982849438094237, "incorrect_loss_per_char": 0.7168035852000321, "primary_score": 0.35874439461883406}, "task_idx": 32} |
|
{"task_name": "mmlu_human_sexuality:mc", "task_hash": "40c85ccce055746bdd1f28232f48f0fa", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_sexuality:mc", "task_core": "mmlu_human_sexuality", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_sexuality", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_sexuality:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.658782958984375, "current_date": "2025-01-28 22:40:04 UTC", "num_instances": 131, "beaker_info": {}, "metrics": {"acc_raw": 0.21374045801526717, "acc_per_token": 0.21374045801526717, "acc_per_char": 0.21374045801526717, "correct_loss_raw": 1.4585472927748702, "incorrect_loss_raw": 1.428870550548758, "correct_loss_per_token": 1.4585472927748702, "incorrect_loss_per_token": 1.428870550548758, "correct_loss_per_char": 0.7292736463874351, "incorrect_loss_per_char": 0.714435275274379, "primary_score": 0.21374045801526717}, "task_idx": 33} |
|
{"task_name": "mmlu_international_law:mc", "task_hash": "3cfc657dd55e3ad96d5c3e9cd17bc346", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_international_law:mc", "task_core": "mmlu_international_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "international_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_international_law:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.244182825088501, "current_date": "2025-01-28 22:40:08 UTC", "num_instances": 121, "beaker_info": {}, "metrics": {"acc_raw": 0.36363636363636365, "acc_per_token": 0.36363636363636365, "acc_per_char": 0.36363636363636365, "correct_loss_raw": 1.365358995997216, "incorrect_loss_raw": 1.4374075474489492, "correct_loss_per_token": 1.365358995997216, "incorrect_loss_per_token": 1.4374075474489492, "correct_loss_per_char": 0.682679497998608, "incorrect_loss_per_char": 0.7187037737244746, "primary_score": 0.36363636363636365}, "task_idx": 34} |
|
{"task_name": "mmlu_jurisprudence:mc", "task_hash": "ca4ac71f0fd702b39c6245be2ab32061", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_jurisprudence:mc", "task_core": "mmlu_jurisprudence", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "jurisprudence", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_jurisprudence:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.7798855304718018, "current_date": "2025-01-28 22:40:14 UTC", "num_instances": 108, "beaker_info": {}, "metrics": {"acc_raw": 0.25, "acc_per_token": 0.25, "acc_per_char": 0.25, "correct_loss_raw": 1.4087109709227528, "incorrect_loss_raw": 1.4130348375550026, "correct_loss_per_token": 1.4087109709227528, "incorrect_loss_per_token": 1.4130348375550026, "correct_loss_per_char": 0.7043554854613764, "incorrect_loss_per_char": 0.7065174187775013, "primary_score": 0.25}, "task_idx": 35} |
|
{"task_name": "mmlu_logical_fallacies:mc", "task_hash": "a4b3c214c3cb1c10bfa4042dd0e9df92", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_logical_fallacies:mc", "task_core": "mmlu_logical_fallacies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "logical_fallacies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_logical_fallacies:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.102569341659546, "current_date": "2025-01-28 22:40:18 UTC", "num_instances": 163, "beaker_info": {}, "metrics": {"acc_raw": 0.26993865030674846, "acc_per_token": 0.26993865030674846, "acc_per_char": 0.26993865030674846, "correct_loss_raw": 1.4300255252539746, "incorrect_loss_raw": 1.4530853404833006, "correct_loss_per_token": 1.4300255252539746, "incorrect_loss_per_token": 1.4530853404833006, "correct_loss_per_char": 0.7150127626269873, "incorrect_loss_per_char": 0.7265426702416503, "primary_score": 0.26993865030674846}, "task_idx": 36} |
|
{"task_name": "mmlu_machine_learning:mc", "task_hash": "43ad1436fc44eed0bc66cc7239ecd94b", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_machine_learning:mc", "task_core": "mmlu_machine_learning", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "machine_learning", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_machine_learning:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.81254506111145, "current_date": "2025-01-28 22:40:24 UTC", "num_instances": 112, "beaker_info": {}, "metrics": {"acc_raw": 0.2857142857142857, "acc_per_token": 0.2857142857142857, "acc_per_char": 0.2857142857142857, "correct_loss_raw": 1.4112436388220106, "incorrect_loss_raw": 1.4298354834318159, "correct_loss_per_token": 1.4112436388220106, "incorrect_loss_per_token": 1.4298354834318159, "correct_loss_per_char": 0.7056218194110053, "incorrect_loss_per_char": 0.7149177417159079, "primary_score": 0.2857142857142857}, "task_idx": 37} |
|
{"task_name": "mmlu_management:mc", "task_hash": "f565b650124e104d5d59b40491bde8e7", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_management:mc", "task_core": "mmlu_management", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "management", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_management:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.748063087463379, "current_date": "2025-01-28 22:40:30 UTC", "num_instances": 103, "beaker_info": {}, "metrics": {"acc_raw": 0.2912621359223301, "acc_per_token": 0.2912621359223301, "acc_per_char": 0.2912621359223301, "correct_loss_raw": 1.3950761058955516, "incorrect_loss_raw": 1.4241073177856147, "correct_loss_per_token": 1.3950761058955516, "incorrect_loss_per_token": 1.4241073177856147, "correct_loss_per_char": 0.6975380529477758, "incorrect_loss_per_char": 0.7120536588928074, "primary_score": 0.2912621359223301}, "task_idx": 38} |
|
{"task_name": "mmlu_marketing:mc", "task_hash": "63c7c7a1863fe3aaf961947124cbd4c3", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_marketing:mc", "task_core": "mmlu_marketing", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "marketing", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_marketing:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.01939582824707, "current_date": "2025-01-28 22:40:32 UTC", "num_instances": 234, "beaker_info": {}, "metrics": {"acc_raw": 0.24786324786324787, "acc_per_token": 0.24786324786324787, "acc_per_char": 0.24786324786324787, "correct_loss_raw": 1.4277025889127681, "incorrect_loss_raw": 1.4223103850995034, "correct_loss_per_token": 1.4277025889127681, "incorrect_loss_per_token": 1.4223103850995034, "correct_loss_per_char": 0.7138512944563841, "incorrect_loss_per_char": 0.7111551925497517, "primary_score": 0.24786324786324787}, "task_idx": 39} |
|
{"task_name": "mmlu_medical_genetics:mc", "task_hash": "11f7f7576f9aeb3dae4cc770e7a06c98", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_medical_genetics:mc", "task_core": "mmlu_medical_genetics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "medical_genetics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_medical_genetics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.135511875152588, "current_date": "2025-01-28 22:40:41 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.27, "acc_per_token": 0.27, "acc_per_char": 0.27, "correct_loss_raw": 1.4142525935173034, "incorrect_loss_raw": 1.4275709124406175, "correct_loss_per_token": 1.4142525935173034, "incorrect_loss_per_token": 1.4275709124406175, "correct_loss_per_char": 0.7071262967586517, "incorrect_loss_per_char": 0.7137854562203088, "primary_score": 0.27}, "task_idx": 40} |
|
{"task_name": "mmlu_miscellaneous:mc", "task_hash": "d9c892ba8631049d773d6fa3dc5dca82", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_miscellaneous:mc", "task_core": "mmlu_miscellaneous", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "miscellaneous", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_miscellaneous:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 19.111703634262085, "current_date": "2025-01-28 22:40:44 UTC", "num_instances": 783, "beaker_info": {}, "metrics": {"acc_raw": 0.2962962962962963, "acc_per_token": 0.2962962962962963, "acc_per_char": 0.2962962962962963, "correct_loss_raw": 1.4032514787268364, "incorrect_loss_raw": 1.428520736947064, "correct_loss_per_token": 1.4032514787268364, "incorrect_loss_per_token": 1.428520736947064, "correct_loss_per_char": 0.7016257393634182, "incorrect_loss_per_char": 0.714260368473532, "primary_score": 0.2962962962962963}, "task_idx": 41} |
|
{"task_name": "mmlu_moral_disputes:mc", "task_hash": "d05901af9b9e012ab9e4ce8bb28c2bb8", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_disputes:mc", "task_core": "mmlu_moral_disputes", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_disputes", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_disputes:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.97450566291809, "current_date": "2025-01-28 22:41:03 UTC", "num_instances": 346, "beaker_info": {}, "metrics": {"acc_raw": 0.24566473988439305, "acc_per_token": 0.24566473988439305, "acc_per_char": 0.24566473988439305, "correct_loss_raw": 1.4478095125600783, "incorrect_loss_raw": 1.4461355473494477, "correct_loss_per_token": 1.4478095125600783, "incorrect_loss_per_token": 1.4461355473494477, "correct_loss_per_char": 0.7239047562800391, "incorrect_loss_per_char": 0.7230677736747239, "primary_score": 0.24566473988439305}, "task_idx": 42} |
|
{"task_name": "mmlu_moral_scenarios:mc", "task_hash": "33949ee763bf0ed37a82aa7796d56cd6", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_scenarios:mc", "task_core": "mmlu_moral_scenarios", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_scenarios", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_scenarios:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 43.8379852771759, "current_date": "2025-01-28 22:41:16 UTC", "num_instances": 895, "beaker_info": {}, "metrics": {"acc_raw": 0.2435754189944134, "acc_per_token": 0.2435754189944134, "acc_per_char": 0.2435754189944134, "correct_loss_raw": 1.421490123551651, "incorrect_loss_raw": 1.4254808845910725, "correct_loss_per_token": 1.421490123551651, "incorrect_loss_per_token": 1.4254808845910725, "correct_loss_per_char": 0.7107450617758255, "incorrect_loss_per_char": 0.7127404422955362, "primary_score": 0.2435754189944134}, "task_idx": 43} |
|
{"task_name": "mmlu_nutrition:mc", "task_hash": "e68f4b08d1adc45a7ab0ea385d987849", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_nutrition:mc", "task_core": "mmlu_nutrition", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "nutrition", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_nutrition:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.001081228256226, "current_date": "2025-01-28 22:41:59 UTC", "num_instances": 306, "beaker_info": {}, "metrics": {"acc_raw": 0.21895424836601307, "acc_per_token": 0.21895424836601307, "acc_per_char": 0.21895424836601307, "correct_loss_raw": 1.4414266320615032, "incorrect_loss_raw": 1.421766403973233, "correct_loss_per_token": 1.4414266320615032, "incorrect_loss_per_token": 1.421766403973233, "correct_loss_per_char": 0.7207133160307516, "incorrect_loss_per_char": 0.7108832019866165, "primary_score": 0.21895424836601307}, "task_idx": 44} |
|
{"task_name": "mmlu_philosophy:mc", "task_hash": "dd14a2446c6e46449cd5b14ee7982b73", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_philosophy:mc", "task_core": "mmlu_philosophy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "philosophy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_philosophy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.187968015670776, "current_date": "2025-01-28 22:42:12 UTC", "num_instances": 311, "beaker_info": {}, "metrics": {"acc_raw": 0.3086816720257235, "acc_per_token": 0.3086816720257235, "acc_per_char": 0.3086816720257235, "correct_loss_raw": 1.394637080250829, "incorrect_loss_raw": 1.4242691859555008, "correct_loss_per_token": 1.394637080250829, "incorrect_loss_per_token": 1.4242691859555008, "correct_loss_per_char": 0.6973185401254145, "incorrect_loss_per_char": 0.7121345929777504, "primary_score": 0.3086816720257235}, "task_idx": 45} |
|
{"task_name": "mmlu_prehistory:mc", "task_hash": "d65b3e5cf8049b1c1442537b281f5a72", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_prehistory:mc", "task_core": "mmlu_prehistory", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "prehistory", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_prehistory:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.143106460571289, "current_date": "2025-01-28 22:42:21 UTC", "num_instances": 324, "beaker_info": {}, "metrics": {"acc_raw": 0.2932098765432099, "acc_per_token": 0.2932098765432099, "acc_per_char": 0.2932098765432099, "correct_loss_raw": 1.3984669380717807, "incorrect_loss_raw": 1.4247865308949983, "correct_loss_per_token": 1.3984669380717807, "incorrect_loss_per_token": 1.4247865308949983, "correct_loss_per_char": 0.6992334690358903, "incorrect_loss_per_char": 0.7123932654474991, "primary_score": 0.2932098765432099}, "task_idx": 46} |
|
{"task_name": "mmlu_professional_accounting:mc", "task_hash": "2d9464b5e5a5ee20a777a37004dd3a2d", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_accounting:mc", "task_core": "mmlu_professional_accounting", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_accounting", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_accounting:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.87564992904663, "current_date": "2025-01-28 22:42:34 UTC", "num_instances": 282, "beaker_info": {}, "metrics": {"acc_raw": 0.2765957446808511, "acc_per_token": 0.2765957446808511, "acc_per_char": 0.2765957446808511, "correct_loss_raw": 1.4184195805103221, "incorrect_loss_raw": 1.41706075186425, "correct_loss_per_token": 1.4184195805103221, "incorrect_loss_per_token": 1.41706075186425, "correct_loss_per_char": 0.7092097902551611, "incorrect_loss_per_char": 0.708530375932125, "primary_score": 0.2765957446808511}, "task_idx": 47} |
|
{"task_name": "mmlu_professional_law:mc", "task_hash": "c4dd4f89898c6498217d79776e68bb06", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_law:mc", "task_core": "mmlu_professional_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_law:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 193.62796354293823, "current_date": "2025-01-28 22:42:48 UTC", "num_instances": 1534, "beaker_info": {}, "metrics": {"acc_raw": 0.23598435462842243, "acc_per_token": 0.23598435462842243, "acc_per_char": 0.23598435462842243, "correct_loss_raw": 1.4360968665629041, "incorrect_loss_raw": 1.4304321384440266, "correct_loss_per_token": 1.4360968665629041, "incorrect_loss_per_token": 1.4304321384440266, "correct_loss_per_char": 0.7180484332814521, "incorrect_loss_per_char": 0.7152160692220133, "primary_score": 0.23598435462842243}, "task_idx": 48} |
|
{"task_name": "mmlu_professional_medicine:mc", "task_hash": "8b8aa33e03e2f1b4abff4cbb3dd56cd7", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_medicine:mc", "task_core": "mmlu_professional_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_medicine:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 21.896700143814087, "current_date": "2025-01-28 22:46:01 UTC", "num_instances": 272, "beaker_info": {}, "metrics": {"acc_raw": 0.39705882352941174, "acc_per_token": 0.39705882352941174, "acc_per_char": 0.39705882352941174, "correct_loss_raw": 1.3938986779574085, "incorrect_loss_raw": 1.5369568731562764, "correct_loss_per_token": 1.3938986779574085, "incorrect_loss_per_token": 1.5369568731562764, "correct_loss_per_char": 0.6969493389787043, "incorrect_loss_per_char": 0.7684784365781382, "primary_score": 0.39705882352941174}, "task_idx": 49} |
|
{"task_name": "mmlu_professional_psychology:mc", "task_hash": "3094d326fde18b55836110e1d0f8f241", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_psychology:mc", "task_core": "mmlu_professional_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_psychology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 27.001360177993774, "current_date": "2025-01-28 22:46:23 UTC", "num_instances": 612, "beaker_info": {}, "metrics": {"acc_raw": 0.2581699346405229, "acc_per_token": 0.2581699346405229, "acc_per_char": 0.2581699346405229, "correct_loss_raw": 1.410348999539232, "incorrect_loss_raw": 1.4249647083430514, "correct_loss_per_token": 1.410348999539232, "incorrect_loss_per_token": 1.4249647083430514, "correct_loss_per_char": 0.705174499769616, "incorrect_loss_per_char": 0.7124823541715257, "primary_score": 0.2581699346405229}, "task_idx": 50} |
|
{"task_name": "mmlu_public_relations:mc", "task_hash": "b10f684a09888253de5b2778544ace3d", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_public_relations:mc", "task_core": "mmlu_public_relations", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "public_relations", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_public_relations:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.8859925270080566, "current_date": "2025-01-28 22:46:50 UTC", "num_instances": 110, "beaker_info": {}, "metrics": {"acc_raw": 0.34545454545454546, "acc_per_token": 0.34545454545454546, "acc_per_char": 0.34545454545454546, "correct_loss_raw": 1.3966401826251638, "incorrect_loss_raw": 1.4288235136956882, "correct_loss_per_token": 1.3966401826251638, "incorrect_loss_per_token": 1.4288235136956882, "correct_loss_per_char": 0.6983200913125819, "incorrect_loss_per_char": 0.7144117568478441, "primary_score": 0.34545454545454546}, "task_idx": 51} |
|
{"task_name": "mmlu_security_studies:mc", "task_hash": "1f8f03c4608bfc16b773b6789dff3612", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_security_studies:mc", "task_core": "mmlu_security_studies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "security_studies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_security_studies:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 23.30668067932129, "current_date": "2025-01-28 22:46:55 UTC", "num_instances": 245, "beaker_info": {}, "metrics": {"acc_raw": 0.1673469387755102, "acc_per_token": 0.1673469387755102, "acc_per_char": 0.1673469387755102, "correct_loss_raw": 1.4461856803115534, "incorrect_loss_raw": 1.4255701874389128, "correct_loss_per_token": 1.4461856803115534, "incorrect_loss_per_token": 1.4255701874389128, "correct_loss_per_char": 0.7230928401557767, "incorrect_loss_per_char": 0.7127850937194564, "primary_score": 0.1673469387755102}, "task_idx": 52} |
|
{"task_name": "mmlu_sociology:mc", "task_hash": "8febc5ac38c21f5a0811d42006faf2ea", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_sociology:mc", "task_core": "mmlu_sociology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "sociology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_sociology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.110002279281616, "current_date": "2025-01-28 22:47:18 UTC", "num_instances": 201, "beaker_info": {}, "metrics": {"acc_raw": 0.24378109452736318, "acc_per_token": 0.24378109452736318, "acc_per_char": 0.24378109452736318, "correct_loss_raw": 1.4228800048875572, "incorrect_loss_raw": 1.421098598397984, "correct_loss_per_token": 1.4228800048875572, "incorrect_loss_per_token": 1.421098598397984, "correct_loss_per_char": 0.7114400024437786, "incorrect_loss_per_char": 0.710549299198992, "primary_score": 0.24378109452736318}, "task_idx": 53} |
|
{"task_name": "mmlu_us_foreign_policy:mc", "task_hash": "cceb9539ca6356676c1a014a74093ec9", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_us_foreign_policy:mc", "task_core": "mmlu_us_foreign_policy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "us_foreign_policy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_us_foreign_policy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.6541314125061035, "current_date": "2025-01-28 22:47:25 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.23, "acc_per_token": 0.23, "acc_per_char": 0.23, "correct_loss_raw": 1.434000769853592, "incorrect_loss_raw": 1.4196133653322858, "correct_loss_per_token": 1.434000769853592, "incorrect_loss_per_token": 1.4196133653322858, "correct_loss_per_char": 0.717000384926796, "incorrect_loss_per_char": 0.7098066826661429, "primary_score": 0.23}, "task_idx": 54} |
|
{"task_name": "mmlu_virology:mc", "task_hash": "1b216fb4e04c61029da5dfb32810fabc", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_virology:mc", "task_core": "mmlu_virology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "virology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_virology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.669735908508301, "current_date": "2025-01-28 22:47:28 UTC", "num_instances": 166, "beaker_info": {}, "metrics": {"acc_raw": 0.3072289156626506, "acc_per_token": 0.3072289156626506, "acc_per_char": 0.3072289156626506, "correct_loss_raw": 1.4114296220871339, "incorrect_loss_raw": 1.4285837040847564, "correct_loss_per_token": 1.4114296220871339, "incorrect_loss_per_token": 1.4285837040847564, "correct_loss_per_char": 0.7057148110435669, "incorrect_loss_per_char": 0.7142918520423782, "primary_score": 0.3072289156626506}, "task_idx": 55} |
|
{"task_name": "mmlu_world_religions:mc", "task_hash": "223d634e4c9d91a64ed77b7e259d7010", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_world_religions:mc", "task_core": "mmlu_world_religions", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "world_religions", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_world_religions:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.95239520072937, "current_date": "2025-01-28 22:47:33 UTC", "num_instances": 171, "beaker_info": {}, "metrics": {"acc_raw": 0.25146198830409355, "acc_per_token": 0.25146198830409355, "acc_per_char": 0.25146198830409355, "correct_loss_raw": 1.3977750846517016, "incorrect_loss_raw": 1.4322511996442102, "correct_loss_per_token": 1.3977750846517016, "incorrect_loss_per_token": 1.4322511996442102, "correct_loss_per_char": 0.6988875423258508, "incorrect_loss_per_char": 0.7161255998221051, "primary_score": 0.25146198830409355}, "task_idx": 56} |
|
{"task_name": "mmlu_abstract_algebra", "task_hash": "c85fa3ca2628093d327501718793d07b", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_abstract_algebra", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_abstract_algebra:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.8007991313934326, "current_date": "2025-01-28 22:47:37 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.16, "acc_per_token": 0.19, "acc_per_char": 0.19, "correct_loss_raw": 6.869614138007164, "incorrect_loss_raw": 5.123212166229885, "correct_loss_per_token": 1.8998175663358217, "incorrect_loss_per_token": 2.0451965660127027, "correct_loss_per_char": 0.7431339020727424, "incorrect_loss_per_char": 0.737700834289516, "acc_uncond": 0.26, "correct_loss_uncond": -9.973355069756508, "incorrect_loss_uncond": -9.873269970814388, "primary_score": 0.19}, "task_idx": 57} |
|
{"task_name": "mmlu_anatomy", "task_hash": "3f9b02c965eba1bd23b0446d0e9deff4", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_anatomy", "task_core": "mmlu_anatomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "anatomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_anatomy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.418304681777954, "current_date": "2025-01-28 22:47:41 UTC", "num_instances": 135, "beaker_info": {}, "metrics": {"acc_raw": 0.362962962962963, "acc_per_token": 0.34074074074074073, "acc_per_char": 0.3037037037037037, "correct_loss_raw": 18.050097020025607, "incorrect_loss_raw": 18.311192920767233, "correct_loss_per_token": 2.293636003848102, "incorrect_loss_per_token": 2.62485800422084, "correct_loss_per_char": 0.5213529013542572, "incorrect_loss_per_char": 0.594342243889545, "acc_uncond": 0.3111111111111111, "correct_loss_uncond": -15.044037494394514, "incorrect_loss_uncond": -14.717101418824846, "primary_score": 0.3037037037037037}, "task_idx": 58} |
|
{"task_name": "mmlu_astronomy", "task_hash": "d9e63c18cde7815546c5a54ffadb81f9", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_astronomy", "task_core": "mmlu_astronomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "astronomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_astronomy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.086711645126343, "current_date": "2025-01-28 22:47:47 UTC", "num_instances": 152, "beaker_info": {}, "metrics": {"acc_raw": 0.3092105263157895, "acc_per_token": 0.375, "acc_per_char": 0.3881578947368421, "correct_loss_raw": 25.580662697749702, "incorrect_loss_raw": 25.1740071794443, "correct_loss_per_token": 2.4733851108936378, "incorrect_loss_per_token": 2.85147891723307, "correct_loss_per_char": 0.6145575749730166, "incorrect_loss_per_char": 0.6958959260665767, "acc_uncond": 0.40789473684210525, "correct_loss_uncond": -14.832323157277546, "incorrect_loss_uncond": -13.536962173487014, "primary_score": 0.3881578947368421}, "task_idx": 59} |
|
{"task_name": "mmlu_business_ethics", "task_hash": "dbbf5c673a31d657513075cc70e4f670", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_business_ethics", "task_core": "mmlu_business_ethics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "business_ethics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_business_ethics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.893115282058716, "current_date": "2025-01-28 22:47:57 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.5, "acc_per_token": 0.43, "acc_per_char": 0.45, "correct_loss_raw": 23.453789367675782, "incorrect_loss_raw": 26.393909745216362, "correct_loss_per_token": 3.281713834419038, "incorrect_loss_per_token": 3.63527224490647, "correct_loss_per_char": 0.9186450543984302, "incorrect_loss_per_char": 0.9744966450427746, "acc_uncond": 0.36, "correct_loss_uncond": -12.153233604431152, "incorrect_loss_uncond": -11.242717363039652, "primary_score": 0.45}, "task_idx": 60} |
|
{"task_name": "mmlu_clinical_knowledge", "task_hash": "940022f2e7983e3f56cfc7196b310a7f", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_clinical_knowledge", "task_core": "mmlu_clinical_knowledge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "clinical_knowledge", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_clinical_knowledge:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.811922311782837, "current_date": "2025-01-28 22:48:06 UTC", "num_instances": 265, "beaker_info": {}, "metrics": {"acc_raw": 0.2830188679245283, "acc_per_token": 0.38113207547169814, "acc_per_char": 0.38113207547169814, "correct_loss_raw": 22.123643714639375, "incorrect_loss_raw": 20.55967536917273, "correct_loss_per_token": 2.5528685467264762, "incorrect_loss_per_token": 2.8272442358128638, "correct_loss_per_char": 0.6015267454680685, "incorrect_loss_per_char": 0.6873710059581529, "acc_uncond": 0.3584905660377358, "correct_loss_uncond": -13.87818564142821, "incorrect_loss_uncond": -12.788745701837843, "primary_score": 0.38113207547169814}, "task_idx": 61} |
|
{"task_name": "mmlu_college_biology", "task_hash": "0b879b8081c2b7d376a6abd76697f553", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_biology", "task_core": "mmlu_college_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_biology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.112276077270508, "current_date": "2025-01-28 22:48:21 UTC", "num_instances": 144, "beaker_info": {}, "metrics": {"acc_raw": 0.3333333333333333, "acc_per_token": 0.3333333333333333, "acc_per_char": 0.3819444444444444, "correct_loss_raw": 20.15562465869718, "incorrect_loss_raw": 21.83716755774286, "correct_loss_per_token": 2.5882522416235885, "incorrect_loss_per_token": 3.0003426710170906, "correct_loss_per_char": 0.4998466177829353, "incorrect_loss_per_char": 0.58942911369962, "acc_uncond": 0.3819444444444444, "correct_loss_uncond": -15.927579243150022, "incorrect_loss_uncond": -14.359596281139943, "primary_score": 0.3819444444444444}, "task_idx": 62} |
|
{"task_name": "mmlu_college_chemistry", "task_hash": "0ed8a28c3b6ceca7f72f02bc9b87d236", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_chemistry", "task_core": "mmlu_college_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_chemistry:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.227715730667114, "current_date": "2025-01-28 22:48:32 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.26, "acc_per_token": 0.35, "acc_per_char": 0.26, "correct_loss_raw": 18.376699719429016, "incorrect_loss_raw": 18.637492257754, "correct_loss_per_token": 2.95496215366955, "incorrect_loss_per_token": 3.047901730301063, "correct_loss_per_char": 1.1798262270589766, "incorrect_loss_per_char": 1.1793778047394419, "acc_uncond": 0.25, "correct_loss_uncond": -12.02003538608551, "incorrect_loss_uncond": -11.522469477653503, "primary_score": 0.26}, "task_idx": 63} |
|
{"task_name": "mmlu_college_computer_science", "task_hash": "563c1a7e8c030ab92f3c9359a1196891", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_computer_science", "task_core": "mmlu_college_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_computer_science:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 17.07775902748108, "current_date": "2025-01-28 22:48:40 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.32, "acc_per_token": 0.28, "acc_per_char": 0.27, "correct_loss_raw": 18.364038631916046, "incorrect_loss_raw": 17.90530202150345, "correct_loss_per_token": 2.6942068554548784, "incorrect_loss_per_token": 2.9695804003074513, "correct_loss_per_char": 0.8683093138928093, "incorrect_loss_per_char": 0.8950832082290698, "acc_uncond": 0.35, "correct_loss_uncond": -12.329349439144135, "incorrect_loss_uncond": -12.21822862386703, "primary_score": 0.27}, "task_idx": 64} |
|
{"task_name": "mmlu_college_mathematics", "task_hash": "97a6ddef0d69128d9260dd1f8c82521c", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_mathematics", "task_core": "mmlu_college_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_mathematics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.061524629592896, "current_date": "2025-01-28 22:48:57 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.18, "acc_per_token": 0.25, "acc_per_char": 0.25, "correct_loss_raw": 11.714233223199845, "incorrect_loss_raw": 10.420037207206095, "correct_loss_per_token": 2.9405851893096635, "incorrect_loss_per_token": 2.885563433783357, "correct_loss_per_char": 1.2039596272554867, "incorrect_loss_per_char": 1.1564252400547155, "acc_uncond": 0.3, "correct_loss_uncond": -9.182509263753891, "incorrect_loss_uncond": -8.80835912267367, "primary_score": 0.25}, "task_idx": 65} |
|
{"task_name": "mmlu_college_medicine", "task_hash": "483a77ff3415e8b126e8e83fda055b39", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_medicine", "task_core": "mmlu_college_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_medicine:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.255683660507202, "current_date": "2025-01-28 22:49:07 UTC", "num_instances": 173, "beaker_info": {}, "metrics": {"acc_raw": 0.2947976878612717, "acc_per_token": 0.3179190751445087, "acc_per_char": 0.28901734104046245, "correct_loss_raw": 20.329613751069658, "incorrect_loss_raw": 20.08376282901434, "correct_loss_per_token": 2.6962297764270766, "incorrect_loss_per_token": 2.848341829317948, "correct_loss_per_char": 0.6281389185459614, "incorrect_loss_per_char": 0.6688032917664197, "acc_uncond": 0.31213872832369943, "correct_loss_uncond": -13.599645053031127, "incorrect_loss_uncond": -13.151097627044416, "primary_score": 0.28901734104046245}, "task_idx": 66} |
|
{"task_name": "mmlu_college_physics", "task_hash": "db149cec3fe17117a3fa544e9ea18d10", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_physics", "task_core": "mmlu_college_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_physics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.695809364318848, "current_date": "2025-01-28 22:49:21 UTC", "num_instances": 102, "beaker_info": {}, "metrics": {"acc_raw": 0.17647058823529413, "acc_per_token": 0.20588235294117646, "acc_per_char": 0.18627450980392157, "correct_loss_raw": 12.276802088700089, "incorrect_loss_raw": 10.65681253110661, "correct_loss_per_token": 2.6965595728334724, "incorrect_loss_per_token": 2.4891722812701556, "correct_loss_per_char": 1.078457387054979, "incorrect_loss_per_char": 0.9895847622831168, "acc_uncond": 0.2549019607843137, "correct_loss_uncond": -11.833522378229627, "incorrect_loss_uncond": -11.821572918907492, "primary_score": 0.18627450980392157}, "task_idx": 67} |
|
{"task_name": "mmlu_computer_security", "task_hash": "4a7052996611caebbf6877da200249e9", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_computer_security", "task_core": "mmlu_computer_security", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "computer_security", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_computer_security:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.392957925796509, "current_date": "2025-01-28 22:49:29 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.37, "acc_per_token": 0.42, "acc_per_char": 0.43, "correct_loss_raw": 22.6862280023098, "incorrect_loss_raw": 22.077150499820704, "correct_loss_per_token": 3.5369763457558765, "incorrect_loss_per_token": 4.166081396972171, "correct_loss_per_char": 0.8252690707760125, "incorrect_loss_per_char": 0.9430687283352381, "acc_uncond": 0.47, "correct_loss_uncond": -12.012285667657853, "incorrect_loss_uncond": -9.246938327948254, "primary_score": 0.43}, "task_idx": 68} |
|
{"task_name": "mmlu_conceptual_physics", "task_hash": "f183468e707d67350aa3143009a25cb4", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_conceptual_physics", "task_core": "mmlu_conceptual_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "conceptual_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_conceptual_physics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.816831827163696, "current_date": "2025-01-28 22:49:34 UTC", "num_instances": 235, "beaker_info": {}, "metrics": {"acc_raw": 0.44680851063829785, "acc_per_token": 0.39574468085106385, "acc_per_char": 0.3872340425531915, "correct_loss_raw": 9.180902650508475, "incorrect_loss_raw": 11.18278905656321, "correct_loss_per_token": 2.9558468813155776, "incorrect_loss_per_token": 3.725539554479406, "correct_loss_per_char": 0.6056277324516344, "incorrect_loss_per_char": 0.7330539230596994, "acc_uncond": 0.3191489361702128, "correct_loss_uncond": -11.380888761358058, "incorrect_loss_uncond": -9.883389157733163, "primary_score": 0.3872340425531915}, "task_idx": 69} |
|
{"task_name": "mmlu_econometrics", "task_hash": "f07b012d85c15887c3dce1c9c732f2cd", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_econometrics", "task_core": "mmlu_econometrics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "econometrics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_econometrics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.971760272979736, "current_date": "2025-01-28 22:49:44 UTC", "num_instances": 114, "beaker_info": {}, "metrics": {"acc_raw": 0.2719298245614035, "acc_per_token": 0.2807017543859649, "acc_per_char": 0.2719298245614035, "correct_loss_raw": 20.309982992055122, "incorrect_loss_raw": 21.030795758927777, "correct_loss_per_token": 2.264997538978219, "incorrect_loss_per_token": 2.1897822494700447, "correct_loss_per_char": 0.5448100617714338, "incorrect_loss_per_char": 0.5515671949345781, "acc_uncond": 0.2894736842105263, "correct_loss_uncond": -15.18037696261155, "incorrect_loss_uncond": -15.323047515244507, "primary_score": 0.2719298245614035}, "task_idx": 70} |
|
{"task_name": "mmlu_electrical_engineering", "task_hash": "4dd791561a029e99d7a01f69b382e913", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_electrical_engineering", "task_core": "mmlu_electrical_engineering", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "electrical_engineering", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_electrical_engineering:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.327355861663818, "current_date": "2025-01-28 22:49:57 UTC", "num_instances": 145, "beaker_info": {}, "metrics": {"acc_raw": 0.27586206896551724, "acc_per_token": 0.33793103448275863, "acc_per_char": 0.3310344827586207, "correct_loss_raw": 13.682885837554931, "incorrect_loss_raw": 13.75903522063945, "correct_loss_per_token": 3.3592418816663727, "incorrect_loss_per_token": 3.7358111697527363, "correct_loss_per_char": 0.9527622174152375, "incorrect_loss_per_char": 0.9747887331770683, "acc_uncond": 0.2689655172413793, "correct_loss_uncond": -9.166813426182188, "incorrect_loss_uncond": -9.70173498186572, "primary_score": 0.3310344827586207}, "task_idx": 71} |
|
{"task_name": "mmlu_elementary_mathematics", "task_hash": "34eb4bd85bcf6cf6a0740154b20610f9", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_elementary_mathematics", "task_core": "mmlu_elementary_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "elementary_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_elementary_mathematics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 23.46187472343445, "current_date": "2025-01-28 22:50:07 UTC", "num_instances": 378, "beaker_info": {}, "metrics": {"acc_raw": 0.22486772486772486, "acc_per_token": 0.23809523809523808, "acc_per_char": 0.23015873015873015, "correct_loss_raw": 12.500810869471737, "incorrect_loss_raw": 12.490979615973412, "correct_loss_per_token": 4.041004668331812, "incorrect_loss_per_token": 4.105295090620753, "correct_loss_per_char": 1.598499868102426, "incorrect_loss_per_char": 1.6004921054290886, "acc_uncond": 0.25132275132275134, "correct_loss_uncond": -8.476502858457112, "incorrect_loss_uncond": -8.33731818367355, "primary_score": 0.23015873015873015}, "task_idx": 72} |
|
{"task_name": "mmlu_formal_logic", "task_hash": "edba816f035a5a7d7df7dae63a847ed4", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_formal_logic", "task_core": "mmlu_formal_logic", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "formal_logic", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_formal_logic:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.245741605758667, "current_date": "2025-01-28 22:50:31 UTC", "num_instances": 126, "beaker_info": {}, "metrics": {"acc_raw": 0.30952380952380953, "acc_per_token": 0.2698412698412698, "acc_per_char": 0.30158730158730157, "correct_loss_raw": 24.893998077937535, "incorrect_loss_raw": 26.133295801581532, "correct_loss_per_token": 2.7116675138348123, "incorrect_loss_per_token": 2.7241648390492945, "correct_loss_per_char": 1.1867459102371267, "incorrect_loss_per_char": 1.2454240104438674, "acc_uncond": 0.2222222222222222, "correct_loss_uncond": -25.99389488734896, "incorrect_loss_uncond": -26.22075840468129, "primary_score": 0.30158730158730157}, "task_idx": 73} |
|
{"task_name": "mmlu_global_facts", "task_hash": "83faa1c084d9844ed22d2f870171a354", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_global_facts", "task_core": "mmlu_global_facts", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "global_facts", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_global_facts:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.952316522598267, "current_date": "2025-01-28 22:50:44 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.27, "acc_per_token": 0.24, "acc_per_char": 0.25, "correct_loss_raw": 8.278837785720825, "incorrect_loss_raw": 8.835856211582822, "correct_loss_per_token": 2.8056958920990214, "incorrect_loss_per_token": 2.8483650020497873, "correct_loss_per_char": 1.1085497451363076, "incorrect_loss_per_char": 1.1334918764757753, "acc_uncond": 0.2, "correct_loss_uncond": -6.920755653381348, "incorrect_loss_uncond": -7.079783718188605, "primary_score": 0.25}, "task_idx": 74} |
|
{"task_name": "mmlu_high_school_biology", "task_hash": "40305e6449b4c634cf3858f0cb1a9ea0", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_biology", "task_core": "mmlu_high_school_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_biology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 22.631765127182007, "current_date": "2025-01-28 22:50:50 UTC", "num_instances": 310, "beaker_info": {}, "metrics": {"acc_raw": 0.35161290322580646, "acc_per_token": 0.4032258064516129, "acc_per_char": 0.4064516129032258, "correct_loss_raw": 21.607249181885873, "incorrect_loss_raw": 22.094821071624768, "correct_loss_per_token": 2.546654726542327, "incorrect_loss_per_token": 2.941898022847176, "correct_loss_per_char": 0.5462745469798623, "incorrect_loss_per_char": 0.5900671546020984, "acc_uncond": 0.3870967741935484, "correct_loss_uncond": -13.925496973145393, "incorrect_loss_uncond": -12.674920809653498, "primary_score": 0.4064516129032258}, "task_idx": 75} |
|
{"task_name": "mmlu_high_school_chemistry", "task_hash": "c148a2f0c73c4d2e8a363125f171f603", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_chemistry", "task_core": "mmlu_high_school_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_chemistry:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 16.474448919296265, "current_date": "2025-01-28 22:51:12 UTC", "num_instances": 203, "beaker_info": {}, "metrics": {"acc_raw": 0.20689655172413793, "acc_per_token": 0.24630541871921183, "acc_per_char": 0.2413793103448276, "correct_loss_raw": 20.41424568592034, "incorrect_loss_raw": 18.734923248807792, "correct_loss_per_token": 2.638010876369508, "incorrect_loss_per_token": 2.577175435616267, "correct_loss_per_char": 0.893972889317772, "incorrect_loss_per_char": 0.8817604139892572, "acc_uncond": 0.3103448275862069, "correct_loss_uncond": -13.46020593960297, "incorrect_loss_uncond": -13.278387852294497, "primary_score": 0.2413793103448276}, "task_idx": 76} |
|
{"task_name": "mmlu_high_school_computer_science", "task_hash": "7f237d33901391c40fe99221b7fc7df2", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_computer_science", "task_core": "mmlu_high_school_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_computer_science:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.995660305023193, "current_date": "2025-01-28 22:51:29 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.25, "acc_per_token": 0.27, "acc_per_char": 0.31, "correct_loss_raw": 24.24228214800358, "incorrect_loss_raw": 24.508576377630238, "correct_loss_per_token": 2.729215729422765, "incorrect_loss_per_token": 2.921161604473935, "correct_loss_per_char": 0.8552203291511253, "incorrect_loss_per_char": 0.920918109466471, "acc_uncond": 0.27, "correct_loss_uncond": -15.800592300295829, "incorrect_loss_uncond": -15.446470299959188, "primary_score": 0.31}, "task_idx": 77} |
|
{"task_name": "mmlu_high_school_european_history", "task_hash": "bce04ae918d4f75bd0e71aeb5508ea76", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_european_history", "task_core": "mmlu_high_school_european_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_european_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_european_history:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 90.8071620464325, "current_date": "2025-01-28 22:51:42 UTC", "num_instances": 165, "beaker_info": {}, "metrics": {"acc_raw": 0.2909090909090909, "acc_per_token": 0.4, "acc_per_char": 0.4121212121212121, "correct_loss_raw": 28.99259257966822, "incorrect_loss_raw": 28.175486922023275, "correct_loss_per_token": 2.664430461905858, "incorrect_loss_per_token": 3.1671600496326557, "correct_loss_per_char": 0.48344800734399823, "incorrect_loss_per_char": 0.5706152578899196, "acc_uncond": 0.4, "correct_loss_uncond": -15.019520569570137, "incorrect_loss_uncond": -13.166462616005328, "primary_score": 0.4121212121212121}, "task_idx": 78} |
|
{"task_name": "mmlu_high_school_geography", "task_hash": "2451a97e8ea5ba8e49d0f60db615137b", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_geography", "task_core": "mmlu_high_school_geography", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_geography", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_geography:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.376394748687744, "current_date": "2025-01-28 22:53:13 UTC", "num_instances": 198, "beaker_info": {}, "metrics": {"acc_raw": 0.35353535353535354, "acc_per_token": 0.41414141414141414, "acc_per_char": 0.398989898989899, "correct_loss_raw": 14.893016030391058, "incorrect_loss_raw": 15.203946114589865, "correct_loss_per_token": 3.2453381852729546, "incorrect_loss_per_token": 3.7009779211436875, "correct_loss_per_char": 0.6177910502396351, "incorrect_loss_per_char": 0.7281685944975731, "acc_uncond": 0.45454545454545453, "correct_loss_uncond": -10.973916836158194, "incorrect_loss_uncond": -9.519376937588463, "primary_score": 0.398989898989899}, "task_idx": 79} |
|
{"task_name": "mmlu_high_school_government_and_politics", "task_hash": "432e3dd431e2137bb51952baabfe8d40", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_government_and_politics", "task_core": "mmlu_high_school_government_and_politics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_government_and_politics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_government_and_politics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.984575748443604, "current_date": "2025-01-28 22:53:23 UTC", "num_instances": 193, "beaker_info": {}, "metrics": {"acc_raw": 0.37823834196891193, "acc_per_token": 0.41450777202072536, "acc_per_char": 0.42487046632124353, "correct_loss_raw": 22.53443668095559, "incorrect_loss_raw": 23.97239570872978, "correct_loss_per_token": 2.3713786314612118, "incorrect_loss_per_token": 2.839619713869773, "correct_loss_per_char": 0.40028085378000405, "incorrect_loss_per_char": 0.4791517659915154, "acc_uncond": 0.43523316062176165, "correct_loss_uncond": -16.59763565412457, "incorrect_loss_uncond": -13.856588512298767, "primary_score": 0.42487046632124353}, "task_idx": 80} |
|
{"task_name": "mmlu_high_school_macroeconomics", "task_hash": "fa28d7d574940324e3f18cc755314008", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_macroeconomics", "task_core": "mmlu_high_school_macroeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_macroeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_macroeconomics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 20.716152667999268, "current_date": "2025-01-28 22:53:36 UTC", "num_instances": 390, "beaker_info": {}, "metrics": {"acc_raw": 0.2564102564102564, "acc_per_token": 0.3128205128205128, "acc_per_char": 0.2846153846153846, "correct_loss_raw": 23.32332377005846, "incorrect_loss_raw": 23.13647856590075, "correct_loss_per_token": 2.8958814854123323, "incorrect_loss_per_token": 3.039365654909822, "correct_loss_per_char": 0.6132319119590228, "incorrect_loss_per_char": 0.6297969417394456, "acc_uncond": 0.31794871794871793, "correct_loss_uncond": -16.318520267804463, "incorrect_loss_uncond": -15.51478351364787, "primary_score": 0.2846153846153846}, "task_idx": 81} |
|
{"task_name": "mmlu_high_school_mathematics", "task_hash": "d35dafac7b92c7adc6cb83bfcf827620", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_mathematics", "task_core": "mmlu_high_school_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_mathematics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 16.076260089874268, "current_date": "2025-01-28 22:53:57 UTC", "num_instances": 270, "beaker_info": {}, "metrics": {"acc_raw": 0.13333333333333333, "acc_per_token": 0.16296296296296298, "acc_per_char": 0.15555555555555556, "correct_loss_raw": 9.204976223133228, "incorrect_loss_raw": 8.08783481268235, "correct_loss_per_token": 4.325949969449563, "incorrect_loss_per_token": 4.017497469068086, "correct_loss_per_char": 1.760739348124492, "incorrect_loss_per_char": 1.6445159907803975, "acc_uncond": 0.25555555555555554, "correct_loss_uncond": -5.782799828494037, "incorrect_loss_uncond": -5.6601883399633754, "primary_score": 0.15555555555555556}, "task_idx": 82} |
|
{"task_name": "mmlu_high_school_microeconomics", "task_hash": "9b84847fb5a13e1e48dfd2e71e7dfdc5", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_microeconomics", "task_core": "mmlu_high_school_microeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_microeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_microeconomics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.753453493118286, "current_date": "2025-01-28 22:54:13 UTC", "num_instances": 238, "beaker_info": {}, "metrics": {"acc_raw": 0.29831932773109243, "acc_per_token": 0.3277310924369748, "acc_per_char": 0.3445378151260504, "correct_loss_raw": 28.13104149573991, "incorrect_loss_raw": 27.048356812875138, "correct_loss_per_token": 3.0403787622264153, "incorrect_loss_per_token": 3.18991426331285, "correct_loss_per_char": 0.6511666080516382, "incorrect_loss_per_char": 0.6694161241463367, "acc_uncond": 0.29411764705882354, "correct_loss_uncond": -16.016987619279814, "incorrect_loss_uncond": -15.557740727058636, "primary_score": 0.3445378151260504}, "task_idx": 83} |
|
{"task_name": "mmlu_high_school_physics", "task_hash": "2438f80fa949fdfba5fd0982a3e13ce8", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_physics", "task_core": "mmlu_high_school_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_physics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.358663082122803, "current_date": "2025-01-28 22:54:26 UTC", "num_instances": 151, "beaker_info": {}, "metrics": {"acc_raw": 0.2185430463576159, "acc_per_token": 0.24503311258278146, "acc_per_char": 0.25165562913907286, "correct_loss_raw": 21.145936244370922, "incorrect_loss_raw": 20.615255395571385, "correct_loss_per_token": 2.4163758161593165, "incorrect_loss_per_token": 2.3898283100977675, "correct_loss_per_char": 0.8514601195880208, "incorrect_loss_per_char": 0.8441124034745007, "acc_uncond": 0.2582781456953642, "correct_loss_uncond": -15.770635069600793, "incorrect_loss_uncond": -15.784137553463449, "primary_score": 0.25165562913907286}, "task_idx": 84} |
|
{"task_name": "mmlu_high_school_psychology", "task_hash": "e5c6b909fb842973d0ba75f8fad285a1", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_psychology", "task_core": "mmlu_high_school_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_psychology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 40.372140407562256, "current_date": "2025-01-28 22:54:40 UTC", "num_instances": 545, "beaker_info": {}, "metrics": {"acc_raw": 0.44770642201834865, "acc_per_token": 0.44220183486238535, "acc_per_char": 0.43669724770642204, "correct_loss_raw": 15.492815904715739, "incorrect_loss_raw": 17.66775804688806, "correct_loss_per_token": 3.2219814424440893, "incorrect_loss_per_token": 4.018716729662848, "correct_loss_per_char": 0.5360055991624847, "incorrect_loss_per_char": 0.6700789690531939, "acc_uncond": 0.41467889908256883, "correct_loss_uncond": -13.453667054646607, "incorrect_loss_uncond": -11.540014688975951, "primary_score": 0.43669724770642204}, "task_idx": 85} |
|
{"task_name": "mmlu_high_school_statistics", "task_hash": "c5e879c445098b25ee27496e3b91777c", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_statistics", "task_core": "mmlu_high_school_statistics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_statistics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_statistics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 26.359761238098145, "current_date": "2025-01-28 22:55:21 UTC", "num_instances": 216, "beaker_info": {}, "metrics": {"acc_raw": 0.28703703703703703, "acc_per_token": 0.3287037037037037, "acc_per_char": 0.3055555555555556, "correct_loss_raw": 26.200835422785193, "incorrect_loss_raw": 27.084853073697037, "correct_loss_per_token": 2.6154795940156275, "incorrect_loss_per_token": 2.7054257533621136, "correct_loss_per_char": 0.7976069811799237, "incorrect_loss_per_char": 0.8407188293755185, "acc_uncond": 0.30092592592592593, "correct_loss_uncond": -17.29601466159026, "incorrect_loss_uncond": -16.46438581119348, "primary_score": 0.3055555555555556}, "task_idx": 86} |
|
{"task_name": "mmlu_high_school_us_history", "task_hash": "07edfc83a12773340cdb716671b46541", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_us_history", "task_core": "mmlu_high_school_us_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_us_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_us_history:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 110.82937216758728, "current_date": "2025-01-28 22:55:47 UTC", "num_instances": 204, "beaker_info": {}, "metrics": {"acc_raw": 0.31862745098039214, "acc_per_token": 0.31862745098039214, "acc_per_char": 0.37254901960784315, "correct_loss_raw": 26.618980217213725, "incorrect_loss_raw": 27.263960099103404, "correct_loss_per_token": 2.588034789381987, "incorrect_loss_per_token": 2.8024372832035565, "correct_loss_per_char": 0.4853787438826697, "incorrect_loss_per_char": 0.5319232386880215, "acc_uncond": 0.3382352941176471, "correct_loss_uncond": -13.67589058011186, "incorrect_loss_uncond": -12.063057624241877, "primary_score": 0.37254901960784315}, "task_idx": 87} |
|
{"task_name": "mmlu_high_school_world_history", "task_hash": "38f161e2f228b6acfe7cb1aa36d0d3ef", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_world_history", "task_core": "mmlu_high_school_world_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_world_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_world_history:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 79.43909883499146, "current_date": "2025-01-28 22:57:38 UTC", "num_instances": 237, "beaker_info": {}, "metrics": {"acc_raw": 0.28270042194092826, "acc_per_token": 0.350210970464135, "acc_per_char": 0.29535864978902954, "correct_loss_raw": 31.26888810431404, "incorrect_loss_raw": 30.740597895932098, "correct_loss_per_token": 2.928289248436712, "incorrect_loss_per_token": 3.231146465106264, "correct_loss_per_char": 0.5425519997491743, "incorrect_loss_per_char": 0.5788576879738572, "acc_uncond": 0.3755274261603376, "correct_loss_uncond": -13.914379819033016, "incorrect_loss_uncond": -12.460183474249638, "primary_score": 0.29535864978902954}, "task_idx": 88} |
|
{"task_name": "mmlu_human_aging", "task_hash": "8c66e7db317c293ebcd7cd3ad67b5840", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_aging", "task_core": "mmlu_human_aging", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_aging", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_aging:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.16870927810669, "current_date": "2025-01-28 22:58:57 UTC", "num_instances": 223, "beaker_info": {}, "metrics": {"acc_raw": 0.3901345291479821, "acc_per_token": 0.39461883408071746, "acc_per_char": 0.3901345291479821, "correct_loss_raw": 13.265995577579123, "incorrect_loss_raw": 15.933169311472115, "correct_loss_per_token": 3.1792583123113225, "incorrect_loss_per_token": 3.6824491460763893, "correct_loss_per_char": 0.5931539026499014, "incorrect_loss_per_char": 0.7213349821239929, "acc_uncond": 0.35874439461883406, "correct_loss_uncond": -9.616355861516277, "incorrect_loss_uncond": -8.394420486929164, "primary_score": 0.3901345291479821}, "task_idx": 89} |
|
{"task_name": "mmlu_human_sexuality", "task_hash": "f3dcb40d784b716dae889d9bf3c62232", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_sexuality", "task_core": "mmlu_human_sexuality", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_sexuality", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_sexuality:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.6616621017456055, "current_date": "2025-01-28 22:59:08 UTC", "num_instances": 131, "beaker_info": {}, "metrics": {"acc_raw": 0.33587786259541985, "acc_per_token": 0.3969465648854962, "acc_per_char": 0.35877862595419846, "correct_loss_raw": 15.421348831125798, "incorrect_loss_raw": 17.013223619558115, "correct_loss_per_token": 3.232259738158243, "incorrect_loss_per_token": 3.8039631588523775, "correct_loss_per_char": 0.6779892372553639, "incorrect_loss_per_char": 0.7219905285426741, "acc_uncond": 0.2595419847328244, "correct_loss_uncond": -11.125376795084422, "incorrect_loss_uncond": -11.816082447843085, "primary_score": 0.35877862595419846}, "task_idx": 90} |
|
{"task_name": "mmlu_international_law", "task_hash": "b4d3ab839d093262fe791e56c98053df", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_international_law", "task_core": "mmlu_international_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "international_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_international_law:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.197525978088379, "current_date": "2025-01-28 22:59:15 UTC", "num_instances": 121, "beaker_info": {}, "metrics": {"acc_raw": 0.2066115702479339, "acc_per_token": 0.32231404958677684, "acc_per_char": 0.3140495867768595, "correct_loss_raw": 49.143299920499814, "incorrect_loss_raw": 35.66231494435924, "correct_loss_per_token": 2.4919706925454244, "incorrect_loss_per_token": 2.6574130665678393, "correct_loss_per_char": 0.45580109453343454, "incorrect_loss_per_char": 0.47563891910474376, "acc_uncond": 0.4132231404958678, "correct_loss_uncond": -25.122074539011177, "incorrect_loss_uncond": -22.341709102809265, "primary_score": 0.3140495867768595}, "task_idx": 91} |
|
{"task_name": "mmlu_jurisprudence", "task_hash": "a5a3583aea5dbd6ece8896b0140522f5", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_jurisprudence", "task_core": "mmlu_jurisprudence", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "jurisprudence", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_jurisprudence:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.853705167770386, "current_date": "2025-01-28 22:59:25 UTC", "num_instances": 108, "beaker_info": {}, "metrics": {"acc_raw": 0.2037037037037037, "acc_per_token": 0.23148148148148148, "acc_per_char": 0.26851851851851855, "correct_loss_raw": 28.849514014191097, "incorrect_loss_raw": 23.601570882914984, "correct_loss_per_token": 3.4083927781709167, "incorrect_loss_per_token": 3.690943605943219, "correct_loss_per_char": 0.6617516235075503, "incorrect_loss_per_char": 0.6789110459237553, "acc_uncond": 0.3148148148148148, "correct_loss_uncond": -13.126620374344013, "incorrect_loss_uncond": -12.157751857498544, "primary_score": 0.26851851851851855}, "task_idx": 92} |
|
{"task_name": "mmlu_logical_fallacies", "task_hash": "87754a93f67c5e3682212e20e26d138f", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_logical_fallacies", "task_core": "mmlu_logical_fallacies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "logical_fallacies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_logical_fallacies:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.98760724067688, "current_date": "2025-01-28 22:59:32 UTC", "num_instances": 163, "beaker_info": {}, "metrics": {"acc_raw": 0.3128834355828221, "acc_per_token": 0.32515337423312884, "acc_per_char": 0.32515337423312884, "correct_loss_raw": 25.324365175574837, "incorrect_loss_raw": 24.82704315205051, "correct_loss_per_token": 3.6489869125370564, "incorrect_loss_per_token": 3.9748963475403993, "correct_loss_per_char": 0.6471265034347871, "incorrect_loss_per_char": 0.727814055247372, "acc_uncond": 0.3496932515337423, "correct_loss_uncond": -12.140092412386934, "incorrect_loss_uncond": -10.665011691657075, "primary_score": 0.32515337423312884}, "task_idx": 93} |
|
{"task_name": "mmlu_machine_learning", "task_hash": "c7a50715045d63764fe2fc8c95f84e4e", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_machine_learning", "task_core": "mmlu_machine_learning", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "machine_learning", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_machine_learning:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.166738748550415, "current_date": "2025-01-28 22:59:43 UTC", "num_instances": 112, "beaker_info": {}, "metrics": {"acc_raw": 0.26785714285714285, "acc_per_token": 0.23214285714285715, "acc_per_char": 0.2767857142857143, "correct_loss_raw": 19.73031181735652, "incorrect_loss_raw": 19.962696756635385, "correct_loss_per_token": 3.962941681372349, "incorrect_loss_per_token": 3.890866119612744, "correct_loss_per_char": 1.019208526942912, "incorrect_loss_per_char": 1.0051290247251965, "acc_uncond": 0.25, "correct_loss_uncond": -7.80760141994272, "incorrect_loss_uncond": -7.384287642581122, "primary_score": 0.2767857142857143}, "task_idx": 94} |
|
{"task_name": "mmlu_management", "task_hash": "bb2a328db2333c8df600dba174c2c4f7", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_management", "task_core": "mmlu_management", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "management", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_management:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.456880807876587, "current_date": "2025-01-28 22:59:52 UTC", "num_instances": 103, "beaker_info": {}, "metrics": {"acc_raw": 0.30097087378640774, "acc_per_token": 0.4077669902912621, "acc_per_char": 0.4368932038834951, "correct_loss_raw": 14.372589470692052, "incorrect_loss_raw": 14.791216655842307, "correct_loss_per_token": 3.7546848435269626, "incorrect_loss_per_token": 4.193690584315543, "correct_loss_per_char": 0.6444644352395216, "incorrect_loss_per_char": 0.7096893629869219, "acc_uncond": 0.42718446601941745, "correct_loss_uncond": -9.07513473276953, "incorrect_loss_uncond": -7.9406714099896405, "primary_score": 0.4368932038834951}, "task_idx": 95} |
|
{"task_name": "mmlu_marketing", "task_hash": "58c595b7c49dba71f3aa397880a13a84", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_marketing", "task_core": "mmlu_marketing", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "marketing", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_marketing:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.927972316741943, "current_date": "2025-01-28 22:59:57 UTC", "num_instances": 234, "beaker_info": {}, "metrics": {"acc_raw": 0.49572649572649574, "acc_per_token": 0.49572649572649574, "acc_per_char": 0.49572649572649574, "correct_loss_raw": 13.834066818412552, "incorrect_loss_raw": 16.770095258017214, "correct_loss_per_token": 2.8060586235688953, "incorrect_loss_per_token": 3.6331203108692036, "correct_loss_per_char": 0.5785293789824133, "incorrect_loss_per_char": 0.7565314767293014, "acc_uncond": 0.5085470085470085, "correct_loss_uncond": -12.960387743945814, "incorrect_loss_uncond": -10.329098611136104, "primary_score": 0.49572649572649574}, "task_idx": 96} |
|
{"task_name": "mmlu_medical_genetics", "task_hash": "36a9fec8301b47f23d8ced742c53d402", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_medical_genetics", "task_core": "mmlu_medical_genetics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "medical_genetics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_medical_genetics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.598636150360107, "current_date": "2025-01-28 23:00:12 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.35, "acc_per_token": 0.38, "acc_per_char": 0.4, "correct_loss_raw": 15.001992144584655, "incorrect_loss_raw": 14.109933145046236, "correct_loss_per_token": 2.5846105495765688, "incorrect_loss_per_token": 2.9764702960370695, "correct_loss_per_char": 0.6665677303876627, "incorrect_loss_per_char": 0.7574390813948088, "acc_uncond": 0.37, "correct_loss_uncond": -13.242882170677184, "incorrect_loss_uncond": -11.95330838123957, "primary_score": 0.4}, "task_idx": 97} |
|
{"task_name": "mmlu_miscellaneous", "task_hash": "3ce7aa82135b0926faa1a6d49e1f073f", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_miscellaneous", "task_core": "mmlu_miscellaneous", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "miscellaneous", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_miscellaneous:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 31.48420763015747, "current_date": "2025-01-28 23:00:16 UTC", "num_instances": 783, "beaker_info": {}, "metrics": {"acc_raw": 0.4661558109833972, "acc_per_token": 0.4508301404853129, "acc_per_char": 0.4648786717752235, "correct_loss_raw": 10.563235344767266, "incorrect_loss_raw": 12.620073988900485, "correct_loss_per_token": 3.094379709762898, "incorrect_loss_per_token": 4.042740800844056, "correct_loss_per_char": 0.6668340255124663, "incorrect_loss_per_char": 0.8802967710188774, "acc_uncond": 0.4840357598978289, "correct_loss_uncond": -10.465876538729912, "incorrect_loss_uncond": -8.39672588913833, "primary_score": 0.4648786717752235}, "task_idx": 98} |
|
{"task_name": "mmlu_moral_disputes", "task_hash": "643b3f1a385bb8b4ce6a53105fffb3de", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_disputes", "task_core": "mmlu_moral_disputes", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_disputes", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_disputes:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 24.030919075012207, "current_date": "2025-01-28 23:00:48 UTC", "num_instances": 346, "beaker_info": {}, "metrics": {"acc_raw": 0.2745664739884393, "acc_per_token": 0.30346820809248554, "acc_per_char": 0.25722543352601157, "correct_loss_raw": 28.29641576033796, "incorrect_loss_raw": 25.717709578071222, "correct_loss_per_token": 3.1054814251087093, "incorrect_loss_per_token": 3.2459228834051115, "correct_loss_per_char": 0.6142554110442542, "incorrect_loss_per_char": 0.6158678736943969, "acc_uncond": 0.3208092485549133, "correct_loss_uncond": -14.065454878559002, "incorrect_loss_uncond": -13.653906957262524, "primary_score": 0.25722543352601157}, "task_idx": 99} |
|
{"task_name": "mmlu_moral_scenarios", "task_hash": "49d4bc1cb20a4596312dda1c40b5467e", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_scenarios", "task_core": "mmlu_moral_scenarios", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_scenarios", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_scenarios:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 116.58157849311829, "current_date": "2025-01-28 23:01:12 UTC", "num_instances": 895, "beaker_info": {}, "metrics": {"acc_raw": 0.23798882681564246, "acc_per_token": 0.23798882681564246, "acc_per_char": 0.23798882681564246, "correct_loss_raw": 1.8738852696378803, "incorrect_loss_raw": 1.8248078697664563, "correct_loss_per_token": 0.4412940012542896, "incorrect_loss_per_token": 0.43430041784754714, "correct_loss_per_char": 0.10404536119186, "incorrect_loss_per_char": 0.10233466985782204, "acc_uncond": 0.25251396648044694, "correct_loss_uncond": -19.073117448131466, "incorrect_loss_uncond": -18.97169191604218, "primary_score": 0.23798882681564246}, "task_idx": 100} |
|
{"task_name": "mmlu_nutrition", "task_hash": "96b6d39ad9e2a3d1f6444ca444eafe21", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_nutrition", "task_core": "mmlu_nutrition", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "nutrition", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_nutrition:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 24.05422830581665, "current_date": "2025-01-28 23:03:09 UTC", "num_instances": 306, "beaker_info": {}, "metrics": {"acc_raw": 0.27450980392156865, "acc_per_token": 0.3366013071895425, "acc_per_char": 0.3235294117647059, "correct_loss_raw": 26.88473216458863, "incorrect_loss_raw": 23.947185122369444, "correct_loss_per_token": 2.7123445598352913, "incorrect_loss_per_token": 2.9259598929731725, "correct_loss_per_char": 0.5916553396222153, "incorrect_loss_per_char": 0.6382808079213305, "acc_uncond": 0.34967320261437906, "correct_loss_uncond": -11.458059841511297, "incorrect_loss_uncond": -11.157264869197515, "primary_score": 0.3235294117647059}, "task_idx": 101} |
|
{"task_name": "mmlu_philosophy", "task_hash": "e8a8e079a41710f36b2b11993287bbfb", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_philosophy", "task_core": "mmlu_philosophy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "philosophy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_philosophy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 16.96663212776184, "current_date": "2025-01-28 23:03:33 UTC", "num_instances": 311, "beaker_info": {}, "metrics": {"acc_raw": 0.2733118971061093, "acc_per_token": 0.2604501607717042, "acc_per_char": 0.2861736334405145, "correct_loss_raw": 24.160437655602237, "incorrect_loss_raw": 22.095235615553445, "correct_loss_per_token": 3.456400314243523, "incorrect_loss_per_token": 3.523076761974967, "correct_loss_per_char": 0.6824404975617542, "incorrect_loss_per_char": 0.6849178703389117, "acc_uncond": 0.3247588424437299, "correct_loss_uncond": -12.740601260945727, "incorrect_loss_uncond": -12.200350506895989, "primary_score": 0.2861736334405145}, "task_idx": 102} |
|
{"task_name": "mmlu_prehistory", "task_hash": "7b3aeaaf8c8020231ef7fed4751f86c2", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_prehistory", "task_core": "mmlu_prehistory", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "prehistory", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_prehistory:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 20.940000772476196, "current_date": "2025-01-28 23:03:50 UTC", "num_instances": 324, "beaker_info": {}, "metrics": {"acc_raw": 0.3950617283950617, "acc_per_token": 0.3549382716049383, "acc_per_char": 0.3425925925925926, "correct_loss_raw": 21.544764254840068, "incorrect_loss_raw": 23.408556957671667, "correct_loss_per_token": 2.719362407400458, "incorrect_loss_per_token": 3.061571254592011, "correct_loss_per_char": 0.6246061464464701, "incorrect_loss_per_char": 0.6890525985713941, "acc_uncond": 0.3395061728395062, "correct_loss_uncond": -15.288531370093057, "incorrect_loss_uncond": -14.466170308153329, "primary_score": 0.3425925925925926}, "task_idx": 103} |
|
{"task_name": "mmlu_professional_accounting", "task_hash": "271a9bf402980f6076d2237f6c3d56d5", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_accounting", "task_core": "mmlu_professional_accounting", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_accounting", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_accounting:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 35.53977155685425, "current_date": "2025-01-28 23:04:11 UTC", "num_instances": 282, "beaker_info": {}, "metrics": {"acc_raw": 0.25886524822695034, "acc_per_token": 0.24113475177304963, "acc_per_char": 0.24113475177304963, "correct_loss_raw": 26.63527681641545, "incorrect_loss_raw": 26.714282638355925, "correct_loss_per_token": 3.1146707298600944, "incorrect_loss_per_token": 3.1343855241720178, "correct_loss_per_char": 0.8344222590911455, "incorrect_loss_per_char": 0.855965238549494, "acc_uncond": 0.23404255319148937, "correct_loss_uncond": -12.230710915639891, "incorrect_loss_uncond": -11.857481007880352, "primary_score": 0.24113475177304963}, "task_idx": 104} |
|
{"task_name": "mmlu_professional_law", "task_hash": "9cf2ca304d70aaad2023633d91fbfefa", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_law", "task_core": "mmlu_professional_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_law:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 498.7431628704071, "current_date": "2025-01-28 23:04:46 UTC", "num_instances": 1534, "beaker_info": {}, "metrics": {"acc_raw": 0.2470664928292047, "acc_per_token": 0.27249022164276404, "acc_per_char": 0.28096479791395046, "correct_loss_raw": 42.60308619288453, "incorrect_loss_raw": 40.950438800893174, "correct_loss_per_token": 2.3378126243416184, "incorrect_loss_per_token": 2.344101123782433, "correct_loss_per_char": 0.46450144197903787, "incorrect_loss_per_char": 0.46480584545610665, "acc_uncond": 0.27835723598435463, "correct_loss_uncond": -26.366236305765256, "incorrect_loss_uncond": -25.505998810469052, "primary_score": 0.28096479791395046}, "task_idx": 105} |
|
{"task_name": "mmlu_professional_medicine", "task_hash": "e76678f3aea053cba7bbb3fe152ff642", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_medicine", "task_core": "mmlu_professional_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_medicine:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 65.0459406375885, "current_date": "2025-01-28 23:13:05 UTC", "num_instances": 272, "beaker_info": {}, "metrics": {"acc_raw": 0.3014705882352941, "acc_per_token": 0.3088235294117647, "acc_per_char": 0.3125, "correct_loss_raw": 14.52855256068356, "incorrect_loss_raw": 15.382910631450951, "correct_loss_per_token": 2.5904497607872914, "incorrect_loss_per_token": 2.7594740984440853, "correct_loss_per_char": 0.5286169810304566, "incorrect_loss_per_char": 0.5743104560037122, "acc_uncond": 0.35294117647058826, "correct_loss_uncond": -11.053912863573608, "incorrect_loss_uncond": -10.347978409598849, "primary_score": 0.3125}, "task_idx": 106} |
|
{"task_name": "mmlu_professional_psychology", "task_hash": "1f11cdabb27186bb3d09781f9a2bce87", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_psychology", "task_core": "mmlu_professional_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_psychology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 52.2474410533905, "current_date": "2025-01-28 23:14:10 UTC", "num_instances": 612, "beaker_info": {}, "metrics": {"acc_raw": 0.29411764705882354, "acc_per_token": 0.31862745098039214, "acc_per_char": 0.30392156862745096, "correct_loss_raw": 25.987278289216405, "incorrect_loss_raw": 26.76041926300733, "correct_loss_per_token": 3.3580263735069833, "incorrect_loss_per_token": 3.6368981728185092, "correct_loss_per_char": 0.6099669233283038, "incorrect_loss_per_char": 0.6562401783781975, "acc_uncond": 0.315359477124183, "correct_loss_uncond": -15.71813710457554, "incorrect_loss_uncond": -14.971442831659884, "primary_score": 0.30392156862745096}, "task_idx": 107} |
|
{"task_name": "mmlu_public_relations", "task_hash": "f4f7d9efa5b14b632f1bb8cf53a780d0", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_public_relations", "task_core": "mmlu_public_relations", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "public_relations", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_public_relations:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.905596017837524, "current_date": "2025-01-28 23:15:02 UTC", "num_instances": 110, "beaker_info": {}, "metrics": {"acc_raw": 0.45454545454545453, "acc_per_token": 0.36363636363636365, "acc_per_char": 0.3090909090909091, "correct_loss_raw": 14.288992928916757, "incorrect_loss_raw": 16.822460697275222, "correct_loss_per_token": 4.1535559337992884, "incorrect_loss_per_token": 4.676294730493403, "correct_loss_per_char": 0.7359015006541597, "incorrect_loss_per_char": 0.7919844297755388, "acc_uncond": 0.32727272727272727, "correct_loss_uncond": -9.454623821106823, "incorrect_loss_uncond": -8.436705364241742, "primary_score": 0.3090909090909091}, "task_idx": 108} |
|
{"task_name": "mmlu_security_studies", "task_hash": "ae4ffe7cce87e733dc815d013b44ec75", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_security_studies", "task_core": "mmlu_security_studies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "security_studies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_security_studies:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 27.74823260307312, "current_date": "2025-01-28 23:15:08 UTC", "num_instances": 245, "beaker_info": {}, "metrics": {"acc_raw": 0.31020408163265306, "acc_per_token": 0.30612244897959184, "acc_per_char": 0.2571428571428571, "correct_loss_raw": 90.24697107392915, "incorrect_loss_raw": 99.85066714189485, "correct_loss_per_token": 3.2406289145919915, "incorrect_loss_per_token": 3.150665943844446, "correct_loss_per_char": 0.6222363331286122, "incorrect_loss_per_char": 0.5721624972013643, "acc_uncond": 0.2693877551020408, "correct_loss_uncond": -16.94638129837659, "incorrect_loss_uncond": -19.379616625130595, "primary_score": 0.2571428571428571}, "task_idx": 109} |
|
{"task_name": "mmlu_sociology", "task_hash": "66633d3e396945e27b4489e2e582b958", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_sociology", "task_core": "mmlu_sociology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "sociology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_sociology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.797818183898926, "current_date": "2025-01-28 23:15:36 UTC", "num_instances": 201, "beaker_info": {}, "metrics": {"acc_raw": 0.31343283582089554, "acc_per_token": 0.31840796019900497, "acc_per_char": 0.25870646766169153, "correct_loss_raw": 31.317165066353716, "incorrect_loss_raw": 31.61777329365807, "correct_loss_per_token": 3.4413731654544994, "incorrect_loss_per_token": 3.6675033583865777, "correct_loss_per_char": 0.5888368911897749, "incorrect_loss_per_char": 0.6072444322793061, "acc_uncond": 0.43283582089552236, "correct_loss_uncond": -14.812071022109606, "incorrect_loss_uncond": -13.896083878838208, "primary_score": 0.25870646766169153}, "task_idx": 110} |
|
{"task_name": "mmlu_us_foreign_policy", "task_hash": "bd1ffb65bcdfb1582c6b60bcdbd3d533", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_us_foreign_policy", "task_core": "mmlu_us_foreign_policy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "us_foreign_policy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_us_foreign_policy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.146430492401123, "current_date": "2025-01-28 23:15:48 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.28, "acc_per_token": 0.35, "acc_per_char": 0.34, "correct_loss_raw": 22.961134161949158, "incorrect_loss_raw": 21.269579972426094, "correct_loss_per_token": 2.646917520621951, "incorrect_loss_per_token": 2.967826551998018, "correct_loss_per_char": 0.5221212054711434, "incorrect_loss_per_char": 0.5597690716018958, "acc_uncond": 0.4, "correct_loss_uncond": -13.380189175605773, "incorrect_loss_uncond": -12.273421669801078, "primary_score": 0.34}, "task_idx": 111} |
|
{"task_name": "mmlu_virology", "task_hash": "ea10babc381c242bef7bc631f8d422d2", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_virology", "task_core": "mmlu_virology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "virology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_virology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.738795518875122, "current_date": "2025-01-28 23:15:54 UTC", "num_instances": 166, "beaker_info": {}, "metrics": {"acc_raw": 0.24096385542168675, "acc_per_token": 0.3674698795180723, "acc_per_char": 0.3313253012048193, "correct_loss_raw": 18.831141976706952, "incorrect_loss_raw": 18.900941733375614, "correct_loss_per_token": 3.2626452048840506, "incorrect_loss_per_token": 3.6867306698044504, "correct_loss_per_char": 0.6558878277864738, "incorrect_loss_per_char": 0.7173650237459901, "acc_uncond": 0.27710843373493976, "correct_loss_uncond": -9.973504549767597, "incorrect_loss_uncond": -9.532383361973443, "primary_score": 0.3313253012048193}, "task_idx": 112} |
|
{"task_name": "mmlu_world_religions", "task_hash": "7b18e63e9c2a47f065dce28de478a8c0", "model_hash": "ed4f59953ec0e27753b699d099c8e94b", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_world_religions", "task_core": "mmlu_world_religions", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "world_religions", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_world_religions:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_topic-v3.8-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.240434646606445, "current_date": "2025-01-28 23:16:03 UTC", "num_instances": 171, "beaker_info": {}, "metrics": {"acc_raw": 0.38596491228070173, "acc_per_token": 0.42105263157894735, "acc_per_char": 0.4093567251461988, "correct_loss_raw": 9.84376669627184, "incorrect_loss_raw": 10.777373207359055, "correct_loss_per_token": 2.930561060974347, "incorrect_loss_per_token": 3.6832057014687414, "correct_loss_per_char": 0.7892170156977061, "incorrect_loss_per_char": 0.9457867980879529, "acc_uncond": 0.4678362573099415, "correct_loss_uncond": -9.483134711694996, "incorrect_loss_uncond": -7.820731720380616, "primary_score": 0.4093567251461988}, "task_idx": 113} |
|
|