|
{"task_name": "mmlu:mc::olmes", "task_hash": "f0f05cd4953d75d76242750a66e32adb", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu:mc::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 57, "description": "Aggregate metric", "alias": "mmlu:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 757.9112219810486, "current_date": "2025-01-28 04:54:41 UTC", "num_instances": 14042, "beaker_info": {}, "metrics": {"acc_per_token_micro": 0.25630252100840334, "acc_per_token_macro": 0.25847334955949963, "correct_loss_per_char_micro": 0.719286721077184, "correct_loss_per_char_macro": 0.7165804649125801, "incorrect_loss_raw_micro": 1.4442702353889205, "incorrect_loss_raw_macro": 1.4442158066303494, "primary_score_micro": 0.25630252100840334, "primary_score_macro": 0.25847334955949963, "acc_raw_micro": 0.25630252100840334, "acc_raw_macro": 0.25847334955949963, "acc_per_char_micro": 0.25630252100840334, "acc_per_char_macro": 0.25847334955949963, "incorrect_loss_per_token_micro": 1.4442702353889205, "incorrect_loss_per_token_macro": 1.4442158066303494, "incorrect_loss_per_char_micro": 0.7221351176944603, "incorrect_loss_per_char_macro": 0.7221079033151747, "correct_loss_raw_micro": 1.438573442154368, "correct_loss_raw_macro": 1.4331609298251602, "correct_loss_per_token_micro": 1.438573442154368, "correct_loss_per_token_macro": 1.4331609298251602, "primary_score": 0.25847334955949963}, "task_idx": null} |
|
{"task_name": "mmlu:rc::olmes", "task_hash": "d3fcbcac54951cec9ca2867583e71aa6", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu:rc::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 57, "description": "Aggregate metric", "alias": "mmlu:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1714.5079617500305, "current_date": "2025-01-28 05:07:20 UTC", "num_instances": 14042, "beaker_info": {}, "metrics": {"acc_per_token_micro": 0.32417034610454354, "acc_per_token_macro": 0.32387364851360145, "correct_loss_per_char_micro": 0.66049537839636, "correct_loss_per_char_macro": 0.7377383445671886, "acc_uncond_micro": 0.33072211935621704, "acc_uncond_macro": 0.3297870333905069, "incorrect_loss_raw_micro": 22.78805232234096, "incorrect_loss_raw_macro": 21.693349636065182, "correct_loss_uncond_micro": -14.519220747604113, "correct_loss_uncond_macro": -12.963740082318353, "incorrect_loss_uncond_micro": -13.695242474049193, "incorrect_loss_uncond_macro": -12.196338377549107, "acc_raw_micro": 0.30608175473579263, "acc_raw_macro": 0.3064532060689066, "primary_score_micro": 0.32046716991881496, "primary_score_macro": 0.31987496703152013, "acc_per_char_micro": 0.32046716991881496, "acc_per_char_macro": 0.31987496703152013, "incorrect_loss_per_token_micro": 3.0752847658655846, "incorrect_loss_per_token_macro": 3.2272402317055615, "incorrect_loss_per_char_micro": 0.7063630108529165, "incorrect_loss_per_char_macro": 0.7808164480587589, "correct_loss_raw_micro": 22.75624917182934, "correct_loss_raw_macro": 21.768539919673405, "correct_loss_per_token_micro": 2.8146570144229934, "correct_loss_per_token_macro": 2.966446094460615, "primary_score": 0.31987496703152013}, "task_idx": null} |
|
{"task_name": "mmlu::olmes", "task_hash": "f5ac6da68d1e2b6ae02dda443aa04648", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "mmlu::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2472.419183731079, "current_date": "2025-01-28 04:54:41 UTC", "num_instances": 28084, "beaker_info": {}, "metrics": {"primary_score": 0.31987496703152013}, "task_idx": null} |
|
{"task_name": "mmlu_abstract_algebra:mc", "task_hash": "bdde3fee40ebc8ddc5786c67975c5b31", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_abstract_algebra:mc", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_abstract_algebra:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.7630550861358643, "current_date": "2025-01-28 04:54:41 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.22, "acc_per_token": 0.22, "acc_per_char": 0.22, "correct_loss_raw": 1.623652411699295, "incorrect_loss_raw": 1.6604957938194271, "correct_loss_per_token": 1.623652411699295, "incorrect_loss_per_token": 1.6604957938194271, "correct_loss_per_char": 0.8118262058496475, "incorrect_loss_per_char": 0.8302478969097136, "primary_score": 0.22}, "task_idx": 0} |
|
{"task_name": "mmlu_anatomy:mc", "task_hash": "ba9ed92a6ef8f2c40aa5551bfc77b5e7", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_anatomy:mc", "task_core": "mmlu_anatomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "anatomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_anatomy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.4921655654907227, "current_date": "2025-01-28 04:54:44 UTC", "num_instances": 135, "beaker_info": {}, "metrics": {"acc_raw": 0.26666666666666666, "acc_per_token": 0.26666666666666666, "acc_per_char": 0.26666666666666666, "correct_loss_raw": 1.4088038197270145, "incorrect_loss_raw": 1.438953545947134, "correct_loss_per_token": 1.4088038197270145, "incorrect_loss_per_token": 1.438953545947134, "correct_loss_per_char": 0.7044019098635073, "incorrect_loss_per_char": 0.719476772973567, "primary_score": 0.26666666666666666}, "task_idx": 1} |
|
{"task_name": "mmlu_astronomy:mc", "task_hash": "e7ca8a8921c02622e23c99b7d90379f7", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_astronomy:mc", "task_core": "mmlu_astronomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "astronomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_astronomy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.01880145072937, "current_date": "2025-01-28 04:54:48 UTC", "num_instances": 152, "beaker_info": {}, "metrics": {"acc_raw": 0.17763157894736842, "acc_per_token": 0.17763157894736842, "acc_per_char": 0.17763157894736842, "correct_loss_raw": 1.524578217220934, "incorrect_loss_raw": 1.4304847093789204, "correct_loss_per_token": 1.524578217220934, "incorrect_loss_per_token": 1.4304847093789204, "correct_loss_per_char": 0.762289108610467, "incorrect_loss_per_char": 0.7152423546894602, "primary_score": 0.17763157894736842}, "task_idx": 2} |
|
{"task_name": "mmlu_business_ethics:mc", "task_hash": "7de417726ca2cc155dd1475a38afc381", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_business_ethics:mc", "task_core": "mmlu_business_ethics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "business_ethics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_business_ethics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.072767496109009, "current_date": "2025-01-28 04:54:55 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.28, "acc_per_token": 0.28, "acc_per_char": 0.28, "correct_loss_raw": 1.4333354794979096, "incorrect_loss_raw": 1.430518087347348, "correct_loss_per_token": 1.4333354794979096, "incorrect_loss_per_token": 1.430518087347348, "correct_loss_per_char": 0.7166677397489548, "incorrect_loss_per_char": 0.715259043673674, "primary_score": 0.28}, "task_idx": 3} |
|
{"task_name": "mmlu_clinical_knowledge:mc", "task_hash": "221ee08c4359ce7072b8d66f1c37f500", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_clinical_knowledge:mc", "task_core": "mmlu_clinical_knowledge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "clinical_knowledge", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_clinical_knowledge:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.193182229995728, "current_date": "2025-01-28 04:55:00 UTC", "num_instances": 265, "beaker_info": {}, "metrics": {"acc_raw": 0.2830188679245283, "acc_per_token": 0.2830188679245283, "acc_per_char": 0.2830188679245283, "correct_loss_raw": 1.3999048642392429, "incorrect_loss_raw": 1.4092254412249208, "correct_loss_per_token": 1.3999048642392429, "incorrect_loss_per_token": 1.4092254412249208, "correct_loss_per_char": 0.6999524321196214, "incorrect_loss_per_char": 0.7046127206124604, "primary_score": 0.2830188679245283}, "task_idx": 4} |
|
{"task_name": "mmlu_college_biology:mc", "task_hash": "aaf0bf4441359de8ffba70cefb786807", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_biology:mc", "task_core": "mmlu_college_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_biology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.898008823394775, "current_date": "2025-01-28 04:55:09 UTC", "num_instances": 144, "beaker_info": {}, "metrics": {"acc_raw": 0.24305555555555555, "acc_per_token": 0.24305555555555555, "acc_per_char": 0.24305555555555555, "correct_loss_raw": 1.4207617516318958, "incorrect_loss_raw": 1.4089779379191232, "correct_loss_per_token": 1.4207617516318958, "incorrect_loss_per_token": 1.4089779379191232, "correct_loss_per_char": 0.7103808758159479, "incorrect_loss_per_char": 0.7044889689595616, "primary_score": 0.24305555555555555}, "task_idx": 5} |
|
{"task_name": "mmlu_college_chemistry:mc", "task_hash": "1980c88e607a6dea06d45f27c60e3365", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_chemistry:mc", "task_core": "mmlu_college_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_chemistry:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.464242458343506, "current_date": "2025-01-28 04:55:14 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.17, "acc_per_token": 0.17, "acc_per_char": 0.17, "correct_loss_raw": 1.4544871437549591, "incorrect_loss_raw": 1.4141180368264514, "correct_loss_per_token": 1.4544871437549591, "incorrect_loss_per_token": 1.4141180368264514, "correct_loss_per_char": 0.7272435718774796, "incorrect_loss_per_char": 0.7070590184132257, "primary_score": 0.17}, "task_idx": 6} |
|
{"task_name": "mmlu_college_computer_science:mc", "task_hash": "9d5570c603bbcb33a0727904a22ef997", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_computer_science:mc", "task_core": "mmlu_college_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_computer_science:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.658799886703491, "current_date": "2025-01-28 04:55:19 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.27, "acc_per_token": 0.27, "acc_per_char": 0.27, "correct_loss_raw": 1.4542404013872146, "incorrect_loss_raw": 1.4667112388213477, "correct_loss_per_token": 1.4542404013872146, "incorrect_loss_per_token": 1.4667112388213477, "correct_loss_per_char": 0.7271202006936073, "incorrect_loss_per_char": 0.7333556194106738, "primary_score": 0.27}, "task_idx": 7} |
|
{"task_name": "mmlu_college_mathematics:mc", "task_hash": "264fbafdeceacfd7588ca20ca3546113", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_mathematics:mc", "task_core": "mmlu_college_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_mathematics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.60849404335022, "current_date": "2025-01-28 04:55:25 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.33, "acc_per_token": 0.33, "acc_per_char": 0.33, "correct_loss_raw": 1.4936019653081893, "incorrect_loss_raw": 1.5547984532515216, "correct_loss_per_token": 1.4936019653081893, "incorrect_loss_per_token": 1.5547984532515216, "correct_loss_per_char": 0.7468009826540947, "incorrect_loss_per_char": 0.7773992266257608, "primary_score": 0.33}, "task_idx": 8} |
|
{"task_name": "mmlu_college_medicine:mc", "task_hash": "9b3c95bd3bbac8771701a5abc3ab28ba", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_medicine:mc", "task_core": "mmlu_college_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_medicine:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.327268600463867, "current_date": "2025-01-28 04:55:30 UTC", "num_instances": 173, "beaker_info": {}, "metrics": {"acc_raw": 0.1907514450867052, "acc_per_token": 0.1907514450867052, "acc_per_char": 0.1907514450867052, "correct_loss_raw": 1.4331099246278665, "incorrect_loss_raw": 1.4175534970728192, "correct_loss_per_token": 1.4331099246278665, "incorrect_loss_per_token": 1.4175534970728192, "correct_loss_per_char": 0.7165549623139332, "incorrect_loss_per_char": 0.7087767485364096, "primary_score": 0.1907514450867052}, "task_idx": 9} |
|
{"task_name": "mmlu_college_physics:mc", "task_hash": "2c97b2d8aac8dff8cd2656474c1dfb86", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_physics:mc", "task_core": "mmlu_college_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_physics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.522810697555542, "current_date": "2025-01-28 04:55:37 UTC", "num_instances": 102, "beaker_info": {}, "metrics": {"acc_raw": 0.27450980392156865, "acc_per_token": 0.27450980392156865, "acc_per_char": 0.27450980392156865, "correct_loss_raw": 1.4211181343770494, "incorrect_loss_raw": 1.4180532643218446, "correct_loss_per_token": 1.4211181343770494, "incorrect_loss_per_token": 1.4180532643218446, "correct_loss_per_char": 0.7105590671885247, "incorrect_loss_per_char": 0.7090266321609223, "primary_score": 0.27450980392156865}, "task_idx": 10} |
|
{"task_name": "mmlu_computer_security:mc", "task_hash": "6d7c3f721bf97797f0e660d896f4585b", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_computer_security:mc", "task_core": "mmlu_computer_security", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "computer_security", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_computer_security:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.278942584991455, "current_date": "2025-01-28 04:55:40 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.23, "acc_per_token": 0.23, "acc_per_char": 0.23, "correct_loss_raw": 1.4321056604385376, "incorrect_loss_raw": 1.4166043557723367, "correct_loss_per_token": 1.4321056604385376, "incorrect_loss_per_token": 1.4166043557723367, "correct_loss_per_char": 0.7160528302192688, "incorrect_loss_per_char": 0.7083021778861683, "primary_score": 0.23}, "task_idx": 11} |
|
{"task_name": "mmlu_conceptual_physics:mc", "task_hash": "ffbb5f78c71ff87a70f5b59d313a380d", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_conceptual_physics:mc", "task_core": "mmlu_conceptual_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "conceptual_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_conceptual_physics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.843907833099365, "current_date": "2025-01-28 04:55:44 UTC", "num_instances": 235, "beaker_info": {}, "metrics": {"acc_raw": 0.2680851063829787, "acc_per_token": 0.2680851063829787, "acc_per_char": 0.2680851063829787, "correct_loss_raw": 1.4095682504329277, "incorrect_loss_raw": 1.40788460893834, "correct_loss_per_token": 1.4095682504329277, "incorrect_loss_per_token": 1.40788460893834, "correct_loss_per_char": 0.7047841252164638, "incorrect_loss_per_char": 0.70394230446917, "primary_score": 0.2680851063829787}, "task_idx": 12} |
|
{"task_name": "mmlu_econometrics:mc", "task_hash": "c69ca4807df1205e806299e8e20218af", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_econometrics:mc", "task_core": "mmlu_econometrics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "econometrics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_econometrics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.4785284996032715, "current_date": "2025-01-28 04:55:50 UTC", "num_instances": 114, "beaker_info": {}, "metrics": {"acc_raw": 0.2543859649122807, "acc_per_token": 0.2543859649122807, "acc_per_char": 0.2543859649122807, "correct_loss_raw": 1.499662618888052, "incorrect_loss_raw": 1.484006332026587, "correct_loss_per_token": 1.499662618888052, "incorrect_loss_per_token": 1.484006332026587, "correct_loss_per_char": 0.749831309444026, "incorrect_loss_per_char": 0.7420031660132935, "primary_score": 0.2543859649122807}, "task_idx": 13} |
|
{"task_name": "mmlu_electrical_engineering:mc", "task_hash": "c279f61638992683680ca9604e20fa4d", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_electrical_engineering:mc", "task_core": "mmlu_electrical_engineering", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "electrical_engineering", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_electrical_engineering:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.848874092102051, "current_date": "2025-01-28 04:55:55 UTC", "num_instances": 145, "beaker_info": {}, "metrics": {"acc_raw": 0.25517241379310346, "acc_per_token": 0.25517241379310346, "acc_per_char": 0.25517241379310346, "correct_loss_raw": 1.4315930695369325, "incorrect_loss_raw": 1.4528641719927733, "correct_loss_per_token": 1.4315930695369325, "incorrect_loss_per_token": 1.4528641719927733, "correct_loss_per_char": 0.7157965347684663, "incorrect_loss_per_char": 0.7264320859963866, "primary_score": 0.25517241379310346}, "task_idx": 14} |
|
{"task_name": "mmlu_elementary_mathematics:mc", "task_hash": "35b6f0933f711770d09fb00b45905c5c", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_elementary_mathematics:mc", "task_core": "mmlu_elementary_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "elementary_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_elementary_mathematics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.087385892868042, "current_date": "2025-01-28 04:56:00 UTC", "num_instances": 378, "beaker_info": {}, "metrics": {"acc_raw": 0.2566137566137566, "acc_per_token": 0.2566137566137566, "acc_per_char": 0.2566137566137566, "correct_loss_raw": 1.4478250868106015, "incorrect_loss_raw": 1.444192697446814, "correct_loss_per_token": 1.4478250868106015, "incorrect_loss_per_token": 1.444192697446814, "correct_loss_per_char": 0.7239125434053008, "incorrect_loss_per_char": 0.722096348723407, "primary_score": 0.2566137566137566}, "task_idx": 15} |
|
{"task_name": "mmlu_formal_logic:mc", "task_hash": "74d8e6a1f297e0274243d2bbb7df4d1b", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_formal_logic:mc", "task_core": "mmlu_formal_logic", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "formal_logic", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_formal_logic:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.395558595657349, "current_date": "2025-01-28 04:56:14 UTC", "num_instances": 126, "beaker_info": {}, "metrics": {"acc_raw": 0.19047619047619047, "acc_per_token": 0.19047619047619047, "acc_per_char": 0.19047619047619047, "correct_loss_raw": 1.590899599449975, "incorrect_loss_raw": 1.4669205514842243, "correct_loss_per_token": 1.590899599449975, "incorrect_loss_per_token": 1.4669205514842243, "correct_loss_per_char": 0.7954497997249875, "incorrect_loss_per_char": 0.7334602757421121, "primary_score": 0.19047619047619047}, "task_idx": 16} |
|
{"task_name": "mmlu_global_facts:mc", "task_hash": "4f14cfa253ea56a8d3b0d2c805ccdb28", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_global_facts:mc", "task_core": "mmlu_global_facts", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "global_facts", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_global_facts:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.2749645709991455, "current_date": "2025-01-28 04:56:20 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.29, "acc_per_token": 0.29, "acc_per_char": 0.29, "correct_loss_raw": 1.4361648845672608, "incorrect_loss_raw": 1.5098727031548813, "correct_loss_per_token": 1.4361648845672608, "incorrect_loss_per_token": 1.5098727031548813, "correct_loss_per_char": 0.7180824422836304, "incorrect_loss_per_char": 0.7549363515774407, "primary_score": 0.29}, "task_idx": 17} |
|
{"task_name": "mmlu_high_school_biology:mc", "task_hash": "055cfa37938a062655e6ce08f80c7765", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_biology:mc", "task_core": "mmlu_high_school_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_biology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.818602323532104, "current_date": "2025-01-28 04:56:24 UTC", "num_instances": 310, "beaker_info": {}, "metrics": {"acc_raw": 0.3064516129032258, "acc_per_token": 0.3064516129032258, "acc_per_char": 0.3064516129032258, "correct_loss_raw": 1.3894446688313637, "incorrect_loss_raw": 1.4320785844838753, "correct_loss_per_token": 1.3894446688313637, "incorrect_loss_per_token": 1.4320785844838753, "correct_loss_per_char": 0.6947223344156819, "incorrect_loss_per_char": 0.7160392922419376, "primary_score": 0.3064516129032258}, "task_idx": 18} |
|
{"task_name": "mmlu_high_school_chemistry:mc", "task_hash": "6cef5e5a35451e467b97a8cf773fb61c", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_chemistry:mc", "task_core": "mmlu_high_school_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_chemistry:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.313126564025879, "current_date": "2025-01-28 04:56:36 UTC", "num_instances": 203, "beaker_info": {}, "metrics": {"acc_raw": 0.2413793103448276, "acc_per_token": 0.2413793103448276, "acc_per_char": 0.2413793103448276, "correct_loss_raw": 1.4189456830471021, "incorrect_loss_raw": 1.421994413452587, "correct_loss_per_token": 1.4189456830471021, "incorrect_loss_per_token": 1.421994413452587, "correct_loss_per_char": 0.7094728415235511, "incorrect_loss_per_char": 0.7109972067262935, "primary_score": 0.2413793103448276}, "task_idx": 19} |
|
{"task_name": "mmlu_high_school_computer_science:mc", "task_hash": "31a39a79632638f209cd0a9c599f158d", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_computer_science:mc", "task_core": "mmlu_high_school_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_computer_science:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.094257593154907, "current_date": "2025-01-28 04:56:43 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.3, "acc_per_token": 0.3, "acc_per_char": 0.3, "correct_loss_raw": 1.3876521039009093, "incorrect_loss_raw": 1.4405102294683458, "correct_loss_per_token": 1.3876521039009093, "incorrect_loss_per_token": 1.4405102294683458, "correct_loss_per_char": 0.6938260519504547, "incorrect_loss_per_char": 0.7202551147341729, "primary_score": 0.3}, "task_idx": 20} |
|
{"task_name": "mmlu_high_school_european_history:mc", "task_hash": "e8f2a29738091af55efa8a7194452ac2", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_european_history:mc", "task_core": "mmlu_high_school_european_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_european_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_european_history:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 27.57564067840576, "current_date": "2025-01-28 04:56:50 UTC", "num_instances": 165, "beaker_info": {}, "metrics": {"acc_raw": 0.24242424242424243, "acc_per_token": 0.24242424242424243, "acc_per_char": 0.24242424242424243, "correct_loss_raw": 1.4085102702632095, "incorrect_loss_raw": 1.407174792193403, "correct_loss_per_token": 1.4085102702632095, "incorrect_loss_per_token": 1.407174792193403, "correct_loss_per_char": 0.7042551351316048, "incorrect_loss_per_char": 0.7035873960967015, "primary_score": 0.24242424242424243}, "task_idx": 21} |
|
{"task_name": "mmlu_high_school_geography:mc", "task_hash": "6a43a92b543ec77afeeda9d5011e0c36", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_geography:mc", "task_core": "mmlu_high_school_geography", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_geography", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_geography:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.280180215835571, "current_date": "2025-01-28 04:57:18 UTC", "num_instances": 198, "beaker_info": {}, "metrics": {"acc_raw": 0.20707070707070707, "acc_per_token": 0.20707070707070707, "acc_per_char": 0.20707070707070707, "correct_loss_raw": 1.4242555143857243, "incorrect_loss_raw": 1.4006931962388938, "correct_loss_per_token": 1.4242555143857243, "incorrect_loss_per_token": 1.4006931962388938, "correct_loss_per_char": 0.7121277571928621, "incorrect_loss_per_char": 0.7003465981194469, "primary_score": 0.20707070707070707}, "task_idx": 22} |
|
{"task_name": "mmlu_high_school_government_and_politics:mc", "task_hash": "65cdc0b1dc4018c2017fc6023e9bb862", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_government_and_politics:mc", "task_core": "mmlu_high_school_government_and_politics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_government_and_politics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_government_and_politics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.276082754135132, "current_date": "2025-01-28 04:57:24 UTC", "num_instances": 193, "beaker_info": {}, "metrics": {"acc_raw": 0.20725388601036268, "acc_per_token": 0.20725388601036268, "acc_per_char": 0.20725388601036268, "correct_loss_raw": 1.4184853703246834, "incorrect_loss_raw": 1.40174199406553, "correct_loss_per_token": 1.4184853703246834, "incorrect_loss_per_token": 1.40174199406553, "correct_loss_per_char": 0.7092426851623417, "incorrect_loss_per_char": 0.700870997032765, "primary_score": 0.20725388601036268}, "task_idx": 23} |
|
{"task_name": "mmlu_high_school_macroeconomics:mc", "task_hash": "177b3e0ec28ae90f76d191ba937fb328", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_macroeconomics:mc", "task_core": "mmlu_high_school_macroeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_macroeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_macroeconomics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.919329404830933, "current_date": "2025-01-28 04:57:31 UTC", "num_instances": 390, "beaker_info": {}, "metrics": {"acc_raw": 0.2692307692307692, "acc_per_token": 0.2692307692307692, "acc_per_char": 0.2692307692307692, "correct_loss_raw": 1.4050409514170425, "incorrect_loss_raw": 1.4443521368707346, "correct_loss_per_token": 1.4050409514170425, "incorrect_loss_per_token": 1.4443521368707346, "correct_loss_per_char": 0.7025204757085213, "incorrect_loss_per_char": 0.7221760684353673, "primary_score": 0.2692307692307692}, "task_idx": 24} |
|
{"task_name": "mmlu_high_school_mathematics:mc", "task_hash": "934371e2cf927fc449e77df454d85d2d", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_mathematics:mc", "task_core": "mmlu_high_school_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_mathematics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.00506043434143, "current_date": "2025-01-28 04:57:43 UTC", "num_instances": 270, "beaker_info": {}, "metrics": {"acc_raw": 0.2851851851851852, "acc_per_token": 0.2851851851851852, "acc_per_char": 0.2851851851851852, "correct_loss_raw": 1.4732028327606344, "incorrect_loss_raw": 1.5230170361789652, "correct_loss_per_token": 1.4732028327606344, "incorrect_loss_per_token": 1.5230170361789652, "correct_loss_per_char": 0.7366014163803172, "incorrect_loss_per_char": 0.7615085180894826, "primary_score": 0.2851851851851852}, "task_idx": 25} |
|
{"task_name": "mmlu_high_school_microeconomics:mc", "task_hash": "3738e45ad1235f9f0a4825ae099697cb", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_microeconomics:mc", "task_core": "mmlu_high_school_microeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_microeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_microeconomics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.591324090957642, "current_date": "2025-01-28 04:57:53 UTC", "num_instances": 238, "beaker_info": {}, "metrics": {"acc_raw": 0.25210084033613445, "acc_per_token": 0.25210084033613445, "acc_per_char": 0.25210084033613445, "correct_loss_raw": 1.4040252651487077, "incorrect_loss_raw": 1.4292737781500613, "correct_loss_per_token": 1.4040252651487077, "incorrect_loss_per_token": 1.4292737781500613, "correct_loss_per_char": 0.7020126325743539, "incorrect_loss_per_char": 0.7146368890750306, "primary_score": 0.25210084033613445}, "task_idx": 26} |
|
{"task_name": "mmlu_high_school_physics:mc", "task_hash": "583350c5b48fd28100732ad06943489f", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_physics:mc", "task_core": "mmlu_high_school_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_physics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.399226188659668, "current_date": "2025-01-28 04:58:01 UTC", "num_instances": 151, "beaker_info": {}, "metrics": {"acc_raw": 0.2847682119205298, "acc_per_token": 0.2847682119205298, "acc_per_char": 0.2847682119205298, "correct_loss_raw": 1.403415952297236, "incorrect_loss_raw": 1.4426044532268523, "correct_loss_per_token": 1.403415952297236, "incorrect_loss_per_token": 1.4426044532268523, "correct_loss_per_char": 0.701707976148618, "incorrect_loss_per_char": 0.7213022266134261, "primary_score": 0.2847682119205298}, "task_idx": 27} |
|
{"task_name": "mmlu_high_school_psychology:mc", "task_hash": "accf1559d013b1e7ac36647c1fe9dd67", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_psychology:mc", "task_core": "mmlu_high_school_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_psychology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 20.361661195755005, "current_date": "2025-01-28 04:58:07 UTC", "num_instances": 545, "beaker_info": {}, "metrics": {"acc_raw": 0.22201834862385322, "acc_per_token": 0.22201834862385322, "acc_per_char": 0.22201834862385322, "correct_loss_raw": 1.4348006235350164, "incorrect_loss_raw": 1.4091741864469804, "correct_loss_per_token": 1.4348006235350164, "incorrect_loss_per_token": 1.4091741864469804, "correct_loss_per_char": 0.7174003117675082, "incorrect_loss_per_char": 0.7045870932234902, "primary_score": 0.22201834862385322}, "task_idx": 28} |
|
{"task_name": "mmlu_high_school_statistics:mc", "task_hash": "7bd3b2133806936ee947ebd9c9890647", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_statistics:mc", "task_core": "mmlu_high_school_statistics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_statistics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_statistics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.818506956100464, "current_date": "2025-01-28 04:58:27 UTC", "num_instances": 216, "beaker_info": {}, "metrics": {"acc_raw": 0.4166666666666667, "acc_per_token": 0.4166666666666667, "acc_per_char": 0.4166666666666667, "correct_loss_raw": 1.3328079123187948, "incorrect_loss_raw": 1.5104334376476434, "correct_loss_per_token": 1.3328079123187948, "incorrect_loss_per_token": 1.5104334376476434, "correct_loss_per_char": 0.6664039561593974, "incorrect_loss_per_char": 0.7552167188238217, "primary_score": 0.4166666666666667}, "task_idx": 29} |
|
{"task_name": "mmlu_high_school_us_history:mc", "task_hash": "8097dc2c4728e3ef312c10bfcc9a0c47", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_us_history:mc", "task_core": "mmlu_high_school_us_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_us_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_us_history:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 32.69856572151184, "current_date": "2025-01-28 04:58:40 UTC", "num_instances": 204, "beaker_info": {}, "metrics": {"acc_raw": 0.2107843137254902, "acc_per_token": 0.2107843137254902, "acc_per_char": 0.2107843137254902, "correct_loss_raw": 1.410032211565504, "incorrect_loss_raw": 1.4099592198343855, "correct_loss_per_token": 1.410032211565504, "incorrect_loss_per_token": 1.4099592198343855, "correct_loss_per_char": 0.705016105782752, "incorrect_loss_per_char": 0.7049796099171928, "primary_score": 0.2107843137254902}, "task_idx": 30} |
|
{"task_name": "mmlu_high_school_world_history:mc", "task_hash": "4c9689dbb0e9effb2991bc98e1364c03", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_world_history:mc", "task_core": "mmlu_high_school_world_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_world_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_world_history:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 26.781399488449097, "current_date": "2025-01-28 04:59:13 UTC", "num_instances": 237, "beaker_info": {}, "metrics": {"acc_raw": 0.2489451476793249, "acc_per_token": 0.2489451476793249, "acc_per_char": 0.2489451476793249, "correct_loss_raw": 1.4103977292901857, "incorrect_loss_raw": 1.4006773582155367, "correct_loss_per_token": 1.4103977292901857, "incorrect_loss_per_token": 1.4006773582155367, "correct_loss_per_char": 0.7051988646450928, "incorrect_loss_per_char": 0.7003386791077684, "primary_score": 0.2489451476793249}, "task_idx": 31} |
|
{"task_name": "mmlu_human_aging:mc", "task_hash": "aed6dc4e5de4b465852e8add68f1e1c7", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_aging:mc", "task_core": "mmlu_human_aging", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_aging", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_aging:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.761896848678589, "current_date": "2025-01-28 04:59:40 UTC", "num_instances": 223, "beaker_info": {}, "metrics": {"acc_raw": 0.28699551569506726, "acc_per_token": 0.28699551569506726, "acc_per_char": 0.28699551569506726, "correct_loss_raw": 1.379385376190391, "incorrect_loss_raw": 1.4162388687176555, "correct_loss_per_token": 1.379385376190391, "incorrect_loss_per_token": 1.4162388687176555, "correct_loss_per_char": 0.6896926880951955, "incorrect_loss_per_char": 0.7081194343588277, "primary_score": 0.28699551569506726}, "task_idx": 32} |
|
{"task_name": "mmlu_human_sexuality:mc", "task_hash": "40c85ccce055746bdd1f28232f48f0fa", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_sexuality:mc", "task_core": "mmlu_human_sexuality", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_sexuality", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_sexuality:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.8122663497924805, "current_date": "2025-01-28 04:59:46 UTC", "num_instances": 131, "beaker_info": {}, "metrics": {"acc_raw": 0.25190839694656486, "acc_per_token": 0.25190839694656486, "acc_per_char": 0.25190839694656486, "correct_loss_raw": 1.4197146938047336, "incorrect_loss_raw": 1.4224386418442094, "correct_loss_per_token": 1.4197146938047336, "incorrect_loss_per_token": 1.4224386418442094, "correct_loss_per_char": 0.7098573469023668, "incorrect_loss_per_char": 0.7112193209221047, "primary_score": 0.25190839694656486}, "task_idx": 33} |
|
{"task_name": "mmlu_international_law:mc", "task_hash": "3cfc657dd55e3ad96d5c3e9cd17bc346", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_international_law:mc", "task_core": "mmlu_international_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "international_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_international_law:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.149319648742676, "current_date": "2025-01-28 04:59:49 UTC", "num_instances": 121, "beaker_info": {}, "metrics": {"acc_raw": 0.2396694214876033, "acc_per_token": 0.2396694214876033, "acc_per_char": 0.2396694214876033, "correct_loss_raw": 1.4277856079014866, "incorrect_loss_raw": 1.5203524959317252, "correct_loss_per_token": 1.4277856079014866, "incorrect_loss_per_token": 1.5203524959317252, "correct_loss_per_char": 0.7138928039507433, "incorrect_loss_per_char": 0.7601762479658626, "primary_score": 0.2396694214876033}, "task_idx": 34} |
|
{"task_name": "mmlu_jurisprudence:mc", "task_hash": "ca4ac71f0fd702b39c6245be2ab32061", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_jurisprudence:mc", "task_core": "mmlu_jurisprudence", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "jurisprudence", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_jurisprudence:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.8075058460235596, "current_date": "2025-01-28 04:59:56 UTC", "num_instances": 108, "beaker_info": {}, "metrics": {"acc_raw": 0.25925925925925924, "acc_per_token": 0.25925925925925924, "acc_per_char": 0.25925925925925924, "correct_loss_raw": 1.4182900571160846, "incorrect_loss_raw": 1.4486640287034305, "correct_loss_per_token": 1.4182900571160846, "incorrect_loss_per_token": 1.4486640287034305, "correct_loss_per_char": 0.7091450285580423, "incorrect_loss_per_char": 0.7243320143517152, "primary_score": 0.25925925925925924}, "task_idx": 35} |
|
{"task_name": "mmlu_logical_fallacies:mc", "task_hash": "a4b3c214c3cb1c10bfa4042dd0e9df92", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_logical_fallacies:mc", "task_core": "mmlu_logical_fallacies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "logical_fallacies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_logical_fallacies:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.935306787490845, "current_date": "2025-01-28 04:59:59 UTC", "num_instances": 163, "beaker_info": {}, "metrics": {"acc_raw": 0.22699386503067484, "acc_per_token": 0.22699386503067484, "acc_per_char": 0.22699386503067484, "correct_loss_raw": 1.4177253582726228, "incorrect_loss_raw": 1.4250889277896992, "correct_loss_per_token": 1.4177253582726228, "incorrect_loss_per_token": 1.4250889277896992, "correct_loss_per_char": 0.7088626791363114, "incorrect_loss_per_char": 0.7125444638948496, "primary_score": 0.22699386503067484}, "task_idx": 36} |
|
{"task_name": "mmlu_machine_learning:mc", "task_hash": "43ad1436fc44eed0bc66cc7239ecd94b", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_machine_learning:mc", "task_core": "mmlu_machine_learning", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "machine_learning", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_machine_learning:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.81073260307312, "current_date": "2025-01-28 05:00:05 UTC", "num_instances": 112, "beaker_info": {}, "metrics": {"acc_raw": 0.32142857142857145, "acc_per_token": 0.32142857142857145, "acc_per_char": 0.32142857142857145, "correct_loss_raw": 1.4316446940813745, "incorrect_loss_raw": 1.5338344322074025, "correct_loss_per_token": 1.4316446940813745, "incorrect_loss_per_token": 1.5338344322074025, "correct_loss_per_char": 0.7158223470406873, "incorrect_loss_per_char": 0.7669172161037012, "primary_score": 0.32142857142857145}, "task_idx": 37} |
|
{"task_name": "mmlu_management:mc", "task_hash": "f565b650124e104d5d59b40491bde8e7", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_management:mc", "task_core": "mmlu_management", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "management", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_management:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.718989372253418, "current_date": "2025-01-28 05:00:11 UTC", "num_instances": 103, "beaker_info": {}, "metrics": {"acc_raw": 0.17475728155339806, "acc_per_token": 0.17475728155339806, "acc_per_char": 0.17475728155339806, "correct_loss_raw": 1.483287247639258, "incorrect_loss_raw": 1.44468416577404, "correct_loss_per_token": 1.483287247639258, "incorrect_loss_per_token": 1.44468416577404, "correct_loss_per_char": 0.741643623819629, "incorrect_loss_per_char": 0.72234208288702, "primary_score": 0.17475728155339806}, "task_idx": 38} |
|
{"task_name": "mmlu_marketing:mc", "task_hash": "63c7c7a1863fe3aaf961947124cbd4c3", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_marketing:mc", "task_core": "mmlu_marketing", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "marketing", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_marketing:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.000325202941895, "current_date": "2025-01-28 05:00:14 UTC", "num_instances": 234, "beaker_info": {}, "metrics": {"acc_raw": 0.2692307692307692, "acc_per_token": 0.2692307692307692, "acc_per_char": 0.2692307692307692, "correct_loss_raw": 1.4135292743006322, "incorrect_loss_raw": 1.4098624550549061, "correct_loss_per_token": 1.4135292743006322, "incorrect_loss_per_token": 1.4098624550549061, "correct_loss_per_char": 0.7067646371503161, "incorrect_loss_per_char": 0.7049312275274531, "primary_score": 0.2692307692307692}, "task_idx": 39} |
|
{"task_name": "mmlu_medical_genetics:mc", "task_hash": "11f7f7576f9aeb3dae4cc770e7a06c98", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_medical_genetics:mc", "task_core": "mmlu_medical_genetics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "medical_genetics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_medical_genetics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.1817290782928467, "current_date": "2025-01-28 05:00:22 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.32, "acc_per_token": 0.32, "acc_per_char": 0.32, "correct_loss_raw": 1.3897346860170365, "incorrect_loss_raw": 1.4244294041395187, "correct_loss_per_token": 1.3897346860170365, "incorrect_loss_per_token": 1.4244294041395187, "correct_loss_per_char": 0.6948673430085183, "incorrect_loss_per_char": 0.7122147020697593, "primary_score": 0.32}, "task_idx": 40} |
|
{"task_name": "mmlu_miscellaneous:mc", "task_hash": "d9c892ba8631049d773d6fa3dc5dca82", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_miscellaneous:mc", "task_core": "mmlu_miscellaneous", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "miscellaneous", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_miscellaneous:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 18.97473907470703, "current_date": "2025-01-28 05:00:25 UTC", "num_instances": 783, "beaker_info": {}, "metrics": {"acc_raw": 0.2567049808429119, "acc_per_token": 0.2567049808429119, "acc_per_char": 0.2567049808429119, "correct_loss_raw": 1.412226355623925, "incorrect_loss_raw": 1.414735851388324, "correct_loss_per_token": 1.412226355623925, "incorrect_loss_per_token": 1.414735851388324, "correct_loss_per_char": 0.7061131778119625, "incorrect_loss_per_char": 0.707367925694162, "primary_score": 0.2567049808429119}, "task_idx": 41} |
|
{"task_name": "mmlu_moral_disputes:mc", "task_hash": "d05901af9b9e012ab9e4ce8bb28c2bb8", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_disputes:mc", "task_core": "mmlu_moral_disputes", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_disputes", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_disputes:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.008602142333984, "current_date": "2025-01-28 05:00:44 UTC", "num_instances": 346, "beaker_info": {}, "metrics": {"acc_raw": 0.2254335260115607, "acc_per_token": 0.2254335260115607, "acc_per_char": 0.2254335260115607, "correct_loss_raw": 1.4499102469124545, "incorrect_loss_raw": 1.4418547151175993, "correct_loss_per_token": 1.4499102469124545, "incorrect_loss_per_token": 1.4418547151175993, "correct_loss_per_char": 0.7249551234562273, "incorrect_loss_per_char": 0.7209273575587997, "primary_score": 0.2254335260115607}, "task_idx": 42} |
|
{"task_name": "mmlu_moral_scenarios:mc", "task_hash": "33949ee763bf0ed37a82aa7796d56cd6", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_scenarios:mc", "task_core": "mmlu_moral_scenarios", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_scenarios", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_scenarios:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 43.686490058898926, "current_date": "2025-01-28 05:00:57 UTC", "num_instances": 895, "beaker_info": {}, "metrics": {"acc_raw": 0.2424581005586592, "acc_per_token": 0.2424581005586592, "acc_per_char": 0.2424581005586592, "correct_loss_raw": 1.6093816421551412, "incorrect_loss_raw": 1.5733506117231144, "correct_loss_per_token": 1.6093816421551412, "incorrect_loss_per_token": 1.5733506117231144, "correct_loss_per_char": 0.8046908210775706, "incorrect_loss_per_char": 0.7866753058615572, "primary_score": 0.2424581005586592}, "task_idx": 43} |
|
{"task_name": "mmlu_nutrition:mc", "task_hash": "e68f4b08d1adc45a7ab0ea385d987849", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_nutrition:mc", "task_core": "mmlu_nutrition", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "nutrition", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_nutrition:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.127213716506958, "current_date": "2025-01-28 05:01:41 UTC", "num_instances": 306, "beaker_info": {}, "metrics": {"acc_raw": 0.23202614379084968, "acc_per_token": 0.23202614379084968, "acc_per_char": 0.23202614379084968, "correct_loss_raw": 1.4342387077075984, "incorrect_loss_raw": 1.4227450555575951, "correct_loss_per_token": 1.4342387077075984, "incorrect_loss_per_token": 1.4227450555575951, "correct_loss_per_char": 0.7171193538537992, "incorrect_loss_per_char": 0.7113725277787976, "primary_score": 0.23202614379084968}, "task_idx": 44} |
|
{"task_name": "mmlu_philosophy:mc", "task_hash": "dd14a2446c6e46449cd5b14ee7982b73", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_philosophy:mc", "task_core": "mmlu_philosophy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "philosophy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_philosophy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.340004682540894, "current_date": "2025-01-28 05:01:54 UTC", "num_instances": 311, "beaker_info": {}, "metrics": {"acc_raw": 0.2379421221864952, "acc_per_token": 0.2379421221864952, "acc_per_char": 0.2379421221864952, "correct_loss_raw": 1.4000776921821176, "incorrect_loss_raw": 1.4055763931519742, "correct_loss_per_token": 1.4000776921821176, "incorrect_loss_per_token": 1.4055763931519742, "correct_loss_per_char": 0.7000388460910588, "incorrect_loss_per_char": 0.7027881965759871, "primary_score": 0.2379421221864952}, "task_idx": 45} |
|
{"task_name": "mmlu_prehistory:mc", "task_hash": "d65b3e5cf8049b1c1442537b281f5a72", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_prehistory:mc", "task_core": "mmlu_prehistory", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "prehistory", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_prehistory:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.22838568687439, "current_date": "2025-01-28 05:02:03 UTC", "num_instances": 324, "beaker_info": {}, "metrics": {"acc_raw": 0.2623456790123457, "acc_per_token": 0.2623456790123457, "acc_per_char": 0.2623456790123457, "correct_loss_raw": 1.4085913786181696, "incorrect_loss_raw": 1.4081900398795002, "correct_loss_per_token": 1.4085913786181696, "incorrect_loss_per_token": 1.4081900398795002, "correct_loss_per_char": 0.7042956893090848, "incorrect_loss_per_char": 0.7040950199397501, "primary_score": 0.2623456790123457}, "task_idx": 46} |
|
{"task_name": "mmlu_professional_accounting:mc", "task_hash": "2d9464b5e5a5ee20a777a37004dd3a2d", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_accounting:mc", "task_core": "mmlu_professional_accounting", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_accounting", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_accounting:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.947415113449097, "current_date": "2025-01-28 05:02:16 UTC", "num_instances": 282, "beaker_info": {}, "metrics": {"acc_raw": 0.2624113475177305, "acc_per_token": 0.2624113475177305, "acc_per_char": 0.2624113475177305, "correct_loss_raw": 1.3991403571257355, "incorrect_loss_raw": 1.4244927636415954, "correct_loss_per_token": 1.3991403571257355, "incorrect_loss_per_token": 1.4244927636415954, "correct_loss_per_char": 0.6995701785628677, "incorrect_loss_per_char": 0.7122463818207977, "primary_score": 0.2624113475177305}, "task_idx": 47} |
|
{"task_name": "mmlu_professional_law:mc", "task_hash": "c4dd4f89898c6498217d79776e68bb06", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_law:mc", "task_core": "mmlu_professional_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_law:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 193.0377974510193, "current_date": "2025-01-28 05:02:30 UTC", "num_instances": 1534, "beaker_info": {}, "metrics": {"acc_raw": 0.24771838331160365, "acc_per_token": 0.24771838331160365, "acc_per_char": 0.24771838331160365, "correct_loss_raw": 1.452149992971408, "incorrect_loss_raw": 1.4447676602496855, "correct_loss_per_token": 1.452149992971408, "incorrect_loss_per_token": 1.4447676602496855, "correct_loss_per_char": 0.726074996485704, "incorrect_loss_per_char": 0.7223838301248428, "primary_score": 0.24771838331160365}, "task_idx": 48} |
|
{"task_name": "mmlu_professional_medicine:mc", "task_hash": "8b8aa33e03e2f1b4abff4cbb3dd56cd7", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_medicine:mc", "task_core": "mmlu_professional_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_medicine:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 22.127638578414917, "current_date": "2025-01-28 05:05:43 UTC", "num_instances": 272, "beaker_info": {}, "metrics": {"acc_raw": 0.43014705882352944, "acc_per_token": 0.43014705882352944, "acc_per_char": 0.43014705882352944, "correct_loss_raw": 1.3611369428827482, "incorrect_loss_raw": 1.4642259400410982, "correct_loss_per_token": 1.3611369428827482, "incorrect_loss_per_token": 1.4642259400410982, "correct_loss_per_char": 0.6805684714413741, "incorrect_loss_per_char": 0.7321129700205491, "primary_score": 0.43014705882352944}, "task_idx": 49} |
|
{"task_name": "mmlu_professional_psychology:mc", "task_hash": "3094d326fde18b55836110e1d0f8f241", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_psychology:mc", "task_core": "mmlu_professional_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_psychology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 27.038251876831055, "current_date": "2025-01-28 05:06:05 UTC", "num_instances": 612, "beaker_info": {}, "metrics": {"acc_raw": 0.24836601307189543, "acc_per_token": 0.24836601307189543, "acc_per_char": 0.24836601307189543, "correct_loss_raw": 1.4247640344052532, "incorrect_loss_raw": 1.4300063770256277, "correct_loss_per_token": 1.4247640344052532, "incorrect_loss_per_token": 1.4300063770256277, "correct_loss_per_char": 0.7123820172026266, "incorrect_loss_per_char": 0.7150031885128139, "primary_score": 0.24836601307189543}, "task_idx": 50} |
|
{"task_name": "mmlu_public_relations:mc", "task_hash": "b10f684a09888253de5b2778544ace3d", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_public_relations:mc", "task_core": "mmlu_public_relations", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "public_relations", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_public_relations:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.934047222137451, "current_date": "2025-01-28 05:06:32 UTC", "num_instances": 110, "beaker_info": {}, "metrics": {"acc_raw": 0.22727272727272727, "acc_per_token": 0.22727272727272727, "acc_per_char": 0.22727272727272727, "correct_loss_raw": 1.4112952524965459, "incorrect_loss_raw": 1.4418597143707845, "correct_loss_per_token": 1.4112952524965459, "incorrect_loss_per_token": 1.4418597143707845, "correct_loss_per_char": 0.7056476262482729, "incorrect_loss_per_char": 0.7209298571853923, "primary_score": 0.22727272727272727}, "task_idx": 51} |
|
{"task_name": "mmlu_security_studies:mc", "task_hash": "1f8f03c4608bfc16b773b6789dff3612", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_security_studies:mc", "task_core": "mmlu_security_studies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "security_studies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_security_studies:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 23.293635368347168, "current_date": "2025-01-28 05:06:36 UTC", "num_instances": 245, "beaker_info": {}, "metrics": {"acc_raw": 0.17142857142857143, "acc_per_token": 0.17142857142857143, "acc_per_char": 0.17142857142857143, "correct_loss_raw": 1.4515007880269264, "incorrect_loss_raw": 1.4025412608977077, "correct_loss_per_token": 1.4515007880269264, "incorrect_loss_per_token": 1.4025412608977077, "correct_loss_per_char": 0.7257503940134632, "incorrect_loss_per_char": 0.7012706304488538, "primary_score": 0.17142857142857143}, "task_idx": 52} |
|
{"task_name": "mmlu_sociology:mc", "task_hash": "8febc5ac38c21f5a0811d42006faf2ea", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_sociology:mc", "task_core": "mmlu_sociology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "sociology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_sociology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.266791820526123, "current_date": "2025-01-28 05:06:59 UTC", "num_instances": 201, "beaker_info": {}, "metrics": {"acc_raw": 0.263681592039801, "acc_per_token": 0.263681592039801, "acc_per_char": 0.263681592039801, "correct_loss_raw": 1.4293666057918795, "incorrect_loss_raw": 1.416080294260338, "correct_loss_per_token": 1.4293666057918795, "incorrect_loss_per_token": 1.416080294260338, "correct_loss_per_char": 0.7146833028959397, "incorrect_loss_per_char": 0.708040147130169, "primary_score": 0.263681592039801}, "task_idx": 53} |
|
{"task_name": "mmlu_us_foreign_policy:mc", "task_hash": "cceb9539ca6356676c1a014a74093ec9", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_us_foreign_policy:mc", "task_core": "mmlu_us_foreign_policy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "us_foreign_policy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_us_foreign_policy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.522918224334717, "current_date": "2025-01-28 05:07:07 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.33, "acc_per_token": 0.33, "acc_per_char": 0.33, "correct_loss_raw": 1.41061336517334, "incorrect_loss_raw": 1.428267351786296, "correct_loss_per_token": 1.41061336517334, "incorrect_loss_per_token": 1.428267351786296, "correct_loss_per_char": 0.70530668258667, "incorrect_loss_per_char": 0.714133675893148, "primary_score": 0.33}, "task_idx": 54} |
|
{"task_name": "mmlu_virology:mc", "task_hash": "1b216fb4e04c61029da5dfb32810fabc", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_virology:mc", "task_core": "mmlu_virology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "virology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_virology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.8751540184021, "current_date": "2025-01-28 05:07:11 UTC", "num_instances": 166, "beaker_info": {}, "metrics": {"acc_raw": 0.3253012048192771, "acc_per_token": 0.3253012048192771, "acc_per_char": 0.3253012048192771, "correct_loss_raw": 1.3938420285661537, "incorrect_loss_raw": 1.4157007449123276, "correct_loss_per_token": 1.3938420285661537, "incorrect_loss_per_token": 1.4157007449123276, "correct_loss_per_char": 0.6969210142830768, "incorrect_loss_per_char": 0.7078503724561638, "primary_score": 0.3253012048192771}, "task_idx": 55} |
|
{"task_name": "mmlu_world_religions:mc", "task_hash": "223d634e4c9d91a64ed77b7e259d7010", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_world_religions:mc", "task_core": "mmlu_world_religions", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "world_religions", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_world_religions:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.163343191146851, "current_date": "2025-01-28 05:07:16 UTC", "num_instances": 171, "beaker_info": {}, "metrics": {"acc_raw": 0.27485380116959063, "acc_per_token": 0.27485380116959063, "acc_per_char": 0.27485380116959063, "correct_loss_raw": 1.424410241389135, "incorrect_loss_raw": 1.4343925405431668, "correct_loss_per_token": 1.424410241389135, "incorrect_loss_per_token": 1.4343925405431668, "correct_loss_per_char": 0.7122051206945675, "incorrect_loss_per_char": 0.7171962702715834, "primary_score": 0.27485380116959063}, "task_idx": 56} |
|
{"task_name": "mmlu_abstract_algebra", "task_hash": "c85fa3ca2628093d327501718793d07b", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_abstract_algebra", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_abstract_algebra:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.7566418647766113, "current_date": "2025-01-28 05:07:20 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.13, "acc_per_token": 0.16, "acc_per_char": 0.15, "correct_loss_raw": 7.238449618816376, "incorrect_loss_raw": 5.4608514682451865, "correct_loss_per_token": 2.0800698444354238, "incorrect_loss_per_token": 2.207374636461063, "correct_loss_per_char": 0.8251452219643849, "incorrect_loss_per_char": 0.7998403575500812, "acc_uncond": 0.23, "correct_loss_uncond": -9.018387553691865, "incorrect_loss_uncond": -8.809169817765559, "primary_score": 0.15}, "task_idx": 57} |
|
{"task_name": "mmlu_anatomy", "task_hash": "3f9b02c965eba1bd23b0446d0e9deff4", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_anatomy", "task_core": "mmlu_anatomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "anatomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_anatomy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.403589963912964, "current_date": "2025-01-28 05:07:23 UTC", "num_instances": 135, "beaker_info": {}, "metrics": {"acc_raw": 0.31851851851851853, "acc_per_token": 0.3037037037037037, "acc_per_char": 0.34814814814814815, "correct_loss_raw": 18.6549613793691, "incorrect_loss_raw": 18.791444842903704, "correct_loss_per_token": 2.3724382964207016, "incorrect_loss_per_token": 2.667046855845751, "correct_loss_per_char": 0.5377537390503253, "incorrect_loss_per_char": 0.6059921850098559, "acc_uncond": 0.28888888888888886, "correct_loss_uncond": -14.285047370416146, "incorrect_loss_uncond": -14.143531273323816, "primary_score": 0.34814814814814815}, "task_idx": 58} |
|
{"task_name": "mmlu_astronomy", "task_hash": "d9e63c18cde7815546c5a54ffadb81f9", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_astronomy", "task_core": "mmlu_astronomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "astronomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_astronomy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.050167798995972, "current_date": "2025-01-28 05:07:30 UTC", "num_instances": 152, "beaker_info": {}, "metrics": {"acc_raw": 0.2565789473684211, "acc_per_token": 0.3355263157894737, "acc_per_char": 0.3355263157894737, "correct_loss_raw": 29.0879924595356, "incorrect_loss_raw": 27.18310218291324, "correct_loss_per_token": 2.785532537570271, "incorrect_loss_per_token": 3.035261307373645, "correct_loss_per_char": 0.6848314221289752, "incorrect_loss_per_char": 0.7338826834180289, "acc_uncond": 0.3618421052631579, "correct_loss_uncond": -13.803319995340548, "incorrect_loss_uncond": -13.695736216087097, "primary_score": 0.3355263157894737}, "task_idx": 59} |
|
{"task_name": "mmlu_business_ethics", "task_hash": "dbbf5c673a31d657513075cc70e4f670", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_business_ethics", "task_core": "mmlu_business_ethics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "business_ethics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_business_ethics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.87929916381836, "current_date": "2025-01-28 05:07:40 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.53, "acc_per_token": 0.45, "acc_per_char": 0.48, "correct_loss_raw": 23.24944877386093, "incorrect_loss_raw": 26.010213224093107, "correct_loss_per_token": 3.2996396047051695, "incorrect_loss_per_token": 3.605208941501484, "correct_loss_per_char": 0.930672972022235, "incorrect_loss_per_char": 0.9742519914137123, "acc_uncond": 0.38, "correct_loss_uncond": -11.453577845096587, "incorrect_loss_uncond": -10.477688962618508, "primary_score": 0.48}, "task_idx": 60} |
|
{"task_name": "mmlu_clinical_knowledge", "task_hash": "940022f2e7983e3f56cfc7196b310a7f", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_clinical_knowledge", "task_core": "mmlu_clinical_knowledge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "clinical_knowledge", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_clinical_knowledge:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.763206243515015, "current_date": "2025-01-28 05:07:49 UTC", "num_instances": 265, "beaker_info": {}, "metrics": {"acc_raw": 0.24528301886792453, "acc_per_token": 0.32075471698113206, "acc_per_char": 0.33962264150943394, "correct_loss_raw": 22.77783377890317, "incorrect_loss_raw": 20.88542130443286, "correct_loss_per_token": 2.662753846075874, "incorrect_loss_per_token": 2.8963668845587955, "correct_loss_per_char": 0.6333168810938389, "incorrect_loss_per_char": 0.7056293638882262, "acc_uncond": 0.3320754716981132, "correct_loss_uncond": -13.234188887308228, "incorrect_loss_uncond": -12.220108844799066, "primary_score": 0.33962264150943394}, "task_idx": 61} |
|
{"task_name": "mmlu_college_biology", "task_hash": "0b879b8081c2b7d376a6abd76697f553", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_biology", "task_core": "mmlu_college_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_biology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.042107105255127, "current_date": "2025-01-28 05:08:04 UTC", "num_instances": 144, "beaker_info": {}, "metrics": {"acc_raw": 0.3333333333333333, "acc_per_token": 0.2986111111111111, "acc_per_char": 0.3333333333333333, "correct_loss_raw": 21.360860213637352, "incorrect_loss_raw": 23.00310295102772, "correct_loss_per_token": 2.765583226471664, "incorrect_loss_per_token": 3.1703698783420986, "correct_loss_per_char": 0.5344246196547231, "incorrect_loss_per_char": 0.6231687688578911, "acc_uncond": 0.3125, "correct_loss_uncond": -15.441606284843552, "incorrect_loss_uncond": -14.069439778173411, "primary_score": 0.3333333333333333}, "task_idx": 62} |
|
{"task_name": "mmlu_college_chemistry", "task_hash": "0ed8a28c3b6ceca7f72f02bc9b87d236", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_chemistry", "task_core": "mmlu_college_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_chemistry:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.2039794921875, "current_date": "2025-01-28 05:08:15 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.24, "acc_per_token": 0.32, "acc_per_char": 0.24, "correct_loss_raw": 19.081055735349654, "incorrect_loss_raw": 19.166683287223172, "correct_loss_per_token": 3.0706036289880916, "incorrect_loss_per_token": 3.1726666465890925, "correct_loss_per_char": 1.2177745460881189, "incorrect_loss_per_char": 1.211041814783291, "acc_uncond": 0.23, "correct_loss_uncond": -12.639790011644363, "incorrect_loss_uncond": -12.118665522336956, "primary_score": 0.24}, "task_idx": 63} |
|
{"task_name": "mmlu_college_computer_science", "task_hash": "563c1a7e8c030ab92f3c9359a1196891", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_computer_science", "task_core": "mmlu_college_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_computer_science:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 17.094253301620483, "current_date": "2025-01-28 05:08:23 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.29, "acc_per_token": 0.23, "acc_per_char": 0.24, "correct_loss_raw": 19.577520344257355, "incorrect_loss_raw": 19.093906108538306, "correct_loss_per_token": 2.9963572845610407, "incorrect_loss_per_token": 3.2563074208865714, "correct_loss_per_char": 0.9630421158784124, "incorrect_loss_per_char": 0.9781260864412319, "acc_uncond": 0.29, "correct_loss_uncond": -11.22299966096878, "incorrect_loss_uncond": -11.177900560696921, "primary_score": 0.24}, "task_idx": 64} |
|
{"task_name": "mmlu_college_mathematics", "task_hash": "97a6ddef0d69128d9260dd1f8c82521c", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_mathematics", "task_core": "mmlu_college_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_mathematics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.05379319190979, "current_date": "2025-01-28 05:08:40 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.19, "acc_per_token": 0.21, "acc_per_char": 0.18, "correct_loss_raw": 12.604055041074753, "incorrect_loss_raw": 11.063768239418664, "correct_loss_per_token": 3.1818371390403577, "incorrect_loss_per_token": 3.11543055461903, "correct_loss_per_char": 1.2828475652594773, "incorrect_loss_per_char": 1.2393229212734804, "acc_uncond": 0.26, "correct_loss_uncond": -8.495612560510635, "incorrect_loss_uncond": -8.353109432458877, "primary_score": 0.18}, "task_idx": 65} |
|
{"task_name": "mmlu_college_medicine", "task_hash": "483a77ff3415e8b126e8e83fda055b39", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_medicine", "task_core": "mmlu_college_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_medicine:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.256129026412964, "current_date": "2025-01-28 05:08:49 UTC", "num_instances": 173, "beaker_info": {}, "metrics": {"acc_raw": 0.31213872832369943, "acc_per_token": 0.30057803468208094, "acc_per_char": 0.2658959537572254, "correct_loss_raw": 20.96267718317881, "incorrect_loss_raw": 20.63154107975363, "correct_loss_per_token": 2.8240699885975284, "incorrect_loss_per_token": 2.9788515415505215, "correct_loss_per_char": 0.6670761083108511, "incorrect_loss_per_char": 0.7022751311685591, "acc_uncond": 0.3179190751445087, "correct_loss_uncond": -12.840449129914962, "incorrect_loss_uncond": -12.496904141296543, "primary_score": 0.2658959537572254}, "task_idx": 66} |
|
{"task_name": "mmlu_college_physics", "task_hash": "db149cec3fe17117a3fa544e9ea18d10", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_physics", "task_core": "mmlu_college_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_physics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.676836013793945, "current_date": "2025-01-28 05:09:03 UTC", "num_instances": 102, "beaker_info": {}, "metrics": {"acc_raw": 0.24509803921568626, "acc_per_token": 0.19607843137254902, "acc_per_char": 0.20588235294117646, "correct_loss_raw": 13.384697432611503, "incorrect_loss_raw": 11.688713600822519, "correct_loss_per_token": 2.9039878659705036, "incorrect_loss_per_token": 2.696122187397328, "correct_loss_per_char": 1.154485603179058, "incorrect_loss_per_char": 1.0733237878889328, "acc_uncond": 0.22549019607843138, "correct_loss_uncond": -11.379495354259715, "incorrect_loss_uncond": -11.307336607400108, "primary_score": 0.20588235294117646}, "task_idx": 67} |
|
{"task_name": "mmlu_computer_security", "task_hash": "4a7052996611caebbf6877da200249e9", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_computer_security", "task_core": "mmlu_computer_security", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "computer_security", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_computer_security:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.383684396743774, "current_date": "2025-01-28 05:09:11 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.37, "acc_per_token": 0.39, "acc_per_char": 0.4, "correct_loss_raw": 23.404645067453384, "incorrect_loss_raw": 22.26494207461675, "correct_loss_per_token": 3.697066572162261, "incorrect_loss_per_token": 4.191401704360472, "correct_loss_per_char": 0.8755962735812557, "incorrect_loss_per_char": 0.9529320870270817, "acc_uncond": 0.45, "correct_loss_uncond": -11.437452062368394, "incorrect_loss_uncond": -9.14047906160355, "primary_score": 0.4}, "task_idx": 68} |
|
{"task_name": "mmlu_conceptual_physics", "task_hash": "f183468e707d67350aa3143009a25cb4", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_conceptual_physics", "task_core": "mmlu_conceptual_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "conceptual_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_conceptual_physics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.77080488204956, "current_date": "2025-01-28 05:09:16 UTC", "num_instances": 235, "beaker_info": {}, "metrics": {"acc_raw": 0.4425531914893617, "acc_per_token": 0.40425531914893614, "acc_per_char": 0.3659574468085106, "correct_loss_raw": 9.681720062266004, "incorrect_loss_raw": 11.437909198314586, "correct_loss_per_token": 3.159033647294237, "incorrect_loss_per_token": 3.801626901312656, "correct_loss_per_char": 0.6345066301824792, "incorrect_loss_per_char": 0.7481924325882395, "acc_uncond": 0.32340425531914896, "correct_loss_uncond": -10.286786909052665, "incorrect_loss_uncond": -9.005295290507325, "primary_score": 0.3659574468085106}, "task_idx": 69} |
|
{"task_name": "mmlu_econometrics", "task_hash": "f07b012d85c15887c3dce1c9c732f2cd", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_econometrics", "task_core": "mmlu_econometrics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "econometrics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_econometrics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.9429292678833, "current_date": "2025-01-28 05:09:26 UTC", "num_instances": 114, "beaker_info": {}, "metrics": {"acc_raw": 0.2719298245614035, "acc_per_token": 0.2807017543859649, "acc_per_char": 0.2543859649122807, "correct_loss_raw": 20.407962154400977, "incorrect_loss_raw": 21.27153128246118, "correct_loss_per_token": 2.2465179165738762, "incorrect_loss_per_token": 2.214162568517403, "correct_loss_per_char": 0.5334538634756668, "incorrect_loss_per_char": 0.5527947886786841, "acc_uncond": 0.2807017543859649, "correct_loss_uncond": -14.199475759476947, "incorrect_loss_uncond": -14.191661572944342, "primary_score": 0.2543859649122807}, "task_idx": 70} |
|
{"task_name": "mmlu_electrical_engineering", "task_hash": "4dd791561a029e99d7a01f69b382e913", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_electrical_engineering", "task_core": "mmlu_electrical_engineering", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "electrical_engineering", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_electrical_engineering:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.24954628944397, "current_date": "2025-01-28 05:09:39 UTC", "num_instances": 145, "beaker_info": {}, "metrics": {"acc_raw": 0.2827586206896552, "acc_per_token": 0.38620689655172413, "acc_per_char": 0.27586206896551724, "correct_loss_raw": 14.497281551361084, "incorrect_loss_raw": 14.476599709192904, "correct_loss_per_token": 3.5643252661563363, "incorrect_loss_per_token": 3.9365202202173757, "correct_loss_per_char": 1.0032122860219315, "incorrect_loss_per_char": 1.0192041238790017, "acc_uncond": 0.25517241379310346, "correct_loss_uncond": -8.318864940774851, "incorrect_loss_uncond": -8.915061661840854, "primary_score": 0.27586206896551724}, "task_idx": 71} |
|
{"task_name": "mmlu_elementary_mathematics", "task_hash": "34eb4bd85bcf6cf6a0740154b20610f9", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_elementary_mathematics", "task_core": "mmlu_elementary_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "elementary_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_elementary_mathematics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 23.444039821624756, "current_date": "2025-01-28 05:09:50 UTC", "num_instances": 378, "beaker_info": {}, "metrics": {"acc_raw": 0.22486772486772486, "acc_per_token": 0.24603174603174602, "acc_per_char": 0.2222222222222222, "correct_loss_raw": 12.617182190456088, "incorrect_loss_raw": 12.648899849234864, "correct_loss_per_token": 4.011449249881623, "incorrect_loss_per_token": 4.0698648570679135, "correct_loss_per_char": 1.5799489105760054, "incorrect_loss_per_char": 1.5784691322387068, "acc_uncond": 0.25132275132275134, "correct_loss_uncond": -8.14370087971763, "incorrect_loss_uncond": -7.942084594483527, "primary_score": 0.2222222222222222}, "task_idx": 72} |
|
{"task_name": "mmlu_formal_logic", "task_hash": "edba816f035a5a7d7df7dae63a847ed4", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_formal_logic", "task_core": "mmlu_formal_logic", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "formal_logic", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_formal_logic:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.252433776855469, "current_date": "2025-01-28 05:10:13 UTC", "num_instances": 126, "beaker_info": {}, "metrics": {"acc_raw": 0.2857142857142857, "acc_per_token": 0.30158730158730157, "acc_per_char": 0.25396825396825395, "correct_loss_raw": 24.24169018911937, "incorrect_loss_raw": 25.16380687239309, "correct_loss_per_token": 2.6206852888514813, "incorrect_loss_per_token": 2.5913545592668363, "correct_loss_per_char": 1.178191394437991, "incorrect_loss_per_char": 1.2065683643156648, "acc_uncond": 0.2619047619047619, "correct_loss_uncond": -27.332725691416908, "incorrect_loss_uncond": -27.583639094438503, "primary_score": 0.25396825396825395}, "task_idx": 73} |
|
{"task_name": "mmlu_global_facts", "task_hash": "83faa1c084d9844ed22d2f870171a354", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_global_facts", "task_core": "mmlu_global_facts", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "global_facts", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_global_facts:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.957371950149536, "current_date": "2025-01-28 05:10:26 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.28, "acc_per_token": 0.26, "acc_per_char": 0.27, "correct_loss_raw": 8.227108089923858, "incorrect_loss_raw": 8.770471336841581, "correct_loss_per_token": 2.8547716124380265, "incorrect_loss_per_token": 2.846778433476655, "correct_loss_per_char": 1.1330247212888274, "incorrect_loss_per_char": 1.1418194758238447, "acc_uncond": 0.23, "correct_loss_uncond": -6.596706464290619, "incorrect_loss_uncond": -6.718919524351756, "primary_score": 0.27}, "task_idx": 74} |
|
{"task_name": "mmlu_high_school_biology", "task_hash": "40305e6449b4c634cf3858f0cb1a9ea0", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_biology", "task_core": "mmlu_high_school_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_biology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 22.61864948272705, "current_date": "2025-01-28 05:10:32 UTC", "num_instances": 310, "beaker_info": {}, "metrics": {"acc_raw": 0.33225806451612905, "acc_per_token": 0.38387096774193546, "acc_per_char": 0.3774193548387097, "correct_loss_raw": 23.422528468793438, "incorrect_loss_raw": 23.378436685633897, "correct_loss_per_token": 2.8268858231909193, "incorrect_loss_per_token": 3.1888776075004643, "correct_loss_per_char": 0.5962073244683979, "incorrect_loss_per_char": 0.635100570585042, "acc_uncond": 0.3903225806451613, "correct_loss_uncond": -13.304168920747696, "incorrect_loss_uncond": -12.192243623989887, "primary_score": 0.3774193548387097}, "task_idx": 75} |
|
{"task_name": "mmlu_high_school_chemistry", "task_hash": "c148a2f0c73c4d2e8a363125f171f603", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_chemistry", "task_core": "mmlu_high_school_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_chemistry:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 16.492839813232422, "current_date": "2025-01-28 05:10:55 UTC", "num_instances": 203, "beaker_info": {}, "metrics": {"acc_raw": 0.18719211822660098, "acc_per_token": 0.2512315270935961, "acc_per_char": 0.22660098522167488, "correct_loss_raw": 22.45076175158834, "incorrect_loss_raw": 20.38899238209419, "correct_loss_per_token": 2.850241222329733, "incorrect_loss_per_token": 2.780806227913936, "correct_loss_per_char": 0.9657884561766451, "incorrect_loss_per_char": 0.9530728059606413, "acc_uncond": 0.2315270935960591, "correct_loss_uncond": -12.865491454824438, "incorrect_loss_uncond": -12.986639757559603, "primary_score": 0.22660098522167488}, "task_idx": 76} |
|
{"task_name": "mmlu_high_school_computer_science", "task_hash": "7f237d33901391c40fe99221b7fc7df2", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_computer_science", "task_core": "mmlu_high_school_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_computer_science:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.012717247009277, "current_date": "2025-01-28 05:11:11 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.27, "acc_per_token": 0.27, "acc_per_char": 0.32, "correct_loss_raw": 24.406939495801925, "incorrect_loss_raw": 24.642574167648956, "correct_loss_per_token": 2.733448426139586, "incorrect_loss_per_token": 2.899001420921982, "correct_loss_per_char": 0.8615298626273278, "incorrect_loss_per_char": 0.9178375013391131, "acc_uncond": 0.28, "correct_loss_uncond": -15.89029737830162, "incorrect_loss_uncond": -15.340218031009039, "primary_score": 0.32}, "task_idx": 77} |
|
{"task_name": "mmlu_high_school_european_history", "task_hash": "bce04ae918d4f75bd0e71aeb5508ea76", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_european_history", "task_core": "mmlu_high_school_european_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_european_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_european_history:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 91.07091355323792, "current_date": "2025-01-28 05:11:24 UTC", "num_instances": 165, "beaker_info": {}, "metrics": {"acc_raw": 0.3090909090909091, "acc_per_token": 0.4, "acc_per_char": 0.4303030303030303, "correct_loss_raw": 29.294819712638855, "incorrect_loss_raw": 28.467424194740545, "correct_loss_per_token": 2.6493792924973905, "incorrect_loss_per_token": 3.1982653896595075, "correct_loss_per_char": 0.4819176800770576, "incorrect_loss_per_char": 0.5754395499064723, "acc_uncond": 0.3939393939393939, "correct_loss_uncond": -14.362685403679356, "incorrect_loss_uncond": -12.473233738812535, "primary_score": 0.4303030303030303}, "task_idx": 78} |
|
{"task_name": "mmlu_high_school_geography", "task_hash": "2451a97e8ea5ba8e49d0f60db615137b", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_geography", "task_core": "mmlu_high_school_geography", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_geography", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_geography:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.383923292160034, "current_date": "2025-01-28 05:12:55 UTC", "num_instances": 198, "beaker_info": {}, "metrics": {"acc_raw": 0.398989898989899, "acc_per_token": 0.4292929292929293, "acc_per_char": 0.42424242424242425, "correct_loss_raw": 14.851458657721077, "incorrect_loss_raw": 15.32271108683512, "correct_loss_per_token": 3.2294719091238155, "incorrect_loss_per_token": 3.7772383174772566, "correct_loss_per_char": 0.6109877137406031, "incorrect_loss_per_char": 0.7332291409306742, "acc_uncond": 0.4494949494949495, "correct_loss_uncond": -10.619371057911353, "incorrect_loss_uncond": -8.937387857774294, "primary_score": 0.42424242424242425}, "task_idx": 79} |
|
{"task_name": "mmlu_high_school_government_and_politics", "task_hash": "432e3dd431e2137bb51952baabfe8d40", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_government_and_politics", "task_core": "mmlu_high_school_government_and_politics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_government_and_politics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_government_and_politics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.978307962417603, "current_date": "2025-01-28 05:13:06 UTC", "num_instances": 193, "beaker_info": {}, "metrics": {"acc_raw": 0.36787564766839376, "acc_per_token": 0.41450777202072536, "acc_per_char": 0.40932642487046633, "correct_loss_raw": 22.927042597933756, "incorrect_loss_raw": 24.156823517333244, "correct_loss_per_token": 2.4248573949182557, "incorrect_loss_per_token": 2.8773401133777723, "correct_loss_per_char": 0.40896926714865334, "incorrect_loss_per_char": 0.4856609906527501, "acc_uncond": 0.46113989637305697, "correct_loss_uncond": -15.880143711603985, "incorrect_loss_uncond": -13.578314260703511, "primary_score": 0.40932642487046633}, "task_idx": 80} |
|
{"task_name": "mmlu_high_school_macroeconomics", "task_hash": "fa28d7d574940324e3f18cc755314008", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_macroeconomics", "task_core": "mmlu_high_school_macroeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_macroeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_macroeconomics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 20.658601760864258, "current_date": "2025-01-28 05:13:19 UTC", "num_instances": 390, "beaker_info": {}, "metrics": {"acc_raw": 0.2564102564102564, "acc_per_token": 0.3435897435897436, "acc_per_char": 0.31794871794871793, "correct_loss_raw": 22.553980228839777, "incorrect_loss_raw": 22.513033985480302, "correct_loss_per_token": 2.8149989957995465, "incorrect_loss_per_token": 2.969685400406737, "correct_loss_per_char": 0.5993202375875064, "incorrect_loss_per_char": 0.6177948858909001, "acc_uncond": 0.33589743589743587, "correct_loss_uncond": -14.813048550410148, "incorrect_loss_uncond": -14.261887448873276, "primary_score": 0.31794871794871793}, "task_idx": 81} |
|
{"task_name": "mmlu_high_school_mathematics", "task_hash": "d35dafac7b92c7adc6cb83bfcf827620", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_mathematics", "task_core": "mmlu_high_school_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_mathematics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 16.009334564208984, "current_date": "2025-01-28 05:13:39 UTC", "num_instances": 270, "beaker_info": {}, "metrics": {"acc_raw": 0.14074074074074075, "acc_per_token": 0.16296296296296298, "acc_per_char": 0.16666666666666666, "correct_loss_raw": 9.670874904703211, "incorrect_loss_raw": 8.418458518275509, "correct_loss_per_token": 4.388502920994134, "incorrect_loss_per_token": 4.0571036996920355, "correct_loss_per_char": 1.792509411031147, "incorrect_loss_per_char": 1.6654076509249354, "acc_uncond": 0.25925925925925924, "correct_loss_uncond": -5.577009178090979, "incorrect_loss_uncond": -5.338655949816291, "primary_score": 0.16666666666666666}, "task_idx": 82} |
|
{"task_name": "mmlu_high_school_microeconomics", "task_hash": "9b84847fb5a13e1e48dfd2e71e7dfdc5", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_microeconomics", "task_core": "mmlu_high_school_microeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_microeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_microeconomics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.750521898269653, "current_date": "2025-01-28 05:13:55 UTC", "num_instances": 238, "beaker_info": {}, "metrics": {"acc_raw": 0.3067226890756303, "acc_per_token": 0.35294117647058826, "acc_per_char": 0.3445378151260504, "correct_loss_raw": 26.341775166888198, "incorrect_loss_raw": 25.615681278605404, "correct_loss_per_token": 2.8184217104124993, "incorrect_loss_per_token": 3.035311011271065, "correct_loss_per_char": 0.5994078265429786, "incorrect_loss_per_char": 0.6361737260306404, "acc_uncond": 0.2815126050420168, "correct_loss_uncond": -14.91285052820414, "incorrect_loss_uncond": -14.251284438021052, "primary_score": 0.3445378151260504}, "task_idx": 83} |
|
{"task_name": "mmlu_high_school_physics", "task_hash": "2438f80fa949fdfba5fd0982a3e13ce8", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_physics", "task_core": "mmlu_high_school_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_physics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.409867763519287, "current_date": "2025-01-28 05:14:08 UTC", "num_instances": 151, "beaker_info": {}, "metrics": {"acc_raw": 0.23178807947019867, "acc_per_token": 0.23178807947019867, "acc_per_char": 0.26490066225165565, "correct_loss_raw": 22.763482046443105, "incorrect_loss_raw": 22.257052946827525, "correct_loss_per_token": 2.5960569844563373, "incorrect_loss_per_token": 2.5823878331253107, "correct_loss_per_char": 0.905630647252826, "incorrect_loss_per_char": 0.9196944828304492, "acc_uncond": 0.2052980132450331, "correct_loss_uncond": -15.276854006659905, "incorrect_loss_uncond": -15.482468052415662, "primary_score": 0.26490066225165565}, "task_idx": 84} |
|
{"task_name": "mmlu_high_school_psychology", "task_hash": "e5c6b909fb842973d0ba75f8fad285a1", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_psychology", "task_core": "mmlu_high_school_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_psychology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 40.113529920578, "current_date": "2025-01-28 05:14:23 UTC", "num_instances": 545, "beaker_info": {}, "metrics": {"acc_raw": 0.43486238532110094, "acc_per_token": 0.44770642201834865, "acc_per_char": 0.44770642201834865, "correct_loss_raw": 15.644431526890589, "incorrect_loss_raw": 17.678874772112657, "correct_loss_per_token": 3.287111018232655, "incorrect_loss_per_token": 4.053066081401744, "correct_loss_per_char": 0.5460167619010124, "incorrect_loss_per_char": 0.6743094528059101, "acc_uncond": 0.43302752293577984, "correct_loss_uncond": -12.730780654270715, "incorrect_loss_uncond": -10.791531616394675, "primary_score": 0.44770642201834865}, "task_idx": 85} |
|
{"task_name": "mmlu_high_school_statistics", "task_hash": "c5e879c445098b25ee27496e3b91777c", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_statistics", "task_core": "mmlu_high_school_statistics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_statistics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_statistics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 26.460991144180298, "current_date": "2025-01-28 05:15:03 UTC", "num_instances": 216, "beaker_info": {}, "metrics": {"acc_raw": 0.25462962962962965, "acc_per_token": 0.27314814814814814, "acc_per_char": 0.2824074074074074, "correct_loss_raw": 27.651417740517193, "incorrect_loss_raw": 28.06657115416028, "correct_loss_per_token": 2.7900353353234726, "incorrect_loss_per_token": 2.8485333483311313, "correct_loss_per_char": 0.844808223388995, "incorrect_loss_per_char": 0.8798398461280343, "acc_uncond": 0.2638888888888889, "correct_loss_uncond": -15.988951186890956, "incorrect_loss_uncond": -15.400075823988445, "primary_score": 0.2824074074074074}, "task_idx": 86} |
|
{"task_name": "mmlu_high_school_us_history", "task_hash": "07edfc83a12773340cdb716671b46541", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_us_history", "task_core": "mmlu_high_school_us_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_us_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_us_history:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 111.46235418319702, "current_date": "2025-01-28 05:15:30 UTC", "num_instances": 204, "beaker_info": {}, "metrics": {"acc_raw": 0.3137254901960784, "acc_per_token": 0.28431372549019607, "acc_per_char": 0.36764705882352944, "correct_loss_raw": 26.880014566522018, "incorrect_loss_raw": 27.453274901980677, "correct_loss_per_token": 2.6105476030654637, "incorrect_loss_per_token": 2.816699075767538, "correct_loss_per_char": 0.49075362699857583, "incorrect_loss_per_char": 0.5346520600328966, "acc_uncond": 0.4019607843137255, "correct_loss_uncond": -13.405258101867695, "incorrect_loss_uncond": -11.7003982871576, "primary_score": 0.36764705882352944}, "task_idx": 87} |
|
{"task_name": "mmlu_high_school_world_history", "task_hash": "38f161e2f228b6acfe7cb1aa36d0d3ef", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_world_history", "task_core": "mmlu_high_school_world_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_world_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_world_history:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 79.84046602249146, "current_date": "2025-01-28 05:17:21 UTC", "num_instances": 237, "beaker_info": {}, "metrics": {"acc_raw": 0.31223628691983124, "acc_per_token": 0.3333333333333333, "acc_per_char": 0.3291139240506329, "correct_loss_raw": 30.838054077534736, "incorrect_loss_raw": 30.87430640883251, "correct_loss_per_token": 2.867109712714596, "incorrect_loss_per_token": 3.260024923317792, "correct_loss_per_char": 0.5314544005975688, "incorrect_loss_per_char": 0.5843608341080979, "acc_uncond": 0.38396624472573837, "correct_loss_uncond": -13.943946617062082, "incorrect_loss_uncond": -12.21367009916721, "primary_score": 0.3291139240506329}, "task_idx": 88} |
|
{"task_name": "mmlu_human_aging", "task_hash": "8c66e7db317c293ebcd7cd3ad67b5840", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_aging", "task_core": "mmlu_human_aging", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_aging", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_aging:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.146653413772583, "current_date": "2025-01-28 05:18:40 UTC", "num_instances": 223, "beaker_info": {}, "metrics": {"acc_raw": 0.3901345291479821, "acc_per_token": 0.37668161434977576, "acc_per_char": 0.3632286995515695, "correct_loss_raw": 13.207357588370284, "incorrect_loss_raw": 15.715544661955262, "correct_loss_per_token": 3.1795063876129412, "incorrect_loss_per_token": 3.640655177994138, "correct_loss_per_char": 0.5910538896864498, "incorrect_loss_per_char": 0.711383290297721, "acc_uncond": 0.4125560538116592, "correct_loss_uncond": -9.76212308438904, "incorrect_loss_uncond": -8.600861716341722, "primary_score": 0.3632286995515695}, "task_idx": 89} |
|
{"task_name": "mmlu_human_sexuality", "task_hash": "f3dcb40d784b716dae889d9bf3c62232", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_sexuality", "task_core": "mmlu_human_sexuality", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_sexuality", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_sexuality:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.637839317321777, "current_date": "2025-01-28 05:18:52 UTC", "num_instances": 131, "beaker_info": {}, "metrics": {"acc_raw": 0.366412213740458, "acc_per_token": 0.4122137404580153, "acc_per_char": 0.37404580152671757, "correct_loss_raw": 15.403073868678726, "incorrect_loss_raw": 16.88063151266132, "correct_loss_per_token": 3.2456610589728214, "incorrect_loss_per_token": 3.786674935370138, "correct_loss_per_char": 0.6795283338072725, "incorrect_loss_per_char": 0.717655541040209, "acc_uncond": 0.31297709923664124, "correct_loss_uncond": -10.27558324446205, "incorrect_loss_uncond": -11.262372124589433, "primary_score": 0.37404580152671757}, "task_idx": 90} |
|
{"task_name": "mmlu_international_law", "task_hash": "b4d3ab839d093262fe791e56c98053df", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_international_law", "task_core": "mmlu_international_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "international_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_international_law:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.133466958999634, "current_date": "2025-01-28 05:18:58 UTC", "num_instances": 121, "beaker_info": {}, "metrics": {"acc_raw": 0.21487603305785125, "acc_per_token": 0.34710743801652894, "acc_per_char": 0.33884297520661155, "correct_loss_raw": 48.05373882459215, "incorrect_loss_raw": 35.16545577449905, "correct_loss_per_token": 2.442693908633941, "incorrect_loss_per_token": 2.6322821670669034, "correct_loss_per_char": 0.4478827587550682, "incorrect_loss_per_char": 0.47135644041508495, "acc_uncond": 0.4132231404958678, "correct_loss_uncond": -25.577862721829376, "incorrect_loss_uncond": -22.631607413291942, "primary_score": 0.33884297520661155}, "task_idx": 91} |
|
{"task_name": "mmlu_jurisprudence", "task_hash": "a5a3583aea5dbd6ece8896b0140522f5", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_jurisprudence", "task_core": "mmlu_jurisprudence", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "jurisprudence", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_jurisprudence:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.887946605682373, "current_date": "2025-01-28 05:19:08 UTC", "num_instances": 108, "beaker_info": {}, "metrics": {"acc_raw": 0.24074074074074073, "acc_per_token": 0.2777777777777778, "acc_per_char": 0.3055555555555556, "correct_loss_raw": 27.873796701431274, "incorrect_loss_raw": 23.404582691781314, "correct_loss_per_token": 3.2104180097790223, "incorrect_loss_per_token": 3.6451304542187484, "correct_loss_per_char": 0.6253939046986986, "incorrect_loss_per_char": 0.6669790162259673, "acc_uncond": 0.35185185185185186, "correct_loss_uncond": -13.328769586704395, "incorrect_loss_uncond": -11.962864413673495, "primary_score": 0.3055555555555556}, "task_idx": 92} |
|
{"task_name": "mmlu_logical_fallacies", "task_hash": "87754a93f67c5e3682212e20e26d138f", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_logical_fallacies", "task_core": "mmlu_logical_fallacies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "logical_fallacies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_logical_fallacies:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.924015998840332, "current_date": "2025-01-28 05:19:15 UTC", "num_instances": 163, "beaker_info": {}, "metrics": {"acc_raw": 0.32515337423312884, "acc_per_token": 0.32515337423312884, "acc_per_char": 0.3006134969325153, "correct_loss_raw": 24.923576025875068, "incorrect_loss_raw": 24.727881545669465, "correct_loss_per_token": 3.5600692408888506, "incorrect_loss_per_token": 3.9380153038708654, "correct_loss_per_char": 0.627988358930759, "incorrect_loss_per_char": 0.7189265105471663, "acc_uncond": 0.34355828220858897, "correct_loss_uncond": -11.801360945028762, "incorrect_loss_uncond": -9.991538072168707, "primary_score": 0.3006134969325153}, "task_idx": 93} |
|
{"task_name": "mmlu_machine_learning", "task_hash": "c7a50715045d63764fe2fc8c95f84e4e", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_machine_learning", "task_core": "mmlu_machine_learning", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "machine_learning", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_machine_learning:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.163136959075928, "current_date": "2025-01-28 05:19:26 UTC", "num_instances": 112, "beaker_info": {}, "metrics": {"acc_raw": 0.25, "acc_per_token": 0.20535714285714285, "acc_per_char": 0.25892857142857145, "correct_loss_raw": 20.02137460027422, "incorrect_loss_raw": 20.219824033833685, "correct_loss_per_token": 4.044495080832256, "incorrect_loss_per_token": 3.9327769267589305, "correct_loss_per_char": 1.0349116590038663, "incorrect_loss_per_char": 1.015609778293177, "acc_uncond": 0.25892857142857145, "correct_loss_uncond": -7.27716230068888, "incorrect_loss_uncond": -6.800522708467076, "primary_score": 0.25892857142857145}, "task_idx": 94} |
|
{"task_name": "mmlu_management", "task_hash": "bb2a328db2333c8df600dba174c2c4f7", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_management", "task_core": "mmlu_management", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "management", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_management:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.399227142333984, "current_date": "2025-01-28 05:19:35 UTC", "num_instances": 103, "beaker_info": {}, "metrics": {"acc_raw": 0.3592233009708738, "acc_per_token": 0.4174757281553398, "acc_per_char": 0.46601941747572817, "correct_loss_raw": 13.373132923744546, "incorrect_loss_raw": 13.948064290204092, "correct_loss_per_token": 3.4889721474293993, "incorrect_loss_per_token": 3.9458886085815177, "correct_loss_per_char": 0.5919467928270647, "incorrect_loss_per_char": 0.6679528529252269, "acc_uncond": 0.4077669902912621, "correct_loss_uncond": -9.704405075825244, "incorrect_loss_uncond": -8.348487658022291, "primary_score": 0.46601941747572817}, "task_idx": 95} |
|
{"task_name": "mmlu_marketing", "task_hash": "58c595b7c49dba71f3aa397880a13a84", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_marketing", "task_core": "mmlu_marketing", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "marketing", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_marketing:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.895387172698975, "current_date": "2025-01-28 05:19:40 UTC", "num_instances": 234, "beaker_info": {}, "metrics": {"acc_raw": 0.5470085470085471, "acc_per_token": 0.5085470085470085, "acc_per_char": 0.5384615384615384, "correct_loss_raw": 13.572457664542728, "incorrect_loss_raw": 16.647636182627444, "correct_loss_per_token": 2.809427778515972, "incorrect_loss_per_token": 3.6180697037128433, "correct_loss_per_char": 0.5767109962686646, "incorrect_loss_per_char": 0.7547783048689498, "acc_uncond": 0.5256410256410257, "correct_loss_uncond": -11.739940891408512, "incorrect_loss_uncond": -9.285592714945475, "primary_score": 0.5384615384615384}, "task_idx": 96} |
|
{"task_name": "mmlu_medical_genetics", "task_hash": "36a9fec8301b47f23d8ced742c53d402", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_medical_genetics", "task_core": "mmlu_medical_genetics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "medical_genetics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_medical_genetics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.50648832321167, "current_date": "2025-01-28 05:19:55 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.31, "acc_per_token": 0.37, "acc_per_char": 0.37, "correct_loss_raw": 15.446263015270233, "incorrect_loss_raw": 14.41324984212717, "correct_loss_per_token": 2.6825006160742366, "incorrect_loss_per_token": 3.0072725567487204, "correct_loss_per_char": 0.6961493974895729, "incorrect_loss_per_char": 0.7720714342989248, "acc_uncond": 0.34, "correct_loss_uncond": -12.730000212192536, "incorrect_loss_uncond": -11.418937958280251, "primary_score": 0.37}, "task_idx": 97} |
|
{"task_name": "mmlu_miscellaneous", "task_hash": "3ce7aa82135b0926faa1a6d49e1f073f", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_miscellaneous", "task_core": "mmlu_miscellaneous", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "miscellaneous", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_miscellaneous:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 31.32127857208252, "current_date": "2025-01-28 05:19:59 UTC", "num_instances": 783, "beaker_info": {}, "metrics": {"acc_raw": 0.4623243933588761, "acc_per_token": 0.45338441890166026, "acc_per_char": 0.44189016602809705, "correct_loss_raw": 10.597466928901039, "incorrect_loss_raw": 12.492943032795754, "correct_loss_per_token": 3.077234891557639, "incorrect_loss_per_token": 3.983762621408433, "correct_loss_per_char": 0.6642742579545429, "incorrect_loss_per_char": 0.8663300746780773, "acc_uncond": 0.46998722860791825, "correct_loss_uncond": -9.998273584273278, "incorrect_loss_uncond": -7.989548977256683, "primary_score": 0.44189016602809705}, "task_idx": 98} |
|
{"task_name": "mmlu_moral_disputes", "task_hash": "643b3f1a385bb8b4ce6a53105fffb3de", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_disputes", "task_core": "mmlu_moral_disputes", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_disputes", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_disputes:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 23.997414588928223, "current_date": "2025-01-28 05:20:31 UTC", "num_instances": 346, "beaker_info": {}, "metrics": {"acc_raw": 0.28901734104046245, "acc_per_token": 0.2861271676300578, "acc_per_char": 0.2658959537572254, "correct_loss_raw": 27.931170863223215, "incorrect_loss_raw": 25.438975363108458, "correct_loss_per_token": 3.0590862848354696, "incorrect_loss_per_token": 3.231302455917077, "correct_loss_per_char": 0.6051675742913569, "incorrect_loss_per_char": 0.61099888660525, "acc_uncond": 0.3236994219653179, "correct_loss_uncond": -12.899194907590832, "incorrect_loss_uncond": -12.524405207018408, "primary_score": 0.2658959537572254}, "task_idx": 99} |
|
{"task_name": "mmlu_moral_scenarios", "task_hash": "49d4bc1cb20a4596312dda1c40b5467e", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_scenarios", "task_core": "mmlu_moral_scenarios", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_scenarios", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_scenarios:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 117.07968044281006, "current_date": "2025-01-28 05:20:55 UTC", "num_instances": 895, "beaker_info": {}, "metrics": {"acc_raw": 0.23798882681564246, "acc_per_token": 0.23910614525139665, "acc_per_char": 0.24134078212290502, "correct_loss_raw": 1.8169888589635241, "incorrect_loss_raw": 1.794629777965156, "correct_loss_per_token": 0.4470566453747245, "incorrect_loss_per_token": 0.44595804752658275, "correct_loss_per_char": 0.10512623958974505, "incorrect_loss_per_char": 0.10480863927019186, "acc_uncond": 0.2424581005586592, "correct_loss_uncond": -20.412340985463317, "incorrect_loss_uncond": -20.270843533475063, "primary_score": 0.24134078212290502}, "task_idx": 100} |
|
{"task_name": "mmlu_nutrition", "task_hash": "96b6d39ad9e2a3d1f6444ca444eafe21", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_nutrition", "task_core": "mmlu_nutrition", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "nutrition", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_nutrition:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 24.025350332260132, "current_date": "2025-01-28 05:22:52 UTC", "num_instances": 306, "beaker_info": {}, "metrics": {"acc_raw": 0.27124183006535946, "acc_per_token": 0.3104575163398693, "acc_per_char": 0.3300653594771242, "correct_loss_raw": 27.09890270642206, "incorrect_loss_raw": 24.103218915935177, "correct_loss_per_token": 2.741918507811255, "incorrect_loss_per_token": 2.9568815308321694, "correct_loss_per_char": 0.5967657225962812, "incorrect_loss_per_char": 0.6413422495462857, "acc_uncond": 0.30718954248366015, "correct_loss_uncond": -11.472530127156015, "incorrect_loss_uncond": -11.217267668065428, "primary_score": 0.3300653594771242}, "task_idx": 101} |
|
{"task_name": "mmlu_philosophy", "task_hash": "e8a8e079a41710f36b2b11993287bbfb", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_philosophy", "task_core": "mmlu_philosophy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "philosophy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_philosophy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 16.985312938690186, "current_date": "2025-01-28 05:23:16 UTC", "num_instances": 311, "beaker_info": {}, "metrics": {"acc_raw": 0.31189710610932475, "acc_per_token": 0.2958199356913183, "acc_per_char": 0.29260450160771706, "correct_loss_raw": 22.84445811362512, "incorrect_loss_raw": 21.41734132250534, "correct_loss_per_token": 3.2569723900673795, "incorrect_loss_per_token": 3.448014758203821, "correct_loss_per_char": 0.6448074730052004, "incorrect_loss_per_char": 0.6701893653621859, "acc_uncond": 0.34726688102893893, "correct_loss_uncond": -12.343435560679513, "incorrect_loss_uncond": -11.51666408566416, "primary_score": 0.29260450160771706}, "task_idx": 102} |
|
{"task_name": "mmlu_prehistory", "task_hash": "7b3aeaaf8c8020231ef7fed4751f86c2", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_prehistory", "task_core": "mmlu_prehistory", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "prehistory", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_prehistory:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 20.815616846084595, "current_date": "2025-01-28 05:23:33 UTC", "num_instances": 324, "beaker_info": {}, "metrics": {"acc_raw": 0.38580246913580246, "acc_per_token": 0.33641975308641975, "acc_per_char": 0.3395061728395062, "correct_loss_raw": 22.741492905366567, "incorrect_loss_raw": 24.148995708045643, "correct_loss_per_token": 2.9285628383158593, "incorrect_loss_per_token": 3.1740829046824226, "correct_loss_per_char": 0.6684254604296711, "incorrect_loss_per_char": 0.7116053821350659, "acc_uncond": 0.30864197530864196, "correct_loss_uncond": -13.82639996394699, "incorrect_loss_uncond": -13.412203422299138, "primary_score": 0.3395061728395062}, "task_idx": 103} |
|
{"task_name": "mmlu_professional_accounting", "task_hash": "271a9bf402980f6076d2237f6c3d56d5", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_accounting", "task_core": "mmlu_professional_accounting", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_accounting", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_accounting:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 35.661864280700684, "current_date": "2025-01-28 05:23:54 UTC", "num_instances": 282, "beaker_info": {}, "metrics": {"acc_raw": 0.2624113475177305, "acc_per_token": 0.24468085106382978, "acc_per_char": 0.2624113475177305, "correct_loss_raw": 25.468858400980633, "incorrect_loss_raw": 25.603731684791857, "correct_loss_per_token": 2.9282979776618254, "incorrect_loss_per_token": 2.9816599561063, "correct_loss_per_char": 0.7859532904616492, "incorrect_loss_per_char": 0.8187658903234888, "acc_uncond": 0.23404255319148937, "correct_loss_uncond": -12.358145314750942, "incorrect_loss_uncond": -11.935372286513626, "primary_score": 0.2624113475177305}, "task_idx": 104} |
|
{"task_name": "mmlu_professional_law", "task_hash": "9cf2ca304d70aaad2023633d91fbfefa", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_law", "task_core": "mmlu_professional_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_law:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 500.5496609210968, "current_date": "2025-01-28 05:24:30 UTC", "num_instances": 1534, "beaker_info": {}, "metrics": {"acc_raw": 0.24837027379400262, "acc_per_token": 0.2770534550195567, "acc_per_char": 0.28292046936114734, "correct_loss_raw": 42.53276868166482, "incorrect_loss_raw": 40.959912275578944, "correct_loss_per_token": 2.328051215523688, "incorrect_loss_per_token": 2.338804521712942, "correct_loss_per_char": 0.4630966530959382, "incorrect_loss_per_char": 0.4638887808229263, "acc_uncond": 0.28292046936114734, "correct_loss_uncond": -26.31520616145613, "incorrect_loss_uncond": -25.362183912558844, "primary_score": 0.28292046936114734}, "task_idx": 105} |
|
{"task_name": "mmlu_professional_medicine", "task_hash": "e76678f3aea053cba7bbb3fe152ff642", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_medicine", "task_core": "mmlu_professional_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_medicine:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 65.28554391860962, "current_date": "2025-01-28 05:32:50 UTC", "num_instances": 272, "beaker_info": {}, "metrics": {"acc_raw": 0.27941176470588236, "acc_per_token": 0.30514705882352944, "acc_per_char": 0.3161764705882353, "correct_loss_raw": 15.306199021856575, "incorrect_loss_raw": 16.223133846971336, "correct_loss_per_token": 2.7531318445173576, "incorrect_loss_per_token": 2.9333700271242953, "correct_loss_per_char": 0.5619092239146051, "incorrect_loss_per_char": 0.6103003730666766, "acc_uncond": 0.35294117647058826, "correct_loss_uncond": -10.302531104973134, "incorrect_loss_uncond": -9.460067889123565, "primary_score": 0.3161764705882353}, "task_idx": 106} |
|
{"task_name": "mmlu_professional_psychology", "task_hash": "1f11cdabb27186bb3d09781f9a2bce87", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_psychology", "task_core": "mmlu_professional_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_psychology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 52.16387963294983, "current_date": "2025-01-28 05:33:55 UTC", "num_instances": 612, "beaker_info": {}, "metrics": {"acc_raw": 0.28104575163398693, "acc_per_token": 0.32189542483660133, "acc_per_char": 0.3137254901960784, "correct_loss_raw": 26.00777135838091, "incorrect_loss_raw": 26.806547340874054, "correct_loss_per_token": 3.388280352304593, "incorrect_loss_per_token": 3.66053271580569, "correct_loss_per_char": 0.6138900647704278, "incorrect_loss_per_char": 0.6624101534681627, "acc_uncond": 0.32516339869281047, "correct_loss_uncond": -14.89004742047366, "incorrect_loss_uncond": -14.08248072291251, "primary_score": 0.3137254901960784}, "task_idx": 107} |
|
{"task_name": "mmlu_public_relations", "task_hash": "f4f7d9efa5b14b632f1bb8cf53a780d0", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_public_relations", "task_core": "mmlu_public_relations", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "public_relations", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_public_relations:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.839644908905029, "current_date": "2025-01-28 05:34:47 UTC", "num_instances": 110, "beaker_info": {}, "metrics": {"acc_raw": 0.43636363636363634, "acc_per_token": 0.33636363636363636, "acc_per_char": 0.32727272727272727, "correct_loss_raw": 14.436861190470783, "incorrect_loss_raw": 17.1334787957596, "correct_loss_per_token": 4.36399790285229, "incorrect_loss_per_token": 4.841983841091792, "correct_loss_per_char": 0.7630880369032698, "incorrect_loss_per_char": 0.8140421525722414, "acc_uncond": 0.3090909090909091, "correct_loss_uncond": -8.835875168171796, "incorrect_loss_uncond": -7.6501546885028, "primary_score": 0.32727272727272727}, "task_idx": 108} |
|
{"task_name": "mmlu_security_studies", "task_hash": "ae4ffe7cce87e733dc815d013b44ec75", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_security_studies", "task_core": "mmlu_security_studies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "security_studies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_security_studies:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 27.786017894744873, "current_date": "2025-01-28 05:34:53 UTC", "num_instances": 245, "beaker_info": {}, "metrics": {"acc_raw": 0.2938775510204082, "acc_per_token": 0.2897959183673469, "acc_per_char": 0.24081632653061225, "correct_loss_raw": 89.91756982997973, "incorrect_loss_raw": 99.25826387470266, "correct_loss_per_token": 3.200697167437819, "incorrect_loss_per_token": 3.1255254454309878, "correct_loss_per_char": 0.6131914929363799, "incorrect_loss_per_char": 0.5667601617935849, "acc_uncond": 0.2612244897959184, "correct_loss_uncond": -16.19699908470621, "incorrect_loss_uncond": -19.218327763615825, "primary_score": 0.24081632653061225}, "task_idx": 109} |
|
{"task_name": "mmlu_sociology", "task_hash": "66633d3e396945e27b4489e2e582b958", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_sociology", "task_core": "mmlu_sociology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "sociology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_sociology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.78001070022583, "current_date": "2025-01-28 05:35:21 UTC", "num_instances": 201, "beaker_info": {}, "metrics": {"acc_raw": 0.29850746268656714, "acc_per_token": 0.3383084577114428, "acc_per_char": 0.2935323383084577, "correct_loss_raw": 30.156895663607774, "incorrect_loss_raw": 30.96591551584588, "correct_loss_per_token": 3.2871702017277107, "incorrect_loss_per_token": 3.5714795698468693, "correct_loss_per_char": 0.5643088544491333, "incorrect_loss_per_char": 0.5912486594696444, "acc_uncond": 0.43781094527363185, "correct_loss_uncond": -14.328461995765345, "incorrect_loss_uncond": -13.052933505913902, "primary_score": 0.2935323383084577}, "task_idx": 110} |
|
{"task_name": "mmlu_us_foreign_policy", "task_hash": "bd1ffb65bcdfb1582c6b60bcdbd3d533", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_us_foreign_policy", "task_core": "mmlu_us_foreign_policy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "us_foreign_policy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_us_foreign_policy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.155822515487671, "current_date": "2025-01-28 05:35:33 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.35, "acc_per_token": 0.36, "acc_per_char": 0.33, "correct_loss_raw": 23.12191395521164, "incorrect_loss_raw": 21.31204191843669, "correct_loss_per_token": 2.6672976917101683, "incorrect_loss_per_token": 2.98203683182319, "correct_loss_per_char": 0.5250935917163915, "incorrect_loss_per_char": 0.5662430079511417, "acc_uncond": 0.42, "correct_loss_uncond": -12.74882021188736, "incorrect_loss_uncond": -11.86411322991053, "primary_score": 0.33}, "task_idx": 111} |
|
{"task_name": "mmlu_virology", "task_hash": "ea10babc381c242bef7bc631f8d422d2", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_virology", "task_core": "mmlu_virology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "virology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_virology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.729250192642212, "current_date": "2025-01-28 05:35:39 UTC", "num_instances": 166, "beaker_info": {}, "metrics": {"acc_raw": 0.22289156626506024, "acc_per_token": 0.3795180722891566, "acc_per_char": 0.3132530120481928, "correct_loss_raw": 18.919861298009575, "incorrect_loss_raw": 18.860513225377314, "correct_loss_per_token": 3.2429491325508266, "incorrect_loss_per_token": 3.6717020625157706, "correct_loss_per_char": 0.6536002292812351, "incorrect_loss_per_char": 0.7150839392683639, "acc_uncond": 0.2710843373493976, "correct_loss_uncond": -10.158014026032873, "incorrect_loss_uncond": -10.19834279559224, "primary_score": 0.3132530120481928}, "task_idx": 112} |
|
{"task_name": "mmlu_world_religions", "task_hash": "7b18e63e9c2a47f065dce28de478a8c0", "model_hash": "311d8c1f2af0797bd3bfd7bf664762cf", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_world_religions", "task_core": "mmlu_world_religions", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "world_religions", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_world_religions:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-uniform_by_type-v3.8new-lgbm-mmlu-v8.5-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.193619012832642, "current_date": "2025-01-28 05:35:48 UTC", "num_instances": 171, "beaker_info": {}, "metrics": {"acc_raw": 0.4678362573099415, "acc_per_token": 0.47368421052631576, "acc_per_char": 0.45614035087719296, "correct_loss_raw": 9.278102223548974, "incorrect_loss_raw": 10.631291508674623, "correct_loss_per_token": 2.76918694587416, "incorrect_loss_per_token": 3.66177353335694, "correct_loss_per_char": 0.7462150697326452, "incorrect_loss_per_char": 0.9363976597324706, "acc_uncond": 0.4853801169590643, "correct_loss_uncond": -9.94865690067149, "incorrect_loss_uncond": -7.818852109286285, "primary_score": 0.45614035087719296}, "task_idx": 113} |
|
|