{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9902912621359223,
  "eval_steps": 500,
  "global_step": 51,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019417475728155338,
      "grad_norm": 6.841878890991211,
      "learning_rate": 2e-05,
      "loss": 4.5864,
      "step": 1
    },
    {
      "epoch": 0.038834951456310676,
      "grad_norm": 6.640402793884277,
      "learning_rate": 4e-05,
      "loss": 4.6138,
      "step": 2
    },
    {
      "epoch": 0.05825242718446602,
      "grad_norm": 6.41209077835083,
      "learning_rate": 6e-05,
      "loss": 4.6064,
      "step": 3
    },
    {
      "epoch": 0.07766990291262135,
      "grad_norm": 6.082383632659912,
      "learning_rate": 8e-05,
      "loss": 4.5534,
      "step": 4
    },
    {
      "epoch": 0.0970873786407767,
      "grad_norm": 4.00201416015625,
      "learning_rate": 0.0001,
      "loss": 4.3245,
      "step": 5
    },
    {
      "epoch": 0.11650485436893204,
      "grad_norm": 3.1707470417022705,
      "learning_rate": 0.00012,
      "loss": 4.0971,
      "step": 6
    },
    {
      "epoch": 0.13592233009708737,
      "grad_norm": 3.160670518875122,
      "learning_rate": 0.00014,
      "loss": 3.789,
      "step": 7
    },
    {
      "epoch": 0.1553398058252427,
      "grad_norm": 2.994746208190918,
      "learning_rate": 0.00016,
      "loss": 3.3244,
      "step": 8
    },
    {
      "epoch": 0.17475728155339806,
      "grad_norm": 2.6664786338806152,
      "learning_rate": 0.00018,
      "loss": 2.8561,
      "step": 9
    },
    {
      "epoch": 0.1941747572815534,
      "grad_norm": 2.029910087585449,
      "learning_rate": 0.0002,
      "loss": 2.3926,
      "step": 10
    },
    {
      "epoch": 0.21359223300970873,
      "grad_norm": 1.3144137859344482,
      "learning_rate": 0.00019998688836656323,
      "loss": 2.0283,
      "step": 11
    },
    {
      "epoch": 0.23300970873786409,
      "grad_norm": 0.8263471722602844,
      "learning_rate": 0.00019994755690455152,
      "loss": 1.8683,
      "step": 12
    },
    {
      "epoch": 0.2524271844660194,
      "grad_norm": 0.7193414568901062,
      "learning_rate": 0.0001998820159279591,
      "loss": 1.732,
      "step": 13
    },
    {
      "epoch": 0.27184466019417475,
      "grad_norm": 0.8350910544395447,
      "learning_rate": 0.00019979028262377118,
      "loss": 1.6208,
      "step": 14
    },
    {
      "epoch": 0.2912621359223301,
      "grad_norm": 0.6700139045715332,
      "learning_rate": 0.00019967238104745696,
      "loss": 1.493,
      "step": 15
    },
    {
      "epoch": 0.3106796116504854,
      "grad_norm": 1.1213518381118774,
      "learning_rate": 0.0001995283421166614,
      "loss": 1.4857,
      "step": 16
    },
    {
      "epoch": 0.3300970873786408,
      "grad_norm": 0.7505896687507629,
      "learning_rate": 0.00019935820360309777,
      "loss": 1.4643,
      "step": 17
    },
    {
      "epoch": 0.34951456310679613,
      "grad_norm": 0.6984190940856934,
      "learning_rate": 0.00019916201012264254,
      "loss": 1.4227,
      "step": 18
    },
    {
      "epoch": 0.36893203883495146,
      "grad_norm": 0.46819525957107544,
      "learning_rate": 0.00019893981312363562,
      "loss": 1.3889,
      "step": 19
    },
    {
      "epoch": 0.3883495145631068,
      "grad_norm": 0.5214670300483704,
      "learning_rate": 0.00019869167087338907,
      "loss": 1.3764,
      "step": 20
    },
    {
      "epoch": 0.4077669902912621,
      "grad_norm": 0.43329402804374695,
      "learning_rate": 0.00019841764844290744,
      "loss": 1.351,
      "step": 21
    },
    {
      "epoch": 0.42718446601941745,
      "grad_norm": 0.6535053253173828,
      "learning_rate": 0.0001981178176898239,
      "loss": 1.3576,
      "step": 22
    },
    {
      "epoch": 0.44660194174757284,
      "grad_norm": 0.6363189816474915,
      "learning_rate": 0.00019779225723955707,
      "loss": 1.3159,
      "step": 23
    },
    {
      "epoch": 0.46601941747572817,
      "grad_norm": 0.42140620946884155,
      "learning_rate": 0.00019744105246469263,
      "loss": 1.3072,
      "step": 24
    },
    {
      "epoch": 0.4854368932038835,
      "grad_norm": 0.6757440567016602,
      "learning_rate": 0.00019706429546259593,
      "loss": 1.3261,
      "step": 25
    },
    {
      "epoch": 0.5048543689320388,
      "grad_norm": 0.4116632044315338,
      "learning_rate": 0.00019666208503126112,
      "loss": 1.317,
      "step": 26
    },
    {
      "epoch": 0.5242718446601942,
      "grad_norm": 0.5359898209571838,
      "learning_rate": 0.00019623452664340306,
      "loss": 1.2932,
      "step": 27
    },
    {
      "epoch": 0.5436893203883495,
      "grad_norm": 0.4596949517726898,
      "learning_rate": 0.00019578173241879872,
      "loss": 1.2601,
      "step": 28
    },
    {
      "epoch": 0.5631067961165048,
      "grad_norm": 0.6857442855834961,
      "learning_rate": 0.0001953038210948861,
      "loss": 1.2734,
      "step": 29
    },
    {
      "epoch": 0.5825242718446602,
      "grad_norm": 0.4453957676887512,
      "learning_rate": 0.00019480091799562704,
      "loss": 1.2521,
      "step": 30
    },
    {
      "epoch": 0.6019417475728155,
      "grad_norm": 0.6118847131729126,
      "learning_rate": 0.00019427315499864344,
      "loss": 1.2252,
      "step": 31
    },
    {
      "epoch": 0.6213592233009708,
      "grad_norm": 0.4894339442253113,
      "learning_rate": 0.00019372067050063438,
      "loss": 1.2369,
      "step": 32
    },
    {
      "epoch": 0.6407766990291263,
      "grad_norm": 0.5901785492897034,
      "learning_rate": 0.00019314360938108425,
      "loss": 1.2507,
      "step": 33
    },
    {
      "epoch": 0.6601941747572816,
      "grad_norm": 0.4147897958755493,
      "learning_rate": 0.00019254212296427044,
      "loss": 1.2454,
      "step": 34
    },
    {
      "epoch": 0.6796116504854369,
      "grad_norm": 0.2836126685142517,
      "learning_rate": 0.00019191636897958122,
      "loss": 1.2515,
      "step": 35
    },
    {
      "epoch": 0.6990291262135923,
      "grad_norm": 0.4523944556713104,
      "learning_rate": 0.00019126651152015403,
      "loss": 1.2217,
      "step": 36
    },
    {
      "epoch": 0.7184466019417476,
      "grad_norm": 0.4187013804912567,
      "learning_rate": 0.0001905927209998447,
      "loss": 1.2382,
      "step": 37
    },
    {
      "epoch": 0.7378640776699029,
      "grad_norm": 0.516381561756134,
      "learning_rate": 0.00018989517410853955,
      "loss": 1.2214,
      "step": 38
    },
    {
      "epoch": 0.7572815533980582,
      "grad_norm": 0.5928078889846802,
      "learning_rate": 0.00018917405376582145,
      "loss": 1.2407,
      "step": 39
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 0.4062119722366333,
      "learning_rate": 0.00018842954907300236,
      "loss": 1.1958,
      "step": 40
    },
    {
      "epoch": 0.7961165048543689,
      "grad_norm": 0.5966278910636902,
      "learning_rate": 0.0001876618552635348,
      "loss": 1.2043,
      "step": 41
    },
    {
      "epoch": 0.8155339805825242,
      "grad_norm": 0.47196483612060547,
      "learning_rate": 0.00018687117365181512,
      "loss": 1.2321,
      "step": 42
    },
    {
      "epoch": 0.8349514563106796,
      "grad_norm": 0.6465126872062683,
      "learning_rate": 0.00018605771158039253,
      "loss": 1.1996,
      "step": 43
    },
    {
      "epoch": 0.8543689320388349,
      "grad_norm": 0.4234643578529358,
      "learning_rate": 0.00018522168236559695,
      "loss": 1.198,
      "step": 44
    },
    {
      "epoch": 0.8737864077669902,
      "grad_norm": 0.3910824656486511,
      "learning_rate": 0.00018436330524160047,
      "loss": 1.1895,
      "step": 45
    },
    {
      "epoch": 0.8932038834951457,
      "grad_norm": 0.5380635261535645,
      "learning_rate": 0.00018348280530292713,
      "loss": 1.1693,
      "step": 46
    },
    {
      "epoch": 0.912621359223301,
      "grad_norm": 0.31823453307151794,
      "learning_rate": 0.00018258041344542566,
      "loss": 1.159,
      "step": 47
    },
    {
      "epoch": 0.9320388349514563,
      "grad_norm": 0.5679419636726379,
      "learning_rate": 0.0001816563663057211,
      "loss": 1.1931,
      "step": 48
    },
    {
      "epoch": 0.9514563106796117,
      "grad_norm": 0.3980861008167267,
      "learning_rate": 0.00018071090619916093,
      "loss": 1.1826,
      "step": 49
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 0.3688671588897705,
      "learning_rate": 0.00017974428105627208,
      "loss": 1.176,
      "step": 50
    },
    {
      "epoch": 0.9902912621359223,
      "grad_norm": 0.4137047827243805,
      "learning_rate": 0.00017875674435774547,
      "loss": 1.142,
      "step": 51
    }
  ],
  "logging_steps": 1,
  "max_steps": 204,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 51,
  "total_flos": 1.0626058999037952e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}