{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9902912621359223, "eval_steps": 500, "global_step": 51, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019417475728155338, "grad_norm": 6.841878890991211, "learning_rate": 2e-05, "loss": 4.5864, "step": 1 }, { "epoch": 0.038834951456310676, "grad_norm": 6.640402793884277, "learning_rate": 4e-05, "loss": 4.6138, "step": 2 }, { "epoch": 0.05825242718446602, "grad_norm": 6.41209077835083, "learning_rate": 6e-05, "loss": 4.6064, "step": 3 }, { "epoch": 0.07766990291262135, "grad_norm": 6.082383632659912, "learning_rate": 8e-05, "loss": 4.5534, "step": 4 }, { "epoch": 0.0970873786407767, "grad_norm": 4.00201416015625, "learning_rate": 0.0001, "loss": 4.3245, "step": 5 }, { "epoch": 0.11650485436893204, "grad_norm": 3.1707470417022705, "learning_rate": 0.00012, "loss": 4.0971, "step": 6 }, { "epoch": 0.13592233009708737, "grad_norm": 3.160670518875122, "learning_rate": 0.00014, "loss": 3.789, "step": 7 }, { "epoch": 0.1553398058252427, "grad_norm": 2.994746208190918, "learning_rate": 0.00016, "loss": 3.3244, "step": 8 }, { "epoch": 0.17475728155339806, "grad_norm": 2.6664786338806152, "learning_rate": 0.00018, "loss": 2.8561, "step": 9 }, { "epoch": 0.1941747572815534, "grad_norm": 2.029910087585449, "learning_rate": 0.0002, "loss": 2.3926, "step": 10 }, { "epoch": 0.21359223300970873, "grad_norm": 1.3144137859344482, "learning_rate": 0.00019998688836656323, "loss": 2.0283, "step": 11 }, { "epoch": 0.23300970873786409, "grad_norm": 0.8263471722602844, "learning_rate": 0.00019994755690455152, "loss": 1.8683, "step": 12 }, { "epoch": 0.2524271844660194, "grad_norm": 0.7193414568901062, "learning_rate": 0.0001998820159279591, "loss": 1.732, "step": 13 }, { "epoch": 0.27184466019417475, "grad_norm": 0.8350910544395447, "learning_rate": 0.00019979028262377118, "loss": 1.6208, "step": 14 }, { "epoch": 0.2912621359223301, "grad_norm": 0.6700139045715332, "learning_rate": 0.00019967238104745696, "loss": 1.493, "step": 15 }, { "epoch": 0.3106796116504854, "grad_norm": 1.1213518381118774, "learning_rate": 0.0001995283421166614, "loss": 1.4857, "step": 16 }, { "epoch": 0.3300970873786408, "grad_norm": 0.7505896687507629, "learning_rate": 0.00019935820360309777, "loss": 1.4643, "step": 17 }, { "epoch": 0.34951456310679613, "grad_norm": 0.6984190940856934, "learning_rate": 0.00019916201012264254, "loss": 1.4227, "step": 18 }, { "epoch": 0.36893203883495146, "grad_norm": 0.46819525957107544, "learning_rate": 0.00019893981312363562, "loss": 1.3889, "step": 19 }, { "epoch": 0.3883495145631068, "grad_norm": 0.5214670300483704, "learning_rate": 0.00019869167087338907, "loss": 1.3764, "step": 20 }, { "epoch": 0.4077669902912621, "grad_norm": 0.43329402804374695, "learning_rate": 0.00019841764844290744, "loss": 1.351, "step": 21 }, { "epoch": 0.42718446601941745, "grad_norm": 0.6535053253173828, "learning_rate": 0.0001981178176898239, "loss": 1.3576, "step": 22 }, { "epoch": 0.44660194174757284, "grad_norm": 0.6363189816474915, "learning_rate": 0.00019779225723955707, "loss": 1.3159, "step": 23 }, { "epoch": 0.46601941747572817, "grad_norm": 0.42140620946884155, "learning_rate": 0.00019744105246469263, "loss": 1.3072, "step": 24 }, { "epoch": 0.4854368932038835, "grad_norm": 0.6757440567016602, "learning_rate": 0.00019706429546259593, "loss": 1.3261, "step": 25 }, { "epoch": 0.5048543689320388, "grad_norm": 0.4116632044315338, "learning_rate": 0.00019666208503126112, "loss": 
1.317, "step": 26 }, { "epoch": 0.5242718446601942, "grad_norm": 0.5359898209571838, "learning_rate": 0.00019623452664340306, "loss": 1.2932, "step": 27 }, { "epoch": 0.5436893203883495, "grad_norm": 0.4596949517726898, "learning_rate": 0.00019578173241879872, "loss": 1.2601, "step": 28 }, { "epoch": 0.5631067961165048, "grad_norm": 0.6857442855834961, "learning_rate": 0.0001953038210948861, "loss": 1.2734, "step": 29 }, { "epoch": 0.5825242718446602, "grad_norm": 0.4453957676887512, "learning_rate": 0.00019480091799562704, "loss": 1.2521, "step": 30 }, { "epoch": 0.6019417475728155, "grad_norm": 0.6118847131729126, "learning_rate": 0.00019427315499864344, "loss": 1.2252, "step": 31 }, { "epoch": 0.6213592233009708, "grad_norm": 0.4894339442253113, "learning_rate": 0.00019372067050063438, "loss": 1.2369, "step": 32 }, { "epoch": 0.6407766990291263, "grad_norm": 0.5901785492897034, "learning_rate": 0.00019314360938108425, "loss": 1.2507, "step": 33 }, { "epoch": 0.6601941747572816, "grad_norm": 0.4147897958755493, "learning_rate": 0.00019254212296427044, "loss": 1.2454, "step": 34 }, { "epoch": 0.6796116504854369, "grad_norm": 0.2836126685142517, "learning_rate": 0.00019191636897958122, "loss": 1.2515, "step": 35 }, { "epoch": 0.6990291262135923, "grad_norm": 0.4523944556713104, "learning_rate": 0.00019126651152015403, "loss": 1.2217, "step": 36 }, { "epoch": 0.7184466019417476, "grad_norm": 0.4187013804912567, "learning_rate": 0.0001905927209998447, "loss": 1.2382, "step": 37 }, { "epoch": 0.7378640776699029, "grad_norm": 0.516381561756134, "learning_rate": 0.00018989517410853955, "loss": 1.2214, "step": 38 }, { "epoch": 0.7572815533980582, "grad_norm": 0.5928078889846802, "learning_rate": 0.00018917405376582145, "loss": 1.2407, "step": 39 }, { "epoch": 0.7766990291262136, "grad_norm": 0.4062119722366333, "learning_rate": 0.00018842954907300236, "loss": 1.1958, "step": 40 }, { "epoch": 0.7961165048543689, "grad_norm": 0.5966278910636902, "learning_rate": 0.0001876618552635348, "loss": 1.2043, "step": 41 }, { "epoch": 0.8155339805825242, "grad_norm": 0.47196483612060547, "learning_rate": 0.00018687117365181512, "loss": 1.2321, "step": 42 }, { "epoch": 0.8349514563106796, "grad_norm": 0.6465126872062683, "learning_rate": 0.00018605771158039253, "loss": 1.1996, "step": 43 }, { "epoch": 0.8543689320388349, "grad_norm": 0.4234643578529358, "learning_rate": 0.00018522168236559695, "loss": 1.198, "step": 44 }, { "epoch": 0.8737864077669902, "grad_norm": 0.3910824656486511, "learning_rate": 0.00018436330524160047, "loss": 1.1895, "step": 45 }, { "epoch": 0.8932038834951457, "grad_norm": 0.5380635261535645, "learning_rate": 0.00018348280530292713, "loss": 1.1693, "step": 46 }, { "epoch": 0.912621359223301, "grad_norm": 0.31823453307151794, "learning_rate": 0.00018258041344542566, "loss": 1.159, "step": 47 }, { "epoch": 0.9320388349514563, "grad_norm": 0.5679419636726379, "learning_rate": 0.0001816563663057211, "loss": 1.1931, "step": 48 }, { "epoch": 0.9514563106796117, "grad_norm": 0.3980861008167267, "learning_rate": 0.00018071090619916093, "loss": 1.1826, "step": 49 }, { "epoch": 0.970873786407767, "grad_norm": 0.3688671588897705, "learning_rate": 0.00017974428105627208, "loss": 1.176, "step": 50 }, { "epoch": 0.9902912621359223, "grad_norm": 0.4137047827243805, "learning_rate": 0.00017875674435774547, "loss": 1.142, "step": 51 } ], "logging_steps": 1, "max_steps": 204, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 51, "total_flos": 1.0626058999037952e+16, "train_batch_size": 
2, "trial_name": null, "trial_params": null }