{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9924215084806929, "eval_steps": 500, "global_step": 49500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01002445968162316, "grad_norm": 0.1908789426088333, "learning_rate": 2.969926620955131e-05, "loss": 0.0305, "step": 500 }, { "epoch": 0.02004891936324632, "grad_norm": 0.2333366870880127, "learning_rate": 2.939853241910261e-05, "loss": 0.027, "step": 1000 }, { "epoch": 0.03007337904486948, "grad_norm": 0.18207737803459167, "learning_rate": 2.9097798628653917e-05, "loss": 0.026, "step": 1500 }, { "epoch": 0.04009783872649264, "grad_norm": 0.13153837621212006, "learning_rate": 2.879706483820522e-05, "loss": 0.0253, "step": 2000 }, { "epoch": 0.0501222984081158, "grad_norm": 0.20580655336380005, "learning_rate": 2.849633104775653e-05, "loss": 0.0249, "step": 2500 }, { "epoch": 0.06014675808973896, "grad_norm": 0.16466622054576874, "learning_rate": 2.8196198724888727e-05, "loss": 0.0239, "step": 3000 }, { "epoch": 0.07017121777136212, "grad_norm": 0.1590748131275177, "learning_rate": 2.7895464934440035e-05, "loss": 0.0239, "step": 3500 }, { "epoch": 0.08019567745298528, "grad_norm": 0.14150911569595337, "learning_rate": 2.759473114399134e-05, "loss": 0.0236, "step": 4000 }, { "epoch": 0.09022013713460844, "grad_norm": 0.09914161264896393, "learning_rate": 2.7293997353542647e-05, "loss": 0.0236, "step": 4500 }, { "epoch": 0.1002445968162316, "grad_norm": 0.16175945103168488, "learning_rate": 2.6993263563093947e-05, "loss": 0.0232, "step": 5000 }, { "epoch": 0.11026905649785476, "grad_norm": 0.14557033777236938, "learning_rate": 2.6692529772645255e-05, "loss": 0.0229, "step": 5500 }, { "epoch": 0.12029351617947792, "grad_norm": 0.14236459136009216, "learning_rate": 2.639179598219656e-05, "loss": 0.0228, "step": 6000 }, { "epoch": 0.13031797586110108, "grad_norm": 0.1859973669052124, "learning_rate": 2.6091062191747867e-05, "loss": 0.0226, "step": 6500 }, { "epoch": 0.14034243554272424, "grad_norm": 0.21533679962158203, "learning_rate": 2.5790929868880065e-05, "loss": 0.0226, "step": 7000 }, { "epoch": 0.1503668952243474, "grad_norm": 0.13067375123500824, "learning_rate": 2.5490196078431373e-05, "loss": 0.0222, "step": 7500 }, { "epoch": 0.16039135490597056, "grad_norm": 0.15966835618019104, "learning_rate": 2.518946228798268e-05, "loss": 0.0223, "step": 8000 }, { "epoch": 0.17041581458759372, "grad_norm": 0.16358782351016998, "learning_rate": 2.4888728497533985e-05, "loss": 0.0218, "step": 8500 }, { "epoch": 0.18044027426921688, "grad_norm": 0.2502037286758423, "learning_rate": 2.458799470708529e-05, "loss": 0.0219, "step": 9000 }, { "epoch": 0.19046473395084004, "grad_norm": 0.13426147401332855, "learning_rate": 2.4287260916636593e-05, "loss": 0.0221, "step": 9500 }, { "epoch": 0.2004891936324632, "grad_norm": 0.17509840428829193, "learning_rate": 2.39865271261879e-05, "loss": 0.0219, "step": 10000 }, { "epoch": 0.21051365331408636, "grad_norm": 0.12009692937135696, "learning_rate": 2.3685793335739205e-05, "loss": 0.0216, "step": 10500 }, { "epoch": 0.22053811299570952, "grad_norm": 0.1322207748889923, "learning_rate": 2.338505954529051e-05, "loss": 0.0211, "step": 11000 }, { "epoch": 0.23056257267733268, "grad_norm": 0.14883308112621307, "learning_rate": 2.308492722242271e-05, "loss": 0.0214, "step": 11500 }, { "epoch": 0.24058703235895584, "grad_norm": 0.13109040260314941, "learning_rate": 2.278419343197402e-05, "loss": 0.0211, "step": 12000 }, { "epoch": 0.25061149204057903, "grad_norm": 0.1747232973575592, "learning_rate": 2.2483459641525323e-05, "loss": 0.0214, "step": 12500 }, { "epoch": 0.26063595172220216, "grad_norm": 0.14633044600486755, "learning_rate": 2.2182725851076627e-05, "loss": 0.0212, "step": 13000 }, { "epoch": 0.27066041140382535, "grad_norm": 0.16928279399871826, "learning_rate": 2.188199206062793e-05, "loss": 0.0209, "step": 13500 }, { "epoch": 0.2806848710854485, "grad_norm": 0.1422567069530487, "learning_rate": 2.158125827017924e-05, "loss": 0.0206, "step": 14000 }, { "epoch": 0.29070933076707167, "grad_norm": 0.1617174744606018, "learning_rate": 2.1280524479730543e-05, "loss": 0.0208, "step": 14500 }, { "epoch": 0.3007337904486948, "grad_norm": 0.12710513174533844, "learning_rate": 2.0979790689281848e-05, "loss": 0.0204, "step": 15000 }, { "epoch": 0.310758250130318, "grad_norm": 0.12561199069023132, "learning_rate": 2.0679056898833152e-05, "loss": 0.0207, "step": 15500 }, { "epoch": 0.3207827098119411, "grad_norm": 0.12565076351165771, "learning_rate": 2.037832310838446e-05, "loss": 0.0204, "step": 16000 }, { "epoch": 0.3308071694935643, "grad_norm": 0.16111774742603302, "learning_rate": 2.0077589317935764e-05, "loss": 0.0204, "step": 16500 }, { "epoch": 0.34083162917518744, "grad_norm": 0.08807434886693954, "learning_rate": 1.9776855527487068e-05, "loss": 0.0204, "step": 17000 }, { "epoch": 0.35085608885681063, "grad_norm": 0.12991702556610107, "learning_rate": 1.9476121737038372e-05, "loss": 0.0203, "step": 17500 }, { "epoch": 0.36088054853843377, "grad_norm": 0.17382492125034332, "learning_rate": 1.917538794658968e-05, "loss": 0.0199, "step": 18000 }, { "epoch": 0.37090500822005695, "grad_norm": 0.16158685088157654, "learning_rate": 1.8875255623721885e-05, "loss": 0.02, "step": 18500 }, { "epoch": 0.3809294679016801, "grad_norm": 0.1214708685874939, "learning_rate": 1.8575123300854082e-05, "loss": 0.0199, "step": 19000 }, { "epoch": 0.3909539275833033, "grad_norm": 0.11400719732046127, "learning_rate": 1.827438951040539e-05, "loss": 0.0197, "step": 19500 }, { "epoch": 0.4009783872649264, "grad_norm": 0.09430436044931412, "learning_rate": 1.7973655719956694e-05, "loss": 0.0197, "step": 20000 }, { "epoch": 0.4110028469465496, "grad_norm": 0.13852345943450928, "learning_rate": 1.7672921929508002e-05, "loss": 0.0199, "step": 20500 }, { "epoch": 0.4210273066281727, "grad_norm": 0.13223719596862793, "learning_rate": 1.7372188139059303e-05, "loss": 0.0198, "step": 21000 }, { "epoch": 0.4310517663097959, "grad_norm": 0.08761810511350632, "learning_rate": 1.707145434861061e-05, "loss": 0.0196, "step": 21500 }, { "epoch": 0.44107622599141905, "grad_norm": 0.13589079678058624, "learning_rate": 1.6770720558161915e-05, "loss": 0.02, "step": 22000 }, { "epoch": 0.45110068567304223, "grad_norm": 0.08927265554666519, "learning_rate": 1.6469986767713223e-05, "loss": 0.0196, "step": 22500 }, { "epoch": 0.46112514535466537, "grad_norm": 0.06504929065704346, "learning_rate": 1.6169252977264524e-05, "loss": 0.0196, "step": 23000 }, { "epoch": 0.47114960503628855, "grad_norm": 0.17200474441051483, "learning_rate": 1.586851918681583e-05, "loss": 0.0192, "step": 23500 }, { "epoch": 0.4811740647179117, "grad_norm": 0.15591026842594147, "learning_rate": 1.5567785396367136e-05, "loss": 0.0195, "step": 24000 }, { "epoch": 0.4911985243995349, "grad_norm": 0.07878988236188889, "learning_rate": 1.5267051605918443e-05, "loss": 0.0197, "step": 24500 }, { "epoch": 0.5012229840811581, "grad_norm": 0.14917835593223572, "learning_rate": 1.4967520750631541e-05, "loss": 0.0194, "step": 25000 }, { "epoch": 0.5112474437627812, "grad_norm": 0.13516920804977417, "learning_rate": 1.4666786960182846e-05, "loss": 0.0191, "step": 25500 }, { "epoch": 0.5212719034444043, "grad_norm": 0.13264451920986176, "learning_rate": 1.4366053169734152e-05, "loss": 0.019, "step": 26000 }, { "epoch": 0.5312963631260275, "grad_norm": 0.11801792681217194, "learning_rate": 1.4065319379285456e-05, "loss": 0.019, "step": 26500 }, { "epoch": 0.5413208228076507, "grad_norm": 0.10833830386400223, "learning_rate": 1.3764585588836762e-05, "loss": 0.0192, "step": 27000 }, { "epoch": 0.5513452824892738, "grad_norm": 0.1249992698431015, "learning_rate": 1.3463851798388068e-05, "loss": 0.0193, "step": 27500 }, { "epoch": 0.561369742170897, "grad_norm": 0.1460021287202835, "learning_rate": 1.3163118007939374e-05, "loss": 0.0189, "step": 28000 }, { "epoch": 0.5713942018525201, "grad_norm": 0.10502651333808899, "learning_rate": 1.2862384217490678e-05, "loss": 0.019, "step": 28500 }, { "epoch": 0.5814186615341433, "grad_norm": 0.1526552140712738, "learning_rate": 1.2561650427041984e-05, "loss": 0.0189, "step": 29000 }, { "epoch": 0.5914431212157665, "grad_norm": 0.10992331802845001, "learning_rate": 1.2261518104174185e-05, "loss": 0.0189, "step": 29500 }, { "epoch": 0.6014675808973896, "grad_norm": 0.13150319457054138, "learning_rate": 1.1961385781306388e-05, "loss": 0.0189, "step": 30000 }, { "epoch": 0.6114920405790127, "grad_norm": 0.13109181821346283, "learning_rate": 1.1661253458438591e-05, "loss": 0.0187, "step": 30500 }, { "epoch": 0.621516500260636, "grad_norm": 0.1083877831697464, "learning_rate": 1.1360519667989895e-05, "loss": 0.0187, "step": 31000 }, { "epoch": 0.6315409599422591, "grad_norm": 0.1419740468263626, "learning_rate": 1.1059785877541201e-05, "loss": 0.0186, "step": 31500 }, { "epoch": 0.6415654196238822, "grad_norm": 0.13743899762630463, "learning_rate": 1.0759052087092506e-05, "loss": 0.0187, "step": 32000 }, { "epoch": 0.6515898793055054, "grad_norm": 0.11756709218025208, "learning_rate": 1.0458318296643812e-05, "loss": 0.0187, "step": 32500 }, { "epoch": 0.6616143389871286, "grad_norm": 0.28276416659355164, "learning_rate": 1.0157584506195116e-05, "loss": 0.0186, "step": 33000 }, { "epoch": 0.6716387986687518, "grad_norm": 0.08689709007740021, "learning_rate": 9.856850715746422e-06, "loss": 0.0185, "step": 33500 }, { "epoch": 0.6816632583503749, "grad_norm": 0.16307085752487183, "learning_rate": 9.556116925297726e-06, "loss": 0.0188, "step": 34000 }, { "epoch": 0.691687718031998, "grad_norm": 0.09724652022123337, "learning_rate": 9.25598460242993e-06, "loss": 0.0186, "step": 34500 }, { "epoch": 0.7017121777136213, "grad_norm": 0.1357831209897995, "learning_rate": 8.955250811981233e-06, "loss": 0.0185, "step": 35000 }, { "epoch": 0.7117366373952444, "grad_norm": 0.07240907102823257, "learning_rate": 8.65451702153254e-06, "loss": 0.0182, "step": 35500 }, { "epoch": 0.7217610970768675, "grad_norm": 0.12668687105178833, "learning_rate": 8.353783231083844e-06, "loss": 0.0184, "step": 36000 }, { "epoch": 0.7317855567584907, "grad_norm": 0.10139577835798264, "learning_rate": 8.05304944063515e-06, "loss": 0.0183, "step": 36500 }, { "epoch": 0.7418100164401139, "grad_norm": 0.13369520008563995, "learning_rate": 7.752315650186454e-06, "loss": 0.0184, "step": 37000 }, { "epoch": 0.751834476121737, "grad_norm": 0.10762561112642288, "learning_rate": 7.451581859737761e-06, "loss": 0.0183, "step": 37500 }, { "epoch": 0.7618589358033602, "grad_norm": 0.11992637068033218, "learning_rate": 7.150848069289066e-06, "loss": 0.0182, "step": 38000 }, { "epoch": 0.7718833954849834, "grad_norm": 0.10149220377206802, "learning_rate": 6.850114278840371e-06, "loss": 0.0184, "step": 38500 }, { "epoch": 0.7819078551666065, "grad_norm": 0.11299099028110504, "learning_rate": 6.549981955972573e-06, "loss": 0.0183, "step": 39000 }, { "epoch": 0.7919323148482297, "grad_norm": 0.1773054599761963, "learning_rate": 6.249248165523878e-06, "loss": 0.0184, "step": 39500 }, { "epoch": 0.8019567745298528, "grad_norm": 0.09563998878002167, "learning_rate": 5.949115842656081e-06, "loss": 0.0184, "step": 40000 }, { "epoch": 0.811981234211476, "grad_norm": 0.12298491597175598, "learning_rate": 5.648382052207386e-06, "loss": 0.0182, "step": 40500 }, { "epoch": 0.8220056938930992, "grad_norm": 0.1283864825963974, "learning_rate": 5.3476482617586915e-06, "loss": 0.0182, "step": 41000 }, { "epoch": 0.8320301535747223, "grad_norm": 0.10248496383428574, "learning_rate": 5.046914471309997e-06, "loss": 0.0182, "step": 41500 }, { "epoch": 0.8420546132563455, "grad_norm": 0.09150300920009613, "learning_rate": 4.746180680861302e-06, "loss": 0.0183, "step": 42000 }, { "epoch": 0.8520790729379687, "grad_norm": 0.1290718913078308, "learning_rate": 4.445446890412607e-06, "loss": 0.0183, "step": 42500 }, { "epoch": 0.8621035326195918, "grad_norm": 0.12163352966308594, "learning_rate": 4.144713099963912e-06, "loss": 0.0179, "step": 43000 }, { "epoch": 0.872127992301215, "grad_norm": 0.15341030061244965, "learning_rate": 3.843979309515217e-06, "loss": 0.0178, "step": 43500 }, { "epoch": 0.8821524519828381, "grad_norm": 0.1429668664932251, "learning_rate": 3.5438469866474197e-06, "loss": 0.0178, "step": 44000 }, { "epoch": 0.8921769116644613, "grad_norm": 0.10146286338567734, "learning_rate": 3.243113196198725e-06, "loss": 0.018, "step": 44500 }, { "epoch": 0.9022013713460845, "grad_norm": 0.1440785676240921, "learning_rate": 2.9429808733309277e-06, "loss": 0.0181, "step": 45000 }, { "epoch": 0.9122258310277076, "grad_norm": 0.10165946930646896, "learning_rate": 2.642247082882233e-06, "loss": 0.0178, "step": 45500 }, { "epoch": 0.9222502907093307, "grad_norm": 0.14765135943889618, "learning_rate": 2.341513292433538e-06, "loss": 0.018, "step": 46000 }, { "epoch": 0.932274750390954, "grad_norm": 0.07201279699802399, "learning_rate": 2.040779501984843e-06, "loss": 0.018, "step": 46500 }, { "epoch": 0.9422992100725771, "grad_norm": 0.13335958123207092, "learning_rate": 1.7400457115361483e-06, "loss": 0.0178, "step": 47000 }, { "epoch": 0.9523236697542002, "grad_norm": 0.10147497057914734, "learning_rate": 1.4393119210874535e-06, "loss": 0.0178, "step": 47500 }, { "epoch": 0.9623481294358234, "grad_norm": 0.09820819646120071, "learning_rate": 1.1385781306387586e-06, "loss": 0.0179, "step": 48000 }, { "epoch": 0.9723725891174466, "grad_norm": 0.1263681799173355, "learning_rate": 8.378443401900638e-07, "loss": 0.018, "step": 48500 }, { "epoch": 0.9823970487990697, "grad_norm": 0.08029880374670029, "learning_rate": 5.377120173222663e-07, "loss": 0.0181, "step": 49000 }, { "epoch": 0.9924215084806929, "grad_norm": 0.12001931667327881, "learning_rate": 2.369782268735715e-07, "loss": 0.018, "step": 49500 } ], "logging_steps": 500, "max_steps": 49878, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }