|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9977426636568847,
  "eval_steps": 50,
  "global_step": 996,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015048908954100828,
      "grad_norm": 18.47044613613669,
      "learning_rate": 5e-07,
      "loss": 1.7355,
      "step": 5
    },
    {
      "epoch": 0.030097817908201655,
      "grad_norm": 12.86199157181735,
      "learning_rate": 1e-06,
      "loss": 1.5959,
      "step": 10
    },
    {
      "epoch": 0.045146726862302484,
      "grad_norm": 7.557418173894333,
      "learning_rate": 9.999365521737421e-07,
      "loss": 1.2852,
      "step": 15
    },
    {
      "epoch": 0.06019563581640331,
      "grad_norm": 4.370427039727891,
      "learning_rate": 9.99746224797475e-07,
      "loss": 1.1474,
      "step": 20
    },
    {
      "epoch": 0.07524454477050414,
      "grad_norm": 3.7509821509698265,
      "learning_rate": 9.99429066174632e-07,
      "loss": 1.0775,
      "step": 25
    },
    {
      "epoch": 0.09029345372460497,
      "grad_norm": 3.4862474442533076,
      "learning_rate": 9.989851567973138e-07,
      "loss": 1.0181,
      "step": 30
    },
    {
      "epoch": 0.1053423626787058,
      "grad_norm": 3.6303816391691863,
      "learning_rate": 9.984146093258608e-07,
      "loss": 1.0061,
      "step": 35
    },
    {
      "epoch": 0.12039127163280662,
      "grad_norm": 3.379335415407791,
      "learning_rate": 9.9771756856026e-07,
      "loss": 0.9556,
      "step": 40
    },
    {
      "epoch": 0.13544018058690746,
      "grad_norm": 3.757214111120403,
      "learning_rate": 9.968942114033973e-07,
      "loss": 0.9456,
      "step": 45
    },
    {
      "epoch": 0.1504890895410083,
      "grad_norm": 3.403304414146967,
      "learning_rate": 9.959447468161596e-07,
      "loss": 0.9297,
      "step": 50
    },
    {
      "epoch": 0.1504890895410083,
      "eval_loss": 0.939862847328186,
      "eval_runtime": 164.2799,
      "eval_samples_per_second": 57.524,
      "eval_steps_per_second": 0.901,
      "step": 50
    },
    {
      "epoch": 0.1655379984951091,
      "grad_norm": 3.5204364554568266,
      "learning_rate": 9.948694157644042e-07,
      "loss": 0.9204,
      "step": 55
    },
    {
      "epoch": 0.18058690744920994,
      "grad_norm": 3.318644271218084,
      "learning_rate": 9.936684911578017e-07,
      "loss": 0.8933,
      "step": 60
    },
    {
      "epoch": 0.19563581640331076,
      "grad_norm": 3.7154382360141494,
      "learning_rate": 9.923422777805751e-07,
      "loss": 0.9219,
      "step": 65
    },
    {
      "epoch": 0.2106847253574116,
      "grad_norm": 3.6290063947255926,
      "learning_rate": 9.908911122141486e-07,
      "loss": 0.8946,
      "step": 70
    },
    {
      "epoch": 0.22573363431151242,
      "grad_norm": 3.6808742110099466,
      "learning_rate": 9.893153627517248e-07,
      "loss": 0.902,
      "step": 75
    },
    {
      "epoch": 0.24078254326561324,
      "grad_norm": 3.5301295406592255,
      "learning_rate": 9.876154293048163e-07,
      "loss": 0.8924,
      "step": 80
    },
    {
      "epoch": 0.2558314522197141,
      "grad_norm": 3.3462246475893123,
      "learning_rate": 9.857917433017508e-07,
      "loss": 0.8958,
      "step": 85
    },
    {
      "epoch": 0.2708803611738149,
      "grad_norm": 3.487142176575527,
      "learning_rate": 9.838447675781793e-07,
      "loss": 0.8881,
      "step": 90
    },
    {
      "epoch": 0.28592927012791575,
      "grad_norm": 3.3151772418217402,
      "learning_rate": 9.817749962596114e-07,
      "loss": 0.882,
      "step": 95
    },
    {
      "epoch": 0.3009781790820166,
      "grad_norm": 3.4456571870576362,
      "learning_rate": 9.795829546360113e-07,
      "loss": 0.8675,
      "step": 100
    },
    {
      "epoch": 0.3009781790820166,
      "eval_loss": 0.8835354447364807,
      "eval_runtime": 164.3323,
      "eval_samples_per_second": 57.505,
      "eval_steps_per_second": 0.901,
      "step": 100
    },
    {
      "epoch": 0.3160270880361174,
      "grad_norm": 3.68930117031382,
      "learning_rate": 9.77269199028483e-07,
      "loss": 0.8656,
      "step": 105
    },
    {
      "epoch": 0.3310759969902182,
      "grad_norm": 3.7770729679447816,
      "learning_rate": 9.748343166480822e-07,
      "loss": 0.8907,
      "step": 110
    },
    {
      "epoch": 0.34612490594431905,
      "grad_norm": 3.5934806142622104,
      "learning_rate": 9.722789254467854e-07,
      "loss": 0.8678,
      "step": 115
    },
    {
      "epoch": 0.3611738148984199,
      "grad_norm": 3.4907829794302447,
      "learning_rate": 9.696036739606606e-07,
      "loss": 0.8634,
      "step": 120
    },
    {
      "epoch": 0.3762227238525207,
      "grad_norm": 3.7214337808863798,
      "learning_rate": 9.668092411452735e-07,
      "loss": 0.8789,
      "step": 125
    },
    {
      "epoch": 0.3912716328066215,
      "grad_norm": 3.510784238627902,
      "learning_rate": 9.638963362033756e-07,
      "loss": 0.8592,
      "step": 130
    },
    {
      "epoch": 0.40632054176072235,
      "grad_norm": 3.3785797508677837,
      "learning_rate": 9.608656984049132e-07,
      "loss": 0.8438,
      "step": 135
    },
    {
      "epoch": 0.4213694507148232,
      "grad_norm": 3.3458509178809854,
      "learning_rate": 9.577180968994081e-07,
      "loss": 0.8544,
      "step": 140
    },
    {
      "epoch": 0.436418359668924,
      "grad_norm": 3.4920402952738656,
      "learning_rate": 9.544543305207546e-07,
      "loss": 0.8518,
      "step": 145
    },
    {
      "epoch": 0.45146726862302483,
      "grad_norm": 3.2912233930311627,
      "learning_rate": 9.510752275844809e-07,
      "loss": 0.8457,
      "step": 150
    },
    {
      "epoch": 0.45146726862302483,
      "eval_loss": 0.857115626335144,
      "eval_runtime": 164.1562,
      "eval_samples_per_second": 57.567,
      "eval_steps_per_second": 0.902,
      "step": 150
    },
    {
      "epoch": 0.46651617757712566,
      "grad_norm": 3.553467076040635,
      "learning_rate": 9.475816456775312e-07,
      "loss": 0.8595,
      "step": 155
    },
    {
      "epoch": 0.4815650865312265,
      "grad_norm": 3.3686846297763116,
      "learning_rate": 9.439744714406166e-07,
      "loss": 0.8392,
      "step": 160
    },
    {
      "epoch": 0.4966139954853273,
      "grad_norm": 3.45480960814811,
      "learning_rate": 9.402546203431947e-07,
      "loss": 0.8323,
      "step": 165
    },
    {
      "epoch": 0.5116629044394282,
      "grad_norm": 3.459267285158168,
      "learning_rate": 9.364230364511295e-07,
      "loss": 0.8355,
      "step": 170
    },
    {
      "epoch": 0.526711813393529,
      "grad_norm": 3.4121749327302378,
      "learning_rate": 9.324806921870975e-07,
      "loss": 0.8537,
      "step": 175
    },
    {
      "epoch": 0.5417607223476298,
      "grad_norm": 3.5893222373958524,
      "learning_rate": 9.284285880837946e-07,
      "loss": 0.828,
      "step": 180
    },
    {
      "epoch": 0.5568096313017307,
      "grad_norm": 3.7134185725142936,
      "learning_rate": 9.242677525300088e-07,
      "loss": 0.8346,
      "step": 185
    },
    {
      "epoch": 0.5718585402558315,
      "grad_norm": 3.550721543432503,
      "learning_rate": 9.199992415096259e-07,
      "loss": 0.832,
      "step": 190
    },
    {
      "epoch": 0.5869074492099323,
      "grad_norm": 3.7325735346120728,
      "learning_rate": 9.156241383336278e-07,
      "loss": 0.8156,
      "step": 195
    },
    {
      "epoch": 0.6019563581640331,
      "grad_norm": 3.6426263387830535,
      "learning_rate": 9.111435533651595e-07,
      "loss": 0.8411,
      "step": 200
    },
    {
      "epoch": 0.6019563581640331,
      "eval_loss": 0.8422114253044128,
      "eval_runtime": 164.0752,
      "eval_samples_per_second": 57.596,
      "eval_steps_per_second": 0.902,
      "step": 200
    },
    {
      "epoch": 0.617005267118134,
      "grad_norm": 3.978063196207494,
      "learning_rate": 9.065586237377274e-07,
      "loss": 0.8255,
      "step": 205
    },
    {
      "epoch": 0.6320541760722348,
      "grad_norm": 3.536724916036319,
      "learning_rate": 9.018705130666049e-07,
      "loss": 0.8355,
      "step": 210
    },
    {
      "epoch": 0.6471030850263356,
      "grad_norm": 3.7576883800972243,
      "learning_rate": 8.970804111535175e-07,
      "loss": 0.8244,
      "step": 215
    },
    {
      "epoch": 0.6621519939804364,
      "grad_norm": 3.413329114828355,
      "learning_rate": 8.921895336846812e-07,
      "loss": 0.8336,
      "step": 220
    },
    {
      "epoch": 0.6772009029345373,
      "grad_norm": 3.477984800656738,
      "learning_rate": 8.871991219222712e-07,
      "loss": 0.8152,
      "step": 225
    },
    {
      "epoch": 0.6922498118886381,
      "grad_norm": 3.6780798309273233,
      "learning_rate": 8.821104423894014e-07,
      "loss": 0.8235,
      "step": 230
    },
    {
      "epoch": 0.7072987208427389,
      "grad_norm": 3.4820310206949383,
      "learning_rate": 8.769247865486915e-07,
      "loss": 0.8226,
      "step": 235
    },
    {
      "epoch": 0.7223476297968398,
      "grad_norm": 3.3478327176800224,
      "learning_rate": 8.716434704745046e-07,
      "loss": 0.8341,
      "step": 240
    },
    {
      "epoch": 0.7373965387509406,
      "grad_norm": 3.540341298435034,
      "learning_rate": 8.662678345189396e-07,
      "loss": 0.825,
      "step": 245
    },
    {
      "epoch": 0.7524454477050414,
      "grad_norm": 3.658518292411783,
      "learning_rate": 8.607992429716608e-07,
      "loss": 0.8133,
      "step": 250
    },
    {
      "epoch": 0.7524454477050414,
      "eval_loss": 0.828486442565918,
      "eval_runtime": 164.1931,
      "eval_samples_per_second": 57.554,
      "eval_steps_per_second": 0.901,
      "step": 250
    },
    {
      "epoch": 0.7674943566591422,
      "grad_norm": 3.3210348798020797,
      "learning_rate": 8.55239083713654e-07,
      "loss": 0.8267,
      "step": 255
    },
    {
      "epoch": 0.782543265613243,
      "grad_norm": 3.2928105286016867,
      "learning_rate": 8.495887678649932e-07,
      "loss": 0.8336,
      "step": 260
    },
    {
      "epoch": 0.7975921745673439,
      "grad_norm": 3.7993668860558567,
      "learning_rate": 8.438497294267116e-07,
      "loss": 0.8371,
      "step": 265
    },
    {
      "epoch": 0.8126410835214447,
      "grad_norm": 3.7491461546658456,
      "learning_rate": 8.38023424916864e-07,
      "loss": 0.83,
      "step": 270
    },
    {
      "epoch": 0.8276899924755455,
      "grad_norm": 3.335971213466453,
      "learning_rate": 8.321113330008756e-07,
      "loss": 0.8231,
      "step": 275
    },
    {
      "epoch": 0.8427389014296464,
      "grad_norm": 3.50538153645489,
      "learning_rate": 8.261149541162691e-07,
      "loss": 0.8283,
      "step": 280
    },
    {
      "epoch": 0.8577878103837472,
      "grad_norm": 3.405552551438381,
      "learning_rate": 8.20035810091867e-07,
      "loss": 0.8333,
      "step": 285
    },
    {
      "epoch": 0.872836719337848,
      "grad_norm": 3.5518417970408036,
      "learning_rate": 8.13875443761565e-07,
      "loss": 0.8215,
      "step": 290
    },
    {
      "epoch": 0.8878856282919488,
      "grad_norm": 3.346948128373629,
      "learning_rate": 8.076354185727734e-07,
      "loss": 0.8127,
      "step": 295
    },
    {
      "epoch": 0.9029345372460497,
      "grad_norm": 3.472824409308143,
      "learning_rate": 8.013173181896282e-07,
      "loss": 0.8114,
      "step": 300
    },
    {
      "epoch": 0.9029345372460497,
      "eval_loss": 0.8188709616661072,
      "eval_runtime": 163.9398,
      "eval_samples_per_second": 57.643,
      "eval_steps_per_second": 0.903,
      "step": 300
    },
    {
      "epoch": 0.9179834462001505,
      "grad_norm": 3.535029645974378,
      "learning_rate": 7.94922746091071e-07,
      "loss": 0.8245,
      "step": 305
    },
    {
      "epoch": 0.9330323551542513,
      "grad_norm": 3.350512530278651,
      "learning_rate": 7.884533251638999e-07,
      "loss": 0.8031,
      "step": 310
    },
    {
      "epoch": 0.9480812641083521,
      "grad_norm": 3.1272224593692903,
      "learning_rate": 7.819106972908949e-07,
      "loss": 0.792,
      "step": 315
    },
    {
      "epoch": 0.963130173062453,
      "grad_norm": 3.3082625072971066,
      "learning_rate": 7.752965229341219e-07,
      "loss": 0.8073,
      "step": 320
    },
    {
      "epoch": 0.9781790820165538,
      "grad_norm": 3.6472654100125674,
      "learning_rate": 7.686124807135228e-07,
      "loss": 0.8091,
      "step": 325
    },
    {
      "epoch": 0.9932279909706546,
      "grad_norm": 3.4227050519519184,
      "learning_rate": 7.618602669808957e-07,
      "loss": 0.812,
      "step": 330
    },
    {
      "epoch": 1.0082768999247556,
      "grad_norm": 3.63643107405423,
      "learning_rate": 7.550415953893756e-07,
      "loss": 0.7794,
      "step": 335
    },
    {
      "epoch": 1.0233258088788564,
      "grad_norm": 3.5111108810888307,
      "learning_rate": 7.481581964585244e-07,
      "loss": 0.7534,
      "step": 340
    },
    {
      "epoch": 1.0383747178329572,
      "grad_norm": 3.738034258550528,
      "learning_rate": 7.412118171351395e-07,
      "loss": 0.7473,
      "step": 345
    },
    {
      "epoch": 1.053423626787058,
      "grad_norm": 3.511742644224616,
      "learning_rate": 7.342042203498951e-07,
      "loss": 0.7379,
      "step": 350
    },
    {
      "epoch": 1.053423626787058,
      "eval_loss": 0.8137179613113403,
      "eval_runtime": 163.9091,
      "eval_samples_per_second": 57.654,
      "eval_steps_per_second": 0.903,
      "step": 350
    },
    {
      "epoch": 1.0684725357411589,
      "grad_norm": 3.482412652291007,
      "learning_rate": 7.271371845699241e-07,
      "loss": 0.757,
      "step": 355
    },
    {
      "epoch": 1.0835214446952597,
      "grad_norm": 3.57693306476351,
      "learning_rate": 7.200125033474598e-07,
      "loss": 0.7405,
      "step": 360
    },
    {
      "epoch": 1.0985703536493605,
      "grad_norm": 3.551987518468492,
      "learning_rate": 7.128319848646477e-07,
      "loss": 0.7424,
      "step": 365
    },
    {
      "epoch": 1.1136192626034613,
      "grad_norm": 3.644980029199069,
      "learning_rate": 7.055974514746445e-07,
      "loss": 0.7557,
      "step": 370
    },
    {
      "epoch": 1.1286681715575622,
      "grad_norm": 3.661260327693092,
      "learning_rate": 6.983107392391202e-07,
      "loss": 0.7595,
      "step": 375
    },
    {
      "epoch": 1.143717080511663,
      "grad_norm": 3.6520213924338867,
      "learning_rate": 6.909736974622826e-07,
      "loss": 0.7411,
      "step": 380
    },
    {
      "epoch": 1.1587659894657638,
      "grad_norm": 3.341094037190979,
      "learning_rate": 6.835881882215395e-07,
      "loss": 0.7415,
      "step": 385
    },
    {
      "epoch": 1.1738148984198646,
      "grad_norm": 3.4808910667039803,
      "learning_rate": 6.761560858949192e-07,
      "loss": 0.7317,
      "step": 390
    },
    {
      "epoch": 1.1888638073739655,
      "grad_norm": 3.6539416517236076,
      "learning_rate": 6.686792766853705e-07,
      "loss": 0.7537,
      "step": 395
    },
    {
      "epoch": 1.2039127163280663,
      "grad_norm": 3.4648582888257002,
      "learning_rate": 6.611596581420599e-07,
      "loss": 0.7416,
      "step": 400
    },
    {
      "epoch": 1.2039127163280663,
      "eval_loss": 0.8082437515258789,
      "eval_runtime": 163.785,
      "eval_samples_per_second": 57.698,
      "eval_steps_per_second": 0.904,
      "step": 400
    },
    {
      "epoch": 1.2189616252821671,
      "grad_norm": 3.3756134153833433,
      "learning_rate": 6.53599138678791e-07,
      "loss": 0.7295,
      "step": 405
    },
    {
      "epoch": 1.234010534236268,
      "grad_norm": 3.5366994024401093,
      "learning_rate": 6.459996370896652e-07,
      "loss": 0.7419,
      "step": 410
    },
    {
      "epoch": 1.2490594431903688,
      "grad_norm": 3.6126358863182118,
      "learning_rate": 6.383630820621081e-07,
      "loss": 0.7348,
      "step": 415
    },
    {
      "epoch": 1.2641083521444696,
      "grad_norm": 3.329040929496611,
      "learning_rate": 6.306914116873862e-07,
      "loss": 0.7456,
      "step": 420
    },
    {
      "epoch": 1.2791572610985704,
      "grad_norm": 3.389626369376778,
      "learning_rate": 6.22986572968736e-07,
      "loss": 0.7399,
      "step": 425
    },
    {
      "epoch": 1.2942061700526712,
      "grad_norm": 3.640732106094957,
      "learning_rate": 6.152505213272307e-07,
      "loss": 0.7478,
      "step": 430
    },
    {
      "epoch": 1.309255079006772,
      "grad_norm": 3.678659612980298,
      "learning_rate": 6.074852201055121e-07,
      "loss": 0.7348,
      "step": 435
    },
    {
      "epoch": 1.324303987960873,
      "grad_norm": 3.456540220241952,
      "learning_rate": 5.996926400695113e-07,
      "loss": 0.7367,
      "step": 440
    },
    {
      "epoch": 1.3393528969149737,
      "grad_norm": 3.598565981484491,
      "learning_rate": 5.918747589082852e-07,
      "loss": 0.7452,
      "step": 445
    },
    {
      "epoch": 1.3544018058690745,
      "grad_norm": 3.6204936371770584,
      "learning_rate": 5.840335607320963e-07,
      "loss": 0.7623,
      "step": 450
    },
    {
      "epoch": 1.3544018058690745,
      "eval_loss": 0.8033472299575806,
      "eval_runtime": 163.9716,
      "eval_samples_per_second": 57.632,
      "eval_steps_per_second": 0.903,
      "step": 450
    },
    {
      "epoch": 1.3694507148231754,
      "grad_norm": 3.5876675484160927,
      "learning_rate": 5.761710355688627e-07,
      "loss": 0.7431,
      "step": 455
    },
    {
      "epoch": 1.3844996237772762,
      "grad_norm": 3.619019215155952,
      "learning_rate": 5.682891788591065e-07,
      "loss": 0.7464,
      "step": 460
    },
    {
      "epoch": 1.399548532731377,
      "grad_norm": 3.5444023840644237,
      "learning_rate": 5.603899909495283e-07,
      "loss": 0.7357,
      "step": 465
    },
    {
      "epoch": 1.4145974416854779,
      "grad_norm": 3.4447315407410266,
      "learning_rate": 5.52475476585336e-07,
      "loss": 0.735,
      "step": 470
    },
    {
      "epoch": 1.4296463506395787,
      "grad_norm": 3.8423796002300983,
      "learning_rate": 5.445476444014591e-07,
      "loss": 0.7318,
      "step": 475
    },
    {
      "epoch": 1.4446952595936795,
      "grad_norm": 3.594078636376061,
      "learning_rate": 5.366085064127734e-07,
      "loss": 0.7484,
      "step": 480
    },
    {
      "epoch": 1.4597441685477803,
      "grad_norm": 3.607424426488508,
      "learning_rate": 5.286600775034699e-07,
      "loss": 0.7435,
      "step": 485
    },
    {
      "epoch": 1.4747930775018812,
      "grad_norm": 3.7328475918869715,
      "learning_rate": 5.207043749156944e-07,
      "loss": 0.7378,
      "step": 490
    },
    {
      "epoch": 1.489841986455982,
      "grad_norm": 3.443608577698693,
      "learning_rate": 5.127434177375893e-07,
      "loss": 0.7418,
      "step": 495
    },
    {
      "epoch": 1.5048908954100828,
      "grad_norm": 3.5186913080533637,
      "learning_rate": 5.047792263908659e-07,
      "loss": 0.7158,
      "step": 500
    },
    {
      "epoch": 1.5048908954100828,
      "eval_loss": 0.7987317442893982,
      "eval_runtime": 163.9794,
      "eval_samples_per_second": 57.629,
      "eval_steps_per_second": 0.903,
      "step": 500
    },
    {
      "epoch": 1.5199398043641836,
      "grad_norm": 3.865883308748703,
      "learning_rate": 4.968138221180401e-07,
      "loss": 0.7431,
      "step": 505
    },
    {
      "epoch": 1.5349887133182845,
      "grad_norm": 3.710215468774283,
      "learning_rate": 4.888492264694565e-07,
      "loss": 0.7007,
      "step": 510
    },
    {
      "epoch": 1.5500376222723853,
      "grad_norm": 3.6132015362670145,
      "learning_rate": 4.808874607902397e-07,
      "loss": 0.7364,
      "step": 515
    },
    {
      "epoch": 1.565086531226486,
      "grad_norm": 3.7199884506401593,
      "learning_rate": 4.7293054570729126e-07,
      "loss": 0.7267,
      "step": 520
    },
    {
      "epoch": 1.580135440180587,
      "grad_norm": 3.9105644710376457,
      "learning_rate": 4.649805006164743e-07,
      "loss": 0.7298,
      "step": 525
    },
    {
      "epoch": 1.5951843491346878,
      "grad_norm": 3.5526438869005292,
      "learning_rate": 4.5703934317010727e-07,
      "loss": 0.7429,
      "step": 530
    },
    {
      "epoch": 1.6102332580887886,
      "grad_norm": 3.502575386275301,
      "learning_rate": 4.491090887649024e-07,
      "loss": 0.7337,
      "step": 535
    },
    {
      "epoch": 1.6252821670428894,
      "grad_norm": 3.794160344449478,
      "learning_rate": 4.4119175003047407e-07,
      "loss": 0.7299,
      "step": 540
    },
    {
      "epoch": 1.6403310759969902,
      "grad_norm": 3.2770795614619237,
      "learning_rate": 4.3328933631855195e-07,
      "loss": 0.7321,
      "step": 545
    },
    {
      "epoch": 1.655379984951091,
      "grad_norm": 3.6119103240995263,
      "learning_rate": 4.2540385319302524e-07,
      "loss": 0.7322,
      "step": 550
    },
    {
      "epoch": 1.655379984951091,
      "eval_loss": 0.7954748272895813,
      "eval_runtime": 164.5155,
      "eval_samples_per_second": 57.441,
      "eval_steps_per_second": 0.9,
      "step": 550
    },
    {
      "epoch": 1.670428893905192,
      "grad_norm": 3.5947026375482944,
      "learning_rate": 4.175373019209468e-07,
      "loss": 0.7457,
      "step": 555
    },
    {
      "epoch": 1.6854778028592927,
      "grad_norm": 3.761848976466262,
      "learning_rate": 4.0969167896463046e-07,
      "loss": 0.7386,
      "step": 560
    },
    {
      "epoch": 1.7005267118133935,
      "grad_norm": 3.439361811338972,
      "learning_rate": 4.018689754749648e-07,
      "loss": 0.724,
      "step": 565
    },
    {
      "epoch": 1.7155756207674944,
      "grad_norm": 3.4555454207422858,
      "learning_rate": 3.9407117678607756e-07,
      "loss": 0.7441,
      "step": 570
    },
    {
      "epoch": 1.7306245297215952,
      "grad_norm": 3.5617425595455177,
      "learning_rate": 3.8630026191147405e-07,
      "loss": 0.7249,
      "step": 575
    },
    {
      "epoch": 1.745673438675696,
      "grad_norm": 3.448724415418216,
      "learning_rate": 3.78558203041782e-07,
      "loss": 0.6972,
      "step": 580
    },
    {
      "epoch": 1.7607223476297968,
      "grad_norm": 3.6943962689825827,
      "learning_rate": 3.7084696504422525e-07,
      "loss": 0.7302,
      "step": 585
    },
    {
      "epoch": 1.7757712565838977,
      "grad_norm": 3.6561881604475945,
      "learning_rate": 3.6316850496395855e-07,
      "loss": 0.7424,
      "step": 590
    },
    {
      "epoch": 1.7908201655379985,
      "grad_norm": 3.625068299425613,
      "learning_rate": 3.555247715273867e-07,
      "loss": 0.7252,
      "step": 595
    },
    {
      "epoch": 1.8058690744920993,
      "grad_norm": 3.519787303173951,
      "learning_rate": 3.4791770464759347e-07,
      "loss": 0.7148,
      "step": 600
    },
    {
      "epoch": 1.8058690744920993,
      "eval_loss": 0.7919732332229614,
      "eval_runtime": 163.9475,
      "eval_samples_per_second": 57.64,
      "eval_steps_per_second": 0.903,
      "step": 600
    },
    {
      "epoch": 1.8209179834462002,
      "grad_norm": 3.8300888994459306,
      "learning_rate": 3.4034923493201007e-07,
      "loss": 0.7377,
      "step": 605
    },
    {
      "epoch": 1.835966892400301,
      "grad_norm": 3.400875766754663,
      "learning_rate": 3.3282128319244237e-07,
      "loss": 0.7385,
      "step": 610
    },
    {
      "epoch": 1.8510158013544018,
      "grad_norm": 3.6182045725169325,
      "learning_rate": 3.2533575995758694e-07,
      "loss": 0.7197,
      "step": 615
    },
    {
      "epoch": 1.8660647103085026,
      "grad_norm": 3.904119144772108,
      "learning_rate": 3.178945649881543e-07,
      "loss": 0.7018,
      "step": 620
    },
    {
      "epoch": 1.8811136192626035,
      "grad_norm": 3.8565228264688822,
      "learning_rate": 3.1049958679472645e-07,
      "loss": 0.7246,
      "step": 625
    },
    {
      "epoch": 1.8961625282167043,
      "grad_norm": 3.422783362651409,
      "learning_rate": 3.031527021584701e-07,
      "loss": 0.7336,
      "step": 630
    },
    {
      "epoch": 1.911211437170805,
      "grad_norm": 3.675919325226495,
      "learning_rate": 2.9585577565482484e-07,
      "loss": 0.7169,
      "step": 635
    },
    {
      "epoch": 1.926260346124906,
      "grad_norm": 3.726914018119805,
      "learning_rate": 2.886106591802908e-07,
      "loss": 0.7259,
      "step": 640
    },
    {
      "epoch": 1.9413092550790068,
      "grad_norm": 3.5805352508835333,
      "learning_rate": 2.814191914824332e-07,
      "loss": 0.7168,
      "step": 645
    },
    {
      "epoch": 1.9563581640331076,
      "grad_norm": 3.60649172674426,
      "learning_rate": 2.7428319769322415e-07,
      "loss": 0.71,
      "step": 650
    },
    {
      "epoch": 1.9563581640331076,
      "eval_loss": 0.7893925309181213,
      "eval_runtime": 164.1237,
      "eval_samples_per_second": 57.579,
      "eval_steps_per_second": 0.902,
      "step": 650
    },
    {
      "epoch": 1.9714070729872084,
      "grad_norm": 3.614812754262687,
      "learning_rate": 2.672044888658399e-07,
      "loss": 0.7041,
      "step": 655
    },
    {
      "epoch": 1.9864559819413092,
      "grad_norm": 3.7073968028896473,
      "learning_rate": 2.6018486151503213e-07,
      "loss": 0.7285,
      "step": 660
    },
    {
      "epoch": 2.00150489089541,
      "grad_norm": 3.647866320725029,
      "learning_rate": 2.532260971611867e-07,
      "loss": 0.7113,
      "step": 665
    },
    {
      "epoch": 2.016553799849511,
      "grad_norm": 3.699708580061935,
      "learning_rate": 2.4632996187819034e-07,
      "loss": 0.6883,
      "step": 670
    },
    {
      "epoch": 2.0316027088036117,
      "grad_norm": 3.936748692781072,
      "learning_rate": 2.394982058452165e-07,
      "loss": 0.6756,
      "step": 675
    },
    {
      "epoch": 2.0466516177577128,
      "grad_norm": 3.716518462078895,
      "learning_rate": 2.3273256290254402e-07,
      "loss": 0.6762,
      "step": 680
    },
    {
      "epoch": 2.0617005267118134,
      "grad_norm": 3.702081977847743,
      "learning_rate": 2.2603475011152517e-07,
      "loss": 0.6895,
      "step": 685
    },
    {
      "epoch": 2.0767494356659144,
      "grad_norm": 3.605178078795864,
      "learning_rate": 2.1940646731880885e-07,
      "loss": 0.6607,
      "step": 690
    },
    {
      "epoch": 2.091798344620015,
      "grad_norm": 3.7196889182496733,
      "learning_rate": 2.1284939672493506e-07,
      "loss": 0.6789,
      "step": 695
    },
    {
      "epoch": 2.106847253574116,
      "grad_norm": 3.7090327815341753,
      "learning_rate": 2.0636520245740708e-07,
      "loss": 0.6651,
      "step": 700
    },
    {
      "epoch": 2.106847253574116,
      "eval_loss": 0.7947185039520264,
      "eval_runtime": 164.1101,
      "eval_samples_per_second": 57.583,
      "eval_steps_per_second": 0.902,
      "step": 700
    },
    {
      "epoch": 2.1218961625282167,
      "grad_norm": 3.736696447237514,
      "learning_rate": 1.9995553014834986e-07,
      "loss": 0.6809,
      "step": 705
    },
    {
      "epoch": 2.1369450714823177,
      "grad_norm": 3.6458918889335865,
      "learning_rate": 1.9362200651686406e-07,
      "loss": 0.6864,
      "step": 710
    },
    {
      "epoch": 2.1519939804364183,
      "grad_norm": 4.002063868688003,
      "learning_rate": 1.873662389561771e-07,
      "loss": 0.6987,
      "step": 715
    },
    {
      "epoch": 2.1670428893905194,
      "grad_norm": 3.8698599662856714,
      "learning_rate": 1.8118981512570254e-07,
      "loss": 0.6801,
      "step": 720
    },
    {
      "epoch": 2.18209179834462,
      "grad_norm": 3.6230492953936677,
      "learning_rate": 1.750943025481046e-07,
      "loss": 0.6826,
      "step": 725
    },
    {
      "epoch": 2.197140707298721,
      "grad_norm": 3.7726470306057953,
      "learning_rate": 1.6908124821147517e-07,
      "loss": 0.6904,
      "step": 730
    },
    {
      "epoch": 2.2121896162528216,
      "grad_norm": 3.787910233434616,
      "learning_rate": 1.631521781767214e-07,
      "loss": 0.6798,
      "step": 735
    },
    {
      "epoch": 2.2272385252069227,
      "grad_norm": 3.9836210259763023,
      "learning_rate": 1.5730859719026535e-07,
      "loss": 0.6776,
      "step": 740
    },
    {
      "epoch": 2.2422874341610233,
      "grad_norm": 3.763373496609544,
      "learning_rate": 1.5155198830215144e-07,
      "loss": 0.6805,
      "step": 745
    },
    {
      "epoch": 2.2573363431151243,
      "grad_norm": 3.8572405199189466,
      "learning_rate": 1.4588381248966185e-07,
      "loss": 0.6684,
      "step": 750
    },
    {
      "epoch": 2.2573363431151243,
      "eval_loss": 0.7936417460441589,
      "eval_runtime": 163.9206,
      "eval_samples_per_second": 57.65,
      "eval_steps_per_second": 0.903,
      "step": 750
    },
    {
      "epoch": 2.272385252069225,
      "grad_norm": 3.8388172371691924,
      "learning_rate": 1.4030550828653354e-07,
      "loss": 0.7012,
      "step": 755
    },
    {
      "epoch": 2.287434161023326,
      "grad_norm": 3.8523292354815823,
      "learning_rate": 1.3481849141786977e-07,
      "loss": 0.6778,
      "step": 760
    },
    {
      "epoch": 2.3024830699774266,
      "grad_norm": 3.937046633996997,
      "learning_rate": 1.294241544408425e-07,
      "loss": 0.6775,
      "step": 765
    },
    {
      "epoch": 2.3175319789315276,
      "grad_norm": 3.953239589293332,
      "learning_rate": 1.241238663912727e-07,
      "loss": 0.6868,
      "step": 770
    },
    {
      "epoch": 2.3325808878856282,
      "grad_norm": 4.089219915268593,
      "learning_rate": 1.1891897243618183e-07,
      "loss": 0.6694,
      "step": 775
    },
    {
      "epoch": 2.3476297968397293,
      "grad_norm": 3.846961987007966,
      "learning_rate": 1.1381079353239915e-07,
      "loss": 0.6779,
      "step": 780
    },
    {
      "epoch": 2.36267870579383,
      "grad_norm": 3.7397027273861885,
      "learning_rate": 1.0880062609131485e-07,
      "loss": 0.6726,
      "step": 785
    },
    {
      "epoch": 2.377727614747931,
      "grad_norm": 3.921668306186131,
      "learning_rate": 1.0388974164986247e-07,
      "loss": 0.6796,
      "step": 790
    },
    {
      "epoch": 2.3927765237020315,
      "grad_norm": 3.8468336671320853,
      "learning_rate": 9.907938654781306e-08,
      "loss": 0.6835,
      "step": 795
    },
    {
      "epoch": 2.4078254326561326,
      "grad_norm": 3.7410679007377854,
      "learning_rate": 9.437078161146589e-08,
      "loss": 0.6826,
      "step": 800
    },
    {
      "epoch": 2.4078254326561326,
      "eval_loss": 0.7927989363670349,
      "eval_runtime": 167.9966,
      "eval_samples_per_second": 56.251,
      "eval_steps_per_second": 0.881,
      "step": 800
    },
    {
      "epoch": 2.422874341610233,
      "grad_norm": 4.079995858636958,
      "learning_rate": 8.976512184381246e-08,
      "loss": 0.6795,
      "step": 805
    },
    {
      "epoch": 2.4379232505643342,
      "grad_norm": 3.735129398089214,
      "learning_rate": 8.526357612125573e-08,
      "loss": 0.6898,
      "step": 810
    },
    {
      "epoch": 2.452972159518435,
      "grad_norm": 3.8390441333226817,
      "learning_rate": 8.086728689695921e-08,
      "loss": 0.7046,
      "step": 815
    },
    {
      "epoch": 2.468021068472536,
      "grad_norm": 3.9049522755901362,
      "learning_rate": 7.657736991090263e-08,
      "loss": 0.661,
      "step": 820
    },
    {
      "epoch": 2.4830699774266365,
      "grad_norm": 3.880401857930606,
      "learning_rate": 7.239491390671631e-08,
      "loss": 0.6742,
      "step": 825
    },
    {
      "epoch": 2.4981188863807375,
      "grad_norm": 3.912207039826943,
      "learning_rate": 6.832098035536759e-08,
      "loss": 0.6672,
      "step": 830
    },
    {
      "epoch": 2.513167795334838,
      "grad_norm": 3.922109823682805,
      "learning_rate": 6.435660318576935e-08,
      "loss": 0.6754,
      "step": 835
    },
    {
      "epoch": 2.528216704288939,
      "grad_norm": 3.8537847324852224,
      "learning_rate": 6.0502788522377e-08,
      "loss": 0.6824,
      "step": 840
    },
    {
      "epoch": 2.54326561324304,
      "grad_norm": 4.033607009118544,
      "learning_rate": 5.676051442984325e-08,
      "loss": 0.6716,
      "step": 845
    },
    {
      "epoch": 2.558314522197141,
      "grad_norm": 3.6539091883922423,
      "learning_rate": 5.313073066479379e-08,
      "loss": 0.6711,
      "step": 850
    },
    {
      "epoch": 2.558314522197141,
      "eval_loss": 0.7919635772705078,
      "eval_runtime": 164.1041,
      "eval_samples_per_second": 57.585,
      "eval_steps_per_second": 0.902,
      "step": 850
    },
    {
      "epoch": 2.5733634311512414,
      "grad_norm": 3.876589664826218,
      "learning_rate": 4.961435843478751e-08,
      "loss": 0.6806,
      "step": 855
    },
    {
      "epoch": 2.5884123401053425,
      "grad_norm": 3.655653854351189,
      "learning_rate": 4.621229016452155e-08,
      "loss": 0.6535,
      "step": 860
    },
    {
      "epoch": 2.603461249059443,
      "grad_norm": 4.079997158323887,
      "learning_rate": 4.2925389269341916e-08,
      "loss": 0.673,
      "step": 865
    },
    {
      "epoch": 2.618510158013544,
      "grad_norm": 3.8325795147951207,
      "learning_rate": 3.975448993611652e-08,
      "loss": 0.6709,
      "step": 870
    },
    {
      "epoch": 2.6335590669676447,
      "grad_norm": 3.778742993294259,
      "learning_rate": 3.67003969115251e-08,
      "loss": 0.6619,
      "step": 875
    },
    {
      "epoch": 2.648607975921746,
      "grad_norm": 4.10366570220581,
      "learning_rate": 3.376388529782215e-08,
      "loss": 0.6848,
      "step": 880
    },
    {
      "epoch": 2.6636568848758464,
      "grad_norm": 3.8699319815585236,
      "learning_rate": 3.094570035612226e-08,
      "loss": 0.6853,
      "step": 885
    },
    {
      "epoch": 2.6787057938299474,
      "grad_norm": 4.043194008918937,
      "learning_rate": 2.8246557317259723e-08,
      "loss": 0.6745,
      "step": 890
    },
    {
      "epoch": 2.693754702784048,
      "grad_norm": 3.99392087783422,
      "learning_rate": 2.5667141200268694e-08,
      "loss": 0.6577,
      "step": 895
    },
    {
      "epoch": 2.708803611738149,
      "grad_norm": 3.6631248500731814,
      "learning_rate": 2.3208106638531842e-08,
      "loss": 0.6757,
      "step": 900
    },
    {
      "epoch": 2.708803611738149,
      "eval_loss": 0.7914727926254272,
      "eval_runtime": 164.3819,
      "eval_samples_per_second": 57.488,
      "eval_steps_per_second": 0.9,
      "step": 900
    },
    {
      "epoch": 2.7238525206922497,
      "grad_norm": 3.9578522639967537,
      "learning_rate": 2.087007771363969e-08,
      "loss": 0.6736,
      "step": 905
    },
    {
      "epoch": 2.7389014296463507,
      "grad_norm": 3.739728228760654,
      "learning_rate": 1.8653647797004236e-08,
      "loss": 0.6729,
      "step": 910
    },
    {
      "epoch": 2.7539503386004514,
      "grad_norm": 3.8722809533250326,
      "learning_rate": 1.655937939926655e-08,
      "loss": 0.6841,
      "step": 915
    },
    {
      "epoch": 2.7689992475545524,
      "grad_norm": 3.8241600199550945,
      "learning_rate": 1.4587804027536454e-08,
      "loss": 0.6917,
      "step": 920
    },
    {
      "epoch": 2.784048156508653,
      "grad_norm": 3.7138354028978506,
      "learning_rate": 1.2739422050500436e-08,
      "loss": 0.6751,
      "step": 925
    },
    {
      "epoch": 2.799097065462754,
      "grad_norm": 3.790111519043104,
      "learning_rate": 1.101470257143261e-08,
      "loss": 0.6738,
      "step": 930
    },
    {
      "epoch": 2.8141459744168547,
      "grad_norm": 3.561873269533578,
      "learning_rate": 9.414083309140453e-09,
      "loss": 0.6952,
      "step": 935
    },
    {
      "epoch": 2.8291948833709557,
      "grad_norm": 3.6893556293107364,
      "learning_rate": 7.93797048687539e-09,
      "loss": 0.6761,
      "step": 940
    },
    {
      "epoch": 2.8442437923250563,
      "grad_norm": 3.8751112944911412,
      "learning_rate": 6.5867387292369295e-09,
      "loss": 0.6932,
      "step": 945
    },
    {
      "epoch": 2.8592927012791574,
      "grad_norm": 4.004750596943983,
      "learning_rate": 5.360730967096272e-09,
      "loss": 0.6791,
      "step": 950
    },
    {
      "epoch": 2.8592927012791574,
      "eval_loss": 0.7912743091583252,
      "eval_runtime": 164.3927,
      "eval_samples_per_second": 57.484,
      "eval_steps_per_second": 0.9,
      "step": 950
    },
    {
      "epoch": 2.874341610233258,
      "grad_norm": 3.8028887659361637,
      "learning_rate": 4.260258350563317e-09,
      "loss": 0.6716,
      "step": 955
    },
    {
      "epoch": 2.889390519187359,
      "grad_norm": 3.8634306165906107,
      "learning_rate": 3.285600170019609e-09,
      "loss": 0.6917,
      "step": 960
    },
    {
      "epoch": 2.9044394281414596,
      "grad_norm": 3.6280717262306363,
      "learning_rate": 2.437003785236702e-09,
      "loss": 0.665,
      "step": 965
    },
    {
      "epoch": 2.9194883370955607,
      "grad_norm": 4.013775538099351,
      "learning_rate": 1.714684562598545e-09,
      "loss": 0.6845,
      "step": 970
    },
    {
      "epoch": 2.9345372460496613,
      "grad_norm": 3.8102208297791,
      "learning_rate": 1.1188258204433144e-09,
      "loss": 0.6673,
      "step": 975
    },
    {
      "epoch": 2.9495861550037623,
      "grad_norm": 3.649482883492926,
      "learning_rate": 6.49578782538851e-10,
      "loss": 0.6772,
      "step": 980
    },
    {
      "epoch": 2.964635063957863,
      "grad_norm": 3.9407869896383483,
      "learning_rate": 3.070625397031401e-10,
      "loss": 0.6743,
      "step": 985
    },
    {
      "epoch": 2.979683972911964,
      "grad_norm": 3.857935656292572,
      "learning_rate": 9.136401958059759e-11,
      "loss": 0.6633,
      "step": 990
    },
    {
      "epoch": 2.9947328818660646,
      "grad_norm": 3.934124028791183,
      "learning_rate": 2.5379645800516215e-12,
      "loss": 0.672,
      "step": 995
    },
    {
      "epoch": 2.9977426636568847,
      "step": 996,
      "total_flos": 5872633472090112.0,
      "train_loss": 0.7687147306988996,
      "train_runtime": 16194.6861,
      "train_samples_per_second": 15.755,
      "train_steps_per_second": 0.062
    }
  ],
  "logging_steps": 5,
  "max_steps": 996,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5872633472090112.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|