{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9977426636568847, "eval_steps": 50, "global_step": 996, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015048908954100828, "grad_norm": 18.47044613613669, "learning_rate": 5e-07, "loss": 1.7355, "step": 5 }, { "epoch": 0.030097817908201655, "grad_norm": 12.86199157181735, "learning_rate": 1e-06, "loss": 1.5959, "step": 10 }, { "epoch": 0.045146726862302484, "grad_norm": 7.557418173894333, "learning_rate": 9.999365521737421e-07, "loss": 1.2852, "step": 15 }, { "epoch": 0.06019563581640331, "grad_norm": 4.370427039727891, "learning_rate": 9.99746224797475e-07, "loss": 1.1474, "step": 20 }, { "epoch": 0.07524454477050414, "grad_norm": 3.7509821509698265, "learning_rate": 9.99429066174632e-07, "loss": 1.0775, "step": 25 }, { "epoch": 0.09029345372460497, "grad_norm": 3.4862474442533076, "learning_rate": 9.989851567973138e-07, "loss": 1.0181, "step": 30 }, { "epoch": 0.1053423626787058, "grad_norm": 3.6303816391691863, "learning_rate": 9.984146093258608e-07, "loss": 1.0061, "step": 35 }, { "epoch": 0.12039127163280662, "grad_norm": 3.379335415407791, "learning_rate": 9.9771756856026e-07, "loss": 0.9556, "step": 40 }, { "epoch": 0.13544018058690746, "grad_norm": 3.757214111120403, "learning_rate": 9.968942114033973e-07, "loss": 0.9456, "step": 45 }, { "epoch": 0.1504890895410083, "grad_norm": 3.403304414146967, "learning_rate": 9.959447468161596e-07, "loss": 0.9297, "step": 50 }, { "epoch": 0.1504890895410083, "eval_loss": 0.939862847328186, "eval_runtime": 164.2799, "eval_samples_per_second": 57.524, "eval_steps_per_second": 0.901, "step": 50 }, { "epoch": 0.1655379984951091, "grad_norm": 3.5204364554568266, "learning_rate": 9.948694157644042e-07, "loss": 0.9204, "step": 55 }, { "epoch": 0.18058690744920994, "grad_norm": 3.318644271218084, "learning_rate": 9.936684911578017e-07, "loss": 0.8933, "step": 60 }, { "epoch": 0.19563581640331076, "grad_norm": 3.7154382360141494, "learning_rate": 9.923422777805751e-07, "loss": 0.9219, "step": 65 }, { "epoch": 0.2106847253574116, "grad_norm": 3.6290063947255926, "learning_rate": 9.908911122141486e-07, "loss": 0.8946, "step": 70 }, { "epoch": 0.22573363431151242, "grad_norm": 3.6808742110099466, "learning_rate": 9.893153627517248e-07, "loss": 0.902, "step": 75 }, { "epoch": 0.24078254326561324, "grad_norm": 3.5301295406592255, "learning_rate": 9.876154293048163e-07, "loss": 0.8924, "step": 80 }, { "epoch": 0.2558314522197141, "grad_norm": 3.3462246475893123, "learning_rate": 9.857917433017508e-07, "loss": 0.8958, "step": 85 }, { "epoch": 0.2708803611738149, "grad_norm": 3.487142176575527, "learning_rate": 9.838447675781793e-07, "loss": 0.8881, "step": 90 }, { "epoch": 0.28592927012791575, "grad_norm": 3.3151772418217402, "learning_rate": 9.817749962596114e-07, "loss": 0.882, "step": 95 }, { "epoch": 0.3009781790820166, "grad_norm": 3.4456571870576362, "learning_rate": 9.795829546360113e-07, "loss": 0.8675, "step": 100 }, { "epoch": 0.3009781790820166, "eval_loss": 0.8835354447364807, "eval_runtime": 164.3323, "eval_samples_per_second": 57.505, "eval_steps_per_second": 0.901, "step": 100 }, { "epoch": 0.3160270880361174, "grad_norm": 3.68930117031382, "learning_rate": 9.77269199028483e-07, "loss": 0.8656, "step": 105 }, { "epoch": 0.3310759969902182, "grad_norm": 3.7770729679447816, "learning_rate": 9.748343166480822e-07, "loss": 0.8907, "step": 110 }, { "epoch": 0.34612490594431905, "grad_norm": 3.5934806142622104, "learning_rate": 9.722789254467854e-07, "loss": 0.8678, "step": 115 }, { "epoch": 0.3611738148984199, "grad_norm": 3.4907829794302447, "learning_rate": 9.696036739606606e-07, "loss": 0.8634, "step": 120 }, { "epoch": 0.3762227238525207, "grad_norm": 3.7214337808863798, "learning_rate": 9.668092411452735e-07, "loss": 0.8789, "step": 125 }, { "epoch": 0.3912716328066215, "grad_norm": 3.510784238627902, "learning_rate": 9.638963362033756e-07, "loss": 0.8592, "step": 130 }, { "epoch": 0.40632054176072235, "grad_norm": 3.3785797508677837, "learning_rate": 9.608656984049132e-07, "loss": 0.8438, "step": 135 }, { "epoch": 0.4213694507148232, "grad_norm": 3.3458509178809854, "learning_rate": 9.577180968994081e-07, "loss": 0.8544, "step": 140 }, { "epoch": 0.436418359668924, "grad_norm": 3.4920402952738656, "learning_rate": 9.544543305207546e-07, "loss": 0.8518, "step": 145 }, { "epoch": 0.45146726862302483, "grad_norm": 3.2912233930311627, "learning_rate": 9.510752275844809e-07, "loss": 0.8457, "step": 150 }, { "epoch": 0.45146726862302483, "eval_loss": 0.857115626335144, "eval_runtime": 164.1562, "eval_samples_per_second": 57.567, "eval_steps_per_second": 0.902, "step": 150 }, { "epoch": 0.46651617757712566, "grad_norm": 3.553467076040635, "learning_rate": 9.475816456775312e-07, "loss": 0.8595, "step": 155 }, { "epoch": 0.4815650865312265, "grad_norm": 3.3686846297763116, "learning_rate": 9.439744714406166e-07, "loss": 0.8392, "step": 160 }, { "epoch": 0.4966139954853273, "grad_norm": 3.45480960814811, "learning_rate": 9.402546203431947e-07, "loss": 0.8323, "step": 165 }, { "epoch": 0.5116629044394282, "grad_norm": 3.459267285158168, "learning_rate": 9.364230364511295e-07, "loss": 0.8355, "step": 170 }, { "epoch": 0.526711813393529, "grad_norm": 3.4121749327302378, "learning_rate": 9.324806921870975e-07, "loss": 0.8537, "step": 175 }, { "epoch": 0.5417607223476298, "grad_norm": 3.5893222373958524, "learning_rate": 9.284285880837946e-07, "loss": 0.828, "step": 180 }, { "epoch": 0.5568096313017307, "grad_norm": 3.7134185725142936, "learning_rate": 9.242677525300088e-07, "loss": 0.8346, "step": 185 }, { "epoch": 0.5718585402558315, "grad_norm": 3.550721543432503, "learning_rate": 9.199992415096259e-07, "loss": 0.832, "step": 190 }, { "epoch": 0.5869074492099323, "grad_norm": 3.7325735346120728, "learning_rate": 9.156241383336278e-07, "loss": 0.8156, "step": 195 }, { "epoch": 0.6019563581640331, "grad_norm": 3.6426263387830535, "learning_rate": 9.111435533651595e-07, "loss": 0.8411, "step": 200 }, { "epoch": 0.6019563581640331, "eval_loss": 0.8422114253044128, "eval_runtime": 164.0752, "eval_samples_per_second": 57.596, "eval_steps_per_second": 0.902, "step": 200 }, { "epoch": 0.617005267118134, "grad_norm": 3.978063196207494, "learning_rate": 9.065586237377274e-07, "loss": 0.8255, "step": 205 }, { "epoch": 0.6320541760722348, "grad_norm": 3.536724916036319, "learning_rate": 9.018705130666049e-07, "loss": 0.8355, "step": 210 }, { "epoch": 0.6471030850263356, "grad_norm": 3.7576883800972243, "learning_rate": 8.970804111535175e-07, "loss": 0.8244, "step": 215 }, { "epoch": 0.6621519939804364, "grad_norm": 3.413329114828355, "learning_rate": 8.921895336846812e-07, "loss": 0.8336, "step": 220 }, { "epoch": 0.6772009029345373, "grad_norm": 3.477984800656738, "learning_rate": 8.871991219222712e-07, "loss": 0.8152, "step": 225 }, { "epoch": 0.6922498118886381, "grad_norm": 3.6780798309273233, "learning_rate": 8.821104423894014e-07, "loss": 0.8235, "step": 230 }, { "epoch": 0.7072987208427389, "grad_norm": 3.4820310206949383, "learning_rate": 8.769247865486915e-07, "loss": 0.8226, "step": 235 }, { "epoch": 0.7223476297968398, "grad_norm": 3.3478327176800224, "learning_rate": 8.716434704745046e-07, "loss": 0.8341, "step": 240 }, { "epoch": 0.7373965387509406, "grad_norm": 3.540341298435034, "learning_rate": 8.662678345189396e-07, "loss": 0.825, "step": 245 }, { "epoch": 0.7524454477050414, "grad_norm": 3.658518292411783, "learning_rate": 8.607992429716608e-07, "loss": 0.8133, "step": 250 }, { "epoch": 0.7524454477050414, "eval_loss": 0.828486442565918, "eval_runtime": 164.1931, "eval_samples_per_second": 57.554, "eval_steps_per_second": 0.901, "step": 250 }, { "epoch": 0.7674943566591422, "grad_norm": 3.3210348798020797, "learning_rate": 8.55239083713654e-07, "loss": 0.8267, "step": 255 }, { "epoch": 0.782543265613243, "grad_norm": 3.2928105286016867, "learning_rate": 8.495887678649932e-07, "loss": 0.8336, "step": 260 }, { "epoch": 0.7975921745673439, "grad_norm": 3.7993668860558567, "learning_rate": 8.438497294267116e-07, "loss": 0.8371, "step": 265 }, { "epoch": 0.8126410835214447, "grad_norm": 3.7491461546658456, "learning_rate": 8.38023424916864e-07, "loss": 0.83, "step": 270 }, { "epoch": 0.8276899924755455, "grad_norm": 3.335971213466453, "learning_rate": 8.321113330008756e-07, "loss": 0.8231, "step": 275 }, { "epoch": 0.8427389014296464, "grad_norm": 3.50538153645489, "learning_rate": 8.261149541162691e-07, "loss": 0.8283, "step": 280 }, { "epoch": 0.8577878103837472, "grad_norm": 3.405552551438381, "learning_rate": 8.20035810091867e-07, "loss": 0.8333, "step": 285 }, { "epoch": 0.872836719337848, "grad_norm": 3.5518417970408036, "learning_rate": 8.13875443761565e-07, "loss": 0.8215, "step": 290 }, { "epoch": 0.8878856282919488, "grad_norm": 3.346948128373629, "learning_rate": 8.076354185727734e-07, "loss": 0.8127, "step": 295 }, { "epoch": 0.9029345372460497, "grad_norm": 3.472824409308143, "learning_rate": 8.013173181896282e-07, "loss": 0.8114, "step": 300 }, { "epoch": 0.9029345372460497, "eval_loss": 0.8188709616661072, "eval_runtime": 163.9398, "eval_samples_per_second": 57.643, "eval_steps_per_second": 0.903, "step": 300 }, { "epoch": 0.9179834462001505, "grad_norm": 3.535029645974378, "learning_rate": 7.94922746091071e-07, "loss": 0.8245, "step": 305 }, { "epoch": 0.9330323551542513, "grad_norm": 3.350512530278651, "learning_rate": 7.884533251638999e-07, "loss": 0.8031, "step": 310 }, { "epoch": 0.9480812641083521, "grad_norm": 3.1272224593692903, "learning_rate": 7.819106972908949e-07, "loss": 0.792, "step": 315 }, { "epoch": 0.963130173062453, "grad_norm": 3.3082625072971066, "learning_rate": 7.752965229341219e-07, "loss": 0.8073, "step": 320 }, { "epoch": 0.9781790820165538, "grad_norm": 3.6472654100125674, "learning_rate": 7.686124807135228e-07, "loss": 0.8091, "step": 325 }, { "epoch": 0.9932279909706546, "grad_norm": 3.4227050519519184, "learning_rate": 7.618602669808957e-07, "loss": 0.812, "step": 330 }, { "epoch": 1.0082768999247556, "grad_norm": 3.63643107405423, "learning_rate": 7.550415953893756e-07, "loss": 0.7794, "step": 335 }, { "epoch": 1.0233258088788564, "grad_norm": 3.5111108810888307, "learning_rate": 7.481581964585244e-07, "loss": 0.7534, "step": 340 }, { "epoch": 1.0383747178329572, "grad_norm": 3.738034258550528, "learning_rate": 7.412118171351395e-07, "loss": 0.7473, "step": 345 }, { "epoch": 1.053423626787058, "grad_norm": 3.511742644224616, "learning_rate": 7.342042203498951e-07, "loss": 0.7379, "step": 350 }, { "epoch": 1.053423626787058, "eval_loss": 0.8137179613113403, "eval_runtime": 163.9091, "eval_samples_per_second": 57.654, "eval_steps_per_second": 0.903, "step": 350 }, { "epoch": 1.0684725357411589, "grad_norm": 3.482412652291007, "learning_rate": 7.271371845699241e-07, "loss": 0.757, "step": 355 }, { "epoch": 1.0835214446952597, "grad_norm": 3.57693306476351, "learning_rate": 7.200125033474598e-07, "loss": 0.7405, "step": 360 }, { "epoch": 1.0985703536493605, "grad_norm": 3.551987518468492, "learning_rate": 7.128319848646477e-07, "loss": 0.7424, "step": 365 }, { "epoch": 1.1136192626034613, "grad_norm": 3.644980029199069, "learning_rate": 7.055974514746445e-07, "loss": 0.7557, "step": 370 }, { "epoch": 1.1286681715575622, "grad_norm": 3.661260327693092, "learning_rate": 6.983107392391202e-07, "loss": 0.7595, "step": 375 }, { "epoch": 1.143717080511663, "grad_norm": 3.6520213924338867, "learning_rate": 6.909736974622826e-07, "loss": 0.7411, "step": 380 }, { "epoch": 1.1587659894657638, "grad_norm": 3.341094037190979, "learning_rate": 6.835881882215395e-07, "loss": 0.7415, "step": 385 }, { "epoch": 1.1738148984198646, "grad_norm": 3.4808910667039803, "learning_rate": 6.761560858949192e-07, "loss": 0.7317, "step": 390 }, { "epoch": 1.1888638073739655, "grad_norm": 3.6539416517236076, "learning_rate": 6.686792766853705e-07, "loss": 0.7537, "step": 395 }, { "epoch": 1.2039127163280663, "grad_norm": 3.4648582888257002, "learning_rate": 6.611596581420599e-07, "loss": 0.7416, "step": 400 }, { "epoch": 1.2039127163280663, "eval_loss": 0.8082437515258789, "eval_runtime": 163.785, "eval_samples_per_second": 57.698, "eval_steps_per_second": 0.904, "step": 400 }, { "epoch": 1.2189616252821671, "grad_norm": 3.3756134153833433, "learning_rate": 6.53599138678791e-07, "loss": 0.7295, "step": 405 }, { "epoch": 1.234010534236268, "grad_norm": 3.5366994024401093, "learning_rate": 6.459996370896652e-07, "loss": 0.7419, "step": 410 }, { "epoch": 1.2490594431903688, "grad_norm": 3.6126358863182118, "learning_rate": 6.383630820621081e-07, "loss": 0.7348, "step": 415 }, { "epoch": 1.2641083521444696, "grad_norm": 3.329040929496611, "learning_rate": 6.306914116873862e-07, "loss": 0.7456, "step": 420 }, { "epoch": 1.2791572610985704, "grad_norm": 3.389626369376778, "learning_rate": 6.22986572968736e-07, "loss": 0.7399, "step": 425 }, { "epoch": 1.2942061700526712, "grad_norm": 3.640732106094957, "learning_rate": 6.152505213272307e-07, "loss": 0.7478, "step": 430 }, { "epoch": 1.309255079006772, "grad_norm": 3.678659612980298, "learning_rate": 6.074852201055121e-07, "loss": 0.7348, "step": 435 }, { "epoch": 1.324303987960873, "grad_norm": 3.456540220241952, "learning_rate": 5.996926400695113e-07, "loss": 0.7367, "step": 440 }, { "epoch": 1.3393528969149737, "grad_norm": 3.598565981484491, "learning_rate": 5.918747589082852e-07, "loss": 0.7452, "step": 445 }, { "epoch": 1.3544018058690745, "grad_norm": 3.6204936371770584, "learning_rate": 5.840335607320963e-07, "loss": 0.7623, "step": 450 }, { "epoch": 1.3544018058690745, "eval_loss": 0.8033472299575806, "eval_runtime": 163.9716, "eval_samples_per_second": 57.632, "eval_steps_per_second": 0.903, "step": 450 }, { "epoch": 1.3694507148231754, "grad_norm": 3.5876675484160927, "learning_rate": 5.761710355688627e-07, "loss": 0.7431, "step": 455 }, { "epoch": 1.3844996237772762, "grad_norm": 3.619019215155952, "learning_rate": 5.682891788591065e-07, "loss": 0.7464, "step": 460 }, { "epoch": 1.399548532731377, "grad_norm": 3.5444023840644237, "learning_rate": 5.603899909495283e-07, "loss": 0.7357, "step": 465 }, { "epoch": 1.4145974416854779, "grad_norm": 3.4447315407410266, "learning_rate": 5.52475476585336e-07, "loss": 0.735, "step": 470 }, { "epoch": 1.4296463506395787, "grad_norm": 3.8423796002300983, "learning_rate": 5.445476444014591e-07, "loss": 0.7318, "step": 475 }, { "epoch": 1.4446952595936795, "grad_norm": 3.594078636376061, "learning_rate": 5.366085064127734e-07, "loss": 0.7484, "step": 480 }, { "epoch": 1.4597441685477803, "grad_norm": 3.607424426488508, "learning_rate": 5.286600775034699e-07, "loss": 0.7435, "step": 485 }, { "epoch": 1.4747930775018812, "grad_norm": 3.7328475918869715, "learning_rate": 5.207043749156944e-07, "loss": 0.7378, "step": 490 }, { "epoch": 1.489841986455982, "grad_norm": 3.443608577698693, "learning_rate": 5.127434177375893e-07, "loss": 0.7418, "step": 495 }, { "epoch": 1.5048908954100828, "grad_norm": 3.5186913080533637, "learning_rate": 5.047792263908659e-07, "loss": 0.7158, "step": 500 }, { "epoch": 1.5048908954100828, "eval_loss": 0.7987317442893982, "eval_runtime": 163.9794, "eval_samples_per_second": 57.629, "eval_steps_per_second": 0.903, "step": 500 }, { "epoch": 1.5199398043641836, "grad_norm": 3.865883308748703, "learning_rate": 4.968138221180401e-07, "loss": 0.7431, "step": 505 }, { "epoch": 1.5349887133182845, "grad_norm": 3.710215468774283, "learning_rate": 4.888492264694565e-07, "loss": 0.7007, "step": 510 }, { "epoch": 1.5500376222723853, "grad_norm": 3.6132015362670145, "learning_rate": 4.808874607902397e-07, "loss": 0.7364, "step": 515 }, { "epoch": 1.565086531226486, "grad_norm": 3.7199884506401593, "learning_rate": 4.7293054570729126e-07, "loss": 0.7267, "step": 520 }, { "epoch": 1.580135440180587, "grad_norm": 3.9105644710376457, "learning_rate": 4.649805006164743e-07, "loss": 0.7298, "step": 525 }, { "epoch": 1.5951843491346878, "grad_norm": 3.5526438869005292, "learning_rate": 4.5703934317010727e-07, "loss": 0.7429, "step": 530 }, { "epoch": 1.6102332580887886, "grad_norm": 3.502575386275301, "learning_rate": 4.491090887649024e-07, "loss": 0.7337, "step": 535 }, { "epoch": 1.6252821670428894, "grad_norm": 3.794160344449478, "learning_rate": 4.4119175003047407e-07, "loss": 0.7299, "step": 540 }, { "epoch": 1.6403310759969902, "grad_norm": 3.2770795614619237, "learning_rate": 4.3328933631855195e-07, "loss": 0.7321, "step": 545 }, { "epoch": 1.655379984951091, "grad_norm": 3.6119103240995263, "learning_rate": 4.2540385319302524e-07, "loss": 0.7322, "step": 550 }, { "epoch": 1.655379984951091, "eval_loss": 0.7954748272895813, "eval_runtime": 164.5155, "eval_samples_per_second": 57.441, "eval_steps_per_second": 0.9, "step": 550 }, { "epoch": 1.670428893905192, "grad_norm": 3.5947026375482944, "learning_rate": 4.175373019209468e-07, "loss": 0.7457, "step": 555 }, { "epoch": 1.6854778028592927, "grad_norm": 3.761848976466262, "learning_rate": 4.0969167896463046e-07, "loss": 0.7386, "step": 560 }, { "epoch": 1.7005267118133935, "grad_norm": 3.439361811338972, "learning_rate": 4.018689754749648e-07, "loss": 0.724, "step": 565 }, { "epoch": 1.7155756207674944, "grad_norm": 3.4555454207422858, "learning_rate": 3.9407117678607756e-07, "loss": 0.7441, "step": 570 }, { "epoch": 1.7306245297215952, "grad_norm": 3.5617425595455177, "learning_rate": 3.8630026191147405e-07, "loss": 0.7249, "step": 575 }, { "epoch": 1.745673438675696, "grad_norm": 3.448724415418216, "learning_rate": 3.78558203041782e-07, "loss": 0.6972, "step": 580 }, { "epoch": 1.7607223476297968, "grad_norm": 3.6943962689825827, "learning_rate": 3.7084696504422525e-07, "loss": 0.7302, "step": 585 }, { "epoch": 1.7757712565838977, "grad_norm": 3.6561881604475945, "learning_rate": 3.6316850496395855e-07, "loss": 0.7424, "step": 590 }, { "epoch": 1.7908201655379985, "grad_norm": 3.625068299425613, "learning_rate": 3.555247715273867e-07, "loss": 0.7252, "step": 595 }, { "epoch": 1.8058690744920993, "grad_norm": 3.519787303173951, "learning_rate": 3.4791770464759347e-07, "loss": 0.7148, "step": 600 }, { "epoch": 1.8058690744920993, "eval_loss": 0.7919732332229614, "eval_runtime": 163.9475, "eval_samples_per_second": 57.64, "eval_steps_per_second": 0.903, "step": 600 }, { "epoch": 1.8209179834462002, "grad_norm": 3.8300888994459306, "learning_rate": 3.4034923493201007e-07, "loss": 0.7377, "step": 605 }, { "epoch": 1.835966892400301, "grad_norm": 3.400875766754663, "learning_rate": 3.3282128319244237e-07, "loss": 0.7385, "step": 610 }, { "epoch": 1.8510158013544018, "grad_norm": 3.6182045725169325, "learning_rate": 3.2533575995758694e-07, "loss": 0.7197, "step": 615 }, { "epoch": 1.8660647103085026, "grad_norm": 3.904119144772108, "learning_rate": 3.178945649881543e-07, "loss": 0.7018, "step": 620 }, { "epoch": 1.8811136192626035, "grad_norm": 3.8565228264688822, "learning_rate": 3.1049958679472645e-07, "loss": 0.7246, "step": 625 }, { "epoch": 1.8961625282167043, "grad_norm": 3.422783362651409, "learning_rate": 3.031527021584701e-07, "loss": 0.7336, "step": 630 }, { "epoch": 1.911211437170805, "grad_norm": 3.675919325226495, "learning_rate": 2.9585577565482484e-07, "loss": 0.7169, "step": 635 }, { "epoch": 1.926260346124906, "grad_norm": 3.726914018119805, "learning_rate": 2.886106591802908e-07, "loss": 0.7259, "step": 640 }, { "epoch": 1.9413092550790068, "grad_norm": 3.5805352508835333, "learning_rate": 2.814191914824332e-07, "loss": 0.7168, "step": 645 }, { "epoch": 1.9563581640331076, "grad_norm": 3.60649172674426, "learning_rate": 2.7428319769322415e-07, "loss": 0.71, "step": 650 }, { "epoch": 1.9563581640331076, "eval_loss": 0.7893925309181213, "eval_runtime": 164.1237, "eval_samples_per_second": 57.579, "eval_steps_per_second": 0.902, "step": 650 }, { "epoch": 1.9714070729872084, "grad_norm": 3.614812754262687, "learning_rate": 2.672044888658399e-07, "loss": 0.7041, "step": 655 }, { "epoch": 1.9864559819413092, "grad_norm": 3.7073968028896473, "learning_rate": 2.6018486151503213e-07, "loss": 0.7285, "step": 660 }, { "epoch": 2.00150489089541, "grad_norm": 3.647866320725029, "learning_rate": 2.532260971611867e-07, "loss": 0.7113, "step": 665 }, { "epoch": 2.016553799849511, "grad_norm": 3.699708580061935, "learning_rate": 2.4632996187819034e-07, "loss": 0.6883, "step": 670 }, { "epoch": 2.0316027088036117, "grad_norm": 3.936748692781072, "learning_rate": 2.394982058452165e-07, "loss": 0.6756, "step": 675 }, { "epoch": 2.0466516177577128, "grad_norm": 3.716518462078895, "learning_rate": 2.3273256290254402e-07, "loss": 0.6762, "step": 680 }, { "epoch": 2.0617005267118134, "grad_norm": 3.702081977847743, "learning_rate": 2.2603475011152517e-07, "loss": 0.6895, "step": 685 }, { "epoch": 2.0767494356659144, "grad_norm": 3.605178078795864, "learning_rate": 2.1940646731880885e-07, "loss": 0.6607, "step": 690 }, { "epoch": 2.091798344620015, "grad_norm": 3.7196889182496733, "learning_rate": 2.1284939672493506e-07, "loss": 0.6789, "step": 695 }, { "epoch": 2.106847253574116, "grad_norm": 3.7090327815341753, "learning_rate": 2.0636520245740708e-07, "loss": 0.6651, "step": 700 }, { "epoch": 2.106847253574116, "eval_loss": 0.7947185039520264, "eval_runtime": 164.1101, "eval_samples_per_second": 57.583, "eval_steps_per_second": 0.902, "step": 700 }, { "epoch": 2.1218961625282167, "grad_norm": 3.736696447237514, "learning_rate": 1.9995553014834986e-07, "loss": 0.6809, "step": 705 }, { "epoch": 2.1369450714823177, "grad_norm": 3.6458918889335865, "learning_rate": 1.9362200651686406e-07, "loss": 0.6864, "step": 710 }, { "epoch": 2.1519939804364183, "grad_norm": 4.002063868688003, "learning_rate": 1.873662389561771e-07, "loss": 0.6987, "step": 715 }, { "epoch": 2.1670428893905194, "grad_norm": 3.8698599662856714, "learning_rate": 1.8118981512570254e-07, "loss": 0.6801, "step": 720 }, { "epoch": 2.18209179834462, "grad_norm": 3.6230492953936677, "learning_rate": 1.750943025481046e-07, "loss": 0.6826, "step": 725 }, { "epoch": 2.197140707298721, "grad_norm": 3.7726470306057953, "learning_rate": 1.6908124821147517e-07, "loss": 0.6904, "step": 730 }, { "epoch": 2.2121896162528216, "grad_norm": 3.787910233434616, "learning_rate": 1.631521781767214e-07, "loss": 0.6798, "step": 735 }, { "epoch": 2.2272385252069227, "grad_norm": 3.9836210259763023, "learning_rate": 1.5730859719026535e-07, "loss": 0.6776, "step": 740 }, { "epoch": 2.2422874341610233, "grad_norm": 3.763373496609544, "learning_rate": 1.5155198830215144e-07, "loss": 0.6805, "step": 745 }, { "epoch": 2.2573363431151243, "grad_norm": 3.8572405199189466, "learning_rate": 1.4588381248966185e-07, "loss": 0.6684, "step": 750 }, { "epoch": 2.2573363431151243, "eval_loss": 0.7936417460441589, "eval_runtime": 163.9206, "eval_samples_per_second": 57.65, "eval_steps_per_second": 0.903, "step": 750 }, { "epoch": 2.272385252069225, "grad_norm": 3.8388172371691924, "learning_rate": 1.4030550828653354e-07, "loss": 0.7012, "step": 755 }, { "epoch": 2.287434161023326, "grad_norm": 3.8523292354815823, "learning_rate": 1.3481849141786977e-07, "loss": 0.6778, "step": 760 }, { "epoch": 2.3024830699774266, "grad_norm": 3.937046633996997, "learning_rate": 1.294241544408425e-07, "loss": 0.6775, "step": 765 }, { "epoch": 2.3175319789315276, "grad_norm": 3.953239589293332, "learning_rate": 1.241238663912727e-07, "loss": 0.6868, "step": 770 }, { "epoch": 2.3325808878856282, "grad_norm": 4.089219915268593, "learning_rate": 1.1891897243618183e-07, "loss": 0.6694, "step": 775 }, { "epoch": 2.3476297968397293, "grad_norm": 3.846961987007966, "learning_rate": 1.1381079353239915e-07, "loss": 0.6779, "step": 780 }, { "epoch": 2.36267870579383, "grad_norm": 3.7397027273861885, "learning_rate": 1.0880062609131485e-07, "loss": 0.6726, "step": 785 }, { "epoch": 2.377727614747931, "grad_norm": 3.921668306186131, "learning_rate": 1.0388974164986247e-07, "loss": 0.6796, "step": 790 }, { "epoch": 2.3927765237020315, "grad_norm": 3.8468336671320853, "learning_rate": 9.907938654781306e-08, "loss": 0.6835, "step": 795 }, { "epoch": 2.4078254326561326, "grad_norm": 3.7410679007377854, "learning_rate": 9.437078161146589e-08, "loss": 0.6826, "step": 800 }, { "epoch": 2.4078254326561326, "eval_loss": 0.7927989363670349, "eval_runtime": 167.9966, "eval_samples_per_second": 56.251, "eval_steps_per_second": 0.881, "step": 800 }, { "epoch": 2.422874341610233, "grad_norm": 4.079995858636958, "learning_rate": 8.976512184381246e-08, "loss": 0.6795, "step": 805 }, { "epoch": 2.4379232505643342, "grad_norm": 3.735129398089214, "learning_rate": 8.526357612125573e-08, "loss": 0.6898, "step": 810 }, { "epoch": 2.452972159518435, "grad_norm": 3.8390441333226817, "learning_rate": 8.086728689695921e-08, "loss": 0.7046, "step": 815 }, { "epoch": 2.468021068472536, "grad_norm": 3.9049522755901362, "learning_rate": 7.657736991090263e-08, "loss": 0.661, "step": 820 }, { "epoch": 2.4830699774266365, "grad_norm": 3.880401857930606, "learning_rate": 7.239491390671631e-08, "loss": 0.6742, "step": 825 }, { "epoch": 2.4981188863807375, "grad_norm": 3.912207039826943, "learning_rate": 6.832098035536759e-08, "loss": 0.6672, "step": 830 }, { "epoch": 2.513167795334838, "grad_norm": 3.922109823682805, "learning_rate": 6.435660318576935e-08, "loss": 0.6754, "step": 835 }, { "epoch": 2.528216704288939, "grad_norm": 3.8537847324852224, "learning_rate": 6.0502788522377e-08, "loss": 0.6824, "step": 840 }, { "epoch": 2.54326561324304, "grad_norm": 4.033607009118544, "learning_rate": 5.676051442984325e-08, "loss": 0.6716, "step": 845 }, { "epoch": 2.558314522197141, "grad_norm": 3.6539091883922423, "learning_rate": 5.313073066479379e-08, "loss": 0.6711, "step": 850 }, { "epoch": 2.558314522197141, "eval_loss": 0.7919635772705078, "eval_runtime": 164.1041, "eval_samples_per_second": 57.585, "eval_steps_per_second": 0.902, "step": 850 }, { "epoch": 2.5733634311512414, "grad_norm": 3.876589664826218, "learning_rate": 4.961435843478751e-08, "loss": 0.6806, "step": 855 }, { "epoch": 2.5884123401053425, "grad_norm": 3.655653854351189, "learning_rate": 4.621229016452155e-08, "loss": 0.6535, "step": 860 }, { "epoch": 2.603461249059443, "grad_norm": 4.079997158323887, "learning_rate": 4.2925389269341916e-08, "loss": 0.673, "step": 865 }, { "epoch": 2.618510158013544, "grad_norm": 3.8325795147951207, "learning_rate": 3.975448993611652e-08, "loss": 0.6709, "step": 870 }, { "epoch": 2.6335590669676447, "grad_norm": 3.778742993294259, "learning_rate": 3.67003969115251e-08, "loss": 0.6619, "step": 875 }, { "epoch": 2.648607975921746, "grad_norm": 4.10366570220581, "learning_rate": 3.376388529782215e-08, "loss": 0.6848, "step": 880 }, { "epoch": 2.6636568848758464, "grad_norm": 3.8699319815585236, "learning_rate": 3.094570035612226e-08, "loss": 0.6853, "step": 885 }, { "epoch": 2.6787057938299474, "grad_norm": 4.043194008918937, "learning_rate": 2.8246557317259723e-08, "loss": 0.6745, "step": 890 }, { "epoch": 2.693754702784048, "grad_norm": 3.99392087783422, "learning_rate": 2.5667141200268694e-08, "loss": 0.6577, "step": 895 }, { "epoch": 2.708803611738149, "grad_norm": 3.6631248500731814, "learning_rate": 2.3208106638531842e-08, "loss": 0.6757, "step": 900 }, { "epoch": 2.708803611738149, "eval_loss": 0.7914727926254272, "eval_runtime": 164.3819, "eval_samples_per_second": 57.488, "eval_steps_per_second": 0.9, "step": 900 }, { "epoch": 2.7238525206922497, "grad_norm": 3.9578522639967537, "learning_rate": 2.087007771363969e-08, "loss": 0.6736, "step": 905 }, { "epoch": 2.7389014296463507, "grad_norm": 3.739728228760654, "learning_rate": 1.8653647797004236e-08, "loss": 0.6729, "step": 910 }, { "epoch": 2.7539503386004514, "grad_norm": 3.8722809533250326, "learning_rate": 1.655937939926655e-08, "loss": 0.6841, "step": 915 }, { "epoch": 2.7689992475545524, "grad_norm": 3.8241600199550945, "learning_rate": 1.4587804027536454e-08, "loss": 0.6917, "step": 920 }, { "epoch": 2.784048156508653, "grad_norm": 3.7138354028978506, "learning_rate": 1.2739422050500436e-08, "loss": 0.6751, "step": 925 }, { "epoch": 2.799097065462754, "grad_norm": 3.790111519043104, "learning_rate": 1.101470257143261e-08, "loss": 0.6738, "step": 930 }, { "epoch": 2.8141459744168547, "grad_norm": 3.561873269533578, "learning_rate": 9.414083309140453e-09, "loss": 0.6952, "step": 935 }, { "epoch": 2.8291948833709557, "grad_norm": 3.6893556293107364, "learning_rate": 7.93797048687539e-09, "loss": 0.6761, "step": 940 }, { "epoch": 2.8442437923250563, "grad_norm": 3.8751112944911412, "learning_rate": 6.5867387292369295e-09, "loss": 0.6932, "step": 945 }, { "epoch": 2.8592927012791574, "grad_norm": 4.004750596943983, "learning_rate": 5.360730967096272e-09, "loss": 0.6791, "step": 950 }, { "epoch": 2.8592927012791574, "eval_loss": 0.7912743091583252, "eval_runtime": 164.3927, "eval_samples_per_second": 57.484, "eval_steps_per_second": 0.9, "step": 950 }, { "epoch": 2.874341610233258, "grad_norm": 3.8028887659361637, "learning_rate": 4.260258350563317e-09, "loss": 0.6716, "step": 955 }, { "epoch": 2.889390519187359, "grad_norm": 3.8634306165906107, "learning_rate": 3.285600170019609e-09, "loss": 0.6917, "step": 960 }, { "epoch": 2.9044394281414596, "grad_norm": 3.6280717262306363, "learning_rate": 2.437003785236702e-09, "loss": 0.665, "step": 965 }, { "epoch": 2.9194883370955607, "grad_norm": 4.013775538099351, "learning_rate": 1.714684562598545e-09, "loss": 0.6845, "step": 970 }, { "epoch": 2.9345372460496613, "grad_norm": 3.8102208297791, "learning_rate": 1.1188258204433144e-09, "loss": 0.6673, "step": 975 }, { "epoch": 2.9495861550037623, "grad_norm": 3.649482883492926, "learning_rate": 6.49578782538851e-10, "loss": 0.6772, "step": 980 }, { "epoch": 2.964635063957863, "grad_norm": 3.9407869896383483, "learning_rate": 3.070625397031401e-10, "loss": 0.6743, "step": 985 }, { "epoch": 2.979683972911964, "grad_norm": 3.857935656292572, "learning_rate": 9.136401958059759e-11, "loss": 0.6633, "step": 990 }, { "epoch": 2.9947328818660646, "grad_norm": 3.934124028791183, "learning_rate": 2.5379645800516215e-12, "loss": 0.672, "step": 995 }, { "epoch": 2.9977426636568847, "step": 996, "total_flos": 5872633472090112.0, "train_loss": 0.7687147306988996, "train_runtime": 16194.6861, "train_samples_per_second": 15.755, "train_steps_per_second": 0.062 } ], "logging_steps": 5, "max_steps": 996, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5872633472090112.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }