|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9878213802435725, |
|
"eval_steps": 50, |
|
"global_step": 552, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02706359945872801, |
|
"grad_norm": 18.62096133756944, |
|
"learning_rate": 5e-07, |
|
"loss": 1.7202, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05412719891745602, |
|
"grad_norm": 12.425223769043265, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5858, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08119079837618404, |
|
"grad_norm": 7.4903509661138825, |
|
"learning_rate": 9.997900331216397e-07, |
|
"loss": 1.2962, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.10825439783491204, |
|
"grad_norm": 4.423017017969494, |
|
"learning_rate": 9.991603088309193e-07, |
|
"loss": 1.1481, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13531799729364005, |
|
"grad_norm": 4.05084233893573, |
|
"learning_rate": 9.981113560128126e-07, |
|
"loss": 1.0683, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.16238159675236807, |
|
"grad_norm": 3.6533701555355194, |
|
"learning_rate": 9.966440556487147e-07, |
|
"loss": 1.0347, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.18944519621109607, |
|
"grad_norm": 3.5509774718765037, |
|
"learning_rate": 9.947596400765342e-07, |
|
"loss": 0.9829, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2165087956698241, |
|
"grad_norm": 3.389187665280499, |
|
"learning_rate": 9.924596919556916e-07, |
|
"loss": 0.9751, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2435723951285521, |
|
"grad_norm": 3.7276246548668697, |
|
"learning_rate": 9.897461429378964e-07, |
|
"loss": 0.9534, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2706359945872801, |
|
"grad_norm": 3.6238376734485906, |
|
"learning_rate": 9.866212720448147e-07, |
|
"loss": 0.9375, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2706359945872801, |
|
"eval_loss": 0.9354712963104248, |
|
"eval_runtime": 91.9855, |
|
"eval_samples_per_second": 57.074, |
|
"eval_steps_per_second": 0.902, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2976995940460081, |
|
"grad_norm": 3.650038732579689, |
|
"learning_rate": 9.830877037539935e-07, |
|
"loss": 0.9241, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.32476319350473615, |
|
"grad_norm": 3.282303443355455, |
|
"learning_rate": 9.791484057946465e-07, |
|
"loss": 0.9301, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.35182679296346414, |
|
"grad_norm": 3.5367161380439316, |
|
"learning_rate": 9.748066866551555e-07, |
|
"loss": 0.9141, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.37889039242219213, |
|
"grad_norm": 3.384471001472233, |
|
"learning_rate": 9.700661928043785e-07, |
|
"loss": 0.9099, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4059539918809202, |
|
"grad_norm": 3.6506849242542603, |
|
"learning_rate": 9.649309056290984e-07, |
|
"loss": 0.9034, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4330175913396482, |
|
"grad_norm": 3.5334381489912654, |
|
"learning_rate": 9.594051380901858e-07, |
|
"loss": 0.8755, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.46008119079837617, |
|
"grad_norm": 3.617605597301526, |
|
"learning_rate": 9.534935311002834e-07, |
|
"loss": 0.887, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.4871447902571042, |
|
"grad_norm": 3.8303351034768016, |
|
"learning_rate": 9.472010496260544e-07, |
|
"loss": 0.8868, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5142083897158322, |
|
"grad_norm": 3.4133330247065485, |
|
"learning_rate": 9.405329785182678e-07, |
|
"loss": 0.8685, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5412719891745602, |
|
"grad_norm": 3.598327334615042, |
|
"learning_rate": 9.334949180732244e-07, |
|
"loss": 0.8824, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5412719891745602, |
|
"eval_loss": 0.8775722980499268, |
|
"eval_runtime": 91.8061, |
|
"eval_samples_per_second": 57.186, |
|
"eval_steps_per_second": 0.904, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5683355886332883, |
|
"grad_norm": 3.4406636804059803, |
|
"learning_rate": 9.260927793292497e-07, |
|
"loss": 0.8847, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.5953991880920162, |
|
"grad_norm": 3.7093344605385363, |
|
"learning_rate": 9.183327791022047e-07, |
|
"loss": 0.8716, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6224627875507442, |
|
"grad_norm": 3.6150174879821275, |
|
"learning_rate": 9.102214347641843e-07, |
|
"loss": 0.879, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.6495263870094723, |
|
"grad_norm": 3.323783724540155, |
|
"learning_rate": 9.017655587697883e-07, |
|
"loss": 0.849, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6765899864682002, |
|
"grad_norm": 3.4870778817811705, |
|
"learning_rate": 8.929722529345623e-07, |
|
"loss": 0.8623, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7036535859269283, |
|
"grad_norm": 3.3887325632646554, |
|
"learning_rate": 8.83848902470413e-07, |
|
"loss": 0.8821, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7307171853856563, |
|
"grad_norm": 3.4877253492258253, |
|
"learning_rate": 8.744031697830088e-07, |
|
"loss": 0.864, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.7577807848443843, |
|
"grad_norm": 3.4539745143868643, |
|
"learning_rate": 8.646429880363746e-07, |
|
"loss": 0.8469, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7848443843031123, |
|
"grad_norm": 3.6501092648768645, |
|
"learning_rate": 8.545765544900846e-07, |
|
"loss": 0.8488, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8119079837618404, |
|
"grad_norm": 3.6661416233299233, |
|
"learning_rate": 8.442123236146508e-07, |
|
"loss": 0.8549, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8119079837618404, |
|
"eval_loss": 0.8518306612968445, |
|
"eval_runtime": 91.7277, |
|
"eval_samples_per_second": 57.235, |
|
"eval_steps_per_second": 0.905, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8389715832205683, |
|
"grad_norm": 3.6360925835102895, |
|
"learning_rate": 8.33558999990887e-07, |
|
"loss": 0.846, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.8660351826792964, |
|
"grad_norm": 3.325785655756218, |
|
"learning_rate": 8.22625530999215e-07, |
|
"loss": 0.8485, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8930987821380244, |
|
"grad_norm": 3.5191969682264492, |
|
"learning_rate": 8.114210993050502e-07, |
|
"loss": 0.86, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.9201623815967523, |
|
"grad_norm": 3.505912957460407, |
|
"learning_rate": 7.999551151465791e-07, |
|
"loss": 0.8445, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9472259810554804, |
|
"grad_norm": 3.567280090097035, |
|
"learning_rate": 7.88237208431406e-07, |
|
"loss": 0.8366, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.9742895805142084, |
|
"grad_norm": 3.5124657783307143, |
|
"learning_rate": 7.762772206487065e-07, |
|
"loss": 0.8377, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0013531799729365, |
|
"grad_norm": 3.661655068528439, |
|
"learning_rate": 7.640851966036805e-07, |
|
"loss": 0.8402, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.0284167794316643, |
|
"grad_norm": 3.443540295008513, |
|
"learning_rate": 7.516713759812464e-07, |
|
"loss": 0.7742, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0554803788903924, |
|
"grad_norm": 3.5551780337145975, |
|
"learning_rate": 7.390461847460628e-07, |
|
"loss": 0.7723, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.0825439783491204, |
|
"grad_norm": 3.4346090387850965, |
|
"learning_rate": 7.262202263860988e-07, |
|
"loss": 0.7825, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0825439783491204, |
|
"eval_loss": 0.8397604823112488, |
|
"eval_runtime": 91.7114, |
|
"eval_samples_per_second": 57.245, |
|
"eval_steps_per_second": 0.905, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1096075778078485, |
|
"grad_norm": 3.346387935116934, |
|
"learning_rate": 7.1320427300711e-07, |
|
"loss": 0.7622, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.1366711772665765, |
|
"grad_norm": 3.6975628986759532, |
|
"learning_rate": 7.000092562854959e-07, |
|
"loss": 0.7683, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.1637347767253043, |
|
"grad_norm": 3.534180683955908, |
|
"learning_rate": 6.866462582871401e-07, |
|
"loss": 0.7536, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.1907983761840324, |
|
"grad_norm": 3.6284911794109322, |
|
"learning_rate": 6.731265021599436e-07, |
|
"loss": 0.775, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.2178619756427604, |
|
"grad_norm": 3.9686622547711816, |
|
"learning_rate": 6.594613427078674e-07, |
|
"loss": 0.7736, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.2449255751014885, |
|
"grad_norm": 3.4953313507774797, |
|
"learning_rate": 6.456622568544011e-07, |
|
"loss": 0.7723, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.2719891745602165, |
|
"grad_norm": 3.600480261317538, |
|
"learning_rate": 6.317408340034684e-07, |
|
"loss": 0.7675, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.2990527740189446, |
|
"grad_norm": 4.160648083143722, |
|
"learning_rate": 6.177087663058625e-07, |
|
"loss": 0.786, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.3261163734776726, |
|
"grad_norm": 3.554038996352176, |
|
"learning_rate": 6.035778388393893e-07, |
|
"loss": 0.7569, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.3531799729364005, |
|
"grad_norm": 3.4314153682677517, |
|
"learning_rate": 5.893599197109624e-07, |
|
"loss": 0.759, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.3531799729364005, |
|
"eval_loss": 0.8301268219947815, |
|
"eval_runtime": 91.7237, |
|
"eval_samples_per_second": 57.237, |
|
"eval_steps_per_second": 0.905, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.3802435723951285, |
|
"grad_norm": 3.405464319479644, |
|
"learning_rate": 5.750669500889666e-07, |
|
"loss": 0.7549, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.4073071718538566, |
|
"grad_norm": 3.500530938219073, |
|
"learning_rate": 5.607109341742578e-07, |
|
"loss": 0.7801, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.4343707713125846, |
|
"grad_norm": 3.726591852861668, |
|
"learning_rate": 5.463039291182256e-07, |
|
"loss": 0.7765, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.4614343707713127, |
|
"grad_norm": 3.619160960040315, |
|
"learning_rate": 5.318580348963825e-07, |
|
"loss": 0.7662, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.4884979702300405, |
|
"grad_norm": 3.4244376461150208, |
|
"learning_rate": 5.173853841459877e-07, |
|
"loss": 0.7554, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.5155615696887685, |
|
"grad_norm": 3.700667266849164, |
|
"learning_rate": 5.028981319762399e-07, |
|
"loss": 0.7616, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.5426251691474966, |
|
"grad_norm": 3.567507416375757, |
|
"learning_rate": 4.884084457595956e-07, |
|
"loss": 0.7818, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.5696887686062246, |
|
"grad_norm": 3.7802418009731755, |
|
"learning_rate": 4.7392849491278817e-07, |
|
"loss": 0.7691, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.5967523680649527, |
|
"grad_norm": 3.501336855850476, |
|
"learning_rate": 4.5947044067613e-07, |
|
"loss": 0.7681, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.6238159675236807, |
|
"grad_norm": 3.656073742916776, |
|
"learning_rate": 4.4504642589968217e-07, |
|
"loss": 0.7676, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.6238159675236807, |
|
"eval_loss": 0.8219158053398132, |
|
"eval_runtime": 91.7533, |
|
"eval_samples_per_second": 57.219, |
|
"eval_steps_per_second": 0.905, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.6508795669824088, |
|
"grad_norm": 3.704023712304986, |
|
"learning_rate": 4.3066856484486847e-07, |
|
"loss": 0.7602, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.6779431664411368, |
|
"grad_norm": 3.504247839905232, |
|
"learning_rate": 4.1634893301010165e-07, |
|
"loss": 0.7589, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.7050067658998647, |
|
"grad_norm": 3.470803136242184, |
|
"learning_rate": 4.0209955698896445e-07, |
|
"loss": 0.7475, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.7320703653585927, |
|
"grad_norm": 3.5356035894949045, |
|
"learning_rate": 3.8793240436946385e-07, |
|
"loss": 0.7548, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.7591339648173205, |
|
"grad_norm": 3.559368281490328, |
|
"learning_rate": 3.738593736828426e-07, |
|
"loss": 0.761, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.7861975642760486, |
|
"grad_norm": 3.4859137753013942, |
|
"learning_rate": 3.598922844103902e-07, |
|
"loss": 0.7704, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.8132611637347766, |
|
"grad_norm": 3.6109852467133607, |
|
"learning_rate": 3.4604286705664397e-07, |
|
"loss": 0.7578, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.8403247631935047, |
|
"grad_norm": 3.478328670865475, |
|
"learning_rate": 3.323227532973193e-07, |
|
"loss": 0.7548, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.8673883626522327, |
|
"grad_norm": 3.670794273772087, |
|
"learning_rate": 3.187434662102434e-07, |
|
"loss": 0.7624, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.8944519621109608, |
|
"grad_norm": 3.6564249976504963, |
|
"learning_rate": 3.0531641059749634e-07, |
|
"loss": 0.7624, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.8944519621109608, |
|
"eval_loss": 0.81601482629776, |
|
"eval_runtime": 91.595, |
|
"eval_samples_per_second": 57.318, |
|
"eval_steps_per_second": 0.906, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.9215155615696888, |
|
"grad_norm": 3.8249796571875754, |
|
"learning_rate": 2.920528634068885e-07, |
|
"loss": 0.7342, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.9485791610284169, |
|
"grad_norm": 3.4390639442266044, |
|
"learning_rate": 2.789639642608184e-07, |
|
"loss": 0.7552, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.975642760487145, |
|
"grad_norm": 3.681741592514357, |
|
"learning_rate": 2.6606070610046526e-07, |
|
"loss": 0.7578, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.002706359945873, |
|
"grad_norm": 3.7841567374808815, |
|
"learning_rate": 2.533539259531757e-07, |
|
"loss": 0.755, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.029769959404601, |
|
"grad_norm": 3.406665416810156, |
|
"learning_rate": 2.408542958307957e-07, |
|
"loss": 0.6915, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.0568335588633286, |
|
"grad_norm": 3.627816770705192, |
|
"learning_rate": 2.2857231376659514e-07, |
|
"loss": 0.7139, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.0838971583220567, |
|
"grad_norm": 3.8009146897382164, |
|
"learning_rate": 2.1651829499831043e-07, |
|
"loss": 0.7127, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.1109607577807847, |
|
"grad_norm": 3.7612594070053578, |
|
"learning_rate": 2.0470236330471126e-07, |
|
"loss": 0.7188, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.138024357239513, |
|
"grad_norm": 3.881855102735637, |
|
"learning_rate": 1.9313444250296846e-07, |
|
"loss": 0.7319, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.165087956698241, |
|
"grad_norm": 3.8666379554848103, |
|
"learning_rate": 1.818242481139613e-07, |
|
"loss": 0.6976, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.165087956698241, |
|
"eval_loss": 0.8192901611328125, |
|
"eval_runtime": 91.7299, |
|
"eval_samples_per_second": 57.233, |
|
"eval_steps_per_second": 0.905, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.192151556156969, |
|
"grad_norm": 3.6983015115543676, |
|
"learning_rate": 1.7078127920252783e-07, |
|
"loss": 0.7018, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.219215155615697, |
|
"grad_norm": 3.4856114871878408, |
|
"learning_rate": 1.600148103995087e-07, |
|
"loss": 0.7062, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.246278755074425, |
|
"grad_norm": 3.7609793781320042, |
|
"learning_rate": 1.4953388411228602e-07, |
|
"loss": 0.714, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.273342354533153, |
|
"grad_norm": 3.7207726096154206, |
|
"learning_rate": 1.3934730293035936e-07, |
|
"loss": 0.7028, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.300405953991881, |
|
"grad_norm": 3.9694406933094193, |
|
"learning_rate": 1.2946362223233614e-07, |
|
"loss": 0.7023, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.3274695534506087, |
|
"grad_norm": 3.63014175931209, |
|
"learning_rate": 1.198911430005478e-07, |
|
"loss": 0.7022, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.3545331529093367, |
|
"grad_norm": 3.7202986392499295, |
|
"learning_rate": 1.1063790484932462e-07, |
|
"loss": 0.7083, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.381596752368065, |
|
"grad_norm": 3.640272659998316, |
|
"learning_rate": 1.0171167927278368e-07, |
|
"loss": 0.6996, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.408660351826793, |
|
"grad_norm": 3.830774844166055, |
|
"learning_rate": 9.311996311780446e-08, |
|
"loss": 0.7032, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.435723951285521, |
|
"grad_norm": 3.835735833278635, |
|
"learning_rate": 8.486997228767012e-08, |
|
"loss": 0.7004, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.435723951285521, |
|
"eval_loss": 0.8185598254203796, |
|
"eval_runtime": 91.6401, |
|
"eval_samples_per_second": 57.289, |
|
"eval_steps_per_second": 0.906, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.462787550744249, |
|
"grad_norm": 3.8096679910803526, |
|
"learning_rate": 7.696863568166518e-08, |
|
"loss": 0.6951, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.489851150202977, |
|
"grad_norm": 3.9855447041279612, |
|
"learning_rate": 6.942258937571771e-08, |
|
"loss": 0.7148, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.516914749661705, |
|
"grad_norm": 3.8539380052102943, |
|
"learning_rate": 6.2238171048975e-08, |
|
"loss": 0.7011, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.543978349120433, |
|
"grad_norm": 4.0430762185988725, |
|
"learning_rate": 5.5421414660992705e-08, |
|
"loss": 0.7206, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.571041948579161, |
|
"grad_norm": 3.936740213024643, |
|
"learning_rate": 4.8978045384008125e-08, |
|
"loss": 0.7116, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.598105548037889, |
|
"grad_norm": 3.9575285108660196, |
|
"learning_rate": 4.2913474794554036e-08, |
|
"loss": 0.7192, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.6251691474966172, |
|
"grad_norm": 3.8004579862768604, |
|
"learning_rate": 3.723279632845155e-08, |
|
"loss": 0.7213, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.6522327469553453, |
|
"grad_norm": 3.9896579923804953, |
|
"learning_rate": 3.194078100299863e-08, |
|
"loss": 0.7058, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.6792963464140733, |
|
"grad_norm": 3.485486386140113, |
|
"learning_rate": 2.7041873409947734e-08, |
|
"loss": 0.7155, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.706359945872801, |
|
"grad_norm": 3.7310098177240625, |
|
"learning_rate": 2.2540187982637627e-08, |
|
"loss": 0.7133, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.706359945872801, |
|
"eval_loss": 0.817565381526947, |
|
"eval_runtime": 91.7069, |
|
"eval_samples_per_second": 57.248, |
|
"eval_steps_per_second": 0.905, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.733423545331529, |
|
"grad_norm": 3.6063602953215654, |
|
"learning_rate": 1.8439505540414458e-08, |
|
"loss": 0.7024, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.760487144790257, |
|
"grad_norm": 3.6598065079223527, |
|
"learning_rate": 1.4743270113244277e-08, |
|
"loss": 0.7039, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.787550744248985, |
|
"grad_norm": 3.7941708315282847, |
|
"learning_rate": 1.1454586049184589e-08, |
|
"loss": 0.7096, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.814614343707713, |
|
"grad_norm": 4.0192290237097765, |
|
"learning_rate": 8.576215407142651e-09, |
|
"loss": 0.6872, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.841677943166441, |
|
"grad_norm": 3.671284433438612, |
|
"learning_rate": 6.110575637112425e-09, |
|
"loss": 0.6934, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.8687415426251692, |
|
"grad_norm": 4.26951192895122, |
|
"learning_rate": 4.059737549836517e-09, |
|
"loss": 0.7066, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.8958051420838973, |
|
"grad_norm": 3.732791917417492, |
|
"learning_rate": 2.425423577599783e-09, |
|
"loss": 0.6894, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.9228687415426253, |
|
"grad_norm": 3.626360935172459, |
|
"learning_rate": 1.209006327614226e-09, |
|
"loss": 0.7122, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.949932341001353, |
|
"grad_norm": 3.649121404665413, |
|
"learning_rate": 4.115074292109777e-10, |
|
"loss": 0.7, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.976995940460081, |
|
"grad_norm": 3.550586809689184, |
|
"learning_rate": 3.3596675806824013e-11, |
|
"loss": 0.7, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.976995940460081, |
|
"eval_loss": 0.8177570104598999, |
|
"eval_runtime": 91.665, |
|
"eval_samples_per_second": 57.274, |
|
"eval_steps_per_second": 0.905, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.9878213802435725, |
|
"step": 552, |
|
"total_flos": 3254608239525888.0, |
|
"train_loss": 0.8081599063631417, |
|
"train_runtime": 8273.4491, |
|
"train_samples_per_second": 17.133, |
|
"train_steps_per_second": 0.067 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 552, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3254608239525888.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|