{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.997968855788761,
  "eval_steps": 50,
  "global_step": 1107,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013540961408259987,
      "grad_norm": 18.463732975176427,
      "learning_rate": 5e-07,
      "loss": 1.749,
      "step": 5
    },
    {
      "epoch": 0.027081922816519974,
      "grad_norm": 12.15126324106773,
      "learning_rate": 1e-06,
      "loss": 1.5921,
      "step": 10
    },
    {
      "epoch": 0.040622884224779957,
      "grad_norm": 7.7718176367972,
      "learning_rate": 9.9994874230328e-07,
      "loss": 1.2948,
      "step": 15
    },
    {
      "epoch": 0.05416384563303995,
      "grad_norm": 4.024003853982352,
      "learning_rate": 9.997949797225268e-07,
      "loss": 1.1393,
      "step": 20
    },
    {
      "epoch": 0.06770480704129993,
      "grad_norm": 3.796718690402949,
      "learning_rate": 9.995387437838025e-07,
      "loss": 1.0604,
      "step": 25
    },
    {
      "epoch": 0.08124576844955991,
      "grad_norm": 3.4511274356883295,
      "learning_rate": 9.991800870233637e-07,
      "loss": 1.0272,
      "step": 30
    },
    {
      "epoch": 0.0947867298578199,
      "grad_norm": 3.6605756447735724,
      "learning_rate": 9.98719082976888e-07,
      "loss": 1.003,
      "step": 35
    },
    {
      "epoch": 0.1083276912660799,
      "grad_norm": 3.5416553128618156,
      "learning_rate": 9.981558261643982e-07,
      "loss": 0.9719,
      "step": 40
    },
    {
      "epoch": 0.12186865267433988,
      "grad_norm": 3.5000585696846245,
      "learning_rate": 9.97490432070881e-07,
      "loss": 0.9584,
      "step": 45
    },
    {
      "epoch": 0.13540961408259986,
      "grad_norm": 3.805541453776684,
      "learning_rate": 9.967230371226118e-07,
      "loss": 0.9444,
      "step": 50
    },
    {
      "epoch": 0.13540961408259986,
      "eval_loss": 0.9377400875091553,
      "eval_runtime": 182.856,
      "eval_samples_per_second": 57.422,
      "eval_steps_per_second": 0.902,
      "step": 50
    },
    {
      "epoch": 0.14895057549085985,
      "grad_norm": 3.6018765357986844,
      "learning_rate": 9.958537986591803e-07,
      "loss": 0.9198,
      "step": 55
    },
    {
      "epoch": 0.16249153689911983,
      "grad_norm": 3.5280703732572545,
      "learning_rate": 9.948828949012327e-07,
      "loss": 0.9181,
      "step": 60
    },
    {
      "epoch": 0.17603249830737983,
      "grad_norm": 3.6595010678642925,
      "learning_rate": 9.938105249139305e-07,
      "loss": 0.9296,
      "step": 65
    },
    {
      "epoch": 0.1895734597156398,
      "grad_norm": 3.323630668058957,
      "learning_rate": 9.92636908566136e-07,
      "loss": 0.9241,
      "step": 70
    },
    {
      "epoch": 0.2031144211238998,
      "grad_norm": 3.425849289666098,
      "learning_rate": 9.913622864853324e-07,
      "loss": 0.8917,
      "step": 75
    },
    {
      "epoch": 0.2166553825321598,
      "grad_norm": 3.9967436509747025,
      "learning_rate": 9.89986920008288e-07,
      "loss": 0.8988,
      "step": 80
    },
    {
      "epoch": 0.23019634394041977,
      "grad_norm": 3.436149937331634,
      "learning_rate": 9.885110911274738e-07,
      "loss": 0.8774,
      "step": 85
    },
    {
      "epoch": 0.24373730534867977,
      "grad_norm": 3.642660003309155,
      "learning_rate": 9.869351024332466e-07,
      "loss": 0.8787,
      "step": 90
    },
    {
      "epoch": 0.25727826675693977,
      "grad_norm": 3.3574313517064978,
      "learning_rate": 9.852592770518082e-07,
      "loss": 0.8897,
      "step": 95
    },
    {
      "epoch": 0.2708192281651997,
      "grad_norm": 3.534867148811258,
      "learning_rate": 9.834839585789557e-07,
      "loss": 0.8668,
      "step": 100
    },
    {
      "epoch": 0.2708192281651997,
      "eval_loss": 0.8815732002258301,
      "eval_runtime": 182.989,
      "eval_samples_per_second": 57.38,
      "eval_steps_per_second": 0.902,
      "step": 100
    },
    {
      "epoch": 0.2843601895734597,
      "grad_norm": 3.476334412992533,
      "learning_rate": 9.816095110096324e-07,
      "loss": 0.8806,
      "step": 105
    },
    {
      "epoch": 0.2979011509817197,
      "grad_norm": 3.5218146742622856,
      "learning_rate": 9.796363186632983e-07,
      "loss": 0.8895,
      "step": 110
    },
    {
      "epoch": 0.3114421123899797,
      "grad_norm": 3.6344919656503545,
      "learning_rate": 9.775647861051328e-07,
      "loss": 0.8675,
      "step": 115
    },
    {
      "epoch": 0.32498307379823965,
      "grad_norm": 3.8058175800193537,
      "learning_rate": 9.753953380630862e-07,
      "loss": 0.8739,
      "step": 120
    },
    {
      "epoch": 0.33852403520649965,
      "grad_norm": 3.5458347600694515,
      "learning_rate": 9.731284193407981e-07,
      "loss": 0.8536,
      "step": 125
    },
    {
      "epoch": 0.35206499661475965,
      "grad_norm": 3.4919069279296138,
      "learning_rate": 9.707644947263975e-07,
      "loss": 0.8598,
      "step": 130
    },
    {
      "epoch": 0.36560595802301965,
      "grad_norm": 3.2003162498092848,
      "learning_rate": 9.683040488972086e-07,
      "loss": 0.8628,
      "step": 135
    },
    {
      "epoch": 0.3791469194312796,
      "grad_norm": 3.337658064243358,
      "learning_rate": 9.657475863203756e-07,
      "loss": 0.8633,
      "step": 140
    },
    {
      "epoch": 0.3926878808395396,
      "grad_norm": 3.4844992093115987,
      "learning_rate": 9.63095631149432e-07,
      "loss": 0.859,
      "step": 145
    },
    {
      "epoch": 0.4062288422477996,
      "grad_norm": 3.5999127416589296,
      "learning_rate": 9.603487271168336e-07,
      "loss": 0.8436,
      "step": 150
    },
    {
      "epoch": 0.4062288422477996,
      "eval_loss": 0.8559273481369019,
      "eval_runtime": 182.9208,
      "eval_samples_per_second": 57.402,
      "eval_steps_per_second": 0.902,
      "step": 150
    },
    {
      "epoch": 0.4197698036560596,
      "grad_norm": 3.4543800855029807,
      "learning_rate": 9.575074374224758e-07,
      "loss": 0.8431,
      "step": 155
    },
    {
      "epoch": 0.4333107650643196,
      "grad_norm": 3.574884162982998,
      "learning_rate": 9.545723446182201e-07,
      "loss": 0.8466,
      "step": 160
    },
    {
      "epoch": 0.44685172647257954,
      "grad_norm": 3.230714522717894,
      "learning_rate": 9.515440504884539e-07,
      "loss": 0.829,
      "step": 165
    },
    {
      "epoch": 0.46039268788083954,
      "grad_norm": 3.38192753716993,
      "learning_rate": 9.484231759267054e-07,
      "loss": 0.8462,
      "step": 170
    },
    {
      "epoch": 0.47393364928909953,
      "grad_norm": 3.562118447176729,
      "learning_rate": 9.452103608083417e-07,
      "loss": 0.8602,
      "step": 175
    },
    {
      "epoch": 0.48747461069735953,
      "grad_norm": 3.2476691940428353,
      "learning_rate": 9.419062638593748e-07,
      "loss": 0.8266,
      "step": 180
    },
    {
      "epoch": 0.5010155721056195,
      "grad_norm": 3.494748883924626,
      "learning_rate": 9.385115625214021e-07,
      "loss": 0.8219,
      "step": 185
    },
    {
      "epoch": 0.5145565335138795,
      "grad_norm": 3.4365818103766452,
      "learning_rate": 9.350269528127101e-07,
      "loss": 0.8438,
      "step": 190
    },
    {
      "epoch": 0.5280974949221394,
      "grad_norm": 3.245299565361765,
      "learning_rate": 9.31453149185569e-07,
      "loss": 0.8337,
      "step": 195
    },
    {
      "epoch": 0.5416384563303994,
      "grad_norm": 3.603292666759954,
      "learning_rate": 9.277908843797492e-07,
      "loss": 0.8192,
      "step": 200
    },
    {
      "epoch": 0.5416384563303994,
      "eval_loss": 0.8383815884590149,
      "eval_runtime": 183.0029,
      "eval_samples_per_second": 57.376,
      "eval_steps_per_second": 0.902,
      "step": 200
    },
    {
      "epoch": 0.5551794177386594,
      "grad_norm": 3.3561089172228797,
      "learning_rate": 9.240409092722852e-07,
      "loss": 0.8483,
      "step": 205
    },
    {
      "epoch": 0.5687203791469194,
      "grad_norm": 3.351512113878825,
      "learning_rate": 9.20203992723524e-07,
      "loss": 0.8109,
      "step": 210
    },
    {
      "epoch": 0.5822613405551794,
      "grad_norm": 3.4763361691712293,
      "learning_rate": 9.162809214194851e-07,
      "loss": 0.8335,
      "step": 215
    },
    {
      "epoch": 0.5958023019634394,
      "grad_norm": 3.292224110319897,
      "learning_rate": 9.122724997105646e-07,
      "loss": 0.8465,
      "step": 220
    },
    {
      "epoch": 0.6093432633716994,
      "grad_norm": 3.566456464228769,
      "learning_rate": 9.0817954944662e-07,
      "loss": 0.8257,
      "step": 225
    },
    {
      "epoch": 0.6228842247799594,
      "grad_norm": 3.6677580289310088,
      "learning_rate": 9.040029098084643e-07,
      "loss": 0.8479,
      "step": 230
    },
    {
      "epoch": 0.6364251861882194,
      "grad_norm": 3.4606320291357546,
      "learning_rate": 8.997434371358092e-07,
      "loss": 0.8412,
      "step": 235
    },
    {
      "epoch": 0.6499661475964793,
      "grad_norm": 3.2432058195015467,
      "learning_rate": 8.954020047516884e-07,
      "loss": 0.7984,
      "step": 240
    },
    {
      "epoch": 0.6635071090047393,
      "grad_norm": 3.24817794736711,
      "learning_rate": 8.909795027833996e-07,
      "loss": 0.834,
      "step": 245
    },
    {
      "epoch": 0.6770480704129993,
      "grad_norm": 3.653634156312455,
      "learning_rate": 8.864768379800016e-07,
      "loss": 0.8285,
      "step": 250
    },
    {
      "epoch": 0.6770480704129993,
      "eval_loss": 0.8261091709136963,
      "eval_runtime": 182.9038,
      "eval_samples_per_second": 57.407,
      "eval_steps_per_second": 0.902,
      "step": 250
    },
    {
      "epoch": 0.6905890318212593,
      "grad_norm": 3.5255613554356384,
      "learning_rate": 8.81894933526402e-07,
      "loss": 0.8322,
      "step": 255
    },
    {
      "epoch": 0.7041299932295193,
      "grad_norm": 3.727303124024427,
      "learning_rate": 8.772347288540763e-07,
      "loss": 0.8384,
      "step": 260
    },
    {
      "epoch": 0.7176709546377793,
      "grad_norm": 3.731075409838065,
      "learning_rate": 8.724971794484555e-07,
      "loss": 0.8263,
      "step": 265
    },
    {
      "epoch": 0.7312119160460393,
      "grad_norm": 3.6981830618689484,
      "learning_rate": 8.676832566530221e-07,
      "loss": 0.8165,
      "step": 270
    },
    {
      "epoch": 0.7447528774542993,
      "grad_norm": 3.498794505216147,
      "learning_rate": 8.62793947470155e-07,
      "loss": 0.8037,
      "step": 275
    },
    {
      "epoch": 0.7582938388625592,
      "grad_norm": 3.5635605377852024,
      "learning_rate": 8.578302543587629e-07,
      "loss": 0.7896,
      "step": 280
    },
    {
      "epoch": 0.7718348002708192,
      "grad_norm": 3.63798214652303,
      "learning_rate": 8.527931950287506e-07,
      "loss": 0.8129,
      "step": 285
    },
    {
      "epoch": 0.7853757616790792,
      "grad_norm": 3.518857236471562,
      "learning_rate": 8.47683802232356e-07,
      "loss": 0.8182,
      "step": 290
    },
    {
      "epoch": 0.7989167230873392,
      "grad_norm": 3.5608373625732694,
      "learning_rate": 8.425031235524045e-07,
      "loss": 0.8256,
      "step": 295
    },
    {
      "epoch": 0.8124576844955992,
      "grad_norm": 3.6755850332375286,
      "learning_rate": 8.372522211875224e-07,
      "loss": 0.8132,
      "step": 300
    },
    {
      "epoch": 0.8124576844955992,
      "eval_loss": 0.8165345191955566,
      "eval_runtime": 182.8879,
      "eval_samples_per_second": 57.412,
      "eval_steps_per_second": 0.902,
      "step": 300
    },
    {
      "epoch": 0.8259986459038592,
      "grad_norm": 3.596321476724727,
      "learning_rate": 8.319321717343535e-07,
      "loss": 0.8149,
      "step": 305
    },
    {
      "epoch": 0.8395396073121192,
      "grad_norm": 3.7225323914645747,
      "learning_rate": 8.265440659668234e-07,
      "loss": 0.8124,
      "step": 310
    },
    {
      "epoch": 0.8530805687203792,
      "grad_norm": 3.5663300158516016,
      "learning_rate": 8.210890086124977e-07,
      "loss": 0.7906,
      "step": 315
    },
    {
      "epoch": 0.8666215301286392,
      "grad_norm": 3.4256587260000604,
      "learning_rate": 8.155681181260776e-07,
      "loss": 0.81,
      "step": 320
    },
    {
      "epoch": 0.8801624915368991,
      "grad_norm": 3.4929314527584077,
      "learning_rate": 8.099825264600842e-07,
      "loss": 0.818,
      "step": 325
    },
    {
      "epoch": 0.8937034529451591,
      "grad_norm": 3.5559616159997742,
      "learning_rate": 8.04333378832772e-07,
      "loss": 0.8067,
      "step": 330
    },
    {
      "epoch": 0.9072444143534191,
      "grad_norm": 3.3786924383796544,
      "learning_rate": 7.98621833493324e-07,
      "loss": 0.802,
      "step": 335
    },
    {
      "epoch": 0.9207853757616791,
      "grad_norm": 3.6281675631205377,
      "learning_rate": 7.928490614843757e-07,
      "loss": 0.7991,
      "step": 340
    },
    {
      "epoch": 0.9343263371699391,
      "grad_norm": 3.5783671775239068,
      "learning_rate": 7.870162464019143e-07,
      "loss": 0.7961,
      "step": 345
    },
    {
      "epoch": 0.9478672985781991,
      "grad_norm": 3.523112182862875,
      "learning_rate": 7.811245841526062e-07,
      "loss": 0.7997,
      "step": 350
    },
    {
      "epoch": 0.9478672985781991,
      "eval_loss": 0.8075853586196899,
      "eval_runtime": 182.8371,
      "eval_samples_per_second": 57.428,
      "eval_steps_per_second": 0.902,
      "step": 350
    },
    {
      "epoch": 0.9614082599864591,
      "grad_norm": 3.327823061584991,
      "learning_rate": 7.75175282708598e-07,
      "loss": 0.7955,
      "step": 355
    },
    {
      "epoch": 0.9749492213947191,
      "grad_norm": 3.353254883691996,
      "learning_rate": 7.691695618598466e-07,
      "loss": 0.7945,
      "step": 360
    },
    {
      "epoch": 0.988490182802979,
      "grad_norm": 3.4573365027199547,
      "learning_rate": 7.631086529640229e-07,
      "loss": 0.8037,
      "step": 365
    },
    {
      "epoch": 1.002031144211239,
      "grad_norm": 3.704603819142361,
      "learning_rate": 7.569937986940475e-07,
      "loss": 0.7833,
      "step": 370
    },
    {
      "epoch": 1.015572105619499,
      "grad_norm": 3.338765249405955,
      "learning_rate": 7.508262527833028e-07,
      "loss": 0.736,
      "step": 375
    },
    {
      "epoch": 1.029113067027759,
      "grad_norm": 3.5005014577141154,
      "learning_rate": 7.446072797685799e-07,
      "loss": 0.7393,
      "step": 380
    },
    {
      "epoch": 1.042654028436019,
      "grad_norm": 3.5415652623772753,
      "learning_rate": 7.383381547308099e-07,
      "loss": 0.7461,
      "step": 385
    },
    {
      "epoch": 1.0561949898442788,
      "grad_norm": 3.430934641227424,
      "learning_rate": 7.320201630336318e-07,
      "loss": 0.7359,
      "step": 390
    },
    {
      "epoch": 1.0697359512525388,
      "grad_norm": 3.6468766296195896,
      "learning_rate": 7.256546000598551e-07,
      "loss": 0.7306,
      "step": 395
    },
    {
      "epoch": 1.0832769126607988,
      "grad_norm": 4.055674159253643,
      "learning_rate": 7.192427709458655e-07,
      "loss": 0.7335,
      "step": 400
    },
    {
      "epoch": 1.0832769126607988,
      "eval_loss": 0.804972231388092,
      "eval_runtime": 182.8412,
      "eval_samples_per_second": 57.427,
      "eval_steps_per_second": 0.902,
      "step": 400
    },
    {
      "epoch": 1.0968178740690588,
      "grad_norm": 3.7131013971600244,
      "learning_rate": 7.127859903140311e-07,
      "loss": 0.7346,
      "step": 405
    },
    {
      "epoch": 1.1103588354773188,
      "grad_norm": 3.496845579088122,
      "learning_rate": 7.062855820031659e-07,
      "loss": 0.7409,
      "step": 410
    },
    {
      "epoch": 1.1238997968855788,
      "grad_norm": 3.652963743841258,
      "learning_rate": 6.997428787971005e-07,
      "loss": 0.7236,
      "step": 415
    },
    {
      "epoch": 1.1374407582938388,
      "grad_norm": 3.6316016258989916,
      "learning_rate": 6.93159222151422e-07,
      "loss": 0.734,
      "step": 420
    },
    {
      "epoch": 1.1509817197020988,
      "grad_norm": 4.106556416672479,
      "learning_rate": 6.86535961918433e-07,
      "loss": 0.7425,
      "step": 425
    },
    {
      "epoch": 1.1645226811103588,
      "grad_norm": 3.4406596945092764,
      "learning_rate": 6.798744560703904e-07,
      "loss": 0.7271,
      "step": 430
    },
    {
      "epoch": 1.1780636425186188,
      "grad_norm": 3.5644142212223886,
      "learning_rate": 6.731760704210802e-07,
      "loss": 0.729,
      "step": 435
    },
    {
      "epoch": 1.1916046039268788,
      "grad_norm": 3.5207779341108316,
      "learning_rate": 6.66442178345783e-07,
      "loss": 0.7295,
      "step": 440
    },
    {
      "epoch": 1.2051455653351388,
      "grad_norm": 3.8426763234364643,
      "learning_rate": 6.596741604996897e-07,
      "loss": 0.7285,
      "step": 445
    },
    {
      "epoch": 1.2186865267433988,
      "grad_norm": 3.537416567969802,
      "learning_rate": 6.528734045348248e-07,
      "loss": 0.7466,
      "step": 450
    },
    {
      "epoch": 1.2186865267433988,
      "eval_loss": 0.7996942400932312,
      "eval_runtime": 182.8629,
      "eval_samples_per_second": 57.42,
      "eval_steps_per_second": 0.902,
      "step": 450
    },
    {
      "epoch": 1.2322274881516588,
      "grad_norm": 3.5566566741869603,
      "learning_rate": 6.460413048155354e-07,
      "loss": 0.7291,
      "step": 455
    },
    {
      "epoch": 1.2457684495599188,
      "grad_norm": 3.6983928953608323,
      "learning_rate": 6.391792621326027e-07,
      "loss": 0.7502,
      "step": 460
    },
    {
      "epoch": 1.2593094109681786,
      "grad_norm": 3.679881191981186,
      "learning_rate": 6.322886834160377e-07,
      "loss": 0.7375,
      "step": 465
    },
    {
      "epoch": 1.2728503723764386,
      "grad_norm": 3.635647992456833,
      "learning_rate": 6.253709814466167e-07,
      "loss": 0.7446,
      "step": 470
    },
    {
      "epoch": 1.2863913337846986,
      "grad_norm": 3.7784368285832675,
      "learning_rate": 6.184275745662179e-07,
      "loss": 0.7307,
      "step": 475
    },
    {
      "epoch": 1.2999322951929586,
      "grad_norm": 3.975716487359526,
      "learning_rate": 6.114598863870178e-07,
      "loss": 0.727,
      "step": 480
    },
    {
      "epoch": 1.3134732566012186,
      "grad_norm": 3.8898112879763818,
      "learning_rate": 6.044693454996059e-07,
      "loss": 0.7351,
      "step": 485
    },
    {
      "epoch": 1.3270142180094786,
      "grad_norm": 3.883701681898452,
      "learning_rate": 5.974573851800817e-07,
      "loss": 0.7376,
      "step": 490
    },
    {
      "epoch": 1.3405551794177386,
      "grad_norm": 3.5463439014644695,
      "learning_rate": 5.904254430961869e-07,
      "loss": 0.7413,
      "step": 495
    },
    {
      "epoch": 1.3540961408259986,
      "grad_norm": 3.5029550543033374,
      "learning_rate": 5.833749610125401e-07,
      "loss": 0.7264,
      "step": 500
    },
    {
      "epoch": 1.3540961408259986,
      "eval_loss": 0.7957150340080261,
      "eval_runtime": 182.9392,
      "eval_samples_per_second": 57.396,
      "eval_steps_per_second": 0.902,
      "step": 500
    },
    {
      "epoch": 1.3676371022342586,
      "grad_norm": 3.9766897722099124,
      "learning_rate": 5.763073844950309e-07,
      "loss": 0.7327,
      "step": 505
    },
    {
      "epoch": 1.3811780636425186,
      "grad_norm": 3.80954804006231,
      "learning_rate": 5.69224162614434e-07,
      "loss": 0.7443,
      "step": 510
    },
    {
      "epoch": 1.3947190250507786,
      "grad_norm": 3.524183540063996,
      "learning_rate": 5.621267476493052e-07,
      "loss": 0.7345,
      "step": 515
    },
    {
      "epoch": 1.4082599864590386,
      "grad_norm": 3.620522282934874,
      "learning_rate": 5.550165947882196e-07,
      "loss": 0.7236,
      "step": 520
    },
    {
      "epoch": 1.4218009478672986,
      "grad_norm": 3.768208934721801,
      "learning_rate": 5.478951618314132e-07,
      "loss": 0.7165,
      "step": 525
    },
    {
      "epoch": 1.4353419092755586,
      "grad_norm": 3.5182066154018345,
      "learning_rate": 5.407639088918888e-07,
      "loss": 0.7297,
      "step": 530
    },
    {
      "epoch": 1.4488828706838186,
      "grad_norm": 3.824867160497226,
      "learning_rate": 5.33624298096048e-07,
      "loss": 0.7307,
      "step": 535
    },
    {
      "epoch": 1.4624238320920786,
      "grad_norm": 3.488297423659722,
      "learning_rate": 5.264777932839104e-07,
      "loss": 0.7487,
      "step": 540
    },
    {
      "epoch": 1.4759647935003386,
      "grad_norm": 3.493644177453455,
      "learning_rate": 5.193258597089809e-07,
      "loss": 0.7291,
      "step": 545
    },
    {
      "epoch": 1.4895057549085986,
      "grad_norm": 3.6179715470055824,
      "learning_rate": 5.121699637378282e-07,
      "loss": 0.7286,
      "step": 550
    },
    {
      "epoch": 1.4895057549085986,
      "eval_loss": 0.7910673022270203,
      "eval_runtime": 182.8784,
      "eval_samples_per_second": 57.415,
      "eval_steps_per_second": 0.902,
      "step": 550
    },
    {
      "epoch": 1.5030467163168586,
      "grad_norm": 3.5922430809215777,
      "learning_rate": 5.050115725494339e-07,
      "loss": 0.7179,
      "step": 555
    },
    {
      "epoch": 1.5165876777251186,
      "grad_norm": 3.4032672519789786,
      "learning_rate": 4.978521538343764e-07,
      "loss": 0.7366,
      "step": 560
    },
    {
      "epoch": 1.5301286391333786,
      "grad_norm": 3.6567954327498327,
      "learning_rate": 4.906931754939083e-07,
      "loss": 0.7391,
      "step": 565
    },
    {
      "epoch": 1.5436696005416386,
      "grad_norm": 3.5644839916468243,
      "learning_rate": 4.835361053389921e-07,
      "loss": 0.7288,
      "step": 570
    },
    {
      "epoch": 1.5572105619498986,
      "grad_norm": 3.650299513434273,
      "learning_rate": 4.763824107893532e-07,
      "loss": 0.729,
      "step": 575
    },
    {
      "epoch": 1.5707515233581584,
      "grad_norm": 3.8055072261605116,
      "learning_rate": 4.692335585726145e-07,
      "loss": 0.7177,
      "step": 580
    },
    {
      "epoch": 1.5842924847664184,
      "grad_norm": 3.5985047420048697,
      "learning_rate": 4.6209101442357116e-07,
      "loss": 0.7224,
      "step": 585
    },
    {
      "epoch": 1.5978334461746784,
      "grad_norm": 3.547274876419971,
      "learning_rate": 4.549562427836701e-07,
      "loss": 0.7327,
      "step": 590
    },
    {
      "epoch": 1.6113744075829384,
      "grad_norm": 3.511159114275773,
      "learning_rate": 4.4783070650075537e-07,
      "loss": 0.7406,
      "step": 595
    },
    {
      "epoch": 1.6249153689911984,
      "grad_norm": 3.46127753253687,
      "learning_rate": 4.407158665291376e-07,
      "loss": 0.7251,
      "step": 600
    },
    {
      "epoch": 1.6249153689911984,
      "eval_loss": 0.7875649333000183,
      "eval_runtime": 182.8944,
      "eval_samples_per_second": 57.41,
      "eval_steps_per_second": 0.902,
      "step": 600
    },
    {
      "epoch": 1.6384563303994584,
      "grad_norm": 3.409510769647259,
      "learning_rate": 4.336131816300548e-07,
      "loss": 0.719,
      "step": 605
    },
    {
      "epoch": 1.6519972918077184,
      "grad_norm": 3.5447361212717925,
      "learning_rate": 4.265241080725808e-07,
      "loss": 0.7287,
      "step": 610
    },
    {
      "epoch": 1.6655382532159784,
      "grad_norm": 4.059666587103208,
      "learning_rate": 4.194500993350453e-07,
      "loss": 0.7399,
      "step": 615
    },
    {
      "epoch": 1.6790792146242384,
      "grad_norm": 3.774832103683009,
      "learning_rate": 4.1239260580702634e-07,
      "loss": 0.7386,
      "step": 620
    },
    {
      "epoch": 1.6926201760324981,
      "grad_norm": 3.5490582421055885,
      "learning_rate": 4.053530744919749e-07,
      "loss": 0.7246,
      "step": 625
    },
    {
      "epoch": 1.7061611374407581,
      "grad_norm": 3.747116763144729,
      "learning_rate": 3.983329487105363e-07,
      "loss": 0.7372,
      "step": 630
    },
    {
      "epoch": 1.7197020988490181,
      "grad_norm": 3.4822384940575795,
      "learning_rate": 3.913336678046232e-07,
      "loss": 0.7323,
      "step": 635
    },
    {
      "epoch": 1.7332430602572781,
      "grad_norm": 3.825022969793332,
      "learning_rate": 3.8435666684230726e-07,
      "loss": 0.7097,
      "step": 640
    },
    {
      "epoch": 1.7467840216655381,
      "grad_norm": 3.696972226883697,
      "learning_rate": 3.774033763235861e-07,
      "loss": 0.7304,
      "step": 645
    },
    {
      "epoch": 1.7603249830737981,
      "grad_norm": 3.821368374115622,
      "learning_rate": 3.7047522188708606e-07,
      "loss": 0.727,
      "step": 650
    },
    {
      "epoch": 1.7603249830737981,
      "eval_loss": 0.7839689254760742,
      "eval_runtime": 182.875,
      "eval_samples_per_second": 57.416,
      "eval_steps_per_second": 0.902,
      "step": 650
    },
    {
      "epoch": 1.7738659444820581,
      "grad_norm": 3.665010892266409,
      "learning_rate": 3.635736240177627e-07,
      "loss": 0.7223,
      "step": 655
    },
    {
      "epoch": 1.7874069058903181,
      "grad_norm": 3.8705676996293916,
      "learning_rate": 3.5669999775565816e-07,
      "loss": 0.7313,
      "step": 660
    },
    {
      "epoch": 1.8009478672985781,
      "grad_norm": 3.5842111135660057,
      "learning_rate": 3.4985575240577365e-07,
      "loss": 0.7321,
      "step": 665
    },
    {
      "epoch": 1.8144888287068381,
      "grad_norm": 3.4424140077908767,
      "learning_rate": 3.4304229124911856e-07,
      "loss": 0.7316,
      "step": 670
    },
    {
      "epoch": 1.8280297901150981,
      "grad_norm": 3.4068267304866646,
      "learning_rate": 3.362610112549955e-07,
      "loss": 0.704,
      "step": 675
    },
    {
      "epoch": 1.8415707515233581,
      "grad_norm": 3.685081317112416,
      "learning_rate": 3.295133027945778e-07,
      "loss": 0.7167,
      "step": 680
    },
    {
      "epoch": 1.8551117129316181,
      "grad_norm": 3.7176624860947345,
      "learning_rate": 3.228005493558402e-07,
      "loss": 0.7094,
      "step": 685
    },
    {
      "epoch": 1.8686526743398781,
      "grad_norm": 3.718001701556429,
      "learning_rate": 3.1612412725990305e-07,
      "loss": 0.7312,
      "step": 690
    },
    {
      "epoch": 1.8821936357481381,
      "grad_norm": 3.491455379714816,
      "learning_rate": 3.0948540537884185e-07,
      "loss": 0.7264,
      "step": 695
    },
    {
      "epoch": 1.8957345971563981,
      "grad_norm": 3.7382317800607376,
      "learning_rate": 3.0288574485502756e-07,
      "loss": 0.7277,
      "step": 700
    },
    {
      "epoch": 1.8957345971563981,
      "eval_loss": 0.7811039090156555,
      "eval_runtime": 182.9386,
      "eval_samples_per_second": 57.396,
      "eval_steps_per_second": 0.902,
      "step": 700
    },
    {
      "epoch": 1.9092755585646581,
      "grad_norm": 3.4296946924569442,
      "learning_rate": 2.9632649882205083e-07,
      "loss": 0.7287,
      "step": 705
    },
    {
      "epoch": 1.9228165199729181,
      "grad_norm": 3.810691598305239,
      "learning_rate": 2.8980901212728723e-07,
      "loss": 0.7193,
      "step": 710
    },
    {
      "epoch": 1.9363574813811781,
      "grad_norm": 3.3584278752072496,
      "learning_rate": 2.833346210561619e-07,
      "loss": 0.7112,
      "step": 715
    },
    {
      "epoch": 1.9498984427894381,
      "grad_norm": 3.5743899932997185,
      "learning_rate": 2.769046530581708e-07,
      "loss": 0.7235,
      "step": 720
    },
    {
      "epoch": 1.9634394041976981,
      "grad_norm": 3.8331759574897375,
      "learning_rate": 2.705204264747125e-07,
      "loss": 0.724,
      "step": 725
    },
    {
      "epoch": 1.9769803656059581,
      "grad_norm": 3.6084594988279908,
      "learning_rate": 2.6418325026878665e-07,
      "loss": 0.7156,
      "step": 730
    },
    {
      "epoch": 1.9905213270142181,
      "grad_norm": 3.56873955236049,
      "learning_rate": 2.578944237566174e-07,
      "loss": 0.7163,
      "step": 735
    },
    {
      "epoch": 2.004062288422478,
      "grad_norm": 4.103162581101771,
      "learning_rate": 2.5165523634125337e-07,
      "loss": 0.7161,
      "step": 740
    },
    {
      "epoch": 2.017603249830738,
      "grad_norm": 3.7064092978900844,
      "learning_rate": 2.454669672481996e-07,
      "loss": 0.6754,
      "step": 745
    },
    {
      "epoch": 2.031144211238998,
      "grad_norm": 3.6900542156905196,
      "learning_rate": 2.393308852631373e-07,
      "loss": 0.6724,
      "step": 750
    },
    {
      "epoch": 2.031144211238998,
      "eval_loss": 0.7857776880264282,
      "eval_runtime": 183.0378,
      "eval_samples_per_second": 57.365,
      "eval_steps_per_second": 0.901,
      "step": 750
    },
    {
      "epoch": 2.044685172647258,
      "grad_norm": 3.5536516436485255,
      "learning_rate": 2.3324824847178494e-07,
      "loss": 0.6887,
      "step": 755
    },
    {
      "epoch": 2.058226134055518,
      "grad_norm": 3.7965121612299564,
      "learning_rate": 2.2722030400194975e-07,
      "loss": 0.666,
      "step": 760
    },
    {
      "epoch": 2.071767095463778,
      "grad_norm": 3.7936989153822007,
      "learning_rate": 2.2124828776782955e-07,
      "loss": 0.6789,
      "step": 765
    },
    {
      "epoch": 2.085308056872038,
      "grad_norm": 3.5401816973807043,
      "learning_rate": 2.1533342421661228e-07,
      "loss": 0.6665,
      "step": 770
    },
    {
      "epoch": 2.0988490182802977,
      "grad_norm": 3.7787164350636555,
      "learning_rate": 2.0947692607742618e-07,
      "loss": 0.6755,
      "step": 775
    },
    {
      "epoch": 2.1123899796885577,
      "grad_norm": 3.9382718196335267,
      "learning_rate": 2.0367999411269282e-07,
      "loss": 0.6821,
      "step": 780
    },
    {
      "epoch": 2.1259309410968177,
      "grad_norm": 3.8112238429444782,
      "learning_rate": 1.9794381687193456e-07,
      "loss": 0.6805,
      "step": 785
    },
    {
      "epoch": 2.1394719025050777,
      "grad_norm": 3.8744335724512204,
      "learning_rate": 1.9226957044808494e-07,
      "loss": 0.6657,
      "step": 790
    },
    {
      "epoch": 2.1530128639133377,
      "grad_norm": 3.7804638456283346,
      "learning_rate": 1.866584182363528e-07,
      "loss": 0.6789,
      "step": 795
    },
    {
      "epoch": 2.1665538253215977,
      "grad_norm": 3.8021451485147963,
      "learning_rate": 1.811115106956918e-07,
      "loss": 0.6883,
      "step": 800
    },
    {
      "epoch": 2.1665538253215977,
      "eval_loss": 0.7850033044815063,
      "eval_runtime": 182.9949,
      "eval_samples_per_second": 57.379,
      "eval_steps_per_second": 0.902,
      "step": 800
    },
    {
      "epoch": 2.1800947867298577,
      "grad_norm": 3.864215108703362,
      "learning_rate": 1.7562998511291943e-07,
      "loss": 0.6811,
      "step": 805
    },
    {
      "epoch": 2.1936357481381177,
      "grad_norm": 3.8300913859664667,
      "learning_rate": 1.702149653695395e-07,
      "loss": 0.6766,
      "step": 810
    },
    {
      "epoch": 2.2071767095463777,
      "grad_norm": 3.8635188226813666,
      "learning_rate": 1.6486756171131062e-07,
      "loss": 0.675,
      "step": 815
    },
    {
      "epoch": 2.2207176709546377,
      "grad_norm": 3.9283113465457355,
      "learning_rate": 1.595888705206128e-07,
      "loss": 0.6678,
      "step": 820
    },
    {
      "epoch": 2.2342586323628977,
      "grad_norm": 3.726732058605602,
      "learning_rate": 1.5437997409165476e-07,
      "loss": 0.6733,
      "step": 825
    },
    {
      "epoch": 2.2477995937711577,
      "grad_norm": 3.6073721199402318,
      "learning_rate": 1.4924194040856973e-07,
      "loss": 0.6794,
      "step": 830
    },
    {
      "epoch": 2.2613405551794177,
      "grad_norm": 3.920320006141431,
      "learning_rate": 1.4417582292644691e-07,
      "loss": 0.6871,
      "step": 835
    },
    {
      "epoch": 2.2748815165876777,
      "grad_norm": 3.781911882917061,
      "learning_rate": 1.3918266035534027e-07,
      "loss": 0.6774,
      "step": 840
    },
    {
      "epoch": 2.2884224779959377,
      "grad_norm": 4.013058729107201,
      "learning_rate": 1.3426347644730047e-07,
      "loss": 0.6816,
      "step": 845
    },
    {
      "epoch": 2.3019634394041977,
      "grad_norm": 3.776810144116961,
      "learning_rate": 1.2941927978647526e-07,
      "loss": 0.6709,
      "step": 850
    },
    {
      "epoch": 2.3019634394041977,
      "eval_loss": 0.7840232253074646,
      "eval_runtime": 182.9773,
      "eval_samples_per_second": 57.384,
      "eval_steps_per_second": 0.902,
      "step": 850
    },
    {
      "epoch": 2.3155044008124577,
      "grad_norm": 4.0267886200903344,
      "learning_rate": 1.2465106358231753e-07,
      "loss": 0.6765,
      "step": 855
    },
    {
      "epoch": 2.3290453622207177,
      "grad_norm": 3.79514301881657,
      "learning_rate": 1.1995980546594775e-07,
      "loss": 0.6633,
      "step": 860
    },
    {
      "epoch": 2.3425863236289777,
      "grad_norm": 3.7026026349952086,
      "learning_rate": 1.153464672897091e-07,
      "loss": 0.678,
      "step": 865
    },
    {
      "epoch": 2.3561272850372377,
      "grad_norm": 3.977299316585606,
      "learning_rate": 1.108119949299578e-07,
      "loss": 0.6875,
      "step": 870
    },
    {
      "epoch": 2.3696682464454977,
      "grad_norm": 3.9505974017459544,
      "learning_rate": 1.0635731809312992e-07,
      "loss": 0.6955,
      "step": 875
    },
    {
      "epoch": 2.3832092078537577,
      "grad_norm": 3.9944161998447116,
      "learning_rate": 1.0198335012512271e-07,
      "loss": 0.6843,
      "step": 880
    },
    {
      "epoch": 2.3967501692620177,
      "grad_norm": 4.08644897660094,
      "learning_rate": 9.769098782403041e-08,
      "loss": 0.7081,
      "step": 885
    },
    {
      "epoch": 2.4102911306702777,
      "grad_norm": 4.033807984306314,
      "learning_rate": 9.348111125627278e-08,
      "loss": 0.6758,
      "step": 890
    },
    {
      "epoch": 2.4238320920785377,
      "grad_norm": 3.615156557294799,
      "learning_rate": 8.935458357615583e-08,
      "loss": 0.6718,
      "step": 895
    },
    {
      "epoch": 2.4373730534867977,
      "grad_norm": 3.876477554855966,
      "learning_rate": 8.531225084889654e-08,
      "loss": 0.6598,
      "step": 900
    },
    {
      "epoch": 2.4373730534867977,
      "eval_loss": 0.7834283113479614,
      "eval_runtime": 183.0709,
      "eval_samples_per_second": 57.355,
      "eval_steps_per_second": 0.901,
      "step": 900
    },
    {
      "epoch": 2.4509140148950577,
      "grad_norm": 3.6988949380997336,
      "learning_rate": 8.135494187715475e-08,
      "loss": 0.6603,
      "step": 905
    },
    {
      "epoch": 2.4644549763033177,
      "grad_norm": 3.931452073089016,
      "learning_rate": 7.748346803110295e-08,
      "loss": 0.6832,
      "step": 910
    },
    {
      "epoch": 2.4779959377115777,
      "grad_norm": 3.8160191178139047,
      "learning_rate": 7.369862308207025e-08,
      "loss": 0.6583,
      "step": 915
    },
    {
      "epoch": 2.4915368991198377,
      "grad_norm": 3.8469114382677874,
      "learning_rate": 7.000118303979463e-08,
      "loss": 0.6808,
      "step": 920
    },
    {
      "epoch": 2.5050778605280977,
      "grad_norm": 3.7497256852290115,
      "learning_rate": 6.639190599331746e-08,
      "loss": 0.6762,
      "step": 925
    },
    {
      "epoch": 2.518618821936357,
      "grad_norm": 3.663314489242292,
      "learning_rate": 6.287153195555173e-08,
      "loss": 0.6663,
      "step": 930
    },
    {
      "epoch": 2.5321597833446177,
      "grad_norm": 3.8930436232018333,
      "learning_rate": 5.944078271155639e-08,
      "loss": 0.6648,
      "step": 935
    },
    {
      "epoch": 2.545700744752877,
      "grad_norm": 3.6616608952378904,
      "learning_rate": 5.610036167054838e-08,
      "loss": 0.6596,
      "step": 940
    },
    {
      "epoch": 2.5592417061611377,
      "grad_norm": 3.986331709466641,
      "learning_rate": 5.2850953721682635e-08,
      "loss": 0.669,
      "step": 945
    },
    {
      "epoch": 2.572782667569397,
      "grad_norm": 3.960581833122488,
      "learning_rate": 4.969322509362761e-08,
      "loss": 0.674,
      "step": 950
    },
    {
      "epoch": 2.572782667569397,
      "eval_loss": 0.7830283641815186,
      "eval_runtime": 182.8342,
      "eval_samples_per_second": 57.429,
      "eval_steps_per_second": 0.902,
      "step": 950
    },
    {
      "epoch": 2.5863236289776577,
      "grad_norm": 3.8618441431288217,
      "learning_rate": 4.662782321796849e-08,
      "loss": 0.6713,
      "step": 955
    },
    {
      "epoch": 2.599864590385917,
      "grad_norm": 3.5409233232724335,
      "learning_rate": 4.365537659646418e-08,
      "loss": 0.6747,
      "step": 960
    },
    {
      "epoch": 2.6134055517941777,
      "grad_norm": 3.744738202206873,
      "learning_rate": 4.0776494672184356e-08,
      "loss": 0.6846,
      "step": 965
    },
    {
      "epoch": 2.626946513202437,
      "grad_norm": 3.727245201869487,
      "learning_rate": 3.799176770455526e-08,
      "loss": 0.6616,
      "step": 970
    },
    {
      "epoch": 2.640487474610697,
      "grad_norm": 3.7258573002382147,
      "learning_rate": 3.530176664833834e-08,
      "loss": 0.675,
      "step": 975
    },
    {
      "epoch": 2.654028436018957,
      "grad_norm": 3.869690791825916,
      "learning_rate": 3.270704303656696e-08,
      "loss": 0.6875,
      "step": 980
    },
    {
      "epoch": 2.667569397427217,
      "grad_norm": 4.064616477774205,
      "learning_rate": 3.020812886746477e-08,
      "loss": 0.6808,
      "step": 985
    },
    {
      "epoch": 2.681110358835477,
      "grad_norm": 3.704371552936023,
      "learning_rate": 2.7805536495370373e-08,
      "loss": 0.6687,
      "step": 990
    },
    {
      "epoch": 2.694651320243737,
      "grad_norm": 4.055603563401218,
      "learning_rate": 2.5499758525688197e-08,
      "loss": 0.6584,
      "step": 995
    },
    {
      "epoch": 2.708192281651997,
      "grad_norm": 3.749384489878185,
      "learning_rate": 2.329126771388995e-08,
      "loss": 0.656,
      "step": 1000
    },
    {
      "epoch": 2.708192281651997,
      "eval_loss": 0.7828182578086853,
      "eval_runtime": 182.6838,
      "eval_samples_per_second": 57.476,
      "eval_steps_per_second": 0.903,
      "step": 1000
    },
    {
      "epoch": 2.721733243060257,
      "grad_norm": 3.869599930871293,
      "learning_rate": 2.1180516868584464e-08,
      "loss": 0.6716,
      "step": 1005
    },
    {
      "epoch": 2.735274204468517,
      "grad_norm": 3.930506514677681,
      "learning_rate": 1.916793875867839e-08,
      "loss": 0.6822,
      "step": 1010
    },
    {
      "epoch": 2.748815165876777,
      "grad_norm": 3.8320813018837616,
      "learning_rate": 1.7253946024645472e-08,
      "loss": 0.6627,
      "step": 1015
    },
    {
      "epoch": 2.762356127285037,
      "grad_norm": 3.986150848206186,
      "learning_rate": 1.5438931093921804e-08,
      "loss": 0.6727,
      "step": 1020
    },
    {
      "epoch": 2.775897088693297,
      "grad_norm": 3.764718626888124,
      "learning_rate": 1.372326610044705e-08,
      "loss": 0.6618,
      "step": 1025
    },
    {
      "epoch": 2.789438050101557,
      "grad_norm": 3.7384921853849393,
      "learning_rate": 1.2107302808364638e-08,
      "loss": 0.6614,
      "step": 1030
    },
    {
      "epoch": 2.802979011509817,
      "grad_norm": 3.8134941063063867,
      "learning_rate": 1.0591372539900056e-08,
      "loss": 0.6665,
      "step": 1035
    },
    {
      "epoch": 2.816519972918077,
      "grad_norm": 3.7735093872780197,
      "learning_rate": 9.175786107429085e-09,
      "loss": 0.6643,
      "step": 1040
    },
    {
      "epoch": 2.830060934326337,
      "grad_norm": 3.987550484105897,
      "learning_rate": 7.860833749751772e-09,
      "loss": 0.6739,
      "step": 1045
    },
    {
      "epoch": 2.843601895734597,
      "grad_norm": 3.9380769036431893,
      "learning_rate": 6.6467850725848705e-09,
      "loss": 0.6741,
      "step": 1050
    },
    {
      "epoch": 2.843601895734597,
      "eval_loss": 0.7824584245681763,
      "eval_runtime": 183.0944,
      "eval_samples_per_second": 57.347,
      "eval_steps_per_second": 0.901,
      "step": 1050
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 4.011659897593238,
      "learning_rate": 5.5338889932838306e-09,
      "loss": 0.6842,
      "step": 1055
    },
    {
      "epoch": 2.870683818551117,
      "grad_norm": 3.808051194891409,
      "learning_rate": 4.5223736898076235e-09,
      "loss": 0.6806,
      "step": 1060
    },
    {
      "epoch": 2.884224779959377,
      "grad_norm": 3.8839001927108856,
      "learning_rate": 3.612446553934723e-09,
      "loss": 0.6679,
      "step": 1065
    },
    {
      "epoch": 2.897765741367637,
      "grad_norm": 3.9713825006231054,
      "learning_rate": 2.804294148741948e-09,
      "loss": 0.6733,
      "step": 1070
    },
    {
      "epoch": 2.911306702775897,
      "grad_norm": 3.783696934827739,
      "learning_rate": 2.0980821703527886e-09,
      "loss": 0.6736,
      "step": 1075
    },
    {
      "epoch": 2.924847664184157,
      "grad_norm": 3.770344601359413,
      "learning_rate": 1.4939554139648536e-09,
      "loss": 0.6649,
      "step": 1080
    },
    {
      "epoch": 2.938388625592417,
      "grad_norm": 3.8357523688679565,
      "learning_rate": 9.920377441623994e-10,
      "loss": 0.6718,
      "step": 1085
    },
    {
      "epoch": 2.951929587000677,
      "grad_norm": 4.03615476407359,
      "learning_rate": 5.92432069520199e-10,
      "loss": 0.6805,
      "step": 1090
    },
    {
      "epoch": 2.9654705484089368,
      "grad_norm": 3.8321791456875283,
      "learning_rate": 2.9522032150419705e-10,
      "loss": 0.6629,
      "step": 1095
    },
    {
      "epoch": 2.979011509817197,
      "grad_norm": 3.9373632743696056,
      "learning_rate": 1.0046343767294852e-10,
      "loss": 0.6592,
      "step": 1100
    },
    {
      "epoch": 2.979011509817197,
      "eval_loss": 0.7824262976646423,
      "eval_runtime": 182.8708,
      "eval_samples_per_second": 57.418,
      "eval_steps_per_second": 0.902,
      "step": 1100
    },
    {
      "epoch": 2.9925524712254568,
      "grad_norm": 3.6875439191522075,
      "learning_rate": 8.201349183611927e-12,
      "loss": 0.6534,
      "step": 1105
    },
    {
      "epoch": 2.997968855788761,
      "step": 1107,
      "total_flos": 6527139780231168.0,
      "train_loss": 0.7629147509572306,
      "train_runtime": 18558.2767,
      "train_samples_per_second": 15.276,
      "train_steps_per_second": 0.06
    }
  ],
  "logging_steps": 5,
  "max_steps": 1107,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6527139780231168.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}