{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9957446808510637,
  "eval_steps": 500,
  "global_step": 1320,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02269503546099291,
      "grad_norm": 2.4400690431145335,
      "learning_rate": 5e-06,
      "loss": 0.9537,
      "step": 10
    },
    {
      "epoch": 0.04539007092198582,
      "grad_norm": 1.0371553805334768,
      "learning_rate": 5e-06,
      "loss": 0.8304,
      "step": 20
    },
    {
      "epoch": 0.06808510638297872,
      "grad_norm": 0.8207371359291633,
      "learning_rate": 5e-06,
      "loss": 0.7801,
      "step": 30
    },
    {
      "epoch": 0.09078014184397164,
      "grad_norm": 0.7210586495004421,
      "learning_rate": 5e-06,
      "loss": 0.7522,
      "step": 40
    },
    {
      "epoch": 0.11347517730496454,
      "grad_norm": 0.815177982428463,
      "learning_rate": 5e-06,
      "loss": 0.739,
      "step": 50
    },
    {
      "epoch": 0.13617021276595745,
      "grad_norm": 2.1607006669022053,
      "learning_rate": 5e-06,
      "loss": 0.731,
      "step": 60
    },
    {
      "epoch": 0.15886524822695036,
      "grad_norm": 0.8260032479277283,
      "learning_rate": 5e-06,
      "loss": 0.7245,
      "step": 70
    },
    {
      "epoch": 0.18156028368794327,
      "grad_norm": 0.6043557996158753,
      "learning_rate": 5e-06,
      "loss": 0.709,
      "step": 80
    },
    {
      "epoch": 0.20425531914893616,
      "grad_norm": 0.6817812588474443,
      "learning_rate": 5e-06,
      "loss": 0.7065,
      "step": 90
    },
    {
      "epoch": 0.22695035460992907,
      "grad_norm": 0.6633270827705589,
      "learning_rate": 5e-06,
      "loss": 0.7042,
      "step": 100
    },
    {
      "epoch": 0.24964539007092199,
      "grad_norm": 0.8788640026205816,
      "learning_rate": 5e-06,
      "loss": 0.6937,
      "step": 110
    },
    {
      "epoch": 0.2723404255319149,
      "grad_norm": 0.6318189161156693,
      "learning_rate": 5e-06,
      "loss": 0.6875,
      "step": 120
    },
    {
      "epoch": 0.2950354609929078,
      "grad_norm": 0.6116775844736505,
      "learning_rate": 5e-06,
      "loss": 0.6882,
      "step": 130
    },
    {
      "epoch": 0.3177304964539007,
      "grad_norm": 0.5442399734240484,
      "learning_rate": 5e-06,
      "loss": 0.6829,
      "step": 140
    },
    {
      "epoch": 0.3404255319148936,
      "grad_norm": 0.772917712294573,
      "learning_rate": 5e-06,
      "loss": 0.6778,
      "step": 150
    },
    {
      "epoch": 0.36312056737588655,
      "grad_norm": 0.5042750571740913,
      "learning_rate": 5e-06,
      "loss": 0.6826,
      "step": 160
    },
    {
      "epoch": 0.38581560283687943,
      "grad_norm": 0.5272476451832978,
      "learning_rate": 5e-06,
      "loss": 0.6791,
      "step": 170
    },
    {
      "epoch": 0.4085106382978723,
      "grad_norm": 0.6477667523961554,
      "learning_rate": 5e-06,
      "loss": 0.6778,
      "step": 180
    },
    {
      "epoch": 0.43120567375886526,
      "grad_norm": 0.6825001542564298,
      "learning_rate": 5e-06,
      "loss": 0.6727,
      "step": 190
    },
    {
      "epoch": 0.45390070921985815,
      "grad_norm": 0.7223077265284692,
      "learning_rate": 5e-06,
      "loss": 0.6766,
      "step": 200
    },
    {
      "epoch": 0.4765957446808511,
      "grad_norm": 0.7610671419515379,
      "learning_rate": 5e-06,
      "loss": 0.6754,
      "step": 210
    },
    {
      "epoch": 0.49929078014184397,
      "grad_norm": 0.5918397478370985,
      "learning_rate": 5e-06,
      "loss": 0.6707,
      "step": 220
    },
    {
      "epoch": 0.5219858156028369,
      "grad_norm": 0.5937553589475805,
      "learning_rate": 5e-06,
      "loss": 0.6695,
      "step": 230
    },
    {
      "epoch": 0.5446808510638298,
      "grad_norm": 0.6308694971546095,
      "learning_rate": 5e-06,
      "loss": 0.6706,
      "step": 240
    },
    {
      "epoch": 0.5673758865248227,
      "grad_norm": 0.5145635067713508,
      "learning_rate": 5e-06,
      "loss": 0.6638,
      "step": 250
    },
    {
      "epoch": 0.5900709219858156,
      "grad_norm": 0.8960582295099536,
      "learning_rate": 5e-06,
      "loss": 0.659,
      "step": 260
    },
    {
      "epoch": 0.6127659574468085,
      "grad_norm": 0.5714201206969344,
      "learning_rate": 5e-06,
      "loss": 0.6648,
      "step": 270
    },
    {
      "epoch": 0.6354609929078014,
      "grad_norm": 0.5218865972600176,
      "learning_rate": 5e-06,
      "loss": 0.6692,
      "step": 280
    },
    {
      "epoch": 0.6581560283687943,
      "grad_norm": 0.908425551953448,
      "learning_rate": 5e-06,
      "loss": 0.6603,
      "step": 290
    },
    {
      "epoch": 0.6808510638297872,
      "grad_norm": 0.5937599038511239,
      "learning_rate": 5e-06,
      "loss": 0.6616,
      "step": 300
    },
    {
      "epoch": 0.7035460992907802,
      "grad_norm": 0.5791492049959488,
      "learning_rate": 5e-06,
      "loss": 0.6573,
      "step": 310
    },
    {
      "epoch": 0.7262411347517731,
      "grad_norm": 0.5544912073084471,
      "learning_rate": 5e-06,
      "loss": 0.6561,
      "step": 320
    },
    {
      "epoch": 0.7489361702127659,
      "grad_norm": 0.6161595885325214,
      "learning_rate": 5e-06,
      "loss": 0.6535,
      "step": 330
    },
    {
      "epoch": 0.7716312056737589,
      "grad_norm": 0.5292955751782062,
      "learning_rate": 5e-06,
      "loss": 0.6575,
      "step": 340
    },
    {
      "epoch": 0.7943262411347518,
      "grad_norm": 0.5125989549019483,
      "learning_rate": 5e-06,
      "loss": 0.6508,
      "step": 350
    },
    {
      "epoch": 0.8170212765957446,
      "grad_norm": 0.4999207740358057,
      "learning_rate": 5e-06,
      "loss": 0.652,
      "step": 360
    },
    {
      "epoch": 0.8397163120567376,
      "grad_norm": 0.5737129637592092,
      "learning_rate": 5e-06,
      "loss": 0.6487,
      "step": 370
    },
    {
      "epoch": 0.8624113475177305,
      "grad_norm": 0.697556121614893,
      "learning_rate": 5e-06,
      "loss": 0.6537,
      "step": 380
    },
    {
      "epoch": 0.8851063829787233,
      "grad_norm": 0.8803924578376542,
      "learning_rate": 5e-06,
      "loss": 0.6528,
      "step": 390
    },
    {
      "epoch": 0.9078014184397163,
      "grad_norm": 0.7209729745886211,
      "learning_rate": 5e-06,
      "loss": 0.6499,
      "step": 400
    },
    {
      "epoch": 0.9304964539007092,
      "grad_norm": 0.631225301000223,
      "learning_rate": 5e-06,
      "loss": 0.6493,
      "step": 410
    },
    {
      "epoch": 0.9531914893617022,
      "grad_norm": 0.5302593707285189,
      "learning_rate": 5e-06,
      "loss": 0.651,
      "step": 420
    },
    {
      "epoch": 0.975886524822695,
      "grad_norm": 0.5216458807768983,
      "learning_rate": 5e-06,
      "loss": 0.6438,
      "step": 430
    },
    {
      "epoch": 0.9985815602836879,
      "grad_norm": 0.5024039028278129,
      "learning_rate": 5e-06,
      "loss": 0.6478,
      "step": 440
    },
    {
      "epoch": 0.9985815602836879,
      "eval_loss": 0.64774489402771,
      "eval_runtime": 312.1779,
      "eval_samples_per_second": 38.036,
      "eval_steps_per_second": 0.596,
      "step": 440
    },
    {
      "epoch": 1.0212765957446808,
      "grad_norm": 0.6519926768853314,
      "learning_rate": 5e-06,
      "loss": 0.6161,
      "step": 450
    },
    {
      "epoch": 1.0439716312056737,
      "grad_norm": 0.575055735064131,
      "learning_rate": 5e-06,
      "loss": 0.589,
      "step": 460
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 0.9082080666134971,
      "learning_rate": 5e-06,
      "loss": 0.5912,
      "step": 470
    },
    {
      "epoch": 1.0893617021276596,
      "grad_norm": 0.583715919536177,
      "learning_rate": 5e-06,
      "loss": 0.5913,
      "step": 480
    },
    {
      "epoch": 1.1120567375886525,
      "grad_norm": 0.5975048919359871,
      "learning_rate": 5e-06,
      "loss": 0.5976,
      "step": 490
    },
    {
      "epoch": 1.1347517730496455,
      "grad_norm": 0.5418419185217853,
      "learning_rate": 5e-06,
      "loss": 0.5931,
      "step": 500
    },
    {
      "epoch": 1.1574468085106382,
      "grad_norm": 0.638492552707852,
      "learning_rate": 5e-06,
      "loss": 0.5913,
      "step": 510
    },
    {
      "epoch": 1.1801418439716311,
      "grad_norm": 0.5450441529585099,
      "learning_rate": 5e-06,
      "loss": 0.5888,
      "step": 520
    },
    {
      "epoch": 1.202836879432624,
      "grad_norm": 0.667596474347774,
      "learning_rate": 5e-06,
      "loss": 0.5926,
      "step": 530
    },
    {
      "epoch": 1.225531914893617,
      "grad_norm": 0.5901671600943978,
      "learning_rate": 5e-06,
      "loss": 0.5905,
      "step": 540
    },
    {
      "epoch": 1.24822695035461,
      "grad_norm": 0.5680819366472465,
      "learning_rate": 5e-06,
      "loss": 0.5915,
      "step": 550
    },
    {
      "epoch": 1.270921985815603,
      "grad_norm": 0.7905201905390314,
      "learning_rate": 5e-06,
      "loss": 0.5948,
      "step": 560
    },
    {
      "epoch": 1.2936170212765958,
      "grad_norm": 0.5581341122325792,
      "learning_rate": 5e-06,
      "loss": 0.5904,
      "step": 570
    },
    {
      "epoch": 1.3163120567375888,
      "grad_norm": 0.5968971489169257,
      "learning_rate": 5e-06,
      "loss": 0.5927,
      "step": 580
    },
    {
      "epoch": 1.3390070921985815,
      "grad_norm": 0.6127986193776547,
      "learning_rate": 5e-06,
      "loss": 0.5889,
      "step": 590
    },
    {
      "epoch": 1.3617021276595744,
      "grad_norm": 0.5540746342979398,
      "learning_rate": 5e-06,
      "loss": 0.5998,
      "step": 600
    },
    {
      "epoch": 1.3843971631205674,
      "grad_norm": 0.6578848756357453,
      "learning_rate": 5e-06,
      "loss": 0.5933,
      "step": 610
    },
    {
      "epoch": 1.4070921985815603,
      "grad_norm": 0.752983431690844,
      "learning_rate": 5e-06,
      "loss": 0.5967,
      "step": 620
    },
    {
      "epoch": 1.4297872340425533,
      "grad_norm": 0.6009296481522326,
      "learning_rate": 5e-06,
      "loss": 0.5886,
      "step": 630
    },
    {
      "epoch": 1.452482269503546,
      "grad_norm": 0.6855926736224828,
      "learning_rate": 5e-06,
      "loss": 0.5931,
      "step": 640
    },
    {
      "epoch": 1.475177304964539,
      "grad_norm": 0.5390133364015494,
      "learning_rate": 5e-06,
      "loss": 0.5926,
      "step": 650
    },
    {
      "epoch": 1.4978723404255319,
      "grad_norm": 0.6338310115530009,
      "learning_rate": 5e-06,
      "loss": 0.5969,
      "step": 660
    },
    {
      "epoch": 1.5205673758865248,
      "grad_norm": 0.5298069642997515,
      "learning_rate": 5e-06,
      "loss": 0.5868,
      "step": 670
    },
    {
      "epoch": 1.5432624113475177,
      "grad_norm": 0.5254529387411777,
      "learning_rate": 5e-06,
      "loss": 0.5893,
      "step": 680
    },
    {
      "epoch": 1.5659574468085107,
      "grad_norm": 0.5090891733108535,
      "learning_rate": 5e-06,
      "loss": 0.5937,
      "step": 690
    },
    {
      "epoch": 1.5886524822695036,
      "grad_norm": 0.6276632546884366,
      "learning_rate": 5e-06,
      "loss": 0.5856,
      "step": 700
    },
    {
      "epoch": 1.6113475177304966,
      "grad_norm": 0.5618088874975952,
      "learning_rate": 5e-06,
      "loss": 0.5796,
      "step": 710
    },
    {
      "epoch": 1.6340425531914895,
      "grad_norm": 0.5451154576203426,
      "learning_rate": 5e-06,
      "loss": 0.584,
      "step": 720
    },
    {
      "epoch": 1.6567375886524822,
      "grad_norm": 0.5751145914968212,
      "learning_rate": 5e-06,
      "loss": 0.5882,
      "step": 730
    },
    {
      "epoch": 1.6794326241134752,
      "grad_norm": 0.5457326197852193,
      "learning_rate": 5e-06,
      "loss": 0.5893,
      "step": 740
    },
    {
      "epoch": 1.702127659574468,
      "grad_norm": 0.5538870750639828,
      "learning_rate": 5e-06,
      "loss": 0.5895,
      "step": 750
    },
    {
      "epoch": 1.724822695035461,
      "grad_norm": 0.6482629759445236,
      "learning_rate": 5e-06,
      "loss": 0.5883,
      "step": 760
    },
    {
      "epoch": 1.7475177304964538,
      "grad_norm": 0.604796917686035,
      "learning_rate": 5e-06,
      "loss": 0.5908,
      "step": 770
    },
    {
      "epoch": 1.7702127659574467,
      "grad_norm": 0.5570179578262068,
      "learning_rate": 5e-06,
      "loss": 0.5935,
      "step": 780
    },
    {
      "epoch": 1.7929078014184396,
      "grad_norm": 0.5359469482507023,
      "learning_rate": 5e-06,
      "loss": 0.5833,
      "step": 790
    },
    {
      "epoch": 1.8156028368794326,
      "grad_norm": 0.6340496302965002,
      "learning_rate": 5e-06,
      "loss": 0.5853,
      "step": 800
    },
    {
      "epoch": 1.8382978723404255,
      "grad_norm": 0.5742298835017674,
      "learning_rate": 5e-06,
      "loss": 0.5823,
      "step": 810
    },
    {
      "epoch": 1.8609929078014185,
      "grad_norm": 0.5827541219871901,
      "learning_rate": 5e-06,
      "loss": 0.5903,
      "step": 820
    },
    {
      "epoch": 1.8836879432624114,
      "grad_norm": 0.528321132004614,
      "learning_rate": 5e-06,
      "loss": 0.5832,
      "step": 830
    },
    {
      "epoch": 1.9063829787234043,
      "grad_norm": 0.565101788703942,
      "learning_rate": 5e-06,
      "loss": 0.5855,
      "step": 840
    },
    {
      "epoch": 1.9290780141843973,
      "grad_norm": 0.601792497070637,
      "learning_rate": 5e-06,
      "loss": 0.5842,
      "step": 850
    },
    {
      "epoch": 1.9517730496453902,
      "grad_norm": 0.5088677439673144,
      "learning_rate": 5e-06,
      "loss": 0.5821,
      "step": 860
    },
    {
      "epoch": 1.974468085106383,
      "grad_norm": 0.6353626361966195,
      "learning_rate": 5e-06,
      "loss": 0.5865,
      "step": 870
    },
    {
      "epoch": 1.9971631205673759,
      "grad_norm": 0.627955882058217,
      "learning_rate": 5e-06,
      "loss": 0.5869,
      "step": 880
    },
    {
      "epoch": 1.9994326241134752,
      "eval_loss": 0.63065505027771,
      "eval_runtime": 299.005,
      "eval_samples_per_second": 39.712,
      "eval_steps_per_second": 0.622,
      "step": 881
    },
    {
      "epoch": 2.0198581560283686,
      "grad_norm": 0.7421753177971604,
      "learning_rate": 5e-06,
      "loss": 0.5575,
      "step": 890
    },
    {
      "epoch": 2.0425531914893615,
      "grad_norm": 0.7475116659953,
      "learning_rate": 5e-06,
      "loss": 0.5333,
      "step": 900
    },
    {
      "epoch": 2.0652482269503545,
      "grad_norm": 0.6094678843754806,
      "learning_rate": 5e-06,
      "loss": 0.5281,
      "step": 910
    },
    {
      "epoch": 2.0879432624113474,
      "grad_norm": 0.6416679206326653,
      "learning_rate": 5e-06,
      "loss": 0.5307,
      "step": 920
    },
    {
      "epoch": 2.1106382978723404,
      "grad_norm": 0.6149270041451872,
      "learning_rate": 5e-06,
      "loss": 0.5253,
      "step": 930
    },
    {
      "epoch": 2.1333333333333333,
      "grad_norm": 0.5944382793137648,
      "learning_rate": 5e-06,
      "loss": 0.5321,
      "step": 940
    },
    {
      "epoch": 2.1560283687943262,
      "grad_norm": 0.5206404660542666,
      "learning_rate": 5e-06,
      "loss": 0.5327,
      "step": 950
    },
    {
      "epoch": 2.178723404255319,
      "grad_norm": 0.5421603421972422,
      "learning_rate": 5e-06,
      "loss": 0.5283,
      "step": 960
    },
    {
      "epoch": 2.201418439716312,
      "grad_norm": 0.630367827768556,
      "learning_rate": 5e-06,
      "loss": 0.5312,
      "step": 970
    },
    {
      "epoch": 2.224113475177305,
      "grad_norm": 0.5289786942278032,
      "learning_rate": 5e-06,
      "loss": 0.5281,
      "step": 980
    },
    {
      "epoch": 2.246808510638298,
      "grad_norm": 0.6186809984064454,
      "learning_rate": 5e-06,
      "loss": 0.5287,
      "step": 990
    },
    {
      "epoch": 2.269503546099291,
      "grad_norm": 0.5941474628916863,
      "learning_rate": 5e-06,
      "loss": 0.5349,
      "step": 1000
    },
    {
      "epoch": 2.2921985815602834,
      "grad_norm": 0.570443876715086,
      "learning_rate": 5e-06,
      "loss": 0.5295,
      "step": 1010
    },
    {
      "epoch": 2.3148936170212764,
      "grad_norm": 0.6792218051762158,
      "learning_rate": 5e-06,
      "loss": 0.5323,
      "step": 1020
    },
    {
      "epoch": 2.3375886524822693,
      "grad_norm": 0.5535124105821935,
      "learning_rate": 5e-06,
      "loss": 0.5277,
      "step": 1030
    },
    {
      "epoch": 2.3602836879432623,
      "grad_norm": 0.667112587037914,
      "learning_rate": 5e-06,
      "loss": 0.5276,
      "step": 1040
    },
    {
      "epoch": 2.382978723404255,
      "grad_norm": 0.5987387148760719,
      "learning_rate": 5e-06,
      "loss": 0.5295,
      "step": 1050
    },
    {
      "epoch": 2.405673758865248,
      "grad_norm": 0.5484898675236806,
      "learning_rate": 5e-06,
      "loss": 0.5328,
      "step": 1060
    },
    {
      "epoch": 2.428368794326241,
      "grad_norm": 0.5736373550799053,
      "learning_rate": 5e-06,
      "loss": 0.534,
      "step": 1070
    },
    {
      "epoch": 2.451063829787234,
      "grad_norm": 0.5626598264859632,
      "learning_rate": 5e-06,
      "loss": 0.5356,
      "step": 1080
    },
    {
      "epoch": 2.473758865248227,
      "grad_norm": 0.6153434121318484,
      "learning_rate": 5e-06,
      "loss": 0.5306,
      "step": 1090
    },
    {
      "epoch": 2.49645390070922,
      "grad_norm": 0.7252891365142108,
      "learning_rate": 5e-06,
      "loss": 0.5307,
      "step": 1100
    },
    {
      "epoch": 2.519148936170213,
      "grad_norm": 0.6153968835692192,
      "learning_rate": 5e-06,
      "loss": 0.5331,
      "step": 1110
    },
    {
      "epoch": 2.541843971631206,
      "grad_norm": 0.5969808676825302,
      "learning_rate": 5e-06,
      "loss": 0.5363,
      "step": 1120
    },
    {
      "epoch": 2.5645390070921987,
      "grad_norm": 0.5692435999805617,
      "learning_rate": 5e-06,
      "loss": 0.5391,
      "step": 1130
    },
    {
      "epoch": 2.5872340425531917,
      "grad_norm": 0.6180618030016519,
      "learning_rate": 5e-06,
      "loss": 0.5299,
      "step": 1140
    },
    {
      "epoch": 2.6099290780141846,
      "grad_norm": 0.543137746234749,
      "learning_rate": 5e-06,
      "loss": 0.5374,
      "step": 1150
    },
    {
      "epoch": 2.6326241134751776,
      "grad_norm": 0.5200265379748215,
      "learning_rate": 5e-06,
      "loss": 0.5309,
      "step": 1160
    },
    {
      "epoch": 2.65531914893617,
      "grad_norm": 0.5194882503023576,
      "learning_rate": 5e-06,
      "loss": 0.5293,
      "step": 1170
    },
    {
      "epoch": 2.678014184397163,
      "grad_norm": 0.5570786417431203,
      "learning_rate": 5e-06,
      "loss": 0.5316,
      "step": 1180
    },
    {
      "epoch": 2.700709219858156,
      "grad_norm": 0.6007520350434941,
      "learning_rate": 5e-06,
      "loss": 0.5347,
      "step": 1190
    },
    {
      "epoch": 2.723404255319149,
      "grad_norm": 0.5662193506846984,
      "learning_rate": 5e-06,
      "loss": 0.5339,
      "step": 1200
    },
    {
      "epoch": 2.746099290780142,
      "grad_norm": 0.6675198025626778,
      "learning_rate": 5e-06,
      "loss": 0.5346,
      "step": 1210
    },
    {
      "epoch": 2.7687943262411348,
      "grad_norm": 0.6689385730256848,
      "learning_rate": 5e-06,
      "loss": 0.5303,
      "step": 1220
    },
    {
      "epoch": 2.7914893617021277,
      "grad_norm": 0.6400833035990827,
      "learning_rate": 5e-06,
      "loss": 0.5315,
      "step": 1230
    },
    {
      "epoch": 2.8141843971631206,
      "grad_norm": 0.6835204230746162,
      "learning_rate": 5e-06,
      "loss": 0.5367,
      "step": 1240
    },
    {
      "epoch": 2.8368794326241136,
      "grad_norm": 0.5862756798841194,
      "learning_rate": 5e-06,
      "loss": 0.5346,
      "step": 1250
    },
    {
      "epoch": 2.8595744680851065,
      "grad_norm": 0.5727844470422598,
      "learning_rate": 5e-06,
      "loss": 0.5376,
      "step": 1260
    },
    {
      "epoch": 2.8822695035460995,
      "grad_norm": 0.5445857583169009,
      "learning_rate": 5e-06,
      "loss": 0.5294,
      "step": 1270
    },
    {
      "epoch": 2.904964539007092,
      "grad_norm": 0.6161117247407584,
      "learning_rate": 5e-06,
      "loss": 0.5309,
      "step": 1280
    },
    {
      "epoch": 2.927659574468085,
      "grad_norm": 0.6027552334532725,
      "learning_rate": 5e-06,
      "loss": 0.5359,
      "step": 1290
    },
    {
      "epoch": 2.950354609929078,
      "grad_norm": 0.6005574689486347,
      "learning_rate": 5e-06,
      "loss": 0.531,
      "step": 1300
    },
    {
      "epoch": 2.9730496453900708,
      "grad_norm": 0.5590382510647179,
      "learning_rate": 5e-06,
      "loss": 0.5348,
      "step": 1310
    },
    {
      "epoch": 2.9957446808510637,
      "grad_norm": 0.5507647179314145,
      "learning_rate": 5e-06,
      "loss": 0.5325,
      "step": 1320
    },
    {
      "epoch": 2.9957446808510637,
      "eval_loss": 0.6309003829956055,
      "eval_runtime": 301.6902,
      "eval_samples_per_second": 39.358,
      "eval_steps_per_second": 0.617,
      "step": 1320
    },
    {
      "epoch": 2.9957446808510637,
      "step": 1320,
      "total_flos": 2210839784325120.0,
      "train_loss": 0.6033726125052481,
      "train_runtime": 44159.109,
      "train_samples_per_second": 15.326,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 10,
  "max_steps": 1320,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2210839784325120.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}