|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 100.0, |
|
"eval_steps": 500, |
|
"global_step": 12900, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.7751937984496124, |
|
"grad_norm": 126.4291763305664, |
|
"learning_rate": 0.0003969, |
|
"loss": 57.794, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 47.50299072265625, |
|
"eval_runtime": 18.4297, |
|
"eval_samples_per_second": 2.604, |
|
"eval_steps_per_second": 0.651, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.550387596899225, |
|
"grad_norm": 110.41481018066406, |
|
"learning_rate": 0.00039380000000000003, |
|
"loss": 45.635, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 42.4621696472168, |
|
"eval_runtime": 17.0743, |
|
"eval_samples_per_second": 2.811, |
|
"eval_steps_per_second": 0.703, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.3255813953488373, |
|
"grad_norm": 119.6634750366211, |
|
"learning_rate": 0.0003907, |
|
"loss": 43.6742, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 40.13833999633789, |
|
"eval_runtime": 18.4155, |
|
"eval_samples_per_second": 2.606, |
|
"eval_steps_per_second": 0.652, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 3.10077519379845, |
|
"grad_norm": 118.22651672363281, |
|
"learning_rate": 0.0003876, |
|
"loss": 40.4744, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.875968992248062, |
|
"grad_norm": 134.10031127929688, |
|
"learning_rate": 0.0003845, |
|
"loss": 37.5286, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 41.196353912353516, |
|
"eval_runtime": 17.0535, |
|
"eval_samples_per_second": 2.815, |
|
"eval_steps_per_second": 0.704, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 4.651162790697675, |
|
"grad_norm": 80.17464447021484, |
|
"learning_rate": 0.00038140000000000005, |
|
"loss": 33.7618, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 34.43735122680664, |
|
"eval_runtime": 19.3245, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 0.621, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 5.426356589147287, |
|
"grad_norm": 372.6417236328125, |
|
"learning_rate": 0.00037830000000000003, |
|
"loss": 31.5899, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.824180603027344, |
|
"eval_runtime": 17.2226, |
|
"eval_samples_per_second": 2.787, |
|
"eval_steps_per_second": 0.697, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 6.2015503875969, |
|
"grad_norm": 226.50611877441406, |
|
"learning_rate": 0.0003752, |
|
"loss": 31.2097, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 6.976744186046512, |
|
"grad_norm": 99.72407531738281, |
|
"learning_rate": 0.0003721, |
|
"loss": 29.0727, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 33.32234191894531, |
|
"eval_runtime": 18.5858, |
|
"eval_samples_per_second": 2.583, |
|
"eval_steps_per_second": 0.646, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 7.751937984496124, |
|
"grad_norm": 107.07185363769531, |
|
"learning_rate": 0.000369, |
|
"loss": 27.8483, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.962478637695312, |
|
"eval_runtime": 17.3408, |
|
"eval_samples_per_second": 2.768, |
|
"eval_steps_per_second": 0.692, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 8.527131782945737, |
|
"grad_norm": 82.22681427001953, |
|
"learning_rate": 0.0003659, |
|
"loss": 26.0904, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.708364486694336, |
|
"eval_runtime": 18.5166, |
|
"eval_samples_per_second": 2.592, |
|
"eval_steps_per_second": 0.648, |
|
"step": 1161 |
|
}, |
|
{ |
|
"epoch": 9.30232558139535, |
|
"grad_norm": 184.18087768554688, |
|
"learning_rate": 0.00036280000000000004, |
|
"loss": 26.1043, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.808786392211914, |
|
"eval_runtime": 17.2835, |
|
"eval_samples_per_second": 2.777, |
|
"eval_steps_per_second": 0.694, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 10.077519379844961, |
|
"grad_norm": 82.96527862548828, |
|
"learning_rate": 0.0003597, |
|
"loss": 26.1461, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 10.852713178294573, |
|
"grad_norm": 76.06727600097656, |
|
"learning_rate": 0.00035660000000000005, |
|
"loss": 24.3038, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.336084365844727, |
|
"eval_runtime": 18.0862, |
|
"eval_samples_per_second": 2.654, |
|
"eval_steps_per_second": 0.663, |
|
"step": 1419 |
|
}, |
|
{ |
|
"epoch": 11.627906976744185, |
|
"grad_norm": 57.900630950927734, |
|
"learning_rate": 0.00035350000000000003, |
|
"loss": 23.6493, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.202993392944336, |
|
"eval_runtime": 16.9497, |
|
"eval_samples_per_second": 2.832, |
|
"eval_steps_per_second": 0.708, |
|
"step": 1548 |
|
}, |
|
{ |
|
"epoch": 12.4031007751938, |
|
"grad_norm": 60.565650939941406, |
|
"learning_rate": 0.0003504, |
|
"loss": 23.9146, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.08062744140625, |
|
"eval_runtime": 17.9909, |
|
"eval_samples_per_second": 2.668, |
|
"eval_steps_per_second": 0.667, |
|
"step": 1677 |
|
}, |
|
{ |
|
"epoch": 13.178294573643411, |
|
"grad_norm": 75.52674865722656, |
|
"learning_rate": 0.00034730000000000004, |
|
"loss": 23.033, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 13.953488372093023, |
|
"grad_norm": 45.41800308227539, |
|
"learning_rate": 0.0003442, |
|
"loss": 21.9133, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.397354125976562, |
|
"eval_runtime": 17.0643, |
|
"eval_samples_per_second": 2.813, |
|
"eval_steps_per_second": 0.703, |
|
"step": 1806 |
|
}, |
|
{ |
|
"epoch": 14.728682170542635, |
|
"grad_norm": 46.21736145019531, |
|
"learning_rate": 0.0003411, |
|
"loss": 22.3071, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 32.09249496459961, |
|
"eval_runtime": 17.5136, |
|
"eval_samples_per_second": 2.741, |
|
"eval_steps_per_second": 0.685, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 15.503875968992247, |
|
"grad_norm": 29.049968719482422, |
|
"learning_rate": 0.000338, |
|
"loss": 21.0819, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.936742782592773, |
|
"eval_runtime": 17.2024, |
|
"eval_samples_per_second": 2.79, |
|
"eval_steps_per_second": 0.698, |
|
"step": 2064 |
|
}, |
|
{ |
|
"epoch": 16.27906976744186, |
|
"grad_norm": 70.16988372802734, |
|
"learning_rate": 0.0003349, |
|
"loss": 21.0089, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.042001724243164, |
|
"eval_runtime": 17.9057, |
|
"eval_samples_per_second": 2.681, |
|
"eval_steps_per_second": 0.67, |
|
"step": 2193 |
|
}, |
|
{ |
|
"epoch": 17.05426356589147, |
|
"grad_norm": 77.12342834472656, |
|
"learning_rate": 0.00033180000000000004, |
|
"loss": 21.1193, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 17.829457364341085, |
|
"grad_norm": 40.200958251953125, |
|
"learning_rate": 0.0003287, |
|
"loss": 20.9169, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.293771743774414, |
|
"eval_runtime": 17.1083, |
|
"eval_samples_per_second": 2.806, |
|
"eval_steps_per_second": 0.701, |
|
"step": 2322 |
|
}, |
|
{ |
|
"epoch": 18.6046511627907, |
|
"grad_norm": 31.95384979248047, |
|
"learning_rate": 0.0003256, |
|
"loss": 19.7935, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.394454956054688, |
|
"eval_runtime": 18.6297, |
|
"eval_samples_per_second": 2.577, |
|
"eval_steps_per_second": 0.644, |
|
"step": 2451 |
|
}, |
|
{ |
|
"epoch": 19.37984496124031, |
|
"grad_norm": 54.4798698425293, |
|
"learning_rate": 0.00032250000000000003, |
|
"loss": 19.8749, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.845718383789062, |
|
"eval_runtime": 17.4128, |
|
"eval_samples_per_second": 2.757, |
|
"eval_steps_per_second": 0.689, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 20.155038759689923, |
|
"grad_norm": 61.0432243347168, |
|
"learning_rate": 0.0003194, |
|
"loss": 19.6959, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 20.930232558139537, |
|
"grad_norm": 64.08226013183594, |
|
"learning_rate": 0.0003163, |
|
"loss": 19.2973, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.071313858032227, |
|
"eval_runtime": 18.6354, |
|
"eval_samples_per_second": 2.576, |
|
"eval_steps_per_second": 0.644, |
|
"step": 2709 |
|
}, |
|
{ |
|
"epoch": 21.705426356589147, |
|
"grad_norm": 54.097721099853516, |
|
"learning_rate": 0.0003132, |
|
"loss": 18.5436, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.084577560424805, |
|
"eval_runtime": 17.3593, |
|
"eval_samples_per_second": 2.765, |
|
"eval_steps_per_second": 0.691, |
|
"step": 2838 |
|
}, |
|
{ |
|
"epoch": 22.48062015503876, |
|
"grad_norm": 37.09468460083008, |
|
"learning_rate": 0.00031010000000000006, |
|
"loss": 18.5996, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.88102149963379, |
|
"eval_runtime": 17.5696, |
|
"eval_samples_per_second": 2.732, |
|
"eval_steps_per_second": 0.683, |
|
"step": 2967 |
|
}, |
|
{ |
|
"epoch": 23.25581395348837, |
|
"grad_norm": 40.900291442871094, |
|
"learning_rate": 0.00030700000000000004, |
|
"loss": 19.1228, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.301599502563477, |
|
"eval_runtime": 17.3752, |
|
"eval_samples_per_second": 2.763, |
|
"eval_steps_per_second": 0.691, |
|
"step": 3096 |
|
}, |
|
{ |
|
"epoch": 24.031007751937985, |
|
"grad_norm": 16.258365631103516, |
|
"learning_rate": 0.0003039, |
|
"loss": 18.2692, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 24.8062015503876, |
|
"grad_norm": 53.048091888427734, |
|
"learning_rate": 0.0003008, |
|
"loss": 18.0519, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.71547508239746, |
|
"eval_runtime": 18.0599, |
|
"eval_samples_per_second": 2.658, |
|
"eval_steps_per_second": 0.664, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 25.58139534883721, |
|
"grad_norm": 24.09917640686035, |
|
"learning_rate": 0.0002977, |
|
"loss": 17.7073, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 28.716806411743164, |
|
"eval_runtime": 17.32, |
|
"eval_samples_per_second": 2.771, |
|
"eval_steps_per_second": 0.693, |
|
"step": 3354 |
|
}, |
|
{ |
|
"epoch": 26.356589147286822, |
|
"grad_norm": 86.21530151367188, |
|
"learning_rate": 0.0002946, |
|
"loss": 17.5055, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 28.989931106567383, |
|
"eval_runtime": 17.6249, |
|
"eval_samples_per_second": 2.723, |
|
"eval_steps_per_second": 0.681, |
|
"step": 3483 |
|
}, |
|
{ |
|
"epoch": 27.131782945736433, |
|
"grad_norm": 20.129249572753906, |
|
"learning_rate": 0.0002915, |
|
"loss": 18.054, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 27.906976744186046, |
|
"grad_norm": 22.863677978515625, |
|
"learning_rate": 0.0002884, |
|
"loss": 17.4854, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.19437599182129, |
|
"eval_runtime": 17.6593, |
|
"eval_samples_per_second": 2.718, |
|
"eval_steps_per_second": 0.68, |
|
"step": 3612 |
|
}, |
|
{ |
|
"epoch": 28.68217054263566, |
|
"grad_norm": 13.843172073364258, |
|
"learning_rate": 0.0002853, |
|
"loss": 17.0048, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.28289031982422, |
|
"eval_runtime": 18.1574, |
|
"eval_samples_per_second": 2.644, |
|
"eval_steps_per_second": 0.661, |
|
"step": 3741 |
|
}, |
|
{ |
|
"epoch": 29.45736434108527, |
|
"grad_norm": 20.727048873901367, |
|
"learning_rate": 0.0002822, |
|
"loss": 16.8731, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.1208438873291, |
|
"eval_runtime": 16.9906, |
|
"eval_samples_per_second": 2.825, |
|
"eval_steps_per_second": 0.706, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 30.232558139534884, |
|
"grad_norm": 19.721155166625977, |
|
"learning_rate": 0.0002791, |
|
"loss": 16.683, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.758291244506836, |
|
"eval_runtime": 17.7849, |
|
"eval_samples_per_second": 2.699, |
|
"eval_steps_per_second": 0.675, |
|
"step": 3999 |
|
}, |
|
{ |
|
"epoch": 31.007751937984494, |
|
"grad_norm": 25.496213912963867, |
|
"learning_rate": 0.00027600000000000004, |
|
"loss": 16.9178, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 31.782945736434108, |
|
"grad_norm": 26.640628814697266, |
|
"learning_rate": 0.0002729, |
|
"loss": 16.6109, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.623199462890625, |
|
"eval_runtime": 16.843, |
|
"eval_samples_per_second": 2.85, |
|
"eval_steps_per_second": 0.712, |
|
"step": 4128 |
|
}, |
|
{ |
|
"epoch": 32.55813953488372, |
|
"grad_norm": 25.067975997924805, |
|
"learning_rate": 0.0002698, |
|
"loss": 15.8261, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.416189193725586, |
|
"eval_runtime": 17.6752, |
|
"eval_samples_per_second": 2.716, |
|
"eval_steps_per_second": 0.679, |
|
"step": 4257 |
|
}, |
|
{ |
|
"epoch": 33.333333333333336, |
|
"grad_norm": 30.74283218383789, |
|
"learning_rate": 0.00026670000000000003, |
|
"loss": 16.9002, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.438751220703125, |
|
"eval_runtime": 16.8785, |
|
"eval_samples_per_second": 2.844, |
|
"eval_steps_per_second": 0.711, |
|
"step": 4386 |
|
}, |
|
{ |
|
"epoch": 34.10852713178294, |
|
"grad_norm": 72.6050033569336, |
|
"learning_rate": 0.0002636, |
|
"loss": 15.7742, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 34.883720930232556, |
|
"grad_norm": 93.86290740966797, |
|
"learning_rate": 0.00026050000000000004, |
|
"loss": 16.3081, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.97564697265625, |
|
"eval_runtime": 17.4518, |
|
"eval_samples_per_second": 2.75, |
|
"eval_steps_per_second": 0.688, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 35.65891472868217, |
|
"grad_norm": 24.93766975402832, |
|
"learning_rate": 0.0002574, |
|
"loss": 15.4745, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 28.821380615234375, |
|
"eval_runtime": 16.9764, |
|
"eval_samples_per_second": 2.827, |
|
"eval_steps_per_second": 0.707, |
|
"step": 4644 |
|
}, |
|
{ |
|
"epoch": 36.434108527131784, |
|
"grad_norm": 33.70745849609375, |
|
"learning_rate": 0.0002543, |
|
"loss": 15.938, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 29.100107192993164, |
|
"eval_runtime": 17.5981, |
|
"eval_samples_per_second": 2.728, |
|
"eval_steps_per_second": 0.682, |
|
"step": 4773 |
|
}, |
|
{ |
|
"epoch": 37.2093023255814, |
|
"grad_norm": 59.88523864746094, |
|
"learning_rate": 0.00025120000000000003, |
|
"loss": 14.9862, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 37.98449612403101, |
|
"grad_norm": 20.979228973388672, |
|
"learning_rate": 0.0002481, |
|
"loss": 15.9947, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.053319931030273, |
|
"eval_runtime": 17.0472, |
|
"eval_samples_per_second": 2.816, |
|
"eval_steps_per_second": 0.704, |
|
"step": 4902 |
|
}, |
|
{ |
|
"epoch": 38.75968992248062, |
|
"grad_norm": 17.90158462524414, |
|
"learning_rate": 0.000245, |
|
"loss": 15.2328, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.62113380432129, |
|
"eval_runtime": 17.318, |
|
"eval_samples_per_second": 2.772, |
|
"eval_steps_per_second": 0.693, |
|
"step": 5031 |
|
}, |
|
{ |
|
"epoch": 39.53488372093023, |
|
"grad_norm": 33.11941909790039, |
|
"learning_rate": 0.00024190000000000003, |
|
"loss": 15.202, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 33.138301849365234, |
|
"eval_runtime": 17.0128, |
|
"eval_samples_per_second": 2.821, |
|
"eval_steps_per_second": 0.705, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 40.310077519379846, |
|
"grad_norm": 15.685113906860352, |
|
"learning_rate": 0.0002388, |
|
"loss": 15.0583, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.408859252929688, |
|
"eval_runtime": 17.9066, |
|
"eval_samples_per_second": 2.681, |
|
"eval_steps_per_second": 0.67, |
|
"step": 5289 |
|
}, |
|
{ |
|
"epoch": 41.08527131782946, |
|
"grad_norm": 20.353235244750977, |
|
"learning_rate": 0.00023569999999999998, |
|
"loss": 14.7257, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 41.86046511627907, |
|
"grad_norm": 22.713470458984375, |
|
"learning_rate": 0.00023259999999999996, |
|
"loss": 14.573, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.568130493164062, |
|
"eval_runtime": 17.7042, |
|
"eval_samples_per_second": 2.711, |
|
"eval_steps_per_second": 0.678, |
|
"step": 5418 |
|
}, |
|
{ |
|
"epoch": 42.63565891472868, |
|
"grad_norm": 24.60871696472168, |
|
"learning_rate": 0.0002295, |
|
"loss": 14.7401, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.554765701293945, |
|
"eval_runtime": 18.0539, |
|
"eval_samples_per_second": 2.659, |
|
"eval_steps_per_second": 0.665, |
|
"step": 5547 |
|
}, |
|
{ |
|
"epoch": 43.41085271317829, |
|
"grad_norm": 36.38352966308594, |
|
"learning_rate": 0.0002264, |
|
"loss": 14.6052, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.39527702331543, |
|
"eval_runtime": 17.3086, |
|
"eval_samples_per_second": 2.773, |
|
"eval_steps_per_second": 0.693, |
|
"step": 5676 |
|
}, |
|
{ |
|
"epoch": 44.18604651162791, |
|
"grad_norm": 14.869057655334473, |
|
"learning_rate": 0.00022330000000000003, |
|
"loss": 13.9636, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 44.96124031007752, |
|
"grad_norm": 12.379744529724121, |
|
"learning_rate": 0.0002202, |
|
"loss": 14.1299, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.81528663635254, |
|
"eval_runtime": 17.5777, |
|
"eval_samples_per_second": 2.731, |
|
"eval_steps_per_second": 0.683, |
|
"step": 5805 |
|
}, |
|
{ |
|
"epoch": 45.736434108527135, |
|
"grad_norm": 51.40928649902344, |
|
"learning_rate": 0.00021710000000000005, |
|
"loss": 13.6851, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 30.969324111938477, |
|
"eval_runtime": 17.5744, |
|
"eval_samples_per_second": 2.731, |
|
"eval_steps_per_second": 0.683, |
|
"step": 5934 |
|
}, |
|
{ |
|
"epoch": 46.51162790697674, |
|
"grad_norm": 13.089680671691895, |
|
"learning_rate": 0.00021400000000000002, |
|
"loss": 14.6677, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.936065673828125, |
|
"eval_runtime": 18.038, |
|
"eval_samples_per_second": 2.661, |
|
"eval_steps_per_second": 0.665, |
|
"step": 6063 |
|
}, |
|
{ |
|
"epoch": 47.286821705426355, |
|
"grad_norm": 15.198484420776367, |
|
"learning_rate": 0.0002109, |
|
"loss": 13.6493, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 34.3327751159668, |
|
"eval_runtime": 17.0231, |
|
"eval_samples_per_second": 2.82, |
|
"eval_steps_per_second": 0.705, |
|
"step": 6192 |
|
}, |
|
{ |
|
"epoch": 48.06201550387597, |
|
"grad_norm": 17.197790145874023, |
|
"learning_rate": 0.00020780000000000004, |
|
"loss": 13.7191, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 48.83720930232558, |
|
"grad_norm": 22.198528289794922, |
|
"learning_rate": 0.00020470000000000002, |
|
"loss": 14.166, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 32.62310791015625, |
|
"eval_runtime": 17.8418, |
|
"eval_samples_per_second": 2.69, |
|
"eval_steps_per_second": 0.673, |
|
"step": 6321 |
|
}, |
|
{ |
|
"epoch": 49.6124031007752, |
|
"grad_norm": 12.192609786987305, |
|
"learning_rate": 0.00020160000000000002, |
|
"loss": 13.7388, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 33.17361831665039, |
|
"eval_runtime": 16.9796, |
|
"eval_samples_per_second": 2.827, |
|
"eval_steps_per_second": 0.707, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 50.3875968992248, |
|
"grad_norm": 24.24190330505371, |
|
"learning_rate": 0.0001985, |
|
"loss": 13.0849, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 34.95216369628906, |
|
"eval_runtime": 17.6852, |
|
"eval_samples_per_second": 2.714, |
|
"eval_steps_per_second": 0.679, |
|
"step": 6579 |
|
}, |
|
{ |
|
"epoch": 51.16279069767442, |
|
"grad_norm": 10.653921127319336, |
|
"learning_rate": 0.0001954, |
|
"loss": 13.7478, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 51.93798449612403, |
|
"grad_norm": 12.344590187072754, |
|
"learning_rate": 0.00019229999999999999, |
|
"loss": 13.2502, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 35.79899215698242, |
|
"eval_runtime": 17.0885, |
|
"eval_samples_per_second": 2.809, |
|
"eval_steps_per_second": 0.702, |
|
"step": 6708 |
|
}, |
|
{ |
|
"epoch": 52.713178294573645, |
|
"grad_norm": 11.102241516113281, |
|
"learning_rate": 0.0001892, |
|
"loss": 13.5116, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 31.57374382019043, |
|
"eval_runtime": 17.6271, |
|
"eval_samples_per_second": 2.723, |
|
"eval_steps_per_second": 0.681, |
|
"step": 6837 |
|
}, |
|
{ |
|
"epoch": 53.48837209302326, |
|
"grad_norm": 10.652983665466309, |
|
"learning_rate": 0.00018610000000000002, |
|
"loss": 12.6993, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 33.26504898071289, |
|
"eval_runtime": 17.4525, |
|
"eval_samples_per_second": 2.75, |
|
"eval_steps_per_second": 0.688, |
|
"step": 6966 |
|
}, |
|
{ |
|
"epoch": 54.263565891472865, |
|
"grad_norm": 12.211697578430176, |
|
"learning_rate": 0.00018300000000000003, |
|
"loss": 13.3602, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 34.891380310058594, |
|
"eval_runtime": 18.9822, |
|
"eval_samples_per_second": 2.529, |
|
"eval_steps_per_second": 0.632, |
|
"step": 7095 |
|
}, |
|
{ |
|
"epoch": 55.03875968992248, |
|
"grad_norm": 14.056374549865723, |
|
"learning_rate": 0.0001799, |
|
"loss": 12.9955, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 55.81395348837209, |
|
"grad_norm": 10.69999885559082, |
|
"learning_rate": 0.00017680000000000001, |
|
"loss": 12.9585, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 35.98616409301758, |
|
"eval_runtime": 17.2599, |
|
"eval_samples_per_second": 2.781, |
|
"eval_steps_per_second": 0.695, |
|
"step": 7224 |
|
}, |
|
{ |
|
"epoch": 56.58914728682171, |
|
"grad_norm": 7.194685459136963, |
|
"learning_rate": 0.00017370000000000002, |
|
"loss": 12.7434, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 34.91057205200195, |
|
"eval_runtime": 18.4913, |
|
"eval_samples_per_second": 2.596, |
|
"eval_steps_per_second": 0.649, |
|
"step": 7353 |
|
}, |
|
{ |
|
"epoch": 57.36434108527132, |
|
"grad_norm": 9.769908905029297, |
|
"learning_rate": 0.0001706, |
|
"loss": 12.7299, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 34.010562896728516, |
|
"eval_runtime": 17.0454, |
|
"eval_samples_per_second": 2.816, |
|
"eval_steps_per_second": 0.704, |
|
"step": 7482 |
|
}, |
|
{ |
|
"epoch": 58.13953488372093, |
|
"grad_norm": 18.091665267944336, |
|
"learning_rate": 0.0001675, |
|
"loss": 12.3929, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 58.91472868217054, |
|
"grad_norm": 8.427603721618652, |
|
"learning_rate": 0.00016439999999999998, |
|
"loss": 12.717, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 36.35882568359375, |
|
"eval_runtime": 18.0781, |
|
"eval_samples_per_second": 2.655, |
|
"eval_steps_per_second": 0.664, |
|
"step": 7611 |
|
}, |
|
{ |
|
"epoch": 59.689922480620154, |
|
"grad_norm": 15.918642044067383, |
|
"learning_rate": 0.00016130000000000002, |
|
"loss": 12.0563, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 35.09232711791992, |
|
"eval_runtime": 16.9066, |
|
"eval_samples_per_second": 2.839, |
|
"eval_steps_per_second": 0.71, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 60.46511627906977, |
|
"grad_norm": 13.870895385742188, |
|
"learning_rate": 0.00015820000000000002, |
|
"loss": 13.012, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 38.73225402832031, |
|
"eval_runtime": 17.9846, |
|
"eval_samples_per_second": 2.669, |
|
"eval_steps_per_second": 0.667, |
|
"step": 7869 |
|
}, |
|
{ |
|
"epoch": 61.24031007751938, |
|
"grad_norm": 7.798965930938721, |
|
"learning_rate": 0.00015510000000000003, |
|
"loss": 12.2878, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 34.9967155456543, |
|
"eval_runtime": 17.0439, |
|
"eval_samples_per_second": 2.816, |
|
"eval_steps_per_second": 0.704, |
|
"step": 7998 |
|
}, |
|
{ |
|
"epoch": 62.01550387596899, |
|
"grad_norm": 8.46688461303711, |
|
"learning_rate": 0.000152, |
|
"loss": 12.3515, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 62.7906976744186, |
|
"grad_norm": 9.745466232299805, |
|
"learning_rate": 0.00014890000000000001, |
|
"loss": 12.2794, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 37.55772399902344, |
|
"eval_runtime": 18.0451, |
|
"eval_samples_per_second": 2.66, |
|
"eval_steps_per_second": 0.665, |
|
"step": 8127 |
|
}, |
|
{ |
|
"epoch": 63.565891472868216, |
|
"grad_norm": 7.328401565551758, |
|
"learning_rate": 0.0001458, |
|
"loss": 12.4147, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 37.27333068847656, |
|
"eval_runtime": 19.2621, |
|
"eval_samples_per_second": 2.492, |
|
"eval_steps_per_second": 0.623, |
|
"step": 8256 |
|
}, |
|
{ |
|
"epoch": 64.34108527131782, |
|
"grad_norm": 12.89833927154541, |
|
"learning_rate": 0.0001427, |
|
"loss": 12.0032, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 35.3015022277832, |
|
"eval_runtime": 17.8838, |
|
"eval_samples_per_second": 2.684, |
|
"eval_steps_per_second": 0.671, |
|
"step": 8385 |
|
}, |
|
{ |
|
"epoch": 65.11627906976744, |
|
"grad_norm": 15.308392524719238, |
|
"learning_rate": 0.00013959999999999998, |
|
"loss": 11.7392, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 65.89147286821705, |
|
"grad_norm": 12.101038932800293, |
|
"learning_rate": 0.00013650000000000004, |
|
"loss": 12.2793, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 35.280582427978516, |
|
"eval_runtime": 17.6628, |
|
"eval_samples_per_second": 2.718, |
|
"eval_steps_per_second": 0.679, |
|
"step": 8514 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"grad_norm": 12.754354476928711, |
|
"learning_rate": 0.00013340000000000002, |
|
"loss": 12.2309, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 36.24875259399414, |
|
"eval_runtime": 17.2522, |
|
"eval_samples_per_second": 2.782, |
|
"eval_steps_per_second": 0.696, |
|
"step": 8643 |
|
}, |
|
{ |
|
"epoch": 67.44186046511628, |
|
"grad_norm": 9.756113052368164, |
|
"learning_rate": 0.00013030000000000002, |
|
"loss": 11.7082, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 35.66865158081055, |
|
"eval_runtime": 18.2695, |
|
"eval_samples_per_second": 2.627, |
|
"eval_steps_per_second": 0.657, |
|
"step": 8772 |
|
}, |
|
{ |
|
"epoch": 68.21705426356588, |
|
"grad_norm": 9.372435569763184, |
|
"learning_rate": 0.0001272, |
|
"loss": 11.5136, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 68.9922480620155, |
|
"grad_norm": 15.044282913208008, |
|
"learning_rate": 0.0001241, |
|
"loss": 11.8694, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 36.04698944091797, |
|
"eval_runtime": 17.3888, |
|
"eval_samples_per_second": 2.76, |
|
"eval_steps_per_second": 0.69, |
|
"step": 8901 |
|
}, |
|
{ |
|
"epoch": 69.76744186046511, |
|
"grad_norm": 9.250027656555176, |
|
"learning_rate": 0.000121, |
|
"loss": 11.782, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 35.40549087524414, |
|
"eval_runtime": 18.0194, |
|
"eval_samples_per_second": 2.664, |
|
"eval_steps_per_second": 0.666, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 70.54263565891473, |
|
"grad_norm": 16.64485740661621, |
|
"learning_rate": 0.00011789999999999999, |
|
"loss": 11.6254, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 36.70663070678711, |
|
"eval_runtime": 17.3613, |
|
"eval_samples_per_second": 2.765, |
|
"eval_steps_per_second": 0.691, |
|
"step": 9159 |
|
}, |
|
{ |
|
"epoch": 71.31782945736434, |
|
"grad_norm": 15.693510055541992, |
|
"learning_rate": 0.00011479999999999997, |
|
"loss": 11.5873, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 36.10844421386719, |
|
"eval_runtime": 17.5839, |
|
"eval_samples_per_second": 2.73, |
|
"eval_steps_per_second": 0.682, |
|
"step": 9288 |
|
}, |
|
{ |
|
"epoch": 72.09302325581395, |
|
"grad_norm": 7.485771179199219, |
|
"learning_rate": 0.00011170000000000003, |
|
"loss": 11.6159, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 72.86821705426357, |
|
"grad_norm": 15.41925048828125, |
|
"learning_rate": 0.00010860000000000004, |
|
"loss": 11.6251, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 38.29316329956055, |
|
"eval_runtime": 17.0634, |
|
"eval_samples_per_second": 2.813, |
|
"eval_steps_per_second": 0.703, |
|
"step": 9417 |
|
}, |
|
{ |
|
"epoch": 73.64341085271317, |
|
"grad_norm": 16.74988555908203, |
|
"learning_rate": 0.00010550000000000002, |
|
"loss": 11.4589, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 36.55695724487305, |
|
"eval_runtime": 17.9041, |
|
"eval_samples_per_second": 2.681, |
|
"eval_steps_per_second": 0.67, |
|
"step": 9546 |
|
}, |
|
{ |
|
"epoch": 74.4186046511628, |
|
"grad_norm": 146.4043426513672, |
|
"learning_rate": 0.00010240000000000001, |
|
"loss": 11.7378, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 35.988651275634766, |
|
"eval_runtime": 17.0167, |
|
"eval_samples_per_second": 2.821, |
|
"eval_steps_per_second": 0.705, |
|
"step": 9675 |
|
}, |
|
{ |
|
"epoch": 75.1937984496124, |
|
"grad_norm": 10.800848960876465, |
|
"learning_rate": 9.93e-05, |
|
"loss": 11.4043, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 75.96899224806202, |
|
"grad_norm": 9.41010570526123, |
|
"learning_rate": 9.62e-05, |
|
"loss": 11.4933, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 36.47134017944336, |
|
"eval_runtime": 17.8569, |
|
"eval_samples_per_second": 2.688, |
|
"eval_steps_per_second": 0.672, |
|
"step": 9804 |
|
}, |
|
{ |
|
"epoch": 76.74418604651163, |
|
"grad_norm": 19.61960220336914, |
|
"learning_rate": 9.31e-05, |
|
"loss": 11.2566, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 36.96221923828125, |
|
"eval_runtime": 17.0325, |
|
"eval_samples_per_second": 2.818, |
|
"eval_steps_per_second": 0.705, |
|
"step": 9933 |
|
}, |
|
{ |
|
"epoch": 77.51937984496124, |
|
"grad_norm": 9.528326034545898, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 11.25, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 37.10159683227539, |
|
"eval_runtime": 18.225, |
|
"eval_samples_per_second": 2.634, |
|
"eval_steps_per_second": 0.658, |
|
"step": 10062 |
|
}, |
|
{ |
|
"epoch": 78.29457364341086, |
|
"grad_norm": 8.064875602722168, |
|
"learning_rate": 8.690000000000003e-05, |
|
"loss": 11.2962, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 37.87105178833008, |
|
"eval_runtime": 16.9948, |
|
"eval_samples_per_second": 2.824, |
|
"eval_steps_per_second": 0.706, |
|
"step": 10191 |
|
}, |
|
{ |
|
"epoch": 79.06976744186046, |
|
"grad_norm": 10.451505661010742, |
|
"learning_rate": 8.380000000000002e-05, |
|
"loss": 11.1642, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 79.84496124031008, |
|
"grad_norm": 7.318461894989014, |
|
"learning_rate": 8.070000000000001e-05, |
|
"loss": 11.0868, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 38.571414947509766, |
|
"eval_runtime": 18.0923, |
|
"eval_samples_per_second": 2.653, |
|
"eval_steps_per_second": 0.663, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 80.62015503875969, |
|
"grad_norm": 10.301888465881348, |
|
"learning_rate": 7.760000000000002e-05, |
|
"loss": 11.2786, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 38.1493034362793, |
|
"eval_runtime": 17.0167, |
|
"eval_samples_per_second": 2.821, |
|
"eval_steps_per_second": 0.705, |
|
"step": 10449 |
|
}, |
|
{ |
|
"epoch": 81.3953488372093, |
|
"grad_norm": 8.667201042175293, |
|
"learning_rate": 7.450000000000001e-05, |
|
"loss": 11.1528, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.0099983215332, |
|
"eval_runtime": 17.9494, |
|
"eval_samples_per_second": 2.674, |
|
"eval_steps_per_second": 0.669, |
|
"step": 10578 |
|
}, |
|
{ |
|
"epoch": 82.17054263565892, |
|
"grad_norm": 5.117663860321045, |
|
"learning_rate": 7.14e-05, |
|
"loss": 10.9299, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 82.94573643410853, |
|
"grad_norm": 5.9621806144714355, |
|
"learning_rate": 6.829999999999999e-05, |
|
"loss": 11.089, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 38.5473518371582, |
|
"eval_runtime": 17.0039, |
|
"eval_samples_per_second": 2.823, |
|
"eval_steps_per_second": 0.706, |
|
"step": 10707 |
|
}, |
|
{ |
|
"epoch": 83.72093023255815, |
|
"grad_norm": 6.17501974105835, |
|
"learning_rate": 6.519999999999999e-05, |
|
"loss": 10.954, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 38.940486907958984, |
|
"eval_runtime": 17.721, |
|
"eval_samples_per_second": 2.709, |
|
"eval_steps_per_second": 0.677, |
|
"step": 10836 |
|
}, |
|
{ |
|
"epoch": 84.49612403100775, |
|
"grad_norm": 31.69377326965332, |
|
"learning_rate": 6.210000000000003e-05, |
|
"loss": 11.0157, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.3872184753418, |
|
"eval_runtime": 16.9062, |
|
"eval_samples_per_second": 2.839, |
|
"eval_steps_per_second": 0.71, |
|
"step": 10965 |
|
}, |
|
{ |
|
"epoch": 85.27131782945736, |
|
"grad_norm": 7.1002984046936035, |
|
"learning_rate": 5.9000000000000025e-05, |
|
"loss": 10.9849, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.4875373840332, |
|
"eval_runtime": 17.7347, |
|
"eval_samples_per_second": 2.707, |
|
"eval_steps_per_second": 0.677, |
|
"step": 11094 |
|
}, |
|
{ |
|
"epoch": 86.04651162790698, |
|
"grad_norm": 13.370129585266113, |
|
"learning_rate": 5.590000000000002e-05, |
|
"loss": 11.0614, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 86.82170542635659, |
|
"grad_norm": 5.192051887512207, |
|
"learning_rate": 5.28e-05, |
|
"loss": 10.5423, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.11787796020508, |
|
"eval_runtime": 16.9675, |
|
"eval_samples_per_second": 2.829, |
|
"eval_steps_per_second": 0.707, |
|
"step": 11223 |
|
}, |
|
{ |
|
"epoch": 87.59689922480621, |
|
"grad_norm": 5.747579097747803, |
|
"learning_rate": 4.97e-05, |
|
"loss": 11.1968, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.4084358215332, |
|
"eval_runtime": 17.9374, |
|
"eval_samples_per_second": 2.676, |
|
"eval_steps_per_second": 0.669, |
|
"step": 11352 |
|
}, |
|
{ |
|
"epoch": 88.37209302325581, |
|
"grad_norm": 11.57238483428955, |
|
"learning_rate": 4.66e-05, |
|
"loss": 10.6376, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.82176971435547, |
|
"eval_runtime": 16.9422, |
|
"eval_samples_per_second": 2.833, |
|
"eval_steps_per_second": 0.708, |
|
"step": 11481 |
|
}, |
|
{ |
|
"epoch": 89.14728682170542, |
|
"grad_norm": 8.44890308380127, |
|
"learning_rate": 4.3499999999999993e-05, |
|
"loss": 10.8035, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 89.92248062015504, |
|
"grad_norm": 7.732810974121094, |
|
"learning_rate": 4.0399999999999986e-05, |
|
"loss": 10.7131, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.25526428222656, |
|
"eval_runtime": 18.1455, |
|
"eval_samples_per_second": 2.645, |
|
"eval_steps_per_second": 0.661, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 90.69767441860465, |
|
"grad_norm": 6.144818305969238, |
|
"learning_rate": 3.7300000000000026e-05, |
|
"loss": 10.8252, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.136837005615234, |
|
"eval_runtime": 16.9741, |
|
"eval_samples_per_second": 2.828, |
|
"eval_steps_per_second": 0.707, |
|
"step": 11739 |
|
}, |
|
{ |
|
"epoch": 91.47286821705427, |
|
"grad_norm": 4.7243475914001465, |
|
"learning_rate": 3.420000000000002e-05, |
|
"loss": 10.6456, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 38.91936111450195, |
|
"eval_runtime": 17.9915, |
|
"eval_samples_per_second": 2.668, |
|
"eval_steps_per_second": 0.667, |
|
"step": 11868 |
|
}, |
|
{ |
|
"epoch": 92.24806201550388, |
|
"grad_norm": 13.374404907226562, |
|
"learning_rate": 3.110000000000002e-05, |
|
"loss": 10.8488, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.595462799072266, |
|
"eval_runtime": 16.9478, |
|
"eval_samples_per_second": 2.832, |
|
"eval_steps_per_second": 0.708, |
|
"step": 11997 |
|
}, |
|
{ |
|
"epoch": 93.02325581395348, |
|
"grad_norm": 9.406952857971191, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 10.5219, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 93.7984496124031, |
|
"grad_norm": 5.360720634460449, |
|
"learning_rate": 2.49e-05, |
|
"loss": 10.8675, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.47597885131836, |
|
"eval_runtime": 17.7005, |
|
"eval_samples_per_second": 2.712, |
|
"eval_steps_per_second": 0.678, |
|
"step": 12126 |
|
}, |
|
{ |
|
"epoch": 94.57364341085271, |
|
"grad_norm": 3.855013132095337, |
|
"learning_rate": 2.1799999999999995e-05, |
|
"loss": 10.4757, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 40.484397888183594, |
|
"eval_runtime": 16.9799, |
|
"eval_samples_per_second": 2.827, |
|
"eval_steps_per_second": 0.707, |
|
"step": 12255 |
|
}, |
|
{ |
|
"epoch": 95.34883720930233, |
|
"grad_norm": 7.718498229980469, |
|
"learning_rate": 1.8699999999999987e-05, |
|
"loss": 10.3191, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.06733322143555, |
|
"eval_runtime": 17.7268, |
|
"eval_samples_per_second": 2.708, |
|
"eval_steps_per_second": 0.677, |
|
"step": 12384 |
|
}, |
|
{ |
|
"epoch": 96.12403100775194, |
|
"grad_norm": 3.9156548976898193, |
|
"learning_rate": 1.5599999999999983e-05, |
|
"loss": 10.6169, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 96.89922480620154, |
|
"grad_norm": 3.464470148086548, |
|
"learning_rate": 1.2500000000000023e-05, |
|
"loss": 10.6073, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.37672805786133, |
|
"eval_runtime": 16.9277, |
|
"eval_samples_per_second": 2.836, |
|
"eval_steps_per_second": 0.709, |
|
"step": 12513 |
|
}, |
|
{ |
|
"epoch": 97.67441860465117, |
|
"grad_norm": 5.994311332702637, |
|
"learning_rate": 9.400000000000018e-06, |
|
"loss": 10.3038, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.69685745239258, |
|
"eval_runtime": 18.0251, |
|
"eval_samples_per_second": 2.663, |
|
"eval_steps_per_second": 0.666, |
|
"step": 12642 |
|
}, |
|
{ |
|
"epoch": 98.44961240310077, |
|
"grad_norm": 4.125715732574463, |
|
"learning_rate": 6.300000000000012e-06, |
|
"loss": 11.0709, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.93254470825195, |
|
"eval_runtime": 17.9269, |
|
"eval_samples_per_second": 2.678, |
|
"eval_steps_per_second": 0.669, |
|
"step": 12771 |
|
}, |
|
{ |
|
"epoch": 99.2248062015504, |
|
"grad_norm": 4.381137371063232, |
|
"learning_rate": 3.200000000000005e-06, |
|
"loss": 10.2398, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 4.022866249084473, |
|
"learning_rate": 9.999999999999998e-08, |
|
"loss": 10.5951, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_dummy": 1.0, |
|
"eval_loss": 39.87546157836914, |
|
"eval_runtime": 17.8005, |
|
"eval_samples_per_second": 2.697, |
|
"eval_steps_per_second": 0.674, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"step": 12900, |
|
"total_flos": 1.4631500418239693e+19, |
|
"train_loss": 16.309644344832545, |
|
"train_runtime": 27199.2721, |
|
"train_samples_per_second": 1.897, |
|
"train_steps_per_second": 0.474 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 12900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.4631500418239693e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|