mask2former-finetuned-ER-Mito-LD3 / trainer_state.json
Dnq2025's picture
End of training
65ed337 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 100.0,
"eval_steps": 500,
"global_step": 12900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.7751937984496124,
"grad_norm": 126.4291763305664,
"learning_rate": 0.0003969,
"loss": 57.794,
"step": 100
},
{
"epoch": 1.0,
"eval_dummy": 1.0,
"eval_loss": 47.50299072265625,
"eval_runtime": 18.4297,
"eval_samples_per_second": 2.604,
"eval_steps_per_second": 0.651,
"step": 129
},
{
"epoch": 1.550387596899225,
"grad_norm": 110.41481018066406,
"learning_rate": 0.00039380000000000003,
"loss": 45.635,
"step": 200
},
{
"epoch": 2.0,
"eval_dummy": 1.0,
"eval_loss": 42.4621696472168,
"eval_runtime": 17.0743,
"eval_samples_per_second": 2.811,
"eval_steps_per_second": 0.703,
"step": 258
},
{
"epoch": 2.3255813953488373,
"grad_norm": 119.6634750366211,
"learning_rate": 0.0003907,
"loss": 43.6742,
"step": 300
},
{
"epoch": 3.0,
"eval_dummy": 1.0,
"eval_loss": 40.13833999633789,
"eval_runtime": 18.4155,
"eval_samples_per_second": 2.606,
"eval_steps_per_second": 0.652,
"step": 387
},
{
"epoch": 3.10077519379845,
"grad_norm": 118.22651672363281,
"learning_rate": 0.0003876,
"loss": 40.4744,
"step": 400
},
{
"epoch": 3.875968992248062,
"grad_norm": 134.10031127929688,
"learning_rate": 0.0003845,
"loss": 37.5286,
"step": 500
},
{
"epoch": 4.0,
"eval_dummy": 1.0,
"eval_loss": 41.196353912353516,
"eval_runtime": 17.0535,
"eval_samples_per_second": 2.815,
"eval_steps_per_second": 0.704,
"step": 516
},
{
"epoch": 4.651162790697675,
"grad_norm": 80.17464447021484,
"learning_rate": 0.00038140000000000005,
"loss": 33.7618,
"step": 600
},
{
"epoch": 5.0,
"eval_dummy": 1.0,
"eval_loss": 34.43735122680664,
"eval_runtime": 19.3245,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 0.621,
"step": 645
},
{
"epoch": 5.426356589147287,
"grad_norm": 372.6417236328125,
"learning_rate": 0.00037830000000000003,
"loss": 31.5899,
"step": 700
},
{
"epoch": 6.0,
"eval_dummy": 1.0,
"eval_loss": 39.824180603027344,
"eval_runtime": 17.2226,
"eval_samples_per_second": 2.787,
"eval_steps_per_second": 0.697,
"step": 774
},
{
"epoch": 6.2015503875969,
"grad_norm": 226.50611877441406,
"learning_rate": 0.0003752,
"loss": 31.2097,
"step": 800
},
{
"epoch": 6.976744186046512,
"grad_norm": 99.72407531738281,
"learning_rate": 0.0003721,
"loss": 29.0727,
"step": 900
},
{
"epoch": 7.0,
"eval_dummy": 1.0,
"eval_loss": 33.32234191894531,
"eval_runtime": 18.5858,
"eval_samples_per_second": 2.583,
"eval_steps_per_second": 0.646,
"step": 903
},
{
"epoch": 7.751937984496124,
"grad_norm": 107.07185363769531,
"learning_rate": 0.000369,
"loss": 27.8483,
"step": 1000
},
{
"epoch": 8.0,
"eval_dummy": 1.0,
"eval_loss": 30.962478637695312,
"eval_runtime": 17.3408,
"eval_samples_per_second": 2.768,
"eval_steps_per_second": 0.692,
"step": 1032
},
{
"epoch": 8.527131782945737,
"grad_norm": 82.22681427001953,
"learning_rate": 0.0003659,
"loss": 26.0904,
"step": 1100
},
{
"epoch": 9.0,
"eval_dummy": 1.0,
"eval_loss": 31.708364486694336,
"eval_runtime": 18.5166,
"eval_samples_per_second": 2.592,
"eval_steps_per_second": 0.648,
"step": 1161
},
{
"epoch": 9.30232558139535,
"grad_norm": 184.18087768554688,
"learning_rate": 0.00036280000000000004,
"loss": 26.1043,
"step": 1200
},
{
"epoch": 10.0,
"eval_dummy": 1.0,
"eval_loss": 31.808786392211914,
"eval_runtime": 17.2835,
"eval_samples_per_second": 2.777,
"eval_steps_per_second": 0.694,
"step": 1290
},
{
"epoch": 10.077519379844961,
"grad_norm": 82.96527862548828,
"learning_rate": 0.0003597,
"loss": 26.1461,
"step": 1300
},
{
"epoch": 10.852713178294573,
"grad_norm": 76.06727600097656,
"learning_rate": 0.00035660000000000005,
"loss": 24.3038,
"step": 1400
},
{
"epoch": 11.0,
"eval_dummy": 1.0,
"eval_loss": 30.336084365844727,
"eval_runtime": 18.0862,
"eval_samples_per_second": 2.654,
"eval_steps_per_second": 0.663,
"step": 1419
},
{
"epoch": 11.627906976744185,
"grad_norm": 57.900630950927734,
"learning_rate": 0.00035350000000000003,
"loss": 23.6493,
"step": 1500
},
{
"epoch": 12.0,
"eval_dummy": 1.0,
"eval_loss": 30.202993392944336,
"eval_runtime": 16.9497,
"eval_samples_per_second": 2.832,
"eval_steps_per_second": 0.708,
"step": 1548
},
{
"epoch": 12.4031007751938,
"grad_norm": 60.565650939941406,
"learning_rate": 0.0003504,
"loss": 23.9146,
"step": 1600
},
{
"epoch": 13.0,
"eval_dummy": 1.0,
"eval_loss": 31.08062744140625,
"eval_runtime": 17.9909,
"eval_samples_per_second": 2.668,
"eval_steps_per_second": 0.667,
"step": 1677
},
{
"epoch": 13.178294573643411,
"grad_norm": 75.52674865722656,
"learning_rate": 0.00034730000000000004,
"loss": 23.033,
"step": 1700
},
{
"epoch": 13.953488372093023,
"grad_norm": 45.41800308227539,
"learning_rate": 0.0003442,
"loss": 21.9133,
"step": 1800
},
{
"epoch": 14.0,
"eval_dummy": 1.0,
"eval_loss": 31.397354125976562,
"eval_runtime": 17.0643,
"eval_samples_per_second": 2.813,
"eval_steps_per_second": 0.703,
"step": 1806
},
{
"epoch": 14.728682170542635,
"grad_norm": 46.21736145019531,
"learning_rate": 0.0003411,
"loss": 22.3071,
"step": 1900
},
{
"epoch": 15.0,
"eval_dummy": 1.0,
"eval_loss": 32.09249496459961,
"eval_runtime": 17.5136,
"eval_samples_per_second": 2.741,
"eval_steps_per_second": 0.685,
"step": 1935
},
{
"epoch": 15.503875968992247,
"grad_norm": 29.049968719482422,
"learning_rate": 0.000338,
"loss": 21.0819,
"step": 2000
},
{
"epoch": 16.0,
"eval_dummy": 1.0,
"eval_loss": 29.936742782592773,
"eval_runtime": 17.2024,
"eval_samples_per_second": 2.79,
"eval_steps_per_second": 0.698,
"step": 2064
},
{
"epoch": 16.27906976744186,
"grad_norm": 70.16988372802734,
"learning_rate": 0.0003349,
"loss": 21.0089,
"step": 2100
},
{
"epoch": 17.0,
"eval_dummy": 1.0,
"eval_loss": 30.042001724243164,
"eval_runtime": 17.9057,
"eval_samples_per_second": 2.681,
"eval_steps_per_second": 0.67,
"step": 2193
},
{
"epoch": 17.05426356589147,
"grad_norm": 77.12342834472656,
"learning_rate": 0.00033180000000000004,
"loss": 21.1193,
"step": 2200
},
{
"epoch": 17.829457364341085,
"grad_norm": 40.200958251953125,
"learning_rate": 0.0003287,
"loss": 20.9169,
"step": 2300
},
{
"epoch": 18.0,
"eval_dummy": 1.0,
"eval_loss": 29.293771743774414,
"eval_runtime": 17.1083,
"eval_samples_per_second": 2.806,
"eval_steps_per_second": 0.701,
"step": 2322
},
{
"epoch": 18.6046511627907,
"grad_norm": 31.95384979248047,
"learning_rate": 0.0003256,
"loss": 19.7935,
"step": 2400
},
{
"epoch": 19.0,
"eval_dummy": 1.0,
"eval_loss": 31.394454956054688,
"eval_runtime": 18.6297,
"eval_samples_per_second": 2.577,
"eval_steps_per_second": 0.644,
"step": 2451
},
{
"epoch": 19.37984496124031,
"grad_norm": 54.4798698425293,
"learning_rate": 0.00032250000000000003,
"loss": 19.8749,
"step": 2500
},
{
"epoch": 20.0,
"eval_dummy": 1.0,
"eval_loss": 29.845718383789062,
"eval_runtime": 17.4128,
"eval_samples_per_second": 2.757,
"eval_steps_per_second": 0.689,
"step": 2580
},
{
"epoch": 20.155038759689923,
"grad_norm": 61.0432243347168,
"learning_rate": 0.0003194,
"loss": 19.6959,
"step": 2600
},
{
"epoch": 20.930232558139537,
"grad_norm": 64.08226013183594,
"learning_rate": 0.0003163,
"loss": 19.2973,
"step": 2700
},
{
"epoch": 21.0,
"eval_dummy": 1.0,
"eval_loss": 29.071313858032227,
"eval_runtime": 18.6354,
"eval_samples_per_second": 2.576,
"eval_steps_per_second": 0.644,
"step": 2709
},
{
"epoch": 21.705426356589147,
"grad_norm": 54.097721099853516,
"learning_rate": 0.0003132,
"loss": 18.5436,
"step": 2800
},
{
"epoch": 22.0,
"eval_dummy": 1.0,
"eval_loss": 29.084577560424805,
"eval_runtime": 17.3593,
"eval_samples_per_second": 2.765,
"eval_steps_per_second": 0.691,
"step": 2838
},
{
"epoch": 22.48062015503876,
"grad_norm": 37.09468460083008,
"learning_rate": 0.00031010000000000006,
"loss": 18.5996,
"step": 2900
},
{
"epoch": 23.0,
"eval_dummy": 1.0,
"eval_loss": 29.88102149963379,
"eval_runtime": 17.5696,
"eval_samples_per_second": 2.732,
"eval_steps_per_second": 0.683,
"step": 2967
},
{
"epoch": 23.25581395348837,
"grad_norm": 40.900291442871094,
"learning_rate": 0.00030700000000000004,
"loss": 19.1228,
"step": 3000
},
{
"epoch": 24.0,
"eval_dummy": 1.0,
"eval_loss": 29.301599502563477,
"eval_runtime": 17.3752,
"eval_samples_per_second": 2.763,
"eval_steps_per_second": 0.691,
"step": 3096
},
{
"epoch": 24.031007751937985,
"grad_norm": 16.258365631103516,
"learning_rate": 0.0003039,
"loss": 18.2692,
"step": 3100
},
{
"epoch": 24.8062015503876,
"grad_norm": 53.048091888427734,
"learning_rate": 0.0003008,
"loss": 18.0519,
"step": 3200
},
{
"epoch": 25.0,
"eval_dummy": 1.0,
"eval_loss": 30.71547508239746,
"eval_runtime": 18.0599,
"eval_samples_per_second": 2.658,
"eval_steps_per_second": 0.664,
"step": 3225
},
{
"epoch": 25.58139534883721,
"grad_norm": 24.09917640686035,
"learning_rate": 0.0002977,
"loss": 17.7073,
"step": 3300
},
{
"epoch": 26.0,
"eval_dummy": 1.0,
"eval_loss": 28.716806411743164,
"eval_runtime": 17.32,
"eval_samples_per_second": 2.771,
"eval_steps_per_second": 0.693,
"step": 3354
},
{
"epoch": 26.356589147286822,
"grad_norm": 86.21530151367188,
"learning_rate": 0.0002946,
"loss": 17.5055,
"step": 3400
},
{
"epoch": 27.0,
"eval_dummy": 1.0,
"eval_loss": 28.989931106567383,
"eval_runtime": 17.6249,
"eval_samples_per_second": 2.723,
"eval_steps_per_second": 0.681,
"step": 3483
},
{
"epoch": 27.131782945736433,
"grad_norm": 20.129249572753906,
"learning_rate": 0.0002915,
"loss": 18.054,
"step": 3500
},
{
"epoch": 27.906976744186046,
"grad_norm": 22.863677978515625,
"learning_rate": 0.0002884,
"loss": 17.4854,
"step": 3600
},
{
"epoch": 28.0,
"eval_dummy": 1.0,
"eval_loss": 30.19437599182129,
"eval_runtime": 17.6593,
"eval_samples_per_second": 2.718,
"eval_steps_per_second": 0.68,
"step": 3612
},
{
"epoch": 28.68217054263566,
"grad_norm": 13.843172073364258,
"learning_rate": 0.0002853,
"loss": 17.0048,
"step": 3700
},
{
"epoch": 29.0,
"eval_dummy": 1.0,
"eval_loss": 29.28289031982422,
"eval_runtime": 18.1574,
"eval_samples_per_second": 2.644,
"eval_steps_per_second": 0.661,
"step": 3741
},
{
"epoch": 29.45736434108527,
"grad_norm": 20.727048873901367,
"learning_rate": 0.0002822,
"loss": 16.8731,
"step": 3800
},
{
"epoch": 30.0,
"eval_dummy": 1.0,
"eval_loss": 30.1208438873291,
"eval_runtime": 16.9906,
"eval_samples_per_second": 2.825,
"eval_steps_per_second": 0.706,
"step": 3870
},
{
"epoch": 30.232558139534884,
"grad_norm": 19.721155166625977,
"learning_rate": 0.0002791,
"loss": 16.683,
"step": 3900
},
{
"epoch": 31.0,
"eval_dummy": 1.0,
"eval_loss": 30.758291244506836,
"eval_runtime": 17.7849,
"eval_samples_per_second": 2.699,
"eval_steps_per_second": 0.675,
"step": 3999
},
{
"epoch": 31.007751937984494,
"grad_norm": 25.496213912963867,
"learning_rate": 0.00027600000000000004,
"loss": 16.9178,
"step": 4000
},
{
"epoch": 31.782945736434108,
"grad_norm": 26.640628814697266,
"learning_rate": 0.0002729,
"loss": 16.6109,
"step": 4100
},
{
"epoch": 32.0,
"eval_dummy": 1.0,
"eval_loss": 30.623199462890625,
"eval_runtime": 16.843,
"eval_samples_per_second": 2.85,
"eval_steps_per_second": 0.712,
"step": 4128
},
{
"epoch": 32.55813953488372,
"grad_norm": 25.067975997924805,
"learning_rate": 0.0002698,
"loss": 15.8261,
"step": 4200
},
{
"epoch": 33.0,
"eval_dummy": 1.0,
"eval_loss": 29.416189193725586,
"eval_runtime": 17.6752,
"eval_samples_per_second": 2.716,
"eval_steps_per_second": 0.679,
"step": 4257
},
{
"epoch": 33.333333333333336,
"grad_norm": 30.74283218383789,
"learning_rate": 0.00026670000000000003,
"loss": 16.9002,
"step": 4300
},
{
"epoch": 34.0,
"eval_dummy": 1.0,
"eval_loss": 30.438751220703125,
"eval_runtime": 16.8785,
"eval_samples_per_second": 2.844,
"eval_steps_per_second": 0.711,
"step": 4386
},
{
"epoch": 34.10852713178294,
"grad_norm": 72.6050033569336,
"learning_rate": 0.0002636,
"loss": 15.7742,
"step": 4400
},
{
"epoch": 34.883720930232556,
"grad_norm": 93.86290740966797,
"learning_rate": 0.00026050000000000004,
"loss": 16.3081,
"step": 4500
},
{
"epoch": 35.0,
"eval_dummy": 1.0,
"eval_loss": 29.97564697265625,
"eval_runtime": 17.4518,
"eval_samples_per_second": 2.75,
"eval_steps_per_second": 0.688,
"step": 4515
},
{
"epoch": 35.65891472868217,
"grad_norm": 24.93766975402832,
"learning_rate": 0.0002574,
"loss": 15.4745,
"step": 4600
},
{
"epoch": 36.0,
"eval_dummy": 1.0,
"eval_loss": 28.821380615234375,
"eval_runtime": 16.9764,
"eval_samples_per_second": 2.827,
"eval_steps_per_second": 0.707,
"step": 4644
},
{
"epoch": 36.434108527131784,
"grad_norm": 33.70745849609375,
"learning_rate": 0.0002543,
"loss": 15.938,
"step": 4700
},
{
"epoch": 37.0,
"eval_dummy": 1.0,
"eval_loss": 29.100107192993164,
"eval_runtime": 17.5981,
"eval_samples_per_second": 2.728,
"eval_steps_per_second": 0.682,
"step": 4773
},
{
"epoch": 37.2093023255814,
"grad_norm": 59.88523864746094,
"learning_rate": 0.00025120000000000003,
"loss": 14.9862,
"step": 4800
},
{
"epoch": 37.98449612403101,
"grad_norm": 20.979228973388672,
"learning_rate": 0.0002481,
"loss": 15.9947,
"step": 4900
},
{
"epoch": 38.0,
"eval_dummy": 1.0,
"eval_loss": 31.053319931030273,
"eval_runtime": 17.0472,
"eval_samples_per_second": 2.816,
"eval_steps_per_second": 0.704,
"step": 4902
},
{
"epoch": 38.75968992248062,
"grad_norm": 17.90158462524414,
"learning_rate": 0.000245,
"loss": 15.2328,
"step": 5000
},
{
"epoch": 39.0,
"eval_dummy": 1.0,
"eval_loss": 31.62113380432129,
"eval_runtime": 17.318,
"eval_samples_per_second": 2.772,
"eval_steps_per_second": 0.693,
"step": 5031
},
{
"epoch": 39.53488372093023,
"grad_norm": 33.11941909790039,
"learning_rate": 0.00024190000000000003,
"loss": 15.202,
"step": 5100
},
{
"epoch": 40.0,
"eval_dummy": 1.0,
"eval_loss": 33.138301849365234,
"eval_runtime": 17.0128,
"eval_samples_per_second": 2.821,
"eval_steps_per_second": 0.705,
"step": 5160
},
{
"epoch": 40.310077519379846,
"grad_norm": 15.685113906860352,
"learning_rate": 0.0002388,
"loss": 15.0583,
"step": 5200
},
{
"epoch": 41.0,
"eval_dummy": 1.0,
"eval_loss": 31.408859252929688,
"eval_runtime": 17.9066,
"eval_samples_per_second": 2.681,
"eval_steps_per_second": 0.67,
"step": 5289
},
{
"epoch": 41.08527131782946,
"grad_norm": 20.353235244750977,
"learning_rate": 0.00023569999999999998,
"loss": 14.7257,
"step": 5300
},
{
"epoch": 41.86046511627907,
"grad_norm": 22.713470458984375,
"learning_rate": 0.00023259999999999996,
"loss": 14.573,
"step": 5400
},
{
"epoch": 42.0,
"eval_dummy": 1.0,
"eval_loss": 31.568130493164062,
"eval_runtime": 17.7042,
"eval_samples_per_second": 2.711,
"eval_steps_per_second": 0.678,
"step": 5418
},
{
"epoch": 42.63565891472868,
"grad_norm": 24.60871696472168,
"learning_rate": 0.0002295,
"loss": 14.7401,
"step": 5500
},
{
"epoch": 43.0,
"eval_dummy": 1.0,
"eval_loss": 30.554765701293945,
"eval_runtime": 18.0539,
"eval_samples_per_second": 2.659,
"eval_steps_per_second": 0.665,
"step": 5547
},
{
"epoch": 43.41085271317829,
"grad_norm": 36.38352966308594,
"learning_rate": 0.0002264,
"loss": 14.6052,
"step": 5600
},
{
"epoch": 44.0,
"eval_dummy": 1.0,
"eval_loss": 31.39527702331543,
"eval_runtime": 17.3086,
"eval_samples_per_second": 2.773,
"eval_steps_per_second": 0.693,
"step": 5676
},
{
"epoch": 44.18604651162791,
"grad_norm": 14.869057655334473,
"learning_rate": 0.00022330000000000003,
"loss": 13.9636,
"step": 5700
},
{
"epoch": 44.96124031007752,
"grad_norm": 12.379744529724121,
"learning_rate": 0.0002202,
"loss": 14.1299,
"step": 5800
},
{
"epoch": 45.0,
"eval_dummy": 1.0,
"eval_loss": 30.81528663635254,
"eval_runtime": 17.5777,
"eval_samples_per_second": 2.731,
"eval_steps_per_second": 0.683,
"step": 5805
},
{
"epoch": 45.736434108527135,
"grad_norm": 51.40928649902344,
"learning_rate": 0.00021710000000000005,
"loss": 13.6851,
"step": 5900
},
{
"epoch": 46.0,
"eval_dummy": 1.0,
"eval_loss": 30.969324111938477,
"eval_runtime": 17.5744,
"eval_samples_per_second": 2.731,
"eval_steps_per_second": 0.683,
"step": 5934
},
{
"epoch": 46.51162790697674,
"grad_norm": 13.089680671691895,
"learning_rate": 0.00021400000000000002,
"loss": 14.6677,
"step": 6000
},
{
"epoch": 47.0,
"eval_dummy": 1.0,
"eval_loss": 31.936065673828125,
"eval_runtime": 18.038,
"eval_samples_per_second": 2.661,
"eval_steps_per_second": 0.665,
"step": 6063
},
{
"epoch": 47.286821705426355,
"grad_norm": 15.198484420776367,
"learning_rate": 0.0002109,
"loss": 13.6493,
"step": 6100
},
{
"epoch": 48.0,
"eval_dummy": 1.0,
"eval_loss": 34.3327751159668,
"eval_runtime": 17.0231,
"eval_samples_per_second": 2.82,
"eval_steps_per_second": 0.705,
"step": 6192
},
{
"epoch": 48.06201550387597,
"grad_norm": 17.197790145874023,
"learning_rate": 0.00020780000000000004,
"loss": 13.7191,
"step": 6200
},
{
"epoch": 48.83720930232558,
"grad_norm": 22.198528289794922,
"learning_rate": 0.00020470000000000002,
"loss": 14.166,
"step": 6300
},
{
"epoch": 49.0,
"eval_dummy": 1.0,
"eval_loss": 32.62310791015625,
"eval_runtime": 17.8418,
"eval_samples_per_second": 2.69,
"eval_steps_per_second": 0.673,
"step": 6321
},
{
"epoch": 49.6124031007752,
"grad_norm": 12.192609786987305,
"learning_rate": 0.00020160000000000002,
"loss": 13.7388,
"step": 6400
},
{
"epoch": 50.0,
"eval_dummy": 1.0,
"eval_loss": 33.17361831665039,
"eval_runtime": 16.9796,
"eval_samples_per_second": 2.827,
"eval_steps_per_second": 0.707,
"step": 6450
},
{
"epoch": 50.3875968992248,
"grad_norm": 24.24190330505371,
"learning_rate": 0.0001985,
"loss": 13.0849,
"step": 6500
},
{
"epoch": 51.0,
"eval_dummy": 1.0,
"eval_loss": 34.95216369628906,
"eval_runtime": 17.6852,
"eval_samples_per_second": 2.714,
"eval_steps_per_second": 0.679,
"step": 6579
},
{
"epoch": 51.16279069767442,
"grad_norm": 10.653921127319336,
"learning_rate": 0.0001954,
"loss": 13.7478,
"step": 6600
},
{
"epoch": 51.93798449612403,
"grad_norm": 12.344590187072754,
"learning_rate": 0.00019229999999999999,
"loss": 13.2502,
"step": 6700
},
{
"epoch": 52.0,
"eval_dummy": 1.0,
"eval_loss": 35.79899215698242,
"eval_runtime": 17.0885,
"eval_samples_per_second": 2.809,
"eval_steps_per_second": 0.702,
"step": 6708
},
{
"epoch": 52.713178294573645,
"grad_norm": 11.102241516113281,
"learning_rate": 0.0001892,
"loss": 13.5116,
"step": 6800
},
{
"epoch": 53.0,
"eval_dummy": 1.0,
"eval_loss": 31.57374382019043,
"eval_runtime": 17.6271,
"eval_samples_per_second": 2.723,
"eval_steps_per_second": 0.681,
"step": 6837
},
{
"epoch": 53.48837209302326,
"grad_norm": 10.652983665466309,
"learning_rate": 0.00018610000000000002,
"loss": 12.6993,
"step": 6900
},
{
"epoch": 54.0,
"eval_dummy": 1.0,
"eval_loss": 33.26504898071289,
"eval_runtime": 17.4525,
"eval_samples_per_second": 2.75,
"eval_steps_per_second": 0.688,
"step": 6966
},
{
"epoch": 54.263565891472865,
"grad_norm": 12.211697578430176,
"learning_rate": 0.00018300000000000003,
"loss": 13.3602,
"step": 7000
},
{
"epoch": 55.0,
"eval_dummy": 1.0,
"eval_loss": 34.891380310058594,
"eval_runtime": 18.9822,
"eval_samples_per_second": 2.529,
"eval_steps_per_second": 0.632,
"step": 7095
},
{
"epoch": 55.03875968992248,
"grad_norm": 14.056374549865723,
"learning_rate": 0.0001799,
"loss": 12.9955,
"step": 7100
},
{
"epoch": 55.81395348837209,
"grad_norm": 10.69999885559082,
"learning_rate": 0.00017680000000000001,
"loss": 12.9585,
"step": 7200
},
{
"epoch": 56.0,
"eval_dummy": 1.0,
"eval_loss": 35.98616409301758,
"eval_runtime": 17.2599,
"eval_samples_per_second": 2.781,
"eval_steps_per_second": 0.695,
"step": 7224
},
{
"epoch": 56.58914728682171,
"grad_norm": 7.194685459136963,
"learning_rate": 0.00017370000000000002,
"loss": 12.7434,
"step": 7300
},
{
"epoch": 57.0,
"eval_dummy": 1.0,
"eval_loss": 34.91057205200195,
"eval_runtime": 18.4913,
"eval_samples_per_second": 2.596,
"eval_steps_per_second": 0.649,
"step": 7353
},
{
"epoch": 57.36434108527132,
"grad_norm": 9.769908905029297,
"learning_rate": 0.0001706,
"loss": 12.7299,
"step": 7400
},
{
"epoch": 58.0,
"eval_dummy": 1.0,
"eval_loss": 34.010562896728516,
"eval_runtime": 17.0454,
"eval_samples_per_second": 2.816,
"eval_steps_per_second": 0.704,
"step": 7482
},
{
"epoch": 58.13953488372093,
"grad_norm": 18.091665267944336,
"learning_rate": 0.0001675,
"loss": 12.3929,
"step": 7500
},
{
"epoch": 58.91472868217054,
"grad_norm": 8.427603721618652,
"learning_rate": 0.00016439999999999998,
"loss": 12.717,
"step": 7600
},
{
"epoch": 59.0,
"eval_dummy": 1.0,
"eval_loss": 36.35882568359375,
"eval_runtime": 18.0781,
"eval_samples_per_second": 2.655,
"eval_steps_per_second": 0.664,
"step": 7611
},
{
"epoch": 59.689922480620154,
"grad_norm": 15.918642044067383,
"learning_rate": 0.00016130000000000002,
"loss": 12.0563,
"step": 7700
},
{
"epoch": 60.0,
"eval_dummy": 1.0,
"eval_loss": 35.09232711791992,
"eval_runtime": 16.9066,
"eval_samples_per_second": 2.839,
"eval_steps_per_second": 0.71,
"step": 7740
},
{
"epoch": 60.46511627906977,
"grad_norm": 13.870895385742188,
"learning_rate": 0.00015820000000000002,
"loss": 13.012,
"step": 7800
},
{
"epoch": 61.0,
"eval_dummy": 1.0,
"eval_loss": 38.73225402832031,
"eval_runtime": 17.9846,
"eval_samples_per_second": 2.669,
"eval_steps_per_second": 0.667,
"step": 7869
},
{
"epoch": 61.24031007751938,
"grad_norm": 7.798965930938721,
"learning_rate": 0.00015510000000000003,
"loss": 12.2878,
"step": 7900
},
{
"epoch": 62.0,
"eval_dummy": 1.0,
"eval_loss": 34.9967155456543,
"eval_runtime": 17.0439,
"eval_samples_per_second": 2.816,
"eval_steps_per_second": 0.704,
"step": 7998
},
{
"epoch": 62.01550387596899,
"grad_norm": 8.46688461303711,
"learning_rate": 0.000152,
"loss": 12.3515,
"step": 8000
},
{
"epoch": 62.7906976744186,
"grad_norm": 9.745466232299805,
"learning_rate": 0.00014890000000000001,
"loss": 12.2794,
"step": 8100
},
{
"epoch": 63.0,
"eval_dummy": 1.0,
"eval_loss": 37.55772399902344,
"eval_runtime": 18.0451,
"eval_samples_per_second": 2.66,
"eval_steps_per_second": 0.665,
"step": 8127
},
{
"epoch": 63.565891472868216,
"grad_norm": 7.328401565551758,
"learning_rate": 0.0001458,
"loss": 12.4147,
"step": 8200
},
{
"epoch": 64.0,
"eval_dummy": 1.0,
"eval_loss": 37.27333068847656,
"eval_runtime": 19.2621,
"eval_samples_per_second": 2.492,
"eval_steps_per_second": 0.623,
"step": 8256
},
{
"epoch": 64.34108527131782,
"grad_norm": 12.89833927154541,
"learning_rate": 0.0001427,
"loss": 12.0032,
"step": 8300
},
{
"epoch": 65.0,
"eval_dummy": 1.0,
"eval_loss": 35.3015022277832,
"eval_runtime": 17.8838,
"eval_samples_per_second": 2.684,
"eval_steps_per_second": 0.671,
"step": 8385
},
{
"epoch": 65.11627906976744,
"grad_norm": 15.308392524719238,
"learning_rate": 0.00013959999999999998,
"loss": 11.7392,
"step": 8400
},
{
"epoch": 65.89147286821705,
"grad_norm": 12.101038932800293,
"learning_rate": 0.00013650000000000004,
"loss": 12.2793,
"step": 8500
},
{
"epoch": 66.0,
"eval_dummy": 1.0,
"eval_loss": 35.280582427978516,
"eval_runtime": 17.6628,
"eval_samples_per_second": 2.718,
"eval_steps_per_second": 0.679,
"step": 8514
},
{
"epoch": 66.66666666666667,
"grad_norm": 12.754354476928711,
"learning_rate": 0.00013340000000000002,
"loss": 12.2309,
"step": 8600
},
{
"epoch": 67.0,
"eval_dummy": 1.0,
"eval_loss": 36.24875259399414,
"eval_runtime": 17.2522,
"eval_samples_per_second": 2.782,
"eval_steps_per_second": 0.696,
"step": 8643
},
{
"epoch": 67.44186046511628,
"grad_norm": 9.756113052368164,
"learning_rate": 0.00013030000000000002,
"loss": 11.7082,
"step": 8700
},
{
"epoch": 68.0,
"eval_dummy": 1.0,
"eval_loss": 35.66865158081055,
"eval_runtime": 18.2695,
"eval_samples_per_second": 2.627,
"eval_steps_per_second": 0.657,
"step": 8772
},
{
"epoch": 68.21705426356588,
"grad_norm": 9.372435569763184,
"learning_rate": 0.0001272,
"loss": 11.5136,
"step": 8800
},
{
"epoch": 68.9922480620155,
"grad_norm": 15.044282913208008,
"learning_rate": 0.0001241,
"loss": 11.8694,
"step": 8900
},
{
"epoch": 69.0,
"eval_dummy": 1.0,
"eval_loss": 36.04698944091797,
"eval_runtime": 17.3888,
"eval_samples_per_second": 2.76,
"eval_steps_per_second": 0.69,
"step": 8901
},
{
"epoch": 69.76744186046511,
"grad_norm": 9.250027656555176,
"learning_rate": 0.000121,
"loss": 11.782,
"step": 9000
},
{
"epoch": 70.0,
"eval_dummy": 1.0,
"eval_loss": 35.40549087524414,
"eval_runtime": 18.0194,
"eval_samples_per_second": 2.664,
"eval_steps_per_second": 0.666,
"step": 9030
},
{
"epoch": 70.54263565891473,
"grad_norm": 16.64485740661621,
"learning_rate": 0.00011789999999999999,
"loss": 11.6254,
"step": 9100
},
{
"epoch": 71.0,
"eval_dummy": 1.0,
"eval_loss": 36.70663070678711,
"eval_runtime": 17.3613,
"eval_samples_per_second": 2.765,
"eval_steps_per_second": 0.691,
"step": 9159
},
{
"epoch": 71.31782945736434,
"grad_norm": 15.693510055541992,
"learning_rate": 0.00011479999999999997,
"loss": 11.5873,
"step": 9200
},
{
"epoch": 72.0,
"eval_dummy": 1.0,
"eval_loss": 36.10844421386719,
"eval_runtime": 17.5839,
"eval_samples_per_second": 2.73,
"eval_steps_per_second": 0.682,
"step": 9288
},
{
"epoch": 72.09302325581395,
"grad_norm": 7.485771179199219,
"learning_rate": 0.00011170000000000003,
"loss": 11.6159,
"step": 9300
},
{
"epoch": 72.86821705426357,
"grad_norm": 15.41925048828125,
"learning_rate": 0.00010860000000000004,
"loss": 11.6251,
"step": 9400
},
{
"epoch": 73.0,
"eval_dummy": 1.0,
"eval_loss": 38.29316329956055,
"eval_runtime": 17.0634,
"eval_samples_per_second": 2.813,
"eval_steps_per_second": 0.703,
"step": 9417
},
{
"epoch": 73.64341085271317,
"grad_norm": 16.74988555908203,
"learning_rate": 0.00010550000000000002,
"loss": 11.4589,
"step": 9500
},
{
"epoch": 74.0,
"eval_dummy": 1.0,
"eval_loss": 36.55695724487305,
"eval_runtime": 17.9041,
"eval_samples_per_second": 2.681,
"eval_steps_per_second": 0.67,
"step": 9546
},
{
"epoch": 74.4186046511628,
"grad_norm": 146.4043426513672,
"learning_rate": 0.00010240000000000001,
"loss": 11.7378,
"step": 9600
},
{
"epoch": 75.0,
"eval_dummy": 1.0,
"eval_loss": 35.988651275634766,
"eval_runtime": 17.0167,
"eval_samples_per_second": 2.821,
"eval_steps_per_second": 0.705,
"step": 9675
},
{
"epoch": 75.1937984496124,
"grad_norm": 10.800848960876465,
"learning_rate": 9.93e-05,
"loss": 11.4043,
"step": 9700
},
{
"epoch": 75.96899224806202,
"grad_norm": 9.41010570526123,
"learning_rate": 9.62e-05,
"loss": 11.4933,
"step": 9800
},
{
"epoch": 76.0,
"eval_dummy": 1.0,
"eval_loss": 36.47134017944336,
"eval_runtime": 17.8569,
"eval_samples_per_second": 2.688,
"eval_steps_per_second": 0.672,
"step": 9804
},
{
"epoch": 76.74418604651163,
"grad_norm": 19.61960220336914,
"learning_rate": 9.31e-05,
"loss": 11.2566,
"step": 9900
},
{
"epoch": 77.0,
"eval_dummy": 1.0,
"eval_loss": 36.96221923828125,
"eval_runtime": 17.0325,
"eval_samples_per_second": 2.818,
"eval_steps_per_second": 0.705,
"step": 9933
},
{
"epoch": 77.51937984496124,
"grad_norm": 9.528326034545898,
"learning_rate": 8.999999999999999e-05,
"loss": 11.25,
"step": 10000
},
{
"epoch": 78.0,
"eval_dummy": 1.0,
"eval_loss": 37.10159683227539,
"eval_runtime": 18.225,
"eval_samples_per_second": 2.634,
"eval_steps_per_second": 0.658,
"step": 10062
},
{
"epoch": 78.29457364341086,
"grad_norm": 8.064875602722168,
"learning_rate": 8.690000000000003e-05,
"loss": 11.2962,
"step": 10100
},
{
"epoch": 79.0,
"eval_dummy": 1.0,
"eval_loss": 37.87105178833008,
"eval_runtime": 16.9948,
"eval_samples_per_second": 2.824,
"eval_steps_per_second": 0.706,
"step": 10191
},
{
"epoch": 79.06976744186046,
"grad_norm": 10.451505661010742,
"learning_rate": 8.380000000000002e-05,
"loss": 11.1642,
"step": 10200
},
{
"epoch": 79.84496124031008,
"grad_norm": 7.318461894989014,
"learning_rate": 8.070000000000001e-05,
"loss": 11.0868,
"step": 10300
},
{
"epoch": 80.0,
"eval_dummy": 1.0,
"eval_loss": 38.571414947509766,
"eval_runtime": 18.0923,
"eval_samples_per_second": 2.653,
"eval_steps_per_second": 0.663,
"step": 10320
},
{
"epoch": 80.62015503875969,
"grad_norm": 10.301888465881348,
"learning_rate": 7.760000000000002e-05,
"loss": 11.2786,
"step": 10400
},
{
"epoch": 81.0,
"eval_dummy": 1.0,
"eval_loss": 38.1493034362793,
"eval_runtime": 17.0167,
"eval_samples_per_second": 2.821,
"eval_steps_per_second": 0.705,
"step": 10449
},
{
"epoch": 81.3953488372093,
"grad_norm": 8.667201042175293,
"learning_rate": 7.450000000000001e-05,
"loss": 11.1528,
"step": 10500
},
{
"epoch": 82.0,
"eval_dummy": 1.0,
"eval_loss": 39.0099983215332,
"eval_runtime": 17.9494,
"eval_samples_per_second": 2.674,
"eval_steps_per_second": 0.669,
"step": 10578
},
{
"epoch": 82.17054263565892,
"grad_norm": 5.117663860321045,
"learning_rate": 7.14e-05,
"loss": 10.9299,
"step": 10600
},
{
"epoch": 82.94573643410853,
"grad_norm": 5.9621806144714355,
"learning_rate": 6.829999999999999e-05,
"loss": 11.089,
"step": 10700
},
{
"epoch": 83.0,
"eval_dummy": 1.0,
"eval_loss": 38.5473518371582,
"eval_runtime": 17.0039,
"eval_samples_per_second": 2.823,
"eval_steps_per_second": 0.706,
"step": 10707
},
{
"epoch": 83.72093023255815,
"grad_norm": 6.17501974105835,
"learning_rate": 6.519999999999999e-05,
"loss": 10.954,
"step": 10800
},
{
"epoch": 84.0,
"eval_dummy": 1.0,
"eval_loss": 38.940486907958984,
"eval_runtime": 17.721,
"eval_samples_per_second": 2.709,
"eval_steps_per_second": 0.677,
"step": 10836
},
{
"epoch": 84.49612403100775,
"grad_norm": 31.69377326965332,
"learning_rate": 6.210000000000003e-05,
"loss": 11.0157,
"step": 10900
},
{
"epoch": 85.0,
"eval_dummy": 1.0,
"eval_loss": 39.3872184753418,
"eval_runtime": 16.9062,
"eval_samples_per_second": 2.839,
"eval_steps_per_second": 0.71,
"step": 10965
},
{
"epoch": 85.27131782945736,
"grad_norm": 7.1002984046936035,
"learning_rate": 5.9000000000000025e-05,
"loss": 10.9849,
"step": 11000
},
{
"epoch": 86.0,
"eval_dummy": 1.0,
"eval_loss": 39.4875373840332,
"eval_runtime": 17.7347,
"eval_samples_per_second": 2.707,
"eval_steps_per_second": 0.677,
"step": 11094
},
{
"epoch": 86.04651162790698,
"grad_norm": 13.370129585266113,
"learning_rate": 5.590000000000002e-05,
"loss": 11.0614,
"step": 11100
},
{
"epoch": 86.82170542635659,
"grad_norm": 5.192051887512207,
"learning_rate": 5.28e-05,
"loss": 10.5423,
"step": 11200
},
{
"epoch": 87.0,
"eval_dummy": 1.0,
"eval_loss": 39.11787796020508,
"eval_runtime": 16.9675,
"eval_samples_per_second": 2.829,
"eval_steps_per_second": 0.707,
"step": 11223
},
{
"epoch": 87.59689922480621,
"grad_norm": 5.747579097747803,
"learning_rate": 4.97e-05,
"loss": 11.1968,
"step": 11300
},
{
"epoch": 88.0,
"eval_dummy": 1.0,
"eval_loss": 39.4084358215332,
"eval_runtime": 17.9374,
"eval_samples_per_second": 2.676,
"eval_steps_per_second": 0.669,
"step": 11352
},
{
"epoch": 88.37209302325581,
"grad_norm": 11.57238483428955,
"learning_rate": 4.66e-05,
"loss": 10.6376,
"step": 11400
},
{
"epoch": 89.0,
"eval_dummy": 1.0,
"eval_loss": 39.82176971435547,
"eval_runtime": 16.9422,
"eval_samples_per_second": 2.833,
"eval_steps_per_second": 0.708,
"step": 11481
},
{
"epoch": 89.14728682170542,
"grad_norm": 8.44890308380127,
"learning_rate": 4.3499999999999993e-05,
"loss": 10.8035,
"step": 11500
},
{
"epoch": 89.92248062015504,
"grad_norm": 7.732810974121094,
"learning_rate": 4.0399999999999986e-05,
"loss": 10.7131,
"step": 11600
},
{
"epoch": 90.0,
"eval_dummy": 1.0,
"eval_loss": 39.25526428222656,
"eval_runtime": 18.1455,
"eval_samples_per_second": 2.645,
"eval_steps_per_second": 0.661,
"step": 11610
},
{
"epoch": 90.69767441860465,
"grad_norm": 6.144818305969238,
"learning_rate": 3.7300000000000026e-05,
"loss": 10.8252,
"step": 11700
},
{
"epoch": 91.0,
"eval_dummy": 1.0,
"eval_loss": 39.136837005615234,
"eval_runtime": 16.9741,
"eval_samples_per_second": 2.828,
"eval_steps_per_second": 0.707,
"step": 11739
},
{
"epoch": 91.47286821705427,
"grad_norm": 4.7243475914001465,
"learning_rate": 3.420000000000002e-05,
"loss": 10.6456,
"step": 11800
},
{
"epoch": 92.0,
"eval_dummy": 1.0,
"eval_loss": 38.91936111450195,
"eval_runtime": 17.9915,
"eval_samples_per_second": 2.668,
"eval_steps_per_second": 0.667,
"step": 11868
},
{
"epoch": 92.24806201550388,
"grad_norm": 13.374404907226562,
"learning_rate": 3.110000000000002e-05,
"loss": 10.8488,
"step": 11900
},
{
"epoch": 93.0,
"eval_dummy": 1.0,
"eval_loss": 39.595462799072266,
"eval_runtime": 16.9478,
"eval_samples_per_second": 2.832,
"eval_steps_per_second": 0.708,
"step": 11997
},
{
"epoch": 93.02325581395348,
"grad_norm": 9.406952857971191,
"learning_rate": 2.8000000000000003e-05,
"loss": 10.5219,
"step": 12000
},
{
"epoch": 93.7984496124031,
"grad_norm": 5.360720634460449,
"learning_rate": 2.49e-05,
"loss": 10.8675,
"step": 12100
},
{
"epoch": 94.0,
"eval_dummy": 1.0,
"eval_loss": 39.47597885131836,
"eval_runtime": 17.7005,
"eval_samples_per_second": 2.712,
"eval_steps_per_second": 0.678,
"step": 12126
},
{
"epoch": 94.57364341085271,
"grad_norm": 3.855013132095337,
"learning_rate": 2.1799999999999995e-05,
"loss": 10.4757,
"step": 12200
},
{
"epoch": 95.0,
"eval_dummy": 1.0,
"eval_loss": 40.484397888183594,
"eval_runtime": 16.9799,
"eval_samples_per_second": 2.827,
"eval_steps_per_second": 0.707,
"step": 12255
},
{
"epoch": 95.34883720930233,
"grad_norm": 7.718498229980469,
"learning_rate": 1.8699999999999987e-05,
"loss": 10.3191,
"step": 12300
},
{
"epoch": 96.0,
"eval_dummy": 1.0,
"eval_loss": 39.06733322143555,
"eval_runtime": 17.7268,
"eval_samples_per_second": 2.708,
"eval_steps_per_second": 0.677,
"step": 12384
},
{
"epoch": 96.12403100775194,
"grad_norm": 3.9156548976898193,
"learning_rate": 1.5599999999999983e-05,
"loss": 10.6169,
"step": 12400
},
{
"epoch": 96.89922480620154,
"grad_norm": 3.464470148086548,
"learning_rate": 1.2500000000000023e-05,
"loss": 10.6073,
"step": 12500
},
{
"epoch": 97.0,
"eval_dummy": 1.0,
"eval_loss": 39.37672805786133,
"eval_runtime": 16.9277,
"eval_samples_per_second": 2.836,
"eval_steps_per_second": 0.709,
"step": 12513
},
{
"epoch": 97.67441860465117,
"grad_norm": 5.994311332702637,
"learning_rate": 9.400000000000018e-06,
"loss": 10.3038,
"step": 12600
},
{
"epoch": 98.0,
"eval_dummy": 1.0,
"eval_loss": 39.69685745239258,
"eval_runtime": 18.0251,
"eval_samples_per_second": 2.663,
"eval_steps_per_second": 0.666,
"step": 12642
},
{
"epoch": 98.44961240310077,
"grad_norm": 4.125715732574463,
"learning_rate": 6.300000000000012e-06,
"loss": 11.0709,
"step": 12700
},
{
"epoch": 99.0,
"eval_dummy": 1.0,
"eval_loss": 39.93254470825195,
"eval_runtime": 17.9269,
"eval_samples_per_second": 2.678,
"eval_steps_per_second": 0.669,
"step": 12771
},
{
"epoch": 99.2248062015504,
"grad_norm": 4.381137371063232,
"learning_rate": 3.200000000000005e-06,
"loss": 10.2398,
"step": 12800
},
{
"epoch": 100.0,
"grad_norm": 4.022866249084473,
"learning_rate": 9.999999999999998e-08,
"loss": 10.5951,
"step": 12900
},
{
"epoch": 100.0,
"eval_dummy": 1.0,
"eval_loss": 39.87546157836914,
"eval_runtime": 17.8005,
"eval_samples_per_second": 2.697,
"eval_steps_per_second": 0.674,
"step": 12900
},
{
"epoch": 100.0,
"step": 12900,
"total_flos": 1.4631500418239693e+19,
"train_loss": 16.309644344832545,
"train_runtime": 27199.2721,
"train_samples_per_second": 1.897,
"train_steps_per_second": 0.474
}
],
"logging_steps": 100,
"max_steps": 12900,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4631500418239693e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}