{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3120,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016025641025641025,
"grad_norm": 5.729795932769775,
"learning_rate": 4.25531914893617e-08,
"loss": 1.033,
"step": 5
},
{
"epoch": 0.003205128205128205,
"grad_norm": 9.286614418029785,
"learning_rate": 9.574468085106382e-08,
"loss": 1.0218,
"step": 10
},
{
"epoch": 0.004807692307692308,
"grad_norm": 13.221362113952637,
"learning_rate": 1.4893617021276595e-07,
"loss": 1.065,
"step": 15
},
{
"epoch": 0.00641025641025641,
"grad_norm": 11.769225120544434,
"learning_rate": 2.0212765957446807e-07,
"loss": 1.0354,
"step": 20
},
{
"epoch": 0.008012820512820512,
"grad_norm": 14.483790397644043,
"learning_rate": 2.5531914893617016e-07,
"loss": 1.0337,
"step": 25
},
{
"epoch": 0.009615384615384616,
"grad_norm": 7.531055450439453,
"learning_rate": 3.085106382978723e-07,
"loss": 1.0472,
"step": 30
},
{
"epoch": 0.011217948717948718,
"grad_norm": 6.638299942016602,
"learning_rate": 3.617021276595745e-07,
"loss": 0.9963,
"step": 35
},
{
"epoch": 0.01282051282051282,
"grad_norm": 16.26578140258789,
"learning_rate": 4.148936170212766e-07,
"loss": 1.0217,
"step": 40
},
{
"epoch": 0.014423076923076924,
"grad_norm": 17.74137306213379,
"learning_rate": 4.6808510638297873e-07,
"loss": 1.0535,
"step": 45
},
{
"epoch": 0.016025641025641024,
"grad_norm": 12.711201667785645,
"learning_rate": 5.212765957446809e-07,
"loss": 0.8812,
"step": 50
},
{
"epoch": 0.017628205128205128,
"grad_norm": 16.832317352294922,
"learning_rate": 5.74468085106383e-07,
"loss": 1.0726,
"step": 55
},
{
"epoch": 0.019230769230769232,
"grad_norm": 9.954739570617676,
"learning_rate": 6.276595744680851e-07,
"loss": 1.0394,
"step": 60
},
{
"epoch": 0.020833333333333332,
"grad_norm": 15.11540699005127,
"learning_rate": 6.808510638297872e-07,
"loss": 0.928,
"step": 65
},
{
"epoch": 0.022435897435897436,
"grad_norm": 12.173163414001465,
"learning_rate": 7.340425531914893e-07,
"loss": 1.1422,
"step": 70
},
{
"epoch": 0.02403846153846154,
"grad_norm": 4.052046775817871,
"learning_rate": 7.872340425531915e-07,
"loss": 1.0003,
"step": 75
},
{
"epoch": 0.02564102564102564,
"grad_norm": 8.027369499206543,
"learning_rate": 8.404255319148936e-07,
"loss": 0.8759,
"step": 80
},
{
"epoch": 0.027243589743589744,
"grad_norm": 12.160286903381348,
"learning_rate": 8.936170212765957e-07,
"loss": 1.1069,
"step": 85
},
{
"epoch": 0.028846153846153848,
"grad_norm": 10.5565185546875,
"learning_rate": 9.468085106382978e-07,
"loss": 1.0333,
"step": 90
},
{
"epoch": 0.030448717948717948,
"grad_norm": 7.140777111053467,
"learning_rate": 1e-06,
"loss": 0.9822,
"step": 95
},
{
"epoch": 0.03205128205128205,
"grad_norm": 10.643312454223633,
"learning_rate": 9.983476536682088e-07,
"loss": 0.9494,
"step": 100
},
{
"epoch": 0.03365384615384615,
"grad_norm": 8.473272323608398,
"learning_rate": 9.966953073364177e-07,
"loss": 0.945,
"step": 105
},
{
"epoch": 0.035256410256410256,
"grad_norm": 4.137317180633545,
"learning_rate": 9.950429610046264e-07,
"loss": 1.0005,
"step": 110
},
{
"epoch": 0.03685897435897436,
"grad_norm": 7.508554458618164,
"learning_rate": 9.933906146728353e-07,
"loss": 0.8444,
"step": 115
},
{
"epoch": 0.038461538461538464,
"grad_norm": 6.77846097946167,
"learning_rate": 9.917382683410441e-07,
"loss": 0.9563,
"step": 120
},
{
"epoch": 0.04006410256410257,
"grad_norm": 9.547110557556152,
"learning_rate": 9.90085922009253e-07,
"loss": 0.9827,
"step": 125
},
{
"epoch": 0.041666666666666664,
"grad_norm": 13.068811416625977,
"learning_rate": 9.884335756774619e-07,
"loss": 0.8061,
"step": 130
},
{
"epoch": 0.04326923076923077,
"grad_norm": 8.728320121765137,
"learning_rate": 9.867812293456708e-07,
"loss": 0.888,
"step": 135
},
{
"epoch": 0.04487179487179487,
"grad_norm": 3.790550470352173,
"learning_rate": 9.851288830138796e-07,
"loss": 0.8734,
"step": 140
},
{
"epoch": 0.046474358974358976,
"grad_norm": 14.830401420593262,
"learning_rate": 9.834765366820885e-07,
"loss": 0.9757,
"step": 145
},
{
"epoch": 0.04807692307692308,
"grad_norm": 7.318249702453613,
"learning_rate": 9.818241903502974e-07,
"loss": 0.9218,
"step": 150
},
{
"epoch": 0.049679487179487176,
"grad_norm": 3.7414937019348145,
"learning_rate": 9.801718440185063e-07,
"loss": 0.9611,
"step": 155
},
{
"epoch": 0.05128205128205128,
"grad_norm": 6.792606353759766,
"learning_rate": 9.78519497686715e-07,
"loss": 0.9476,
"step": 160
},
{
"epoch": 0.052884615384615384,
"grad_norm": 4.267696380615234,
"learning_rate": 9.768671513549238e-07,
"loss": 0.8234,
"step": 165
},
{
"epoch": 0.05448717948717949,
"grad_norm": 5.2466959953308105,
"learning_rate": 9.752148050231327e-07,
"loss": 0.8669,
"step": 170
},
{
"epoch": 0.05608974358974359,
"grad_norm": 11.836358070373535,
"learning_rate": 9.735624586913416e-07,
"loss": 0.7708,
"step": 175
},
{
"epoch": 0.057692307692307696,
"grad_norm": 5.974247932434082,
"learning_rate": 9.719101123595505e-07,
"loss": 0.8937,
"step": 180
},
{
"epoch": 0.05929487179487179,
"grad_norm": 3.665184497833252,
"learning_rate": 9.702577660277593e-07,
"loss": 0.875,
"step": 185
},
{
"epoch": 0.060897435897435896,
"grad_norm": 4.605494022369385,
"learning_rate": 9.686054196959682e-07,
"loss": 0.7801,
"step": 190
},
{
"epoch": 0.0625,
"grad_norm": 6.7438836097717285,
"learning_rate": 9.66953073364177e-07,
"loss": 0.8834,
"step": 195
},
{
"epoch": 0.0641025641025641,
"grad_norm": 10.840106010437012,
"learning_rate": 9.65300727032386e-07,
"loss": 0.9003,
"step": 200
},
{
"epoch": 0.06570512820512821,
"grad_norm": 5.037222385406494,
"learning_rate": 9.636483807005949e-07,
"loss": 0.7513,
"step": 205
},
{
"epoch": 0.0673076923076923,
"grad_norm": 3.903541326522827,
"learning_rate": 9.619960343688035e-07,
"loss": 0.8828,
"step": 210
},
{
"epoch": 0.06891025641025642,
"grad_norm": 6.461961269378662,
"learning_rate": 9.603436880370124e-07,
"loss": 0.7606,
"step": 215
},
{
"epoch": 0.07051282051282051,
"grad_norm": 6.433114528656006,
"learning_rate": 9.586913417052213e-07,
"loss": 0.8829,
"step": 220
},
{
"epoch": 0.07211538461538461,
"grad_norm": 6.478908538818359,
"learning_rate": 9.570389953734302e-07,
"loss": 0.86,
"step": 225
},
{
"epoch": 0.07371794871794872,
"grad_norm": 5.247589588165283,
"learning_rate": 9.55386649041639e-07,
"loss": 0.8562,
"step": 230
},
{
"epoch": 0.07532051282051282,
"grad_norm": 8.098102569580078,
"learning_rate": 9.53734302709848e-07,
"loss": 0.974,
"step": 235
},
{
"epoch": 0.07692307692307693,
"grad_norm": 10.06252670288086,
"learning_rate": 9.520819563780568e-07,
"loss": 0.9281,
"step": 240
},
{
"epoch": 0.07852564102564102,
"grad_norm": 3.637204885482788,
"learning_rate": 9.504296100462657e-07,
"loss": 0.8829,
"step": 245
},
{
"epoch": 0.08012820512820513,
"grad_norm": 4.503812313079834,
"learning_rate": 9.487772637144745e-07,
"loss": 0.7286,
"step": 250
},
{
"epoch": 0.08173076923076923,
"grad_norm": 8.717390060424805,
"learning_rate": 9.471249173826834e-07,
"loss": 0.6812,
"step": 255
},
{
"epoch": 0.08333333333333333,
"grad_norm": 6.7273640632629395,
"learning_rate": 9.454725710508922e-07,
"loss": 0.808,
"step": 260
},
{
"epoch": 0.08493589743589744,
"grad_norm": 4.702677249908447,
"learning_rate": 9.438202247191011e-07,
"loss": 0.9192,
"step": 265
},
{
"epoch": 0.08653846153846154,
"grad_norm": 4.1625285148620605,
"learning_rate": 9.4216787838731e-07,
"loss": 0.7835,
"step": 270
},
{
"epoch": 0.08814102564102565,
"grad_norm": 5.688870906829834,
"learning_rate": 9.405155320555188e-07,
"loss": 0.8823,
"step": 275
},
{
"epoch": 0.08974358974358974,
"grad_norm": 9.040973663330078,
"learning_rate": 9.388631857237277e-07,
"loss": 0.9733,
"step": 280
},
{
"epoch": 0.09134615384615384,
"grad_norm": 4.173698902130127,
"learning_rate": 9.372108393919365e-07,
"loss": 0.7514,
"step": 285
},
{
"epoch": 0.09294871794871795,
"grad_norm": 7.822443962097168,
"learning_rate": 9.355584930601454e-07,
"loss": 0.8867,
"step": 290
},
{
"epoch": 0.09455128205128205,
"grad_norm": 8.641590118408203,
"learning_rate": 9.339061467283542e-07,
"loss": 0.833,
"step": 295
},
{
"epoch": 0.09615384615384616,
"grad_norm": 4.389246463775635,
"learning_rate": 9.322538003965631e-07,
"loss": 0.892,
"step": 300
},
{
"epoch": 0.09775641025641026,
"grad_norm": 4.615504741668701,
"learning_rate": 9.30601454064772e-07,
"loss": 0.8854,
"step": 305
},
{
"epoch": 0.09935897435897435,
"grad_norm": 7.86992073059082,
"learning_rate": 9.289491077329808e-07,
"loss": 0.8405,
"step": 310
},
{
"epoch": 0.10096153846153846,
"grad_norm": 7.31835412979126,
"learning_rate": 9.272967614011896e-07,
"loss": 0.8817,
"step": 315
},
{
"epoch": 0.10256410256410256,
"grad_norm": 3.403594970703125,
"learning_rate": 9.256444150693985e-07,
"loss": 0.8149,
"step": 320
},
{
"epoch": 0.10416666666666667,
"grad_norm": 7.1932806968688965,
"learning_rate": 9.239920687376074e-07,
"loss": 0.9139,
"step": 325
},
{
"epoch": 0.10576923076923077,
"grad_norm": 5.652829170227051,
"learning_rate": 9.223397224058163e-07,
"loss": 0.8335,
"step": 330
},
{
"epoch": 0.10737179487179487,
"grad_norm": 4.582092761993408,
"learning_rate": 9.20687376074025e-07,
"loss": 0.8175,
"step": 335
},
{
"epoch": 0.10897435897435898,
"grad_norm": 6.207703113555908,
"learning_rate": 9.190350297422339e-07,
"loss": 0.8367,
"step": 340
},
{
"epoch": 0.11057692307692307,
"grad_norm": 7.341710567474365,
"learning_rate": 9.173826834104428e-07,
"loss": 0.838,
"step": 345
},
{
"epoch": 0.11217948717948718,
"grad_norm": 4.263551712036133,
"learning_rate": 9.157303370786517e-07,
"loss": 0.8722,
"step": 350
},
{
"epoch": 0.11378205128205128,
"grad_norm": 5.9049601554870605,
"learning_rate": 9.140779907468606e-07,
"loss": 0.8158,
"step": 355
},
{
"epoch": 0.11538461538461539,
"grad_norm": 6.010617256164551,
"learning_rate": 9.124256444150693e-07,
"loss": 0.7757,
"step": 360
},
{
"epoch": 0.11698717948717949,
"grad_norm": 5.599278450012207,
"learning_rate": 9.107732980832782e-07,
"loss": 0.8487,
"step": 365
},
{
"epoch": 0.11858974358974358,
"grad_norm": 6.303196907043457,
"learning_rate": 9.091209517514871e-07,
"loss": 0.8727,
"step": 370
},
{
"epoch": 0.1201923076923077,
"grad_norm": 5.943972110748291,
"learning_rate": 9.07468605419696e-07,
"loss": 0.7266,
"step": 375
},
{
"epoch": 0.12179487179487179,
"grad_norm": 10.433466911315918,
"learning_rate": 9.058162590879048e-07,
"loss": 0.8264,
"step": 380
},
{
"epoch": 0.1233974358974359,
"grad_norm": 6.700842380523682,
"learning_rate": 9.041639127561136e-07,
"loss": 0.9768,
"step": 385
},
{
"epoch": 0.125,
"grad_norm": 10.210798263549805,
"learning_rate": 9.025115664243225e-07,
"loss": 0.8168,
"step": 390
},
{
"epoch": 0.1266025641025641,
"grad_norm": 4.839009761810303,
"learning_rate": 9.008592200925314e-07,
"loss": 0.8856,
"step": 395
},
{
"epoch": 0.1282051282051282,
"grad_norm": 8.077885627746582,
"learning_rate": 8.992068737607403e-07,
"loss": 0.9729,
"step": 400
},
{
"epoch": 0.12980769230769232,
"grad_norm": 8.734336853027344,
"learning_rate": 8.975545274289491e-07,
"loss": 0.9824,
"step": 405
},
{
"epoch": 0.13141025641025642,
"grad_norm": 5.260401725769043,
"learning_rate": 8.959021810971579e-07,
"loss": 0.8476,
"step": 410
},
{
"epoch": 0.1330128205128205,
"grad_norm": 5.269688129425049,
"learning_rate": 8.942498347653668e-07,
"loss": 0.8591,
"step": 415
},
{
"epoch": 0.1346153846153846,
"grad_norm": 4.150247097015381,
"learning_rate": 8.925974884335757e-07,
"loss": 0.8461,
"step": 420
},
{
"epoch": 0.1362179487179487,
"grad_norm": 4.139176845550537,
"learning_rate": 8.909451421017845e-07,
"loss": 0.9335,
"step": 425
},
{
"epoch": 0.13782051282051283,
"grad_norm": 7.5222554206848145,
"learning_rate": 8.892927957699934e-07,
"loss": 0.7143,
"step": 430
},
{
"epoch": 0.13942307692307693,
"grad_norm": 12.695758819580078,
"learning_rate": 8.876404494382022e-07,
"loss": 0.8184,
"step": 435
},
{
"epoch": 0.14102564102564102,
"grad_norm": 8.057138442993164,
"learning_rate": 8.859881031064111e-07,
"loss": 0.9017,
"step": 440
},
{
"epoch": 0.14262820512820512,
"grad_norm": 8.482138633728027,
"learning_rate": 8.843357567746199e-07,
"loss": 0.9694,
"step": 445
},
{
"epoch": 0.14423076923076922,
"grad_norm": 12.769122123718262,
"learning_rate": 8.826834104428288e-07,
"loss": 0.8384,
"step": 450
},
{
"epoch": 0.14583333333333334,
"grad_norm": 5.045727252960205,
"learning_rate": 8.810310641110377e-07,
"loss": 0.8156,
"step": 455
},
{
"epoch": 0.14743589743589744,
"grad_norm": 9.09874153137207,
"learning_rate": 8.793787177792465e-07,
"loss": 0.8116,
"step": 460
},
{
"epoch": 0.14903846153846154,
"grad_norm": 6.691732883453369,
"learning_rate": 8.777263714474553e-07,
"loss": 0.8814,
"step": 465
},
{
"epoch": 0.15064102564102563,
"grad_norm": 5.676293849945068,
"learning_rate": 8.760740251156642e-07,
"loss": 0.8186,
"step": 470
},
{
"epoch": 0.15224358974358973,
"grad_norm": 8.919610977172852,
"learning_rate": 8.744216787838731e-07,
"loss": 0.7442,
"step": 475
},
{
"epoch": 0.15384615384615385,
"grad_norm": 4.288793087005615,
"learning_rate": 8.72769332452082e-07,
"loss": 0.8538,
"step": 480
},
{
"epoch": 0.15544871794871795,
"grad_norm": 8.457489013671875,
"learning_rate": 8.711169861202908e-07,
"loss": 0.8284,
"step": 485
},
{
"epoch": 0.15705128205128205,
"grad_norm": 8.613219261169434,
"learning_rate": 8.694646397884996e-07,
"loss": 0.8465,
"step": 490
},
{
"epoch": 0.15865384615384615,
"grad_norm": 5.168330192565918,
"learning_rate": 8.678122934567085e-07,
"loss": 0.854,
"step": 495
},
{
"epoch": 0.16025641025641027,
"grad_norm": 6.283329010009766,
"learning_rate": 8.661599471249174e-07,
"loss": 0.9902,
"step": 500
},
{
"epoch": 0.16185897435897437,
"grad_norm": 8.224679946899414,
"learning_rate": 8.645076007931263e-07,
"loss": 0.9261,
"step": 505
},
{
"epoch": 0.16346153846153846,
"grad_norm": 3.9687061309814453,
"learning_rate": 8.62855254461335e-07,
"loss": 0.8671,
"step": 510
},
{
"epoch": 0.16506410256410256,
"grad_norm": 3.925053358078003,
"learning_rate": 8.612029081295439e-07,
"loss": 0.6827,
"step": 515
},
{
"epoch": 0.16666666666666666,
"grad_norm": 4.103531837463379,
"learning_rate": 8.595505617977528e-07,
"loss": 0.9075,
"step": 520
},
{
"epoch": 0.16826923076923078,
"grad_norm": 4.411681175231934,
"learning_rate": 8.578982154659617e-07,
"loss": 0.7698,
"step": 525
},
{
"epoch": 0.16987179487179488,
"grad_norm": 8.91723346710205,
"learning_rate": 8.562458691341706e-07,
"loss": 0.9759,
"step": 530
},
{
"epoch": 0.17147435897435898,
"grad_norm": 3.293285846710205,
"learning_rate": 8.545935228023793e-07,
"loss": 0.7131,
"step": 535
},
{
"epoch": 0.17307692307692307,
"grad_norm": 4.500021934509277,
"learning_rate": 8.529411764705882e-07,
"loss": 0.6866,
"step": 540
},
{
"epoch": 0.17467948717948717,
"grad_norm": 3.7127466201782227,
"learning_rate": 8.512888301387971e-07,
"loss": 0.7112,
"step": 545
},
{
"epoch": 0.1762820512820513,
"grad_norm": 5.5667877197265625,
"learning_rate": 8.49636483807006e-07,
"loss": 0.7516,
"step": 550
},
{
"epoch": 0.1778846153846154,
"grad_norm": 4.206048965454102,
"learning_rate": 8.479841374752148e-07,
"loss": 0.8639,
"step": 555
},
{
"epoch": 0.1794871794871795,
"grad_norm": 3.593855857849121,
"learning_rate": 8.463317911434236e-07,
"loss": 0.7413,
"step": 560
},
{
"epoch": 0.18108974358974358,
"grad_norm": 9.683537483215332,
"learning_rate": 8.446794448116325e-07,
"loss": 0.9477,
"step": 565
},
{
"epoch": 0.18269230769230768,
"grad_norm": 5.113137245178223,
"learning_rate": 8.430270984798414e-07,
"loss": 0.8425,
"step": 570
},
{
"epoch": 0.1842948717948718,
"grad_norm": 10.013446807861328,
"learning_rate": 8.413747521480502e-07,
"loss": 0.9511,
"step": 575
},
{
"epoch": 0.1858974358974359,
"grad_norm": 7.936026573181152,
"learning_rate": 8.397224058162591e-07,
"loss": 0.8367,
"step": 580
},
{
"epoch": 0.1875,
"grad_norm": 4.949577331542969,
"learning_rate": 8.38070059484468e-07,
"loss": 0.7833,
"step": 585
},
{
"epoch": 0.1891025641025641,
"grad_norm": 5.491623878479004,
"learning_rate": 8.364177131526768e-07,
"loss": 0.7967,
"step": 590
},
{
"epoch": 0.1907051282051282,
"grad_norm": 9.594220161437988,
"learning_rate": 8.347653668208857e-07,
"loss": 0.8505,
"step": 595
},
{
"epoch": 0.19230769230769232,
"grad_norm": 6.291924476623535,
"learning_rate": 8.331130204890945e-07,
"loss": 0.7231,
"step": 600
},
{
"epoch": 0.19391025641025642,
"grad_norm": 5.185746192932129,
"learning_rate": 8.314606741573034e-07,
"loss": 0.8033,
"step": 605
},
{
"epoch": 0.1955128205128205,
"grad_norm": 9.937252044677734,
"learning_rate": 8.298083278255123e-07,
"loss": 0.8159,
"step": 610
},
{
"epoch": 0.1971153846153846,
"grad_norm": 3.5764591693878174,
"learning_rate": 8.281559814937211e-07,
"loss": 0.9405,
"step": 615
},
{
"epoch": 0.1987179487179487,
"grad_norm": 4.1528496742248535,
"learning_rate": 8.265036351619299e-07,
"loss": 0.7852,
"step": 620
},
{
"epoch": 0.20032051282051283,
"grad_norm": 4.072427272796631,
"learning_rate": 8.248512888301388e-07,
"loss": 0.7844,
"step": 625
},
{
"epoch": 0.20192307692307693,
"grad_norm": 8.563277244567871,
"learning_rate": 8.231989424983477e-07,
"loss": 0.8309,
"step": 630
},
{
"epoch": 0.20352564102564102,
"grad_norm": 6.037329196929932,
"learning_rate": 8.215465961665566e-07,
"loss": 0.782,
"step": 635
},
{
"epoch": 0.20512820512820512,
"grad_norm": 5.000993728637695,
"learning_rate": 8.198942498347653e-07,
"loss": 0.9419,
"step": 640
},
{
"epoch": 0.20673076923076922,
"grad_norm": 4.175522327423096,
"learning_rate": 8.182419035029742e-07,
"loss": 0.8316,
"step": 645
},
{
"epoch": 0.20833333333333334,
"grad_norm": 5.075506210327148,
"learning_rate": 8.165895571711831e-07,
"loss": 0.8471,
"step": 650
},
{
"epoch": 0.20993589743589744,
"grad_norm": 5.188806533813477,
"learning_rate": 8.14937210839392e-07,
"loss": 0.8379,
"step": 655
},
{
"epoch": 0.21153846153846154,
"grad_norm": 6.2080078125,
"learning_rate": 8.132848645076009e-07,
"loss": 0.9081,
"step": 660
},
{
"epoch": 0.21314102564102563,
"grad_norm": 4.525467395782471,
"learning_rate": 8.116325181758096e-07,
"loss": 0.8066,
"step": 665
},
{
"epoch": 0.21474358974358973,
"grad_norm": 5.5678582191467285,
"learning_rate": 8.099801718440185e-07,
"loss": 0.7192,
"step": 670
},
{
"epoch": 0.21634615384615385,
"grad_norm": 6.47728157043457,
"learning_rate": 8.083278255122274e-07,
"loss": 0.7436,
"step": 675
},
{
"epoch": 0.21794871794871795,
"grad_norm": 4.739030838012695,
"learning_rate": 8.066754791804363e-07,
"loss": 0.8783,
"step": 680
},
{
"epoch": 0.21955128205128205,
"grad_norm": 6.747486591339111,
"learning_rate": 8.050231328486451e-07,
"loss": 0.8484,
"step": 685
},
{
"epoch": 0.22115384615384615,
"grad_norm": 6.090416431427002,
"learning_rate": 8.033707865168539e-07,
"loss": 0.8766,
"step": 690
},
{
"epoch": 0.22275641025641027,
"grad_norm": 5.005781650543213,
"learning_rate": 8.017184401850628e-07,
"loss": 0.8299,
"step": 695
},
{
"epoch": 0.22435897435897437,
"grad_norm": 5.198122024536133,
"learning_rate": 8.000660938532717e-07,
"loss": 0.8107,
"step": 700
},
{
"epoch": 0.22596153846153846,
"grad_norm": 5.170607089996338,
"learning_rate": 7.984137475214805e-07,
"loss": 0.8751,
"step": 705
},
{
"epoch": 0.22756410256410256,
"grad_norm": 4.371824741363525,
"learning_rate": 7.967614011896894e-07,
"loss": 0.8545,
"step": 710
},
{
"epoch": 0.22916666666666666,
"grad_norm": 7.1865363121032715,
"learning_rate": 7.951090548578981e-07,
"loss": 0.8519,
"step": 715
},
{
"epoch": 0.23076923076923078,
"grad_norm": 11.179749488830566,
"learning_rate": 7.93456708526107e-07,
"loss": 0.7942,
"step": 720
},
{
"epoch": 0.23237179487179488,
"grad_norm": 8.086874008178711,
"learning_rate": 7.91804362194316e-07,
"loss": 0.8385,
"step": 725
},
{
"epoch": 0.23397435897435898,
"grad_norm": 5.28953218460083,
"learning_rate": 7.901520158625248e-07,
"loss": 0.9464,
"step": 730
},
{
"epoch": 0.23557692307692307,
"grad_norm": 5.9961018562316895,
"learning_rate": 7.884996695307337e-07,
"loss": 0.917,
"step": 735
},
{
"epoch": 0.23717948717948717,
"grad_norm": 6.03367805480957,
"learning_rate": 7.868473231989424e-07,
"loss": 0.7771,
"step": 740
},
{
"epoch": 0.2387820512820513,
"grad_norm": 4.500458717346191,
"learning_rate": 7.851949768671513e-07,
"loss": 0.7903,
"step": 745
},
{
"epoch": 0.2403846153846154,
"grad_norm": 3.947294235229492,
"learning_rate": 7.835426305353601e-07,
"loss": 0.795,
"step": 750
},
{
"epoch": 0.2419871794871795,
"grad_norm": 7.3017683029174805,
"learning_rate": 7.81890284203569e-07,
"loss": 0.8138,
"step": 755
},
{
"epoch": 0.24358974358974358,
"grad_norm": 3.787949562072754,
"learning_rate": 7.802379378717779e-07,
"loss": 0.6665,
"step": 760
},
{
"epoch": 0.24519230769230768,
"grad_norm": 5.326612949371338,
"learning_rate": 7.785855915399867e-07,
"loss": 0.9742,
"step": 765
},
{
"epoch": 0.2467948717948718,
"grad_norm": 6.92157506942749,
"learning_rate": 7.769332452081955e-07,
"loss": 0.841,
"step": 770
},
{
"epoch": 0.2483974358974359,
"grad_norm": 4.417288780212402,
"learning_rate": 7.752808988764044e-07,
"loss": 0.9047,
"step": 775
},
{
"epoch": 0.25,
"grad_norm": 3.6038155555725098,
"learning_rate": 7.736285525446133e-07,
"loss": 0.7922,
"step": 780
},
{
"epoch": 0.2516025641025641,
"grad_norm": 4.835304260253906,
"learning_rate": 7.719762062128222e-07,
"loss": 0.8349,
"step": 785
},
{
"epoch": 0.2532051282051282,
"grad_norm": 3.1939454078674316,
"learning_rate": 7.703238598810309e-07,
"loss": 0.7257,
"step": 790
},
{
"epoch": 0.2548076923076923,
"grad_norm": 8.088797569274902,
"learning_rate": 7.686715135492398e-07,
"loss": 0.8311,
"step": 795
},
{
"epoch": 0.2564102564102564,
"grad_norm": 7.198094367980957,
"learning_rate": 7.670191672174487e-07,
"loss": 0.8427,
"step": 800
},
{
"epoch": 0.25801282051282054,
"grad_norm": 5.080805778503418,
"learning_rate": 7.653668208856576e-07,
"loss": 0.7725,
"step": 805
},
{
"epoch": 0.25961538461538464,
"grad_norm": 3.3601558208465576,
"learning_rate": 7.637144745538665e-07,
"loss": 0.859,
"step": 810
},
{
"epoch": 0.26121794871794873,
"grad_norm": 6.839197158813477,
"learning_rate": 7.620621282220752e-07,
"loss": 0.8956,
"step": 815
},
{
"epoch": 0.26282051282051283,
"grad_norm": 4.368642807006836,
"learning_rate": 7.604097818902841e-07,
"loss": 0.9344,
"step": 820
},
{
"epoch": 0.2644230769230769,
"grad_norm": 4.079487323760986,
"learning_rate": 7.58757435558493e-07,
"loss": 0.7743,
"step": 825
},
{
"epoch": 0.266025641025641,
"grad_norm": 7.400752544403076,
"learning_rate": 7.571050892267019e-07,
"loss": 0.8653,
"step": 830
},
{
"epoch": 0.2676282051282051,
"grad_norm": 6.021170616149902,
"learning_rate": 7.554527428949107e-07,
"loss": 0.929,
"step": 835
},
{
"epoch": 0.2692307692307692,
"grad_norm": 7.803846836090088,
"learning_rate": 7.538003965631195e-07,
"loss": 0.7471,
"step": 840
},
{
"epoch": 0.2708333333333333,
"grad_norm": 11.89211654663086,
"learning_rate": 7.521480502313284e-07,
"loss": 0.7173,
"step": 845
},
{
"epoch": 0.2724358974358974,
"grad_norm": 9.066969871520996,
"learning_rate": 7.504957038995373e-07,
"loss": 0.9117,
"step": 850
},
{
"epoch": 0.27403846153846156,
"grad_norm": 5.939947128295898,
"learning_rate": 7.488433575677461e-07,
"loss": 0.7001,
"step": 855
},
{
"epoch": 0.27564102564102566,
"grad_norm": 4.300017356872559,
"learning_rate": 7.47191011235955e-07,
"loss": 0.9133,
"step": 860
},
{
"epoch": 0.27724358974358976,
"grad_norm": 3.9818003177642822,
"learning_rate": 7.455386649041638e-07,
"loss": 0.8436,
"step": 865
},
{
"epoch": 0.27884615384615385,
"grad_norm": 6.319674968719482,
"learning_rate": 7.438863185723727e-07,
"loss": 0.8385,
"step": 870
},
{
"epoch": 0.28044871794871795,
"grad_norm": 7.230429172515869,
"learning_rate": 7.422339722405816e-07,
"loss": 0.8215,
"step": 875
},
{
"epoch": 0.28205128205128205,
"grad_norm": 3.5045459270477295,
"learning_rate": 7.405816259087904e-07,
"loss": 0.7351,
"step": 880
},
{
"epoch": 0.28365384615384615,
"grad_norm": 5.423972129821777,
"learning_rate": 7.389292795769993e-07,
"loss": 0.7768,
"step": 885
},
{
"epoch": 0.28525641025641024,
"grad_norm": 9.424778938293457,
"learning_rate": 7.372769332452081e-07,
"loss": 0.903,
"step": 890
},
{
"epoch": 0.28685897435897434,
"grad_norm": 4.601898670196533,
"learning_rate": 7.35624586913417e-07,
"loss": 0.8104,
"step": 895
},
{
"epoch": 0.28846153846153844,
"grad_norm": 5.262858867645264,
"learning_rate": 7.339722405816258e-07,
"loss": 0.8147,
"step": 900
},
{
"epoch": 0.2900641025641026,
"grad_norm": 4.327410697937012,
"learning_rate": 7.323198942498347e-07,
"loss": 0.7795,
"step": 905
},
{
"epoch": 0.2916666666666667,
"grad_norm": 5.896692752838135,
"learning_rate": 7.306675479180436e-07,
"loss": 0.7502,
"step": 910
},
{
"epoch": 0.2932692307692308,
"grad_norm": 4.993595600128174,
"learning_rate": 7.290152015862524e-07,
"loss": 0.9855,
"step": 915
},
{
"epoch": 0.2948717948717949,
"grad_norm": 7.250411510467529,
"learning_rate": 7.273628552544612e-07,
"loss": 0.6526,
"step": 920
},
{
"epoch": 0.296474358974359,
"grad_norm": 5.891010761260986,
"learning_rate": 7.257105089226701e-07,
"loss": 0.8565,
"step": 925
},
{
"epoch": 0.2980769230769231,
"grad_norm": 3.8717401027679443,
"learning_rate": 7.24058162590879e-07,
"loss": 0.9124,
"step": 930
},
{
"epoch": 0.29967948717948717,
"grad_norm": 5.1769537925720215,
"learning_rate": 7.224058162590879e-07,
"loss": 0.6785,
"step": 935
},
{
"epoch": 0.30128205128205127,
"grad_norm": 4.895565986633301,
"learning_rate": 7.207534699272967e-07,
"loss": 0.763,
"step": 940
},
{
"epoch": 0.30288461538461536,
"grad_norm": 7.584598541259766,
"learning_rate": 7.191011235955055e-07,
"loss": 0.8159,
"step": 945
},
{
"epoch": 0.30448717948717946,
"grad_norm": 2.980520009994507,
"learning_rate": 7.174487772637144e-07,
"loss": 0.8258,
"step": 950
},
{
"epoch": 0.3060897435897436,
"grad_norm": 5.3033528327941895,
"learning_rate": 7.157964309319233e-07,
"loss": 0.9091,
"step": 955
},
{
"epoch": 0.3076923076923077,
"grad_norm": 5.652465343475342,
"learning_rate": 7.141440846001322e-07,
"loss": 0.7601,
"step": 960
},
{
"epoch": 0.3092948717948718,
"grad_norm": 2.7802562713623047,
"learning_rate": 7.124917382683409e-07,
"loss": 0.6819,
"step": 965
},
{
"epoch": 0.3108974358974359,
"grad_norm": 10.060710906982422,
"learning_rate": 7.108393919365498e-07,
"loss": 0.9308,
"step": 970
},
{
"epoch": 0.3125,
"grad_norm": 8.689105033874512,
"learning_rate": 7.091870456047587e-07,
"loss": 0.7976,
"step": 975
},
{
"epoch": 0.3141025641025641,
"grad_norm": 7.55824613571167,
"learning_rate": 7.075346992729676e-07,
"loss": 0.7857,
"step": 980
},
{
"epoch": 0.3157051282051282,
"grad_norm": 4.5640034675598145,
"learning_rate": 7.058823529411765e-07,
"loss": 0.8621,
"step": 985
},
{
"epoch": 0.3173076923076923,
"grad_norm": 7.791897773742676,
"learning_rate": 7.042300066093852e-07,
"loss": 0.8943,
"step": 990
},
{
"epoch": 0.3189102564102564,
"grad_norm": 4.598413944244385,
"learning_rate": 7.025776602775941e-07,
"loss": 0.8254,
"step": 995
},
{
"epoch": 0.32051282051282054,
"grad_norm": 6.27009391784668,
"learning_rate": 7.00925313945803e-07,
"loss": 0.8624,
"step": 1000
},
{
"epoch": 0.32211538461538464,
"grad_norm": 13.144405364990234,
"learning_rate": 6.992729676140119e-07,
"loss": 0.9121,
"step": 1005
},
{
"epoch": 0.32371794871794873,
"grad_norm": 3.142514944076538,
"learning_rate": 6.976206212822207e-07,
"loss": 0.8858,
"step": 1010
},
{
"epoch": 0.32532051282051283,
"grad_norm": 3.696758270263672,
"learning_rate": 6.959682749504295e-07,
"loss": 0.86,
"step": 1015
},
{
"epoch": 0.3269230769230769,
"grad_norm": 11.541287422180176,
"learning_rate": 6.943159286186384e-07,
"loss": 0.6851,
"step": 1020
},
{
"epoch": 0.328525641025641,
"grad_norm": 8.48985481262207,
"learning_rate": 6.926635822868473e-07,
"loss": 0.793,
"step": 1025
},
{
"epoch": 0.3301282051282051,
"grad_norm": 3.3774638175964355,
"learning_rate": 6.910112359550561e-07,
"loss": 0.8819,
"step": 1030
},
{
"epoch": 0.3317307692307692,
"grad_norm": 5.883586406707764,
"learning_rate": 6.89358889623265e-07,
"loss": 0.7703,
"step": 1035
},
{
"epoch": 0.3333333333333333,
"grad_norm": 4.836696624755859,
"learning_rate": 6.877065432914738e-07,
"loss": 0.7873,
"step": 1040
},
{
"epoch": 0.3349358974358974,
"grad_norm": 4.359090805053711,
"learning_rate": 6.860541969596827e-07,
"loss": 0.7078,
"step": 1045
},
{
"epoch": 0.33653846153846156,
"grad_norm": 4.49058723449707,
"learning_rate": 6.844018506278915e-07,
"loss": 0.8212,
"step": 1050
},
{
"epoch": 0.33814102564102566,
"grad_norm": 4.579678535461426,
"learning_rate": 6.827495042961004e-07,
"loss": 0.7983,
"step": 1055
},
{
"epoch": 0.33974358974358976,
"grad_norm": 9.115843772888184,
"learning_rate": 6.810971579643093e-07,
"loss": 0.6587,
"step": 1060
},
{
"epoch": 0.34134615384615385,
"grad_norm": 5.484290599822998,
"learning_rate": 6.794448116325181e-07,
"loss": 0.7931,
"step": 1065
},
{
"epoch": 0.34294871794871795,
"grad_norm": 8.550032615661621,
"learning_rate": 6.77792465300727e-07,
"loss": 0.7788,
"step": 1070
},
{
"epoch": 0.34455128205128205,
"grad_norm": 3.559866428375244,
"learning_rate": 6.761401189689358e-07,
"loss": 0.8319,
"step": 1075
},
{
"epoch": 0.34615384615384615,
"grad_norm": 4.798201560974121,
"learning_rate": 6.744877726371447e-07,
"loss": 0.8986,
"step": 1080
},
{
"epoch": 0.34775641025641024,
"grad_norm": 5.144353866577148,
"learning_rate": 6.728354263053536e-07,
"loss": 0.6799,
"step": 1085
},
{
"epoch": 0.34935897435897434,
"grad_norm": 8.058225631713867,
"learning_rate": 6.711830799735624e-07,
"loss": 0.8101,
"step": 1090
},
{
"epoch": 0.35096153846153844,
"grad_norm": 5.630926132202148,
"learning_rate": 6.695307336417712e-07,
"loss": 0.8777,
"step": 1095
},
{
"epoch": 0.3525641025641026,
"grad_norm": 6.345671653747559,
"learning_rate": 6.678783873099801e-07,
"loss": 0.6524,
"step": 1100
},
{
"epoch": 0.3541666666666667,
"grad_norm": 11.713841438293457,
"learning_rate": 6.66226040978189e-07,
"loss": 0.8206,
"step": 1105
},
{
"epoch": 0.3557692307692308,
"grad_norm": 4.345118999481201,
"learning_rate": 6.645736946463979e-07,
"loss": 0.8694,
"step": 1110
},
{
"epoch": 0.3573717948717949,
"grad_norm": 6.495255947113037,
"learning_rate": 6.629213483146066e-07,
"loss": 0.8721,
"step": 1115
},
{
"epoch": 0.358974358974359,
"grad_norm": 4.048442840576172,
"learning_rate": 6.612690019828155e-07,
"loss": 0.8307,
"step": 1120
},
{
"epoch": 0.3605769230769231,
"grad_norm": 5.020994186401367,
"learning_rate": 6.596166556510244e-07,
"loss": 0.7022,
"step": 1125
},
{
"epoch": 0.36217948717948717,
"grad_norm": 4.994934558868408,
"learning_rate": 6.579643093192333e-07,
"loss": 0.8302,
"step": 1130
},
{
"epoch": 0.36378205128205127,
"grad_norm": 3.8185691833496094,
"learning_rate": 6.563119629874422e-07,
"loss": 0.702,
"step": 1135
},
{
"epoch": 0.36538461538461536,
"grad_norm": 5.440126895904541,
"learning_rate": 6.546596166556509e-07,
"loss": 0.8589,
"step": 1140
},
{
"epoch": 0.36698717948717946,
"grad_norm": 4.935449600219727,
"learning_rate": 6.530072703238598e-07,
"loss": 0.6958,
"step": 1145
},
{
"epoch": 0.3685897435897436,
"grad_norm": 3.0225484371185303,
"learning_rate": 6.513549239920687e-07,
"loss": 0.7535,
"step": 1150
},
{
"epoch": 0.3701923076923077,
"grad_norm": 4.705560684204102,
"learning_rate": 6.497025776602776e-07,
"loss": 0.8748,
"step": 1155
},
{
"epoch": 0.3717948717948718,
"grad_norm": 7.766085147857666,
"learning_rate": 6.480502313284864e-07,
"loss": 0.7895,
"step": 1160
},
{
"epoch": 0.3733974358974359,
"grad_norm": 7.818696975708008,
"learning_rate": 6.463978849966952e-07,
"loss": 0.8482,
"step": 1165
},
{
"epoch": 0.375,
"grad_norm": 4.289005279541016,
"learning_rate": 6.447455386649041e-07,
"loss": 0.8875,
"step": 1170
},
{
"epoch": 0.3766025641025641,
"grad_norm": 4.8739752769470215,
"learning_rate": 6.43093192333113e-07,
"loss": 0.8013,
"step": 1175
},
{
"epoch": 0.3782051282051282,
"grad_norm": 3.6027133464813232,
"learning_rate": 6.414408460013219e-07,
"loss": 0.892,
"step": 1180
},
{
"epoch": 0.3798076923076923,
"grad_norm": 4.740626335144043,
"learning_rate": 6.397884996695307e-07,
"loss": 0.7148,
"step": 1185
},
{
"epoch": 0.3814102564102564,
"grad_norm": 3.2007155418395996,
"learning_rate": 6.381361533377395e-07,
"loss": 0.7979,
"step": 1190
},
{
"epoch": 0.38301282051282054,
"grad_norm": 4.596534729003906,
"learning_rate": 6.364838070059484e-07,
"loss": 0.8757,
"step": 1195
},
{
"epoch": 0.38461538461538464,
"grad_norm": 4.348639488220215,
"learning_rate": 6.348314606741573e-07,
"loss": 0.7475,
"step": 1200
},
{
"epoch": 0.38621794871794873,
"grad_norm": 4.388121128082275,
"learning_rate": 6.331791143423661e-07,
"loss": 0.7703,
"step": 1205
},
{
"epoch": 0.38782051282051283,
"grad_norm": 3.776677370071411,
"learning_rate": 6.31526768010575e-07,
"loss": 0.7684,
"step": 1210
},
{
"epoch": 0.3894230769230769,
"grad_norm": 4.856482028961182,
"learning_rate": 6.298744216787838e-07,
"loss": 0.8132,
"step": 1215
},
{
"epoch": 0.391025641025641,
"grad_norm": 8.453356742858887,
"learning_rate": 6.282220753469927e-07,
"loss": 0.8172,
"step": 1220
},
{
"epoch": 0.3926282051282051,
"grad_norm": 9.322402954101562,
"learning_rate": 6.265697290152015e-07,
"loss": 0.7704,
"step": 1225
},
{
"epoch": 0.3942307692307692,
"grad_norm": 4.034356594085693,
"learning_rate": 6.249173826834104e-07,
"loss": 0.8072,
"step": 1230
},
{
"epoch": 0.3958333333333333,
"grad_norm": 6.027692794799805,
"learning_rate": 6.232650363516193e-07,
"loss": 0.6584,
"step": 1235
},
{
"epoch": 0.3974358974358974,
"grad_norm": 3.487473487854004,
"learning_rate": 6.216126900198281e-07,
"loss": 0.8855,
"step": 1240
},
{
"epoch": 0.39903846153846156,
"grad_norm": 3.8283722400665283,
"learning_rate": 6.19960343688037e-07,
"loss": 0.8043,
"step": 1245
},
{
"epoch": 0.40064102564102566,
"grad_norm": 4.080909252166748,
"learning_rate": 6.183079973562458e-07,
"loss": 0.7189,
"step": 1250
},
{
"epoch": 0.40224358974358976,
"grad_norm": 10.283230781555176,
"learning_rate": 6.166556510244547e-07,
"loss": 0.6539,
"step": 1255
},
{
"epoch": 0.40384615384615385,
"grad_norm": 3.7215747833251953,
"learning_rate": 6.150033046926636e-07,
"loss": 0.8382,
"step": 1260
},
{
"epoch": 0.40544871794871795,
"grad_norm": 6.174632549285889,
"learning_rate": 6.133509583608724e-07,
"loss": 0.6854,
"step": 1265
},
{
"epoch": 0.40705128205128205,
"grad_norm": 5.546038627624512,
"learning_rate": 6.116986120290812e-07,
"loss": 0.633,
"step": 1270
},
{
"epoch": 0.40865384615384615,
"grad_norm": 4.6828813552856445,
"learning_rate": 6.100462656972901e-07,
"loss": 0.864,
"step": 1275
},
{
"epoch": 0.41025641025641024,
"grad_norm": 5.444061279296875,
"learning_rate": 6.08393919365499e-07,
"loss": 0.8923,
"step": 1280
},
{
"epoch": 0.41185897435897434,
"grad_norm": 9.895957946777344,
"learning_rate": 6.067415730337079e-07,
"loss": 0.91,
"step": 1285
},
{
"epoch": 0.41346153846153844,
"grad_norm": 3.8047962188720703,
"learning_rate": 6.050892267019166e-07,
"loss": 0.82,
"step": 1290
},
{
"epoch": 0.4150641025641026,
"grad_norm": 5.19016695022583,
"learning_rate": 6.034368803701255e-07,
"loss": 0.8092,
"step": 1295
},
{
"epoch": 0.4166666666666667,
"grad_norm": 3.637864112854004,
"learning_rate": 6.017845340383344e-07,
"loss": 0.8826,
"step": 1300
},
{
"epoch": 0.4182692307692308,
"grad_norm": 2.6663596630096436,
"learning_rate": 6.001321877065433e-07,
"loss": 0.6852,
"step": 1305
},
{
"epoch": 0.4198717948717949,
"grad_norm": 3.58880615234375,
"learning_rate": 5.984798413747522e-07,
"loss": 0.8336,
"step": 1310
},
{
"epoch": 0.421474358974359,
"grad_norm": 2.4447970390319824,
"learning_rate": 5.968274950429609e-07,
"loss": 0.7603,
"step": 1315
},
{
"epoch": 0.4230769230769231,
"grad_norm": 4.405289649963379,
"learning_rate": 5.951751487111698e-07,
"loss": 0.7799,
"step": 1320
},
{
"epoch": 0.42467948717948717,
"grad_norm": 4.415432929992676,
"learning_rate": 5.935228023793787e-07,
"loss": 0.7517,
"step": 1325
},
{
"epoch": 0.42628205128205127,
"grad_norm": 2.538200616836548,
"learning_rate": 5.918704560475876e-07,
"loss": 0.8617,
"step": 1330
},
{
"epoch": 0.42788461538461536,
"grad_norm": 5.295281887054443,
"learning_rate": 5.902181097157964e-07,
"loss": 0.8335,
"step": 1335
},
{
"epoch": 0.42948717948717946,
"grad_norm": 10.389196395874023,
"learning_rate": 5.885657633840052e-07,
"loss": 0.8529,
"step": 1340
},
{
"epoch": 0.4310897435897436,
"grad_norm": 4.9335713386535645,
"learning_rate": 5.869134170522141e-07,
"loss": 0.8276,
"step": 1345
},
{
"epoch": 0.4326923076923077,
"grad_norm": 3.577237367630005,
"learning_rate": 5.85261070720423e-07,
"loss": 0.7883,
"step": 1350
},
{
"epoch": 0.4342948717948718,
"grad_norm": 3.355888605117798,
"learning_rate": 5.836087243886318e-07,
"loss": 0.7451,
"step": 1355
},
{
"epoch": 0.4358974358974359,
"grad_norm": 4.57732629776001,
"learning_rate": 5.819563780568407e-07,
"loss": 0.8938,
"step": 1360
},
{
"epoch": 0.4375,
"grad_norm": 5.530088901519775,
"learning_rate": 5.803040317250495e-07,
"loss": 0.7397,
"step": 1365
},
{
"epoch": 0.4391025641025641,
"grad_norm": 3.3376779556274414,
"learning_rate": 5.786516853932584e-07,
"loss": 0.7578,
"step": 1370
},
{
"epoch": 0.4407051282051282,
"grad_norm": 3.726835012435913,
"learning_rate": 5.769993390614673e-07,
"loss": 0.787,
"step": 1375
},
{
"epoch": 0.4423076923076923,
"grad_norm": 4.771599769592285,
"learning_rate": 5.753469927296761e-07,
"loss": 0.7629,
"step": 1380
},
{
"epoch": 0.4439102564102564,
"grad_norm": 4.213784217834473,
"learning_rate": 5.73694646397885e-07,
"loss": 0.7171,
"step": 1385
},
{
"epoch": 0.44551282051282054,
"grad_norm": 5.608395576477051,
"learning_rate": 5.720423000660938e-07,
"loss": 0.6273,
"step": 1390
},
{
"epoch": 0.44711538461538464,
"grad_norm": 5.424095153808594,
"learning_rate": 5.703899537343027e-07,
"loss": 0.8648,
"step": 1395
},
{
"epoch": 0.44871794871794873,
"grad_norm": 4.200117111206055,
"learning_rate": 5.687376074025115e-07,
"loss": 0.7668,
"step": 1400
},
{
"epoch": 0.45032051282051283,
"grad_norm": 4.810688495635986,
"learning_rate": 5.670852610707204e-07,
"loss": 0.8691,
"step": 1405
},
{
"epoch": 0.4519230769230769,
"grad_norm": 5.285038948059082,
"learning_rate": 5.654329147389293e-07,
"loss": 0.8094,
"step": 1410
},
{
"epoch": 0.453525641025641,
"grad_norm": 2.9930169582366943,
"learning_rate": 5.637805684071381e-07,
"loss": 0.7669,
"step": 1415
},
{
"epoch": 0.4551282051282051,
"grad_norm": 3.244771718978882,
"learning_rate": 5.621282220753469e-07,
"loss": 0.7827,
"step": 1420
},
{
"epoch": 0.4567307692307692,
"grad_norm": 3.3853907585144043,
"learning_rate": 5.604758757435558e-07,
"loss": 0.7306,
"step": 1425
},
{
"epoch": 0.4583333333333333,
"grad_norm": 3.7965517044067383,
"learning_rate": 5.588235294117647e-07,
"loss": 0.6429,
"step": 1430
},
{
"epoch": 0.4599358974358974,
"grad_norm": 4.235316753387451,
"learning_rate": 5.571711830799736e-07,
"loss": 0.8967,
"step": 1435
},
{
"epoch": 0.46153846153846156,
"grad_norm": 6.711025238037109,
"learning_rate": 5.555188367481823e-07,
"loss": 0.8444,
"step": 1440
},
{
"epoch": 0.46314102564102566,
"grad_norm": 4.23643684387207,
"learning_rate": 5.538664904163912e-07,
"loss": 0.7664,
"step": 1445
},
{
"epoch": 0.46474358974358976,
"grad_norm": 4.946862697601318,
"learning_rate": 5.522141440846001e-07,
"loss": 0.7987,
"step": 1450
},
{
"epoch": 0.46634615384615385,
"grad_norm": 5.770292282104492,
"learning_rate": 5.50561797752809e-07,
"loss": 0.7844,
"step": 1455
},
{
"epoch": 0.46794871794871795,
"grad_norm": 11.45702838897705,
"learning_rate": 5.489094514210179e-07,
"loss": 0.8573,
"step": 1460
},
{
"epoch": 0.46955128205128205,
"grad_norm": 4.409577369689941,
"learning_rate": 5.472571050892266e-07,
"loss": 0.8122,
"step": 1465
},
{
"epoch": 0.47115384615384615,
"grad_norm": 8.126829147338867,
"learning_rate": 5.456047587574355e-07,
"loss": 0.7616,
"step": 1470
},
{
"epoch": 0.47275641025641024,
"grad_norm": 10.366379737854004,
"learning_rate": 5.439524124256444e-07,
"loss": 0.6492,
"step": 1475
},
{
"epoch": 0.47435897435897434,
"grad_norm": 5.814599514007568,
"learning_rate": 5.423000660938533e-07,
"loss": 0.8335,
"step": 1480
},
{
"epoch": 0.47596153846153844,
"grad_norm": 3.6713919639587402,
"learning_rate": 5.406477197620621e-07,
"loss": 0.7175,
"step": 1485
},
{
"epoch": 0.4775641025641026,
"grad_norm": 4.473592758178711,
"learning_rate": 5.389953734302709e-07,
"loss": 0.7772,
"step": 1490
},
{
"epoch": 0.4791666666666667,
"grad_norm": 5.191585540771484,
"learning_rate": 5.373430270984798e-07,
"loss": 0.8085,
"step": 1495
},
{
"epoch": 0.4807692307692308,
"grad_norm": 4.686864376068115,
"learning_rate": 5.356906807666887e-07,
"loss": 0.7285,
"step": 1500
},
{
"epoch": 0.4823717948717949,
"grad_norm": 6.236685276031494,
"learning_rate": 5.340383344348976e-07,
"loss": 0.8491,
"step": 1505
},
{
"epoch": 0.483974358974359,
"grad_norm": 5.375248908996582,
"learning_rate": 5.323859881031064e-07,
"loss": 0.8212,
"step": 1510
},
{
"epoch": 0.4855769230769231,
"grad_norm": 3.439789295196533,
"learning_rate": 5.307336417713153e-07,
"loss": 0.7619,
"step": 1515
},
{
"epoch": 0.48717948717948717,
"grad_norm": 4.730751991271973,
"learning_rate": 5.290812954395241e-07,
"loss": 0.8577,
"step": 1520
},
{
"epoch": 0.48878205128205127,
"grad_norm": 3.463454484939575,
"learning_rate": 5.27428949107733e-07,
"loss": 0.8779,
"step": 1525
},
{
"epoch": 0.49038461538461536,
"grad_norm": 3.2680091857910156,
"learning_rate": 5.257766027759418e-07,
"loss": 0.7626,
"step": 1530
},
{
"epoch": 0.49198717948717946,
"grad_norm": 4.192795753479004,
"learning_rate": 5.241242564441507e-07,
"loss": 0.9215,
"step": 1535
},
{
"epoch": 0.4935897435897436,
"grad_norm": 8.984251976013184,
"learning_rate": 5.224719101123596e-07,
"loss": 0.7347,
"step": 1540
},
{
"epoch": 0.4951923076923077,
"grad_norm": 5.889853477478027,
"learning_rate": 5.208195637805684e-07,
"loss": 0.8716,
"step": 1545
},
{
"epoch": 0.4967948717948718,
"grad_norm": 6.937811851501465,
"learning_rate": 5.191672174487772e-07,
"loss": 0.778,
"step": 1550
},
{
"epoch": 0.4983974358974359,
"grad_norm": 5.315396308898926,
"learning_rate": 5.175148711169861e-07,
"loss": 0.8273,
"step": 1555
},
{
"epoch": 0.5,
"grad_norm": 4.183327674865723,
"learning_rate": 5.15862524785195e-07,
"loss": 0.8231,
"step": 1560
},
{
"epoch": 0.5016025641025641,
"grad_norm": 4.254622459411621,
"learning_rate": 5.142101784534039e-07,
"loss": 0.6407,
"step": 1565
},
{
"epoch": 0.5032051282051282,
"grad_norm": 4.547656059265137,
"learning_rate": 5.125578321216127e-07,
"loss": 0.7148,
"step": 1570
},
{
"epoch": 0.5048076923076923,
"grad_norm": 5.993008613586426,
"learning_rate": 5.109054857898215e-07,
"loss": 0.7727,
"step": 1575
},
{
"epoch": 0.5064102564102564,
"grad_norm": 3.685878276824951,
"learning_rate": 5.092531394580304e-07,
"loss": 0.8638,
"step": 1580
},
{
"epoch": 0.5080128205128205,
"grad_norm": 4.194368839263916,
"learning_rate": 5.076007931262393e-07,
"loss": 0.819,
"step": 1585
},
{
"epoch": 0.5096153846153846,
"grad_norm": 3.141991376876831,
"learning_rate": 5.059484467944482e-07,
"loss": 0.7831,
"step": 1590
},
{
"epoch": 0.5112179487179487,
"grad_norm": 5.693704605102539,
"learning_rate": 5.042961004626569e-07,
"loss": 0.761,
"step": 1595
},
{
"epoch": 0.5128205128205128,
"grad_norm": 2.8469674587249756,
"learning_rate": 5.026437541308658e-07,
"loss": 0.7729,
"step": 1600
},
{
"epoch": 0.5144230769230769,
"grad_norm": 7.078847885131836,
"learning_rate": 5.009914077990747e-07,
"loss": 0.7102,
"step": 1605
},
{
"epoch": 0.5160256410256411,
"grad_norm": 4.168100357055664,
"learning_rate": 4.993390614672835e-07,
"loss": 0.6727,
"step": 1610
},
{
"epoch": 0.5176282051282052,
"grad_norm": 3.5356192588806152,
"learning_rate": 4.976867151354923e-07,
"loss": 0.8312,
"step": 1615
},
{
"epoch": 0.5192307692307693,
"grad_norm": 3.7321722507476807,
"learning_rate": 4.960343688037012e-07,
"loss": 0.8265,
"step": 1620
},
{
"epoch": 0.5208333333333334,
"grad_norm": 4.614173889160156,
"learning_rate": 4.943820224719101e-07,
"loss": 0.7464,
"step": 1625
},
{
"epoch": 0.5224358974358975,
"grad_norm": 4.419942378997803,
"learning_rate": 4.92729676140119e-07,
"loss": 0.7683,
"step": 1630
},
{
"epoch": 0.5240384615384616,
"grad_norm": 3.572216510772705,
"learning_rate": 4.910773298083277e-07,
"loss": 0.8283,
"step": 1635
},
{
"epoch": 0.5256410256410257,
"grad_norm": 3.31060528755188,
"learning_rate": 4.894249834765366e-07,
"loss": 0.8539,
"step": 1640
},
{
"epoch": 0.5272435897435898,
"grad_norm": 6.509139060974121,
"learning_rate": 4.877726371447455e-07,
"loss": 0.6647,
"step": 1645
},
{
"epoch": 0.5288461538461539,
"grad_norm": 4.024603843688965,
"learning_rate": 4.861202908129544e-07,
"loss": 0.8066,
"step": 1650
},
{
"epoch": 0.530448717948718,
"grad_norm": 3.655712604522705,
"learning_rate": 4.844679444811633e-07,
"loss": 0.7693,
"step": 1655
},
{
"epoch": 0.532051282051282,
"grad_norm": 3.420959949493408,
"learning_rate": 4.82815598149372e-07,
"loss": 0.7867,
"step": 1660
},
{
"epoch": 0.5336538461538461,
"grad_norm": 4.068134307861328,
"learning_rate": 4.811632518175809e-07,
"loss": 0.8251,
"step": 1665
},
{
"epoch": 0.5352564102564102,
"grad_norm": 4.351796627044678,
"learning_rate": 4.795109054857898e-07,
"loss": 0.7103,
"step": 1670
},
{
"epoch": 0.5368589743589743,
"grad_norm": 5.838902950286865,
"learning_rate": 4.778585591539987e-07,
"loss": 0.835,
"step": 1675
},
{
"epoch": 0.5384615384615384,
"grad_norm": 4.233332633972168,
"learning_rate": 4.762062128222075e-07,
"loss": 0.8817,
"step": 1680
},
{
"epoch": 0.5400641025641025,
"grad_norm": 4.291604042053223,
"learning_rate": 4.745538664904163e-07,
"loss": 0.6882,
"step": 1685
},
{
"epoch": 0.5416666666666666,
"grad_norm": 3.780442714691162,
"learning_rate": 4.729015201586252e-07,
"loss": 0.6711,
"step": 1690
},
{
"epoch": 0.5432692307692307,
"grad_norm": 3.0913641452789307,
"learning_rate": 4.712491738268341e-07,
"loss": 0.8828,
"step": 1695
},
{
"epoch": 0.5448717948717948,
"grad_norm": 2.7122883796691895,
"learning_rate": 4.695968274950429e-07,
"loss": 0.6597,
"step": 1700
},
{
"epoch": 0.5464743589743589,
"grad_norm": 3.7509615421295166,
"learning_rate": 4.679444811632518e-07,
"loss": 0.8139,
"step": 1705
},
{
"epoch": 0.5480769230769231,
"grad_norm": 4.429205417633057,
"learning_rate": 4.662921348314606e-07,
"loss": 0.8188,
"step": 1710
},
{
"epoch": 0.5496794871794872,
"grad_norm": 6.677905559539795,
"learning_rate": 4.646397884996695e-07,
"loss": 0.689,
"step": 1715
},
{
"epoch": 0.5512820512820513,
"grad_norm": 10.779183387756348,
"learning_rate": 4.6298744216787836e-07,
"loss": 0.7602,
"step": 1720
},
{
"epoch": 0.5528846153846154,
"grad_norm": 5.142210006713867,
"learning_rate": 4.613350958360872e-07,
"loss": 0.809,
"step": 1725
},
{
"epoch": 0.5544871794871795,
"grad_norm": 4.238507270812988,
"learning_rate": 4.5968274950429606e-07,
"loss": 0.7306,
"step": 1730
},
{
"epoch": 0.5560897435897436,
"grad_norm": 4.379620552062988,
"learning_rate": 4.580304031725049e-07,
"loss": 0.8622,
"step": 1735
},
{
"epoch": 0.5576923076923077,
"grad_norm": 5.106377124786377,
"learning_rate": 4.5637805684071377e-07,
"loss": 0.8234,
"step": 1740
},
{
"epoch": 0.5592948717948718,
"grad_norm": 4.431070327758789,
"learning_rate": 4.5472571050892265e-07,
"loss": 0.9141,
"step": 1745
},
{
"epoch": 0.5608974358974359,
"grad_norm": 3.911802053451538,
"learning_rate": 4.5307336417713147e-07,
"loss": 0.783,
"step": 1750
},
{
"epoch": 0.5625,
"grad_norm": 5.008035182952881,
"learning_rate": 4.5142101784534035e-07,
"loss": 0.8401,
"step": 1755
},
{
"epoch": 0.5641025641025641,
"grad_norm": 8.659884452819824,
"learning_rate": 4.4976867151354923e-07,
"loss": 0.6399,
"step": 1760
},
{
"epoch": 0.5657051282051282,
"grad_norm": 3.6218109130859375,
"learning_rate": 4.4811632518175805e-07,
"loss": 0.8017,
"step": 1765
},
{
"epoch": 0.5673076923076923,
"grad_norm": 8.017809867858887,
"learning_rate": 4.4646397884996693e-07,
"loss": 0.9154,
"step": 1770
},
{
"epoch": 0.5689102564102564,
"grad_norm": 4.872199535369873,
"learning_rate": 4.4481163251817576e-07,
"loss": 0.8139,
"step": 1775
},
{
"epoch": 0.5705128205128205,
"grad_norm": 3.4777655601501465,
"learning_rate": 4.4315928618638463e-07,
"loss": 0.7942,
"step": 1780
},
{
"epoch": 0.5721153846153846,
"grad_norm": 4.423591613769531,
"learning_rate": 4.415069398545935e-07,
"loss": 0.6299,
"step": 1785
},
{
"epoch": 0.5737179487179487,
"grad_norm": 10.330599784851074,
"learning_rate": 4.3985459352280234e-07,
"loss": 0.849,
"step": 1790
},
{
"epoch": 0.5753205128205128,
"grad_norm": 3.7085251808166504,
"learning_rate": 4.382022471910112e-07,
"loss": 0.7756,
"step": 1795
},
{
"epoch": 0.5769230769230769,
"grad_norm": 4.038546562194824,
"learning_rate": 4.3654990085922004e-07,
"loss": 0.9406,
"step": 1800
},
{
"epoch": 0.5785256410256411,
"grad_norm": 5.6599440574646,
"learning_rate": 4.348975545274289e-07,
"loss": 0.6654,
"step": 1805
},
{
"epoch": 0.5801282051282052,
"grad_norm": 5.489417552947998,
"learning_rate": 4.332452081956378e-07,
"loss": 0.8556,
"step": 1810
},
{
"epoch": 0.5817307692307693,
"grad_norm": 7.606975078582764,
"learning_rate": 4.315928618638466e-07,
"loss": 0.8567,
"step": 1815
},
{
"epoch": 0.5833333333333334,
"grad_norm": 6.262397766113281,
"learning_rate": 4.299405155320555e-07,
"loss": 0.8868,
"step": 1820
},
{
"epoch": 0.5849358974358975,
"grad_norm": 8.082782745361328,
"learning_rate": 4.282881692002643e-07,
"loss": 0.7651,
"step": 1825
},
{
"epoch": 0.5865384615384616,
"grad_norm": 7.61177921295166,
"learning_rate": 4.266358228684732e-07,
"loss": 0.8139,
"step": 1830
},
{
"epoch": 0.5881410256410257,
"grad_norm": 3.503220796585083,
"learning_rate": 4.249834765366821e-07,
"loss": 0.6752,
"step": 1835
},
{
"epoch": 0.5897435897435898,
"grad_norm": 6.636229038238525,
"learning_rate": 4.233311302048909e-07,
"loss": 0.8225,
"step": 1840
},
{
"epoch": 0.5913461538461539,
"grad_norm": 45.19087219238281,
"learning_rate": 4.216787838730998e-07,
"loss": 0.8639,
"step": 1845
},
{
"epoch": 0.592948717948718,
"grad_norm": 12.816862106323242,
"learning_rate": 4.200264375413086e-07,
"loss": 0.9769,
"step": 1850
},
{
"epoch": 0.594551282051282,
"grad_norm": 5.171041011810303,
"learning_rate": 4.183740912095175e-07,
"loss": 0.6907,
"step": 1855
},
{
"epoch": 0.5961538461538461,
"grad_norm": 3.1392245292663574,
"learning_rate": 4.1672174487772637e-07,
"loss": 0.7235,
"step": 1860
},
{
"epoch": 0.5977564102564102,
"grad_norm": 3.557652473449707,
"learning_rate": 4.150693985459352e-07,
"loss": 0.7241,
"step": 1865
},
{
"epoch": 0.5993589743589743,
"grad_norm": 3.4919662475585938,
"learning_rate": 4.1341705221414407e-07,
"loss": 0.7947,
"step": 1870
},
{
"epoch": 0.6009615384615384,
"grad_norm": 7.577988624572754,
"learning_rate": 4.117647058823529e-07,
"loss": 0.7991,
"step": 1875
},
{
"epoch": 0.6025641025641025,
"grad_norm": 6.581418514251709,
"learning_rate": 4.1011235955056177e-07,
"loss": 0.7415,
"step": 1880
},
{
"epoch": 0.6041666666666666,
"grad_norm": 5.872368335723877,
"learning_rate": 4.0846001321877065e-07,
"loss": 0.8145,
"step": 1885
},
{
"epoch": 0.6057692307692307,
"grad_norm": 5.491688251495361,
"learning_rate": 4.068076668869795e-07,
"loss": 0.888,
"step": 1890
},
{
"epoch": 0.6073717948717948,
"grad_norm": 6.849071025848389,
"learning_rate": 4.0515532055518835e-07,
"loss": 0.6781,
"step": 1895
},
{
"epoch": 0.6089743589743589,
"grad_norm": 3.5489501953125,
"learning_rate": 4.035029742233972e-07,
"loss": 0.7944,
"step": 1900
},
{
"epoch": 0.6105769230769231,
"grad_norm": 9.167459487915039,
"learning_rate": 4.0185062789160606e-07,
"loss": 0.7331,
"step": 1905
},
{
"epoch": 0.6121794871794872,
"grad_norm": 2.9380276203155518,
"learning_rate": 4.0019828155981494e-07,
"loss": 0.7066,
"step": 1910
},
{
"epoch": 0.6137820512820513,
"grad_norm": 3.069446325302124,
"learning_rate": 3.9854593522802376e-07,
"loss": 0.8597,
"step": 1915
},
{
"epoch": 0.6153846153846154,
"grad_norm": 4.881730079650879,
"learning_rate": 3.9689358889623264e-07,
"loss": 0.8679,
"step": 1920
},
{
"epoch": 0.6169871794871795,
"grad_norm": 7.921117305755615,
"learning_rate": 3.9524124256444146e-07,
"loss": 0.8195,
"step": 1925
},
{
"epoch": 0.6185897435897436,
"grad_norm": 7.495361328125,
"learning_rate": 3.9358889623265034e-07,
"loss": 0.7521,
"step": 1930
},
{
"epoch": 0.6201923076923077,
"grad_norm": 3.289283037185669,
"learning_rate": 3.919365499008592e-07,
"loss": 0.779,
"step": 1935
},
{
"epoch": 0.6217948717948718,
"grad_norm": 4.523643970489502,
"learning_rate": 3.9028420356906805e-07,
"loss": 0.722,
"step": 1940
},
{
"epoch": 0.6233974358974359,
"grad_norm": 4.16140079498291,
"learning_rate": 3.886318572372769e-07,
"loss": 0.7385,
"step": 1945
},
{
"epoch": 0.625,
"grad_norm": 2.602611541748047,
"learning_rate": 3.8697951090548575e-07,
"loss": 0.7818,
"step": 1950
},
{
"epoch": 0.6266025641025641,
"grad_norm": 5.022205352783203,
"learning_rate": 3.8532716457369463e-07,
"loss": 0.7388,
"step": 1955
},
{
"epoch": 0.6282051282051282,
"grad_norm": 4.107226371765137,
"learning_rate": 3.836748182419035e-07,
"loss": 0.8531,
"step": 1960
},
{
"epoch": 0.6298076923076923,
"grad_norm": 3.9306111335754395,
"learning_rate": 3.8202247191011233e-07,
"loss": 0.8112,
"step": 1965
},
{
"epoch": 0.6314102564102564,
"grad_norm": 3.1901676654815674,
"learning_rate": 3.803701255783212e-07,
"loss": 0.7661,
"step": 1970
},
{
"epoch": 0.6330128205128205,
"grad_norm": 5.7795820236206055,
"learning_rate": 3.7871777924653003e-07,
"loss": 0.7822,
"step": 1975
},
{
"epoch": 0.6346153846153846,
"grad_norm": 4.990657806396484,
"learning_rate": 3.770654329147389e-07,
"loss": 0.6767,
"step": 1980
},
{
"epoch": 0.6362179487179487,
"grad_norm": 3.1682956218719482,
"learning_rate": 3.754130865829478e-07,
"loss": 0.7403,
"step": 1985
},
{
"epoch": 0.6378205128205128,
"grad_norm": 8.12835693359375,
"learning_rate": 3.737607402511566e-07,
"loss": 0.8295,
"step": 1990
},
{
"epoch": 0.6394230769230769,
"grad_norm": 6.958061218261719,
"learning_rate": 3.721083939193655e-07,
"loss": 0.7853,
"step": 1995
},
{
"epoch": 0.6410256410256411,
"grad_norm": 9.980351448059082,
"learning_rate": 3.704560475875743e-07,
"loss": 0.8413,
"step": 2000
},
{
"epoch": 0.6426282051282052,
"grad_norm": 5.591805934906006,
"learning_rate": 3.688037012557832e-07,
"loss": 0.8321,
"step": 2005
},
{
"epoch": 0.6442307692307693,
"grad_norm": 4.056339263916016,
"learning_rate": 3.671513549239921e-07,
"loss": 0.7223,
"step": 2010
},
{
"epoch": 0.6458333333333334,
"grad_norm": 4.585841655731201,
"learning_rate": 3.654990085922009e-07,
"loss": 0.8602,
"step": 2015
},
{
"epoch": 0.6474358974358975,
"grad_norm": 14.423575401306152,
"learning_rate": 3.638466622604098e-07,
"loss": 0.8337,
"step": 2020
},
{
"epoch": 0.6490384615384616,
"grad_norm": 17.55698013305664,
"learning_rate": 3.621943159286186e-07,
"loss": 0.8524,
"step": 2025
},
{
"epoch": 0.6506410256410257,
"grad_norm": 8.060038566589355,
"learning_rate": 3.605419695968275e-07,
"loss": 0.7047,
"step": 2030
},
{
"epoch": 0.6522435897435898,
"grad_norm": 3.0732924938201904,
"learning_rate": 3.5888962326503636e-07,
"loss": 0.8203,
"step": 2035
},
{
"epoch": 0.6538461538461539,
"grad_norm": 6.2294020652771,
"learning_rate": 3.572372769332452e-07,
"loss": 0.8524,
"step": 2040
},
{
"epoch": 0.655448717948718,
"grad_norm": 5.603904724121094,
"learning_rate": 3.5558493060145406e-07,
"loss": 0.6366,
"step": 2045
},
{
"epoch": 0.657051282051282,
"grad_norm": 3.684701442718506,
"learning_rate": 3.539325842696629e-07,
"loss": 0.7765,
"step": 2050
},
{
"epoch": 0.6586538461538461,
"grad_norm": 6.113523483276367,
"learning_rate": 3.5228023793787177e-07,
"loss": 0.6858,
"step": 2055
},
{
"epoch": 0.6602564102564102,
"grad_norm": 5.9543280601501465,
"learning_rate": 3.5062789160608064e-07,
"loss": 0.8639,
"step": 2060
},
{
"epoch": 0.6618589743589743,
"grad_norm": 2.5266408920288086,
"learning_rate": 3.4897554527428947e-07,
"loss": 0.9136,
"step": 2065
},
{
"epoch": 0.6634615384615384,
"grad_norm": 4.412357807159424,
"learning_rate": 3.4732319894249835e-07,
"loss": 0.8078,
"step": 2070
},
{
"epoch": 0.6650641025641025,
"grad_norm": 3.709512948989868,
"learning_rate": 3.4567085261070717e-07,
"loss": 0.8443,
"step": 2075
},
{
"epoch": 0.6666666666666666,
"grad_norm": 3.4022634029388428,
"learning_rate": 3.4401850627891605e-07,
"loss": 0.7546,
"step": 2080
},
{
"epoch": 0.6682692307692307,
"grad_norm": 5.27069091796875,
"learning_rate": 3.4236615994712493e-07,
"loss": 0.8228,
"step": 2085
},
{
"epoch": 0.6698717948717948,
"grad_norm": 3.136031150817871,
"learning_rate": 3.4071381361533375e-07,
"loss": 0.9051,
"step": 2090
},
{
"epoch": 0.6714743589743589,
"grad_norm": 4.431833744049072,
"learning_rate": 3.3906146728354263e-07,
"loss": 0.8802,
"step": 2095
},
{
"epoch": 0.6730769230769231,
"grad_norm": 4.416879653930664,
"learning_rate": 3.3740912095175146e-07,
"loss": 0.7876,
"step": 2100
},
{
"epoch": 0.6746794871794872,
"grad_norm": 3.685245990753174,
"learning_rate": 3.3575677461996034e-07,
"loss": 0.744,
"step": 2105
},
{
"epoch": 0.6762820512820513,
"grad_norm": 4.721916198730469,
"learning_rate": 3.341044282881692e-07,
"loss": 0.7867,
"step": 2110
},
{
"epoch": 0.6778846153846154,
"grad_norm": 5.276561260223389,
"learning_rate": 3.3245208195637804e-07,
"loss": 0.8425,
"step": 2115
},
{
"epoch": 0.6794871794871795,
"grad_norm": 6.171300888061523,
"learning_rate": 3.307997356245869e-07,
"loss": 0.742,
"step": 2120
},
{
"epoch": 0.6810897435897436,
"grad_norm": 6.1108198165893555,
"learning_rate": 3.2914738929279574e-07,
"loss": 0.7814,
"step": 2125
},
{
"epoch": 0.6826923076923077,
"grad_norm": 5.54103946685791,
"learning_rate": 3.274950429610046e-07,
"loss": 0.8899,
"step": 2130
},
{
"epoch": 0.6842948717948718,
"grad_norm": 5.242672443389893,
"learning_rate": 3.258426966292135e-07,
"loss": 0.8232,
"step": 2135
},
{
"epoch": 0.6858974358974359,
"grad_norm": 11.092650413513184,
"learning_rate": 3.241903502974223e-07,
"loss": 0.7744,
"step": 2140
},
{
"epoch": 0.6875,
"grad_norm": 3.056320905685425,
"learning_rate": 3.225380039656312e-07,
"loss": 0.6096,
"step": 2145
},
{
"epoch": 0.6891025641025641,
"grad_norm": 4.238087177276611,
"learning_rate": 3.2088565763384003e-07,
"loss": 0.7236,
"step": 2150
},
{
"epoch": 0.6907051282051282,
"grad_norm": 3.4259557723999023,
"learning_rate": 3.192333113020489e-07,
"loss": 0.8002,
"step": 2155
},
{
"epoch": 0.6923076923076923,
"grad_norm": 3.611785411834717,
"learning_rate": 3.175809649702578e-07,
"loss": 0.7647,
"step": 2160
},
{
"epoch": 0.6939102564102564,
"grad_norm": 8.97962760925293,
"learning_rate": 3.159286186384666e-07,
"loss": 0.9061,
"step": 2165
},
{
"epoch": 0.6955128205128205,
"grad_norm": 15.352239608764648,
"learning_rate": 3.142762723066755e-07,
"loss": 0.7211,
"step": 2170
},
{
"epoch": 0.6971153846153846,
"grad_norm": 7.31290340423584,
"learning_rate": 3.126239259748843e-07,
"loss": 0.618,
"step": 2175
},
{
"epoch": 0.6987179487179487,
"grad_norm": 4.665528297424316,
"learning_rate": 3.109715796430932e-07,
"loss": 0.8203,
"step": 2180
},
{
"epoch": 0.7003205128205128,
"grad_norm": 17.2761287689209,
"learning_rate": 3.0931923331130207e-07,
"loss": 0.7578,
"step": 2185
},
{
"epoch": 0.7019230769230769,
"grad_norm": 9.712289810180664,
"learning_rate": 3.076668869795109e-07,
"loss": 0.7531,
"step": 2190
},
{
"epoch": 0.7035256410256411,
"grad_norm": 4.434769630432129,
"learning_rate": 3.0601454064771977e-07,
"loss": 0.7863,
"step": 2195
},
{
"epoch": 0.7051282051282052,
"grad_norm": 3.8715121746063232,
"learning_rate": 3.043621943159286e-07,
"loss": 0.7247,
"step": 2200
},
{
"epoch": 0.7067307692307693,
"grad_norm": 3.459235906600952,
"learning_rate": 3.027098479841375e-07,
"loss": 0.7149,
"step": 2205
},
{
"epoch": 0.7083333333333334,
"grad_norm": 5.98268461227417,
"learning_rate": 3.0105750165234635e-07,
"loss": 0.8021,
"step": 2210
},
{
"epoch": 0.7099358974358975,
"grad_norm": 6.481480121612549,
"learning_rate": 2.994051553205552e-07,
"loss": 0.8124,
"step": 2215
},
{
"epoch": 0.7115384615384616,
"grad_norm": 5.063220500946045,
"learning_rate": 2.9775280898876406e-07,
"loss": 0.6746,
"step": 2220
},
{
"epoch": 0.7131410256410257,
"grad_norm": 5.813882827758789,
"learning_rate": 2.9610046265697293e-07,
"loss": 0.8872,
"step": 2225
},
{
"epoch": 0.7147435897435898,
"grad_norm": 7.330856800079346,
"learning_rate": 2.9444811632518176e-07,
"loss": 0.8496,
"step": 2230
},
{
"epoch": 0.7163461538461539,
"grad_norm": 4.500095367431641,
"learning_rate": 2.9279576999339064e-07,
"loss": 0.8594,
"step": 2235
},
{
"epoch": 0.717948717948718,
"grad_norm": 7.6699137687683105,
"learning_rate": 2.9114342366159946e-07,
"loss": 0.7005,
"step": 2240
},
{
"epoch": 0.719551282051282,
"grad_norm": 3.332604169845581,
"learning_rate": 2.8949107732980834e-07,
"loss": 0.8011,
"step": 2245
},
{
"epoch": 0.7211538461538461,
"grad_norm": 7.084466457366943,
"learning_rate": 2.878387309980172e-07,
"loss": 0.7555,
"step": 2250
},
{
"epoch": 0.7227564102564102,
"grad_norm": 2.606405258178711,
"learning_rate": 2.8618638466622604e-07,
"loss": 0.8418,
"step": 2255
},
{
"epoch": 0.7243589743589743,
"grad_norm": 5.162625312805176,
"learning_rate": 2.845340383344349e-07,
"loss": 0.7081,
"step": 2260
},
{
"epoch": 0.7259615384615384,
"grad_norm": 6.1882758140563965,
"learning_rate": 2.8288169200264375e-07,
"loss": 0.7999,
"step": 2265
},
{
"epoch": 0.7275641025641025,
"grad_norm": 3.4105043411254883,
"learning_rate": 2.812293456708526e-07,
"loss": 0.7354,
"step": 2270
},
{
"epoch": 0.7291666666666666,
"grad_norm": 5.230040073394775,
"learning_rate": 2.795769993390615e-07,
"loss": 0.7022,
"step": 2275
},
{
"epoch": 0.7307692307692307,
"grad_norm": 7.303884506225586,
"learning_rate": 2.7792465300727033e-07,
"loss": 0.8529,
"step": 2280
},
{
"epoch": 0.7323717948717948,
"grad_norm": 4.611577987670898,
"learning_rate": 2.762723066754792e-07,
"loss": 0.8055,
"step": 2285
},
{
"epoch": 0.7339743589743589,
"grad_norm": 3.8788657188415527,
"learning_rate": 2.7461996034368803e-07,
"loss": 0.7476,
"step": 2290
},
{
"epoch": 0.7355769230769231,
"grad_norm": 7.592946529388428,
"learning_rate": 2.729676140118969e-07,
"loss": 0.8468,
"step": 2295
},
{
"epoch": 0.7371794871794872,
"grad_norm": 12.41851806640625,
"learning_rate": 2.713152676801058e-07,
"loss": 0.8057,
"step": 2300
},
{
"epoch": 0.7387820512820513,
"grad_norm": 3.9982833862304688,
"learning_rate": 2.6966292134831456e-07,
"loss": 0.8323,
"step": 2305
},
{
"epoch": 0.7403846153846154,
"grad_norm": 4.3113813400268555,
"learning_rate": 2.6801057501652344e-07,
"loss": 0.7667,
"step": 2310
},
{
"epoch": 0.7419871794871795,
"grad_norm": 6.139361381530762,
"learning_rate": 2.6635822868473226e-07,
"loss": 0.7687,
"step": 2315
},
{
"epoch": 0.7435897435897436,
"grad_norm": 13.496137619018555,
"learning_rate": 2.6470588235294114e-07,
"loss": 0.7224,
"step": 2320
},
{
"epoch": 0.7451923076923077,
"grad_norm": 7.981110095977783,
"learning_rate": 2.6305353602115e-07,
"loss": 0.8216,
"step": 2325
},
{
"epoch": 0.7467948717948718,
"grad_norm": 6.703426361083984,
"learning_rate": 2.6140118968935885e-07,
"loss": 0.8239,
"step": 2330
},
{
"epoch": 0.7483974358974359,
"grad_norm": 3.3382091522216797,
"learning_rate": 2.597488433575677e-07,
"loss": 0.8151,
"step": 2335
},
{
"epoch": 0.75,
"grad_norm": 5.277767181396484,
"learning_rate": 2.5809649702577655e-07,
"loss": 0.778,
"step": 2340
},
{
"epoch": 0.7516025641025641,
"grad_norm": 3.5990350246429443,
"learning_rate": 2.5644415069398543e-07,
"loss": 0.7541,
"step": 2345
},
{
"epoch": 0.7532051282051282,
"grad_norm": 4.577154159545898,
"learning_rate": 2.547918043621943e-07,
"loss": 0.7555,
"step": 2350
},
{
"epoch": 0.7548076923076923,
"grad_norm": 4.374950885772705,
"learning_rate": 2.5313945803040313e-07,
"loss": 0.8462,
"step": 2355
},
{
"epoch": 0.7564102564102564,
"grad_norm": 8.507906913757324,
"learning_rate": 2.51487111698612e-07,
"loss": 0.7591,
"step": 2360
},
{
"epoch": 0.7580128205128205,
"grad_norm": 4.493144512176514,
"learning_rate": 2.498347653668209e-07,
"loss": 0.7708,
"step": 2365
},
{
"epoch": 0.7596153846153846,
"grad_norm": 4.824530124664307,
"learning_rate": 2.481824190350297e-07,
"loss": 0.7318,
"step": 2370
},
{
"epoch": 0.7612179487179487,
"grad_norm": 4.022371292114258,
"learning_rate": 2.465300727032386e-07,
"loss": 0.7551,
"step": 2375
},
{
"epoch": 0.7628205128205128,
"grad_norm": 3.5510129928588867,
"learning_rate": 2.448777263714474e-07,
"loss": 0.628,
"step": 2380
},
{
"epoch": 0.7644230769230769,
"grad_norm": 6.095627307891846,
"learning_rate": 2.432253800396563e-07,
"loss": 0.725,
"step": 2385
},
{
"epoch": 0.7660256410256411,
"grad_norm": 3.520016670227051,
"learning_rate": 2.4157303370786517e-07,
"loss": 0.7484,
"step": 2390
},
{
"epoch": 0.7676282051282052,
"grad_norm": 6.656997203826904,
"learning_rate": 2.39920687376074e-07,
"loss": 0.7474,
"step": 2395
},
{
"epoch": 0.7692307692307693,
"grad_norm": 4.5073370933532715,
"learning_rate": 2.3826834104428288e-07,
"loss": 0.6534,
"step": 2400
},
{
"epoch": 0.7708333333333334,
"grad_norm": 5.180692195892334,
"learning_rate": 2.3661599471249173e-07,
"loss": 0.7398,
"step": 2405
},
{
"epoch": 0.7724358974358975,
"grad_norm": 4.856165885925293,
"learning_rate": 2.349636483807006e-07,
"loss": 0.8658,
"step": 2410
},
{
"epoch": 0.7740384615384616,
"grad_norm": 4.942265510559082,
"learning_rate": 2.3331130204890946e-07,
"loss": 0.8106,
"step": 2415
},
{
"epoch": 0.7756410256410257,
"grad_norm": 4.896393775939941,
"learning_rate": 2.316589557171183e-07,
"loss": 0.7782,
"step": 2420
},
{
"epoch": 0.7772435897435898,
"grad_norm": 4.911433696746826,
"learning_rate": 2.3000660938532716e-07,
"loss": 0.7034,
"step": 2425
},
{
"epoch": 0.7788461538461539,
"grad_norm": 5.983463287353516,
"learning_rate": 2.28354263053536e-07,
"loss": 0.7062,
"step": 2430
},
{
"epoch": 0.780448717948718,
"grad_norm": 5.0456414222717285,
"learning_rate": 2.267019167217449e-07,
"loss": 0.7615,
"step": 2435
},
{
"epoch": 0.782051282051282,
"grad_norm": 4.779991149902344,
"learning_rate": 2.2504957038995374e-07,
"loss": 0.6795,
"step": 2440
},
{
"epoch": 0.7836538461538461,
"grad_norm": 5.053199768066406,
"learning_rate": 2.233972240581626e-07,
"loss": 0.8048,
"step": 2445
},
{
"epoch": 0.7852564102564102,
"grad_norm": 7.191258907318115,
"learning_rate": 2.2174487772637144e-07,
"loss": 0.8043,
"step": 2450
},
{
"epoch": 0.7868589743589743,
"grad_norm": 3.500450611114502,
"learning_rate": 2.2009253139458027e-07,
"loss": 0.7147,
"step": 2455
},
{
"epoch": 0.7884615384615384,
"grad_norm": 4.963442325592041,
"learning_rate": 2.1844018506278917e-07,
"loss": 0.7803,
"step": 2460
},
{
"epoch": 0.7900641025641025,
"grad_norm": 4.3301777839660645,
"learning_rate": 2.16787838730998e-07,
"loss": 0.7901,
"step": 2465
},
{
"epoch": 0.7916666666666666,
"grad_norm": 4.038059711456299,
"learning_rate": 2.1513549239920685e-07,
"loss": 0.6812,
"step": 2470
},
{
"epoch": 0.7932692307692307,
"grad_norm": 5.824253559112549,
"learning_rate": 2.134831460674157e-07,
"loss": 0.7618,
"step": 2475
},
{
"epoch": 0.7948717948717948,
"grad_norm": 5.034027099609375,
"learning_rate": 2.1183079973562455e-07,
"loss": 0.6987,
"step": 2480
},
{
"epoch": 0.7964743589743589,
"grad_norm": 4.224520206451416,
"learning_rate": 2.1017845340383343e-07,
"loss": 0.8233,
"step": 2485
},
{
"epoch": 0.7980769230769231,
"grad_norm": 4.304800033569336,
"learning_rate": 2.0852610707204228e-07,
"loss": 0.683,
"step": 2490
},
{
"epoch": 0.7996794871794872,
"grad_norm": 6.027079105377197,
"learning_rate": 2.0687376074025114e-07,
"loss": 0.7514,
"step": 2495
},
{
"epoch": 0.8012820512820513,
"grad_norm": 7.2774882316589355,
"learning_rate": 2.0522141440846e-07,
"loss": 0.8308,
"step": 2500
},
{
"epoch": 0.8028846153846154,
"grad_norm": 7.033870220184326,
"learning_rate": 2.0356906807666884e-07,
"loss": 0.7758,
"step": 2505
},
{
"epoch": 0.8044871794871795,
"grad_norm": 3.2256860733032227,
"learning_rate": 2.0191672174487772e-07,
"loss": 0.759,
"step": 2510
},
{
"epoch": 0.8060897435897436,
"grad_norm": 7.072434425354004,
"learning_rate": 2.0026437541308657e-07,
"loss": 0.7686,
"step": 2515
},
{
"epoch": 0.8076923076923077,
"grad_norm": 3.3247132301330566,
"learning_rate": 1.9861202908129542e-07,
"loss": 0.7644,
"step": 2520
},
{
"epoch": 0.8092948717948718,
"grad_norm": 3.6884591579437256,
"learning_rate": 1.9695968274950427e-07,
"loss": 0.7558,
"step": 2525
},
{
"epoch": 0.8108974358974359,
"grad_norm": 5.145435333251953,
"learning_rate": 1.9530733641771312e-07,
"loss": 0.7459,
"step": 2530
},
{
"epoch": 0.8125,
"grad_norm": 4.134402751922607,
"learning_rate": 1.93654990085922e-07,
"loss": 0.8534,
"step": 2535
},
{
"epoch": 0.8141025641025641,
"grad_norm": 3.347599744796753,
"learning_rate": 1.9200264375413085e-07,
"loss": 0.8417,
"step": 2540
},
{
"epoch": 0.8157051282051282,
"grad_norm": 3.6410083770751953,
"learning_rate": 1.903502974223397e-07,
"loss": 0.8405,
"step": 2545
},
{
"epoch": 0.8173076923076923,
"grad_norm": 3.344439744949341,
"learning_rate": 1.8869795109054856e-07,
"loss": 0.7426,
"step": 2550
},
{
"epoch": 0.8189102564102564,
"grad_norm": 4.314718723297119,
"learning_rate": 1.870456047587574e-07,
"loss": 0.7995,
"step": 2555
},
{
"epoch": 0.8205128205128205,
"grad_norm": 6.937241077423096,
"learning_rate": 1.853932584269663e-07,
"loss": 0.8192,
"step": 2560
},
{
"epoch": 0.8221153846153846,
"grad_norm": 3.7095561027526855,
"learning_rate": 1.8374091209517514e-07,
"loss": 0.7015,
"step": 2565
},
{
"epoch": 0.8237179487179487,
"grad_norm": 4.655959606170654,
"learning_rate": 1.82088565763384e-07,
"loss": 0.7462,
"step": 2570
},
{
"epoch": 0.8253205128205128,
"grad_norm": 5.088621616363525,
"learning_rate": 1.8043621943159284e-07,
"loss": 0.7669,
"step": 2575
},
{
"epoch": 0.8269230769230769,
"grad_norm": 5.979193210601807,
"learning_rate": 1.7878387309980172e-07,
"loss": 0.9233,
"step": 2580
},
{
"epoch": 0.8285256410256411,
"grad_norm": 4.107568740844727,
"learning_rate": 1.7713152676801057e-07,
"loss": 0.868,
"step": 2585
},
{
"epoch": 0.8301282051282052,
"grad_norm": 3.6633615493774414,
"learning_rate": 1.7547918043621942e-07,
"loss": 0.7795,
"step": 2590
},
{
"epoch": 0.8317307692307693,
"grad_norm": 6.704728126525879,
"learning_rate": 1.7382683410442828e-07,
"loss": 0.714,
"step": 2595
},
{
"epoch": 0.8333333333333334,
"grad_norm": 6.485088348388672,
"learning_rate": 1.7217448777263713e-07,
"loss": 0.6941,
"step": 2600
},
{
"epoch": 0.8349358974358975,
"grad_norm": 5.1513566970825195,
"learning_rate": 1.70522141440846e-07,
"loss": 0.7235,
"step": 2605
},
{
"epoch": 0.8365384615384616,
"grad_norm": 6.590970039367676,
"learning_rate": 1.6886979510905486e-07,
"loss": 0.7834,
"step": 2610
},
{
"epoch": 0.8381410256410257,
"grad_norm": 3.539618730545044,
"learning_rate": 1.672174487772637e-07,
"loss": 0.7529,
"step": 2615
},
{
"epoch": 0.8397435897435898,
"grad_norm": 5.671098709106445,
"learning_rate": 1.6556510244547256e-07,
"loss": 0.8103,
"step": 2620
},
{
"epoch": 0.8413461538461539,
"grad_norm": 4.69738245010376,
"learning_rate": 1.639127561136814e-07,
"loss": 0.8686,
"step": 2625
},
{
"epoch": 0.842948717948718,
"grad_norm": 4.465817451477051,
"learning_rate": 1.622604097818903e-07,
"loss": 0.7111,
"step": 2630
},
{
"epoch": 0.844551282051282,
"grad_norm": 3.2771265506744385,
"learning_rate": 1.6060806345009914e-07,
"loss": 0.7292,
"step": 2635
},
{
"epoch": 0.8461538461538461,
"grad_norm": 7.632739067077637,
"learning_rate": 1.58955717118308e-07,
"loss": 0.786,
"step": 2640
},
{
"epoch": 0.8477564102564102,
"grad_norm": 4.397324085235596,
"learning_rate": 1.5730337078651685e-07,
"loss": 0.8378,
"step": 2645
},
{
"epoch": 0.8493589743589743,
"grad_norm": 3.7814230918884277,
"learning_rate": 1.556510244547257e-07,
"loss": 0.7088,
"step": 2650
},
{
"epoch": 0.8509615384615384,
"grad_norm": 3.752884864807129,
"learning_rate": 1.5399867812293457e-07,
"loss": 0.8107,
"step": 2655
},
{
"epoch": 0.8525641025641025,
"grad_norm": 3.5255517959594727,
"learning_rate": 1.5234633179114343e-07,
"loss": 0.7736,
"step": 2660
},
{
"epoch": 0.8541666666666666,
"grad_norm": 3.8665730953216553,
"learning_rate": 1.5069398545935228e-07,
"loss": 0.687,
"step": 2665
},
{
"epoch": 0.8557692307692307,
"grad_norm": 4.789595127105713,
"learning_rate": 1.4904163912756113e-07,
"loss": 0.718,
"step": 2670
},
{
"epoch": 0.8573717948717948,
"grad_norm": 3.834465265274048,
"learning_rate": 1.4738929279576998e-07,
"loss": 0.8931,
"step": 2675
},
{
"epoch": 0.8589743589743589,
"grad_norm": 7.070734977722168,
"learning_rate": 1.4573694646397886e-07,
"loss": 0.7071,
"step": 2680
},
{
"epoch": 0.8605769230769231,
"grad_norm": 2.9893038272857666,
"learning_rate": 1.440846001321877e-07,
"loss": 0.6394,
"step": 2685
},
{
"epoch": 0.8621794871794872,
"grad_norm": 5.302039623260498,
"learning_rate": 1.4243225380039656e-07,
"loss": 0.7887,
"step": 2690
},
{
"epoch": 0.8637820512820513,
"grad_norm": 2.9799692630767822,
"learning_rate": 1.4077990746860541e-07,
"loss": 0.851,
"step": 2695
},
{
"epoch": 0.8653846153846154,
"grad_norm": 4.219221115112305,
"learning_rate": 1.3912756113681427e-07,
"loss": 0.7475,
"step": 2700
},
{
"epoch": 0.8669871794871795,
"grad_norm": 3.4256138801574707,
"learning_rate": 1.3747521480502314e-07,
"loss": 0.806,
"step": 2705
},
{
"epoch": 0.8685897435897436,
"grad_norm": 5.021873474121094,
"learning_rate": 1.35822868473232e-07,
"loss": 0.7701,
"step": 2710
},
{
"epoch": 0.8701923076923077,
"grad_norm": 4.23788595199585,
"learning_rate": 1.3417052214144085e-07,
"loss": 0.7122,
"step": 2715
},
{
"epoch": 0.8717948717948718,
"grad_norm": 4.850051403045654,
"learning_rate": 1.325181758096497e-07,
"loss": 0.6861,
"step": 2720
},
{
"epoch": 0.8733974358974359,
"grad_norm": 5.633250713348389,
"learning_rate": 1.3086582947785855e-07,
"loss": 0.7611,
"step": 2725
},
{
"epoch": 0.875,
"grad_norm": 7.429291248321533,
"learning_rate": 1.2921348314606743e-07,
"loss": 0.6987,
"step": 2730
},
{
"epoch": 0.8766025641025641,
"grad_norm": 6.112792491912842,
"learning_rate": 1.2756113681427628e-07,
"loss": 0.7675,
"step": 2735
},
{
"epoch": 0.8782051282051282,
"grad_norm": 9.730607986450195,
"learning_rate": 1.2590879048248513e-07,
"loss": 0.7429,
"step": 2740
},
{
"epoch": 0.8798076923076923,
"grad_norm": 3.8280539512634277,
"learning_rate": 1.2425644415069398e-07,
"loss": 0.7058,
"step": 2745
},
{
"epoch": 0.8814102564102564,
"grad_norm": 5.016750812530518,
"learning_rate": 1.2260409781890284e-07,
"loss": 0.6931,
"step": 2750
},
{
"epoch": 0.8830128205128205,
"grad_norm": 3.027902603149414,
"learning_rate": 1.209517514871117e-07,
"loss": 0.744,
"step": 2755
},
{
"epoch": 0.8846153846153846,
"grad_norm": 5.9112629890441895,
"learning_rate": 1.1929940515532057e-07,
"loss": 0.7883,
"step": 2760
},
{
"epoch": 0.8862179487179487,
"grad_norm": 3.9589760303497314,
"learning_rate": 1.176470588235294e-07,
"loss": 0.6751,
"step": 2765
},
{
"epoch": 0.8878205128205128,
"grad_norm": 12.412994384765625,
"learning_rate": 1.1599471249173827e-07,
"loss": 0.741,
"step": 2770
},
{
"epoch": 0.8894230769230769,
"grad_norm": 6.313468933105469,
"learning_rate": 1.1434236615994712e-07,
"loss": 0.7083,
"step": 2775
},
{
"epoch": 0.8910256410256411,
"grad_norm": 3.4576292037963867,
"learning_rate": 1.1269001982815597e-07,
"loss": 0.6078,
"step": 2780
},
{
"epoch": 0.8926282051282052,
"grad_norm": 3.770681142807007,
"learning_rate": 1.1103767349636484e-07,
"loss": 0.7976,
"step": 2785
},
{
"epoch": 0.8942307692307693,
"grad_norm": 4.323639392852783,
"learning_rate": 1.0938532716457369e-07,
"loss": 0.726,
"step": 2790
},
{
"epoch": 0.8958333333333334,
"grad_norm": 6.223001480102539,
"learning_rate": 1.0773298083278255e-07,
"loss": 0.7428,
"step": 2795
},
{
"epoch": 0.8974358974358975,
"grad_norm": 4.867865085601807,
"learning_rate": 1.060806345009914e-07,
"loss": 0.747,
"step": 2800
},
{
"epoch": 0.8990384615384616,
"grad_norm": 4.22167444229126,
"learning_rate": 1.0442828816920026e-07,
"loss": 0.7824,
"step": 2805
},
{
"epoch": 0.9006410256410257,
"grad_norm": 3.4794094562530518,
"learning_rate": 1.0277594183740912e-07,
"loss": 0.7904,
"step": 2810
},
{
"epoch": 0.9022435897435898,
"grad_norm": 3.968479633331299,
"learning_rate": 1.0112359550561797e-07,
"loss": 0.8853,
"step": 2815
},
{
"epoch": 0.9038461538461539,
"grad_norm": 3.1891181468963623,
"learning_rate": 9.947124917382684e-08,
"loss": 0.7753,
"step": 2820
},
{
"epoch": 0.905448717948718,
"grad_norm": 4.9156646728515625,
"learning_rate": 9.781890284203569e-08,
"loss": 0.7521,
"step": 2825
},
{
"epoch": 0.907051282051282,
"grad_norm": 4.938701152801514,
"learning_rate": 9.616655651024454e-08,
"loss": 0.7361,
"step": 2830
},
{
"epoch": 0.9086538461538461,
"grad_norm": 4.312582492828369,
"learning_rate": 9.451421017845341e-08,
"loss": 0.7044,
"step": 2835
},
{
"epoch": 0.9102564102564102,
"grad_norm": 7.3174519538879395,
"learning_rate": 9.286186384666226e-08,
"loss": 0.7778,
"step": 2840
},
{
"epoch": 0.9118589743589743,
"grad_norm": 8.664481163024902,
"learning_rate": 9.120951751487112e-08,
"loss": 0.8317,
"step": 2845
},
{
"epoch": 0.9134615384615384,
"grad_norm": 8.050248146057129,
"learning_rate": 8.955717118307998e-08,
"loss": 0.7777,
"step": 2850
},
{
"epoch": 0.9150641025641025,
"grad_norm": 6.539444446563721,
"learning_rate": 8.790482485128881e-08,
"loss": 0.8357,
"step": 2855
},
{
"epoch": 0.9166666666666666,
"grad_norm": 6.118063449859619,
"learning_rate": 8.625247851949768e-08,
"loss": 0.6746,
"step": 2860
},
{
"epoch": 0.9182692307692307,
"grad_norm": 4.888671398162842,
"learning_rate": 8.460013218770653e-08,
"loss": 0.7677,
"step": 2865
},
{
"epoch": 0.9198717948717948,
"grad_norm": 5.636521816253662,
"learning_rate": 8.29477858559154e-08,
"loss": 0.7025,
"step": 2870
},
{
"epoch": 0.9214743589743589,
"grad_norm": 3.849520683288574,
"learning_rate": 8.129543952412425e-08,
"loss": 0.7187,
"step": 2875
},
{
"epoch": 0.9230769230769231,
"grad_norm": 5.312481880187988,
"learning_rate": 7.964309319233311e-08,
"loss": 0.669,
"step": 2880
},
{
"epoch": 0.9246794871794872,
"grad_norm": 6.7007527351379395,
"learning_rate": 7.799074686054196e-08,
"loss": 0.7571,
"step": 2885
},
{
"epoch": 0.9262820512820513,
"grad_norm": 5.961256980895996,
"learning_rate": 7.633840052875081e-08,
"loss": 0.733,
"step": 2890
},
{
"epoch": 0.9278846153846154,
"grad_norm": 8.099090576171875,
"learning_rate": 7.468605419695968e-08,
"loss": 0.8415,
"step": 2895
},
{
"epoch": 0.9294871794871795,
"grad_norm": 3.7094759941101074,
"learning_rate": 7.303370786516853e-08,
"loss": 0.9158,
"step": 2900
},
{
"epoch": 0.9310897435897436,
"grad_norm": 7.212512016296387,
"learning_rate": 7.13813615333774e-08,
"loss": 0.815,
"step": 2905
},
{
"epoch": 0.9326923076923077,
"grad_norm": 5.013028144836426,
"learning_rate": 6.972901520158625e-08,
"loss": 0.7161,
"step": 2910
},
{
"epoch": 0.9342948717948718,
"grad_norm": 5.3960041999816895,
"learning_rate": 6.80766688697951e-08,
"loss": 0.7817,
"step": 2915
},
{
"epoch": 0.9358974358974359,
"grad_norm": 3.4956471920013428,
"learning_rate": 6.642432253800396e-08,
"loss": 0.8383,
"step": 2920
},
{
"epoch": 0.9375,
"grad_norm": 3.654330253601074,
"learning_rate": 6.477197620621282e-08,
"loss": 0.8125,
"step": 2925
},
{
"epoch": 0.9391025641025641,
"grad_norm": 6.255533695220947,
"learning_rate": 6.311962987442168e-08,
"loss": 0.7734,
"step": 2930
},
{
"epoch": 0.9407051282051282,
"grad_norm": 4.802107810974121,
"learning_rate": 6.146728354263053e-08,
"loss": 0.709,
"step": 2935
},
{
"epoch": 0.9423076923076923,
"grad_norm": 6.442443370819092,
"learning_rate": 5.981493721083938e-08,
"loss": 0.7668,
"step": 2940
},
{
"epoch": 0.9439102564102564,
"grad_norm": 3.025623083114624,
"learning_rate": 5.816259087904825e-08,
"loss": 0.7797,
"step": 2945
},
{
"epoch": 0.9455128205128205,
"grad_norm": 4.99326753616333,
"learning_rate": 5.65102445472571e-08,
"loss": 0.7969,
"step": 2950
},
{
"epoch": 0.9471153846153846,
"grad_norm": 8.48199462890625,
"learning_rate": 5.485789821546596e-08,
"loss": 0.7861,
"step": 2955
},
{
"epoch": 0.9487179487179487,
"grad_norm": 4.070643901824951,
"learning_rate": 5.320555188367482e-08,
"loss": 0.9045,
"step": 2960
},
{
"epoch": 0.9503205128205128,
"grad_norm": 4.508942127227783,
"learning_rate": 5.1553205551883676e-08,
"loss": 0.806,
"step": 2965
},
{
"epoch": 0.9519230769230769,
"grad_norm": 5.224105358123779,
"learning_rate": 4.9900859220092534e-08,
"loss": 0.7537,
"step": 2970
},
{
"epoch": 0.9535256410256411,
"grad_norm": 5.267168998718262,
"learning_rate": 4.8248512888301386e-08,
"loss": 0.7458,
"step": 2975
},
{
"epoch": 0.9551282051282052,
"grad_norm": 14.058978080749512,
"learning_rate": 4.659616655651024e-08,
"loss": 0.8491,
"step": 2980
},
{
"epoch": 0.9567307692307693,
"grad_norm": 7.71165657043457,
"learning_rate": 4.4943820224719096e-08,
"loss": 0.7255,
"step": 2985
},
{
"epoch": 0.9583333333333334,
"grad_norm": 3.65620493888855,
"learning_rate": 4.3291473892927954e-08,
"loss": 0.7917,
"step": 2990
},
{
"epoch": 0.9599358974358975,
"grad_norm": 11.238397598266602,
"learning_rate": 4.163912756113681e-08,
"loss": 0.7828,
"step": 2995
},
{
"epoch": 0.9615384615384616,
"grad_norm": 6.159839630126953,
"learning_rate": 3.998678122934567e-08,
"loss": 0.7724,
"step": 3000
},
{
"epoch": 0.9631410256410257,
"grad_norm": 4.247456073760986,
"learning_rate": 3.833443489755452e-08,
"loss": 0.7635,
"step": 3005
},
{
"epoch": 0.9647435897435898,
"grad_norm": 5.236011505126953,
"learning_rate": 3.668208856576338e-08,
"loss": 0.7782,
"step": 3010
},
{
"epoch": 0.9663461538461539,
"grad_norm": 4.830688953399658,
"learning_rate": 3.502974223397224e-08,
"loss": 0.7962,
"step": 3015
},
{
"epoch": 0.967948717948718,
"grad_norm": 6.072144508361816,
"learning_rate": 3.33773959021811e-08,
"loss": 0.9383,
"step": 3020
},
{
"epoch": 0.969551282051282,
"grad_norm": 3.7657108306884766,
"learning_rate": 3.1725049570389955e-08,
"loss": 0.9029,
"step": 3025
},
{
"epoch": 0.9711538461538461,
"grad_norm": 5.47902774810791,
"learning_rate": 3.007270323859881e-08,
"loss": 0.8262,
"step": 3030
},
{
"epoch": 0.9727564102564102,
"grad_norm": 4.847268104553223,
"learning_rate": 2.8420356906807665e-08,
"loss": 0.8054,
"step": 3035
},
{
"epoch": 0.9743589743589743,
"grad_norm": 6.062643527984619,
"learning_rate": 2.676801057501652e-08,
"loss": 0.7808,
"step": 3040
},
{
"epoch": 0.9759615384615384,
"grad_norm": 5.440711498260498,
"learning_rate": 2.511566424322538e-08,
"loss": 0.8026,
"step": 3045
},
{
"epoch": 0.9775641025641025,
"grad_norm": 2.9105708599090576,
"learning_rate": 2.3463317911434237e-08,
"loss": 0.67,
"step": 3050
},
{
"epoch": 0.9791666666666666,
"grad_norm": 5.284862518310547,
"learning_rate": 2.1810971579643092e-08,
"loss": 0.7455,
"step": 3055
},
{
"epoch": 0.9807692307692307,
"grad_norm": 3.7022602558135986,
"learning_rate": 2.015862524785195e-08,
"loss": 0.7627,
"step": 3060
},
{
"epoch": 0.9823717948717948,
"grad_norm": 7.428618907928467,
"learning_rate": 1.850627891606081e-08,
"loss": 0.7116,
"step": 3065
},
{
"epoch": 0.9839743589743589,
"grad_norm": 6.064960956573486,
"learning_rate": 1.685393258426966e-08,
"loss": 0.8331,
"step": 3070
},
{
"epoch": 0.9855769230769231,
"grad_norm": 6.40654182434082,
"learning_rate": 1.520158625247852e-08,
"loss": 0.7827,
"step": 3075
},
{
"epoch": 0.9871794871794872,
"grad_norm": 4.364375114440918,
"learning_rate": 1.3549239920687375e-08,
"loss": 0.8266,
"step": 3080
},
{
"epoch": 0.9887820512820513,
"grad_norm": 6.127290725708008,
"learning_rate": 1.1896893588896232e-08,
"loss": 0.7636,
"step": 3085
},
{
"epoch": 0.9903846153846154,
"grad_norm": 2.9324896335601807,
"learning_rate": 1.0244547257105088e-08,
"loss": 0.6256,
"step": 3090
},
{
"epoch": 0.9919871794871795,
"grad_norm": 3.2810983657836914,
"learning_rate": 8.592200925313947e-09,
"loss": 0.7826,
"step": 3095
},
{
"epoch": 0.9935897435897436,
"grad_norm": 5.652727127075195,
"learning_rate": 6.939854593522802e-09,
"loss": 0.6604,
"step": 3100
},
{
"epoch": 0.9951923076923077,
"grad_norm": 3.927150011062622,
"learning_rate": 5.287508261731658e-09,
"loss": 0.7523,
"step": 3105
},
{
"epoch": 0.9967948717948718,
"grad_norm": 4.154155731201172,
"learning_rate": 3.6351619299405156e-09,
"loss": 0.7649,
"step": 3110
},
{
"epoch": 0.9983974358974359,
"grad_norm": 6.479323387145996,
"learning_rate": 1.9828155981493722e-09,
"loss": 0.731,
"step": 3115
},
{
"epoch": 1.0,
"grad_norm": 3.997898817062378,
"learning_rate": 3.3046926635822863e-10,
"loss": 0.8839,
"step": 3120
},
{
"epoch": 1.0,
"step": 3120,
"total_flos": 9.05641071889875e+17,
"train_loss": 0.8058771748573352,
"train_runtime": 7172.3863,
"train_samples_per_second": 6.959,
"train_steps_per_second": 0.435
}
],
"logging_steps": 5,
"max_steps": 3120,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.05641071889875e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}