{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016025641025641025, "grad_norm": 5.729795932769775, "learning_rate": 4.25531914893617e-08, "loss": 1.033, "step": 5 }, { "epoch": 0.003205128205128205, "grad_norm": 9.286614418029785, "learning_rate": 9.574468085106382e-08, "loss": 1.0218, "step": 10 }, { "epoch": 0.004807692307692308, "grad_norm": 13.221362113952637, "learning_rate": 1.4893617021276595e-07, "loss": 1.065, "step": 15 }, { "epoch": 0.00641025641025641, "grad_norm": 11.769225120544434, "learning_rate": 2.0212765957446807e-07, "loss": 1.0354, "step": 20 }, { "epoch": 0.008012820512820512, "grad_norm": 14.483790397644043, "learning_rate": 2.5531914893617016e-07, "loss": 1.0337, "step": 25 }, { "epoch": 0.009615384615384616, "grad_norm": 7.531055450439453, "learning_rate": 3.085106382978723e-07, "loss": 1.0472, "step": 30 }, { "epoch": 0.011217948717948718, "grad_norm": 6.638299942016602, "learning_rate": 3.617021276595745e-07, "loss": 0.9963, "step": 35 }, { "epoch": 0.01282051282051282, "grad_norm": 16.26578140258789, "learning_rate": 4.148936170212766e-07, "loss": 1.0217, "step": 40 }, { "epoch": 0.014423076923076924, "grad_norm": 17.74137306213379, "learning_rate": 4.6808510638297873e-07, "loss": 1.0535, "step": 45 }, { "epoch": 0.016025641025641024, "grad_norm": 12.711201667785645, "learning_rate": 5.212765957446809e-07, "loss": 0.8812, "step": 50 }, { "epoch": 0.017628205128205128, "grad_norm": 16.832317352294922, "learning_rate": 5.74468085106383e-07, "loss": 1.0726, "step": 55 }, { "epoch": 0.019230769230769232, "grad_norm": 9.954739570617676, "learning_rate": 6.276595744680851e-07, "loss": 1.0394, "step": 60 }, { "epoch": 0.020833333333333332, "grad_norm": 15.11540699005127, "learning_rate": 6.808510638297872e-07, "loss": 0.928, "step": 65 }, { "epoch": 0.022435897435897436, "grad_norm": 12.173163414001465, "learning_rate": 7.340425531914893e-07, "loss": 1.1422, "step": 70 }, { "epoch": 0.02403846153846154, "grad_norm": 4.052046775817871, "learning_rate": 7.872340425531915e-07, "loss": 1.0003, "step": 75 }, { "epoch": 0.02564102564102564, "grad_norm": 8.027369499206543, "learning_rate": 8.404255319148936e-07, "loss": 0.8759, "step": 80 }, { "epoch": 0.027243589743589744, "grad_norm": 12.160286903381348, "learning_rate": 8.936170212765957e-07, "loss": 1.1069, "step": 85 }, { "epoch": 0.028846153846153848, "grad_norm": 10.5565185546875, "learning_rate": 9.468085106382978e-07, "loss": 1.0333, "step": 90 }, { "epoch": 0.030448717948717948, "grad_norm": 7.140777111053467, "learning_rate": 1e-06, "loss": 0.9822, "step": 95 }, { "epoch": 0.03205128205128205, "grad_norm": 10.643312454223633, "learning_rate": 9.983476536682088e-07, "loss": 0.9494, "step": 100 }, { "epoch": 0.03365384615384615, "grad_norm": 8.473272323608398, "learning_rate": 9.966953073364177e-07, "loss": 0.945, "step": 105 }, { "epoch": 0.035256410256410256, "grad_norm": 4.137317180633545, "learning_rate": 9.950429610046264e-07, "loss": 1.0005, "step": 110 }, { "epoch": 0.03685897435897436, "grad_norm": 7.508554458618164, "learning_rate": 9.933906146728353e-07, "loss": 0.8444, "step": 115 }, { "epoch": 0.038461538461538464, "grad_norm": 6.77846097946167, "learning_rate": 9.917382683410441e-07, "loss": 0.9563, "step": 120 }, { "epoch": 0.04006410256410257, "grad_norm": 9.547110557556152, "learning_rate": 9.90085922009253e-07, "loss": 0.9827, "step": 125 }, { "epoch": 0.041666666666666664, "grad_norm": 13.068811416625977, "learning_rate": 9.884335756774619e-07, "loss": 0.8061, "step": 130 }, { "epoch": 0.04326923076923077, "grad_norm": 8.728320121765137, "learning_rate": 9.867812293456708e-07, "loss": 0.888, "step": 135 }, { "epoch": 0.04487179487179487, "grad_norm": 3.790550470352173, "learning_rate": 9.851288830138796e-07, "loss": 0.8734, "step": 140 }, { "epoch": 0.046474358974358976, "grad_norm": 14.830401420593262, "learning_rate": 9.834765366820885e-07, "loss": 0.9757, "step": 145 }, { "epoch": 0.04807692307692308, "grad_norm": 7.318249702453613, "learning_rate": 9.818241903502974e-07, "loss": 0.9218, "step": 150 }, { "epoch": 0.049679487179487176, "grad_norm": 3.7414937019348145, "learning_rate": 9.801718440185063e-07, "loss": 0.9611, "step": 155 }, { "epoch": 0.05128205128205128, "grad_norm": 6.792606353759766, "learning_rate": 9.78519497686715e-07, "loss": 0.9476, "step": 160 }, { "epoch": 0.052884615384615384, "grad_norm": 4.267696380615234, "learning_rate": 9.768671513549238e-07, "loss": 0.8234, "step": 165 }, { "epoch": 0.05448717948717949, "grad_norm": 5.2466959953308105, "learning_rate": 9.752148050231327e-07, "loss": 0.8669, "step": 170 }, { "epoch": 0.05608974358974359, "grad_norm": 11.836358070373535, "learning_rate": 9.735624586913416e-07, "loss": 0.7708, "step": 175 }, { "epoch": 0.057692307692307696, "grad_norm": 5.974247932434082, "learning_rate": 9.719101123595505e-07, "loss": 0.8937, "step": 180 }, { "epoch": 0.05929487179487179, "grad_norm": 3.665184497833252, "learning_rate": 9.702577660277593e-07, "loss": 0.875, "step": 185 }, { "epoch": 0.060897435897435896, "grad_norm": 4.605494022369385, "learning_rate": 9.686054196959682e-07, "loss": 0.7801, "step": 190 }, { "epoch": 0.0625, "grad_norm": 6.7438836097717285, "learning_rate": 9.66953073364177e-07, "loss": 0.8834, "step": 195 }, { "epoch": 0.0641025641025641, "grad_norm": 10.840106010437012, "learning_rate": 9.65300727032386e-07, "loss": 0.9003, "step": 200 }, { "epoch": 0.06570512820512821, "grad_norm": 5.037222385406494, "learning_rate": 9.636483807005949e-07, "loss": 0.7513, "step": 205 }, { "epoch": 0.0673076923076923, "grad_norm": 3.903541326522827, "learning_rate": 9.619960343688035e-07, "loss": 0.8828, "step": 210 }, { "epoch": 0.06891025641025642, "grad_norm": 6.461961269378662, "learning_rate": 9.603436880370124e-07, "loss": 0.7606, "step": 215 }, { "epoch": 0.07051282051282051, "grad_norm": 6.433114528656006, "learning_rate": 9.586913417052213e-07, "loss": 0.8829, "step": 220 }, { "epoch": 0.07211538461538461, "grad_norm": 6.478908538818359, "learning_rate": 9.570389953734302e-07, "loss": 0.86, "step": 225 }, { "epoch": 0.07371794871794872, "grad_norm": 5.247589588165283, "learning_rate": 9.55386649041639e-07, "loss": 0.8562, "step": 230 }, { "epoch": 0.07532051282051282, "grad_norm": 8.098102569580078, "learning_rate": 9.53734302709848e-07, "loss": 0.974, "step": 235 }, { "epoch": 0.07692307692307693, "grad_norm": 10.06252670288086, "learning_rate": 9.520819563780568e-07, "loss": 0.9281, "step": 240 }, { "epoch": 0.07852564102564102, "grad_norm": 3.637204885482788, "learning_rate": 9.504296100462657e-07, "loss": 0.8829, "step": 245 }, { "epoch": 0.08012820512820513, "grad_norm": 4.503812313079834, "learning_rate": 9.487772637144745e-07, "loss": 0.7286, "step": 250 }, { "epoch": 0.08173076923076923, "grad_norm": 8.717390060424805, "learning_rate": 9.471249173826834e-07, "loss": 0.6812, "step": 255 }, { "epoch": 0.08333333333333333, "grad_norm": 6.7273640632629395, "learning_rate": 9.454725710508922e-07, "loss": 0.808, "step": 260 }, { "epoch": 0.08493589743589744, "grad_norm": 4.702677249908447, "learning_rate": 9.438202247191011e-07, "loss": 0.9192, "step": 265 }, { "epoch": 0.08653846153846154, "grad_norm": 4.1625285148620605, "learning_rate": 9.4216787838731e-07, "loss": 0.7835, "step": 270 }, { "epoch": 0.08814102564102565, "grad_norm": 5.688870906829834, "learning_rate": 9.405155320555188e-07, "loss": 0.8823, "step": 275 }, { "epoch": 0.08974358974358974, "grad_norm": 9.040973663330078, "learning_rate": 9.388631857237277e-07, "loss": 0.9733, "step": 280 }, { "epoch": 0.09134615384615384, "grad_norm": 4.173698902130127, "learning_rate": 9.372108393919365e-07, "loss": 0.7514, "step": 285 }, { "epoch": 0.09294871794871795, "grad_norm": 7.822443962097168, "learning_rate": 9.355584930601454e-07, "loss": 0.8867, "step": 290 }, { "epoch": 0.09455128205128205, "grad_norm": 8.641590118408203, "learning_rate": 9.339061467283542e-07, "loss": 0.833, "step": 295 }, { "epoch": 0.09615384615384616, "grad_norm": 4.389246463775635, "learning_rate": 9.322538003965631e-07, "loss": 0.892, "step": 300 }, { "epoch": 0.09775641025641026, "grad_norm": 4.615504741668701, "learning_rate": 9.30601454064772e-07, "loss": 0.8854, "step": 305 }, { "epoch": 0.09935897435897435, "grad_norm": 7.86992073059082, "learning_rate": 9.289491077329808e-07, "loss": 0.8405, "step": 310 }, { "epoch": 0.10096153846153846, "grad_norm": 7.31835412979126, "learning_rate": 9.272967614011896e-07, "loss": 0.8817, "step": 315 }, { "epoch": 0.10256410256410256, "grad_norm": 3.403594970703125, "learning_rate": 9.256444150693985e-07, "loss": 0.8149, "step": 320 }, { "epoch": 0.10416666666666667, "grad_norm": 7.1932806968688965, "learning_rate": 9.239920687376074e-07, "loss": 0.9139, "step": 325 }, { "epoch": 0.10576923076923077, "grad_norm": 5.652829170227051, "learning_rate": 9.223397224058163e-07, "loss": 0.8335, "step": 330 }, { "epoch": 0.10737179487179487, "grad_norm": 4.582092761993408, "learning_rate": 9.20687376074025e-07, "loss": 0.8175, "step": 335 }, { "epoch": 0.10897435897435898, "grad_norm": 6.207703113555908, "learning_rate": 9.190350297422339e-07, "loss": 0.8367, "step": 340 }, { "epoch": 0.11057692307692307, "grad_norm": 7.341710567474365, "learning_rate": 9.173826834104428e-07, "loss": 0.838, "step": 345 }, { "epoch": 0.11217948717948718, "grad_norm": 4.263551712036133, "learning_rate": 9.157303370786517e-07, "loss": 0.8722, "step": 350 }, { "epoch": 0.11378205128205128, "grad_norm": 5.9049601554870605, "learning_rate": 9.140779907468606e-07, "loss": 0.8158, "step": 355 }, { "epoch": 0.11538461538461539, "grad_norm": 6.010617256164551, "learning_rate": 9.124256444150693e-07, "loss": 0.7757, "step": 360 }, { "epoch": 0.11698717948717949, "grad_norm": 5.599278450012207, "learning_rate": 9.107732980832782e-07, "loss": 0.8487, "step": 365 }, { "epoch": 0.11858974358974358, "grad_norm": 6.303196907043457, "learning_rate": 9.091209517514871e-07, "loss": 0.8727, "step": 370 }, { "epoch": 0.1201923076923077, "grad_norm": 5.943972110748291, "learning_rate": 9.07468605419696e-07, "loss": 0.7266, "step": 375 }, { "epoch": 0.12179487179487179, "grad_norm": 10.433466911315918, "learning_rate": 9.058162590879048e-07, "loss": 0.8264, "step": 380 }, { "epoch": 0.1233974358974359, "grad_norm": 6.700842380523682, "learning_rate": 9.041639127561136e-07, "loss": 0.9768, "step": 385 }, { "epoch": 0.125, "grad_norm": 10.210798263549805, "learning_rate": 9.025115664243225e-07, "loss": 0.8168, "step": 390 }, { "epoch": 0.1266025641025641, "grad_norm": 4.839009761810303, "learning_rate": 9.008592200925314e-07, "loss": 0.8856, "step": 395 }, { "epoch": 0.1282051282051282, "grad_norm": 8.077885627746582, "learning_rate": 8.992068737607403e-07, "loss": 0.9729, "step": 400 }, { "epoch": 0.12980769230769232, "grad_norm": 8.734336853027344, "learning_rate": 8.975545274289491e-07, "loss": 0.9824, "step": 405 }, { "epoch": 0.13141025641025642, "grad_norm": 5.260401725769043, "learning_rate": 8.959021810971579e-07, "loss": 0.8476, "step": 410 }, { "epoch": 0.1330128205128205, "grad_norm": 5.269688129425049, "learning_rate": 8.942498347653668e-07, "loss": 0.8591, "step": 415 }, { "epoch": 0.1346153846153846, "grad_norm": 4.150247097015381, "learning_rate": 8.925974884335757e-07, "loss": 0.8461, "step": 420 }, { "epoch": 0.1362179487179487, "grad_norm": 4.139176845550537, "learning_rate": 8.909451421017845e-07, "loss": 0.9335, "step": 425 }, { "epoch": 0.13782051282051283, "grad_norm": 7.5222554206848145, "learning_rate": 8.892927957699934e-07, "loss": 0.7143, "step": 430 }, { "epoch": 0.13942307692307693, "grad_norm": 12.695758819580078, "learning_rate": 8.876404494382022e-07, "loss": 0.8184, "step": 435 }, { "epoch": 0.14102564102564102, "grad_norm": 8.057138442993164, "learning_rate": 8.859881031064111e-07, "loss": 0.9017, "step": 440 }, { "epoch": 0.14262820512820512, "grad_norm": 8.482138633728027, "learning_rate": 8.843357567746199e-07, "loss": 0.9694, "step": 445 }, { "epoch": 0.14423076923076922, "grad_norm": 12.769122123718262, "learning_rate": 8.826834104428288e-07, "loss": 0.8384, "step": 450 }, { "epoch": 0.14583333333333334, "grad_norm": 5.045727252960205, "learning_rate": 8.810310641110377e-07, "loss": 0.8156, "step": 455 }, { "epoch": 0.14743589743589744, "grad_norm": 9.09874153137207, "learning_rate": 8.793787177792465e-07, "loss": 0.8116, "step": 460 }, { "epoch": 0.14903846153846154, "grad_norm": 6.691732883453369, "learning_rate": 8.777263714474553e-07, "loss": 0.8814, "step": 465 }, { "epoch": 0.15064102564102563, "grad_norm": 5.676293849945068, "learning_rate": 8.760740251156642e-07, "loss": 0.8186, "step": 470 }, { "epoch": 0.15224358974358973, "grad_norm": 8.919610977172852, "learning_rate": 8.744216787838731e-07, "loss": 0.7442, "step": 475 }, { "epoch": 0.15384615384615385, "grad_norm": 4.288793087005615, "learning_rate": 8.72769332452082e-07, "loss": 0.8538, "step": 480 }, { "epoch": 0.15544871794871795, "grad_norm": 8.457489013671875, "learning_rate": 8.711169861202908e-07, "loss": 0.8284, "step": 485 }, { "epoch": 0.15705128205128205, "grad_norm": 8.613219261169434, "learning_rate": 8.694646397884996e-07, "loss": 0.8465, "step": 490 }, { "epoch": 0.15865384615384615, "grad_norm": 5.168330192565918, "learning_rate": 8.678122934567085e-07, "loss": 0.854, "step": 495 }, { "epoch": 0.16025641025641027, "grad_norm": 6.283329010009766, "learning_rate": 8.661599471249174e-07, "loss": 0.9902, "step": 500 }, { "epoch": 0.16185897435897437, "grad_norm": 8.224679946899414, "learning_rate": 8.645076007931263e-07, "loss": 0.9261, "step": 505 }, { "epoch": 0.16346153846153846, "grad_norm": 3.9687061309814453, "learning_rate": 8.62855254461335e-07, "loss": 0.8671, "step": 510 }, { "epoch": 0.16506410256410256, "grad_norm": 3.925053358078003, "learning_rate": 8.612029081295439e-07, "loss": 0.6827, "step": 515 }, { "epoch": 0.16666666666666666, "grad_norm": 4.103531837463379, "learning_rate": 8.595505617977528e-07, "loss": 0.9075, "step": 520 }, { "epoch": 0.16826923076923078, "grad_norm": 4.411681175231934, "learning_rate": 8.578982154659617e-07, "loss": 0.7698, "step": 525 }, { "epoch": 0.16987179487179488, "grad_norm": 8.91723346710205, "learning_rate": 8.562458691341706e-07, "loss": 0.9759, "step": 530 }, { "epoch": 0.17147435897435898, "grad_norm": 3.293285846710205, "learning_rate": 8.545935228023793e-07, "loss": 0.7131, "step": 535 }, { "epoch": 0.17307692307692307, "grad_norm": 4.500021934509277, "learning_rate": 8.529411764705882e-07, "loss": 0.6866, "step": 540 }, { "epoch": 0.17467948717948717, "grad_norm": 3.7127466201782227, "learning_rate": 8.512888301387971e-07, "loss": 0.7112, "step": 545 }, { "epoch": 0.1762820512820513, "grad_norm": 5.5667877197265625, "learning_rate": 8.49636483807006e-07, "loss": 0.7516, "step": 550 }, { "epoch": 0.1778846153846154, "grad_norm": 4.206048965454102, "learning_rate": 8.479841374752148e-07, "loss": 0.8639, "step": 555 }, { "epoch": 0.1794871794871795, "grad_norm": 3.593855857849121, "learning_rate": 8.463317911434236e-07, "loss": 0.7413, "step": 560 }, { "epoch": 0.18108974358974358, "grad_norm": 9.683537483215332, "learning_rate": 8.446794448116325e-07, "loss": 0.9477, "step": 565 }, { "epoch": 0.18269230769230768, "grad_norm": 5.113137245178223, "learning_rate": 8.430270984798414e-07, "loss": 0.8425, "step": 570 }, { "epoch": 0.1842948717948718, "grad_norm": 10.013446807861328, "learning_rate": 8.413747521480502e-07, "loss": 0.9511, "step": 575 }, { "epoch": 0.1858974358974359, "grad_norm": 7.936026573181152, "learning_rate": 8.397224058162591e-07, "loss": 0.8367, "step": 580 }, { "epoch": 0.1875, "grad_norm": 4.949577331542969, "learning_rate": 8.38070059484468e-07, "loss": 0.7833, "step": 585 }, { "epoch": 0.1891025641025641, "grad_norm": 5.491623878479004, "learning_rate": 8.364177131526768e-07, "loss": 0.7967, "step": 590 }, { "epoch": 0.1907051282051282, "grad_norm": 9.594220161437988, "learning_rate": 8.347653668208857e-07, "loss": 0.8505, "step": 595 }, { "epoch": 0.19230769230769232, "grad_norm": 6.291924476623535, "learning_rate": 8.331130204890945e-07, "loss": 0.7231, "step": 600 }, { "epoch": 0.19391025641025642, "grad_norm": 5.185746192932129, "learning_rate": 8.314606741573034e-07, "loss": 0.8033, "step": 605 }, { "epoch": 0.1955128205128205, "grad_norm": 9.937252044677734, "learning_rate": 8.298083278255123e-07, "loss": 0.8159, "step": 610 }, { "epoch": 0.1971153846153846, "grad_norm": 3.5764591693878174, "learning_rate": 8.281559814937211e-07, "loss": 0.9405, "step": 615 }, { "epoch": 0.1987179487179487, "grad_norm": 4.1528496742248535, "learning_rate": 8.265036351619299e-07, "loss": 0.7852, "step": 620 }, { "epoch": 0.20032051282051283, "grad_norm": 4.072427272796631, "learning_rate": 8.248512888301388e-07, "loss": 0.7844, "step": 625 }, { "epoch": 0.20192307692307693, "grad_norm": 8.563277244567871, "learning_rate": 8.231989424983477e-07, "loss": 0.8309, "step": 630 }, { "epoch": 0.20352564102564102, "grad_norm": 6.037329196929932, "learning_rate": 8.215465961665566e-07, "loss": 0.782, "step": 635 }, { "epoch": 0.20512820512820512, "grad_norm": 5.000993728637695, "learning_rate": 8.198942498347653e-07, "loss": 0.9419, "step": 640 }, { "epoch": 0.20673076923076922, "grad_norm": 4.175522327423096, "learning_rate": 8.182419035029742e-07, "loss": 0.8316, "step": 645 }, { "epoch": 0.20833333333333334, "grad_norm": 5.075506210327148, "learning_rate": 8.165895571711831e-07, "loss": 0.8471, "step": 650 }, { "epoch": 0.20993589743589744, "grad_norm": 5.188806533813477, "learning_rate": 8.14937210839392e-07, "loss": 0.8379, "step": 655 }, { "epoch": 0.21153846153846154, "grad_norm": 6.2080078125, "learning_rate": 8.132848645076009e-07, "loss": 0.9081, "step": 660 }, { "epoch": 0.21314102564102563, "grad_norm": 4.525467395782471, "learning_rate": 8.116325181758096e-07, "loss": 0.8066, "step": 665 }, { "epoch": 0.21474358974358973, "grad_norm": 5.5678582191467285, "learning_rate": 8.099801718440185e-07, "loss": 0.7192, "step": 670 }, { "epoch": 0.21634615384615385, "grad_norm": 6.47728157043457, "learning_rate": 8.083278255122274e-07, "loss": 0.7436, "step": 675 }, { "epoch": 0.21794871794871795, "grad_norm": 4.739030838012695, "learning_rate": 8.066754791804363e-07, "loss": 0.8783, "step": 680 }, { "epoch": 0.21955128205128205, "grad_norm": 6.747486591339111, "learning_rate": 8.050231328486451e-07, "loss": 0.8484, "step": 685 }, { "epoch": 0.22115384615384615, "grad_norm": 6.090416431427002, "learning_rate": 8.033707865168539e-07, "loss": 0.8766, "step": 690 }, { "epoch": 0.22275641025641027, "grad_norm": 5.005781650543213, "learning_rate": 8.017184401850628e-07, "loss": 0.8299, "step": 695 }, { "epoch": 0.22435897435897437, "grad_norm": 5.198122024536133, "learning_rate": 8.000660938532717e-07, "loss": 0.8107, "step": 700 }, { "epoch": 0.22596153846153846, "grad_norm": 5.170607089996338, "learning_rate": 7.984137475214805e-07, "loss": 0.8751, "step": 705 }, { "epoch": 0.22756410256410256, "grad_norm": 4.371824741363525, "learning_rate": 7.967614011896894e-07, "loss": 0.8545, "step": 710 }, { "epoch": 0.22916666666666666, "grad_norm": 7.1865363121032715, "learning_rate": 7.951090548578981e-07, "loss": 0.8519, "step": 715 }, { "epoch": 0.23076923076923078, "grad_norm": 11.179749488830566, "learning_rate": 7.93456708526107e-07, "loss": 0.7942, "step": 720 }, { "epoch": 0.23237179487179488, "grad_norm": 8.086874008178711, "learning_rate": 7.91804362194316e-07, "loss": 0.8385, "step": 725 }, { "epoch": 0.23397435897435898, "grad_norm": 5.28953218460083, "learning_rate": 7.901520158625248e-07, "loss": 0.9464, "step": 730 }, { "epoch": 0.23557692307692307, "grad_norm": 5.9961018562316895, "learning_rate": 7.884996695307337e-07, "loss": 0.917, "step": 735 }, { "epoch": 0.23717948717948717, "grad_norm": 6.03367805480957, "learning_rate": 7.868473231989424e-07, "loss": 0.7771, "step": 740 }, { "epoch": 0.2387820512820513, "grad_norm": 4.500458717346191, "learning_rate": 7.851949768671513e-07, "loss": 0.7903, "step": 745 }, { "epoch": 0.2403846153846154, "grad_norm": 3.947294235229492, "learning_rate": 7.835426305353601e-07, "loss": 0.795, "step": 750 }, { "epoch": 0.2419871794871795, "grad_norm": 7.3017683029174805, "learning_rate": 7.81890284203569e-07, "loss": 0.8138, "step": 755 }, { "epoch": 0.24358974358974358, "grad_norm": 3.787949562072754, "learning_rate": 7.802379378717779e-07, "loss": 0.6665, "step": 760 }, { "epoch": 0.24519230769230768, "grad_norm": 5.326612949371338, "learning_rate": 7.785855915399867e-07, "loss": 0.9742, "step": 765 }, { "epoch": 0.2467948717948718, "grad_norm": 6.92157506942749, "learning_rate": 7.769332452081955e-07, "loss": 0.841, "step": 770 }, { "epoch": 0.2483974358974359, "grad_norm": 4.417288780212402, "learning_rate": 7.752808988764044e-07, "loss": 0.9047, "step": 775 }, { "epoch": 0.25, "grad_norm": 3.6038155555725098, "learning_rate": 7.736285525446133e-07, "loss": 0.7922, "step": 780 }, { "epoch": 0.2516025641025641, "grad_norm": 4.835304260253906, "learning_rate": 7.719762062128222e-07, "loss": 0.8349, "step": 785 }, { "epoch": 0.2532051282051282, "grad_norm": 3.1939454078674316, "learning_rate": 7.703238598810309e-07, "loss": 0.7257, "step": 790 }, { "epoch": 0.2548076923076923, "grad_norm": 8.088797569274902, "learning_rate": 7.686715135492398e-07, "loss": 0.8311, "step": 795 }, { "epoch": 0.2564102564102564, "grad_norm": 7.198094367980957, "learning_rate": 7.670191672174487e-07, "loss": 0.8427, "step": 800 }, { "epoch": 0.25801282051282054, "grad_norm": 5.080805778503418, "learning_rate": 7.653668208856576e-07, "loss": 0.7725, "step": 805 }, { "epoch": 0.25961538461538464, "grad_norm": 3.3601558208465576, "learning_rate": 7.637144745538665e-07, "loss": 0.859, "step": 810 }, { "epoch": 0.26121794871794873, "grad_norm": 6.839197158813477, "learning_rate": 7.620621282220752e-07, "loss": 0.8956, "step": 815 }, { "epoch": 0.26282051282051283, "grad_norm": 4.368642807006836, "learning_rate": 7.604097818902841e-07, "loss": 0.9344, "step": 820 }, { "epoch": 0.2644230769230769, "grad_norm": 4.079487323760986, "learning_rate": 7.58757435558493e-07, "loss": 0.7743, "step": 825 }, { "epoch": 0.266025641025641, "grad_norm": 7.400752544403076, "learning_rate": 7.571050892267019e-07, "loss": 0.8653, "step": 830 }, { "epoch": 0.2676282051282051, "grad_norm": 6.021170616149902, "learning_rate": 7.554527428949107e-07, "loss": 0.929, "step": 835 }, { "epoch": 0.2692307692307692, "grad_norm": 7.803846836090088, "learning_rate": 7.538003965631195e-07, "loss": 0.7471, "step": 840 }, { "epoch": 0.2708333333333333, "grad_norm": 11.89211654663086, "learning_rate": 7.521480502313284e-07, "loss": 0.7173, "step": 845 }, { "epoch": 0.2724358974358974, "grad_norm": 9.066969871520996, "learning_rate": 7.504957038995373e-07, "loss": 0.9117, "step": 850 }, { "epoch": 0.27403846153846156, "grad_norm": 5.939947128295898, "learning_rate": 7.488433575677461e-07, "loss": 0.7001, "step": 855 }, { "epoch": 0.27564102564102566, "grad_norm": 4.300017356872559, "learning_rate": 7.47191011235955e-07, "loss": 0.9133, "step": 860 }, { "epoch": 0.27724358974358976, "grad_norm": 3.9818003177642822, "learning_rate": 7.455386649041638e-07, "loss": 0.8436, "step": 865 }, { "epoch": 0.27884615384615385, "grad_norm": 6.319674968719482, "learning_rate": 7.438863185723727e-07, "loss": 0.8385, "step": 870 }, { "epoch": 0.28044871794871795, "grad_norm": 7.230429172515869, "learning_rate": 7.422339722405816e-07, "loss": 0.8215, "step": 875 }, { "epoch": 0.28205128205128205, "grad_norm": 3.5045459270477295, "learning_rate": 7.405816259087904e-07, "loss": 0.7351, "step": 880 }, { "epoch": 0.28365384615384615, "grad_norm": 5.423972129821777, "learning_rate": 7.389292795769993e-07, "loss": 0.7768, "step": 885 }, { "epoch": 0.28525641025641024, "grad_norm": 9.424778938293457, "learning_rate": 7.372769332452081e-07, "loss": 0.903, "step": 890 }, { "epoch": 0.28685897435897434, "grad_norm": 4.601898670196533, "learning_rate": 7.35624586913417e-07, "loss": 0.8104, "step": 895 }, { "epoch": 0.28846153846153844, "grad_norm": 5.262858867645264, "learning_rate": 7.339722405816258e-07, "loss": 0.8147, "step": 900 }, { "epoch": 0.2900641025641026, "grad_norm": 4.327410697937012, "learning_rate": 7.323198942498347e-07, "loss": 0.7795, "step": 905 }, { "epoch": 0.2916666666666667, "grad_norm": 5.896692752838135, "learning_rate": 7.306675479180436e-07, "loss": 0.7502, "step": 910 }, { "epoch": 0.2932692307692308, "grad_norm": 4.993595600128174, "learning_rate": 7.290152015862524e-07, "loss": 0.9855, "step": 915 }, { "epoch": 0.2948717948717949, "grad_norm": 7.250411510467529, "learning_rate": 7.273628552544612e-07, "loss": 0.6526, "step": 920 }, { "epoch": 0.296474358974359, "grad_norm": 5.891010761260986, "learning_rate": 7.257105089226701e-07, "loss": 0.8565, "step": 925 }, { "epoch": 0.2980769230769231, "grad_norm": 3.8717401027679443, "learning_rate": 7.24058162590879e-07, "loss": 0.9124, "step": 930 }, { "epoch": 0.29967948717948717, "grad_norm": 5.1769537925720215, "learning_rate": 7.224058162590879e-07, "loss": 0.6785, "step": 935 }, { "epoch": 0.30128205128205127, "grad_norm": 4.895565986633301, "learning_rate": 7.207534699272967e-07, "loss": 0.763, "step": 940 }, { "epoch": 0.30288461538461536, "grad_norm": 7.584598541259766, "learning_rate": 7.191011235955055e-07, "loss": 0.8159, "step": 945 }, { "epoch": 0.30448717948717946, "grad_norm": 2.980520009994507, "learning_rate": 7.174487772637144e-07, "loss": 0.8258, "step": 950 }, { "epoch": 0.3060897435897436, "grad_norm": 5.3033528327941895, "learning_rate": 7.157964309319233e-07, "loss": 0.9091, "step": 955 }, { "epoch": 0.3076923076923077, "grad_norm": 5.652465343475342, "learning_rate": 7.141440846001322e-07, "loss": 0.7601, "step": 960 }, { "epoch": 0.3092948717948718, "grad_norm": 2.7802562713623047, "learning_rate": 7.124917382683409e-07, "loss": 0.6819, "step": 965 }, { "epoch": 0.3108974358974359, "grad_norm": 10.060710906982422, "learning_rate": 7.108393919365498e-07, "loss": 0.9308, "step": 970 }, { "epoch": 0.3125, "grad_norm": 8.689105033874512, "learning_rate": 7.091870456047587e-07, "loss": 0.7976, "step": 975 }, { "epoch": 0.3141025641025641, "grad_norm": 7.55824613571167, "learning_rate": 7.075346992729676e-07, "loss": 0.7857, "step": 980 }, { "epoch": 0.3157051282051282, "grad_norm": 4.5640034675598145, "learning_rate": 7.058823529411765e-07, "loss": 0.8621, "step": 985 }, { "epoch": 0.3173076923076923, "grad_norm": 7.791897773742676, "learning_rate": 7.042300066093852e-07, "loss": 0.8943, "step": 990 }, { "epoch": 0.3189102564102564, "grad_norm": 4.598413944244385, "learning_rate": 7.025776602775941e-07, "loss": 0.8254, "step": 995 }, { "epoch": 0.32051282051282054, "grad_norm": 6.27009391784668, "learning_rate": 7.00925313945803e-07, "loss": 0.8624, "step": 1000 }, { "epoch": 0.32211538461538464, "grad_norm": 13.144405364990234, "learning_rate": 6.992729676140119e-07, "loss": 0.9121, "step": 1005 }, { "epoch": 0.32371794871794873, "grad_norm": 3.142514944076538, "learning_rate": 6.976206212822207e-07, "loss": 0.8858, "step": 1010 }, { "epoch": 0.32532051282051283, "grad_norm": 3.696758270263672, "learning_rate": 6.959682749504295e-07, "loss": 0.86, "step": 1015 }, { "epoch": 0.3269230769230769, "grad_norm": 11.541287422180176, "learning_rate": 6.943159286186384e-07, "loss": 0.6851, "step": 1020 }, { "epoch": 0.328525641025641, "grad_norm": 8.48985481262207, "learning_rate": 6.926635822868473e-07, "loss": 0.793, "step": 1025 }, { "epoch": 0.3301282051282051, "grad_norm": 3.3774638175964355, "learning_rate": 6.910112359550561e-07, "loss": 0.8819, "step": 1030 }, { "epoch": 0.3317307692307692, "grad_norm": 5.883586406707764, "learning_rate": 6.89358889623265e-07, "loss": 0.7703, "step": 1035 }, { "epoch": 0.3333333333333333, "grad_norm": 4.836696624755859, "learning_rate": 6.877065432914738e-07, "loss": 0.7873, "step": 1040 }, { "epoch": 0.3349358974358974, "grad_norm": 4.359090805053711, "learning_rate": 6.860541969596827e-07, "loss": 0.7078, "step": 1045 }, { "epoch": 0.33653846153846156, "grad_norm": 4.49058723449707, "learning_rate": 6.844018506278915e-07, "loss": 0.8212, "step": 1050 }, { "epoch": 0.33814102564102566, "grad_norm": 4.579678535461426, "learning_rate": 6.827495042961004e-07, "loss": 0.7983, "step": 1055 }, { "epoch": 0.33974358974358976, "grad_norm": 9.115843772888184, "learning_rate": 6.810971579643093e-07, "loss": 0.6587, "step": 1060 }, { "epoch": 0.34134615384615385, "grad_norm": 5.484290599822998, "learning_rate": 6.794448116325181e-07, "loss": 0.7931, "step": 1065 }, { "epoch": 0.34294871794871795, "grad_norm": 8.550032615661621, "learning_rate": 6.77792465300727e-07, "loss": 0.7788, "step": 1070 }, { "epoch": 0.34455128205128205, "grad_norm": 3.559866428375244, "learning_rate": 6.761401189689358e-07, "loss": 0.8319, "step": 1075 }, { "epoch": 0.34615384615384615, "grad_norm": 4.798201560974121, "learning_rate": 6.744877726371447e-07, "loss": 0.8986, "step": 1080 }, { "epoch": 0.34775641025641024, "grad_norm": 5.144353866577148, "learning_rate": 6.728354263053536e-07, "loss": 0.6799, "step": 1085 }, { "epoch": 0.34935897435897434, "grad_norm": 8.058225631713867, "learning_rate": 6.711830799735624e-07, "loss": 0.8101, "step": 1090 }, { "epoch": 0.35096153846153844, "grad_norm": 5.630926132202148, "learning_rate": 6.695307336417712e-07, "loss": 0.8777, "step": 1095 }, { "epoch": 0.3525641025641026, "grad_norm": 6.345671653747559, "learning_rate": 6.678783873099801e-07, "loss": 0.6524, "step": 1100 }, { "epoch": 0.3541666666666667, "grad_norm": 11.713841438293457, "learning_rate": 6.66226040978189e-07, "loss": 0.8206, "step": 1105 }, { "epoch": 0.3557692307692308, "grad_norm": 4.345118999481201, "learning_rate": 6.645736946463979e-07, "loss": 0.8694, "step": 1110 }, { "epoch": 0.3573717948717949, "grad_norm": 6.495255947113037, "learning_rate": 6.629213483146066e-07, "loss": 0.8721, "step": 1115 }, { "epoch": 0.358974358974359, "grad_norm": 4.048442840576172, "learning_rate": 6.612690019828155e-07, "loss": 0.8307, "step": 1120 }, { "epoch": 0.3605769230769231, "grad_norm": 5.020994186401367, "learning_rate": 6.596166556510244e-07, "loss": 0.7022, "step": 1125 }, { "epoch": 0.36217948717948717, "grad_norm": 4.994934558868408, "learning_rate": 6.579643093192333e-07, "loss": 0.8302, "step": 1130 }, { "epoch": 0.36378205128205127, "grad_norm": 3.8185691833496094, "learning_rate": 6.563119629874422e-07, "loss": 0.702, "step": 1135 }, { "epoch": 0.36538461538461536, "grad_norm": 5.440126895904541, "learning_rate": 6.546596166556509e-07, "loss": 0.8589, "step": 1140 }, { "epoch": 0.36698717948717946, "grad_norm": 4.935449600219727, "learning_rate": 6.530072703238598e-07, "loss": 0.6958, "step": 1145 }, { "epoch": 0.3685897435897436, "grad_norm": 3.0225484371185303, "learning_rate": 6.513549239920687e-07, "loss": 0.7535, "step": 1150 }, { "epoch": 0.3701923076923077, "grad_norm": 4.705560684204102, "learning_rate": 6.497025776602776e-07, "loss": 0.8748, "step": 1155 }, { "epoch": 0.3717948717948718, "grad_norm": 7.766085147857666, "learning_rate": 6.480502313284864e-07, "loss": 0.7895, "step": 1160 }, { "epoch": 0.3733974358974359, "grad_norm": 7.818696975708008, "learning_rate": 6.463978849966952e-07, "loss": 0.8482, "step": 1165 }, { "epoch": 0.375, "grad_norm": 4.289005279541016, "learning_rate": 6.447455386649041e-07, "loss": 0.8875, "step": 1170 }, { "epoch": 0.3766025641025641, "grad_norm": 4.8739752769470215, "learning_rate": 6.43093192333113e-07, "loss": 0.8013, "step": 1175 }, { "epoch": 0.3782051282051282, "grad_norm": 3.6027133464813232, "learning_rate": 6.414408460013219e-07, "loss": 0.892, "step": 1180 }, { "epoch": 0.3798076923076923, "grad_norm": 4.740626335144043, "learning_rate": 6.397884996695307e-07, "loss": 0.7148, "step": 1185 }, { "epoch": 0.3814102564102564, "grad_norm": 3.2007155418395996, "learning_rate": 6.381361533377395e-07, "loss": 0.7979, "step": 1190 }, { "epoch": 0.38301282051282054, "grad_norm": 4.596534729003906, "learning_rate": 6.364838070059484e-07, "loss": 0.8757, "step": 1195 }, { "epoch": 0.38461538461538464, "grad_norm": 4.348639488220215, "learning_rate": 6.348314606741573e-07, "loss": 0.7475, "step": 1200 }, { "epoch": 0.38621794871794873, "grad_norm": 4.388121128082275, "learning_rate": 6.331791143423661e-07, "loss": 0.7703, "step": 1205 }, { "epoch": 0.38782051282051283, "grad_norm": 3.776677370071411, "learning_rate": 6.31526768010575e-07, "loss": 0.7684, "step": 1210 }, { "epoch": 0.3894230769230769, "grad_norm": 4.856482028961182, "learning_rate": 6.298744216787838e-07, "loss": 0.8132, "step": 1215 }, { "epoch": 0.391025641025641, "grad_norm": 8.453356742858887, "learning_rate": 6.282220753469927e-07, "loss": 0.8172, "step": 1220 }, { "epoch": 0.3926282051282051, "grad_norm": 9.322402954101562, "learning_rate": 6.265697290152015e-07, "loss": 0.7704, "step": 1225 }, { "epoch": 0.3942307692307692, "grad_norm": 4.034356594085693, "learning_rate": 6.249173826834104e-07, "loss": 0.8072, "step": 1230 }, { "epoch": 0.3958333333333333, "grad_norm": 6.027692794799805, "learning_rate": 6.232650363516193e-07, "loss": 0.6584, "step": 1235 }, { "epoch": 0.3974358974358974, "grad_norm": 3.487473487854004, "learning_rate": 6.216126900198281e-07, "loss": 0.8855, "step": 1240 }, { "epoch": 0.39903846153846156, "grad_norm": 3.8283722400665283, "learning_rate": 6.19960343688037e-07, "loss": 0.8043, "step": 1245 }, { "epoch": 0.40064102564102566, "grad_norm": 4.080909252166748, "learning_rate": 6.183079973562458e-07, "loss": 0.7189, "step": 1250 }, { "epoch": 0.40224358974358976, "grad_norm": 10.283230781555176, "learning_rate": 6.166556510244547e-07, "loss": 0.6539, "step": 1255 }, { "epoch": 0.40384615384615385, "grad_norm": 3.7215747833251953, "learning_rate": 6.150033046926636e-07, "loss": 0.8382, "step": 1260 }, { "epoch": 0.40544871794871795, "grad_norm": 6.174632549285889, "learning_rate": 6.133509583608724e-07, "loss": 0.6854, "step": 1265 }, { "epoch": 0.40705128205128205, "grad_norm": 5.546038627624512, "learning_rate": 6.116986120290812e-07, "loss": 0.633, "step": 1270 }, { "epoch": 0.40865384615384615, "grad_norm": 4.6828813552856445, "learning_rate": 6.100462656972901e-07, "loss": 0.864, "step": 1275 }, { "epoch": 0.41025641025641024, "grad_norm": 5.444061279296875, "learning_rate": 6.08393919365499e-07, "loss": 0.8923, "step": 1280 }, { "epoch": 0.41185897435897434, "grad_norm": 9.895957946777344, "learning_rate": 6.067415730337079e-07, "loss": 0.91, "step": 1285 }, { "epoch": 0.41346153846153844, "grad_norm": 3.8047962188720703, "learning_rate": 6.050892267019166e-07, "loss": 0.82, "step": 1290 }, { "epoch": 0.4150641025641026, "grad_norm": 5.19016695022583, "learning_rate": 6.034368803701255e-07, "loss": 0.8092, "step": 1295 }, { "epoch": 0.4166666666666667, "grad_norm": 3.637864112854004, "learning_rate": 6.017845340383344e-07, "loss": 0.8826, "step": 1300 }, { "epoch": 0.4182692307692308, "grad_norm": 2.6663596630096436, "learning_rate": 6.001321877065433e-07, "loss": 0.6852, "step": 1305 }, { "epoch": 0.4198717948717949, "grad_norm": 3.58880615234375, "learning_rate": 5.984798413747522e-07, "loss": 0.8336, "step": 1310 }, { "epoch": 0.421474358974359, "grad_norm": 2.4447970390319824, "learning_rate": 5.968274950429609e-07, "loss": 0.7603, "step": 1315 }, { "epoch": 0.4230769230769231, "grad_norm": 4.405289649963379, "learning_rate": 5.951751487111698e-07, "loss": 0.7799, "step": 1320 }, { "epoch": 0.42467948717948717, "grad_norm": 4.415432929992676, "learning_rate": 5.935228023793787e-07, "loss": 0.7517, "step": 1325 }, { "epoch": 0.42628205128205127, "grad_norm": 2.538200616836548, "learning_rate": 5.918704560475876e-07, "loss": 0.8617, "step": 1330 }, { "epoch": 0.42788461538461536, "grad_norm": 5.295281887054443, "learning_rate": 5.902181097157964e-07, "loss": 0.8335, "step": 1335 }, { "epoch": 0.42948717948717946, "grad_norm": 10.389196395874023, "learning_rate": 5.885657633840052e-07, "loss": 0.8529, "step": 1340 }, { "epoch": 0.4310897435897436, "grad_norm": 4.9335713386535645, "learning_rate": 5.869134170522141e-07, "loss": 0.8276, "step": 1345 }, { "epoch": 0.4326923076923077, "grad_norm": 3.577237367630005, "learning_rate": 5.85261070720423e-07, "loss": 0.7883, "step": 1350 }, { "epoch": 0.4342948717948718, "grad_norm": 3.355888605117798, "learning_rate": 5.836087243886318e-07, "loss": 0.7451, "step": 1355 }, { "epoch": 0.4358974358974359, "grad_norm": 4.57732629776001, "learning_rate": 5.819563780568407e-07, "loss": 0.8938, "step": 1360 }, { "epoch": 0.4375, "grad_norm": 5.530088901519775, "learning_rate": 5.803040317250495e-07, "loss": 0.7397, "step": 1365 }, { "epoch": 0.4391025641025641, "grad_norm": 3.3376779556274414, "learning_rate": 5.786516853932584e-07, "loss": 0.7578, "step": 1370 }, { "epoch": 0.4407051282051282, "grad_norm": 3.726835012435913, "learning_rate": 5.769993390614673e-07, "loss": 0.787, "step": 1375 }, { "epoch": 0.4423076923076923, "grad_norm": 4.771599769592285, "learning_rate": 5.753469927296761e-07, "loss": 0.7629, "step": 1380 }, { "epoch": 0.4439102564102564, "grad_norm": 4.213784217834473, "learning_rate": 5.73694646397885e-07, "loss": 0.7171, "step": 1385 }, { "epoch": 0.44551282051282054, "grad_norm": 5.608395576477051, "learning_rate": 5.720423000660938e-07, "loss": 0.6273, "step": 1390 }, { "epoch": 0.44711538461538464, "grad_norm": 5.424095153808594, "learning_rate": 5.703899537343027e-07, "loss": 0.8648, "step": 1395 }, { "epoch": 0.44871794871794873, "grad_norm": 4.200117111206055, "learning_rate": 5.687376074025115e-07, "loss": 0.7668, "step": 1400 }, { "epoch": 0.45032051282051283, "grad_norm": 4.810688495635986, "learning_rate": 5.670852610707204e-07, "loss": 0.8691, "step": 1405 }, { "epoch": 0.4519230769230769, "grad_norm": 5.285038948059082, "learning_rate": 5.654329147389293e-07, "loss": 0.8094, "step": 1410 }, { "epoch": 0.453525641025641, "grad_norm": 2.9930169582366943, "learning_rate": 5.637805684071381e-07, "loss": 0.7669, "step": 1415 }, { "epoch": 0.4551282051282051, "grad_norm": 3.244771718978882, "learning_rate": 5.621282220753469e-07, "loss": 0.7827, "step": 1420 }, { "epoch": 0.4567307692307692, "grad_norm": 3.3853907585144043, "learning_rate": 5.604758757435558e-07, "loss": 0.7306, "step": 1425 }, { "epoch": 0.4583333333333333, "grad_norm": 3.7965517044067383, "learning_rate": 5.588235294117647e-07, "loss": 0.6429, "step": 1430 }, { "epoch": 0.4599358974358974, "grad_norm": 4.235316753387451, "learning_rate": 5.571711830799736e-07, "loss": 0.8967, "step": 1435 }, { "epoch": 0.46153846153846156, "grad_norm": 6.711025238037109, "learning_rate": 5.555188367481823e-07, "loss": 0.8444, "step": 1440 }, { "epoch": 0.46314102564102566, "grad_norm": 4.23643684387207, "learning_rate": 5.538664904163912e-07, "loss": 0.7664, "step": 1445 }, { "epoch": 0.46474358974358976, "grad_norm": 4.946862697601318, "learning_rate": 5.522141440846001e-07, "loss": 0.7987, "step": 1450 }, { "epoch": 0.46634615384615385, "grad_norm": 5.770292282104492, "learning_rate": 5.50561797752809e-07, "loss": 0.7844, "step": 1455 }, { "epoch": 0.46794871794871795, "grad_norm": 11.45702838897705, "learning_rate": 5.489094514210179e-07, "loss": 0.8573, "step": 1460 }, { "epoch": 0.46955128205128205, "grad_norm": 4.409577369689941, "learning_rate": 5.472571050892266e-07, "loss": 0.8122, "step": 1465 }, { "epoch": 0.47115384615384615, "grad_norm": 8.126829147338867, "learning_rate": 5.456047587574355e-07, "loss": 0.7616, "step": 1470 }, { "epoch": 0.47275641025641024, "grad_norm": 10.366379737854004, "learning_rate": 5.439524124256444e-07, "loss": 0.6492, "step": 1475 }, { "epoch": 0.47435897435897434, "grad_norm": 5.814599514007568, "learning_rate": 5.423000660938533e-07, "loss": 0.8335, "step": 1480 }, { "epoch": 0.47596153846153844, "grad_norm": 3.6713919639587402, "learning_rate": 5.406477197620621e-07, "loss": 0.7175, "step": 1485 }, { "epoch": 0.4775641025641026, "grad_norm": 4.473592758178711, "learning_rate": 5.389953734302709e-07, "loss": 0.7772, "step": 1490 }, { "epoch": 0.4791666666666667, "grad_norm": 5.191585540771484, "learning_rate": 5.373430270984798e-07, "loss": 0.8085, "step": 1495 }, { "epoch": 0.4807692307692308, "grad_norm": 4.686864376068115, "learning_rate": 5.356906807666887e-07, "loss": 0.7285, "step": 1500 }, { "epoch": 0.4823717948717949, "grad_norm": 6.236685276031494, "learning_rate": 5.340383344348976e-07, "loss": 0.8491, "step": 1505 }, { "epoch": 0.483974358974359, "grad_norm": 5.375248908996582, "learning_rate": 5.323859881031064e-07, "loss": 0.8212, "step": 1510 }, { "epoch": 0.4855769230769231, "grad_norm": 3.439789295196533, "learning_rate": 5.307336417713153e-07, "loss": 0.7619, "step": 1515 }, { "epoch": 0.48717948717948717, "grad_norm": 4.730751991271973, "learning_rate": 5.290812954395241e-07, "loss": 0.8577, "step": 1520 }, { "epoch": 0.48878205128205127, "grad_norm": 3.463454484939575, "learning_rate": 5.27428949107733e-07, "loss": 0.8779, "step": 1525 }, { "epoch": 0.49038461538461536, "grad_norm": 3.2680091857910156, "learning_rate": 5.257766027759418e-07, "loss": 0.7626, "step": 1530 }, { "epoch": 0.49198717948717946, "grad_norm": 4.192795753479004, "learning_rate": 5.241242564441507e-07, "loss": 0.9215, "step": 1535 }, { "epoch": 0.4935897435897436, "grad_norm": 8.984251976013184, "learning_rate": 5.224719101123596e-07, "loss": 0.7347, "step": 1540 }, { "epoch": 0.4951923076923077, "grad_norm": 5.889853477478027, "learning_rate": 5.208195637805684e-07, "loss": 0.8716, "step": 1545 }, { "epoch": 0.4967948717948718, "grad_norm": 6.937811851501465, "learning_rate": 5.191672174487772e-07, "loss": 0.778, "step": 1550 }, { "epoch": 0.4983974358974359, "grad_norm": 5.315396308898926, "learning_rate": 5.175148711169861e-07, "loss": 0.8273, "step": 1555 }, { "epoch": 0.5, "grad_norm": 4.183327674865723, "learning_rate": 5.15862524785195e-07, "loss": 0.8231, "step": 1560 }, { "epoch": 0.5016025641025641, "grad_norm": 4.254622459411621, "learning_rate": 5.142101784534039e-07, "loss": 0.6407, "step": 1565 }, { "epoch": 0.5032051282051282, "grad_norm": 4.547656059265137, "learning_rate": 5.125578321216127e-07, "loss": 0.7148, "step": 1570 }, { "epoch": 0.5048076923076923, "grad_norm": 5.993008613586426, "learning_rate": 5.109054857898215e-07, "loss": 0.7727, "step": 1575 }, { "epoch": 0.5064102564102564, "grad_norm": 3.685878276824951, "learning_rate": 5.092531394580304e-07, "loss": 0.8638, "step": 1580 }, { "epoch": 0.5080128205128205, "grad_norm": 4.194368839263916, "learning_rate": 5.076007931262393e-07, "loss": 0.819, "step": 1585 }, { "epoch": 0.5096153846153846, "grad_norm": 3.141991376876831, "learning_rate": 5.059484467944482e-07, "loss": 0.7831, "step": 1590 }, { "epoch": 0.5112179487179487, "grad_norm": 5.693704605102539, "learning_rate": 5.042961004626569e-07, "loss": 0.761, "step": 1595 }, { "epoch": 0.5128205128205128, "grad_norm": 2.8469674587249756, "learning_rate": 5.026437541308658e-07, "loss": 0.7729, "step": 1600 }, { "epoch": 0.5144230769230769, "grad_norm": 7.078847885131836, "learning_rate": 5.009914077990747e-07, "loss": 0.7102, "step": 1605 }, { "epoch": 0.5160256410256411, "grad_norm": 4.168100357055664, "learning_rate": 4.993390614672835e-07, "loss": 0.6727, "step": 1610 }, { "epoch": 0.5176282051282052, "grad_norm": 3.5356192588806152, "learning_rate": 4.976867151354923e-07, "loss": 0.8312, "step": 1615 }, { "epoch": 0.5192307692307693, "grad_norm": 3.7321722507476807, "learning_rate": 4.960343688037012e-07, "loss": 0.8265, "step": 1620 }, { "epoch": 0.5208333333333334, "grad_norm": 4.614173889160156, "learning_rate": 4.943820224719101e-07, "loss": 0.7464, "step": 1625 }, { "epoch": 0.5224358974358975, "grad_norm": 4.419942378997803, "learning_rate": 4.92729676140119e-07, "loss": 0.7683, "step": 1630 }, { "epoch": 0.5240384615384616, "grad_norm": 3.572216510772705, "learning_rate": 4.910773298083277e-07, "loss": 0.8283, "step": 1635 }, { "epoch": 0.5256410256410257, "grad_norm": 3.31060528755188, "learning_rate": 4.894249834765366e-07, "loss": 0.8539, "step": 1640 }, { "epoch": 0.5272435897435898, "grad_norm": 6.509139060974121, "learning_rate": 4.877726371447455e-07, "loss": 0.6647, "step": 1645 }, { "epoch": 0.5288461538461539, "grad_norm": 4.024603843688965, "learning_rate": 4.861202908129544e-07, "loss": 0.8066, "step": 1650 }, { "epoch": 0.530448717948718, "grad_norm": 3.655712604522705, "learning_rate": 4.844679444811633e-07, "loss": 0.7693, "step": 1655 }, { "epoch": 0.532051282051282, "grad_norm": 3.420959949493408, "learning_rate": 4.82815598149372e-07, "loss": 0.7867, "step": 1660 }, { "epoch": 0.5336538461538461, "grad_norm": 4.068134307861328, "learning_rate": 4.811632518175809e-07, "loss": 0.8251, "step": 1665 }, { "epoch": 0.5352564102564102, "grad_norm": 4.351796627044678, "learning_rate": 4.795109054857898e-07, "loss": 0.7103, "step": 1670 }, { "epoch": 0.5368589743589743, "grad_norm": 5.838902950286865, "learning_rate": 4.778585591539987e-07, "loss": 0.835, "step": 1675 }, { "epoch": 0.5384615384615384, "grad_norm": 4.233332633972168, "learning_rate": 4.762062128222075e-07, "loss": 0.8817, "step": 1680 }, { "epoch": 0.5400641025641025, "grad_norm": 4.291604042053223, "learning_rate": 4.745538664904163e-07, "loss": 0.6882, "step": 1685 }, { "epoch": 0.5416666666666666, "grad_norm": 3.780442714691162, "learning_rate": 4.729015201586252e-07, "loss": 0.6711, "step": 1690 }, { "epoch": 0.5432692307692307, "grad_norm": 3.0913641452789307, "learning_rate": 4.712491738268341e-07, "loss": 0.8828, "step": 1695 }, { "epoch": 0.5448717948717948, "grad_norm": 2.7122883796691895, "learning_rate": 4.695968274950429e-07, "loss": 0.6597, "step": 1700 }, { "epoch": 0.5464743589743589, "grad_norm": 3.7509615421295166, "learning_rate": 4.679444811632518e-07, "loss": 0.8139, "step": 1705 }, { "epoch": 0.5480769230769231, "grad_norm": 4.429205417633057, "learning_rate": 4.662921348314606e-07, "loss": 0.8188, "step": 1710 }, { "epoch": 0.5496794871794872, "grad_norm": 6.677905559539795, "learning_rate": 4.646397884996695e-07, "loss": 0.689, "step": 1715 }, { "epoch": 0.5512820512820513, "grad_norm": 10.779183387756348, "learning_rate": 4.6298744216787836e-07, "loss": 0.7602, "step": 1720 }, { "epoch": 0.5528846153846154, "grad_norm": 5.142210006713867, "learning_rate": 4.613350958360872e-07, "loss": 0.809, "step": 1725 }, { "epoch": 0.5544871794871795, "grad_norm": 4.238507270812988, "learning_rate": 4.5968274950429606e-07, "loss": 0.7306, "step": 1730 }, { "epoch": 0.5560897435897436, "grad_norm": 4.379620552062988, "learning_rate": 4.580304031725049e-07, "loss": 0.8622, "step": 1735 }, { "epoch": 0.5576923076923077, "grad_norm": 5.106377124786377, "learning_rate": 4.5637805684071377e-07, "loss": 0.8234, "step": 1740 }, { "epoch": 0.5592948717948718, "grad_norm": 4.431070327758789, "learning_rate": 4.5472571050892265e-07, "loss": 0.9141, "step": 1745 }, { "epoch": 0.5608974358974359, "grad_norm": 3.911802053451538, "learning_rate": 4.5307336417713147e-07, "loss": 0.783, "step": 1750 }, { "epoch": 0.5625, "grad_norm": 5.008035182952881, "learning_rate": 4.5142101784534035e-07, "loss": 0.8401, "step": 1755 }, { "epoch": 0.5641025641025641, "grad_norm": 8.659884452819824, "learning_rate": 4.4976867151354923e-07, "loss": 0.6399, "step": 1760 }, { "epoch": 0.5657051282051282, "grad_norm": 3.6218109130859375, "learning_rate": 4.4811632518175805e-07, "loss": 0.8017, "step": 1765 }, { "epoch": 0.5673076923076923, "grad_norm": 8.017809867858887, "learning_rate": 4.4646397884996693e-07, "loss": 0.9154, "step": 1770 }, { "epoch": 0.5689102564102564, "grad_norm": 4.872199535369873, "learning_rate": 4.4481163251817576e-07, "loss": 0.8139, "step": 1775 }, { "epoch": 0.5705128205128205, "grad_norm": 3.4777655601501465, "learning_rate": 4.4315928618638463e-07, "loss": 0.7942, "step": 1780 }, { "epoch": 0.5721153846153846, "grad_norm": 4.423591613769531, "learning_rate": 4.415069398545935e-07, "loss": 0.6299, "step": 1785 }, { "epoch": 0.5737179487179487, "grad_norm": 10.330599784851074, "learning_rate": 4.3985459352280234e-07, "loss": 0.849, "step": 1790 }, { "epoch": 0.5753205128205128, "grad_norm": 3.7085251808166504, "learning_rate": 4.382022471910112e-07, "loss": 0.7756, "step": 1795 }, { "epoch": 0.5769230769230769, "grad_norm": 4.038546562194824, "learning_rate": 4.3654990085922004e-07, "loss": 0.9406, "step": 1800 }, { "epoch": 0.5785256410256411, "grad_norm": 5.6599440574646, "learning_rate": 4.348975545274289e-07, "loss": 0.6654, "step": 1805 }, { "epoch": 0.5801282051282052, "grad_norm": 5.489417552947998, "learning_rate": 4.332452081956378e-07, "loss": 0.8556, "step": 1810 }, { "epoch": 0.5817307692307693, "grad_norm": 7.606975078582764, "learning_rate": 4.315928618638466e-07, "loss": 0.8567, "step": 1815 }, { "epoch": 0.5833333333333334, "grad_norm": 6.262397766113281, "learning_rate": 4.299405155320555e-07, "loss": 0.8868, "step": 1820 }, { "epoch": 0.5849358974358975, "grad_norm": 8.082782745361328, "learning_rate": 4.282881692002643e-07, "loss": 0.7651, "step": 1825 }, { "epoch": 0.5865384615384616, "grad_norm": 7.61177921295166, "learning_rate": 4.266358228684732e-07, "loss": 0.8139, "step": 1830 }, { "epoch": 0.5881410256410257, "grad_norm": 3.503220796585083, "learning_rate": 4.249834765366821e-07, "loss": 0.6752, "step": 1835 }, { "epoch": 0.5897435897435898, "grad_norm": 6.636229038238525, "learning_rate": 4.233311302048909e-07, "loss": 0.8225, "step": 1840 }, { "epoch": 0.5913461538461539, "grad_norm": 45.19087219238281, "learning_rate": 4.216787838730998e-07, "loss": 0.8639, "step": 1845 }, { "epoch": 0.592948717948718, "grad_norm": 12.816862106323242, "learning_rate": 4.200264375413086e-07, "loss": 0.9769, "step": 1850 }, { "epoch": 0.594551282051282, "grad_norm": 5.171041011810303, "learning_rate": 4.183740912095175e-07, "loss": 0.6907, "step": 1855 }, { "epoch": 0.5961538461538461, "grad_norm": 3.1392245292663574, "learning_rate": 4.1672174487772637e-07, "loss": 0.7235, "step": 1860 }, { "epoch": 0.5977564102564102, "grad_norm": 3.557652473449707, "learning_rate": 4.150693985459352e-07, "loss": 0.7241, "step": 1865 }, { "epoch": 0.5993589743589743, "grad_norm": 3.4919662475585938, "learning_rate": 4.1341705221414407e-07, "loss": 0.7947, "step": 1870 }, { "epoch": 0.6009615384615384, "grad_norm": 7.577988624572754, "learning_rate": 4.117647058823529e-07, "loss": 0.7991, "step": 1875 }, { "epoch": 0.6025641025641025, "grad_norm": 6.581418514251709, "learning_rate": 4.1011235955056177e-07, "loss": 0.7415, "step": 1880 }, { "epoch": 0.6041666666666666, "grad_norm": 5.872368335723877, "learning_rate": 4.0846001321877065e-07, "loss": 0.8145, "step": 1885 }, { "epoch": 0.6057692307692307, "grad_norm": 5.491688251495361, "learning_rate": 4.068076668869795e-07, "loss": 0.888, "step": 1890 }, { "epoch": 0.6073717948717948, "grad_norm": 6.849071025848389, "learning_rate": 4.0515532055518835e-07, "loss": 0.6781, "step": 1895 }, { "epoch": 0.6089743589743589, "grad_norm": 3.5489501953125, "learning_rate": 4.035029742233972e-07, "loss": 0.7944, "step": 1900 }, { "epoch": 0.6105769230769231, "grad_norm": 9.167459487915039, "learning_rate": 4.0185062789160606e-07, "loss": 0.7331, "step": 1905 }, { "epoch": 0.6121794871794872, "grad_norm": 2.9380276203155518, "learning_rate": 4.0019828155981494e-07, "loss": 0.7066, "step": 1910 }, { "epoch": 0.6137820512820513, "grad_norm": 3.069446325302124, "learning_rate": 3.9854593522802376e-07, "loss": 0.8597, "step": 1915 }, { "epoch": 0.6153846153846154, "grad_norm": 4.881730079650879, "learning_rate": 3.9689358889623264e-07, "loss": 0.8679, "step": 1920 }, { "epoch": 0.6169871794871795, "grad_norm": 7.921117305755615, "learning_rate": 3.9524124256444146e-07, "loss": 0.8195, "step": 1925 }, { "epoch": 0.6185897435897436, "grad_norm": 7.495361328125, "learning_rate": 3.9358889623265034e-07, "loss": 0.7521, "step": 1930 }, { "epoch": 0.6201923076923077, "grad_norm": 3.289283037185669, "learning_rate": 3.919365499008592e-07, "loss": 0.779, "step": 1935 }, { "epoch": 0.6217948717948718, "grad_norm": 4.523643970489502, "learning_rate": 3.9028420356906805e-07, "loss": 0.722, "step": 1940 }, { "epoch": 0.6233974358974359, "grad_norm": 4.16140079498291, "learning_rate": 3.886318572372769e-07, "loss": 0.7385, "step": 1945 }, { "epoch": 0.625, "grad_norm": 2.602611541748047, "learning_rate": 3.8697951090548575e-07, "loss": 0.7818, "step": 1950 }, { "epoch": 0.6266025641025641, "grad_norm": 5.022205352783203, "learning_rate": 3.8532716457369463e-07, "loss": 0.7388, "step": 1955 }, { "epoch": 0.6282051282051282, "grad_norm": 4.107226371765137, "learning_rate": 3.836748182419035e-07, "loss": 0.8531, "step": 1960 }, { "epoch": 0.6298076923076923, "grad_norm": 3.9306111335754395, "learning_rate": 3.8202247191011233e-07, "loss": 0.8112, "step": 1965 }, { "epoch": 0.6314102564102564, "grad_norm": 3.1901676654815674, "learning_rate": 3.803701255783212e-07, "loss": 0.7661, "step": 1970 }, { "epoch": 0.6330128205128205, "grad_norm": 5.7795820236206055, "learning_rate": 3.7871777924653003e-07, "loss": 0.7822, "step": 1975 }, { "epoch": 0.6346153846153846, "grad_norm": 4.990657806396484, "learning_rate": 3.770654329147389e-07, "loss": 0.6767, "step": 1980 }, { "epoch": 0.6362179487179487, "grad_norm": 3.1682956218719482, "learning_rate": 3.754130865829478e-07, "loss": 0.7403, "step": 1985 }, { "epoch": 0.6378205128205128, "grad_norm": 8.12835693359375, "learning_rate": 3.737607402511566e-07, "loss": 0.8295, "step": 1990 }, { "epoch": 0.6394230769230769, "grad_norm": 6.958061218261719, "learning_rate": 3.721083939193655e-07, "loss": 0.7853, "step": 1995 }, { "epoch": 0.6410256410256411, "grad_norm": 9.980351448059082, "learning_rate": 3.704560475875743e-07, "loss": 0.8413, "step": 2000 }, { "epoch": 0.6426282051282052, "grad_norm": 5.591805934906006, "learning_rate": 3.688037012557832e-07, "loss": 0.8321, "step": 2005 }, { "epoch": 0.6442307692307693, "grad_norm": 4.056339263916016, "learning_rate": 3.671513549239921e-07, "loss": 0.7223, "step": 2010 }, { "epoch": 0.6458333333333334, "grad_norm": 4.585841655731201, "learning_rate": 3.654990085922009e-07, "loss": 0.8602, "step": 2015 }, { "epoch": 0.6474358974358975, "grad_norm": 14.423575401306152, "learning_rate": 3.638466622604098e-07, "loss": 0.8337, "step": 2020 }, { "epoch": 0.6490384615384616, "grad_norm": 17.55698013305664, "learning_rate": 3.621943159286186e-07, "loss": 0.8524, "step": 2025 }, { "epoch": 0.6506410256410257, "grad_norm": 8.060038566589355, "learning_rate": 3.605419695968275e-07, "loss": 0.7047, "step": 2030 }, { "epoch": 0.6522435897435898, "grad_norm": 3.0732924938201904, "learning_rate": 3.5888962326503636e-07, "loss": 0.8203, "step": 2035 }, { "epoch": 0.6538461538461539, "grad_norm": 6.2294020652771, "learning_rate": 3.572372769332452e-07, "loss": 0.8524, "step": 2040 }, { "epoch": 0.655448717948718, "grad_norm": 5.603904724121094, "learning_rate": 3.5558493060145406e-07, "loss": 0.6366, "step": 2045 }, { "epoch": 0.657051282051282, "grad_norm": 3.684701442718506, "learning_rate": 3.539325842696629e-07, "loss": 0.7765, "step": 2050 }, { "epoch": 0.6586538461538461, "grad_norm": 6.113523483276367, "learning_rate": 3.5228023793787177e-07, "loss": 0.6858, "step": 2055 }, { "epoch": 0.6602564102564102, "grad_norm": 5.9543280601501465, "learning_rate": 3.5062789160608064e-07, "loss": 0.8639, "step": 2060 }, { "epoch": 0.6618589743589743, "grad_norm": 2.5266408920288086, "learning_rate": 3.4897554527428947e-07, "loss": 0.9136, "step": 2065 }, { "epoch": 0.6634615384615384, "grad_norm": 4.412357807159424, "learning_rate": 3.4732319894249835e-07, "loss": 0.8078, "step": 2070 }, { "epoch": 0.6650641025641025, "grad_norm": 3.709512948989868, "learning_rate": 3.4567085261070717e-07, "loss": 0.8443, "step": 2075 }, { "epoch": 0.6666666666666666, "grad_norm": 3.4022634029388428, "learning_rate": 3.4401850627891605e-07, "loss": 0.7546, "step": 2080 }, { "epoch": 0.6682692307692307, "grad_norm": 5.27069091796875, "learning_rate": 3.4236615994712493e-07, "loss": 0.8228, "step": 2085 }, { "epoch": 0.6698717948717948, "grad_norm": 3.136031150817871, "learning_rate": 3.4071381361533375e-07, "loss": 0.9051, "step": 2090 }, { "epoch": 0.6714743589743589, "grad_norm": 4.431833744049072, "learning_rate": 3.3906146728354263e-07, "loss": 0.8802, "step": 2095 }, { "epoch": 0.6730769230769231, "grad_norm": 4.416879653930664, "learning_rate": 3.3740912095175146e-07, "loss": 0.7876, "step": 2100 }, { "epoch": 0.6746794871794872, "grad_norm": 3.685245990753174, "learning_rate": 3.3575677461996034e-07, "loss": 0.744, "step": 2105 }, { "epoch": 0.6762820512820513, "grad_norm": 4.721916198730469, "learning_rate": 3.341044282881692e-07, "loss": 0.7867, "step": 2110 }, { "epoch": 0.6778846153846154, "grad_norm": 5.276561260223389, "learning_rate": 3.3245208195637804e-07, "loss": 0.8425, "step": 2115 }, { "epoch": 0.6794871794871795, "grad_norm": 6.171300888061523, "learning_rate": 3.307997356245869e-07, "loss": 0.742, "step": 2120 }, { "epoch": 0.6810897435897436, "grad_norm": 6.1108198165893555, "learning_rate": 3.2914738929279574e-07, "loss": 0.7814, "step": 2125 }, { "epoch": 0.6826923076923077, "grad_norm": 5.54103946685791, "learning_rate": 3.274950429610046e-07, "loss": 0.8899, "step": 2130 }, { "epoch": 0.6842948717948718, "grad_norm": 5.242672443389893, "learning_rate": 3.258426966292135e-07, "loss": 0.8232, "step": 2135 }, { "epoch": 0.6858974358974359, "grad_norm": 11.092650413513184, "learning_rate": 3.241903502974223e-07, "loss": 0.7744, "step": 2140 }, { "epoch": 0.6875, "grad_norm": 3.056320905685425, "learning_rate": 3.225380039656312e-07, "loss": 0.6096, "step": 2145 }, { "epoch": 0.6891025641025641, "grad_norm": 4.238087177276611, "learning_rate": 3.2088565763384003e-07, "loss": 0.7236, "step": 2150 }, { "epoch": 0.6907051282051282, "grad_norm": 3.4259557723999023, "learning_rate": 3.192333113020489e-07, "loss": 0.8002, "step": 2155 }, { "epoch": 0.6923076923076923, "grad_norm": 3.611785411834717, "learning_rate": 3.175809649702578e-07, "loss": 0.7647, "step": 2160 }, { "epoch": 0.6939102564102564, "grad_norm": 8.97962760925293, "learning_rate": 3.159286186384666e-07, "loss": 0.9061, "step": 2165 }, { "epoch": 0.6955128205128205, "grad_norm": 15.352239608764648, "learning_rate": 3.142762723066755e-07, "loss": 0.7211, "step": 2170 }, { "epoch": 0.6971153846153846, "grad_norm": 7.31290340423584, "learning_rate": 3.126239259748843e-07, "loss": 0.618, "step": 2175 }, { "epoch": 0.6987179487179487, "grad_norm": 4.665528297424316, "learning_rate": 3.109715796430932e-07, "loss": 0.8203, "step": 2180 }, { "epoch": 0.7003205128205128, "grad_norm": 17.2761287689209, "learning_rate": 3.0931923331130207e-07, "loss": 0.7578, "step": 2185 }, { "epoch": 0.7019230769230769, "grad_norm": 9.712289810180664, "learning_rate": 3.076668869795109e-07, "loss": 0.7531, "step": 2190 }, { "epoch": 0.7035256410256411, "grad_norm": 4.434769630432129, "learning_rate": 3.0601454064771977e-07, "loss": 0.7863, "step": 2195 }, { "epoch": 0.7051282051282052, "grad_norm": 3.8715121746063232, "learning_rate": 3.043621943159286e-07, "loss": 0.7247, "step": 2200 }, { "epoch": 0.7067307692307693, "grad_norm": 3.459235906600952, "learning_rate": 3.027098479841375e-07, "loss": 0.7149, "step": 2205 }, { "epoch": 0.7083333333333334, "grad_norm": 5.98268461227417, "learning_rate": 3.0105750165234635e-07, "loss": 0.8021, "step": 2210 }, { "epoch": 0.7099358974358975, "grad_norm": 6.481480121612549, "learning_rate": 2.994051553205552e-07, "loss": 0.8124, "step": 2215 }, { "epoch": 0.7115384615384616, "grad_norm": 5.063220500946045, "learning_rate": 2.9775280898876406e-07, "loss": 0.6746, "step": 2220 }, { "epoch": 0.7131410256410257, "grad_norm": 5.813882827758789, "learning_rate": 2.9610046265697293e-07, "loss": 0.8872, "step": 2225 }, { "epoch": 0.7147435897435898, "grad_norm": 7.330856800079346, "learning_rate": 2.9444811632518176e-07, "loss": 0.8496, "step": 2230 }, { "epoch": 0.7163461538461539, "grad_norm": 4.500095367431641, "learning_rate": 2.9279576999339064e-07, "loss": 0.8594, "step": 2235 }, { "epoch": 0.717948717948718, "grad_norm": 7.6699137687683105, "learning_rate": 2.9114342366159946e-07, "loss": 0.7005, "step": 2240 }, { "epoch": 0.719551282051282, "grad_norm": 3.332604169845581, "learning_rate": 2.8949107732980834e-07, "loss": 0.8011, "step": 2245 }, { "epoch": 0.7211538461538461, "grad_norm": 7.084466457366943, "learning_rate": 2.878387309980172e-07, "loss": 0.7555, "step": 2250 }, { "epoch": 0.7227564102564102, "grad_norm": 2.606405258178711, "learning_rate": 2.8618638466622604e-07, "loss": 0.8418, "step": 2255 }, { "epoch": 0.7243589743589743, "grad_norm": 5.162625312805176, "learning_rate": 2.845340383344349e-07, "loss": 0.7081, "step": 2260 }, { "epoch": 0.7259615384615384, "grad_norm": 6.1882758140563965, "learning_rate": 2.8288169200264375e-07, "loss": 0.7999, "step": 2265 }, { "epoch": 0.7275641025641025, "grad_norm": 3.4105043411254883, "learning_rate": 2.812293456708526e-07, "loss": 0.7354, "step": 2270 }, { "epoch": 0.7291666666666666, "grad_norm": 5.230040073394775, "learning_rate": 2.795769993390615e-07, "loss": 0.7022, "step": 2275 }, { "epoch": 0.7307692307692307, "grad_norm": 7.303884506225586, "learning_rate": 2.7792465300727033e-07, "loss": 0.8529, "step": 2280 }, { "epoch": 0.7323717948717948, "grad_norm": 4.611577987670898, "learning_rate": 2.762723066754792e-07, "loss": 0.8055, "step": 2285 }, { "epoch": 0.7339743589743589, "grad_norm": 3.8788657188415527, "learning_rate": 2.7461996034368803e-07, "loss": 0.7476, "step": 2290 }, { "epoch": 0.7355769230769231, "grad_norm": 7.592946529388428, "learning_rate": 2.729676140118969e-07, "loss": 0.8468, "step": 2295 }, { "epoch": 0.7371794871794872, "grad_norm": 12.41851806640625, "learning_rate": 2.713152676801058e-07, "loss": 0.8057, "step": 2300 }, { "epoch": 0.7387820512820513, "grad_norm": 3.9982833862304688, "learning_rate": 2.6966292134831456e-07, "loss": 0.8323, "step": 2305 }, { "epoch": 0.7403846153846154, "grad_norm": 4.3113813400268555, "learning_rate": 2.6801057501652344e-07, "loss": 0.7667, "step": 2310 }, { "epoch": 0.7419871794871795, "grad_norm": 6.139361381530762, "learning_rate": 2.6635822868473226e-07, "loss": 0.7687, "step": 2315 }, { "epoch": 0.7435897435897436, "grad_norm": 13.496137619018555, "learning_rate": 2.6470588235294114e-07, "loss": 0.7224, "step": 2320 }, { "epoch": 0.7451923076923077, "grad_norm": 7.981110095977783, "learning_rate": 2.6305353602115e-07, "loss": 0.8216, "step": 2325 }, { "epoch": 0.7467948717948718, "grad_norm": 6.703426361083984, "learning_rate": 2.6140118968935885e-07, "loss": 0.8239, "step": 2330 }, { "epoch": 0.7483974358974359, "grad_norm": 3.3382091522216797, "learning_rate": 2.597488433575677e-07, "loss": 0.8151, "step": 2335 }, { "epoch": 0.75, "grad_norm": 5.277767181396484, "learning_rate": 2.5809649702577655e-07, "loss": 0.778, "step": 2340 }, { "epoch": 0.7516025641025641, "grad_norm": 3.5990350246429443, "learning_rate": 2.5644415069398543e-07, "loss": 0.7541, "step": 2345 }, { "epoch": 0.7532051282051282, "grad_norm": 4.577154159545898, "learning_rate": 2.547918043621943e-07, "loss": 0.7555, "step": 2350 }, { "epoch": 0.7548076923076923, "grad_norm": 4.374950885772705, "learning_rate": 2.5313945803040313e-07, "loss": 0.8462, "step": 2355 }, { "epoch": 0.7564102564102564, "grad_norm": 8.507906913757324, "learning_rate": 2.51487111698612e-07, "loss": 0.7591, "step": 2360 }, { "epoch": 0.7580128205128205, "grad_norm": 4.493144512176514, "learning_rate": 2.498347653668209e-07, "loss": 0.7708, "step": 2365 }, { "epoch": 0.7596153846153846, "grad_norm": 4.824530124664307, "learning_rate": 2.481824190350297e-07, "loss": 0.7318, "step": 2370 }, { "epoch": 0.7612179487179487, "grad_norm": 4.022371292114258, "learning_rate": 2.465300727032386e-07, "loss": 0.7551, "step": 2375 }, { "epoch": 0.7628205128205128, "grad_norm": 3.5510129928588867, "learning_rate": 2.448777263714474e-07, "loss": 0.628, "step": 2380 }, { "epoch": 0.7644230769230769, "grad_norm": 6.095627307891846, "learning_rate": 2.432253800396563e-07, "loss": 0.725, "step": 2385 }, { "epoch": 0.7660256410256411, "grad_norm": 3.520016670227051, "learning_rate": 2.4157303370786517e-07, "loss": 0.7484, "step": 2390 }, { "epoch": 0.7676282051282052, "grad_norm": 6.656997203826904, "learning_rate": 2.39920687376074e-07, "loss": 0.7474, "step": 2395 }, { "epoch": 0.7692307692307693, "grad_norm": 4.5073370933532715, "learning_rate": 2.3826834104428288e-07, "loss": 0.6534, "step": 2400 }, { "epoch": 0.7708333333333334, "grad_norm": 5.180692195892334, "learning_rate": 2.3661599471249173e-07, "loss": 0.7398, "step": 2405 }, { "epoch": 0.7724358974358975, "grad_norm": 4.856165885925293, "learning_rate": 2.349636483807006e-07, "loss": 0.8658, "step": 2410 }, { "epoch": 0.7740384615384616, "grad_norm": 4.942265510559082, "learning_rate": 2.3331130204890946e-07, "loss": 0.8106, "step": 2415 }, { "epoch": 0.7756410256410257, "grad_norm": 4.896393775939941, "learning_rate": 2.316589557171183e-07, "loss": 0.7782, "step": 2420 }, { "epoch": 0.7772435897435898, "grad_norm": 4.911433696746826, "learning_rate": 2.3000660938532716e-07, "loss": 0.7034, "step": 2425 }, { "epoch": 0.7788461538461539, "grad_norm": 5.983463287353516, "learning_rate": 2.28354263053536e-07, "loss": 0.7062, "step": 2430 }, { "epoch": 0.780448717948718, "grad_norm": 5.0456414222717285, "learning_rate": 2.267019167217449e-07, "loss": 0.7615, "step": 2435 }, { "epoch": 0.782051282051282, "grad_norm": 4.779991149902344, "learning_rate": 2.2504957038995374e-07, "loss": 0.6795, "step": 2440 }, { "epoch": 0.7836538461538461, "grad_norm": 5.053199768066406, "learning_rate": 2.233972240581626e-07, "loss": 0.8048, "step": 2445 }, { "epoch": 0.7852564102564102, "grad_norm": 7.191258907318115, "learning_rate": 2.2174487772637144e-07, "loss": 0.8043, "step": 2450 }, { "epoch": 0.7868589743589743, "grad_norm": 3.500450611114502, "learning_rate": 2.2009253139458027e-07, "loss": 0.7147, "step": 2455 }, { "epoch": 0.7884615384615384, "grad_norm": 4.963442325592041, "learning_rate": 2.1844018506278917e-07, "loss": 0.7803, "step": 2460 }, { "epoch": 0.7900641025641025, "grad_norm": 4.3301777839660645, "learning_rate": 2.16787838730998e-07, "loss": 0.7901, "step": 2465 }, { "epoch": 0.7916666666666666, "grad_norm": 4.038059711456299, "learning_rate": 2.1513549239920685e-07, "loss": 0.6812, "step": 2470 }, { "epoch": 0.7932692307692307, "grad_norm": 5.824253559112549, "learning_rate": 2.134831460674157e-07, "loss": 0.7618, "step": 2475 }, { "epoch": 0.7948717948717948, "grad_norm": 5.034027099609375, "learning_rate": 2.1183079973562455e-07, "loss": 0.6987, "step": 2480 }, { "epoch": 0.7964743589743589, "grad_norm": 4.224520206451416, "learning_rate": 2.1017845340383343e-07, "loss": 0.8233, "step": 2485 }, { "epoch": 0.7980769230769231, "grad_norm": 4.304800033569336, "learning_rate": 2.0852610707204228e-07, "loss": 0.683, "step": 2490 }, { "epoch": 0.7996794871794872, "grad_norm": 6.027079105377197, "learning_rate": 2.0687376074025114e-07, "loss": 0.7514, "step": 2495 }, { "epoch": 0.8012820512820513, "grad_norm": 7.2774882316589355, "learning_rate": 2.0522141440846e-07, "loss": 0.8308, "step": 2500 }, { "epoch": 0.8028846153846154, "grad_norm": 7.033870220184326, "learning_rate": 2.0356906807666884e-07, "loss": 0.7758, "step": 2505 }, { "epoch": 0.8044871794871795, "grad_norm": 3.2256860733032227, "learning_rate": 2.0191672174487772e-07, "loss": 0.759, "step": 2510 }, { "epoch": 0.8060897435897436, "grad_norm": 7.072434425354004, "learning_rate": 2.0026437541308657e-07, "loss": 0.7686, "step": 2515 }, { "epoch": 0.8076923076923077, "grad_norm": 3.3247132301330566, "learning_rate": 1.9861202908129542e-07, "loss": 0.7644, "step": 2520 }, { "epoch": 0.8092948717948718, "grad_norm": 3.6884591579437256, "learning_rate": 1.9695968274950427e-07, "loss": 0.7558, "step": 2525 }, { "epoch": 0.8108974358974359, "grad_norm": 5.145435333251953, "learning_rate": 1.9530733641771312e-07, "loss": 0.7459, "step": 2530 }, { "epoch": 0.8125, "grad_norm": 4.134402751922607, "learning_rate": 1.93654990085922e-07, "loss": 0.8534, "step": 2535 }, { "epoch": 0.8141025641025641, "grad_norm": 3.347599744796753, "learning_rate": 1.9200264375413085e-07, "loss": 0.8417, "step": 2540 }, { "epoch": 0.8157051282051282, "grad_norm": 3.6410083770751953, "learning_rate": 1.903502974223397e-07, "loss": 0.8405, "step": 2545 }, { "epoch": 0.8173076923076923, "grad_norm": 3.344439744949341, "learning_rate": 1.8869795109054856e-07, "loss": 0.7426, "step": 2550 }, { "epoch": 0.8189102564102564, "grad_norm": 4.314718723297119, "learning_rate": 1.870456047587574e-07, "loss": 0.7995, "step": 2555 }, { "epoch": 0.8205128205128205, "grad_norm": 6.937241077423096, "learning_rate": 1.853932584269663e-07, "loss": 0.8192, "step": 2560 }, { "epoch": 0.8221153846153846, "grad_norm": 3.7095561027526855, "learning_rate": 1.8374091209517514e-07, "loss": 0.7015, "step": 2565 }, { "epoch": 0.8237179487179487, "grad_norm": 4.655959606170654, "learning_rate": 1.82088565763384e-07, "loss": 0.7462, "step": 2570 }, { "epoch": 0.8253205128205128, "grad_norm": 5.088621616363525, "learning_rate": 1.8043621943159284e-07, "loss": 0.7669, "step": 2575 }, { "epoch": 0.8269230769230769, "grad_norm": 5.979193210601807, "learning_rate": 1.7878387309980172e-07, "loss": 0.9233, "step": 2580 }, { "epoch": 0.8285256410256411, "grad_norm": 4.107568740844727, "learning_rate": 1.7713152676801057e-07, "loss": 0.868, "step": 2585 }, { "epoch": 0.8301282051282052, "grad_norm": 3.6633615493774414, "learning_rate": 1.7547918043621942e-07, "loss": 0.7795, "step": 2590 }, { "epoch": 0.8317307692307693, "grad_norm": 6.704728126525879, "learning_rate": 1.7382683410442828e-07, "loss": 0.714, "step": 2595 }, { "epoch": 0.8333333333333334, "grad_norm": 6.485088348388672, "learning_rate": 1.7217448777263713e-07, "loss": 0.6941, "step": 2600 }, { "epoch": 0.8349358974358975, "grad_norm": 5.1513566970825195, "learning_rate": 1.70522141440846e-07, "loss": 0.7235, "step": 2605 }, { "epoch": 0.8365384615384616, "grad_norm": 6.590970039367676, "learning_rate": 1.6886979510905486e-07, "loss": 0.7834, "step": 2610 }, { "epoch": 0.8381410256410257, "grad_norm": 3.539618730545044, "learning_rate": 1.672174487772637e-07, "loss": 0.7529, "step": 2615 }, { "epoch": 0.8397435897435898, "grad_norm": 5.671098709106445, "learning_rate": 1.6556510244547256e-07, "loss": 0.8103, "step": 2620 }, { "epoch": 0.8413461538461539, "grad_norm": 4.69738245010376, "learning_rate": 1.639127561136814e-07, "loss": 0.8686, "step": 2625 }, { "epoch": 0.842948717948718, "grad_norm": 4.465817451477051, "learning_rate": 1.622604097818903e-07, "loss": 0.7111, "step": 2630 }, { "epoch": 0.844551282051282, "grad_norm": 3.2771265506744385, "learning_rate": 1.6060806345009914e-07, "loss": 0.7292, "step": 2635 }, { "epoch": 0.8461538461538461, "grad_norm": 7.632739067077637, "learning_rate": 1.58955717118308e-07, "loss": 0.786, "step": 2640 }, { "epoch": 0.8477564102564102, "grad_norm": 4.397324085235596, "learning_rate": 1.5730337078651685e-07, "loss": 0.8378, "step": 2645 }, { "epoch": 0.8493589743589743, "grad_norm": 3.7814230918884277, "learning_rate": 1.556510244547257e-07, "loss": 0.7088, "step": 2650 }, { "epoch": 0.8509615384615384, "grad_norm": 3.752884864807129, "learning_rate": 1.5399867812293457e-07, "loss": 0.8107, "step": 2655 }, { "epoch": 0.8525641025641025, "grad_norm": 3.5255517959594727, "learning_rate": 1.5234633179114343e-07, "loss": 0.7736, "step": 2660 }, { "epoch": 0.8541666666666666, "grad_norm": 3.8665730953216553, "learning_rate": 1.5069398545935228e-07, "loss": 0.687, "step": 2665 }, { "epoch": 0.8557692307692307, "grad_norm": 4.789595127105713, "learning_rate": 1.4904163912756113e-07, "loss": 0.718, "step": 2670 }, { "epoch": 0.8573717948717948, "grad_norm": 3.834465265274048, "learning_rate": 1.4738929279576998e-07, "loss": 0.8931, "step": 2675 }, { "epoch": 0.8589743589743589, "grad_norm": 7.070734977722168, "learning_rate": 1.4573694646397886e-07, "loss": 0.7071, "step": 2680 }, { "epoch": 0.8605769230769231, "grad_norm": 2.9893038272857666, "learning_rate": 1.440846001321877e-07, "loss": 0.6394, "step": 2685 }, { "epoch": 0.8621794871794872, "grad_norm": 5.302039623260498, "learning_rate": 1.4243225380039656e-07, "loss": 0.7887, "step": 2690 }, { "epoch": 0.8637820512820513, "grad_norm": 2.9799692630767822, "learning_rate": 1.4077990746860541e-07, "loss": 0.851, "step": 2695 }, { "epoch": 0.8653846153846154, "grad_norm": 4.219221115112305, "learning_rate": 1.3912756113681427e-07, "loss": 0.7475, "step": 2700 }, { "epoch": 0.8669871794871795, "grad_norm": 3.4256138801574707, "learning_rate": 1.3747521480502314e-07, "loss": 0.806, "step": 2705 }, { "epoch": 0.8685897435897436, "grad_norm": 5.021873474121094, "learning_rate": 1.35822868473232e-07, "loss": 0.7701, "step": 2710 }, { "epoch": 0.8701923076923077, "grad_norm": 4.23788595199585, "learning_rate": 1.3417052214144085e-07, "loss": 0.7122, "step": 2715 }, { "epoch": 0.8717948717948718, "grad_norm": 4.850051403045654, "learning_rate": 1.325181758096497e-07, "loss": 0.6861, "step": 2720 }, { "epoch": 0.8733974358974359, "grad_norm": 5.633250713348389, "learning_rate": 1.3086582947785855e-07, "loss": 0.7611, "step": 2725 }, { "epoch": 0.875, "grad_norm": 7.429291248321533, "learning_rate": 1.2921348314606743e-07, "loss": 0.6987, "step": 2730 }, { "epoch": 0.8766025641025641, "grad_norm": 6.112792491912842, "learning_rate": 1.2756113681427628e-07, "loss": 0.7675, "step": 2735 }, { "epoch": 0.8782051282051282, "grad_norm": 9.730607986450195, "learning_rate": 1.2590879048248513e-07, "loss": 0.7429, "step": 2740 }, { "epoch": 0.8798076923076923, "grad_norm": 3.8280539512634277, "learning_rate": 1.2425644415069398e-07, "loss": 0.7058, "step": 2745 }, { "epoch": 0.8814102564102564, "grad_norm": 5.016750812530518, "learning_rate": 1.2260409781890284e-07, "loss": 0.6931, "step": 2750 }, { "epoch": 0.8830128205128205, "grad_norm": 3.027902603149414, "learning_rate": 1.209517514871117e-07, "loss": 0.744, "step": 2755 }, { "epoch": 0.8846153846153846, "grad_norm": 5.9112629890441895, "learning_rate": 1.1929940515532057e-07, "loss": 0.7883, "step": 2760 }, { "epoch": 0.8862179487179487, "grad_norm": 3.9589760303497314, "learning_rate": 1.176470588235294e-07, "loss": 0.6751, "step": 2765 }, { "epoch": 0.8878205128205128, "grad_norm": 12.412994384765625, "learning_rate": 1.1599471249173827e-07, "loss": 0.741, "step": 2770 }, { "epoch": 0.8894230769230769, "grad_norm": 6.313468933105469, "learning_rate": 1.1434236615994712e-07, "loss": 0.7083, "step": 2775 }, { "epoch": 0.8910256410256411, "grad_norm": 3.4576292037963867, "learning_rate": 1.1269001982815597e-07, "loss": 0.6078, "step": 2780 }, { "epoch": 0.8926282051282052, "grad_norm": 3.770681142807007, "learning_rate": 1.1103767349636484e-07, "loss": 0.7976, "step": 2785 }, { "epoch": 0.8942307692307693, "grad_norm": 4.323639392852783, "learning_rate": 1.0938532716457369e-07, "loss": 0.726, "step": 2790 }, { "epoch": 0.8958333333333334, "grad_norm": 6.223001480102539, "learning_rate": 1.0773298083278255e-07, "loss": 0.7428, "step": 2795 }, { "epoch": 0.8974358974358975, "grad_norm": 4.867865085601807, "learning_rate": 1.060806345009914e-07, "loss": 0.747, "step": 2800 }, { "epoch": 0.8990384615384616, "grad_norm": 4.22167444229126, "learning_rate": 1.0442828816920026e-07, "loss": 0.7824, "step": 2805 }, { "epoch": 0.9006410256410257, "grad_norm": 3.4794094562530518, "learning_rate": 1.0277594183740912e-07, "loss": 0.7904, "step": 2810 }, { "epoch": 0.9022435897435898, "grad_norm": 3.968479633331299, "learning_rate": 1.0112359550561797e-07, "loss": 0.8853, "step": 2815 }, { "epoch": 0.9038461538461539, "grad_norm": 3.1891181468963623, "learning_rate": 9.947124917382684e-08, "loss": 0.7753, "step": 2820 }, { "epoch": 0.905448717948718, "grad_norm": 4.9156646728515625, "learning_rate": 9.781890284203569e-08, "loss": 0.7521, "step": 2825 }, { "epoch": 0.907051282051282, "grad_norm": 4.938701152801514, "learning_rate": 9.616655651024454e-08, "loss": 0.7361, "step": 2830 }, { "epoch": 0.9086538461538461, "grad_norm": 4.312582492828369, "learning_rate": 9.451421017845341e-08, "loss": 0.7044, "step": 2835 }, { "epoch": 0.9102564102564102, "grad_norm": 7.3174519538879395, "learning_rate": 9.286186384666226e-08, "loss": 0.7778, "step": 2840 }, { "epoch": 0.9118589743589743, "grad_norm": 8.664481163024902, "learning_rate": 9.120951751487112e-08, "loss": 0.8317, "step": 2845 }, { "epoch": 0.9134615384615384, "grad_norm": 8.050248146057129, "learning_rate": 8.955717118307998e-08, "loss": 0.7777, "step": 2850 }, { "epoch": 0.9150641025641025, "grad_norm": 6.539444446563721, "learning_rate": 8.790482485128881e-08, "loss": 0.8357, "step": 2855 }, { "epoch": 0.9166666666666666, "grad_norm": 6.118063449859619, "learning_rate": 8.625247851949768e-08, "loss": 0.6746, "step": 2860 }, { "epoch": 0.9182692307692307, "grad_norm": 4.888671398162842, "learning_rate": 8.460013218770653e-08, "loss": 0.7677, "step": 2865 }, { "epoch": 0.9198717948717948, "grad_norm": 5.636521816253662, "learning_rate": 8.29477858559154e-08, "loss": 0.7025, "step": 2870 }, { "epoch": 0.9214743589743589, "grad_norm": 3.849520683288574, "learning_rate": 8.129543952412425e-08, "loss": 0.7187, "step": 2875 }, { "epoch": 0.9230769230769231, "grad_norm": 5.312481880187988, "learning_rate": 7.964309319233311e-08, "loss": 0.669, "step": 2880 }, { "epoch": 0.9246794871794872, "grad_norm": 6.7007527351379395, "learning_rate": 7.799074686054196e-08, "loss": 0.7571, "step": 2885 }, { "epoch": 0.9262820512820513, "grad_norm": 5.961256980895996, "learning_rate": 7.633840052875081e-08, "loss": 0.733, "step": 2890 }, { "epoch": 0.9278846153846154, "grad_norm": 8.099090576171875, "learning_rate": 7.468605419695968e-08, "loss": 0.8415, "step": 2895 }, { "epoch": 0.9294871794871795, "grad_norm": 3.7094759941101074, "learning_rate": 7.303370786516853e-08, "loss": 0.9158, "step": 2900 }, { "epoch": 0.9310897435897436, "grad_norm": 7.212512016296387, "learning_rate": 7.13813615333774e-08, "loss": 0.815, "step": 2905 }, { "epoch": 0.9326923076923077, "grad_norm": 5.013028144836426, "learning_rate": 6.972901520158625e-08, "loss": 0.7161, "step": 2910 }, { "epoch": 0.9342948717948718, "grad_norm": 5.3960041999816895, "learning_rate": 6.80766688697951e-08, "loss": 0.7817, "step": 2915 }, { "epoch": 0.9358974358974359, "grad_norm": 3.4956471920013428, "learning_rate": 6.642432253800396e-08, "loss": 0.8383, "step": 2920 }, { "epoch": 0.9375, "grad_norm": 3.654330253601074, "learning_rate": 6.477197620621282e-08, "loss": 0.8125, "step": 2925 }, { "epoch": 0.9391025641025641, "grad_norm": 6.255533695220947, "learning_rate": 6.311962987442168e-08, "loss": 0.7734, "step": 2930 }, { "epoch": 0.9407051282051282, "grad_norm": 4.802107810974121, "learning_rate": 6.146728354263053e-08, "loss": 0.709, "step": 2935 }, { "epoch": 0.9423076923076923, "grad_norm": 6.442443370819092, "learning_rate": 5.981493721083938e-08, "loss": 0.7668, "step": 2940 }, { "epoch": 0.9439102564102564, "grad_norm": 3.025623083114624, "learning_rate": 5.816259087904825e-08, "loss": 0.7797, "step": 2945 }, { "epoch": 0.9455128205128205, "grad_norm": 4.99326753616333, "learning_rate": 5.65102445472571e-08, "loss": 0.7969, "step": 2950 }, { "epoch": 0.9471153846153846, "grad_norm": 8.48199462890625, "learning_rate": 5.485789821546596e-08, "loss": 0.7861, "step": 2955 }, { "epoch": 0.9487179487179487, "grad_norm": 4.070643901824951, "learning_rate": 5.320555188367482e-08, "loss": 0.9045, "step": 2960 }, { "epoch": 0.9503205128205128, "grad_norm": 4.508942127227783, "learning_rate": 5.1553205551883676e-08, "loss": 0.806, "step": 2965 }, { "epoch": 0.9519230769230769, "grad_norm": 5.224105358123779, "learning_rate": 4.9900859220092534e-08, "loss": 0.7537, "step": 2970 }, { "epoch": 0.9535256410256411, "grad_norm": 5.267168998718262, "learning_rate": 4.8248512888301386e-08, "loss": 0.7458, "step": 2975 }, { "epoch": 0.9551282051282052, "grad_norm": 14.058978080749512, "learning_rate": 4.659616655651024e-08, "loss": 0.8491, "step": 2980 }, { "epoch": 0.9567307692307693, "grad_norm": 7.71165657043457, "learning_rate": 4.4943820224719096e-08, "loss": 0.7255, "step": 2985 }, { "epoch": 0.9583333333333334, "grad_norm": 3.65620493888855, "learning_rate": 4.3291473892927954e-08, "loss": 0.7917, "step": 2990 }, { "epoch": 0.9599358974358975, "grad_norm": 11.238397598266602, "learning_rate": 4.163912756113681e-08, "loss": 0.7828, "step": 2995 }, { "epoch": 0.9615384615384616, "grad_norm": 6.159839630126953, "learning_rate": 3.998678122934567e-08, "loss": 0.7724, "step": 3000 }, { "epoch": 0.9631410256410257, "grad_norm": 4.247456073760986, "learning_rate": 3.833443489755452e-08, "loss": 0.7635, "step": 3005 }, { "epoch": 0.9647435897435898, "grad_norm": 5.236011505126953, "learning_rate": 3.668208856576338e-08, "loss": 0.7782, "step": 3010 }, { "epoch": 0.9663461538461539, "grad_norm": 4.830688953399658, "learning_rate": 3.502974223397224e-08, "loss": 0.7962, "step": 3015 }, { "epoch": 0.967948717948718, "grad_norm": 6.072144508361816, "learning_rate": 3.33773959021811e-08, "loss": 0.9383, "step": 3020 }, { "epoch": 0.969551282051282, "grad_norm": 3.7657108306884766, "learning_rate": 3.1725049570389955e-08, "loss": 0.9029, "step": 3025 }, { "epoch": 0.9711538461538461, "grad_norm": 5.47902774810791, "learning_rate": 3.007270323859881e-08, "loss": 0.8262, "step": 3030 }, { "epoch": 0.9727564102564102, "grad_norm": 4.847268104553223, "learning_rate": 2.8420356906807665e-08, "loss": 0.8054, "step": 3035 }, { "epoch": 0.9743589743589743, "grad_norm": 6.062643527984619, "learning_rate": 2.676801057501652e-08, "loss": 0.7808, "step": 3040 }, { "epoch": 0.9759615384615384, "grad_norm": 5.440711498260498, "learning_rate": 2.511566424322538e-08, "loss": 0.8026, "step": 3045 }, { "epoch": 0.9775641025641025, "grad_norm": 2.9105708599090576, "learning_rate": 2.3463317911434237e-08, "loss": 0.67, "step": 3050 }, { "epoch": 0.9791666666666666, "grad_norm": 5.284862518310547, "learning_rate": 2.1810971579643092e-08, "loss": 0.7455, "step": 3055 }, { "epoch": 0.9807692307692307, "grad_norm": 3.7022602558135986, "learning_rate": 2.015862524785195e-08, "loss": 0.7627, "step": 3060 }, { "epoch": 0.9823717948717948, "grad_norm": 7.428618907928467, "learning_rate": 1.850627891606081e-08, "loss": 0.7116, "step": 3065 }, { "epoch": 0.9839743589743589, "grad_norm": 6.064960956573486, "learning_rate": 1.685393258426966e-08, "loss": 0.8331, "step": 3070 }, { "epoch": 0.9855769230769231, "grad_norm": 6.40654182434082, "learning_rate": 1.520158625247852e-08, "loss": 0.7827, "step": 3075 }, { "epoch": 0.9871794871794872, "grad_norm": 4.364375114440918, "learning_rate": 1.3549239920687375e-08, "loss": 0.8266, "step": 3080 }, { "epoch": 0.9887820512820513, "grad_norm": 6.127290725708008, "learning_rate": 1.1896893588896232e-08, "loss": 0.7636, "step": 3085 }, { "epoch": 0.9903846153846154, "grad_norm": 2.9324896335601807, "learning_rate": 1.0244547257105088e-08, "loss": 0.6256, "step": 3090 }, { "epoch": 0.9919871794871795, "grad_norm": 3.2810983657836914, "learning_rate": 8.592200925313947e-09, "loss": 0.7826, "step": 3095 }, { "epoch": 0.9935897435897436, "grad_norm": 5.652727127075195, "learning_rate": 6.939854593522802e-09, "loss": 0.6604, "step": 3100 }, { "epoch": 0.9951923076923077, "grad_norm": 3.927150011062622, "learning_rate": 5.287508261731658e-09, "loss": 0.7523, "step": 3105 }, { "epoch": 0.9967948717948718, "grad_norm": 4.154155731201172, "learning_rate": 3.6351619299405156e-09, "loss": 0.7649, "step": 3110 }, { "epoch": 0.9983974358974359, "grad_norm": 6.479323387145996, "learning_rate": 1.9828155981493722e-09, "loss": 0.731, "step": 3115 }, { "epoch": 1.0, "grad_norm": 3.997898817062378, "learning_rate": 3.3046926635822863e-10, "loss": 0.8839, "step": 3120 }, { "epoch": 1.0, "step": 3120, "total_flos": 9.05641071889875e+17, "train_loss": 0.8058771748573352, "train_runtime": 7172.3863, "train_samples_per_second": 6.959, "train_steps_per_second": 0.435 } ], "logging_steps": 5, "max_steps": 3120, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.05641071889875e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }