{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9101047191622467, "eval_steps": 500, "global_step": 1673, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005439956480348157, "grad_norm": 3.38836669921875, "learning_rate": 1.0000000000000002e-06, "loss": 1.5701, "step": 1 }, { "epoch": 0.0010879912960696314, "grad_norm": 5.072265625, "learning_rate": 2.0000000000000003e-06, "loss": 1.8077, "step": 2 }, { "epoch": 0.0016319869441044472, "grad_norm": 6.873793125152588, "learning_rate": 3e-06, "loss": 2.0557, "step": 3 }, { "epoch": 0.002175982592139263, "grad_norm": 10.95473861694336, "learning_rate": 4.000000000000001e-06, "loss": 2.3217, "step": 4 }, { "epoch": 0.0027199782401740786, "grad_norm": 16.71257972717285, "learning_rate": 5e-06, "loss": 2.6151, "step": 5 }, { "epoch": 0.0032639738882088943, "grad_norm": 13.412471771240234, "learning_rate": 6e-06, "loss": 2.6208, "step": 6 }, { "epoch": 0.00380796953624371, "grad_norm": 12.718871116638184, "learning_rate": 7.000000000000001e-06, "loss": 2.3968, "step": 7 }, { "epoch": 0.004351965184278526, "grad_norm": 10.58668041229248, "learning_rate": 8.000000000000001e-06, "loss": 2.347, "step": 8 }, { "epoch": 0.004895960832313341, "grad_norm": 9.376850128173828, "learning_rate": 9e-06, "loss": 2.1666, "step": 9 }, { "epoch": 0.005439956480348157, "grad_norm": 9.83341121673584, "learning_rate": 1e-05, "loss": 2.187, "step": 10 }, { "epoch": 0.0059839521283829725, "grad_norm": 10.35899829864502, "learning_rate": 1.1000000000000001e-05, "loss": 2.3889, "step": 11 }, { "epoch": 0.006527947776417789, "grad_norm": 9.567532539367676, "learning_rate": 1.2e-05, "loss": 2.1716, "step": 12 }, { "epoch": 0.007071943424452604, "grad_norm": 10.39463996887207, "learning_rate": 1.3000000000000001e-05, "loss": 2.0629, "step": 13 }, { "epoch": 0.00761593907248742, "grad_norm": 10.951495170593262, "learning_rate": 1.4000000000000001e-05, "loss": 2.1067, "step": 14 }, { "epoch": 0.008159934720522236, "grad_norm": 10.39360237121582, "learning_rate": 1.5e-05, "loss": 2.1045, "step": 15 }, { "epoch": 0.008703930368557052, "grad_norm": 8.992118835449219, "learning_rate": 1.6000000000000003e-05, "loss": 1.8881, "step": 16 }, { "epoch": 0.009247926016591867, "grad_norm": 12.166930198669434, "learning_rate": 1.7000000000000003e-05, "loss": 1.904, "step": 17 }, { "epoch": 0.009791921664626682, "grad_norm": 11.167236328125, "learning_rate": 1.8e-05, "loss": 1.7453, "step": 18 }, { "epoch": 0.0103359173126615, "grad_norm": 10.210038185119629, "learning_rate": 1.9e-05, "loss": 1.6291, "step": 19 }, { "epoch": 0.010879912960696314, "grad_norm": 9.85534381866455, "learning_rate": 2e-05, "loss": 1.6658, "step": 20 }, { "epoch": 0.01142390860873113, "grad_norm": 8.728784561157227, "learning_rate": 2.1e-05, "loss": 1.7798, "step": 21 }, { "epoch": 0.011967904256765945, "grad_norm": 8.889214515686035, "learning_rate": 2.2000000000000003e-05, "loss": 1.6524, "step": 22 }, { "epoch": 0.012511899904800762, "grad_norm": 8.525418281555176, "learning_rate": 2.3000000000000003e-05, "loss": 1.4172, "step": 23 }, { "epoch": 0.013055895552835577, "grad_norm": 7.264047145843506, "learning_rate": 2.4e-05, "loss": 1.3035, "step": 24 }, { "epoch": 0.013599891200870393, "grad_norm": 7.642018795013428, "learning_rate": 2.5e-05, "loss": 1.3627, "step": 25 }, { "epoch": 0.014143886848905208, "grad_norm": 6.121570587158203, "learning_rate": 2.6000000000000002e-05, "loss": 1.4722, "step": 26 }, { "epoch": 0.014687882496940025, "grad_norm": 5.287453651428223, "learning_rate": 2.7000000000000002e-05, "loss": 1.2153, "step": 27 }, { "epoch": 0.01523187814497484, "grad_norm": 6.000918865203857, "learning_rate": 2.8000000000000003e-05, "loss": 1.2598, "step": 28 }, { "epoch": 0.015775873793009657, "grad_norm": 4.720274448394775, "learning_rate": 2.9e-05, "loss": 1.0974, "step": 29 }, { "epoch": 0.016319869441044473, "grad_norm": 4.979583263397217, "learning_rate": 3e-05, "loss": 1.0875, "step": 30 }, { "epoch": 0.016863865089079288, "grad_norm": 5.400400161743164, "learning_rate": 3.1e-05, "loss": 1.1341, "step": 31 }, { "epoch": 0.017407860737114103, "grad_norm": 4.670845985412598, "learning_rate": 3.2000000000000005e-05, "loss": 1.0387, "step": 32 }, { "epoch": 0.01795185638514892, "grad_norm": 6.725168704986572, "learning_rate": 3.3e-05, "loss": 1.1592, "step": 33 }, { "epoch": 0.018495852033183734, "grad_norm": 5.518282890319824, "learning_rate": 3.4000000000000007e-05, "loss": 0.9551, "step": 34 }, { "epoch": 0.01903984768121855, "grad_norm": 6.766972541809082, "learning_rate": 3.5e-05, "loss": 1.106, "step": 35 }, { "epoch": 0.019583843329253364, "grad_norm": 5.7482829093933105, "learning_rate": 3.6e-05, "loss": 0.9587, "step": 36 }, { "epoch": 0.020127838977288183, "grad_norm": 6.030055046081543, "learning_rate": 3.7e-05, "loss": 0.9725, "step": 37 }, { "epoch": 0.020671834625323, "grad_norm": 5.512350559234619, "learning_rate": 3.8e-05, "loss": 1.0616, "step": 38 }, { "epoch": 0.021215830273357814, "grad_norm": 6.611831188201904, "learning_rate": 3.9000000000000006e-05, "loss": 1.0294, "step": 39 }, { "epoch": 0.02175982592139263, "grad_norm": 6.287083625793457, "learning_rate": 4e-05, "loss": 1.0733, "step": 40 }, { "epoch": 0.022303821569427444, "grad_norm": 6.098505973815918, "learning_rate": 4.1e-05, "loss": 0.9451, "step": 41 }, { "epoch": 0.02284781721746226, "grad_norm": 5.412561893463135, "learning_rate": 4.2e-05, "loss": 0.9901, "step": 42 }, { "epoch": 0.023391812865497075, "grad_norm": 4.6482319831848145, "learning_rate": 4.3e-05, "loss": 0.9832, "step": 43 }, { "epoch": 0.02393580851353189, "grad_norm": 4.945935249328613, "learning_rate": 4.4000000000000006e-05, "loss": 0.9122, "step": 44 }, { "epoch": 0.02447980416156671, "grad_norm": 5.810155391693115, "learning_rate": 4.5e-05, "loss": 1.0048, "step": 45 }, { "epoch": 0.025023799809601524, "grad_norm": 6.7664408683776855, "learning_rate": 4.600000000000001e-05, "loss": 0.9801, "step": 46 }, { "epoch": 0.02556779545763634, "grad_norm": 5.613837242126465, "learning_rate": 4.7e-05, "loss": 0.8579, "step": 47 }, { "epoch": 0.026111791105671155, "grad_norm": 5.115418434143066, "learning_rate": 4.8e-05, "loss": 0.954, "step": 48 }, { "epoch": 0.02665578675370597, "grad_norm": 4.536484718322754, "learning_rate": 4.9e-05, "loss": 0.8319, "step": 49 }, { "epoch": 0.027199782401740785, "grad_norm": 5.139177322387695, "learning_rate": 5e-05, "loss": 0.7448, "step": 50 }, { "epoch": 0.0277437780497756, "grad_norm": 5.259976863861084, "learning_rate": 5.1000000000000006e-05, "loss": 1.7348, "step": 51 }, { "epoch": 0.028287773697810416, "grad_norm": 6.83056640625, "learning_rate": 5.2000000000000004e-05, "loss": 1.6731, "step": 52 }, { "epoch": 0.028831769345845235, "grad_norm": 4.867329120635986, "learning_rate": 5.300000000000001e-05, "loss": 1.4409, "step": 53 }, { "epoch": 0.02937576499388005, "grad_norm": 3.623828649520874, "learning_rate": 5.4000000000000005e-05, "loss": 1.3544, "step": 54 }, { "epoch": 0.029919760641914865, "grad_norm": 4.515971660614014, "learning_rate": 5.500000000000001e-05, "loss": 1.3325, "step": 55 }, { "epoch": 0.03046375628994968, "grad_norm": 5.825387001037598, "learning_rate": 5.6000000000000006e-05, "loss": 1.283, "step": 56 }, { "epoch": 0.031007751937984496, "grad_norm": 4.172207832336426, "learning_rate": 5.6999999999999996e-05, "loss": 1.6582, "step": 57 }, { "epoch": 0.031551747586019314, "grad_norm": 4.117741584777832, "learning_rate": 5.8e-05, "loss": 1.4677, "step": 58 }, { "epoch": 0.032095743234054126, "grad_norm": 3.4425783157348633, "learning_rate": 5.9e-05, "loss": 1.36, "step": 59 }, { "epoch": 0.032639738882088945, "grad_norm": 3.7454018592834473, "learning_rate": 6e-05, "loss": 1.2685, "step": 60 }, { "epoch": 0.03318373453012376, "grad_norm": 3.344118118286133, "learning_rate": 6.1e-05, "loss": 1.3097, "step": 61 }, { "epoch": 0.033727730178158576, "grad_norm": 4.348125457763672, "learning_rate": 6.2e-05, "loss": 1.3251, "step": 62 }, { "epoch": 0.03427172582619339, "grad_norm": 4.010519027709961, "learning_rate": 6.3e-05, "loss": 1.3863, "step": 63 }, { "epoch": 0.034815721474228206, "grad_norm": 3.8831474781036377, "learning_rate": 6.400000000000001e-05, "loss": 1.1983, "step": 64 }, { "epoch": 0.035359717122263025, "grad_norm": 3.6556529998779297, "learning_rate": 6.500000000000001e-05, "loss": 0.964, "step": 65 }, { "epoch": 0.03590371277029784, "grad_norm": 4.303621292114258, "learning_rate": 6.6e-05, "loss": 1.1377, "step": 66 }, { "epoch": 0.036447708418332656, "grad_norm": 3.6041345596313477, "learning_rate": 6.7e-05, "loss": 0.9164, "step": 67 }, { "epoch": 0.03699170406636747, "grad_norm": 3.7285428047180176, "learning_rate": 6.800000000000001e-05, "loss": 1.0987, "step": 68 }, { "epoch": 0.037535699714402286, "grad_norm": 4.511111736297607, "learning_rate": 6.9e-05, "loss": 1.029, "step": 69 }, { "epoch": 0.0380796953624371, "grad_norm": 4.622512340545654, "learning_rate": 7e-05, "loss": 1.0785, "step": 70 }, { "epoch": 0.03862369101047192, "grad_norm": 4.719239711761475, "learning_rate": 7.1e-05, "loss": 1.1572, "step": 71 }, { "epoch": 0.03916768665850673, "grad_norm": 3.7064616680145264, "learning_rate": 7.2e-05, "loss": 1.0928, "step": 72 }, { "epoch": 0.03971168230654155, "grad_norm": 3.3874435424804688, "learning_rate": 7.3e-05, "loss": 0.8967, "step": 73 }, { "epoch": 0.040255677954576366, "grad_norm": 4.353206634521484, "learning_rate": 7.4e-05, "loss": 0.9987, "step": 74 }, { "epoch": 0.04079967360261118, "grad_norm": 3.937009572982788, "learning_rate": 7.500000000000001e-05, "loss": 0.8808, "step": 75 }, { "epoch": 0.041343669250646, "grad_norm": 4.458731174468994, "learning_rate": 7.6e-05, "loss": 1.0192, "step": 76 }, { "epoch": 0.04188766489868081, "grad_norm": 4.187011241912842, "learning_rate": 7.7e-05, "loss": 1.062, "step": 77 }, { "epoch": 0.04243166054671563, "grad_norm": 4.091516971588135, "learning_rate": 7.800000000000001e-05, "loss": 0.8806, "step": 78 }, { "epoch": 0.04297565619475044, "grad_norm": 4.072109222412109, "learning_rate": 7.900000000000001e-05, "loss": 0.8327, "step": 79 }, { "epoch": 0.04351965184278526, "grad_norm": 4.590778350830078, "learning_rate": 8e-05, "loss": 0.9602, "step": 80 }, { "epoch": 0.044063647490820076, "grad_norm": 4.060493469238281, "learning_rate": 8.1e-05, "loss": 0.6523, "step": 81 }, { "epoch": 0.04460764313885489, "grad_norm": 4.273815631866455, "learning_rate": 8.2e-05, "loss": 0.8426, "step": 82 }, { "epoch": 0.04515163878688971, "grad_norm": 3.6501169204711914, "learning_rate": 8.3e-05, "loss": 0.8794, "step": 83 }, { "epoch": 0.04569563443492452, "grad_norm": 5.391927242279053, "learning_rate": 8.4e-05, "loss": 1.0774, "step": 84 }, { "epoch": 0.04623963008295934, "grad_norm": 7.685778617858887, "learning_rate": 8.5e-05, "loss": 1.0909, "step": 85 }, { "epoch": 0.04678362573099415, "grad_norm": 4.7363386154174805, "learning_rate": 8.6e-05, "loss": 1.0342, "step": 86 }, { "epoch": 0.04732762137902897, "grad_norm": 3.9276182651519775, "learning_rate": 8.7e-05, "loss": 0.7593, "step": 87 }, { "epoch": 0.04787161702706378, "grad_norm": 5.698952674865723, "learning_rate": 8.800000000000001e-05, "loss": 1.059, "step": 88 }, { "epoch": 0.0484156126750986, "grad_norm": 4.903502941131592, "learning_rate": 8.900000000000001e-05, "loss": 0.8328, "step": 89 }, { "epoch": 0.04895960832313342, "grad_norm": 4.954896926879883, "learning_rate": 9e-05, "loss": 0.7982, "step": 90 }, { "epoch": 0.04950360397116823, "grad_norm": 5.2430009841918945, "learning_rate": 9.1e-05, "loss": 0.8603, "step": 91 }, { "epoch": 0.05004759961920305, "grad_norm": 5.98779821395874, "learning_rate": 9.200000000000001e-05, "loss": 0.9211, "step": 92 }, { "epoch": 0.05059159526723786, "grad_norm": 5.324619770050049, "learning_rate": 9.300000000000001e-05, "loss": 0.909, "step": 93 }, { "epoch": 0.05113559091527268, "grad_norm": 5.485590934753418, "learning_rate": 9.4e-05, "loss": 0.9821, "step": 94 }, { "epoch": 0.05167958656330749, "grad_norm": 4.731006622314453, "learning_rate": 9.5e-05, "loss": 0.9084, "step": 95 }, { "epoch": 0.05222358221134231, "grad_norm": 4.7967047691345215, "learning_rate": 9.6e-05, "loss": 0.8122, "step": 96 }, { "epoch": 0.05276757785937713, "grad_norm": 3.980215311050415, "learning_rate": 9.7e-05, "loss": 0.7282, "step": 97 }, { "epoch": 0.05331157350741194, "grad_norm": 5.7652411460876465, "learning_rate": 9.8e-05, "loss": 0.8433, "step": 98 }, { "epoch": 0.05385556915544676, "grad_norm": 5.613971710205078, "learning_rate": 9.900000000000001e-05, "loss": 0.7737, "step": 99 }, { "epoch": 0.05439956480348157, "grad_norm": 8.238423347473145, "learning_rate": 0.0001, "loss": 0.6204, "step": 100 }, { "epoch": 0.05494356045151639, "grad_norm": 7.8958740234375, "learning_rate": 9.999991831541789e-05, "loss": 1.5829, "step": 101 }, { "epoch": 0.0554875560995512, "grad_norm": 8.566459655761719, "learning_rate": 9.999967326193847e-05, "loss": 1.4051, "step": 102 }, { "epoch": 0.05603155174758602, "grad_norm": 5.071866989135742, "learning_rate": 9.999926484036237e-05, "loss": 1.4436, "step": 103 }, { "epoch": 0.05657554739562083, "grad_norm": 3.6401336193084717, "learning_rate": 9.999869305202412e-05, "loss": 1.3108, "step": 104 }, { "epoch": 0.05711954304365565, "grad_norm": 5.350152492523193, "learning_rate": 9.999795789879196e-05, "loss": 1.2944, "step": 105 }, { "epoch": 0.05766353869169047, "grad_norm": 4.014231204986572, "learning_rate": 9.999705938306789e-05, "loss": 1.2456, "step": 106 }, { "epoch": 0.05820753433972528, "grad_norm": 4.270825386047363, "learning_rate": 9.999599750778772e-05, "loss": 1.0673, "step": 107 }, { "epoch": 0.0587515299877601, "grad_norm": 2.8886313438415527, "learning_rate": 9.999477227642103e-05, "loss": 1.1549, "step": 108 }, { "epoch": 0.05929552563579491, "grad_norm": 3.0361084938049316, "learning_rate": 9.999338369297106e-05, "loss": 1.1786, "step": 109 }, { "epoch": 0.05983952128382973, "grad_norm": 3.420243978500366, "learning_rate": 9.999183176197491e-05, "loss": 0.913, "step": 110 }, { "epoch": 0.06038351693186454, "grad_norm": 3.284538745880127, "learning_rate": 9.999011648850329e-05, "loss": 0.9328, "step": 111 }, { "epoch": 0.06092751257989936, "grad_norm": 3.4964373111724854, "learning_rate": 9.998823787816066e-05, "loss": 0.9557, "step": 112 }, { "epoch": 0.06147150822793418, "grad_norm": 4.4130730628967285, "learning_rate": 9.998619593708518e-05, "loss": 1.1965, "step": 113 }, { "epoch": 0.06201550387596899, "grad_norm": 5.17526912689209, "learning_rate": 9.998399067194864e-05, "loss": 1.0819, "step": 114 }, { "epoch": 0.0625594995240038, "grad_norm": 3.8703389167785645, "learning_rate": 9.99816220899565e-05, "loss": 1.1341, "step": 115 }, { "epoch": 0.06310349517203863, "grad_norm": 3.6758615970611572, "learning_rate": 9.997909019884781e-05, "loss": 0.9225, "step": 116 }, { "epoch": 0.06364749082007344, "grad_norm": 3.864262580871582, "learning_rate": 9.997639500689523e-05, "loss": 0.9298, "step": 117 }, { "epoch": 0.06419148646810825, "grad_norm": 3.3097100257873535, "learning_rate": 9.9973536522905e-05, "loss": 0.8581, "step": 118 }, { "epoch": 0.06473548211614306, "grad_norm": 3.399921417236328, "learning_rate": 9.997051475621687e-05, "loss": 0.656, "step": 119 }, { "epoch": 0.06527947776417789, "grad_norm": 6.201872825622559, "learning_rate": 9.996732971670408e-05, "loss": 0.9634, "step": 120 }, { "epoch": 0.0658234734122127, "grad_norm": 5.1146159172058105, "learning_rate": 9.996398141477344e-05, "loss": 0.7987, "step": 121 }, { "epoch": 0.06636746906024751, "grad_norm": 3.8747925758361816, "learning_rate": 9.996046986136509e-05, "loss": 0.6058, "step": 122 }, { "epoch": 0.06691146470828234, "grad_norm": 4.072218418121338, "learning_rate": 9.995679506795264e-05, "loss": 0.6199, "step": 123 }, { "epoch": 0.06745546035631715, "grad_norm": 3.835451602935791, "learning_rate": 9.995295704654304e-05, "loss": 0.6719, "step": 124 }, { "epoch": 0.06799945600435196, "grad_norm": 3.933903932571411, "learning_rate": 9.994895580967658e-05, "loss": 0.7532, "step": 125 }, { "epoch": 0.06854345165238677, "grad_norm": 4.284427165985107, "learning_rate": 9.994479137042683e-05, "loss": 0.965, "step": 126 }, { "epoch": 0.0690874473004216, "grad_norm": 4.310452938079834, "learning_rate": 9.994046374240062e-05, "loss": 0.6384, "step": 127 }, { "epoch": 0.06963144294845641, "grad_norm": 4.394256591796875, "learning_rate": 9.993597293973796e-05, "loss": 0.6912, "step": 128 }, { "epoch": 0.07017543859649122, "grad_norm": 4.5574049949646, "learning_rate": 9.993131897711202e-05, "loss": 0.7561, "step": 129 }, { "epoch": 0.07071943424452605, "grad_norm": 3.37497615814209, "learning_rate": 9.992650186972909e-05, "loss": 0.6864, "step": 130 }, { "epoch": 0.07126342989256086, "grad_norm": 3.805138111114502, "learning_rate": 9.99215216333285e-05, "loss": 0.6454, "step": 131 }, { "epoch": 0.07180742554059567, "grad_norm": 4.915141582489014, "learning_rate": 9.99163782841826e-05, "loss": 0.8278, "step": 132 }, { "epoch": 0.07235142118863049, "grad_norm": 3.5271239280700684, "learning_rate": 9.991107183909664e-05, "loss": 0.6025, "step": 133 }, { "epoch": 0.07289541683666531, "grad_norm": 5.5649943351745605, "learning_rate": 9.990560231540889e-05, "loss": 0.98, "step": 134 }, { "epoch": 0.07343941248470012, "grad_norm": 3.6706109046936035, "learning_rate": 9.989996973099032e-05, "loss": 0.4958, "step": 135 }, { "epoch": 0.07398340813273493, "grad_norm": 4.406979084014893, "learning_rate": 9.989417410424475e-05, "loss": 0.8303, "step": 136 }, { "epoch": 0.07452740378076976, "grad_norm": 4.29072904586792, "learning_rate": 9.988821545410874e-05, "loss": 0.6656, "step": 137 }, { "epoch": 0.07507139942880457, "grad_norm": 3.8733510971069336, "learning_rate": 9.988209380005144e-05, "loss": 0.8013, "step": 138 }, { "epoch": 0.07561539507683938, "grad_norm": 3.9693851470947266, "learning_rate": 9.987580916207468e-05, "loss": 0.6557, "step": 139 }, { "epoch": 0.0761593907248742, "grad_norm": 3.9533190727233887, "learning_rate": 9.986936156071278e-05, "loss": 0.7801, "step": 140 }, { "epoch": 0.07670338637290902, "grad_norm": 5.204038619995117, "learning_rate": 9.98627510170325e-05, "loss": 0.6994, "step": 141 }, { "epoch": 0.07724738202094383, "grad_norm": 3.741140842437744, "learning_rate": 9.985597755263302e-05, "loss": 0.6959, "step": 142 }, { "epoch": 0.07779137766897865, "grad_norm": 3.9401040077209473, "learning_rate": 9.984904118964588e-05, "loss": 0.7518, "step": 143 }, { "epoch": 0.07833537331701346, "grad_norm": 3.843142032623291, "learning_rate": 9.98419419507348e-05, "loss": 0.7302, "step": 144 }, { "epoch": 0.07887936896504828, "grad_norm": 3.4802918434143066, "learning_rate": 9.983467985909573e-05, "loss": 0.539, "step": 145 }, { "epoch": 0.0794233646130831, "grad_norm": 3.8871712684631348, "learning_rate": 9.98272549384567e-05, "loss": 0.64, "step": 146 }, { "epoch": 0.0799673602611179, "grad_norm": 4.858451843261719, "learning_rate": 9.981966721307778e-05, "loss": 0.6278, "step": 147 }, { "epoch": 0.08051135590915273, "grad_norm": 6.457993984222412, "learning_rate": 9.981191670775097e-05, "loss": 0.828, "step": 148 }, { "epoch": 0.08105535155718754, "grad_norm": 5.394967079162598, "learning_rate": 9.980400344780015e-05, "loss": 0.6283, "step": 149 }, { "epoch": 0.08159934720522236, "grad_norm": 4.716073036193848, "learning_rate": 9.979592745908095e-05, "loss": 0.5237, "step": 150 }, { "epoch": 0.08214334285325717, "grad_norm": 8.803768157958984, "learning_rate": 9.978768876798075e-05, "loss": 1.6453, "step": 151 }, { "epoch": 0.082687338501292, "grad_norm": 8.073860168457031, "learning_rate": 9.977928740141851e-05, "loss": 1.2131, "step": 152 }, { "epoch": 0.0832313341493268, "grad_norm": 5.193158149719238, "learning_rate": 9.977072338684469e-05, "loss": 1.1683, "step": 153 }, { "epoch": 0.08377532979736162, "grad_norm": 3.6848511695861816, "learning_rate": 9.976199675224123e-05, "loss": 1.1919, "step": 154 }, { "epoch": 0.08431932544539644, "grad_norm": 6.1418352127075195, "learning_rate": 9.975310752612137e-05, "loss": 1.0148, "step": 155 }, { "epoch": 0.08486332109343125, "grad_norm": 5.173342227935791, "learning_rate": 9.974405573752965e-05, "loss": 1.1601, "step": 156 }, { "epoch": 0.08540731674146607, "grad_norm": 5.259501934051514, "learning_rate": 9.97348414160417e-05, "loss": 1.2669, "step": 157 }, { "epoch": 0.08595131238950088, "grad_norm": 5.553162097930908, "learning_rate": 9.972546459176425e-05, "loss": 1.0207, "step": 158 }, { "epoch": 0.0864953080375357, "grad_norm": 3.6189613342285156, "learning_rate": 9.9715925295335e-05, "loss": 1.077, "step": 159 }, { "epoch": 0.08703930368557052, "grad_norm": 3.863940715789795, "learning_rate": 9.970622355792247e-05, "loss": 1.0444, "step": 160 }, { "epoch": 0.08758329933360533, "grad_norm": 3.6799099445343018, "learning_rate": 9.969635941122595e-05, "loss": 1.1076, "step": 161 }, { "epoch": 0.08812729498164015, "grad_norm": 3.5157275199890137, "learning_rate": 9.968633288747539e-05, "loss": 0.9906, "step": 162 }, { "epoch": 0.08867129062967496, "grad_norm": 3.7690622806549072, "learning_rate": 9.96761440194313e-05, "loss": 0.8519, "step": 163 }, { "epoch": 0.08921528627770978, "grad_norm": 3.7737107276916504, "learning_rate": 9.96657928403846e-05, "loss": 0.93, "step": 164 }, { "epoch": 0.08975928192574459, "grad_norm": 4.396078586578369, "learning_rate": 9.965527938415655e-05, "loss": 0.7957, "step": 165 }, { "epoch": 0.09030327757377941, "grad_norm": 3.6368908882141113, "learning_rate": 9.964460368509867e-05, "loss": 0.7628, "step": 166 }, { "epoch": 0.09084727322181423, "grad_norm": 3.999582529067993, "learning_rate": 9.963376577809256e-05, "loss": 1.0093, "step": 167 }, { "epoch": 0.09139126886984904, "grad_norm": 3.631183385848999, "learning_rate": 9.962276569854977e-05, "loss": 0.812, "step": 168 }, { "epoch": 0.09193526451788386, "grad_norm": 3.597949504852295, "learning_rate": 9.961160348241185e-05, "loss": 0.8612, "step": 169 }, { "epoch": 0.09247926016591868, "grad_norm": 3.922459602355957, "learning_rate": 9.960027916614998e-05, "loss": 0.6447, "step": 170 }, { "epoch": 0.09302325581395349, "grad_norm": 4.093118190765381, "learning_rate": 9.958879278676506e-05, "loss": 0.771, "step": 171 }, { "epoch": 0.0935672514619883, "grad_norm": 4.338310241699219, "learning_rate": 9.95771443817875e-05, "loss": 0.7756, "step": 172 }, { "epoch": 0.09411124711002312, "grad_norm": 4.6187214851379395, "learning_rate": 9.95653339892771e-05, "loss": 0.6301, "step": 173 }, { "epoch": 0.09465524275805794, "grad_norm": 5.765655994415283, "learning_rate": 9.955336164782292e-05, "loss": 0.8923, "step": 174 }, { "epoch": 0.09519923840609275, "grad_norm": 3.763749361038208, "learning_rate": 9.95412273965432e-05, "loss": 0.7223, "step": 175 }, { "epoch": 0.09574323405412756, "grad_norm": 5.41685676574707, "learning_rate": 9.952893127508522e-05, "loss": 0.7741, "step": 176 }, { "epoch": 0.09628722970216239, "grad_norm": 4.086045265197754, "learning_rate": 9.95164733236251e-05, "loss": 0.82, "step": 177 }, { "epoch": 0.0968312253501972, "grad_norm": 3.8068294525146484, "learning_rate": 9.950385358286772e-05, "loss": 0.7578, "step": 178 }, { "epoch": 0.09737522099823201, "grad_norm": 3.4807732105255127, "learning_rate": 9.949107209404665e-05, "loss": 0.7299, "step": 179 }, { "epoch": 0.09791921664626684, "grad_norm": 3.770779848098755, "learning_rate": 9.947812889892387e-05, "loss": 0.4788, "step": 180 }, { "epoch": 0.09846321229430165, "grad_norm": 3.3061537742614746, "learning_rate": 9.94650240397898e-05, "loss": 0.5554, "step": 181 }, { "epoch": 0.09900720794233646, "grad_norm": 3.435459613800049, "learning_rate": 9.9451757559463e-05, "loss": 0.5934, "step": 182 }, { "epoch": 0.09955120359037127, "grad_norm": 4.068090915679932, "learning_rate": 9.943832950129018e-05, "loss": 0.6486, "step": 183 }, { "epoch": 0.1000951992384061, "grad_norm": 3.6461687088012695, "learning_rate": 9.942473990914593e-05, "loss": 0.5539, "step": 184 }, { "epoch": 0.10063919488644091, "grad_norm": 3.375304698944092, "learning_rate": 9.941098882743267e-05, "loss": 0.5907, "step": 185 }, { "epoch": 0.10118319053447572, "grad_norm": 4.3684000968933105, "learning_rate": 9.939707630108044e-05, "loss": 0.4971, "step": 186 }, { "epoch": 0.10172718618251055, "grad_norm": 4.59498405456543, "learning_rate": 9.93830023755468e-05, "loss": 0.783, "step": 187 }, { "epoch": 0.10227118183054536, "grad_norm": 4.35309362411499, "learning_rate": 9.936876709681668e-05, "loss": 0.8445, "step": 188 }, { "epoch": 0.10281517747858017, "grad_norm": 4.092062473297119, "learning_rate": 9.935437051140216e-05, "loss": 0.6617, "step": 189 }, { "epoch": 0.10335917312661498, "grad_norm": 4.912999629974365, "learning_rate": 9.933981266634243e-05, "loss": 0.7095, "step": 190 }, { "epoch": 0.1039031687746498, "grad_norm": 3.889961004257202, "learning_rate": 9.932509360920353e-05, "loss": 0.5883, "step": 191 }, { "epoch": 0.10444716442268462, "grad_norm": 3.8380653858184814, "learning_rate": 9.931021338807828e-05, "loss": 0.7511, "step": 192 }, { "epoch": 0.10499116007071943, "grad_norm": 3.8490562438964844, "learning_rate": 9.929517205158605e-05, "loss": 0.753, "step": 193 }, { "epoch": 0.10553515571875426, "grad_norm": 4.2623114585876465, "learning_rate": 9.927996964887265e-05, "loss": 0.7111, "step": 194 }, { "epoch": 0.10607915136678907, "grad_norm": 3.584996223449707, "learning_rate": 9.926460622961016e-05, "loss": 0.5562, "step": 195 }, { "epoch": 0.10662314701482388, "grad_norm": 4.3349151611328125, "learning_rate": 9.924908184399677e-05, "loss": 0.7465, "step": 196 }, { "epoch": 0.10716714266285869, "grad_norm": 5.700389862060547, "learning_rate": 9.92333965427566e-05, "loss": 0.6043, "step": 197 }, { "epoch": 0.10771113831089352, "grad_norm": 3.9633548259735107, "learning_rate": 9.921755037713952e-05, "loss": 0.4735, "step": 198 }, { "epoch": 0.10825513395892833, "grad_norm": 4.987261772155762, "learning_rate": 9.920154339892104e-05, "loss": 0.5381, "step": 199 }, { "epoch": 0.10879912960696314, "grad_norm": 5.922599792480469, "learning_rate": 9.91853756604021e-05, "loss": 0.662, "step": 200 }, { "epoch": 0.10934312525499797, "grad_norm": 9.180115699768066, "learning_rate": 9.916904721440887e-05, "loss": 1.4626, "step": 201 }, { "epoch": 0.10988712090303278, "grad_norm": 11.197164535522461, "learning_rate": 9.915255811429267e-05, "loss": 1.1606, "step": 202 }, { "epoch": 0.11043111655106759, "grad_norm": 6.968974590301514, "learning_rate": 9.91359084139297e-05, "loss": 1.0573, "step": 203 }, { "epoch": 0.1109751121991024, "grad_norm": 5.653061866760254, "learning_rate": 9.911909816772091e-05, "loss": 1.0411, "step": 204 }, { "epoch": 0.11151910784713723, "grad_norm": 3.8857879638671875, "learning_rate": 9.910212743059182e-05, "loss": 1.0978, "step": 205 }, { "epoch": 0.11206310349517204, "grad_norm": 4.828501224517822, "learning_rate": 9.908499625799235e-05, "loss": 1.151, "step": 206 }, { "epoch": 0.11260709914320685, "grad_norm": 4.59937858581543, "learning_rate": 9.906770470589657e-05, "loss": 0.8705, "step": 207 }, { "epoch": 0.11315109479124166, "grad_norm": 3.9703354835510254, "learning_rate": 9.905025283080265e-05, "loss": 1.0188, "step": 208 }, { "epoch": 0.11369509043927649, "grad_norm": 4.520991325378418, "learning_rate": 9.903264068973252e-05, "loss": 1.1154, "step": 209 }, { "epoch": 0.1142390860873113, "grad_norm": 2.8581595420837402, "learning_rate": 9.901486834023182e-05, "loss": 0.8804, "step": 210 }, { "epoch": 0.11478308173534611, "grad_norm": 2.818852186203003, "learning_rate": 9.899693584036959e-05, "loss": 0.6256, "step": 211 }, { "epoch": 0.11532707738338094, "grad_norm": 2.7698662281036377, "learning_rate": 9.897884324873824e-05, "loss": 0.748, "step": 212 }, { "epoch": 0.11587107303141575, "grad_norm": 3.437847375869751, "learning_rate": 9.896059062445315e-05, "loss": 0.5411, "step": 213 }, { "epoch": 0.11641506867945056, "grad_norm": 3.3071494102478027, "learning_rate": 9.894217802715266e-05, "loss": 0.6827, "step": 214 }, { "epoch": 0.11695906432748537, "grad_norm": 5.194654941558838, "learning_rate": 9.892360551699779e-05, "loss": 0.8476, "step": 215 }, { "epoch": 0.1175030599755202, "grad_norm": 4.239700794219971, "learning_rate": 9.890487315467205e-05, "loss": 0.7344, "step": 216 }, { "epoch": 0.11804705562355501, "grad_norm": 3.279238224029541, "learning_rate": 9.888598100138123e-05, "loss": 0.6435, "step": 217 }, { "epoch": 0.11859105127158982, "grad_norm": 3.0847952365875244, "learning_rate": 9.886692911885322e-05, "loss": 0.6646, "step": 218 }, { "epoch": 0.11913504691962465, "grad_norm": 3.883422613143921, "learning_rate": 9.884771756933788e-05, "loss": 0.8261, "step": 219 }, { "epoch": 0.11967904256765946, "grad_norm": 4.688355922698975, "learning_rate": 9.882834641560666e-05, "loss": 0.865, "step": 220 }, { "epoch": 0.12022303821569427, "grad_norm": 3.9643490314483643, "learning_rate": 9.880881572095256e-05, "loss": 0.7875, "step": 221 }, { "epoch": 0.12076703386372908, "grad_norm": 3.2995986938476562, "learning_rate": 9.878912554918982e-05, "loss": 0.6232, "step": 222 }, { "epoch": 0.12131102951176391, "grad_norm": 4.972283363342285, "learning_rate": 9.876927596465381e-05, "loss": 0.642, "step": 223 }, { "epoch": 0.12185502515979872, "grad_norm": 3.9055376052856445, "learning_rate": 9.874926703220073e-05, "loss": 0.6841, "step": 224 }, { "epoch": 0.12239902080783353, "grad_norm": 3.449025869369507, "learning_rate": 9.872909881720741e-05, "loss": 0.6034, "step": 225 }, { "epoch": 0.12294301645586836, "grad_norm": 3.7777464389801025, "learning_rate": 9.870877138557116e-05, "loss": 0.7417, "step": 226 }, { "epoch": 0.12348701210390317, "grad_norm": 3.878636121749878, "learning_rate": 9.868828480370948e-05, "loss": 0.9144, "step": 227 }, { "epoch": 0.12403100775193798, "grad_norm": 3.266227960586548, "learning_rate": 9.866763913855988e-05, "loss": 0.5052, "step": 228 }, { "epoch": 0.1245750033999728, "grad_norm": 3.630361557006836, "learning_rate": 9.864683445757966e-05, "loss": 0.5823, "step": 229 }, { "epoch": 0.1251189990480076, "grad_norm": 5.809257984161377, "learning_rate": 9.86258708287457e-05, "loss": 0.7234, "step": 230 }, { "epoch": 0.12566299469604242, "grad_norm": 3.636202573776245, "learning_rate": 9.860474832055421e-05, "loss": 0.5697, "step": 231 }, { "epoch": 0.12620699034407726, "grad_norm": 4.1308698654174805, "learning_rate": 9.85834670020205e-05, "loss": 0.5481, "step": 232 }, { "epoch": 0.12675098599211207, "grad_norm": 3.994006395339966, "learning_rate": 9.856202694267882e-05, "loss": 0.7726, "step": 233 }, { "epoch": 0.12729498164014688, "grad_norm": 3.168381929397583, "learning_rate": 9.854042821258205e-05, "loss": 0.6186, "step": 234 }, { "epoch": 0.1278389772881817, "grad_norm": 3.7775866985321045, "learning_rate": 9.851867088230152e-05, "loss": 0.6819, "step": 235 }, { "epoch": 0.1283829729362165, "grad_norm": 5.084011077880859, "learning_rate": 9.849675502292676e-05, "loss": 0.7595, "step": 236 }, { "epoch": 0.12892696858425132, "grad_norm": 4.404250621795654, "learning_rate": 9.847468070606529e-05, "loss": 0.4957, "step": 237 }, { "epoch": 0.12947096423228613, "grad_norm": 4.471977710723877, "learning_rate": 9.845244800384237e-05, "loss": 0.8609, "step": 238 }, { "epoch": 0.13001495988032097, "grad_norm": 4.33115816116333, "learning_rate": 9.843005698890076e-05, "loss": 0.8729, "step": 239 }, { "epoch": 0.13055895552835578, "grad_norm": 4.372913837432861, "learning_rate": 9.840750773440046e-05, "loss": 0.7003, "step": 240 }, { "epoch": 0.1311029511763906, "grad_norm": 13.594120979309082, "learning_rate": 9.838480031401856e-05, "loss": 0.4909, "step": 241 }, { "epoch": 0.1316469468244254, "grad_norm": 3.739894151687622, "learning_rate": 9.83619348019489e-05, "loss": 0.5736, "step": 242 }, { "epoch": 0.13219094247246022, "grad_norm": 3.5895159244537354, "learning_rate": 9.833891127290187e-05, "loss": 0.6542, "step": 243 }, { "epoch": 0.13273493812049503, "grad_norm": 3.4815824031829834, "learning_rate": 9.831572980210413e-05, "loss": 0.5842, "step": 244 }, { "epoch": 0.13327893376852984, "grad_norm": 4.5859503746032715, "learning_rate": 9.82923904652985e-05, "loss": 0.5462, "step": 245 }, { "epoch": 0.13382292941656468, "grad_norm": 4.3524394035339355, "learning_rate": 9.826889333874348e-05, "loss": 0.7556, "step": 246 }, { "epoch": 0.1343669250645995, "grad_norm": 4.778489589691162, "learning_rate": 9.82452384992132e-05, "loss": 0.8911, "step": 247 }, { "epoch": 0.1349109207126343, "grad_norm": 5.628232479095459, "learning_rate": 9.82214260239971e-05, "loss": 0.6036, "step": 248 }, { "epoch": 0.13545491636066911, "grad_norm": 3.089691162109375, "learning_rate": 9.819745599089967e-05, "loss": 0.4086, "step": 249 }, { "epoch": 0.13599891200870393, "grad_norm": 3.5158333778381348, "learning_rate": 9.817332847824017e-05, "loss": 0.3802, "step": 250 }, { "epoch": 0.13654290765673874, "grad_norm": 7.708734512329102, "learning_rate": 9.814904356485245e-05, "loss": 1.1865, "step": 251 }, { "epoch": 0.13708690330477355, "grad_norm": 7.100734710693359, "learning_rate": 9.812460133008463e-05, "loss": 1.2724, "step": 252 }, { "epoch": 0.1376308989528084, "grad_norm": 4.217385768890381, "learning_rate": 9.810000185379884e-05, "loss": 0.9565, "step": 253 }, { "epoch": 0.1381748946008432, "grad_norm": 3.69364070892334, "learning_rate": 9.807524521637102e-05, "loss": 0.8894, "step": 254 }, { "epoch": 0.138718890248878, "grad_norm": 3.3172481060028076, "learning_rate": 9.80503314986906e-05, "loss": 0.4784, "step": 255 }, { "epoch": 0.13926288589691282, "grad_norm": 3.854806423187256, "learning_rate": 9.802526078216021e-05, "loss": 0.977, "step": 256 }, { "epoch": 0.13980688154494764, "grad_norm": 4.505311965942383, "learning_rate": 9.800003314869552e-05, "loss": 0.9085, "step": 257 }, { "epoch": 0.14035087719298245, "grad_norm": 3.6617014408111572, "learning_rate": 9.797464868072488e-05, "loss": 0.6134, "step": 258 }, { "epoch": 0.14089487284101726, "grad_norm": 3.0914864540100098, "learning_rate": 9.794910746118904e-05, "loss": 0.6069, "step": 259 }, { "epoch": 0.1414388684890521, "grad_norm": 2.7831509113311768, "learning_rate": 9.792340957354101e-05, "loss": 0.7208, "step": 260 }, { "epoch": 0.1419828641370869, "grad_norm": 3.8282506465911865, "learning_rate": 9.789755510174559e-05, "loss": 0.7539, "step": 261 }, { "epoch": 0.14252685978512172, "grad_norm": 4.248205661773682, "learning_rate": 9.787154413027926e-05, "loss": 0.8612, "step": 262 }, { "epoch": 0.14307085543315654, "grad_norm": 3.3364360332489014, "learning_rate": 9.784537674412984e-05, "loss": 0.7864, "step": 263 }, { "epoch": 0.14361485108119135, "grad_norm": 3.9294607639312744, "learning_rate": 9.781905302879622e-05, "loss": 0.6409, "step": 264 }, { "epoch": 0.14415884672922616, "grad_norm": 3.940366506576538, "learning_rate": 9.779257307028804e-05, "loss": 0.6134, "step": 265 }, { "epoch": 0.14470284237726097, "grad_norm": 4.0217132568359375, "learning_rate": 9.77659369551255e-05, "loss": 0.6979, "step": 266 }, { "epoch": 0.1452468380252958, "grad_norm": 3.34411883354187, "learning_rate": 9.7739144770339e-05, "loss": 0.6393, "step": 267 }, { "epoch": 0.14579083367333062, "grad_norm": 3.490530490875244, "learning_rate": 9.771219660346886e-05, "loss": 0.6515, "step": 268 }, { "epoch": 0.14633482932136543, "grad_norm": 2.8912596702575684, "learning_rate": 9.768509254256507e-05, "loss": 0.589, "step": 269 }, { "epoch": 0.14687882496940025, "grad_norm": 3.1983423233032227, "learning_rate": 9.765783267618698e-05, "loss": 0.4703, "step": 270 }, { "epoch": 0.14742282061743506, "grad_norm": 6.012607097625732, "learning_rate": 9.763041709340305e-05, "loss": 0.6027, "step": 271 }, { "epoch": 0.14796681626546987, "grad_norm": 4.234292030334473, "learning_rate": 9.760284588379047e-05, "loss": 0.6427, "step": 272 }, { "epoch": 0.14851081191350468, "grad_norm": 2.7778170108795166, "learning_rate": 9.757511913743496e-05, "loss": 0.4076, "step": 273 }, { "epoch": 0.14905480756153952, "grad_norm": 4.017724990844727, "learning_rate": 9.754723694493043e-05, "loss": 0.7158, "step": 274 }, { "epoch": 0.14959880320957433, "grad_norm": 2.5353634357452393, "learning_rate": 9.751919939737868e-05, "loss": 0.3274, "step": 275 }, { "epoch": 0.15014279885760914, "grad_norm": 2.8899948596954346, "learning_rate": 9.749100658638914e-05, "loss": 0.52, "step": 276 }, { "epoch": 0.15068679450564396, "grad_norm": 3.805589199066162, "learning_rate": 9.746265860407851e-05, "loss": 0.6539, "step": 277 }, { "epoch": 0.15123079015367877, "grad_norm": 3.945589303970337, "learning_rate": 9.743415554307053e-05, "loss": 0.428, "step": 278 }, { "epoch": 0.15177478580171358, "grad_norm": 4.500221252441406, "learning_rate": 9.740549749649561e-05, "loss": 0.813, "step": 279 }, { "epoch": 0.1523187814497484, "grad_norm": 3.298353910446167, "learning_rate": 9.737668455799059e-05, "loss": 0.4532, "step": 280 }, { "epoch": 0.15286277709778323, "grad_norm": 5.691685676574707, "learning_rate": 9.734771682169837e-05, "loss": 0.3705, "step": 281 }, { "epoch": 0.15340677274581804, "grad_norm": 4.102056503295898, "learning_rate": 9.731859438226765e-05, "loss": 0.6743, "step": 282 }, { "epoch": 0.15395076839385285, "grad_norm": 4.290487766265869, "learning_rate": 9.72893173348526e-05, "loss": 0.8118, "step": 283 }, { "epoch": 0.15449476404188767, "grad_norm": 4.7721123695373535, "learning_rate": 9.725988577511256e-05, "loss": 0.7011, "step": 284 }, { "epoch": 0.15503875968992248, "grad_norm": 4.545146942138672, "learning_rate": 9.723029979921172e-05, "loss": 0.7251, "step": 285 }, { "epoch": 0.1555827553379573, "grad_norm": 3.8677685260772705, "learning_rate": 9.72005595038188e-05, "loss": 0.5582, "step": 286 }, { "epoch": 0.1561267509859921, "grad_norm": 3.2510883808135986, "learning_rate": 9.717066498610673e-05, "loss": 0.4719, "step": 287 }, { "epoch": 0.15667074663402691, "grad_norm": 3.684166193008423, "learning_rate": 9.714061634375238e-05, "loss": 0.4816, "step": 288 }, { "epoch": 0.15721474228206175, "grad_norm": 3.989572286605835, "learning_rate": 9.711041367493617e-05, "loss": 0.3596, "step": 289 }, { "epoch": 0.15775873793009657, "grad_norm": 3.842026472091675, "learning_rate": 9.70800570783418e-05, "loss": 0.6706, "step": 290 }, { "epoch": 0.15830273357813138, "grad_norm": 4.102128028869629, "learning_rate": 9.704954665315589e-05, "loss": 0.6535, "step": 291 }, { "epoch": 0.1588467292261662, "grad_norm": 4.248791694641113, "learning_rate": 9.701888249906772e-05, "loss": 0.5291, "step": 292 }, { "epoch": 0.159390724874201, "grad_norm": 3.3994100093841553, "learning_rate": 9.69880647162688e-05, "loss": 0.5799, "step": 293 }, { "epoch": 0.1599347205222358, "grad_norm": 5.348114967346191, "learning_rate": 9.695709340545268e-05, "loss": 0.4673, "step": 294 }, { "epoch": 0.16047871617027062, "grad_norm": 3.7261531352996826, "learning_rate": 9.692596866781447e-05, "loss": 0.6257, "step": 295 }, { "epoch": 0.16102271181830546, "grad_norm": 3.681056022644043, "learning_rate": 9.689469060505064e-05, "loss": 0.4667, "step": 296 }, { "epoch": 0.16156670746634028, "grad_norm": 4.871854305267334, "learning_rate": 9.68632593193586e-05, "loss": 0.6336, "step": 297 }, { "epoch": 0.1621107031143751, "grad_norm": 4.232208251953125, "learning_rate": 9.68316749134364e-05, "loss": 0.4957, "step": 298 }, { "epoch": 0.1626546987624099, "grad_norm": 3.9273123741149902, "learning_rate": 9.679993749048241e-05, "loss": 0.3405, "step": 299 }, { "epoch": 0.1631986944104447, "grad_norm": 3.474198341369629, "learning_rate": 9.676804715419494e-05, "loss": 0.2774, "step": 300 }, { "epoch": 0.16374269005847952, "grad_norm": 3.935596227645874, "learning_rate": 9.673600400877197e-05, "loss": 1.0718, "step": 301 }, { "epoch": 0.16428668570651433, "grad_norm": 5.070333480834961, "learning_rate": 9.670380815891071e-05, "loss": 1.2516, "step": 302 }, { "epoch": 0.16483068135454917, "grad_norm": 3.17016339302063, "learning_rate": 9.667145970980735e-05, "loss": 0.7746, "step": 303 }, { "epoch": 0.165374677002584, "grad_norm": 3.2615132331848145, "learning_rate": 9.66389587671567e-05, "loss": 1.0655, "step": 304 }, { "epoch": 0.1659186726506188, "grad_norm": 2.9330790042877197, "learning_rate": 9.660630543715174e-05, "loss": 0.8144, "step": 305 }, { "epoch": 0.1664626682986536, "grad_norm": 3.5980565547943115, "learning_rate": 9.657349982648348e-05, "loss": 0.7537, "step": 306 }, { "epoch": 0.16700666394668842, "grad_norm": 4.186583042144775, "learning_rate": 9.654054204234035e-05, "loss": 1.0416, "step": 307 }, { "epoch": 0.16755065959472323, "grad_norm": 4.238248348236084, "learning_rate": 9.650743219240813e-05, "loss": 0.8766, "step": 308 }, { "epoch": 0.16809465524275805, "grad_norm": 2.7023186683654785, "learning_rate": 9.647417038486935e-05, "loss": 0.8421, "step": 309 }, { "epoch": 0.16863865089079288, "grad_norm": 2.2210030555725098, "learning_rate": 9.644075672840312e-05, "loss": 0.6393, "step": 310 }, { "epoch": 0.1691826465388277, "grad_norm": 3.340064287185669, "learning_rate": 9.640719133218461e-05, "loss": 0.7222, "step": 311 }, { "epoch": 0.1697266421868625, "grad_norm": 2.752986431121826, "learning_rate": 9.637347430588489e-05, "loss": 0.7285, "step": 312 }, { "epoch": 0.17027063783489732, "grad_norm": 3.792200803756714, "learning_rate": 9.633960575967036e-05, "loss": 0.8311, "step": 313 }, { "epoch": 0.17081463348293213, "grad_norm": 2.7241642475128174, "learning_rate": 9.630558580420258e-05, "loss": 0.5596, "step": 314 }, { "epoch": 0.17135862913096694, "grad_norm": 2.9896748065948486, "learning_rate": 9.627141455063777e-05, "loss": 0.5855, "step": 315 }, { "epoch": 0.17190262477900176, "grad_norm": 3.365631580352783, "learning_rate": 9.62370921106265e-05, "loss": 0.802, "step": 316 }, { "epoch": 0.1724466204270366, "grad_norm": 2.9429214000701904, "learning_rate": 9.620261859631336e-05, "loss": 0.4751, "step": 317 }, { "epoch": 0.1729906160750714, "grad_norm": 3.172883987426758, "learning_rate": 9.61679941203365e-05, "loss": 0.704, "step": 318 }, { "epoch": 0.17353461172310622, "grad_norm": 3.1179656982421875, "learning_rate": 9.61332187958274e-05, "loss": 0.5997, "step": 319 }, { "epoch": 0.17407860737114103, "grad_norm": 4.106421947479248, "learning_rate": 9.609829273641034e-05, "loss": 0.6834, "step": 320 }, { "epoch": 0.17462260301917584, "grad_norm": 3.558425188064575, "learning_rate": 9.606321605620215e-05, "loss": 0.4522, "step": 321 }, { "epoch": 0.17516659866721065, "grad_norm": 3.4440102577209473, "learning_rate": 9.60279888698118e-05, "loss": 0.5252, "step": 322 }, { "epoch": 0.17571059431524547, "grad_norm": 3.069347381591797, "learning_rate": 9.599261129234e-05, "loss": 0.4708, "step": 323 }, { "epoch": 0.1762545899632803, "grad_norm": 3.0099222660064697, "learning_rate": 9.595708343937885e-05, "loss": 0.4839, "step": 324 }, { "epoch": 0.17679858561131512, "grad_norm": 4.047853469848633, "learning_rate": 9.592140542701147e-05, "loss": 0.6995, "step": 325 }, { "epoch": 0.17734258125934993, "grad_norm": 5.8451690673828125, "learning_rate": 9.588557737181161e-05, "loss": 0.6524, "step": 326 }, { "epoch": 0.17788657690738474, "grad_norm": 3.4044129848480225, "learning_rate": 9.584959939084323e-05, "loss": 0.5789, "step": 327 }, { "epoch": 0.17843057255541955, "grad_norm": 2.6063244342803955, "learning_rate": 9.581347160166023e-05, "loss": 0.3981, "step": 328 }, { "epoch": 0.17897456820345437, "grad_norm": 3.1639511585235596, "learning_rate": 9.57771941223059e-05, "loss": 0.5017, "step": 329 }, { "epoch": 0.17951856385148918, "grad_norm": 4.458259105682373, "learning_rate": 9.574076707131269e-05, "loss": 0.6199, "step": 330 }, { "epoch": 0.18006255949952402, "grad_norm": 3.484513521194458, "learning_rate": 9.570419056770173e-05, "loss": 0.4992, "step": 331 }, { "epoch": 0.18060655514755883, "grad_norm": 3.6440675258636475, "learning_rate": 9.566746473098249e-05, "loss": 0.728, "step": 332 }, { "epoch": 0.18115055079559364, "grad_norm": 3.6214029788970947, "learning_rate": 9.563058968115235e-05, "loss": 0.6306, "step": 333 }, { "epoch": 0.18169454644362845, "grad_norm": 4.163621425628662, "learning_rate": 9.559356553869623e-05, "loss": 0.5599, "step": 334 }, { "epoch": 0.18223854209166326, "grad_norm": 3.128582000732422, "learning_rate": 9.555639242458617e-05, "loss": 0.4747, "step": 335 }, { "epoch": 0.18278253773969808, "grad_norm": 3.582385778427124, "learning_rate": 9.5519070460281e-05, "loss": 0.6223, "step": 336 }, { "epoch": 0.1833265333877329, "grad_norm": 3.4408421516418457, "learning_rate": 9.548159976772592e-05, "loss": 0.6564, "step": 337 }, { "epoch": 0.18387052903576773, "grad_norm": 3.8188998699188232, "learning_rate": 9.544398046935199e-05, "loss": 0.5648, "step": 338 }, { "epoch": 0.18441452468380254, "grad_norm": 3.693319320678711, "learning_rate": 9.54062126880759e-05, "loss": 0.4852, "step": 339 }, { "epoch": 0.18495852033183735, "grad_norm": 3.278353214263916, "learning_rate": 9.536829654729948e-05, "loss": 0.4719, "step": 340 }, { "epoch": 0.18550251597987216, "grad_norm": 3.1802175045013428, "learning_rate": 9.533023217090926e-05, "loss": 0.2903, "step": 341 }, { "epoch": 0.18604651162790697, "grad_norm": 4.163318634033203, "learning_rate": 9.529201968327616e-05, "loss": 0.5665, "step": 342 }, { "epoch": 0.18659050727594179, "grad_norm": 3.4346909523010254, "learning_rate": 9.525365920925504e-05, "loss": 0.4353, "step": 343 }, { "epoch": 0.1871345029239766, "grad_norm": 3.1426050662994385, "learning_rate": 9.521515087418427e-05, "loss": 0.5412, "step": 344 }, { "epoch": 0.18767849857201144, "grad_norm": 4.696324825286865, "learning_rate": 9.517649480388532e-05, "loss": 0.7633, "step": 345 }, { "epoch": 0.18822249422004625, "grad_norm": 3.404768705368042, "learning_rate": 9.51376911246624e-05, "loss": 0.5478, "step": 346 }, { "epoch": 0.18876648986808106, "grad_norm": 3.7347939014434814, "learning_rate": 9.5098739963302e-05, "loss": 0.5964, "step": 347 }, { "epoch": 0.18931048551611587, "grad_norm": 5.7211737632751465, "learning_rate": 9.50596414470725e-05, "loss": 0.762, "step": 348 }, { "epoch": 0.18985448116415068, "grad_norm": 3.6385090351104736, "learning_rate": 9.502039570372373e-05, "loss": 0.4021, "step": 349 }, { "epoch": 0.1903984768121855, "grad_norm": 4.371354579925537, "learning_rate": 9.498100286148659e-05, "loss": 0.369, "step": 350 }, { "epoch": 0.1909424724602203, "grad_norm": 4.7883758544921875, "learning_rate": 9.494146304907257e-05, "loss": 1.0321, "step": 351 }, { "epoch": 0.19148646810825512, "grad_norm": 5.076574325561523, "learning_rate": 9.490177639567341e-05, "loss": 1.3139, "step": 352 }, { "epoch": 0.19203046375628996, "grad_norm": 4.486371994018555, "learning_rate": 9.486194303096062e-05, "loss": 1.0706, "step": 353 }, { "epoch": 0.19257445940432477, "grad_norm": 3.449143409729004, "learning_rate": 9.482196308508506e-05, "loss": 0.808, "step": 354 }, { "epoch": 0.19311845505235958, "grad_norm": 3.4026379585266113, "learning_rate": 9.478183668867654e-05, "loss": 0.8259, "step": 355 }, { "epoch": 0.1936624507003944, "grad_norm": 3.3752269744873047, "learning_rate": 9.474156397284337e-05, "loss": 0.6569, "step": 356 }, { "epoch": 0.1942064463484292, "grad_norm": 4.627280235290527, "learning_rate": 9.470114506917198e-05, "loss": 0.833, "step": 357 }, { "epoch": 0.19475044199646402, "grad_norm": 3.3835442066192627, "learning_rate": 9.466058010972639e-05, "loss": 0.7032, "step": 358 }, { "epoch": 0.19529443764449883, "grad_norm": 3.1237292289733887, "learning_rate": 9.461986922704787e-05, "loss": 0.6341, "step": 359 }, { "epoch": 0.19583843329253367, "grad_norm": 2.9925172328948975, "learning_rate": 9.457901255415447e-05, "loss": 0.7526, "step": 360 }, { "epoch": 0.19638242894056848, "grad_norm": 3.6769769191741943, "learning_rate": 9.453801022454062e-05, "loss": 0.723, "step": 361 }, { "epoch": 0.1969264245886033, "grad_norm": 3.1583139896392822, "learning_rate": 9.449686237217666e-05, "loss": 0.7234, "step": 362 }, { "epoch": 0.1974704202366381, "grad_norm": 3.082484722137451, "learning_rate": 9.445556913150838e-05, "loss": 0.5891, "step": 363 }, { "epoch": 0.19801441588467292, "grad_norm": 2.9803946018218994, "learning_rate": 9.44141306374566e-05, "loss": 0.6255, "step": 364 }, { "epoch": 0.19855841153270773, "grad_norm": 3.6369974613189697, "learning_rate": 9.43725470254168e-05, "loss": 0.5567, "step": 365 }, { "epoch": 0.19910240718074254, "grad_norm": 3.330982208251953, "learning_rate": 9.433081843125856e-05, "loss": 0.4234, "step": 366 }, { "epoch": 0.19964640282877738, "grad_norm": 3.7523093223571777, "learning_rate": 9.428894499132517e-05, "loss": 0.5777, "step": 367 }, { "epoch": 0.2001903984768122, "grad_norm": 2.911546468734741, "learning_rate": 9.424692684243325e-05, "loss": 0.4518, "step": 368 }, { "epoch": 0.200734394124847, "grad_norm": 3.439955949783325, "learning_rate": 9.420476412187216e-05, "loss": 0.6691, "step": 369 }, { "epoch": 0.20127838977288182, "grad_norm": 2.650742530822754, "learning_rate": 9.41624569674037e-05, "loss": 0.38, "step": 370 }, { "epoch": 0.20182238542091663, "grad_norm": 2.7445318698883057, "learning_rate": 9.412000551726156e-05, "loss": 0.4194, "step": 371 }, { "epoch": 0.20236638106895144, "grad_norm": 3.7591845989227295, "learning_rate": 9.407740991015087e-05, "loss": 0.6206, "step": 372 }, { "epoch": 0.20291037671698625, "grad_norm": 3.162815809249878, "learning_rate": 9.403467028524781e-05, "loss": 0.3825, "step": 373 }, { "epoch": 0.2034543723650211, "grad_norm": 4.513307571411133, "learning_rate": 9.399178678219915e-05, "loss": 0.8259, "step": 374 }, { "epoch": 0.2039983680130559, "grad_norm": 8.635912895202637, "learning_rate": 9.394875954112169e-05, "loss": 0.5062, "step": 375 }, { "epoch": 0.20454236366109071, "grad_norm": 2.986172914505005, "learning_rate": 9.390558870260193e-05, "loss": 0.4251, "step": 376 }, { "epoch": 0.20508635930912553, "grad_norm": 2.7449796199798584, "learning_rate": 9.386227440769557e-05, "loss": 0.4703, "step": 377 }, { "epoch": 0.20563035495716034, "grad_norm": 3.67938494682312, "learning_rate": 9.381881679792697e-05, "loss": 0.601, "step": 378 }, { "epoch": 0.20617435060519515, "grad_norm": 3.341700315475464, "learning_rate": 9.377521601528884e-05, "loss": 0.5231, "step": 379 }, { "epoch": 0.20671834625322996, "grad_norm": 3.18094539642334, "learning_rate": 9.373147220224162e-05, "loss": 0.4013, "step": 380 }, { "epoch": 0.2072623419012648, "grad_norm": 3.9187755584716797, "learning_rate": 9.368758550171313e-05, "loss": 0.3713, "step": 381 }, { "epoch": 0.2078063375492996, "grad_norm": 4.070806980133057, "learning_rate": 9.364355605709804e-05, "loss": 0.6402, "step": 382 }, { "epoch": 0.20835033319733443, "grad_norm": 4.059103488922119, "learning_rate": 9.359938401225741e-05, "loss": 0.6007, "step": 383 }, { "epoch": 0.20889432884536924, "grad_norm": 3.8672549724578857, "learning_rate": 9.355506951151824e-05, "loss": 0.6114, "step": 384 }, { "epoch": 0.20943832449340405, "grad_norm": 2.8916361331939697, "learning_rate": 9.351061269967301e-05, "loss": 0.4034, "step": 385 }, { "epoch": 0.20998232014143886, "grad_norm": 4.261205196380615, "learning_rate": 9.346601372197914e-05, "loss": 0.6808, "step": 386 }, { "epoch": 0.21052631578947367, "grad_norm": 4.240063190460205, "learning_rate": 9.342127272415859e-05, "loss": 0.5731, "step": 387 }, { "epoch": 0.2110703114375085, "grad_norm": 3.174260139465332, "learning_rate": 9.337638985239736e-05, "loss": 0.4544, "step": 388 }, { "epoch": 0.21161430708554332, "grad_norm": 3.8930206298828125, "learning_rate": 9.333136525334498e-05, "loss": 0.5565, "step": 389 }, { "epoch": 0.21215830273357814, "grad_norm": 4.449538230895996, "learning_rate": 9.328619907411409e-05, "loss": 0.7026, "step": 390 }, { "epoch": 0.21270229838161295, "grad_norm": 3.6032214164733887, "learning_rate": 9.32408914622799e-05, "loss": 0.485, "step": 391 }, { "epoch": 0.21324629402964776, "grad_norm": 3.226893424987793, "learning_rate": 9.319544256587974e-05, "loss": 0.563, "step": 392 }, { "epoch": 0.21379028967768257, "grad_norm": 3.6823930740356445, "learning_rate": 9.314985253341258e-05, "loss": 0.5282, "step": 393 }, { "epoch": 0.21433428532571738, "grad_norm": 2.9988889694213867, "learning_rate": 9.310412151383852e-05, "loss": 0.4229, "step": 394 }, { "epoch": 0.21487828097375222, "grad_norm": 3.6856372356414795, "learning_rate": 9.305824965657834e-05, "loss": 0.54, "step": 395 }, { "epoch": 0.21542227662178703, "grad_norm": 3.2524585723876953, "learning_rate": 9.301223711151298e-05, "loss": 0.5354, "step": 396 }, { "epoch": 0.21596627226982185, "grad_norm": 4.0347514152526855, "learning_rate": 9.296608402898306e-05, "loss": 0.5083, "step": 397 }, { "epoch": 0.21651026791785666, "grad_norm": 4.220754623413086, "learning_rate": 9.291979055978838e-05, "loss": 0.6556, "step": 398 }, { "epoch": 0.21705426356589147, "grad_norm": 3.6733181476593018, "learning_rate": 9.287335685518745e-05, "loss": 0.3408, "step": 399 }, { "epoch": 0.21759825921392628, "grad_norm": 4.316866874694824, "learning_rate": 9.282678306689699e-05, "loss": 0.399, "step": 400 }, { "epoch": 0.2181422548619611, "grad_norm": 2.873260974884033, "learning_rate": 9.278006934709141e-05, "loss": 1.1463, "step": 401 }, { "epoch": 0.21868625050999593, "grad_norm": 4.634504795074463, "learning_rate": 9.273321584840234e-05, "loss": 0.9317, "step": 402 }, { "epoch": 0.21923024615803074, "grad_norm": 4.208846092224121, "learning_rate": 9.268622272391811e-05, "loss": 0.8378, "step": 403 }, { "epoch": 0.21977424180606556, "grad_norm": 3.619878053665161, "learning_rate": 9.263909012718331e-05, "loss": 0.8824, "step": 404 }, { "epoch": 0.22031823745410037, "grad_norm": 3.3463990688323975, "learning_rate": 9.259181821219813e-05, "loss": 0.487, "step": 405 }, { "epoch": 0.22086223310213518, "grad_norm": 3.479128122329712, "learning_rate": 9.254440713341807e-05, "loss": 0.6933, "step": 406 }, { "epoch": 0.22140622875017, "grad_norm": 2.6258251667022705, "learning_rate": 9.24968570457533e-05, "loss": 0.6385, "step": 407 }, { "epoch": 0.2219502243982048, "grad_norm": 3.3969478607177734, "learning_rate": 9.244916810456821e-05, "loss": 0.7672, "step": 408 }, { "epoch": 0.22249422004623964, "grad_norm": 2.701378345489502, "learning_rate": 9.240134046568078e-05, "loss": 0.5735, "step": 409 }, { "epoch": 0.22303821569427446, "grad_norm": 2.9163026809692383, "learning_rate": 9.23533742853623e-05, "loss": 0.8483, "step": 410 }, { "epoch": 0.22358221134230927, "grad_norm": 3.177183151245117, "learning_rate": 9.230526972033662e-05, "loss": 0.8823, "step": 411 }, { "epoch": 0.22412620699034408, "grad_norm": 4.825070858001709, "learning_rate": 9.225702692777982e-05, "loss": 0.5245, "step": 412 }, { "epoch": 0.2246702026383789, "grad_norm": 2.174508571624756, "learning_rate": 9.22086460653196e-05, "loss": 0.5227, "step": 413 }, { "epoch": 0.2252141982864137, "grad_norm": 2.9526684284210205, "learning_rate": 9.216012729103475e-05, "loss": 0.5253, "step": 414 }, { "epoch": 0.22575819393444851, "grad_norm": 2.8273720741271973, "learning_rate": 9.211147076345475e-05, "loss": 0.6101, "step": 415 }, { "epoch": 0.22630218958248333, "grad_norm": 3.1453208923339844, "learning_rate": 9.206267664155907e-05, "loss": 0.5456, "step": 416 }, { "epoch": 0.22684618523051817, "grad_norm": 2.809300184249878, "learning_rate": 9.201374508477682e-05, "loss": 0.4578, "step": 417 }, { "epoch": 0.22739018087855298, "grad_norm": 2.9063663482666016, "learning_rate": 9.196467625298619e-05, "loss": 0.4534, "step": 418 }, { "epoch": 0.2279341765265878, "grad_norm": 2.9758522510528564, "learning_rate": 9.191547030651383e-05, "loss": 0.5089, "step": 419 }, { "epoch": 0.2284781721746226, "grad_norm": 2.4397923946380615, "learning_rate": 9.186612740613443e-05, "loss": 0.4022, "step": 420 }, { "epoch": 0.2290221678226574, "grad_norm": 2.9920082092285156, "learning_rate": 9.181664771307015e-05, "loss": 0.3095, "step": 421 }, { "epoch": 0.22956616347069222, "grad_norm": 3.703774929046631, "learning_rate": 9.176703138899014e-05, "loss": 0.5465, "step": 422 }, { "epoch": 0.23011015911872704, "grad_norm": 3.048585891723633, "learning_rate": 9.171727859600993e-05, "loss": 0.4299, "step": 423 }, { "epoch": 0.23065415476676188, "grad_norm": 2.550218105316162, "learning_rate": 9.166738949669096e-05, "loss": 0.2719, "step": 424 }, { "epoch": 0.2311981504147967, "grad_norm": 3.233604669570923, "learning_rate": 9.161736425404003e-05, "loss": 0.4621, "step": 425 }, { "epoch": 0.2317421460628315, "grad_norm": 2.8593106269836426, "learning_rate": 9.156720303150881e-05, "loss": 0.4376, "step": 426 }, { "epoch": 0.2322861417108663, "grad_norm": 2.716887950897217, "learning_rate": 9.15169059929932e-05, "loss": 0.3702, "step": 427 }, { "epoch": 0.23283013735890112, "grad_norm": 2.5790464878082275, "learning_rate": 9.146647330283296e-05, "loss": 0.3587, "step": 428 }, { "epoch": 0.23337413300693594, "grad_norm": 3.2930092811584473, "learning_rate": 9.141590512581098e-05, "loss": 0.4279, "step": 429 }, { "epoch": 0.23391812865497075, "grad_norm": 2.47341251373291, "learning_rate": 9.136520162715287e-05, "loss": 0.3091, "step": 430 }, { "epoch": 0.2344621243030056, "grad_norm": 3.601699113845825, "learning_rate": 9.13143629725264e-05, "loss": 0.4368, "step": 431 }, { "epoch": 0.2350061199510404, "grad_norm": 2.5605740547180176, "learning_rate": 9.126338932804097e-05, "loss": 0.2157, "step": 432 }, { "epoch": 0.2355501155990752, "grad_norm": 2.935905933380127, "learning_rate": 9.121228086024699e-05, "loss": 0.3614, "step": 433 }, { "epoch": 0.23609411124711002, "grad_norm": 5.631600379943848, "learning_rate": 9.11610377361354e-05, "loss": 0.6254, "step": 434 }, { "epoch": 0.23663810689514483, "grad_norm": 2.928715229034424, "learning_rate": 9.110966012313715e-05, "loss": 0.3766, "step": 435 }, { "epoch": 0.23718210254317965, "grad_norm": 4.133535385131836, "learning_rate": 9.105814818912258e-05, "loss": 0.5628, "step": 436 }, { "epoch": 0.23772609819121446, "grad_norm": 3.722503423690796, "learning_rate": 9.100650210240093e-05, "loss": 0.564, "step": 437 }, { "epoch": 0.2382700938392493, "grad_norm": 3.9614462852478027, "learning_rate": 9.095472203171976e-05, "loss": 0.536, "step": 438 }, { "epoch": 0.2388140894872841, "grad_norm": 2.712907552719116, "learning_rate": 9.090280814626439e-05, "loss": 0.269, "step": 439 }, { "epoch": 0.23935808513531892, "grad_norm": 3.45647931098938, "learning_rate": 9.085076061565743e-05, "loss": 0.3867, "step": 440 }, { "epoch": 0.23990208078335373, "grad_norm": 4.043107509613037, "learning_rate": 9.079857960995806e-05, "loss": 0.6881, "step": 441 }, { "epoch": 0.24044607643138854, "grad_norm": 3.983325242996216, "learning_rate": 9.074626529966164e-05, "loss": 0.4765, "step": 442 }, { "epoch": 0.24099007207942336, "grad_norm": 3.3885247707366943, "learning_rate": 9.069381785569908e-05, "loss": 0.3956, "step": 443 }, { "epoch": 0.24153406772745817, "grad_norm": 2.862793207168579, "learning_rate": 9.06412374494363e-05, "loss": 0.3285, "step": 444 }, { "epoch": 0.242078063375493, "grad_norm": 74.19651794433594, "learning_rate": 9.058852425267359e-05, "loss": 0.6868, "step": 445 }, { "epoch": 0.24262205902352782, "grad_norm": 4.249988079071045, "learning_rate": 9.053567843764521e-05, "loss": 0.5848, "step": 446 }, { "epoch": 0.24316605467156263, "grad_norm": 3.803861618041992, "learning_rate": 9.048270017701868e-05, "loss": 0.4667, "step": 447 }, { "epoch": 0.24371005031959744, "grad_norm": 4.550891876220703, "learning_rate": 9.042958964389427e-05, "loss": 0.4985, "step": 448 }, { "epoch": 0.24425404596763226, "grad_norm": 2.38836407661438, "learning_rate": 9.037634701180448e-05, "loss": 0.2012, "step": 449 }, { "epoch": 0.24479804161566707, "grad_norm": 2.603590250015259, "learning_rate": 9.032297245471337e-05, "loss": 0.1953, "step": 450 }, { "epoch": 0.24534203726370188, "grad_norm": 2.8660459518432617, "learning_rate": 9.026946614701609e-05, "loss": 0.9378, "step": 451 }, { "epoch": 0.24588603291173672, "grad_norm": 3.9907586574554443, "learning_rate": 9.021582826353824e-05, "loss": 0.9511, "step": 452 }, { "epoch": 0.24643002855977153, "grad_norm": 2.759866714477539, "learning_rate": 9.016205897953536e-05, "loss": 0.7207, "step": 453 }, { "epoch": 0.24697402420780634, "grad_norm": 3.6777336597442627, "learning_rate": 9.010815847069231e-05, "loss": 0.8477, "step": 454 }, { "epoch": 0.24751801985584115, "grad_norm": 4.0103888511657715, "learning_rate": 9.00541269131227e-05, "loss": 0.5074, "step": 455 }, { "epoch": 0.24806201550387597, "grad_norm": 2.6912076473236084, "learning_rate": 8.999996448336832e-05, "loss": 0.7287, "step": 456 }, { "epoch": 0.24860601115191078, "grad_norm": 3.8389523029327393, "learning_rate": 8.994567135839864e-05, "loss": 0.6549, "step": 457 }, { "epoch": 0.2491500067999456, "grad_norm": 4.546677589416504, "learning_rate": 8.989124771561007e-05, "loss": 0.674, "step": 458 }, { "epoch": 0.24969400244798043, "grad_norm": 2.420579433441162, "learning_rate": 8.983669373282551e-05, "loss": 0.4063, "step": 459 }, { "epoch": 0.2502379980960152, "grad_norm": 2.698164939880371, "learning_rate": 8.978200958829373e-05, "loss": 0.6645, "step": 460 }, { "epoch": 0.25078199374405, "grad_norm": 2.934906244277954, "learning_rate": 8.972719546068882e-05, "loss": 0.4164, "step": 461 }, { "epoch": 0.25132598939208484, "grad_norm": 2.569288492202759, "learning_rate": 8.967225152910952e-05, "loss": 0.613, "step": 462 }, { "epoch": 0.2518699850401197, "grad_norm": 2.8090507984161377, "learning_rate": 8.961717797307872e-05, "loss": 0.5113, "step": 463 }, { "epoch": 0.2524139806881545, "grad_norm": 3.297060012817383, "learning_rate": 8.956197497254284e-05, "loss": 0.6097, "step": 464 }, { "epoch": 0.2529579763361893, "grad_norm": 2.942049741744995, "learning_rate": 8.950664270787122e-05, "loss": 0.4729, "step": 465 }, { "epoch": 0.25350197198422414, "grad_norm": 3.0533688068389893, "learning_rate": 8.945118135985561e-05, "loss": 0.4909, "step": 466 }, { "epoch": 0.25404596763225895, "grad_norm": 3.1650211811065674, "learning_rate": 8.939559110970946e-05, "loss": 0.5473, "step": 467 }, { "epoch": 0.25458996328029376, "grad_norm": 2.8507273197174072, "learning_rate": 8.933987213906745e-05, "loss": 0.4775, "step": 468 }, { "epoch": 0.2551339589283286, "grad_norm": 3.65455961227417, "learning_rate": 8.92840246299848e-05, "loss": 0.3901, "step": 469 }, { "epoch": 0.2556779545763634, "grad_norm": 2.9989473819732666, "learning_rate": 8.922804876493673e-05, "loss": 0.3993, "step": 470 }, { "epoch": 0.2562219502243982, "grad_norm": 3.914255380630493, "learning_rate": 8.917194472681784e-05, "loss": 0.6931, "step": 471 }, { "epoch": 0.256765945872433, "grad_norm": 2.8166818618774414, "learning_rate": 8.911571269894153e-05, "loss": 0.4097, "step": 472 }, { "epoch": 0.2573099415204678, "grad_norm": 2.2216925621032715, "learning_rate": 8.90593528650394e-05, "loss": 0.2785, "step": 473 }, { "epoch": 0.25785393716850263, "grad_norm": 2.1548070907592773, "learning_rate": 8.900286540926061e-05, "loss": 0.2904, "step": 474 }, { "epoch": 0.25839793281653745, "grad_norm": 2.4019575119018555, "learning_rate": 8.894625051617134e-05, "loss": 0.1806, "step": 475 }, { "epoch": 0.25894192846457226, "grad_norm": 2.4249496459960938, "learning_rate": 8.888950837075414e-05, "loss": 0.292, "step": 476 }, { "epoch": 0.2594859241126071, "grad_norm": 3.786322832107544, "learning_rate": 8.883263915840736e-05, "loss": 0.5343, "step": 477 }, { "epoch": 0.26002991976064194, "grad_norm": 3.96189022064209, "learning_rate": 8.877564306494449e-05, "loss": 0.3909, "step": 478 }, { "epoch": 0.26057391540867675, "grad_norm": 2.242741107940674, "learning_rate": 8.871852027659364e-05, "loss": 0.2851, "step": 479 }, { "epoch": 0.26111791105671156, "grad_norm": 2.6148018836975098, "learning_rate": 8.866127097999683e-05, "loss": 0.2727, "step": 480 }, { "epoch": 0.26166190670474637, "grad_norm": 2.6764285564422607, "learning_rate": 8.860389536220948e-05, "loss": 0.2542, "step": 481 }, { "epoch": 0.2622059023527812, "grad_norm": 4.473926067352295, "learning_rate": 8.85463936106997e-05, "loss": 0.3414, "step": 482 }, { "epoch": 0.262749898000816, "grad_norm": 17.045555114746094, "learning_rate": 8.848876591334776e-05, "loss": 0.4539, "step": 483 }, { "epoch": 0.2632938936488508, "grad_norm": 2.857429027557373, "learning_rate": 8.843101245844543e-05, "loss": 0.2873, "step": 484 }, { "epoch": 0.2638378892968856, "grad_norm": 2.227933883666992, "learning_rate": 8.83731334346954e-05, "loss": 0.1749, "step": 485 }, { "epoch": 0.26438188494492043, "grad_norm": 3.331014394760132, "learning_rate": 8.831512903121061e-05, "loss": 0.5441, "step": 486 }, { "epoch": 0.26492588059295524, "grad_norm": 2.699883460998535, "learning_rate": 8.82569994375137e-05, "loss": 0.3173, "step": 487 }, { "epoch": 0.26546987624099005, "grad_norm": 3.1664974689483643, "learning_rate": 8.81987448435363e-05, "loss": 0.3759, "step": 488 }, { "epoch": 0.26601387188902487, "grad_norm": 3.6668860912323, "learning_rate": 8.814036543961853e-05, "loss": 0.5909, "step": 489 }, { "epoch": 0.2665578675370597, "grad_norm": 3.928412437438965, "learning_rate": 8.808186141650823e-05, "loss": 0.4697, "step": 490 }, { "epoch": 0.26710186318509455, "grad_norm": 3.1746432781219482, "learning_rate": 8.802323296536052e-05, "loss": 0.4435, "step": 491 }, { "epoch": 0.26764585883312936, "grad_norm": 4.19642972946167, "learning_rate": 8.796448027773699e-05, "loss": 0.4508, "step": 492 }, { "epoch": 0.26818985448116417, "grad_norm": 2.9797258377075195, "learning_rate": 8.79056035456052e-05, "loss": 0.3053, "step": 493 }, { "epoch": 0.268733850129199, "grad_norm": 3.718085765838623, "learning_rate": 8.7846602961338e-05, "loss": 0.4562, "step": 494 }, { "epoch": 0.2692778457772338, "grad_norm": 3.082533597946167, "learning_rate": 8.778747871771292e-05, "loss": 0.4634, "step": 495 }, { "epoch": 0.2698218414252686, "grad_norm": 3.464052438735962, "learning_rate": 8.772823100791151e-05, "loss": 0.4541, "step": 496 }, { "epoch": 0.2703658370733034, "grad_norm": 3.8297595977783203, "learning_rate": 8.766886002551877e-05, "loss": 0.5457, "step": 497 }, { "epoch": 0.27090983272133823, "grad_norm": 3.514178514480591, "learning_rate": 8.760936596452242e-05, "loss": 0.2584, "step": 498 }, { "epoch": 0.27145382836937304, "grad_norm": 3.449949264526367, "learning_rate": 8.754974901931239e-05, "loss": 0.2228, "step": 499 }, { "epoch": 0.27199782401740785, "grad_norm": 5.18220329284668, "learning_rate": 8.749000938468006e-05, "loss": 0.572, "step": 500 }, { "epoch": 0.27254181966544266, "grad_norm": 2.8962366580963135, "learning_rate": 8.743014725581775e-05, "loss": 0.9612, "step": 501 }, { "epoch": 0.2730858153134775, "grad_norm": 3.259406089782715, "learning_rate": 8.737016282831797e-05, "loss": 0.6541, "step": 502 }, { "epoch": 0.2736298109615123, "grad_norm": 3.3317062854766846, "learning_rate": 8.731005629817283e-05, "loss": 0.7991, "step": 503 }, { "epoch": 0.2741738066095471, "grad_norm": 2.826852321624756, "learning_rate": 8.724982786177339e-05, "loss": 0.7053, "step": 504 }, { "epoch": 0.27471780225758197, "grad_norm": 3.1145474910736084, "learning_rate": 8.718947771590903e-05, "loss": 0.6151, "step": 505 }, { "epoch": 0.2752617979056168, "grad_norm": 3.38984751701355, "learning_rate": 8.712900605776686e-05, "loss": 0.4306, "step": 506 }, { "epoch": 0.2758057935536516, "grad_norm": 2.7825777530670166, "learning_rate": 8.706841308493093e-05, "loss": 0.6612, "step": 507 }, { "epoch": 0.2763497892016864, "grad_norm": 3.127641439437866, "learning_rate": 8.700769899538168e-05, "loss": 0.6369, "step": 508 }, { "epoch": 0.2768937848497212, "grad_norm": 2.5201339721679688, "learning_rate": 8.694686398749535e-05, "loss": 0.483, "step": 509 }, { "epoch": 0.277437780497756, "grad_norm": 2.7066657543182373, "learning_rate": 8.688590826004322e-05, "loss": 0.5466, "step": 510 }, { "epoch": 0.27798177614579084, "grad_norm": 2.4882519245147705, "learning_rate": 8.682483201219099e-05, "loss": 0.5021, "step": 511 }, { "epoch": 0.27852577179382565, "grad_norm": 3.132108211517334, "learning_rate": 8.676363544349822e-05, "loss": 0.4543, "step": 512 }, { "epoch": 0.27906976744186046, "grad_norm": 2.171466827392578, "learning_rate": 8.67023187539175e-05, "loss": 0.4082, "step": 513 }, { "epoch": 0.2796137630898953, "grad_norm": 2.172581434249878, "learning_rate": 8.6640882143794e-05, "loss": 0.3068, "step": 514 }, { "epoch": 0.2801577587379301, "grad_norm": 2.4063403606414795, "learning_rate": 8.657932581386466e-05, "loss": 0.4783, "step": 515 }, { "epoch": 0.2807017543859649, "grad_norm": 2.657961845397949, "learning_rate": 8.65176499652576e-05, "loss": 0.3352, "step": 516 }, { "epoch": 0.2812457500339997, "grad_norm": 2.808401107788086, "learning_rate": 8.645585479949144e-05, "loss": 0.4888, "step": 517 }, { "epoch": 0.2817897456820345, "grad_norm": 2.7104082107543945, "learning_rate": 8.639394051847472e-05, "loss": 0.4614, "step": 518 }, { "epoch": 0.28233374133006933, "grad_norm": 3.1302847862243652, "learning_rate": 8.633190732450506e-05, "loss": 0.4534, "step": 519 }, { "epoch": 0.2828777369781042, "grad_norm": 3.1849703788757324, "learning_rate": 8.626975542026873e-05, "loss": 0.4204, "step": 520 }, { "epoch": 0.283421732626139, "grad_norm": 2.9377870559692383, "learning_rate": 8.620748500883982e-05, "loss": 0.4601, "step": 521 }, { "epoch": 0.2839657282741738, "grad_norm": 2.8251383304595947, "learning_rate": 8.614509629367962e-05, "loss": 0.444, "step": 522 }, { "epoch": 0.28450972392220863, "grad_norm": 2.2704224586486816, "learning_rate": 8.608258947863598e-05, "loss": 0.2376, "step": 523 }, { "epoch": 0.28505371957024345, "grad_norm": 2.863485336303711, "learning_rate": 8.601996476794261e-05, "loss": 0.5192, "step": 524 }, { "epoch": 0.28559771521827826, "grad_norm": 10.227802276611328, "learning_rate": 8.595722236621846e-05, "loss": 0.3734, "step": 525 }, { "epoch": 0.28614171086631307, "grad_norm": 3.367239236831665, "learning_rate": 8.589436247846699e-05, "loss": 0.512, "step": 526 }, { "epoch": 0.2866857065143479, "grad_norm": 3.3550360202789307, "learning_rate": 8.583138531007555e-05, "loss": 0.3466, "step": 527 }, { "epoch": 0.2872297021623827, "grad_norm": 4.407619953155518, "learning_rate": 8.57682910668147e-05, "loss": 0.6476, "step": 528 }, { "epoch": 0.2877736978104175, "grad_norm": 3.0294065475463867, "learning_rate": 8.57050799548375e-05, "loss": 0.4762, "step": 529 }, { "epoch": 0.2883176934584523, "grad_norm": 2.7545666694641113, "learning_rate": 8.564175218067888e-05, "loss": 0.3946, "step": 530 }, { "epoch": 0.28886168910648713, "grad_norm": 3.3645520210266113, "learning_rate": 8.557830795125496e-05, "loss": 0.5108, "step": 531 }, { "epoch": 0.28940568475452194, "grad_norm": 2.5364809036254883, "learning_rate": 8.551474747386237e-05, "loss": 0.2534, "step": 532 }, { "epoch": 0.28994968040255675, "grad_norm": 3.3608193397521973, "learning_rate": 8.54510709561775e-05, "loss": 0.4111, "step": 533 }, { "epoch": 0.2904936760505916, "grad_norm": 2.904505729675293, "learning_rate": 8.5387278606256e-05, "loss": 0.4436, "step": 534 }, { "epoch": 0.29103767169862643, "grad_norm": 2.2496747970581055, "learning_rate": 8.532337063253189e-05, "loss": 0.2317, "step": 535 }, { "epoch": 0.29158166734666124, "grad_norm": 3.56933856010437, "learning_rate": 8.525934724381704e-05, "loss": 0.4004, "step": 536 }, { "epoch": 0.29212566299469606, "grad_norm": 2.5870110988616943, "learning_rate": 8.519520864930038e-05, "loss": 0.2695, "step": 537 }, { "epoch": 0.29266965864273087, "grad_norm": 3.052124500274658, "learning_rate": 8.513095505854728e-05, "loss": 0.187, "step": 538 }, { "epoch": 0.2932136542907657, "grad_norm": 3.1799957752227783, "learning_rate": 8.506658668149886e-05, "loss": 0.3887, "step": 539 }, { "epoch": 0.2937576499388005, "grad_norm": 3.956324815750122, "learning_rate": 8.500210372847127e-05, "loss": 0.4797, "step": 540 }, { "epoch": 0.2943016455868353, "grad_norm": 3.2343764305114746, "learning_rate": 8.493750641015505e-05, "loss": 0.4163, "step": 541 }, { "epoch": 0.2948456412348701, "grad_norm": 2.953278064727783, "learning_rate": 8.487279493761437e-05, "loss": 0.2884, "step": 542 }, { "epoch": 0.2953896368829049, "grad_norm": 4.27315616607666, "learning_rate": 8.480796952228644e-05, "loss": 0.5698, "step": 543 }, { "epoch": 0.29593363253093974, "grad_norm": 4.479121685028076, "learning_rate": 8.474303037598074e-05, "loss": 0.3501, "step": 544 }, { "epoch": 0.29647762817897455, "grad_norm": 3.648958206176758, "learning_rate": 8.467797771087833e-05, "loss": 0.3139, "step": 545 }, { "epoch": 0.29702162382700936, "grad_norm": 2.9666337966918945, "learning_rate": 8.461281173953119e-05, "loss": 0.2718, "step": 546 }, { "epoch": 0.2975656194750442, "grad_norm": 3.672086238861084, "learning_rate": 8.454753267486158e-05, "loss": 0.3503, "step": 547 }, { "epoch": 0.29810961512307904, "grad_norm": 3.907465934753418, "learning_rate": 8.448214073016116e-05, "loss": 0.5314, "step": 548 }, { "epoch": 0.29865361077111385, "grad_norm": 3.2654552459716797, "learning_rate": 8.441663611909053e-05, "loss": 0.2918, "step": 549 }, { "epoch": 0.29919760641914867, "grad_norm": 3.5053250789642334, "learning_rate": 8.435101905567833e-05, "loss": 0.2655, "step": 550 }, { "epoch": 0.2997416020671835, "grad_norm": 2.7899842262268066, "learning_rate": 8.428528975432066e-05, "loss": 0.8069, "step": 551 }, { "epoch": 0.3002855977152183, "grad_norm": 3.4976909160614014, "learning_rate": 8.421944842978036e-05, "loss": 0.7143, "step": 552 }, { "epoch": 0.3008295933632531, "grad_norm": 2.3880560398101807, "learning_rate": 8.415349529718623e-05, "loss": 0.5766, "step": 553 }, { "epoch": 0.3013735890112879, "grad_norm": 2.3048553466796875, "learning_rate": 8.408743057203249e-05, "loss": 0.5326, "step": 554 }, { "epoch": 0.3019175846593227, "grad_norm": 2.5904994010925293, "learning_rate": 8.402125447017788e-05, "loss": 0.5967, "step": 555 }, { "epoch": 0.30246158030735754, "grad_norm": 2.1667346954345703, "learning_rate": 8.395496720784511e-05, "loss": 0.3368, "step": 556 }, { "epoch": 0.30300557595539235, "grad_norm": 2.738069772720337, "learning_rate": 8.388856900162004e-05, "loss": 0.5862, "step": 557 }, { "epoch": 0.30354957160342716, "grad_norm": 3.325639009475708, "learning_rate": 8.38220600684511e-05, "loss": 0.5497, "step": 558 }, { "epoch": 0.30409356725146197, "grad_norm": 2.720137357711792, "learning_rate": 8.375544062564844e-05, "loss": 0.4025, "step": 559 }, { "epoch": 0.3046375628994968, "grad_norm": 2.2373790740966797, "learning_rate": 8.368871089088333e-05, "loss": 0.5544, "step": 560 }, { "epoch": 0.3051815585475316, "grad_norm": 2.769669771194458, "learning_rate": 8.362187108218737e-05, "loss": 0.6729, "step": 561 }, { "epoch": 0.30572555419556646, "grad_norm": 2.4594335556030273, "learning_rate": 8.355492141795185e-05, "loss": 0.5694, "step": 562 }, { "epoch": 0.3062695498436013, "grad_norm": 2.5098025798797607, "learning_rate": 8.348786211692697e-05, "loss": 0.5048, "step": 563 }, { "epoch": 0.3068135454916361, "grad_norm": 2.231506824493408, "learning_rate": 8.342069339822119e-05, "loss": 0.3928, "step": 564 }, { "epoch": 0.3073575411396709, "grad_norm": 2.2407584190368652, "learning_rate": 8.335341548130045e-05, "loss": 0.4558, "step": 565 }, { "epoch": 0.3079015367877057, "grad_norm": 5.452847957611084, "learning_rate": 8.328602858598747e-05, "loss": 0.5858, "step": 566 }, { "epoch": 0.3084455324357405, "grad_norm": 2.296234369277954, "learning_rate": 8.321853293246109e-05, "loss": 0.2619, "step": 567 }, { "epoch": 0.30898952808377533, "grad_norm": 2.5752458572387695, "learning_rate": 8.31509287412555e-05, "loss": 0.3166, "step": 568 }, { "epoch": 0.30953352373181015, "grad_norm": 2.086369752883911, "learning_rate": 8.308321623325944e-05, "loss": 0.3188, "step": 569 }, { "epoch": 0.31007751937984496, "grad_norm": 1.696516513824463, "learning_rate": 8.301539562971569e-05, "loss": 0.1904, "step": 570 }, { "epoch": 0.31062151502787977, "grad_norm": 2.6791365146636963, "learning_rate": 8.294746715222012e-05, "loss": 0.2184, "step": 571 }, { "epoch": 0.3111655106759146, "grad_norm": 4.0010151863098145, "learning_rate": 8.287943102272111e-05, "loss": 0.3646, "step": 572 }, { "epoch": 0.3117095063239494, "grad_norm": 3.951964855194092, "learning_rate": 8.281128746351879e-05, "loss": 0.5919, "step": 573 }, { "epoch": 0.3122535019719842, "grad_norm": 2.759225845336914, "learning_rate": 8.274303669726426e-05, "loss": 0.3338, "step": 574 }, { "epoch": 0.312797497620019, "grad_norm": 3.2935709953308105, "learning_rate": 8.267467894695894e-05, "loss": 0.3954, "step": 575 }, { "epoch": 0.31334149326805383, "grad_norm": 2.391289234161377, "learning_rate": 8.260621443595382e-05, "loss": 0.2935, "step": 576 }, { "epoch": 0.3138854889160887, "grad_norm": 3.461540937423706, "learning_rate": 8.253764338794867e-05, "loss": 0.4573, "step": 577 }, { "epoch": 0.3144294845641235, "grad_norm": 2.262362241744995, "learning_rate": 8.246896602699142e-05, "loss": 0.2355, "step": 578 }, { "epoch": 0.3149734802121583, "grad_norm": 3.401052951812744, "learning_rate": 8.240018257747728e-05, "loss": 0.5041, "step": 579 }, { "epoch": 0.31551747586019313, "grad_norm": 3.0692005157470703, "learning_rate": 8.233129326414819e-05, "loss": 0.4064, "step": 580 }, { "epoch": 0.31606147150822794, "grad_norm": 2.797919988632202, "learning_rate": 8.226229831209194e-05, "loss": 0.3522, "step": 581 }, { "epoch": 0.31660546715626275, "grad_norm": 2.740946054458618, "learning_rate": 8.219319794674145e-05, "loss": 0.3731, "step": 582 }, { "epoch": 0.31714946280429757, "grad_norm": 4.093087673187256, "learning_rate": 8.212399239387412e-05, "loss": 0.4848, "step": 583 }, { "epoch": 0.3176934584523324, "grad_norm": 2.516523599624634, "learning_rate": 8.2054681879611e-05, "loss": 0.2369, "step": 584 }, { "epoch": 0.3182374541003672, "grad_norm": 3.26690936088562, "learning_rate": 8.198526663041615e-05, "loss": 0.3477, "step": 585 }, { "epoch": 0.318781449748402, "grad_norm": 3.6261494159698486, "learning_rate": 8.191574687309575e-05, "loss": 0.421, "step": 586 }, { "epoch": 0.3193254453964368, "grad_norm": 3.1609513759613037, "learning_rate": 8.184612283479748e-05, "loss": 0.4924, "step": 587 }, { "epoch": 0.3198694410444716, "grad_norm": 2.0844192504882812, "learning_rate": 8.177639474300983e-05, "loss": 0.1931, "step": 588 }, { "epoch": 0.32041343669250644, "grad_norm": 3.4553945064544678, "learning_rate": 8.170656282556113e-05, "loss": 0.2404, "step": 589 }, { "epoch": 0.32095743234054125, "grad_norm": 2.9863662719726562, "learning_rate": 8.163662731061907e-05, "loss": 0.4048, "step": 590 }, { "epoch": 0.3215014279885761, "grad_norm": 3.3357396125793457, "learning_rate": 8.156658842668974e-05, "loss": 0.3496, "step": 591 }, { "epoch": 0.32204542363661093, "grad_norm": 3.472661256790161, "learning_rate": 8.149644640261704e-05, "loss": 0.4723, "step": 592 }, { "epoch": 0.32258941928464574, "grad_norm": 3.100168466567993, "learning_rate": 8.142620146758186e-05, "loss": 0.3781, "step": 593 }, { "epoch": 0.32313341493268055, "grad_norm": 3.313366174697876, "learning_rate": 8.13558538511013e-05, "loss": 0.4615, "step": 594 }, { "epoch": 0.32367741058071536, "grad_norm": 2.2047817707061768, "learning_rate": 8.1285403783028e-05, "loss": 0.2364, "step": 595 }, { "epoch": 0.3242214062287502, "grad_norm": 2.3736391067504883, "learning_rate": 8.121485149354933e-05, "loss": 0.2266, "step": 596 }, { "epoch": 0.324765401876785, "grad_norm": 3.0762939453125, "learning_rate": 8.114419721318667e-05, "loss": 0.408, "step": 597 }, { "epoch": 0.3253093975248198, "grad_norm": 2.6867945194244385, "learning_rate": 8.107344117279463e-05, "loss": 0.2425, "step": 598 }, { "epoch": 0.3258533931728546, "grad_norm": 2.473153591156006, "learning_rate": 8.100258360356032e-05, "loss": 0.2521, "step": 599 }, { "epoch": 0.3263973888208894, "grad_norm": 3.506434679031372, "learning_rate": 8.093162473700256e-05, "loss": 0.2034, "step": 600 }, { "epoch": 0.32694138446892423, "grad_norm": 2.474342107772827, "learning_rate": 8.086056480497117e-05, "loss": 0.7031, "step": 601 }, { "epoch": 0.32748538011695905, "grad_norm": 3.2538814544677734, "learning_rate": 8.078940403964618e-05, "loss": 0.7621, "step": 602 }, { "epoch": 0.32802937576499386, "grad_norm": 3.200930118560791, "learning_rate": 8.071814267353712e-05, "loss": 0.6281, "step": 603 }, { "epoch": 0.32857337141302867, "grad_norm": 2.7716193199157715, "learning_rate": 8.064678093948215e-05, "loss": 0.5629, "step": 604 }, { "epoch": 0.32911736706106354, "grad_norm": 2.4081013202667236, "learning_rate": 8.05753190706474e-05, "loss": 0.5004, "step": 605 }, { "epoch": 0.32966136270909835, "grad_norm": 3.7836034297943115, "learning_rate": 8.050375730052621e-05, "loss": 0.4399, "step": 606 }, { "epoch": 0.33020535835713316, "grad_norm": 2.703606605529785, "learning_rate": 8.043209586293832e-05, "loss": 0.4822, "step": 607 }, { "epoch": 0.330749354005168, "grad_norm": 2.0203452110290527, "learning_rate": 8.03603349920291e-05, "loss": 0.476, "step": 608 }, { "epoch": 0.3312933496532028, "grad_norm": 2.3296728134155273, "learning_rate": 8.02884749222688e-05, "loss": 0.4039, "step": 609 }, { "epoch": 0.3318373453012376, "grad_norm": 2.964188575744629, "learning_rate": 8.021651588845184e-05, "loss": 0.6935, "step": 610 }, { "epoch": 0.3323813409492724, "grad_norm": 2.9720089435577393, "learning_rate": 8.014445812569595e-05, "loss": 0.4919, "step": 611 }, { "epoch": 0.3329253365973072, "grad_norm": 1.5845167636871338, "learning_rate": 8.007230186944146e-05, "loss": 0.1891, "step": 612 }, { "epoch": 0.33346933224534203, "grad_norm": 2.4344890117645264, "learning_rate": 8.000004735545053e-05, "loss": 0.4655, "step": 613 }, { "epoch": 0.33401332789337684, "grad_norm": 2.04115629196167, "learning_rate": 7.992769481980631e-05, "loss": 0.3021, "step": 614 }, { "epoch": 0.33455732354141166, "grad_norm": 2.868072748184204, "learning_rate": 7.985524449891231e-05, "loss": 0.5381, "step": 615 }, { "epoch": 0.33510131918944647, "grad_norm": 1.7905510663986206, "learning_rate": 7.97826966294915e-05, "loss": 0.1587, "step": 616 }, { "epoch": 0.3356453148374813, "grad_norm": 2.690568447113037, "learning_rate": 7.971005144858553e-05, "loss": 0.3672, "step": 617 }, { "epoch": 0.3361893104855161, "grad_norm": 2.744029998779297, "learning_rate": 7.963730919355408e-05, "loss": 0.1915, "step": 618 }, { "epoch": 0.33673330613355096, "grad_norm": 2.651421308517456, "learning_rate": 7.9564470102074e-05, "loss": 0.4016, "step": 619 }, { "epoch": 0.33727730178158577, "grad_norm": 2.361420154571533, "learning_rate": 7.949153441213849e-05, "loss": 0.222, "step": 620 }, { "epoch": 0.3378212974296206, "grad_norm": 2.381561517715454, "learning_rate": 7.941850236205639e-05, "loss": 0.2244, "step": 621 }, { "epoch": 0.3383652930776554, "grad_norm": 2.808429479598999, "learning_rate": 7.934537419045143e-05, "loss": 0.3426, "step": 622 }, { "epoch": 0.3389092887256902, "grad_norm": 2.1379776000976562, "learning_rate": 7.927215013626137e-05, "loss": 0.2723, "step": 623 }, { "epoch": 0.339453284373725, "grad_norm": 3.124458074569702, "learning_rate": 7.919883043873725e-05, "loss": 0.3619, "step": 624 }, { "epoch": 0.33999728002175983, "grad_norm": 2.4810128211975098, "learning_rate": 7.912541533744263e-05, "loss": 0.3674, "step": 625 }, { "epoch": 0.34054127566979464, "grad_norm": 2.3322668075561523, "learning_rate": 7.905190507225278e-05, "loss": 0.3221, "step": 626 }, { "epoch": 0.34108527131782945, "grad_norm": 2.7971115112304688, "learning_rate": 7.897829988335394e-05, "loss": 0.2038, "step": 627 }, { "epoch": 0.34162926696586426, "grad_norm": 3.3051013946533203, "learning_rate": 7.890460001124242e-05, "loss": 0.3453, "step": 628 }, { "epoch": 0.3421732626138991, "grad_norm": 3.0126700401306152, "learning_rate": 7.883080569672401e-05, "loss": 0.4533, "step": 629 }, { "epoch": 0.3427172582619339, "grad_norm": 1.787818193435669, "learning_rate": 7.875691718091295e-05, "loss": 0.1656, "step": 630 }, { "epoch": 0.3432612539099687, "grad_norm": 1.958438754081726, "learning_rate": 7.868293470523143e-05, "loss": 0.1185, "step": 631 }, { "epoch": 0.3438052495580035, "grad_norm": 2.7631659507751465, "learning_rate": 7.860885851140847e-05, "loss": 0.3842, "step": 632 }, { "epoch": 0.3443492452060384, "grad_norm": 3.9796078205108643, "learning_rate": 7.853468884147943e-05, "loss": 0.3844, "step": 633 }, { "epoch": 0.3448932408540732, "grad_norm": 2.9881227016448975, "learning_rate": 7.846042593778506e-05, "loss": 0.1861, "step": 634 }, { "epoch": 0.345437236502108, "grad_norm": 8.718302726745605, "learning_rate": 7.83860700429707e-05, "loss": 0.6502, "step": 635 }, { "epoch": 0.3459812321501428, "grad_norm": 2.949648141860962, "learning_rate": 7.831162139998557e-05, "loss": 0.2691, "step": 636 }, { "epoch": 0.3465252277981776, "grad_norm": 2.5569653511047363, "learning_rate": 7.823708025208193e-05, "loss": 0.3289, "step": 637 }, { "epoch": 0.34706922344621244, "grad_norm": 3.7310521602630615, "learning_rate": 7.816244684281426e-05, "loss": 0.3348, "step": 638 }, { "epoch": 0.34761321909424725, "grad_norm": 4.717464447021484, "learning_rate": 7.808772141603855e-05, "loss": 0.4284, "step": 639 }, { "epoch": 0.34815721474228206, "grad_norm": 2.435135841369629, "learning_rate": 7.801290421591136e-05, "loss": 0.3545, "step": 640 }, { "epoch": 0.3487012103903169, "grad_norm": 2.11711049079895, "learning_rate": 7.793799548688922e-05, "loss": 0.1774, "step": 641 }, { "epoch": 0.3492452060383517, "grad_norm": 2.951214551925659, "learning_rate": 7.78629954737276e-05, "loss": 0.3587, "step": 642 }, { "epoch": 0.3497892016863865, "grad_norm": 2.651959180831909, "learning_rate": 7.778790442148033e-05, "loss": 0.2702, "step": 643 }, { "epoch": 0.3503331973344213, "grad_norm": 2.6745662689208984, "learning_rate": 7.771272257549864e-05, "loss": 0.2698, "step": 644 }, { "epoch": 0.3508771929824561, "grad_norm": 3.514754056930542, "learning_rate": 7.763745018143044e-05, "loss": 0.4086, "step": 645 }, { "epoch": 0.35142118863049093, "grad_norm": 3.210787773132324, "learning_rate": 7.75620874852195e-05, "loss": 0.3537, "step": 646 }, { "epoch": 0.35196518427852574, "grad_norm": 3.8721444606781006, "learning_rate": 7.748663473310461e-05, "loss": 0.2008, "step": 647 }, { "epoch": 0.3525091799265606, "grad_norm": 3.3858845233917236, "learning_rate": 7.741109217161886e-05, "loss": 0.3809, "step": 648 }, { "epoch": 0.3530531755745954, "grad_norm": 3.6042556762695312, "learning_rate": 7.733546004758873e-05, "loss": 0.2735, "step": 649 }, { "epoch": 0.35359717122263024, "grad_norm": 1.8476146459579468, "learning_rate": 7.725973860813338e-05, "loss": 0.0906, "step": 650 }, { "epoch": 0.35414116687066505, "grad_norm": 2.5128464698791504, "learning_rate": 7.718392810066377e-05, "loss": 0.8658, "step": 651 }, { "epoch": 0.35468516251869986, "grad_norm": 2.144083261489868, "learning_rate": 7.710802877288187e-05, "loss": 0.4537, "step": 652 }, { "epoch": 0.35522915816673467, "grad_norm": 3.0932605266571045, "learning_rate": 7.703204087277988e-05, "loss": 0.8637, "step": 653 }, { "epoch": 0.3557731538147695, "grad_norm": 2.9740941524505615, "learning_rate": 7.695596464863941e-05, "loss": 0.5985, "step": 654 }, { "epoch": 0.3563171494628043, "grad_norm": 3.6099016666412354, "learning_rate": 7.687980034903061e-05, "loss": 0.5461, "step": 655 }, { "epoch": 0.3568611451108391, "grad_norm": 1.9659099578857422, "learning_rate": 7.680354822281148e-05, "loss": 0.2917, "step": 656 }, { "epoch": 0.3574051407588739, "grad_norm": 2.0817372798919678, "learning_rate": 7.672720851912693e-05, "loss": 0.327, "step": 657 }, { "epoch": 0.35794913640690873, "grad_norm": 2.4489848613739014, "learning_rate": 7.665078148740801e-05, "loss": 0.5234, "step": 658 }, { "epoch": 0.35849313205494354, "grad_norm": 2.659283399581909, "learning_rate": 7.657426737737115e-05, "loss": 0.5369, "step": 659 }, { "epoch": 0.35903712770297835, "grad_norm": 1.7705096006393433, "learning_rate": 7.649766643901726e-05, "loss": 0.3108, "step": 660 }, { "epoch": 0.35958112335101317, "grad_norm": 2.148360013961792, "learning_rate": 7.642097892263098e-05, "loss": 0.4436, "step": 661 }, { "epoch": 0.36012511899904803, "grad_norm": 2.1810200214385986, "learning_rate": 7.634420507877979e-05, "loss": 0.365, "step": 662 }, { "epoch": 0.36066911464708284, "grad_norm": 2.501575469970703, "learning_rate": 7.626734515831331e-05, "loss": 0.419, "step": 663 }, { "epoch": 0.36121311029511766, "grad_norm": 2.5353164672851562, "learning_rate": 7.619039941236233e-05, "loss": 0.5144, "step": 664 }, { "epoch": 0.36175710594315247, "grad_norm": 2.6715505123138428, "learning_rate": 7.611336809233807e-05, "loss": 0.4825, "step": 665 }, { "epoch": 0.3623011015911873, "grad_norm": 2.629690408706665, "learning_rate": 7.603625144993145e-05, "loss": 0.4004, "step": 666 }, { "epoch": 0.3628450972392221, "grad_norm": 2.1490893363952637, "learning_rate": 7.595904973711202e-05, "loss": 0.3139, "step": 667 }, { "epoch": 0.3633890928872569, "grad_norm": 1.984887719154358, "learning_rate": 7.588176320612741e-05, "loss": 0.287, "step": 668 }, { "epoch": 0.3639330885352917, "grad_norm": 2.0138869285583496, "learning_rate": 7.580439210950232e-05, "loss": 0.1599, "step": 669 }, { "epoch": 0.3644770841833265, "grad_norm": 2.370619297027588, "learning_rate": 7.57269367000378e-05, "loss": 0.3357, "step": 670 }, { "epoch": 0.36502107983136134, "grad_norm": 2.0663440227508545, "learning_rate": 7.564939723081035e-05, "loss": 0.21, "step": 671 }, { "epoch": 0.36556507547939615, "grad_norm": 15.536395072937012, "learning_rate": 7.557177395517112e-05, "loss": 0.2583, "step": 672 }, { "epoch": 0.36610907112743096, "grad_norm": 2.5944101810455322, "learning_rate": 7.549406712674511e-05, "loss": 0.3117, "step": 673 }, { "epoch": 0.3666530667754658, "grad_norm": 2.4052445888519287, "learning_rate": 7.541627699943035e-05, "loss": 0.2419, "step": 674 }, { "epoch": 0.3671970624235006, "grad_norm": 1.9298328161239624, "learning_rate": 7.533840382739693e-05, "loss": 0.2248, "step": 675 }, { "epoch": 0.36774105807153545, "grad_norm": 2.1662416458129883, "learning_rate": 7.526044786508641e-05, "loss": 0.1349, "step": 676 }, { "epoch": 0.36828505371957027, "grad_norm": 3.6624107360839844, "learning_rate": 7.518240936721077e-05, "loss": 0.5115, "step": 677 }, { "epoch": 0.3688290493676051, "grad_norm": 1.9071080684661865, "learning_rate": 7.510428858875172e-05, "loss": 0.1488, "step": 678 }, { "epoch": 0.3693730450156399, "grad_norm": 4.12332820892334, "learning_rate": 7.502608578495975e-05, "loss": 0.2976, "step": 679 }, { "epoch": 0.3699170406636747, "grad_norm": 3.5102076530456543, "learning_rate": 7.494780121135343e-05, "loss": 0.5541, "step": 680 }, { "epoch": 0.3704610363117095, "grad_norm": 3.98101806640625, "learning_rate": 7.486943512371842e-05, "loss": 0.3935, "step": 681 }, { "epoch": 0.3710050319597443, "grad_norm": 3.383446455001831, "learning_rate": 7.479098777810683e-05, "loss": 0.5077, "step": 682 }, { "epoch": 0.37154902760777914, "grad_norm": 2.465677261352539, "learning_rate": 7.471245943083615e-05, "loss": 0.1777, "step": 683 }, { "epoch": 0.37209302325581395, "grad_norm": 3.382866382598877, "learning_rate": 7.46338503384886e-05, "loss": 0.2249, "step": 684 }, { "epoch": 0.37263701890384876, "grad_norm": 3.7749321460723877, "learning_rate": 7.455516075791023e-05, "loss": 0.4017, "step": 685 }, { "epoch": 0.37318101455188357, "grad_norm": 3.293405771255493, "learning_rate": 7.447639094621008e-05, "loss": 0.4642, "step": 686 }, { "epoch": 0.3737250101999184, "grad_norm": 4.321906089782715, "learning_rate": 7.439754116075926e-05, "loss": 0.4099, "step": 687 }, { "epoch": 0.3742690058479532, "grad_norm": 3.31119966506958, "learning_rate": 7.431861165919027e-05, "loss": 0.4311, "step": 688 }, { "epoch": 0.374813001495988, "grad_norm": 3.4781033992767334, "learning_rate": 7.423960269939605e-05, "loss": 0.3825, "step": 689 }, { "epoch": 0.3753569971440229, "grad_norm": 3.230344772338867, "learning_rate": 7.416051453952917e-05, "loss": 0.5037, "step": 690 }, { "epoch": 0.3759009927920577, "grad_norm": 2.330538272857666, "learning_rate": 7.408134743800092e-05, "loss": 0.2517, "step": 691 }, { "epoch": 0.3764449884400925, "grad_norm": 2.9852328300476074, "learning_rate": 7.400210165348058e-05, "loss": 0.2938, "step": 692 }, { "epoch": 0.3769889840881273, "grad_norm": 3.251340866088867, "learning_rate": 7.392277744489451e-05, "loss": 0.3794, "step": 693 }, { "epoch": 0.3775329797361621, "grad_norm": 1.9832584857940674, "learning_rate": 7.384337507142531e-05, "loss": 0.1748, "step": 694 }, { "epoch": 0.37807697538419693, "grad_norm": 1.865557074546814, "learning_rate": 7.376389479251095e-05, "loss": 0.1585, "step": 695 }, { "epoch": 0.37862097103223175, "grad_norm": 3.10884952545166, "learning_rate": 7.368433686784397e-05, "loss": 0.3733, "step": 696 }, { "epoch": 0.37916496668026656, "grad_norm": 3.314511299133301, "learning_rate": 7.360470155737061e-05, "loss": 0.391, "step": 697 }, { "epoch": 0.37970896232830137, "grad_norm": 3.0642800331115723, "learning_rate": 7.352498912128996e-05, "loss": 0.3023, "step": 698 }, { "epoch": 0.3802529579763362, "grad_norm": 2.3845198154449463, "learning_rate": 7.344519982005305e-05, "loss": 0.1516, "step": 699 }, { "epoch": 0.380796953624371, "grad_norm": 3.0566601753234863, "learning_rate": 7.336533391436218e-05, "loss": 0.1618, "step": 700 }, { "epoch": 0.3813409492724058, "grad_norm": 2.486022472381592, "learning_rate": 7.328539166516983e-05, "loss": 0.7428, "step": 701 }, { "epoch": 0.3818849449204406, "grad_norm": 2.2583043575286865, "learning_rate": 7.3205373333678e-05, "loss": 0.5397, "step": 702 }, { "epoch": 0.38242894056847543, "grad_norm": 2.398348093032837, "learning_rate": 7.312527918133722e-05, "loss": 0.518, "step": 703 }, { "epoch": 0.38297293621651024, "grad_norm": 2.3080129623413086, "learning_rate": 7.30451094698458e-05, "loss": 0.5381, "step": 704 }, { "epoch": 0.3835169318645451, "grad_norm": 2.569821834564209, "learning_rate": 7.296486446114889e-05, "loss": 0.4637, "step": 705 }, { "epoch": 0.3840609275125799, "grad_norm": 2.3830647468566895, "learning_rate": 7.288454441743774e-05, "loss": 0.4895, "step": 706 }, { "epoch": 0.38460492316061473, "grad_norm": 3.5982468128204346, "learning_rate": 7.280414960114868e-05, "loss": 0.334, "step": 707 }, { "epoch": 0.38514891880864954, "grad_norm": 2.4882588386535645, "learning_rate": 7.27236802749624e-05, "loss": 0.5358, "step": 708 }, { "epoch": 0.38569291445668435, "grad_norm": 2.2346973419189453, "learning_rate": 7.264313670180301e-05, "loss": 0.3802, "step": 709 }, { "epoch": 0.38623691010471917, "grad_norm": 2.290409564971924, "learning_rate": 7.256251914483727e-05, "loss": 0.323, "step": 710 }, { "epoch": 0.386780905752754, "grad_norm": 2.728996515274048, "learning_rate": 7.248182786747362e-05, "loss": 0.4361, "step": 711 }, { "epoch": 0.3873249014007888, "grad_norm": 10.44469165802002, "learning_rate": 7.240106313336139e-05, "loss": 0.2395, "step": 712 }, { "epoch": 0.3878688970488236, "grad_norm": 2.3238625526428223, "learning_rate": 7.232022520638991e-05, "loss": 0.3174, "step": 713 }, { "epoch": 0.3884128926968584, "grad_norm": 1.977661371231079, "learning_rate": 7.22393143506877e-05, "loss": 0.3069, "step": 714 }, { "epoch": 0.3889568883448932, "grad_norm": 4.2777276039123535, "learning_rate": 7.215833083062155e-05, "loss": 0.2511, "step": 715 }, { "epoch": 0.38950088399292804, "grad_norm": 2.488213300704956, "learning_rate": 7.20772749107956e-05, "loss": 0.2843, "step": 716 }, { "epoch": 0.39004487964096285, "grad_norm": 2.450596809387207, "learning_rate": 7.199614685605066e-05, "loss": 0.2599, "step": 717 }, { "epoch": 0.39058887528899766, "grad_norm": 2.6611263751983643, "learning_rate": 7.191494693146314e-05, "loss": 0.2623, "step": 718 }, { "epoch": 0.39113287093703253, "grad_norm": 2.594430685043335, "learning_rate": 7.183367540234436e-05, "loss": 0.3281, "step": 719 }, { "epoch": 0.39167686658506734, "grad_norm": 2.1042861938476562, "learning_rate": 7.175233253423951e-05, "loss": 0.2063, "step": 720 }, { "epoch": 0.39222086223310215, "grad_norm": 2.158658027648926, "learning_rate": 7.167091859292695e-05, "loss": 0.2136, "step": 721 }, { "epoch": 0.39276485788113696, "grad_norm": 2.9247045516967773, "learning_rate": 7.158943384441721e-05, "loss": 0.3138, "step": 722 }, { "epoch": 0.3933088535291718, "grad_norm": 1.6331703662872314, "learning_rate": 7.150787855495225e-05, "loss": 0.2215, "step": 723 }, { "epoch": 0.3938528491772066, "grad_norm": 2.4122707843780518, "learning_rate": 7.142625299100437e-05, "loss": 0.3149, "step": 724 }, { "epoch": 0.3943968448252414, "grad_norm": 3.1604197025299072, "learning_rate": 7.134455741927564e-05, "loss": 0.4568, "step": 725 }, { "epoch": 0.3949408404732762, "grad_norm": 3.2514188289642334, "learning_rate": 7.126279210669677e-05, "loss": 0.4522, "step": 726 }, { "epoch": 0.395484836121311, "grad_norm": 1.948143482208252, "learning_rate": 7.118095732042643e-05, "loss": 0.2229, "step": 727 }, { "epoch": 0.39602883176934583, "grad_norm": 2.3446531295776367, "learning_rate": 7.109905332785014e-05, "loss": 0.1642, "step": 728 }, { "epoch": 0.39657282741738065, "grad_norm": 2.431694746017456, "learning_rate": 7.101708039657972e-05, "loss": 0.2649, "step": 729 }, { "epoch": 0.39711682306541546, "grad_norm": 1.7262428998947144, "learning_rate": 7.093503879445212e-05, "loss": 0.1267, "step": 730 }, { "epoch": 0.39766081871345027, "grad_norm": 2.3356051445007324, "learning_rate": 7.08529287895287e-05, "loss": 0.2102, "step": 731 }, { "epoch": 0.3982048143614851, "grad_norm": 2.676107406616211, "learning_rate": 7.077075065009433e-05, "loss": 0.2708, "step": 732 }, { "epoch": 0.39874881000951995, "grad_norm": 2.450868844985962, "learning_rate": 7.068850464465648e-05, "loss": 0.2583, "step": 733 }, { "epoch": 0.39929280565755476, "grad_norm": 2.274522542953491, "learning_rate": 7.060619104194436e-05, "loss": 0.2882, "step": 734 }, { "epoch": 0.3998368013055896, "grad_norm": 2.6707513332366943, "learning_rate": 7.05238101109081e-05, "loss": 0.347, "step": 735 }, { "epoch": 0.4003807969536244, "grad_norm": 2.443697452545166, "learning_rate": 7.044136212071773e-05, "loss": 0.2032, "step": 736 }, { "epoch": 0.4009247926016592, "grad_norm": 1.4928048849105835, "learning_rate": 7.035884734076246e-05, "loss": 0.0992, "step": 737 }, { "epoch": 0.401468788249694, "grad_norm": 2.205641746520996, "learning_rate": 7.027626604064969e-05, "loss": 0.2106, "step": 738 }, { "epoch": 0.4020127838977288, "grad_norm": 3.015530824661255, "learning_rate": 7.019361849020421e-05, "loss": 0.2839, "step": 739 }, { "epoch": 0.40255677954576363, "grad_norm": 2.4785006046295166, "learning_rate": 7.011090495946722e-05, "loss": 0.3206, "step": 740 }, { "epoch": 0.40310077519379844, "grad_norm": 2.8082737922668457, "learning_rate": 7.002812571869552e-05, "loss": 0.2514, "step": 741 }, { "epoch": 0.40364477084183326, "grad_norm": 2.8378779888153076, "learning_rate": 6.994528103836062e-05, "loss": 0.2214, "step": 742 }, { "epoch": 0.40418876648986807, "grad_norm": 2.725118398666382, "learning_rate": 6.986237118914788e-05, "loss": 0.2974, "step": 743 }, { "epoch": 0.4047327621379029, "grad_norm": 3.1860225200653076, "learning_rate": 6.97793964419555e-05, "loss": 0.3756, "step": 744 }, { "epoch": 0.4052767577859377, "grad_norm": 2.0790181159973145, "learning_rate": 6.969635706789383e-05, "loss": 0.1498, "step": 745 }, { "epoch": 0.4058207534339725, "grad_norm": 2.396280527114868, "learning_rate": 6.96132533382843e-05, "loss": 0.2301, "step": 746 }, { "epoch": 0.40636474908200737, "grad_norm": 1.9975603818893433, "learning_rate": 6.953008552465868e-05, "loss": 0.1524, "step": 747 }, { "epoch": 0.4069087447300422, "grad_norm": 1.9771376848220825, "learning_rate": 6.944685389875804e-05, "loss": 0.1298, "step": 748 }, { "epoch": 0.407452740378077, "grad_norm": 3.6680123805999756, "learning_rate": 6.936355873253206e-05, "loss": 0.295, "step": 749 }, { "epoch": 0.4079967360261118, "grad_norm": 2.7139129638671875, "learning_rate": 6.928020029813793e-05, "loss": 0.0705, "step": 750 }, { "epoch": 0.4085407316741466, "grad_norm": 2.5253779888153076, "learning_rate": 6.919677886793966e-05, "loss": 0.715, "step": 751 }, { "epoch": 0.40908472732218143, "grad_norm": 2.0197973251342773, "learning_rate": 6.911329471450698e-05, "loss": 0.5953, "step": 752 }, { "epoch": 0.40962872297021624, "grad_norm": 28.96085548400879, "learning_rate": 6.902974811061463e-05, "loss": 1.1571, "step": 753 }, { "epoch": 0.41017271861825105, "grad_norm": 3.458620071411133, "learning_rate": 6.894613932924141e-05, "loss": 0.6593, "step": 754 }, { "epoch": 0.41071671426628586, "grad_norm": 2.8579399585723877, "learning_rate": 6.886246864356926e-05, "loss": 0.4831, "step": 755 }, { "epoch": 0.4112607099143207, "grad_norm": 3.0076403617858887, "learning_rate": 6.877873632698234e-05, "loss": 0.5664, "step": 756 }, { "epoch": 0.4118047055623555, "grad_norm": 3.1301205158233643, "learning_rate": 6.869494265306623e-05, "loss": 0.6905, "step": 757 }, { "epoch": 0.4123487012103903, "grad_norm": 17.261075973510742, "learning_rate": 6.8611087895607e-05, "loss": 0.7606, "step": 758 }, { "epoch": 0.4128926968584251, "grad_norm": 2.4415838718414307, "learning_rate": 6.85271723285903e-05, "loss": 0.3939, "step": 759 }, { "epoch": 0.4134366925064599, "grad_norm": 1.8437445163726807, "learning_rate": 6.844319622620039e-05, "loss": 0.3201, "step": 760 }, { "epoch": 0.4139806881544948, "grad_norm": 2.3578732013702393, "learning_rate": 6.835915986281945e-05, "loss": 0.4184, "step": 761 }, { "epoch": 0.4145246838025296, "grad_norm": 1.6673074960708618, "learning_rate": 6.827506351302644e-05, "loss": 0.2176, "step": 762 }, { "epoch": 0.4150686794505644, "grad_norm": 1.3272790908813477, "learning_rate": 6.819090745159639e-05, "loss": 0.1858, "step": 763 }, { "epoch": 0.4156126750985992, "grad_norm": 1.9007993936538696, "learning_rate": 6.81066919534994e-05, "loss": 0.2734, "step": 764 }, { "epoch": 0.41615667074663404, "grad_norm": 1.7832809686660767, "learning_rate": 6.80224172938998e-05, "loss": 0.2132, "step": 765 }, { "epoch": 0.41670066639466885, "grad_norm": 2.427661418914795, "learning_rate": 6.793808374815518e-05, "loss": 0.4166, "step": 766 }, { "epoch": 0.41724466204270366, "grad_norm": 2.2016844749450684, "learning_rate": 6.785369159181556e-05, "loss": 0.3172, "step": 767 }, { "epoch": 0.4177886576907385, "grad_norm": 1.936922550201416, "learning_rate": 6.776924110062248e-05, "loss": 0.3019, "step": 768 }, { "epoch": 0.4183326533387733, "grad_norm": 2.674992322921753, "learning_rate": 6.768473255050804e-05, "loss": 0.2896, "step": 769 }, { "epoch": 0.4188766489868081, "grad_norm": 2.0853171348571777, "learning_rate": 6.760016621759408e-05, "loss": 0.1899, "step": 770 }, { "epoch": 0.4194206446348429, "grad_norm": 1.9417757987976074, "learning_rate": 6.751554237819122e-05, "loss": 0.1521, "step": 771 }, { "epoch": 0.4199646402828777, "grad_norm": 2.429180860519409, "learning_rate": 6.743086130879799e-05, "loss": 0.2624, "step": 772 }, { "epoch": 0.42050863593091253, "grad_norm": 2.4585742950439453, "learning_rate": 6.734612328609986e-05, "loss": 0.2411, "step": 773 }, { "epoch": 0.42105263157894735, "grad_norm": 1.6887688636779785, "learning_rate": 6.726132858696846e-05, "loss": 0.1493, "step": 774 }, { "epoch": 0.42159662722698216, "grad_norm": 3.0125555992126465, "learning_rate": 6.717647748846056e-05, "loss": 0.3807, "step": 775 }, { "epoch": 0.422140622875017, "grad_norm": 3.686215877532959, "learning_rate": 6.709157026781727e-05, "loss": 0.3398, "step": 776 }, { "epoch": 0.42268461852305184, "grad_norm": 1.8543062210083008, "learning_rate": 6.700660720246294e-05, "loss": 0.2073, "step": 777 }, { "epoch": 0.42322861417108665, "grad_norm": 2.22817325592041, "learning_rate": 6.692158857000453e-05, "loss": 0.2115, "step": 778 }, { "epoch": 0.42377260981912146, "grad_norm": 1.455254077911377, "learning_rate": 6.683651464823048e-05, "loss": 0.1167, "step": 779 }, { "epoch": 0.42431660546715627, "grad_norm": 2.715520143508911, "learning_rate": 6.67513857151099e-05, "loss": 0.2104, "step": 780 }, { "epoch": 0.4248606011151911, "grad_norm": 2.1688036918640137, "learning_rate": 6.666620204879164e-05, "loss": 0.1705, "step": 781 }, { "epoch": 0.4254045967632259, "grad_norm": 3.539428472518921, "learning_rate": 6.65809639276034e-05, "loss": 0.4202, "step": 782 }, { "epoch": 0.4259485924112607, "grad_norm": 2.284435272216797, "learning_rate": 6.649567163005078e-05, "loss": 0.1726, "step": 783 }, { "epoch": 0.4264925880592955, "grad_norm": 2.3832223415374756, "learning_rate": 6.64103254348164e-05, "loss": 0.2546, "step": 784 }, { "epoch": 0.42703658370733033, "grad_norm": 2.3883893489837646, "learning_rate": 6.6324925620759e-05, "loss": 0.2311, "step": 785 }, { "epoch": 0.42758057935536514, "grad_norm": 2.6616251468658447, "learning_rate": 6.623947246691253e-05, "loss": 0.1846, "step": 786 }, { "epoch": 0.42812457500339995, "grad_norm": 3.60538911819458, "learning_rate": 6.615396625248516e-05, "loss": 0.3559, "step": 787 }, { "epoch": 0.42866857065143477, "grad_norm": 4.734655380249023, "learning_rate": 6.606840725685849e-05, "loss": 0.4762, "step": 788 }, { "epoch": 0.4292125662994696, "grad_norm": 4.111263275146484, "learning_rate": 6.598279575958653e-05, "loss": 0.3411, "step": 789 }, { "epoch": 0.42975656194750445, "grad_norm": 3.4052531719207764, "learning_rate": 6.589713204039488e-05, "loss": 0.4038, "step": 790 }, { "epoch": 0.43030055759553926, "grad_norm": 2.4519011974334717, "learning_rate": 6.58114163791797e-05, "loss": 0.1846, "step": 791 }, { "epoch": 0.43084455324357407, "grad_norm": 1.496929407119751, "learning_rate": 6.572564905600698e-05, "loss": 0.0873, "step": 792 }, { "epoch": 0.4313885488916089, "grad_norm": 3.7296979427337646, "learning_rate": 6.563983035111137e-05, "loss": 0.3498, "step": 793 }, { "epoch": 0.4319325445396437, "grad_norm": 3.029392957687378, "learning_rate": 6.55539605448955e-05, "loss": 0.2419, "step": 794 }, { "epoch": 0.4324765401876785, "grad_norm": 2.9371817111968994, "learning_rate": 6.546803991792894e-05, "loss": 0.2695, "step": 795 }, { "epoch": 0.4330205358357133, "grad_norm": 2.7627475261688232, "learning_rate": 6.538206875094734e-05, "loss": 0.3083, "step": 796 }, { "epoch": 0.43356453148374813, "grad_norm": 3.1573476791381836, "learning_rate": 6.529604732485139e-05, "loss": 0.3313, "step": 797 }, { "epoch": 0.43410852713178294, "grad_norm": 2.357085943222046, "learning_rate": 6.52099759207061e-05, "loss": 0.2121, "step": 798 }, { "epoch": 0.43465252277981775, "grad_norm": 3.03629732131958, "learning_rate": 6.512385481973973e-05, "loss": 0.2287, "step": 799 }, { "epoch": 0.43519651842785256, "grad_norm": 1.9751111268997192, "learning_rate": 6.503768430334293e-05, "loss": 0.0823, "step": 800 }, { "epoch": 0.4357405140758874, "grad_norm": 3.0520033836364746, "learning_rate": 6.49514646530678e-05, "loss": 0.4652, "step": 801 }, { "epoch": 0.4362845097239222, "grad_norm": 17.71600341796875, "learning_rate": 6.486519615062699e-05, "loss": 0.9364, "step": 802 }, { "epoch": 0.436828505371957, "grad_norm": 2.256815195083618, "learning_rate": 6.477887907789274e-05, "loss": 0.6962, "step": 803 }, { "epoch": 0.43737250101999187, "grad_norm": 2.2563600540161133, "learning_rate": 6.469251371689606e-05, "loss": 0.4989, "step": 804 }, { "epoch": 0.4379164966680267, "grad_norm": 3.366201877593994, "learning_rate": 6.460610034982563e-05, "loss": 0.5523, "step": 805 }, { "epoch": 0.4384604923160615, "grad_norm": 2.9771225452423096, "learning_rate": 6.451963925902707e-05, "loss": 0.6102, "step": 806 }, { "epoch": 0.4390044879640963, "grad_norm": 2.400824546813965, "learning_rate": 6.44331307270019e-05, "loss": 0.3032, "step": 807 }, { "epoch": 0.4395484836121311, "grad_norm": 1.9340862035751343, "learning_rate": 6.434657503640666e-05, "loss": 0.2705, "step": 808 }, { "epoch": 0.4400924792601659, "grad_norm": 2.0820274353027344, "learning_rate": 6.425997247005194e-05, "loss": 0.4102, "step": 809 }, { "epoch": 0.44063647490820074, "grad_norm": 1.9759997129440308, "learning_rate": 6.417332331090155e-05, "loss": 0.3342, "step": 810 }, { "epoch": 0.44118047055623555, "grad_norm": 2.2367966175079346, "learning_rate": 6.408662784207149e-05, "loss": 0.4156, "step": 811 }, { "epoch": 0.44172446620427036, "grad_norm": 1.8610540628433228, "learning_rate": 6.399988634682908e-05, "loss": 0.1845, "step": 812 }, { "epoch": 0.4422684618523052, "grad_norm": 2.3945529460906982, "learning_rate": 6.391309910859203e-05, "loss": 0.1857, "step": 813 }, { "epoch": 0.44281245750034, "grad_norm": 1.9255045652389526, "learning_rate": 6.382626641092752e-05, "loss": 0.2253, "step": 814 }, { "epoch": 0.4433564531483748, "grad_norm": 2.748262643814087, "learning_rate": 6.373938853755126e-05, "loss": 0.3901, "step": 815 }, { "epoch": 0.4439004487964096, "grad_norm": 2.4845709800720215, "learning_rate": 6.365246577232656e-05, "loss": 0.2645, "step": 816 }, { "epoch": 0.4444444444444444, "grad_norm": 3.664116859436035, "learning_rate": 6.356549839926341e-05, "loss": 0.4483, "step": 817 }, { "epoch": 0.4449884400924793, "grad_norm": 1.8046294450759888, "learning_rate": 6.347848670251753e-05, "loss": 0.1835, "step": 818 }, { "epoch": 0.4455324357405141, "grad_norm": 1.4446465969085693, "learning_rate": 6.339143096638949e-05, "loss": 0.1139, "step": 819 }, { "epoch": 0.4460764313885489, "grad_norm": 2.3055546283721924, "learning_rate": 6.330433147532377e-05, "loss": 0.2289, "step": 820 }, { "epoch": 0.4466204270365837, "grad_norm": 1.6309689283370972, "learning_rate": 6.321718851390779e-05, "loss": 0.0713, "step": 821 }, { "epoch": 0.44716442268461853, "grad_norm": 1.947127342224121, "learning_rate": 6.313000236687097e-05, "loss": 0.1332, "step": 822 }, { "epoch": 0.44770841833265335, "grad_norm": 3.2428231239318848, "learning_rate": 6.30427733190839e-05, "loss": 0.2948, "step": 823 }, { "epoch": 0.44825241398068816, "grad_norm": 2.4185853004455566, "learning_rate": 6.295550165555729e-05, "loss": 0.318, "step": 824 }, { "epoch": 0.44879640962872297, "grad_norm": 2.3793704509735107, "learning_rate": 6.286818766144116e-05, "loss": 0.2317, "step": 825 }, { "epoch": 0.4493404052767578, "grad_norm": 2.4064478874206543, "learning_rate": 6.278083162202375e-05, "loss": 0.2236, "step": 826 }, { "epoch": 0.4498844009247926, "grad_norm": 3.1890416145324707, "learning_rate": 6.269343382273073e-05, "loss": 0.274, "step": 827 }, { "epoch": 0.4504283965728274, "grad_norm": 2.439483165740967, "learning_rate": 6.260599454912421e-05, "loss": 0.3033, "step": 828 }, { "epoch": 0.4509723922208622, "grad_norm": 2.3495395183563232, "learning_rate": 6.251851408690183e-05, "loss": 0.2555, "step": 829 }, { "epoch": 0.45151638786889703, "grad_norm": 2.157290458679199, "learning_rate": 6.243099272189577e-05, "loss": 0.2025, "step": 830 }, { "epoch": 0.45206038351693184, "grad_norm": 3.1265320777893066, "learning_rate": 6.234343074007189e-05, "loss": 0.3876, "step": 831 }, { "epoch": 0.45260437916496665, "grad_norm": 2.4126408100128174, "learning_rate": 6.225582842752873e-05, "loss": 0.2588, "step": 832 }, { "epoch": 0.4531483748130015, "grad_norm": 2.3481287956237793, "learning_rate": 6.216818607049665e-05, "loss": 0.2251, "step": 833 }, { "epoch": 0.45369237046103633, "grad_norm": 2.2185287475585938, "learning_rate": 6.208050395533677e-05, "loss": 0.1694, "step": 834 }, { "epoch": 0.45423636610907114, "grad_norm": 2.4583792686462402, "learning_rate": 6.199278236854023e-05, "loss": 0.168, "step": 835 }, { "epoch": 0.45478036175710596, "grad_norm": 2.144270658493042, "learning_rate": 6.190502159672701e-05, "loss": 0.2396, "step": 836 }, { "epoch": 0.45532435740514077, "grad_norm": 2.8691868782043457, "learning_rate": 6.181722192664525e-05, "loss": 0.2354, "step": 837 }, { "epoch": 0.4558683530531756, "grad_norm": 2.6682605743408203, "learning_rate": 6.17293836451701e-05, "loss": 0.2581, "step": 838 }, { "epoch": 0.4564123487012104, "grad_norm": 3.7372400760650635, "learning_rate": 6.164150703930287e-05, "loss": 0.5189, "step": 839 }, { "epoch": 0.4569563443492452, "grad_norm": 1.5731239318847656, "learning_rate": 6.155359239617015e-05, "loss": 0.0835, "step": 840 }, { "epoch": 0.45750033999728, "grad_norm": 2.7574849128723145, "learning_rate": 6.146564000302277e-05, "loss": 0.3853, "step": 841 }, { "epoch": 0.4580443356453148, "grad_norm": 3.3258438110351562, "learning_rate": 6.137765014723488e-05, "loss": 0.3787, "step": 842 }, { "epoch": 0.45858833129334964, "grad_norm": 3.6028354167938232, "learning_rate": 6.128962311630309e-05, "loss": 0.3636, "step": 843 }, { "epoch": 0.45913232694138445, "grad_norm": 2.4243099689483643, "learning_rate": 6.120155919784543e-05, "loss": 0.1675, "step": 844 }, { "epoch": 0.45967632258941926, "grad_norm": 2.3988702297210693, "learning_rate": 6.111345867960051e-05, "loss": 0.2123, "step": 845 }, { "epoch": 0.4602203182374541, "grad_norm": 2.4450230598449707, "learning_rate": 6.1025321849426456e-05, "loss": 0.2538, "step": 846 }, { "epoch": 0.46076431388548894, "grad_norm": 3.0369746685028076, "learning_rate": 6.093714899530009e-05, "loss": 0.258, "step": 847 }, { "epoch": 0.46130830953352375, "grad_norm": 3.4322900772094727, "learning_rate": 6.08489404053159e-05, "loss": 0.3118, "step": 848 }, { "epoch": 0.46185230518155856, "grad_norm": 2.5383412837982178, "learning_rate": 6.076069636768521e-05, "loss": 0.1327, "step": 849 }, { "epoch": 0.4623963008295934, "grad_norm": 2.0237081050872803, "learning_rate": 6.0672417170735054e-05, "loss": 0.1233, "step": 850 }, { "epoch": 0.4629402964776282, "grad_norm": 1.466526985168457, "learning_rate": 6.058410310290743e-05, "loss": 0.2182, "step": 851 }, { "epoch": 0.463484292125663, "grad_norm": 2.3943369388580322, "learning_rate": 6.0495754452758245e-05, "loss": 0.681, "step": 852 }, { "epoch": 0.4640282877736978, "grad_norm": 1.8700807094573975, "learning_rate": 6.040737150895644e-05, "loss": 0.2663, "step": 853 }, { "epoch": 0.4645722834217326, "grad_norm": 2.500213623046875, "learning_rate": 6.031895456028288e-05, "loss": 0.567, "step": 854 }, { "epoch": 0.46511627906976744, "grad_norm": 2.986659049987793, "learning_rate": 6.023050389562971e-05, "loss": 0.5059, "step": 855 }, { "epoch": 0.46566027471780225, "grad_norm": 3.0863966941833496, "learning_rate": 6.0142019803999106e-05, "loss": 0.4476, "step": 856 }, { "epoch": 0.46620427036583706, "grad_norm": 1.9575637578964233, "learning_rate": 6.005350257450254e-05, "loss": 0.3004, "step": 857 }, { "epoch": 0.46674826601387187, "grad_norm": 2.8038523197174072, "learning_rate": 5.9964952496359696e-05, "loss": 0.4653, "step": 858 }, { "epoch": 0.4672922616619067, "grad_norm": 1.9844330549240112, "learning_rate": 5.987636985889764e-05, "loss": 0.3345, "step": 859 }, { "epoch": 0.4678362573099415, "grad_norm": 2.4399547576904297, "learning_rate": 5.978775495154979e-05, "loss": 0.427, "step": 860 }, { "epoch": 0.46838025295797636, "grad_norm": 1.8784351348876953, "learning_rate": 5.9699108063855037e-05, "loss": 0.3099, "step": 861 }, { "epoch": 0.4689242486060112, "grad_norm": 1.6360417604446411, "learning_rate": 5.961042948545671e-05, "loss": 0.1863, "step": 862 }, { "epoch": 0.469468244254046, "grad_norm": 1.908805012702942, "learning_rate": 5.952171950610173e-05, "loss": 0.1658, "step": 863 }, { "epoch": 0.4700122399020808, "grad_norm": 1.5813353061676025, "learning_rate": 5.943297841563959e-05, "loss": 0.1458, "step": 864 }, { "epoch": 0.4705562355501156, "grad_norm": 8.277915000915527, "learning_rate": 5.934420650402146e-05, "loss": 0.3945, "step": 865 }, { "epoch": 0.4711002311981504, "grad_norm": 2.121962070465088, "learning_rate": 5.925540406129919e-05, "loss": 0.1566, "step": 866 }, { "epoch": 0.47164422684618523, "grad_norm": 2.529106616973877, "learning_rate": 5.91665713776244e-05, "loss": 0.2451, "step": 867 }, { "epoch": 0.47218822249422004, "grad_norm": 1.5192179679870605, "learning_rate": 5.907770874324752e-05, "loss": 0.1358, "step": 868 }, { "epoch": 0.47273221814225486, "grad_norm": 2.0726311206817627, "learning_rate": 5.898881644851682e-05, "loss": 0.1378, "step": 869 }, { "epoch": 0.47327621379028967, "grad_norm": 3.655271053314209, "learning_rate": 5.889989478387753e-05, "loss": 0.1844, "step": 870 }, { "epoch": 0.4738202094383245, "grad_norm": 1.8433470726013184, "learning_rate": 5.8810944039870766e-05, "loss": 0.2128, "step": 871 }, { "epoch": 0.4743642050863593, "grad_norm": 2.5581130981445312, "learning_rate": 5.872196450713274e-05, "loss": 0.2625, "step": 872 }, { "epoch": 0.4749082007343941, "grad_norm": 3.3441600799560547, "learning_rate": 5.863295647639366e-05, "loss": 0.4327, "step": 873 }, { "epoch": 0.4754521963824289, "grad_norm": 2.316532611846924, "learning_rate": 5.8543920238476924e-05, "loss": 0.253, "step": 874 }, { "epoch": 0.4759961920304638, "grad_norm": 2.913689136505127, "learning_rate": 5.8454856084297994e-05, "loss": 0.4394, "step": 875 }, { "epoch": 0.4765401876784986, "grad_norm": 2.4672515392303467, "learning_rate": 5.836576430486362e-05, "loss": 0.1861, "step": 876 }, { "epoch": 0.4770841833265334, "grad_norm": 1.9307771921157837, "learning_rate": 5.8276645191270794e-05, "loss": 0.1395, "step": 877 }, { "epoch": 0.4776281789745682, "grad_norm": 2.287076950073242, "learning_rate": 5.818749903470585e-05, "loss": 0.2123, "step": 878 }, { "epoch": 0.47817217462260303, "grad_norm": 1.8622685670852661, "learning_rate": 5.8098326126443395e-05, "loss": 0.1798, "step": 879 }, { "epoch": 0.47871617027063784, "grad_norm": 1.8883358240127563, "learning_rate": 5.800912675784552e-05, "loss": 0.1704, "step": 880 }, { "epoch": 0.47926016591867265, "grad_norm": 2.104658365249634, "learning_rate": 5.791990122036075e-05, "loss": 0.1547, "step": 881 }, { "epoch": 0.47980416156670747, "grad_norm": 2.5915865898132324, "learning_rate": 5.783064980552314e-05, "loss": 0.2837, "step": 882 }, { "epoch": 0.4803481572147423, "grad_norm": 2.0172207355499268, "learning_rate": 5.7741372804951225e-05, "loss": 0.1488, "step": 883 }, { "epoch": 0.4808921528627771, "grad_norm": 2.042238712310791, "learning_rate": 5.7652070510347225e-05, "loss": 0.1872, "step": 884 }, { "epoch": 0.4814361485108119, "grad_norm": 2.895968198776245, "learning_rate": 5.756274321349593e-05, "loss": 0.334, "step": 885 }, { "epoch": 0.4819801441588467, "grad_norm": 2.44352126121521, "learning_rate": 5.7473391206263906e-05, "loss": 0.2638, "step": 886 }, { "epoch": 0.4825241398068815, "grad_norm": 2.4072906970977783, "learning_rate": 5.738401478059835e-05, "loss": 0.2328, "step": 887 }, { "epoch": 0.48306813545491634, "grad_norm": 1.8787102699279785, "learning_rate": 5.7294614228526335e-05, "loss": 0.1463, "step": 888 }, { "epoch": 0.4836121311029512, "grad_norm": 3.6718087196350098, "learning_rate": 5.7205189842153715e-05, "loss": 0.5203, "step": 889 }, { "epoch": 0.484156126750986, "grad_norm": 4.622086048126221, "learning_rate": 5.7115741913664264e-05, "loss": 0.2992, "step": 890 }, { "epoch": 0.4847001223990208, "grad_norm": 3.3284549713134766, "learning_rate": 5.702627073531861e-05, "loss": 0.2151, "step": 891 }, { "epoch": 0.48524411804705564, "grad_norm": 2.888274908065796, "learning_rate": 5.6936776599453424e-05, "loss": 0.3189, "step": 892 }, { "epoch": 0.48578811369509045, "grad_norm": 2.6060476303100586, "learning_rate": 5.68472597984803e-05, "loss": 0.2826, "step": 893 }, { "epoch": 0.48633210934312526, "grad_norm": 2.83160400390625, "learning_rate": 5.6757720624885e-05, "loss": 0.2474, "step": 894 }, { "epoch": 0.4868761049911601, "grad_norm": 2.750730514526367, "learning_rate": 5.666815937122626e-05, "loss": 0.2074, "step": 895 }, { "epoch": 0.4874201006391949, "grad_norm": 2.810438632965088, "learning_rate": 5.657857633013507e-05, "loss": 0.3336, "step": 896 }, { "epoch": 0.4879640962872297, "grad_norm": 3.8228251934051514, "learning_rate": 5.648897179431353e-05, "loss": 0.2902, "step": 897 }, { "epoch": 0.4885080919352645, "grad_norm": 2.569610595703125, "learning_rate": 5.639934605653404e-05, "loss": 0.0747, "step": 898 }, { "epoch": 0.4890520875832993, "grad_norm": 2.697833776473999, "learning_rate": 5.6309699409638196e-05, "loss": 0.1346, "step": 899 }, { "epoch": 0.48959608323133413, "grad_norm": 6.714253902435303, "learning_rate": 5.622003214653597e-05, "loss": 0.0981, "step": 900 }, { "epoch": 0.49014007887936895, "grad_norm": 1.4149291515350342, "learning_rate": 5.6130344560204675e-05, "loss": 0.5193, "step": 901 }, { "epoch": 0.49068407452740376, "grad_norm": 1.7246063947677612, "learning_rate": 5.6040636943688065e-05, "loss": 0.5813, "step": 902 }, { "epoch": 0.49122807017543857, "grad_norm": 2.0109312534332275, "learning_rate": 5.5950909590095245e-05, "loss": 0.4045, "step": 903 }, { "epoch": 0.49177206582347344, "grad_norm": 1.7636027336120605, "learning_rate": 5.586116279259992e-05, "loss": 0.2516, "step": 904 }, { "epoch": 0.49231606147150825, "grad_norm": 2.4564461708068848, "learning_rate": 5.577139684443924e-05, "loss": 0.3753, "step": 905 }, { "epoch": 0.49286005711954306, "grad_norm": 1.612990140914917, "learning_rate": 5.568161203891301e-05, "loss": 0.2204, "step": 906 }, { "epoch": 0.49340405276757787, "grad_norm": 2.4645423889160156, "learning_rate": 5.559180866938256e-05, "loss": 0.4372, "step": 907 }, { "epoch": 0.4939480484156127, "grad_norm": 1.9195036888122559, "learning_rate": 5.5501987029269944e-05, "loss": 0.2414, "step": 908 }, { "epoch": 0.4944920440636475, "grad_norm": 1.64366614818573, "learning_rate": 5.5412147412056856e-05, "loss": 0.2254, "step": 909 }, { "epoch": 0.4950360397116823, "grad_norm": 1.6586601734161377, "learning_rate": 5.5322290111283815e-05, "loss": 0.2232, "step": 910 }, { "epoch": 0.4955800353597171, "grad_norm": 1.9492932558059692, "learning_rate": 5.5232415420548996e-05, "loss": 0.2807, "step": 911 }, { "epoch": 0.49612403100775193, "grad_norm": 2.1404612064361572, "learning_rate": 5.5142523633507514e-05, "loss": 0.3125, "step": 912 }, { "epoch": 0.49666802665578674, "grad_norm": 1.4547415971755981, "learning_rate": 5.505261504387026e-05, "loss": 0.1649, "step": 913 }, { "epoch": 0.49721202230382155, "grad_norm": 1.655846118927002, "learning_rate": 5.496268994540309e-05, "loss": 0.1948, "step": 914 }, { "epoch": 0.49775601795185637, "grad_norm": 2.4168806076049805, "learning_rate": 5.487274863192573e-05, "loss": 0.3851, "step": 915 }, { "epoch": 0.4983000135998912, "grad_norm": 1.817676305770874, "learning_rate": 5.4782791397310937e-05, "loss": 0.1292, "step": 916 }, { "epoch": 0.498844009247926, "grad_norm": 1.5200881958007812, "learning_rate": 5.4692818535483484e-05, "loss": 0.1344, "step": 917 }, { "epoch": 0.49938800489596086, "grad_norm": 1.74146568775177, "learning_rate": 5.4602830340419195e-05, "loss": 0.1161, "step": 918 }, { "epoch": 0.49993200054399567, "grad_norm": 2.5906293392181396, "learning_rate": 5.451282710614401e-05, "loss": 0.3315, "step": 919 }, { "epoch": 0.5004759961920304, "grad_norm": 2.7414915561676025, "learning_rate": 5.442280912673294e-05, "loss": 0.3397, "step": 920 }, { "epoch": 0.5010199918400653, "grad_norm": 1.9398181438446045, "learning_rate": 5.433277669630927e-05, "loss": 0.1714, "step": 921 }, { "epoch": 0.5015639874881, "grad_norm": 2.6773691177368164, "learning_rate": 5.424273010904345e-05, "loss": 0.2252, "step": 922 }, { "epoch": 0.5021079831361349, "grad_norm": 1.510254144668579, "learning_rate": 5.41526696591522e-05, "loss": 0.1127, "step": 923 }, { "epoch": 0.5026519787841697, "grad_norm": 2.424696207046509, "learning_rate": 5.4062595640897504e-05, "loss": 0.2314, "step": 924 }, { "epoch": 0.5031959744322045, "grad_norm": 2.832092761993408, "learning_rate": 5.3972508348585724e-05, "loss": 0.2459, "step": 925 }, { "epoch": 0.5037399700802394, "grad_norm": 2.5638773441314697, "learning_rate": 5.3882408076566574e-05, "loss": 0.2525, "step": 926 }, { "epoch": 0.5042839657282742, "grad_norm": 2.2012436389923096, "learning_rate": 5.379229511923217e-05, "loss": 0.1862, "step": 927 }, { "epoch": 0.504827961376309, "grad_norm": 3.6295313835144043, "learning_rate": 5.3702169771016074e-05, "loss": 0.4819, "step": 928 }, { "epoch": 0.5053719570243438, "grad_norm": 2.235771656036377, "learning_rate": 5.361203232639237e-05, "loss": 0.1369, "step": 929 }, { "epoch": 0.5059159526723787, "grad_norm": 2.939058303833008, "learning_rate": 5.3521883079874604e-05, "loss": 0.2637, "step": 930 }, { "epoch": 0.5064599483204134, "grad_norm": 2.9005789756774902, "learning_rate": 5.3431722326014944e-05, "loss": 0.3021, "step": 931 }, { "epoch": 0.5070039439684483, "grad_norm": 2.5442330837249756, "learning_rate": 5.33415503594031e-05, "loss": 0.3062, "step": 932 }, { "epoch": 0.507547939616483, "grad_norm": 2.9684722423553467, "learning_rate": 5.325136747466549e-05, "loss": 0.3011, "step": 933 }, { "epoch": 0.5080919352645179, "grad_norm": 3.1791577339172363, "learning_rate": 5.3161173966464127e-05, "loss": 0.2983, "step": 934 }, { "epoch": 0.5086359309125527, "grad_norm": 2.684966802597046, "learning_rate": 5.307097012949581e-05, "loss": 0.1177, "step": 935 }, { "epoch": 0.5091799265605875, "grad_norm": 3.0274665355682373, "learning_rate": 5.2980756258490995e-05, "loss": 0.3401, "step": 936 }, { "epoch": 0.5097239222086223, "grad_norm": 3.5732240676879883, "learning_rate": 5.289053264821303e-05, "loss": 0.3058, "step": 937 }, { "epoch": 0.5102679178566571, "grad_norm": 1.0801187753677368, "learning_rate": 5.280029959345698e-05, "loss": 0.0584, "step": 938 }, { "epoch": 0.510811913504692, "grad_norm": 1.4819438457489014, "learning_rate": 5.2710057389048885e-05, "loss": 0.0995, "step": 939 }, { "epoch": 0.5113559091527268, "grad_norm": 2.4079933166503906, "learning_rate": 5.261980632984455e-05, "loss": 0.2472, "step": 940 }, { "epoch": 0.5118999048007616, "grad_norm": 1.945965051651001, "learning_rate": 5.2529546710728803e-05, "loss": 0.1798, "step": 941 }, { "epoch": 0.5124439004487964, "grad_norm": 2.4916775226593018, "learning_rate": 5.243927882661442e-05, "loss": 0.2623, "step": 942 }, { "epoch": 0.5129878960968313, "grad_norm": 2.854509115219116, "learning_rate": 5.2349002972441185e-05, "loss": 0.3742, "step": 943 }, { "epoch": 0.513531891744866, "grad_norm": 2.527543544769287, "learning_rate": 5.2258719443174896e-05, "loss": 0.201, "step": 944 }, { "epoch": 0.5140758873929009, "grad_norm": 3.3664357662200928, "learning_rate": 5.2168428533806455e-05, "loss": 0.4759, "step": 945 }, { "epoch": 0.5146198830409356, "grad_norm": 2.2656233310699463, "learning_rate": 5.207813053935085e-05, "loss": 0.1968, "step": 946 }, { "epoch": 0.5151638786889705, "grad_norm": 3.317201614379883, "learning_rate": 5.19878257548463e-05, "loss": 0.3596, "step": 947 }, { "epoch": 0.5157078743370053, "grad_norm": 3.050137758255005, "learning_rate": 5.189751447535307e-05, "loss": 0.3328, "step": 948 }, { "epoch": 0.5162518699850401, "grad_norm": 3.361276388168335, "learning_rate": 5.180719699595277e-05, "loss": 0.3386, "step": 949 }, { "epoch": 0.5167958656330749, "grad_norm": 2.364654541015625, "learning_rate": 5.171687361174721e-05, "loss": 0.1553, "step": 950 }, { "epoch": 0.5173398612811098, "grad_norm": 1.5370625257492065, "learning_rate": 5.162654461785753e-05, "loss": 0.3677, "step": 951 }, { "epoch": 0.5178838569291445, "grad_norm": 2.1362431049346924, "learning_rate": 5.1536210309423125e-05, "loss": 0.4374, "step": 952 }, { "epoch": 0.5184278525771794, "grad_norm": 2.113161325454712, "learning_rate": 5.1445870981600854e-05, "loss": 0.5015, "step": 953 }, { "epoch": 0.5189718482252142, "grad_norm": 2.225618839263916, "learning_rate": 5.135552692956389e-05, "loss": 0.3615, "step": 954 }, { "epoch": 0.519515843873249, "grad_norm": 2.412980556488037, "learning_rate": 5.126517844850093e-05, "loss": 0.2754, "step": 955 }, { "epoch": 0.5200598395212839, "grad_norm": 2.8438656330108643, "learning_rate": 5.1174825833615015e-05, "loss": 0.3546, "step": 956 }, { "epoch": 0.5206038351693186, "grad_norm": 2.305408239364624, "learning_rate": 5.108446938012284e-05, "loss": 0.2619, "step": 957 }, { "epoch": 0.5211478308173535, "grad_norm": 1.6424214839935303, "learning_rate": 5.0994109383253506e-05, "loss": 0.2311, "step": 958 }, { "epoch": 0.5216918264653883, "grad_norm": 1.4606355428695679, "learning_rate": 5.090374613824782e-05, "loss": 0.1753, "step": 959 }, { "epoch": 0.5222358221134231, "grad_norm": 2.1308419704437256, "learning_rate": 5.08133799403571e-05, "loss": 0.3339, "step": 960 }, { "epoch": 0.5227798177614579, "grad_norm": 1.7369599342346191, "learning_rate": 5.0723011084842356e-05, "loss": 0.2899, "step": 961 }, { "epoch": 0.5233238134094927, "grad_norm": 1.23988676071167, "learning_rate": 5.063263986697326e-05, "loss": 0.125, "step": 962 }, { "epoch": 0.5238678090575275, "grad_norm": 2.094292163848877, "learning_rate": 5.054226658202728e-05, "loss": 0.1856, "step": 963 }, { "epoch": 0.5244118047055624, "grad_norm": 2.827819585800171, "learning_rate": 5.045189152528851e-05, "loss": 0.3883, "step": 964 }, { "epoch": 0.5249558003535971, "grad_norm": 2.1713154315948486, "learning_rate": 5.036151499204692e-05, "loss": 0.2657, "step": 965 }, { "epoch": 0.525499796001632, "grad_norm": 2.5538671016693115, "learning_rate": 5.027113727759729e-05, "loss": 0.274, "step": 966 }, { "epoch": 0.5260437916496667, "grad_norm": 1.9003641605377197, "learning_rate": 5.018075867723826e-05, "loss": 0.201, "step": 967 }, { "epoch": 0.5265877872977016, "grad_norm": 1.3844491243362427, "learning_rate": 5.009037948627134e-05, "loss": 0.1449, "step": 968 }, { "epoch": 0.5271317829457365, "grad_norm": 2.1980507373809814, "learning_rate": 5e-05, "loss": 0.1674, "step": 969 }, { "epoch": 0.5276757785937712, "grad_norm": 2.6166744232177734, "learning_rate": 4.990962051372867e-05, "loss": 0.164, "step": 970 }, { "epoch": 0.5282197742418061, "grad_norm": 3.299673557281494, "learning_rate": 4.981924132276175e-05, "loss": 0.4365, "step": 971 }, { "epoch": 0.5287637698898409, "grad_norm": 2.374450445175171, "learning_rate": 4.9728862722402715e-05, "loss": 0.0998, "step": 972 }, { "epoch": 0.5293077655378757, "grad_norm": 3.2740490436553955, "learning_rate": 4.963848500795309e-05, "loss": 0.2512, "step": 973 }, { "epoch": 0.5298517611859105, "grad_norm": 2.057722568511963, "learning_rate": 4.954810847471151e-05, "loss": 0.1655, "step": 974 }, { "epoch": 0.5303957568339454, "grad_norm": 1.9950605630874634, "learning_rate": 4.945773341797274e-05, "loss": 0.1621, "step": 975 }, { "epoch": 0.5309397524819801, "grad_norm": 1.4656925201416016, "learning_rate": 4.936736013302673e-05, "loss": 0.1218, "step": 976 }, { "epoch": 0.531483748130015, "grad_norm": 2.6018762588500977, "learning_rate": 4.9276988915157656e-05, "loss": 0.2213, "step": 977 }, { "epoch": 0.5320277437780497, "grad_norm": 2.9781999588012695, "learning_rate": 4.918662005964292e-05, "loss": 0.3391, "step": 978 }, { "epoch": 0.5325717394260846, "grad_norm": 3.772651195526123, "learning_rate": 4.90962538617522e-05, "loss": 0.3276, "step": 979 }, { "epoch": 0.5331157350741194, "grad_norm": 3.6225745677948, "learning_rate": 4.900589061674649e-05, "loss": 0.4294, "step": 980 }, { "epoch": 0.5336597307221542, "grad_norm": 4.355352401733398, "learning_rate": 4.891553061987718e-05, "loss": 0.2794, "step": 981 }, { "epoch": 0.5342037263701891, "grad_norm": 2.302938222885132, "learning_rate": 4.8825174166384996e-05, "loss": 0.0842, "step": 982 }, { "epoch": 0.5347477220182238, "grad_norm": 1.863863468170166, "learning_rate": 4.87348215514991e-05, "loss": 0.1234, "step": 983 }, { "epoch": 0.5352917176662587, "grad_norm": 2.852311372756958, "learning_rate": 4.86444730704361e-05, "loss": 0.2545, "step": 984 }, { "epoch": 0.5358357133142935, "grad_norm": 2.501485586166382, "learning_rate": 4.855412901839915e-05, "loss": 0.2239, "step": 985 }, { "epoch": 0.5363797089623283, "grad_norm": 2.294297218322754, "learning_rate": 4.846378969057688e-05, "loss": 0.1914, "step": 986 }, { "epoch": 0.5369237046103631, "grad_norm": 3.3399925231933594, "learning_rate": 4.8373455382142496e-05, "loss": 0.3612, "step": 987 }, { "epoch": 0.537467700258398, "grad_norm": 1.683417558670044, "learning_rate": 4.8283126388252794e-05, "loss": 0.0721, "step": 988 }, { "epoch": 0.5380116959064327, "grad_norm": 2.5815932750701904, "learning_rate": 4.819280300404724e-05, "loss": 0.1823, "step": 989 }, { "epoch": 0.5385556915544676, "grad_norm": 2.898298978805542, "learning_rate": 4.8102485524646944e-05, "loss": 0.3294, "step": 990 }, { "epoch": 0.5390996872025023, "grad_norm": 2.211742877960205, "learning_rate": 4.801217424515373e-05, "loss": 0.2111, "step": 991 }, { "epoch": 0.5396436828505372, "grad_norm": 2.7515130043029785, "learning_rate": 4.7921869460649146e-05, "loss": 0.2536, "step": 992 }, { "epoch": 0.540187678498572, "grad_norm": 3.3650286197662354, "learning_rate": 4.7831571466193556e-05, "loss": 0.3063, "step": 993 }, { "epoch": 0.5407316741466068, "grad_norm": 4.672366619110107, "learning_rate": 4.7741280556825116e-05, "loss": 0.2246, "step": 994 }, { "epoch": 0.5412756697946416, "grad_norm": 1.8745067119598389, "learning_rate": 4.765099702755883e-05, "loss": 0.1311, "step": 995 }, { "epoch": 0.5418196654426765, "grad_norm": 2.3291053771972656, "learning_rate": 4.756072117338558e-05, "loss": 0.2074, "step": 996 }, { "epoch": 0.5423636610907113, "grad_norm": 2.3059706687927246, "learning_rate": 4.74704532892712e-05, "loss": 0.1773, "step": 997 }, { "epoch": 0.5429076567387461, "grad_norm": 3.322502613067627, "learning_rate": 4.7380193670155465e-05, "loss": 0.3661, "step": 998 }, { "epoch": 0.543451652386781, "grad_norm": 1.473237156867981, "learning_rate": 4.728994261095114e-05, "loss": 0.073, "step": 999 }, { "epoch": 0.5439956480348157, "grad_norm": 2.710575580596924, "learning_rate": 4.719970040654301e-05, "loss": 0.0899, "step": 1000 }, { "epoch": 0.5445396436828506, "grad_norm": 1.2192273139953613, "learning_rate": 4.710946735178698e-05, "loss": 0.388, "step": 1001 }, { "epoch": 0.5450836393308853, "grad_norm": 1.446656346321106, "learning_rate": 4.701924374150901e-05, "loss": 0.2804, "step": 1002 }, { "epoch": 0.5456276349789202, "grad_norm": 1.5807323455810547, "learning_rate": 4.6929029870504213e-05, "loss": 0.3531, "step": 1003 }, { "epoch": 0.546171630626955, "grad_norm": 1.8053839206695557, "learning_rate": 4.683882603353587e-05, "loss": 0.2228, "step": 1004 }, { "epoch": 0.5467156262749898, "grad_norm": 1.9048701524734497, "learning_rate": 4.674863252533452e-05, "loss": 0.1964, "step": 1005 }, { "epoch": 0.5472596219230246, "grad_norm": 2.4848711490631104, "learning_rate": 4.6658449640596904e-05, "loss": 0.2889, "step": 1006 }, { "epoch": 0.5478036175710594, "grad_norm": 2.0063815116882324, "learning_rate": 4.6568277673985075e-05, "loss": 0.3249, "step": 1007 }, { "epoch": 0.5483476132190942, "grad_norm": 1.8369240760803223, "learning_rate": 4.64781169201254e-05, "loss": 0.163, "step": 1008 }, { "epoch": 0.5488916088671291, "grad_norm": 2.0438547134399414, "learning_rate": 4.638796767360765e-05, "loss": 0.3336, "step": 1009 }, { "epoch": 0.5494356045151639, "grad_norm": 1.8484487533569336, "learning_rate": 4.629783022898394e-05, "loss": 0.1981, "step": 1010 }, { "epoch": 0.5499796001631987, "grad_norm": 2.507326364517212, "learning_rate": 4.620770488076785e-05, "loss": 0.3989, "step": 1011 }, { "epoch": 0.5505235958112336, "grad_norm": 2.254758834838867, "learning_rate": 4.611759192343343e-05, "loss": 0.1449, "step": 1012 }, { "epoch": 0.5510675914592683, "grad_norm": 1.5574525594711304, "learning_rate": 4.602749165141428e-05, "loss": 0.1496, "step": 1013 }, { "epoch": 0.5516115871073032, "grad_norm": 1.5784273147583008, "learning_rate": 4.593740435910251e-05, "loss": 0.1614, "step": 1014 }, { "epoch": 0.5521555827553379, "grad_norm": 2.9995667934417725, "learning_rate": 4.584733034084782e-05, "loss": 0.1076, "step": 1015 }, { "epoch": 0.5526995784033728, "grad_norm": 2.5353121757507324, "learning_rate": 4.575726989095656e-05, "loss": 0.2793, "step": 1016 }, { "epoch": 0.5532435740514076, "grad_norm": 1.6350631713867188, "learning_rate": 4.566722330369074e-05, "loss": 0.1516, "step": 1017 }, { "epoch": 0.5537875696994424, "grad_norm": 2.582368850708008, "learning_rate": 4.557719087326707e-05, "loss": 0.2539, "step": 1018 }, { "epoch": 0.5543315653474772, "grad_norm": 1.8553560972213745, "learning_rate": 4.548717289385602e-05, "loss": 0.1723, "step": 1019 }, { "epoch": 0.554875560995512, "grad_norm": 2.793555736541748, "learning_rate": 4.5397169659580804e-05, "loss": 0.1877, "step": 1020 }, { "epoch": 0.5554195566435468, "grad_norm": 3.2320806980133057, "learning_rate": 4.5307181464516514e-05, "loss": 0.3495, "step": 1021 }, { "epoch": 0.5559635522915817, "grad_norm": 3.0536913871765137, "learning_rate": 4.521720860268907e-05, "loss": 0.3285, "step": 1022 }, { "epoch": 0.5565075479396164, "grad_norm": 2.6070117950439453, "learning_rate": 4.512725136807429e-05, "loss": 0.2889, "step": 1023 }, { "epoch": 0.5570515435876513, "grad_norm": 1.2031081914901733, "learning_rate": 4.503731005459693e-05, "loss": 0.0569, "step": 1024 }, { "epoch": 0.5575955392356862, "grad_norm": 2.3862569332122803, "learning_rate": 4.494738495612974e-05, "loss": 0.2256, "step": 1025 }, { "epoch": 0.5581395348837209, "grad_norm": 2.783047676086426, "learning_rate": 4.48574763664925e-05, "loss": 0.2958, "step": 1026 }, { "epoch": 0.5586835305317558, "grad_norm": 1.4986709356307983, "learning_rate": 4.476758457945101e-05, "loss": 0.1005, "step": 1027 }, { "epoch": 0.5592275261797905, "grad_norm": 1.5824928283691406, "learning_rate": 4.467770988871621e-05, "loss": 0.0811, "step": 1028 }, { "epoch": 0.5597715218278254, "grad_norm": 2.203159809112549, "learning_rate": 4.458785258794314e-05, "loss": 0.2384, "step": 1029 }, { "epoch": 0.5603155174758602, "grad_norm": 2.3512794971466064, "learning_rate": 4.449801297073007e-05, "loss": 0.192, "step": 1030 }, { "epoch": 0.560859513123895, "grad_norm": 2.0095512866973877, "learning_rate": 4.440819133061745e-05, "loss": 0.1017, "step": 1031 }, { "epoch": 0.5614035087719298, "grad_norm": 1.913527011871338, "learning_rate": 4.431838796108701e-05, "loss": 0.1326, "step": 1032 }, { "epoch": 0.5619475044199647, "grad_norm": 1.7646199464797974, "learning_rate": 4.422860315556075e-05, "loss": 0.0737, "step": 1033 }, { "epoch": 0.5624915000679994, "grad_norm": 2.8547816276550293, "learning_rate": 4.4138837207400096e-05, "loss": 0.3313, "step": 1034 }, { "epoch": 0.5630354957160343, "grad_norm": 2.632138252258301, "learning_rate": 4.404909040990477e-05, "loss": 0.2699, "step": 1035 }, { "epoch": 0.563579491364069, "grad_norm": 2.9194788932800293, "learning_rate": 4.395936305631197e-05, "loss": 0.3, "step": 1036 }, { "epoch": 0.5641234870121039, "grad_norm": 2.4475746154785156, "learning_rate": 4.3869655439795323e-05, "loss": 0.1902, "step": 1037 }, { "epoch": 0.5646674826601387, "grad_norm": 3.289783477783203, "learning_rate": 4.377996785346404e-05, "loss": 0.3546, "step": 1038 }, { "epoch": 0.5652114783081735, "grad_norm": 2.394810676574707, "learning_rate": 4.369030059036182e-05, "loss": 0.2024, "step": 1039 }, { "epoch": 0.5657554739562084, "grad_norm": 1.5709584951400757, "learning_rate": 4.3600653943465984e-05, "loss": 0.0964, "step": 1040 }, { "epoch": 0.5662994696042432, "grad_norm": 5.529552459716797, "learning_rate": 4.351102820568647e-05, "loss": 0.3805, "step": 1041 }, { "epoch": 0.566843465252278, "grad_norm": 2.480299472808838, "learning_rate": 4.342142366986494e-05, "loss": 0.2666, "step": 1042 }, { "epoch": 0.5673874609003128, "grad_norm": 2.7174181938171387, "learning_rate": 4.3331840628773746e-05, "loss": 0.1452, "step": 1043 }, { "epoch": 0.5679314565483476, "grad_norm": 2.3503835201263428, "learning_rate": 4.3242279375115026e-05, "loss": 0.1952, "step": 1044 }, { "epoch": 0.5684754521963824, "grad_norm": 2.9773964881896973, "learning_rate": 4.3152740201519696e-05, "loss": 0.3436, "step": 1045 }, { "epoch": 0.5690194478444173, "grad_norm": 2.796121835708618, "learning_rate": 4.3063223400546594e-05, "loss": 0.267, "step": 1046 }, { "epoch": 0.569563443492452, "grad_norm": 2.0050129890441895, "learning_rate": 4.2973729264681395e-05, "loss": 0.1757, "step": 1047 }, { "epoch": 0.5701074391404869, "grad_norm": 4.219791889190674, "learning_rate": 4.288425808633575e-05, "loss": 0.2924, "step": 1048 }, { "epoch": 0.5706514347885216, "grad_norm": 0.8270893692970276, "learning_rate": 4.2794810157846283e-05, "loss": 0.0345, "step": 1049 }, { "epoch": 0.5711954304365565, "grad_norm": 1.7071611881256104, "learning_rate": 4.270538577147367e-05, "loss": 0.0607, "step": 1050 }, { "epoch": 0.5717394260845913, "grad_norm": 1.1855194568634033, "learning_rate": 4.2615985219401664e-05, "loss": 0.5207, "step": 1051 }, { "epoch": 0.5722834217326261, "grad_norm": 1.6140190362930298, "learning_rate": 4.252660879373612e-05, "loss": 0.4531, "step": 1052 }, { "epoch": 0.572827417380661, "grad_norm": 1.6048341989517212, "learning_rate": 4.243725678650406e-05, "loss": 0.3627, "step": 1053 }, { "epoch": 0.5733714130286958, "grad_norm": 1.8193219900131226, "learning_rate": 4.234792948965279e-05, "loss": 0.2679, "step": 1054 }, { "epoch": 0.5739154086767306, "grad_norm": 1.6485986709594727, "learning_rate": 4.225862719504878e-05, "loss": 0.2294, "step": 1055 }, { "epoch": 0.5744594043247654, "grad_norm": 1.165639042854309, "learning_rate": 4.216935019447689e-05, "loss": 0.147, "step": 1056 }, { "epoch": 0.5750033999728003, "grad_norm": 1.845552682876587, "learning_rate": 4.208009877963926e-05, "loss": 0.2779, "step": 1057 }, { "epoch": 0.575547395620835, "grad_norm": 2.9298880100250244, "learning_rate": 4.199087324215449e-05, "loss": 0.6321, "step": 1058 }, { "epoch": 0.5760913912688699, "grad_norm": 1.5594950914382935, "learning_rate": 4.1901673873556624e-05, "loss": 0.2057, "step": 1059 }, { "epoch": 0.5766353869169046, "grad_norm": 1.4242677688598633, "learning_rate": 4.1812500965294176e-05, "loss": 0.2291, "step": 1060 }, { "epoch": 0.5771793825649395, "grad_norm": 1.2172269821166992, "learning_rate": 4.17233548087292e-05, "loss": 0.104, "step": 1061 }, { "epoch": 0.5777233782129743, "grad_norm": 2.319025754928589, "learning_rate": 4.163423569513639e-05, "loss": 0.2432, "step": 1062 }, { "epoch": 0.5782673738610091, "grad_norm": 1.7708770036697388, "learning_rate": 4.1545143915702024e-05, "loss": 0.2336, "step": 1063 }, { "epoch": 0.5788113695090439, "grad_norm": 1.2756093740463257, "learning_rate": 4.14560797615231e-05, "loss": 0.1127, "step": 1064 }, { "epoch": 0.5793553651570787, "grad_norm": 1.5920932292938232, "learning_rate": 4.136704352360634e-05, "loss": 0.1642, "step": 1065 }, { "epoch": 0.5798993608051135, "grad_norm": 1.7653011083602905, "learning_rate": 4.127803549286727e-05, "loss": 0.157, "step": 1066 }, { "epoch": 0.5804433564531484, "grad_norm": 1.5852701663970947, "learning_rate": 4.1189055960129246e-05, "loss": 0.1911, "step": 1067 }, { "epoch": 0.5809873521011832, "grad_norm": 1.0604381561279297, "learning_rate": 4.11001052161225e-05, "loss": 0.0607, "step": 1068 }, { "epoch": 0.581531347749218, "grad_norm": 1.4664257764816284, "learning_rate": 4.101118355148319e-05, "loss": 0.0705, "step": 1069 }, { "epoch": 0.5820753433972529, "grad_norm": 1.4670159816741943, "learning_rate": 4.092229125675249e-05, "loss": 0.1537, "step": 1070 }, { "epoch": 0.5826193390452876, "grad_norm": 2.102532386779785, "learning_rate": 4.0833428622375616e-05, "loss": 0.208, "step": 1071 }, { "epoch": 0.5831633346933225, "grad_norm": 1.9754890203475952, "learning_rate": 4.074459593870083e-05, "loss": 0.1565, "step": 1072 }, { "epoch": 0.5837073303413572, "grad_norm": 1.123861312866211, "learning_rate": 4.0655793495978555e-05, "loss": 0.0621, "step": 1073 }, { "epoch": 0.5842513259893921, "grad_norm": 2.32432222366333, "learning_rate": 4.056702158436042e-05, "loss": 0.2448, "step": 1074 }, { "epoch": 0.5847953216374269, "grad_norm": 2.725759506225586, "learning_rate": 4.0478280493898285e-05, "loss": 0.2256, "step": 1075 }, { "epoch": 0.5853393172854617, "grad_norm": 2.4367079734802246, "learning_rate": 4.0389570514543305e-05, "loss": 0.15, "step": 1076 }, { "epoch": 0.5858833129334965, "grad_norm": 2.3989126682281494, "learning_rate": 4.030089193614498e-05, "loss": 0.1959, "step": 1077 }, { "epoch": 0.5864273085815314, "grad_norm": 1.093265414237976, "learning_rate": 4.0212245048450214e-05, "loss": 0.04, "step": 1078 }, { "epoch": 0.5869713042295661, "grad_norm": 1.9894245862960815, "learning_rate": 4.012363014110237e-05, "loss": 0.134, "step": 1079 }, { "epoch": 0.587515299877601, "grad_norm": 2.7768666744232178, "learning_rate": 4.003504750364032e-05, "loss": 0.2505, "step": 1080 }, { "epoch": 0.5880592955256359, "grad_norm": 2.4392809867858887, "learning_rate": 3.994649742549749e-05, "loss": 0.2843, "step": 1081 }, { "epoch": 0.5886032911736706, "grad_norm": 1.7027760744094849, "learning_rate": 3.98579801960009e-05, "loss": 0.1065, "step": 1082 }, { "epoch": 0.5891472868217055, "grad_norm": 1.9044114351272583, "learning_rate": 3.9769496104370304e-05, "loss": 0.1187, "step": 1083 }, { "epoch": 0.5896912824697402, "grad_norm": 1.3583406209945679, "learning_rate": 3.968104543971712e-05, "loss": 0.0662, "step": 1084 }, { "epoch": 0.5902352781177751, "grad_norm": 2.8347413539886475, "learning_rate": 3.9592628491043595e-05, "loss": 0.2616, "step": 1085 }, { "epoch": 0.5907792737658099, "grad_norm": 1.6906051635742188, "learning_rate": 3.950424554724175e-05, "loss": 0.1167, "step": 1086 }, { "epoch": 0.5913232694138447, "grad_norm": 2.4014132022857666, "learning_rate": 3.941589689709258e-05, "loss": 0.1427, "step": 1087 }, { "epoch": 0.5918672650618795, "grad_norm": 3.3171188831329346, "learning_rate": 3.9327582829264964e-05, "loss": 0.3742, "step": 1088 }, { "epoch": 0.5924112607099143, "grad_norm": 2.441514730453491, "learning_rate": 3.9239303632314815e-05, "loss": 0.1466, "step": 1089 }, { "epoch": 0.5929552563579491, "grad_norm": 2.3930084705352783, "learning_rate": 3.91510595946841e-05, "loss": 0.2117, "step": 1090 }, { "epoch": 0.593499252005984, "grad_norm": 2.9923386573791504, "learning_rate": 3.906285100469992e-05, "loss": 0.2603, "step": 1091 }, { "epoch": 0.5940432476540187, "grad_norm": 6.02304220199585, "learning_rate": 3.897467815057356e-05, "loss": 0.2325, "step": 1092 }, { "epoch": 0.5945872433020536, "grad_norm": 2.157905101776123, "learning_rate": 3.888654132039951e-05, "loss": 0.109, "step": 1093 }, { "epoch": 0.5951312389500883, "grad_norm": 2.235853433609009, "learning_rate": 3.879844080215457e-05, "loss": 0.1377, "step": 1094 }, { "epoch": 0.5956752345981232, "grad_norm": 2.332381248474121, "learning_rate": 3.8710376883696927e-05, "loss": 0.1429, "step": 1095 }, { "epoch": 0.5962192302461581, "grad_norm": 3.992812156677246, "learning_rate": 3.8622349852765136e-05, "loss": 0.4439, "step": 1096 }, { "epoch": 0.5967632258941928, "grad_norm": 1.9760284423828125, "learning_rate": 3.853435999697726e-05, "loss": 0.1368, "step": 1097 }, { "epoch": 0.5973072215422277, "grad_norm": 3.30705189704895, "learning_rate": 3.8446407603829857e-05, "loss": 0.2742, "step": 1098 }, { "epoch": 0.5978512171902625, "grad_norm": 2.7652344703674316, "learning_rate": 3.835849296069713e-05, "loss": 0.1895, "step": 1099 }, { "epoch": 0.5983952128382973, "grad_norm": 2.517723560333252, "learning_rate": 3.827061635482992e-05, "loss": 0.1258, "step": 1100 }, { "epoch": 0.5989392084863321, "grad_norm": 1.201888918876648, "learning_rate": 3.8182778073354764e-05, "loss": 0.3827, "step": 1101 }, { "epoch": 0.599483204134367, "grad_norm": 1.4153257608413696, "learning_rate": 3.809497840327299e-05, "loss": 0.4465, "step": 1102 }, { "epoch": 0.6000271997824017, "grad_norm": 1.707273244857788, "learning_rate": 3.8007217631459793e-05, "loss": 0.1838, "step": 1103 }, { "epoch": 0.6005711954304366, "grad_norm": 1.8509023189544678, "learning_rate": 3.791949604466324e-05, "loss": 0.504, "step": 1104 }, { "epoch": 0.6011151910784713, "grad_norm": 2.270904064178467, "learning_rate": 3.7831813929503377e-05, "loss": 0.2445, "step": 1105 }, { "epoch": 0.6016591867265062, "grad_norm": 1.7941827774047852, "learning_rate": 3.7744171572471274e-05, "loss": 0.153, "step": 1106 }, { "epoch": 0.602203182374541, "grad_norm": 2.1824147701263428, "learning_rate": 3.7656569259928116e-05, "loss": 0.2127, "step": 1107 }, { "epoch": 0.6027471780225758, "grad_norm": 2.206667900085449, "learning_rate": 3.756900727810423e-05, "loss": 0.3083, "step": 1108 }, { "epoch": 0.6032911736706106, "grad_norm": 2.142123222351074, "learning_rate": 3.748148591309818e-05, "loss": 0.3148, "step": 1109 }, { "epoch": 0.6038351693186454, "grad_norm": 1.855591893196106, "learning_rate": 3.739400545087579e-05, "loss": 0.2204, "step": 1110 }, { "epoch": 0.6043791649666803, "grad_norm": 1.4766360521316528, "learning_rate": 3.7306566177269284e-05, "loss": 0.1048, "step": 1111 }, { "epoch": 0.6049231606147151, "grad_norm": 2.491419553756714, "learning_rate": 3.721916837797627e-05, "loss": 0.3012, "step": 1112 }, { "epoch": 0.6054671562627499, "grad_norm": 2.3039698600769043, "learning_rate": 3.713181233855886e-05, "loss": 0.166, "step": 1113 }, { "epoch": 0.6060111519107847, "grad_norm": 1.1709086894989014, "learning_rate": 3.7044498344442704e-05, "loss": 0.0586, "step": 1114 }, { "epoch": 0.6065551475588196, "grad_norm": 3.0496315956115723, "learning_rate": 3.6957226680916115e-05, "loss": 0.3599, "step": 1115 }, { "epoch": 0.6070991432068543, "grad_norm": 1.9985028505325317, "learning_rate": 3.686999763312905e-05, "loss": 0.1717, "step": 1116 }, { "epoch": 0.6076431388548892, "grad_norm": 2.392834424972534, "learning_rate": 3.678281148609224e-05, "loss": 0.2763, "step": 1117 }, { "epoch": 0.6081871345029239, "grad_norm": 2.3857150077819824, "learning_rate": 3.669566852467623e-05, "loss": 0.2197, "step": 1118 }, { "epoch": 0.6087311301509588, "grad_norm": 1.7131675481796265, "learning_rate": 3.6608569033610514e-05, "loss": 0.1284, "step": 1119 }, { "epoch": 0.6092751257989936, "grad_norm": 2.096151113510132, "learning_rate": 3.652151329748249e-05, "loss": 0.2341, "step": 1120 }, { "epoch": 0.6098191214470284, "grad_norm": 2.4356324672698975, "learning_rate": 3.643450160073662e-05, "loss": 0.2251, "step": 1121 }, { "epoch": 0.6103631170950632, "grad_norm": 2.0904552936553955, "learning_rate": 3.634753422767344e-05, "loss": 0.0933, "step": 1122 }, { "epoch": 0.6109071127430981, "grad_norm": 1.3773777484893799, "learning_rate": 3.626061146244874e-05, "loss": 0.0952, "step": 1123 }, { "epoch": 0.6114511083911329, "grad_norm": 1.8549007177352905, "learning_rate": 3.617373358907248e-05, "loss": 0.1546, "step": 1124 }, { "epoch": 0.6119951040391677, "grad_norm": 2.2880680561065674, "learning_rate": 3.6086900891407975e-05, "loss": 0.2113, "step": 1125 }, { "epoch": 0.6125390996872025, "grad_norm": 3.152132034301758, "learning_rate": 3.6000113653170934e-05, "loss": 0.2675, "step": 1126 }, { "epoch": 0.6130830953352373, "grad_norm": 1.4699492454528809, "learning_rate": 3.591337215792852e-05, "loss": 0.1128, "step": 1127 }, { "epoch": 0.6136270909832722, "grad_norm": 2.9946532249450684, "learning_rate": 3.582667668909846e-05, "loss": 0.2384, "step": 1128 }, { "epoch": 0.6141710866313069, "grad_norm": 0.6904391646385193, "learning_rate": 3.574002752994806e-05, "loss": 0.0274, "step": 1129 }, { "epoch": 0.6147150822793418, "grad_norm": 2.8604655265808105, "learning_rate": 3.565342496359336e-05, "loss": 0.2807, "step": 1130 }, { "epoch": 0.6152590779273766, "grad_norm": 2.928874969482422, "learning_rate": 3.5566869272998104e-05, "loss": 0.2215, "step": 1131 }, { "epoch": 0.6158030735754114, "grad_norm": 1.4145811796188354, "learning_rate": 3.548036074097294e-05, "loss": 0.0959, "step": 1132 }, { "epoch": 0.6163470692234462, "grad_norm": 2.2398266792297363, "learning_rate": 3.539389965017438e-05, "loss": 0.1297, "step": 1133 }, { "epoch": 0.616891064871481, "grad_norm": 2.5225863456726074, "learning_rate": 3.5307486283103966e-05, "loss": 0.1758, "step": 1134 }, { "epoch": 0.6174350605195158, "grad_norm": 2.760404586791992, "learning_rate": 3.522112092210726e-05, "loss": 0.2338, "step": 1135 }, { "epoch": 0.6179790561675507, "grad_norm": 2.712618589401245, "learning_rate": 3.5134803849373024e-05, "loss": 0.2167, "step": 1136 }, { "epoch": 0.6185230518155854, "grad_norm": 3.6655478477478027, "learning_rate": 3.504853534693221e-05, "loss": 0.3147, "step": 1137 }, { "epoch": 0.6190670474636203, "grad_norm": 2.4035439491271973, "learning_rate": 3.496231569665709e-05, "loss": 0.1319, "step": 1138 }, { "epoch": 0.6196110431116552, "grad_norm": 2.4817898273468018, "learning_rate": 3.487614518026028e-05, "loss": 0.2037, "step": 1139 }, { "epoch": 0.6201550387596899, "grad_norm": 2.0642035007476807, "learning_rate": 3.4790024079293915e-05, "loss": 0.1134, "step": 1140 }, { "epoch": 0.6206990344077248, "grad_norm": 1.9763851165771484, "learning_rate": 3.470395267514863e-05, "loss": 0.135, "step": 1141 }, { "epoch": 0.6212430300557595, "grad_norm": 2.0414886474609375, "learning_rate": 3.4617931249052695e-05, "loss": 0.2055, "step": 1142 }, { "epoch": 0.6217870257037944, "grad_norm": 2.231257677078247, "learning_rate": 3.453196008207106e-05, "loss": 0.1434, "step": 1143 }, { "epoch": 0.6223310213518292, "grad_norm": 1.9991263151168823, "learning_rate": 3.444603945510451e-05, "loss": 0.1859, "step": 1144 }, { "epoch": 0.622875016999864, "grad_norm": 2.0250093936920166, "learning_rate": 3.436016964888865e-05, "loss": 0.1643, "step": 1145 }, { "epoch": 0.6234190126478988, "grad_norm": 1.9717185497283936, "learning_rate": 3.427435094399305e-05, "loss": 0.1147, "step": 1146 }, { "epoch": 0.6239630082959337, "grad_norm": 2.828608274459839, "learning_rate": 3.41885836208203e-05, "loss": 0.2226, "step": 1147 }, { "epoch": 0.6245070039439684, "grad_norm": 2.926039934158325, "learning_rate": 3.410286795960514e-05, "loss": 0.2078, "step": 1148 }, { "epoch": 0.6250509995920033, "grad_norm": 3.4419219493865967, "learning_rate": 3.401720424041349e-05, "loss": 0.2607, "step": 1149 }, { "epoch": 0.625594995240038, "grad_norm": 2.538912534713745, "learning_rate": 3.3931592743141534e-05, "loss": 0.0951, "step": 1150 }, { "epoch": 0.6261389908880729, "grad_norm": 1.395691156387329, "learning_rate": 3.384603374751485e-05, "loss": 0.5446, "step": 1151 }, { "epoch": 0.6266829865361077, "grad_norm": 1.4391968250274658, "learning_rate": 3.376052753308748e-05, "loss": 0.3477, "step": 1152 }, { "epoch": 0.6272269821841425, "grad_norm": 1.8896480798721313, "learning_rate": 3.3675074379241e-05, "loss": 0.2914, "step": 1153 }, { "epoch": 0.6277709778321774, "grad_norm": 1.2859272956848145, "learning_rate": 3.358967456518362e-05, "loss": 0.2266, "step": 1154 }, { "epoch": 0.6283149734802121, "grad_norm": 2.247731924057007, "learning_rate": 3.3504328369949236e-05, "loss": 0.164, "step": 1155 }, { "epoch": 0.628858969128247, "grad_norm": 1.2041360139846802, "learning_rate": 3.3419036072396616e-05, "loss": 0.1018, "step": 1156 }, { "epoch": 0.6294029647762818, "grad_norm": 1.7344919443130493, "learning_rate": 3.3333797951208365e-05, "loss": 0.2848, "step": 1157 }, { "epoch": 0.6299469604243166, "grad_norm": 2.021001100540161, "learning_rate": 3.324861428489011e-05, "loss": 0.2377, "step": 1158 }, { "epoch": 0.6304909560723514, "grad_norm": 1.4695277214050293, "learning_rate": 3.316348535176953e-05, "loss": 0.2484, "step": 1159 }, { "epoch": 0.6310349517203863, "grad_norm": 1.5062239170074463, "learning_rate": 3.307841142999548e-05, "loss": 0.2018, "step": 1160 }, { "epoch": 0.631578947368421, "grad_norm": 1.9715653657913208, "learning_rate": 3.299339279753707e-05, "loss": 0.1521, "step": 1161 }, { "epoch": 0.6321229430164559, "grad_norm": 1.9886500835418701, "learning_rate": 3.290842973218276e-05, "loss": 0.1611, "step": 1162 }, { "epoch": 0.6326669386644906, "grad_norm": 1.7100028991699219, "learning_rate": 3.282352251153943e-05, "loss": 0.1346, "step": 1163 }, { "epoch": 0.6332109343125255, "grad_norm": 1.5389093160629272, "learning_rate": 3.2738671413031554e-05, "loss": 0.1311, "step": 1164 }, { "epoch": 0.6337549299605603, "grad_norm": 1.6000901460647583, "learning_rate": 3.2653876713900154e-05, "loss": 0.1522, "step": 1165 }, { "epoch": 0.6342989256085951, "grad_norm": 1.5561505556106567, "learning_rate": 3.2569138691202036e-05, "loss": 0.1236, "step": 1166 }, { "epoch": 0.63484292125663, "grad_norm": 1.795312762260437, "learning_rate": 3.2484457621808783e-05, "loss": 0.1336, "step": 1167 }, { "epoch": 0.6353869169046648, "grad_norm": 1.2574602365493774, "learning_rate": 3.239983378240593e-05, "loss": 0.0385, "step": 1168 }, { "epoch": 0.6359309125526996, "grad_norm": 2.4541492462158203, "learning_rate": 3.231526744949197e-05, "loss": 0.2381, "step": 1169 }, { "epoch": 0.6364749082007344, "grad_norm": 2.3265039920806885, "learning_rate": 3.223075889937753e-05, "loss": 0.1718, "step": 1170 }, { "epoch": 0.6370189038487692, "grad_norm": 1.699328899383545, "learning_rate": 3.214630840818444e-05, "loss": 0.1609, "step": 1171 }, { "epoch": 0.637562899496804, "grad_norm": 2.107919931411743, "learning_rate": 3.206191625184483e-05, "loss": 0.1243, "step": 1172 }, { "epoch": 0.6381068951448389, "grad_norm": 1.7328230142593384, "learning_rate": 3.197758270610022e-05, "loss": 0.0987, "step": 1173 }, { "epoch": 0.6386508907928736, "grad_norm": 1.3906108140945435, "learning_rate": 3.189330804650061e-05, "loss": 0.065, "step": 1174 }, { "epoch": 0.6391948864409085, "grad_norm": 1.432022213935852, "learning_rate": 3.1809092548403626e-05, "loss": 0.1057, "step": 1175 }, { "epoch": 0.6397388820889433, "grad_norm": 2.577094078063965, "learning_rate": 3.172493648697356e-05, "loss": 0.1574, "step": 1176 }, { "epoch": 0.6402828777369781, "grad_norm": 1.5875519514083862, "learning_rate": 3.1640840137180563e-05, "loss": 0.0779, "step": 1177 }, { "epoch": 0.6408268733850129, "grad_norm": 1.8251398801803589, "learning_rate": 3.1556803773799614e-05, "loss": 0.1298, "step": 1178 }, { "epoch": 0.6413708690330477, "grad_norm": 1.9195683002471924, "learning_rate": 3.147282767140972e-05, "loss": 0.2372, "step": 1179 }, { "epoch": 0.6419148646810825, "grad_norm": 2.0955145359039307, "learning_rate": 3.1388912104392995e-05, "loss": 0.1741, "step": 1180 }, { "epoch": 0.6424588603291174, "grad_norm": 2.1014726161956787, "learning_rate": 3.1305057346933774e-05, "loss": 0.1346, "step": 1181 }, { "epoch": 0.6430028559771522, "grad_norm": 1.8869060277938843, "learning_rate": 3.122126367301769e-05, "loss": 0.0821, "step": 1182 }, { "epoch": 0.643546851625187, "grad_norm": 1.9952136278152466, "learning_rate": 3.113753135643077e-05, "loss": 0.1446, "step": 1183 }, { "epoch": 0.6440908472732219, "grad_norm": 1.9317939281463623, "learning_rate": 3.1053860670758596e-05, "loss": 0.1098, "step": 1184 }, { "epoch": 0.6446348429212566, "grad_norm": 1.7922402620315552, "learning_rate": 3.097025188938537e-05, "loss": 0.1383, "step": 1185 }, { "epoch": 0.6451788385692915, "grad_norm": 1.5979357957839966, "learning_rate": 3.0886705285493035e-05, "loss": 0.0707, "step": 1186 }, { "epoch": 0.6457228342173262, "grad_norm": 2.765090227127075, "learning_rate": 3.080322113206036e-05, "loss": 0.2919, "step": 1187 }, { "epoch": 0.6462668298653611, "grad_norm": 1.8146288394927979, "learning_rate": 3.0719799701862065e-05, "loss": 0.0879, "step": 1188 }, { "epoch": 0.6468108255133959, "grad_norm": 2.2939014434814453, "learning_rate": 3.0636441267467955e-05, "loss": 0.2647, "step": 1189 }, { "epoch": 0.6473548211614307, "grad_norm": 2.4644508361816406, "learning_rate": 3.055314610124197e-05, "loss": 0.1772, "step": 1190 }, { "epoch": 0.6478988168094655, "grad_norm": 2.1641032695770264, "learning_rate": 3.046991447534135e-05, "loss": 0.0833, "step": 1191 }, { "epoch": 0.6484428124575004, "grad_norm": 2.7081058025360107, "learning_rate": 3.0386746661715705e-05, "loss": 0.2784, "step": 1192 }, { "epoch": 0.6489868081055351, "grad_norm": 2.3017590045928955, "learning_rate": 3.030364293210618e-05, "loss": 0.1738, "step": 1193 }, { "epoch": 0.64953080375357, "grad_norm": 2.5836246013641357, "learning_rate": 3.02206035580445e-05, "loss": 0.2791, "step": 1194 }, { "epoch": 0.6500747994016048, "grad_norm": 3.1928181648254395, "learning_rate": 3.0137628810852142e-05, "loss": 0.2362, "step": 1195 }, { "epoch": 0.6506187950496396, "grad_norm": 1.9142948389053345, "learning_rate": 3.0054718961639376e-05, "loss": 0.1389, "step": 1196 }, { "epoch": 0.6511627906976745, "grad_norm": 3.3552207946777344, "learning_rate": 2.997187428130449e-05, "loss": 0.2333, "step": 1197 }, { "epoch": 0.6517067863457092, "grad_norm": 2.474776029586792, "learning_rate": 2.98890950405328e-05, "loss": 0.1651, "step": 1198 }, { "epoch": 0.6522507819937441, "grad_norm": 1.829946756362915, "learning_rate": 2.9806381509795806e-05, "loss": 0.1234, "step": 1199 }, { "epoch": 0.6527947776417788, "grad_norm": 2.2994918823242188, "learning_rate": 2.9723733959350307e-05, "loss": 0.1272, "step": 1200 }, { "epoch": 0.6533387732898137, "grad_norm": 0.6912118792533875, "learning_rate": 2.964115265923755e-05, "loss": 0.1626, "step": 1201 }, { "epoch": 0.6538827689378485, "grad_norm": 0.9854170680046082, "learning_rate": 2.9558637879282287e-05, "loss": 0.2348, "step": 1202 }, { "epoch": 0.6544267645858833, "grad_norm": 1.3098419904708862, "learning_rate": 2.9476189889091922e-05, "loss": 0.274, "step": 1203 }, { "epoch": 0.6549707602339181, "grad_norm": 1.5287585258483887, "learning_rate": 2.939380895805564e-05, "loss": 0.1945, "step": 1204 }, { "epoch": 0.655514755881953, "grad_norm": 1.2934554815292358, "learning_rate": 2.9311495355343534e-05, "loss": 0.1119, "step": 1205 }, { "epoch": 0.6560587515299877, "grad_norm": 2.2045202255249023, "learning_rate": 2.9229249349905684e-05, "loss": 0.299, "step": 1206 }, { "epoch": 0.6566027471780226, "grad_norm": 1.7026793956756592, "learning_rate": 2.914707121047131e-05, "loss": 0.15, "step": 1207 }, { "epoch": 0.6571467428260573, "grad_norm": 1.2331981658935547, "learning_rate": 2.906496120554789e-05, "loss": 0.1447, "step": 1208 }, { "epoch": 0.6576907384740922, "grad_norm": 1.0538922548294067, "learning_rate": 2.8982919603420277e-05, "loss": 0.1053, "step": 1209 }, { "epoch": 0.6582347341221271, "grad_norm": 1.6257182359695435, "learning_rate": 2.8900946672149865e-05, "loss": 0.2732, "step": 1210 }, { "epoch": 0.6587787297701618, "grad_norm": 2.4833858013153076, "learning_rate": 2.8819042679573617e-05, "loss": 0.3522, "step": 1211 }, { "epoch": 0.6593227254181967, "grad_norm": 1.3534009456634521, "learning_rate": 2.873720789330322e-05, "loss": 0.0994, "step": 1212 }, { "epoch": 0.6598667210662315, "grad_norm": 2.0353548526763916, "learning_rate": 2.8655442580724374e-05, "loss": 0.2485, "step": 1213 }, { "epoch": 0.6604107167142663, "grad_norm": 2.049567222595215, "learning_rate": 2.8573747008995648e-05, "loss": 0.2817, "step": 1214 }, { "epoch": 0.6609547123623011, "grad_norm": 1.5964915752410889, "learning_rate": 2.8492121445047782e-05, "loss": 0.1684, "step": 1215 }, { "epoch": 0.661498708010336, "grad_norm": 1.5680066347122192, "learning_rate": 2.8410566155582784e-05, "loss": 0.1119, "step": 1216 }, { "epoch": 0.6620427036583707, "grad_norm": 1.679805040359497, "learning_rate": 2.8329081407073048e-05, "loss": 0.1081, "step": 1217 }, { "epoch": 0.6625866993064056, "grad_norm": 2.72404408454895, "learning_rate": 2.82476674657605e-05, "loss": 0.2615, "step": 1218 }, { "epoch": 0.6631306949544403, "grad_norm": 2.4218029975891113, "learning_rate": 2.816632459765568e-05, "loss": 0.2354, "step": 1219 }, { "epoch": 0.6636746906024752, "grad_norm": 2.1203408241271973, "learning_rate": 2.808505306853686e-05, "loss": 0.1716, "step": 1220 }, { "epoch": 0.66421868625051, "grad_norm": 1.7052507400512695, "learning_rate": 2.800385314394936e-05, "loss": 0.1007, "step": 1221 }, { "epoch": 0.6647626818985448, "grad_norm": 2.0369038581848145, "learning_rate": 2.7922725089204426e-05, "loss": 0.1882, "step": 1222 }, { "epoch": 0.6653066775465796, "grad_norm": 2.008768320083618, "learning_rate": 2.7841669169378475e-05, "loss": 0.1548, "step": 1223 }, { "epoch": 0.6658506731946144, "grad_norm": 2.8693575859069824, "learning_rate": 2.776068564931229e-05, "loss": 0.3479, "step": 1224 }, { "epoch": 0.6663946688426493, "grad_norm": 2.063455581665039, "learning_rate": 2.767977479361008e-05, "loss": 0.171, "step": 1225 }, { "epoch": 0.6669386644906841, "grad_norm": 1.721041202545166, "learning_rate": 2.7598936866638625e-05, "loss": 0.1509, "step": 1226 }, { "epoch": 0.6674826601387189, "grad_norm": 1.877123475074768, "learning_rate": 2.751817213252641e-05, "loss": 0.1512, "step": 1227 }, { "epoch": 0.6680266557867537, "grad_norm": 2.5307633876800537, "learning_rate": 2.743748085516275e-05, "loss": 0.2472, "step": 1228 }, { "epoch": 0.6685706514347886, "grad_norm": 1.0092939138412476, "learning_rate": 2.7356863298196998e-05, "loss": 0.0386, "step": 1229 }, { "epoch": 0.6691146470828233, "grad_norm": 1.516432523727417, "learning_rate": 2.7276319725037615e-05, "loss": 0.092, "step": 1230 }, { "epoch": 0.6696586427308582, "grad_norm": 1.6784156560897827, "learning_rate": 2.719585039885134e-05, "loss": 0.0911, "step": 1231 }, { "epoch": 0.6702026383788929, "grad_norm": 2.6024105548858643, "learning_rate": 2.7115455582562287e-05, "loss": 0.2876, "step": 1232 }, { "epoch": 0.6707466340269278, "grad_norm": 2.5423591136932373, "learning_rate": 2.7035135538851096e-05, "loss": 0.235, "step": 1233 }, { "epoch": 0.6712906296749626, "grad_norm": 1.6613472700119019, "learning_rate": 2.6954890530154213e-05, "loss": 0.1429, "step": 1234 }, { "epoch": 0.6718346253229974, "grad_norm": 2.125657558441162, "learning_rate": 2.6874720818662802e-05, "loss": 0.1742, "step": 1235 }, { "epoch": 0.6723786209710322, "grad_norm": 2.8704817295074463, "learning_rate": 2.6794626666322016e-05, "loss": 0.3092, "step": 1236 }, { "epoch": 0.672922616619067, "grad_norm": 3.1473822593688965, "learning_rate": 2.6714608334830164e-05, "loss": 0.326, "step": 1237 }, { "epoch": 0.6734666122671019, "grad_norm": 3.377565860748291, "learning_rate": 2.6634666085637817e-05, "loss": 0.2256, "step": 1238 }, { "epoch": 0.6740106079151367, "grad_norm": 3.0231833457946777, "learning_rate": 2.6554800179946947e-05, "loss": 0.2588, "step": 1239 }, { "epoch": 0.6745546035631715, "grad_norm": 2.4200613498687744, "learning_rate": 2.6475010878710083e-05, "loss": 0.1999, "step": 1240 }, { "epoch": 0.6750985992112063, "grad_norm": 2.8959238529205322, "learning_rate": 2.639529844262939e-05, "loss": 0.2377, "step": 1241 }, { "epoch": 0.6756425948592412, "grad_norm": 3.126025438308716, "learning_rate": 2.6315663132156044e-05, "loss": 0.3407, "step": 1242 }, { "epoch": 0.6761865905072759, "grad_norm": 2.5204918384552, "learning_rate": 2.6236105207489077e-05, "loss": 0.1913, "step": 1243 }, { "epoch": 0.6767305861553108, "grad_norm": 1.9199343919754028, "learning_rate": 2.6156624928574707e-05, "loss": 0.0815, "step": 1244 }, { "epoch": 0.6772745818033455, "grad_norm": 2.427741050720215, "learning_rate": 2.6077222555105497e-05, "loss": 0.1825, "step": 1245 }, { "epoch": 0.6778185774513804, "grad_norm": 2.6976699829101562, "learning_rate": 2.5997898346519422e-05, "loss": 0.1737, "step": 1246 }, { "epoch": 0.6783625730994152, "grad_norm": 4.233438491821289, "learning_rate": 2.59186525619991e-05, "loss": 0.1662, "step": 1247 }, { "epoch": 0.67890656874745, "grad_norm": 2.3868956565856934, "learning_rate": 2.5839485460470862e-05, "loss": 0.1003, "step": 1248 }, { "epoch": 0.6794505643954848, "grad_norm": 1.064082384109497, "learning_rate": 2.5760397300603933e-05, "loss": 0.0258, "step": 1249 }, { "epoch": 0.6799945600435197, "grad_norm": 1.3325947523117065, "learning_rate": 2.5681388340809732e-05, "loss": 0.0366, "step": 1250 }, { "epoch": 0.6805385556915544, "grad_norm": 1.0324221849441528, "learning_rate": 2.5602458839240762e-05, "loss": 0.3713, "step": 1251 }, { "epoch": 0.6810825513395893, "grad_norm": 1.090842604637146, "learning_rate": 2.552360905378994e-05, "loss": 0.2269, "step": 1252 }, { "epoch": 0.6816265469876241, "grad_norm": 1.4820367097854614, "learning_rate": 2.5444839242089762e-05, "loss": 0.2591, "step": 1253 }, { "epoch": 0.6821705426356589, "grad_norm": 1.5052955150604248, "learning_rate": 2.5366149661511385e-05, "loss": 0.3335, "step": 1254 }, { "epoch": 0.6827145382836938, "grad_norm": 1.9581302404403687, "learning_rate": 2.5287540569163857e-05, "loss": 0.4929, "step": 1255 }, { "epoch": 0.6832585339317285, "grad_norm": 1.1789261102676392, "learning_rate": 2.5209012221893198e-05, "loss": 0.1336, "step": 1256 }, { "epoch": 0.6838025295797634, "grad_norm": 1.9022136926651, "learning_rate": 2.513056487628156e-05, "loss": 0.1489, "step": 1257 }, { "epoch": 0.6843465252277982, "grad_norm": 1.9993866682052612, "learning_rate": 2.5052198788646585e-05, "loss": 0.2478, "step": 1258 }, { "epoch": 0.684890520875833, "grad_norm": 1.2694631814956665, "learning_rate": 2.4973914215040266e-05, "loss": 0.1567, "step": 1259 }, { "epoch": 0.6854345165238678, "grad_norm": 1.3487721681594849, "learning_rate": 2.4895711411248295e-05, "loss": 0.1781, "step": 1260 }, { "epoch": 0.6859785121719026, "grad_norm": 1.2547619342803955, "learning_rate": 2.4817590632789228e-05, "loss": 0.13, "step": 1261 }, { "epoch": 0.6865225078199374, "grad_norm": 1.811140537261963, "learning_rate": 2.473955213491359e-05, "loss": 0.2328, "step": 1262 }, { "epoch": 0.6870665034679723, "grad_norm": 1.89725923538208, "learning_rate": 2.466159617260308e-05, "loss": 0.1869, "step": 1263 }, { "epoch": 0.687610499116007, "grad_norm": 2.0366244316101074, "learning_rate": 2.458372300056969e-05, "loss": 0.1748, "step": 1264 }, { "epoch": 0.6881544947640419, "grad_norm": 1.2690801620483398, "learning_rate": 2.4505932873254884e-05, "loss": 0.0728, "step": 1265 }, { "epoch": 0.6886984904120768, "grad_norm": 1.7792497873306274, "learning_rate": 2.4428226044828896e-05, "loss": 0.1277, "step": 1266 }, { "epoch": 0.6892424860601115, "grad_norm": 1.646180272102356, "learning_rate": 2.435060276918968e-05, "loss": 0.1461, "step": 1267 }, { "epoch": 0.6897864817081464, "grad_norm": 2.2565810680389404, "learning_rate": 2.4273063299962212e-05, "loss": 0.2109, "step": 1268 }, { "epoch": 0.6903304773561811, "grad_norm": 1.8959684371948242, "learning_rate": 2.419560789049768e-05, "loss": 0.1071, "step": 1269 }, { "epoch": 0.690874473004216, "grad_norm": 2.280215263366699, "learning_rate": 2.411823679387259e-05, "loss": 0.2641, "step": 1270 }, { "epoch": 0.6914184686522508, "grad_norm": 2.543991804122925, "learning_rate": 2.404095026288799e-05, "loss": 0.2159, "step": 1271 }, { "epoch": 0.6919624643002856, "grad_norm": 1.01596999168396, "learning_rate": 2.3963748550068583e-05, "loss": 0.042, "step": 1272 }, { "epoch": 0.6925064599483204, "grad_norm": 2.69083833694458, "learning_rate": 2.388663190766191e-05, "loss": 0.3137, "step": 1273 }, { "epoch": 0.6930504555963553, "grad_norm": 1.5049816370010376, "learning_rate": 2.3809600587637682e-05, "loss": 0.0988, "step": 1274 }, { "epoch": 0.69359445124439, "grad_norm": 1.371817946434021, "learning_rate": 2.3732654841686707e-05, "loss": 0.1036, "step": 1275 }, { "epoch": 0.6941384468924249, "grad_norm": 1.9504764080047607, "learning_rate": 2.3655794921220208e-05, "loss": 0.1445, "step": 1276 }, { "epoch": 0.6946824425404596, "grad_norm": 1.8772963285446167, "learning_rate": 2.3579021077369046e-05, "loss": 0.1475, "step": 1277 }, { "epoch": 0.6952264381884945, "grad_norm": 2.0987532138824463, "learning_rate": 2.3502333560982732e-05, "loss": 0.1753, "step": 1278 }, { "epoch": 0.6957704338365293, "grad_norm": 1.7872402667999268, "learning_rate": 2.3425732622628853e-05, "loss": 0.1343, "step": 1279 }, { "epoch": 0.6963144294845641, "grad_norm": 2.2338504791259766, "learning_rate": 2.3349218512592003e-05, "loss": 0.2023, "step": 1280 }, { "epoch": 0.696858425132599, "grad_norm": 1.4990575313568115, "learning_rate": 2.3272791480873087e-05, "loss": 0.0976, "step": 1281 }, { "epoch": 0.6974024207806337, "grad_norm": 2.036332368850708, "learning_rate": 2.3196451777188514e-05, "loss": 0.1241, "step": 1282 }, { "epoch": 0.6979464164286686, "grad_norm": 2.209092140197754, "learning_rate": 2.3120199650969377e-05, "loss": 0.2232, "step": 1283 }, { "epoch": 0.6984904120767034, "grad_norm": 2.278440237045288, "learning_rate": 2.3044035351360604e-05, "loss": 0.1477, "step": 1284 }, { "epoch": 0.6990344077247382, "grad_norm": 1.4189866781234741, "learning_rate": 2.296795912722014e-05, "loss": 0.0608, "step": 1285 }, { "epoch": 0.699578403372773, "grad_norm": 2.958449363708496, "learning_rate": 2.289197122711813e-05, "loss": 0.28, "step": 1286 }, { "epoch": 0.7001223990208079, "grad_norm": 2.414830207824707, "learning_rate": 2.281607189933624e-05, "loss": 0.189, "step": 1287 }, { "epoch": 0.7006663946688426, "grad_norm": 1.8456995487213135, "learning_rate": 2.2740261391866637e-05, "loss": 0.0857, "step": 1288 }, { "epoch": 0.7012103903168775, "grad_norm": 3.415088176727295, "learning_rate": 2.2664539952411272e-05, "loss": 0.4134, "step": 1289 }, { "epoch": 0.7017543859649122, "grad_norm": 2.400167942047119, "learning_rate": 2.2588907828381145e-05, "loss": 0.1415, "step": 1290 }, { "epoch": 0.7022983816129471, "grad_norm": 1.4296655654907227, "learning_rate": 2.2513365266895387e-05, "loss": 0.0768, "step": 1291 }, { "epoch": 0.7028423772609819, "grad_norm": 2.964731216430664, "learning_rate": 2.2437912514780517e-05, "loss": 0.2515, "step": 1292 }, { "epoch": 0.7033863729090167, "grad_norm": 3.374784469604492, "learning_rate": 2.2362549818569583e-05, "loss": 0.288, "step": 1293 }, { "epoch": 0.7039303685570515, "grad_norm": 3.167635202407837, "learning_rate": 2.2287277424501356e-05, "loss": 0.2421, "step": 1294 }, { "epoch": 0.7044743642050864, "grad_norm": 2.266279458999634, "learning_rate": 2.221209557851968e-05, "loss": 0.1319, "step": 1295 }, { "epoch": 0.7050183598531212, "grad_norm": 1.9841433763504028, "learning_rate": 2.2137004526272415e-05, "loss": 0.1264, "step": 1296 }, { "epoch": 0.705562355501156, "grad_norm": 2.4431369304656982, "learning_rate": 2.2062004513110795e-05, "loss": 0.1904, "step": 1297 }, { "epoch": 0.7061063511491908, "grad_norm": 2.0613796710968018, "learning_rate": 2.1987095784088636e-05, "loss": 0.0676, "step": 1298 }, { "epoch": 0.7066503467972256, "grad_norm": 1.8927065134048462, "learning_rate": 2.1912278583961455e-05, "loss": 0.0972, "step": 1299 }, { "epoch": 0.7071943424452605, "grad_norm": 1.684683918952942, "learning_rate": 2.183755315718574e-05, "loss": 0.0491, "step": 1300 }, { "epoch": 0.7077383380932952, "grad_norm": 1.019789695739746, "learning_rate": 2.176291974791809e-05, "loss": 0.3844, "step": 1301 }, { "epoch": 0.7082823337413301, "grad_norm": 1.3516138792037964, "learning_rate": 2.1688378600014427e-05, "loss": 0.504, "step": 1302 }, { "epoch": 0.7088263293893649, "grad_norm": 1.5319172143936157, "learning_rate": 2.1613929957029312e-05, "loss": 0.4543, "step": 1303 }, { "epoch": 0.7093703250373997, "grad_norm": 1.336397409439087, "learning_rate": 2.153957406221496e-05, "loss": 0.1927, "step": 1304 }, { "epoch": 0.7099143206854345, "grad_norm": 1.8143911361694336, "learning_rate": 2.1465311158520572e-05, "loss": 0.2524, "step": 1305 }, { "epoch": 0.7104583163334693, "grad_norm": 1.7706643342971802, "learning_rate": 2.1391141488591536e-05, "loss": 0.2077, "step": 1306 }, { "epoch": 0.7110023119815041, "grad_norm": 1.3932182788848877, "learning_rate": 2.1317065294768577e-05, "loss": 0.1106, "step": 1307 }, { "epoch": 0.711546307629539, "grad_norm": 1.30764901638031, "learning_rate": 2.1243082819087044e-05, "loss": 0.1311, "step": 1308 }, { "epoch": 0.7120903032775738, "grad_norm": 2.212853193283081, "learning_rate": 2.1169194303276024e-05, "loss": 0.272, "step": 1309 }, { "epoch": 0.7126342989256086, "grad_norm": 0.9941695332527161, "learning_rate": 2.1095399988757574e-05, "loss": 0.0811, "step": 1310 }, { "epoch": 0.7131782945736435, "grad_norm": 1.3657573461532593, "learning_rate": 2.1021700116646075e-05, "loss": 0.1274, "step": 1311 }, { "epoch": 0.7137222902216782, "grad_norm": 1.4923653602600098, "learning_rate": 2.094809492774723e-05, "loss": 0.1572, "step": 1312 }, { "epoch": 0.7142662858697131, "grad_norm": 1.3693082332611084, "learning_rate": 2.0874584662557385e-05, "loss": 0.079, "step": 1313 }, { "epoch": 0.7148102815177478, "grad_norm": 2.041520595550537, "learning_rate": 2.0801169561262756e-05, "loss": 0.2294, "step": 1314 }, { "epoch": 0.7153542771657827, "grad_norm": 1.4204297065734863, "learning_rate": 2.072784986373863e-05, "loss": 0.1013, "step": 1315 }, { "epoch": 0.7158982728138175, "grad_norm": 1.8913425207138062, "learning_rate": 2.0654625809548577e-05, "loss": 0.1931, "step": 1316 }, { "epoch": 0.7164422684618523, "grad_norm": 1.4526348114013672, "learning_rate": 2.058149763794363e-05, "loss": 0.1129, "step": 1317 }, { "epoch": 0.7169862641098871, "grad_norm": 1.8512177467346191, "learning_rate": 2.050846558786152e-05, "loss": 0.1551, "step": 1318 }, { "epoch": 0.717530259757922, "grad_norm": 1.301033616065979, "learning_rate": 2.0435529897926014e-05, "loss": 0.0959, "step": 1319 }, { "epoch": 0.7180742554059567, "grad_norm": 1.7049238681793213, "learning_rate": 2.0362690806445927e-05, "loss": 0.1505, "step": 1320 }, { "epoch": 0.7186182510539916, "grad_norm": 1.0795009136199951, "learning_rate": 2.0289948551414483e-05, "loss": 0.0583, "step": 1321 }, { "epoch": 0.7191622467020263, "grad_norm": 1.794386386871338, "learning_rate": 2.0217303370508517e-05, "loss": 0.1155, "step": 1322 }, { "epoch": 0.7197062423500612, "grad_norm": 2.0598485469818115, "learning_rate": 2.0144755501087687e-05, "loss": 0.1524, "step": 1323 }, { "epoch": 0.7202502379980961, "grad_norm": 2.4599297046661377, "learning_rate": 2.0072305180193694e-05, "loss": 0.1481, "step": 1324 }, { "epoch": 0.7207942336461308, "grad_norm": 1.1674220561981201, "learning_rate": 1.9999952644549508e-05, "loss": 0.0637, "step": 1325 }, { "epoch": 0.7213382292941657, "grad_norm": 1.0906187295913696, "learning_rate": 1.992769813055854e-05, "loss": 0.0407, "step": 1326 }, { "epoch": 0.7218822249422004, "grad_norm": 0.9691844582557678, "learning_rate": 1.9855541874304063e-05, "loss": 0.0632, "step": 1327 }, { "epoch": 0.7224262205902353, "grad_norm": 2.320307970046997, "learning_rate": 1.978348411154816e-05, "loss": 0.129, "step": 1328 }, { "epoch": 0.7229702162382701, "grad_norm": 1.7659658193588257, "learning_rate": 1.9711525077731208e-05, "loss": 0.1158, "step": 1329 }, { "epoch": 0.7235142118863049, "grad_norm": 2.5360403060913086, "learning_rate": 1.9639665007970924e-05, "loss": 0.2155, "step": 1330 }, { "epoch": 0.7240582075343397, "grad_norm": 2.264679193496704, "learning_rate": 1.956790413706167e-05, "loss": 0.2178, "step": 1331 }, { "epoch": 0.7246022031823746, "grad_norm": 4.004848957061768, "learning_rate": 1.9496242699473783e-05, "loss": 0.1781, "step": 1332 }, { "epoch": 0.7251461988304093, "grad_norm": 2.5687270164489746, "learning_rate": 1.9424680929352612e-05, "loss": 0.226, "step": 1333 }, { "epoch": 0.7256901944784442, "grad_norm": 2.540254831314087, "learning_rate": 1.935321906051787e-05, "loss": 0.1755, "step": 1334 }, { "epoch": 0.7262341901264789, "grad_norm": 3.0411436557769775, "learning_rate": 1.9281857326462895e-05, "loss": 0.2089, "step": 1335 }, { "epoch": 0.7267781857745138, "grad_norm": 2.0345213413238525, "learning_rate": 1.921059596035381e-05, "loss": 0.1417, "step": 1336 }, { "epoch": 0.7273221814225487, "grad_norm": 2.542239189147949, "learning_rate": 1.9139435195028842e-05, "loss": 0.1777, "step": 1337 }, { "epoch": 0.7278661770705834, "grad_norm": 1.565001368522644, "learning_rate": 1.9068375262997472e-05, "loss": 0.0771, "step": 1338 }, { "epoch": 0.7284101727186183, "grad_norm": 2.895142078399658, "learning_rate": 1.899741639643969e-05, "loss": 0.201, "step": 1339 }, { "epoch": 0.728954168366653, "grad_norm": 2.264394998550415, "learning_rate": 1.8926558827205378e-05, "loss": 0.1119, "step": 1340 }, { "epoch": 0.7294981640146879, "grad_norm": 2.604586362838745, "learning_rate": 1.885580278681335e-05, "loss": 0.1275, "step": 1341 }, { "epoch": 0.7300421596627227, "grad_norm": 2.8844971656799316, "learning_rate": 1.8785148506450678e-05, "loss": 0.1661, "step": 1342 }, { "epoch": 0.7305861553107575, "grad_norm": 1.7892411947250366, "learning_rate": 1.8714596216972007e-05, "loss": 0.0907, "step": 1343 }, { "epoch": 0.7311301509587923, "grad_norm": 1.731468915939331, "learning_rate": 1.86441461488987e-05, "loss": 0.1127, "step": 1344 }, { "epoch": 0.7316741466068272, "grad_norm": 2.6769673824310303, "learning_rate": 1.857379853241815e-05, "loss": 0.1339, "step": 1345 }, { "epoch": 0.7322181422548619, "grad_norm": 1.6985650062561035, "learning_rate": 1.850355359738297e-05, "loss": 0.0757, "step": 1346 }, { "epoch": 0.7327621379028968, "grad_norm": 2.24050235748291, "learning_rate": 1.8433411573310255e-05, "loss": 0.114, "step": 1347 }, { "epoch": 0.7333061335509315, "grad_norm": 4.434211730957031, "learning_rate": 1.8363372689380943e-05, "loss": 0.3148, "step": 1348 }, { "epoch": 0.7338501291989664, "grad_norm": 1.7095098495483398, "learning_rate": 1.829343717443888e-05, "loss": 0.0588, "step": 1349 }, { "epoch": 0.7343941248470012, "grad_norm": 2.8711113929748535, "learning_rate": 1.8223605256990182e-05, "loss": 0.1013, "step": 1350 }, { "epoch": 0.734938120495036, "grad_norm": 0.9732653498649597, "learning_rate": 1.815387716520251e-05, "loss": 0.2126, "step": 1351 }, { "epoch": 0.7354821161430709, "grad_norm": 1.2329224348068237, "learning_rate": 1.8084253126904255e-05, "loss": 0.3185, "step": 1352 }, { "epoch": 0.7360261117911057, "grad_norm": 1.4015549421310425, "learning_rate": 1.8014733369583862e-05, "loss": 0.0854, "step": 1353 }, { "epoch": 0.7365701074391405, "grad_norm": 1.5971941947937012, "learning_rate": 1.794531812038901e-05, "loss": 0.2867, "step": 1354 }, { "epoch": 0.7371141030871753, "grad_norm": 2.9556286334991455, "learning_rate": 1.7876007606125883e-05, "loss": 0.3279, "step": 1355 }, { "epoch": 0.7376580987352102, "grad_norm": 2.348330020904541, "learning_rate": 1.7806802053258564e-05, "loss": 0.2791, "step": 1356 }, { "epoch": 0.7382020943832449, "grad_norm": 1.716884732246399, "learning_rate": 1.7737701687908088e-05, "loss": 0.1755, "step": 1357 }, { "epoch": 0.7387460900312798, "grad_norm": 1.6703510284423828, "learning_rate": 1.7668706735851815e-05, "loss": 0.1496, "step": 1358 }, { "epoch": 0.7392900856793145, "grad_norm": 1.2519649267196655, "learning_rate": 1.7599817422522725e-05, "loss": 0.1265, "step": 1359 }, { "epoch": 0.7398340813273494, "grad_norm": 1.7326717376708984, "learning_rate": 1.7531033973008592e-05, "loss": 0.3378, "step": 1360 }, { "epoch": 0.7403780769753842, "grad_norm": 1.7184239625930786, "learning_rate": 1.746235661205134e-05, "loss": 0.1525, "step": 1361 }, { "epoch": 0.740922072623419, "grad_norm": 0.8514235019683838, "learning_rate": 1.7393785564046204e-05, "loss": 0.0643, "step": 1362 }, { "epoch": 0.7414660682714538, "grad_norm": 1.642906904220581, "learning_rate": 1.7325321053041053e-05, "loss": 0.1635, "step": 1363 }, { "epoch": 0.7420100639194886, "grad_norm": 1.3439081907272339, "learning_rate": 1.725696330273575e-05, "loss": 0.1285, "step": 1364 }, { "epoch": 0.7425540595675234, "grad_norm": 0.8959057927131653, "learning_rate": 1.7188712536481232e-05, "loss": 0.0653, "step": 1365 }, { "epoch": 0.7430980552155583, "grad_norm": 1.6125050783157349, "learning_rate": 1.7120568977278895e-05, "loss": 0.0753, "step": 1366 }, { "epoch": 0.7436420508635931, "grad_norm": 1.3039488792419434, "learning_rate": 1.7052532847779885e-05, "loss": 0.0928, "step": 1367 }, { "epoch": 0.7441860465116279, "grad_norm": 1.5770121812820435, "learning_rate": 1.6984604370284313e-05, "loss": 0.1621, "step": 1368 }, { "epoch": 0.7447300421596628, "grad_norm": 3.523750066757202, "learning_rate": 1.6916783766740564e-05, "loss": 0.0439, "step": 1369 }, { "epoch": 0.7452740378076975, "grad_norm": 1.6607825756072998, "learning_rate": 1.684907125874453e-05, "loss": 0.1264, "step": 1370 }, { "epoch": 0.7458180334557324, "grad_norm": 0.900172233581543, "learning_rate": 1.67814670675389e-05, "loss": 0.0391, "step": 1371 }, { "epoch": 0.7463620291037671, "grad_norm": 1.0290570259094238, "learning_rate": 1.6713971414012537e-05, "loss": 0.0653, "step": 1372 }, { "epoch": 0.746906024751802, "grad_norm": 1.615871787071228, "learning_rate": 1.6646584518699575e-05, "loss": 0.1054, "step": 1373 }, { "epoch": 0.7474500203998368, "grad_norm": 2.139085292816162, "learning_rate": 1.6579306601778822e-05, "loss": 0.1705, "step": 1374 }, { "epoch": 0.7479940160478716, "grad_norm": 1.6429537534713745, "learning_rate": 1.6512137883073032e-05, "loss": 0.108, "step": 1375 }, { "epoch": 0.7485380116959064, "grad_norm": 2.2891499996185303, "learning_rate": 1.6445078582048155e-05, "loss": 0.1575, "step": 1376 }, { "epoch": 0.7490820073439413, "grad_norm": 1.6678372621536255, "learning_rate": 1.6378128917812645e-05, "loss": 0.1271, "step": 1377 }, { "epoch": 0.749626002991976, "grad_norm": 1.825642704963684, "learning_rate": 1.63112891091167e-05, "loss": 0.0629, "step": 1378 }, { "epoch": 0.7501699986400109, "grad_norm": 1.5395997762680054, "learning_rate": 1.6244559374351576e-05, "loss": 0.1019, "step": 1379 }, { "epoch": 0.7507139942880457, "grad_norm": 2.3386495113372803, "learning_rate": 1.617793993154891e-05, "loss": 0.142, "step": 1380 }, { "epoch": 0.7512579899360805, "grad_norm": 2.825195074081421, "learning_rate": 1.6111430998379956e-05, "loss": 0.2574, "step": 1381 }, { "epoch": 0.7518019855841154, "grad_norm": 1.4050040245056152, "learning_rate": 1.6045032792154906e-05, "loss": 0.0778, "step": 1382 }, { "epoch": 0.7523459812321501, "grad_norm": 2.29719614982605, "learning_rate": 1.5978745529822137e-05, "loss": 0.2064, "step": 1383 }, { "epoch": 0.752889976880185, "grad_norm": 2.311671018600464, "learning_rate": 1.5912569427967506e-05, "loss": 0.1563, "step": 1384 }, { "epoch": 0.7534339725282198, "grad_norm": 1.6310888528823853, "learning_rate": 1.584650470281377e-05, "loss": 0.116, "step": 1385 }, { "epoch": 0.7539779681762546, "grad_norm": 2.037574052810669, "learning_rate": 1.5780551570219665e-05, "loss": 0.126, "step": 1386 }, { "epoch": 0.7545219638242894, "grad_norm": 2.8294522762298584, "learning_rate": 1.571471024567935e-05, "loss": 0.1998, "step": 1387 }, { "epoch": 0.7550659594723242, "grad_norm": 2.1995882987976074, "learning_rate": 1.564898094432168e-05, "loss": 0.1323, "step": 1388 }, { "epoch": 0.755609955120359, "grad_norm": 1.4644718170166016, "learning_rate": 1.5583363880909467e-05, "loss": 0.068, "step": 1389 }, { "epoch": 0.7561539507683939, "grad_norm": 3.13651704788208, "learning_rate": 1.5517859269838843e-05, "loss": 0.2022, "step": 1390 }, { "epoch": 0.7566979464164286, "grad_norm": 2.2333290576934814, "learning_rate": 1.5452467325138446e-05, "loss": 0.1318, "step": 1391 }, { "epoch": 0.7572419420644635, "grad_norm": 2.4578371047973633, "learning_rate": 1.53871882604688e-05, "loss": 0.0859, "step": 1392 }, { "epoch": 0.7577859377124982, "grad_norm": 2.9757943153381348, "learning_rate": 1.5322022289121686e-05, "loss": 0.2038, "step": 1393 }, { "epoch": 0.7583299333605331, "grad_norm": 2.254362106323242, "learning_rate": 1.5256969624019286e-05, "loss": 0.1706, "step": 1394 }, { "epoch": 0.758873929008568, "grad_norm": 2.3901798725128174, "learning_rate": 1.5192030477713565e-05, "loss": 0.1259, "step": 1395 }, { "epoch": 0.7594179246566027, "grad_norm": 3.2165346145629883, "learning_rate": 1.512720506238563e-05, "loss": 0.2148, "step": 1396 }, { "epoch": 0.7599619203046376, "grad_norm": 2.57200288772583, "learning_rate": 1.506249358984495e-05, "loss": 0.1505, "step": 1397 }, { "epoch": 0.7605059159526724, "grad_norm": 2.3762388229370117, "learning_rate": 1.4997896271528739e-05, "loss": 0.1414, "step": 1398 }, { "epoch": 0.7610499116007072, "grad_norm": 0.9289442300796509, "learning_rate": 1.493341331850116e-05, "loss": 0.0208, "step": 1399 }, { "epoch": 0.761593907248742, "grad_norm": 1.4251809120178223, "learning_rate": 1.486904494145272e-05, "loss": 0.0258, "step": 1400 }, { "epoch": 0.7621379028967769, "grad_norm": 0.8108697533607483, "learning_rate": 1.4804791350699631e-05, "loss": 0.2232, "step": 1401 }, { "epoch": 0.7626818985448116, "grad_norm": 1.0441067218780518, "learning_rate": 1.4740652756182981e-05, "loss": 0.2171, "step": 1402 }, { "epoch": 0.7632258941928465, "grad_norm": 1.4856404066085815, "learning_rate": 1.4676629367468114e-05, "loss": 0.3, "step": 1403 }, { "epoch": 0.7637698898408812, "grad_norm": 1.0246180295944214, "learning_rate": 1.4612721393744005e-05, "loss": 0.1589, "step": 1404 }, { "epoch": 0.7643138854889161, "grad_norm": 0.7296196818351746, "learning_rate": 1.454892904382249e-05, "loss": 0.0747, "step": 1405 }, { "epoch": 0.7648578811369509, "grad_norm": 1.9140719175338745, "learning_rate": 1.448525252613765e-05, "loss": 0.2685, "step": 1406 }, { "epoch": 0.7654018767849857, "grad_norm": 1.1215447187423706, "learning_rate": 1.4421692048745056e-05, "loss": 0.0778, "step": 1407 }, { "epoch": 0.7659458724330205, "grad_norm": 2.042494773864746, "learning_rate": 1.4358247819321119e-05, "loss": 0.209, "step": 1408 }, { "epoch": 0.7664898680810553, "grad_norm": 1.306517243385315, "learning_rate": 1.4294920045162513e-05, "loss": 0.1598, "step": 1409 }, { "epoch": 0.7670338637290902, "grad_norm": 1.0846457481384277, "learning_rate": 1.4231708933185329e-05, "loss": 0.0908, "step": 1410 }, { "epoch": 0.767577859377125, "grad_norm": 2.331904172897339, "learning_rate": 1.4168614689924465e-05, "loss": 0.2614, "step": 1411 }, { "epoch": 0.7681218550251598, "grad_norm": 1.2342945337295532, "learning_rate": 1.4105637521533021e-05, "loss": 0.1082, "step": 1412 }, { "epoch": 0.7686658506731946, "grad_norm": 1.5291941165924072, "learning_rate": 1.4042777633781545e-05, "loss": 0.1184, "step": 1413 }, { "epoch": 0.7692098463212295, "grad_norm": 1.1705737113952637, "learning_rate": 1.3980035232057397e-05, "loss": 0.0646, "step": 1414 }, { "epoch": 0.7697538419692642, "grad_norm": 1.5332928895950317, "learning_rate": 1.3917410521364038e-05, "loss": 0.13, "step": 1415 }, { "epoch": 0.7702978376172991, "grad_norm": 1.0253257751464844, "learning_rate": 1.3854903706320371e-05, "loss": 0.073, "step": 1416 }, { "epoch": 0.7708418332653338, "grad_norm": 1.9697638750076294, "learning_rate": 1.3792514991160178e-05, "loss": 0.1595, "step": 1417 }, { "epoch": 0.7713858289133687, "grad_norm": 1.6405386924743652, "learning_rate": 1.3730244579731272e-05, "loss": 0.132, "step": 1418 }, { "epoch": 0.7719298245614035, "grad_norm": 1.1315313577651978, "learning_rate": 1.3668092675494942e-05, "loss": 0.0685, "step": 1419 }, { "epoch": 0.7724738202094383, "grad_norm": 2.335501194000244, "learning_rate": 1.3606059481525296e-05, "loss": 0.1654, "step": 1420 }, { "epoch": 0.7730178158574731, "grad_norm": 1.522095799446106, "learning_rate": 1.3544145200508551e-05, "loss": 0.0369, "step": 1421 }, { "epoch": 0.773561811505508, "grad_norm": 1.5676854848861694, "learning_rate": 1.3482350034742413e-05, "loss": 0.0801, "step": 1422 }, { "epoch": 0.7741058071535428, "grad_norm": 0.9154651165008545, "learning_rate": 1.3420674186135362e-05, "loss": 0.0496, "step": 1423 }, { "epoch": 0.7746498028015776, "grad_norm": 2.7934045791625977, "learning_rate": 1.3359117856205994e-05, "loss": 0.2958, "step": 1424 }, { "epoch": 0.7751937984496124, "grad_norm": 1.4854987859725952, "learning_rate": 1.3297681246082505e-05, "loss": 0.079, "step": 1425 }, { "epoch": 0.7757377940976472, "grad_norm": 1.3444793224334717, "learning_rate": 1.3236364556501808e-05, "loss": 0.073, "step": 1426 }, { "epoch": 0.7762817897456821, "grad_norm": 1.1872408390045166, "learning_rate": 1.3175167987809017e-05, "loss": 0.042, "step": 1427 }, { "epoch": 0.7768257853937168, "grad_norm": 2.046337604522705, "learning_rate": 1.3114091739956796e-05, "loss": 0.1128, "step": 1428 }, { "epoch": 0.7773697810417517, "grad_norm": 1.5970085859298706, "learning_rate": 1.3053136012504658e-05, "loss": 0.0896, "step": 1429 }, { "epoch": 0.7779137766897865, "grad_norm": 1.1018781661987305, "learning_rate": 1.2992301004618334e-05, "loss": 0.0392, "step": 1430 }, { "epoch": 0.7784577723378213, "grad_norm": 1.0587247610092163, "learning_rate": 1.2931586915069105e-05, "loss": 0.0482, "step": 1431 }, { "epoch": 0.7790017679858561, "grad_norm": 1.747301459312439, "learning_rate": 1.2870993942233155e-05, "loss": 0.0896, "step": 1432 }, { "epoch": 0.7795457636338909, "grad_norm": 1.300168514251709, "learning_rate": 1.2810522284090965e-05, "loss": 0.0796, "step": 1433 }, { "epoch": 0.7800897592819257, "grad_norm": 1.6375699043273926, "learning_rate": 1.2750172138226618e-05, "loss": 0.1006, "step": 1434 }, { "epoch": 0.7806337549299606, "grad_norm": 2.297020196914673, "learning_rate": 1.268994370182719e-05, "loss": 0.1081, "step": 1435 }, { "epoch": 0.7811777505779953, "grad_norm": 2.2114057540893555, "learning_rate": 1.2629837171682052e-05, "loss": 0.0814, "step": 1436 }, { "epoch": 0.7817217462260302, "grad_norm": 1.7250666618347168, "learning_rate": 1.2569852744182243e-05, "loss": 0.1275, "step": 1437 }, { "epoch": 0.7822657418740651, "grad_norm": 2.8865597248077393, "learning_rate": 1.2509990615319944e-05, "loss": 0.189, "step": 1438 }, { "epoch": 0.7828097375220998, "grad_norm": 2.3319942951202393, "learning_rate": 1.245025098068764e-05, "loss": 0.1519, "step": 1439 }, { "epoch": 0.7833537331701347, "grad_norm": 2.9569590091705322, "learning_rate": 1.23906340354776e-05, "loss": 0.205, "step": 1440 }, { "epoch": 0.7838977288181694, "grad_norm": 1.9512279033660889, "learning_rate": 1.2331139974481248e-05, "loss": 0.133, "step": 1441 }, { "epoch": 0.7844417244662043, "grad_norm": 2.3612194061279297, "learning_rate": 1.2271768992088489e-05, "loss": 0.165, "step": 1442 }, { "epoch": 0.7849857201142391, "grad_norm": 2.2667036056518555, "learning_rate": 1.2212521282287092e-05, "loss": 0.1499, "step": 1443 }, { "epoch": 0.7855297157622739, "grad_norm": 2.3945088386535645, "learning_rate": 1.2153397038662012e-05, "loss": 0.1495, "step": 1444 }, { "epoch": 0.7860737114103087, "grad_norm": 2.76117205619812, "learning_rate": 1.2094396454394797e-05, "loss": 0.2922, "step": 1445 }, { "epoch": 0.7866177070583436, "grad_norm": 2.3595540523529053, "learning_rate": 1.2035519722263023e-05, "loss": 0.1476, "step": 1446 }, { "epoch": 0.7871617027063783, "grad_norm": 4.047318935394287, "learning_rate": 1.1976767034639502e-05, "loss": 0.1823, "step": 1447 }, { "epoch": 0.7877056983544132, "grad_norm": 2.4783074855804443, "learning_rate": 1.1918138583491784e-05, "loss": 0.1299, "step": 1448 }, { "epoch": 0.7882496940024479, "grad_norm": 2.551173686981201, "learning_rate": 1.1859634560381494e-05, "loss": 0.1214, "step": 1449 }, { "epoch": 0.7887936896504828, "grad_norm": 1.925065040588379, "learning_rate": 1.1801255156463703e-05, "loss": 0.073, "step": 1450 }, { "epoch": 0.7893376852985177, "grad_norm": 0.8589705228805542, "learning_rate": 1.1743000562486317e-05, "loss": 0.2774, "step": 1451 }, { "epoch": 0.7898816809465524, "grad_norm": 1.296121597290039, "learning_rate": 1.1684870968789402e-05, "loss": 0.1509, "step": 1452 }, { "epoch": 0.7904256765945873, "grad_norm": 1.252631425857544, "learning_rate": 1.1626866565304596e-05, "loss": 0.2223, "step": 1453 }, { "epoch": 0.790969672242622, "grad_norm": 0.7784968018531799, "learning_rate": 1.1568987541554577e-05, "loss": 0.0695, "step": 1454 }, { "epoch": 0.7915136678906569, "grad_norm": 1.6878989934921265, "learning_rate": 1.1511234086652262e-05, "loss": 0.1985, "step": 1455 }, { "epoch": 0.7920576635386917, "grad_norm": 2.069828748703003, "learning_rate": 1.145360638930032e-05, "loss": 0.2687, "step": 1456 }, { "epoch": 0.7926016591867265, "grad_norm": 2.9908523559570312, "learning_rate": 1.1396104637790534e-05, "loss": 0.3874, "step": 1457 }, { "epoch": 0.7931456548347613, "grad_norm": 2.0268144607543945, "learning_rate": 1.1338729020003169e-05, "loss": 0.2378, "step": 1458 }, { "epoch": 0.7936896504827962, "grad_norm": 1.7170679569244385, "learning_rate": 1.1281479723406374e-05, "loss": 0.2915, "step": 1459 }, { "epoch": 0.7942336461308309, "grad_norm": 1.7947033643722534, "learning_rate": 1.1224356935055524e-05, "loss": 0.2425, "step": 1460 }, { "epoch": 0.7947776417788658, "grad_norm": 1.5672327280044556, "learning_rate": 1.1167360841592644e-05, "loss": 0.1897, "step": 1461 }, { "epoch": 0.7953216374269005, "grad_norm": 1.682944655418396, "learning_rate": 1.1110491629245862e-05, "loss": 0.1473, "step": 1462 }, { "epoch": 0.7958656330749354, "grad_norm": 1.9672317504882812, "learning_rate": 1.1053749483828673e-05, "loss": 0.223, "step": 1463 }, { "epoch": 0.7964096287229702, "grad_norm": 1.5830997228622437, "learning_rate": 1.09971345907394e-05, "loss": 0.1368, "step": 1464 }, { "epoch": 0.796953624371005, "grad_norm": 1.4714478254318237, "learning_rate": 1.0940647134960607e-05, "loss": 0.1093, "step": 1465 }, { "epoch": 0.7974976200190399, "grad_norm": 1.4242087602615356, "learning_rate": 1.0884287301058465e-05, "loss": 0.1026, "step": 1466 }, { "epoch": 0.7980416156670747, "grad_norm": 1.1947529315948486, "learning_rate": 1.082805527318217e-05, "loss": 0.0618, "step": 1467 }, { "epoch": 0.7985856113151095, "grad_norm": 1.940982460975647, "learning_rate": 1.0771951235063288e-05, "loss": 0.1179, "step": 1468 }, { "epoch": 0.7991296069631443, "grad_norm": 2.400984525680542, "learning_rate": 1.0715975370015196e-05, "loss": 0.0672, "step": 1469 }, { "epoch": 0.7996736026111791, "grad_norm": 2.2756922245025635, "learning_rate": 1.066012786093255e-05, "loss": 0.24, "step": 1470 }, { "epoch": 0.8002175982592139, "grad_norm": 2.2270493507385254, "learning_rate": 1.0604408890290545e-05, "loss": 0.2143, "step": 1471 }, { "epoch": 0.8007615939072488, "grad_norm": 1.608907699584961, "learning_rate": 1.0548818640144403e-05, "loss": 0.1132, "step": 1472 }, { "epoch": 0.8013055895552835, "grad_norm": 1.3964457511901855, "learning_rate": 1.0493357292128781e-05, "loss": 0.1171, "step": 1473 }, { "epoch": 0.8018495852033184, "grad_norm": 1.0506035089492798, "learning_rate": 1.0438025027457165e-05, "loss": 0.0346, "step": 1474 }, { "epoch": 0.8023935808513531, "grad_norm": 1.4303292036056519, "learning_rate": 1.038282202692129e-05, "loss": 0.0917, "step": 1475 }, { "epoch": 0.802937576499388, "grad_norm": 2.0131571292877197, "learning_rate": 1.0327748470890497e-05, "loss": 0.1674, "step": 1476 }, { "epoch": 0.8034815721474228, "grad_norm": 1.6086413860321045, "learning_rate": 1.0272804539311177e-05, "loss": 0.1131, "step": 1477 }, { "epoch": 0.8040255677954576, "grad_norm": 1.3215925693511963, "learning_rate": 1.0217990411706273e-05, "loss": 0.0851, "step": 1478 }, { "epoch": 0.8045695634434924, "grad_norm": 1.3193762302398682, "learning_rate": 1.0163306267174516e-05, "loss": 0.0616, "step": 1479 }, { "epoch": 0.8051135590915273, "grad_norm": 2.5981080532073975, "learning_rate": 1.0108752284389956e-05, "loss": 0.2827, "step": 1480 }, { "epoch": 0.8056575547395621, "grad_norm": 2.486163377761841, "learning_rate": 1.005432864160139e-05, "loss": 0.2213, "step": 1481 }, { "epoch": 0.8062015503875969, "grad_norm": 1.8398014307022095, "learning_rate": 1.0000035516631678e-05, "loss": 0.1026, "step": 1482 }, { "epoch": 0.8067455460356318, "grad_norm": 1.326554536819458, "learning_rate": 9.945873086877322e-06, "loss": 0.0598, "step": 1483 }, { "epoch": 0.8072895416836665, "grad_norm": 2.775106191635132, "learning_rate": 9.891841529307715e-06, "loss": 0.2738, "step": 1484 }, { "epoch": 0.8078335373317014, "grad_norm": 2.1927573680877686, "learning_rate": 9.837941020464648e-06, "loss": 0.0971, "step": 1485 }, { "epoch": 0.8083775329797361, "grad_norm": 2.488726854324341, "learning_rate": 9.784171736461762e-06, "loss": 0.1751, "step": 1486 }, { "epoch": 0.808921528627771, "grad_norm": 2.367971181869507, "learning_rate": 9.730533852983914e-06, "loss": 0.1714, "step": 1487 }, { "epoch": 0.8094655242758058, "grad_norm": 1.7886788845062256, "learning_rate": 9.677027545286638e-06, "loss": 0.0942, "step": 1488 }, { "epoch": 0.8100095199238406, "grad_norm": 1.8454316854476929, "learning_rate": 9.623652988195536e-06, "loss": 0.1341, "step": 1489 }, { "epoch": 0.8105535155718754, "grad_norm": 1.5037360191345215, "learning_rate": 9.570410356105724e-06, "loss": 0.0536, "step": 1490 }, { "epoch": 0.8110975112199102, "grad_norm": 2.4228768348693848, "learning_rate": 9.517299822981335e-06, "loss": 0.2055, "step": 1491 }, { "epoch": 0.811641506867945, "grad_norm": 1.0392967462539673, "learning_rate": 9.46432156235481e-06, "loss": 0.0619, "step": 1492 }, { "epoch": 0.8121855025159799, "grad_norm": 2.179161310195923, "learning_rate": 9.411475747326425e-06, "loss": 0.1245, "step": 1493 }, { "epoch": 0.8127294981640147, "grad_norm": 1.5810810327529907, "learning_rate": 9.358762550563722e-06, "loss": 0.0755, "step": 1494 }, { "epoch": 0.8132734938120495, "grad_norm": 2.773293972015381, "learning_rate": 9.306182144300917e-06, "loss": 0.0977, "step": 1495 }, { "epoch": 0.8138174894600844, "grad_norm": 3.5057032108306885, "learning_rate": 9.253734700338368e-06, "loss": 0.3488, "step": 1496 }, { "epoch": 0.8143614851081191, "grad_norm": 1.7542012929916382, "learning_rate": 9.201420390041965e-06, "loss": 0.0488, "step": 1497 }, { "epoch": 0.814905480756154, "grad_norm": 1.6453478336334229, "learning_rate": 9.149239384342572e-06, "loss": 0.0576, "step": 1498 }, { "epoch": 0.8154494764041887, "grad_norm": 0.8648439645767212, "learning_rate": 9.097191853735604e-06, "loss": 0.0116, "step": 1499 }, { "epoch": 0.8159934720522236, "grad_norm": 0.8709718585014343, "learning_rate": 9.045277968280259e-06, "loss": 0.0113, "step": 1500 }, { "epoch": 0.8165374677002584, "grad_norm": 0.7547265887260437, "learning_rate": 8.993497897599084e-06, "loss": 0.223, "step": 1501 }, { "epoch": 0.8170814633482932, "grad_norm": 0.992436408996582, "learning_rate": 8.941851810877428e-06, "loss": 0.1712, "step": 1502 }, { "epoch": 0.817625458996328, "grad_norm": 1.2754369974136353, "learning_rate": 8.890339876862858e-06, "loss": 0.2472, "step": 1503 }, { "epoch": 0.8181694546443629, "grad_norm": 0.8094856142997742, "learning_rate": 8.838962263864614e-06, "loss": 0.1188, "step": 1504 }, { "epoch": 0.8187134502923976, "grad_norm": 1.2168220281600952, "learning_rate": 8.787719139753038e-06, "loss": 0.1599, "step": 1505 }, { "epoch": 0.8192574459404325, "grad_norm": 1.2538522481918335, "learning_rate": 8.736610671959027e-06, "loss": 0.1046, "step": 1506 }, { "epoch": 0.8198014415884672, "grad_norm": 1.0030944347381592, "learning_rate": 8.685637027473598e-06, "loss": 0.0683, "step": 1507 }, { "epoch": 0.8203454372365021, "grad_norm": 1.0258569717407227, "learning_rate": 8.634798372847148e-06, "loss": 0.0789, "step": 1508 }, { "epoch": 0.820889432884537, "grad_norm": 1.493883490562439, "learning_rate": 8.584094874189042e-06, "loss": 0.1712, "step": 1509 }, { "epoch": 0.8214334285325717, "grad_norm": 0.9724941849708557, "learning_rate": 8.533526697167049e-06, "loss": 0.0996, "step": 1510 }, { "epoch": 0.8219774241806066, "grad_norm": 1.256732702255249, "learning_rate": 8.483094007006787e-06, "loss": 0.1367, "step": 1511 }, { "epoch": 0.8225214198286414, "grad_norm": 1.1733742952346802, "learning_rate": 8.432796968491208e-06, "loss": 0.1159, "step": 1512 }, { "epoch": 0.8230654154766762, "grad_norm": 1.3118342161178589, "learning_rate": 8.382635745959988e-06, "loss": 0.1014, "step": 1513 }, { "epoch": 0.823609411124711, "grad_norm": 1.6488131284713745, "learning_rate": 8.332610503309047e-06, "loss": 0.1935, "step": 1514 }, { "epoch": 0.8241534067727458, "grad_norm": 1.5959988832473755, "learning_rate": 8.282721403990084e-06, "loss": 0.11, "step": 1515 }, { "epoch": 0.8246974024207806, "grad_norm": 1.6436313390731812, "learning_rate": 8.232968611009873e-06, "loss": 0.1112, "step": 1516 }, { "epoch": 0.8252413980688155, "grad_norm": 1.7652490139007568, "learning_rate": 8.183352286929847e-06, "loss": 0.1492, "step": 1517 }, { "epoch": 0.8257853937168502, "grad_norm": 1.386525273323059, "learning_rate": 8.133872593865572e-06, "loss": 0.0786, "step": 1518 }, { "epoch": 0.8263293893648851, "grad_norm": 1.4787259101867676, "learning_rate": 8.08452969348617e-06, "loss": 0.0764, "step": 1519 }, { "epoch": 0.8268733850129198, "grad_norm": 1.336531639099121, "learning_rate": 8.035323747013812e-06, "loss": 0.0872, "step": 1520 }, { "epoch": 0.8274173806609547, "grad_norm": 1.3021833896636963, "learning_rate": 7.986254915223185e-06, "loss": 0.0629, "step": 1521 }, { "epoch": 0.8279613763089896, "grad_norm": 1.1365309953689575, "learning_rate": 7.937323358440935e-06, "loss": 0.0637, "step": 1522 }, { "epoch": 0.8285053719570243, "grad_norm": 2.3605737686157227, "learning_rate": 7.888529236545267e-06, "loss": 0.1593, "step": 1523 }, { "epoch": 0.8290493676050592, "grad_norm": 1.8843390941619873, "learning_rate": 7.839872708965257e-06, "loss": 0.0917, "step": 1524 }, { "epoch": 0.829593363253094, "grad_norm": 1.4369595050811768, "learning_rate": 7.791353934680413e-06, "loss": 0.0753, "step": 1525 }, { "epoch": 0.8301373589011288, "grad_norm": 2.1893367767333984, "learning_rate": 7.742973072220177e-06, "loss": 0.1629, "step": 1526 }, { "epoch": 0.8306813545491636, "grad_norm": 1.5978200435638428, "learning_rate": 7.694730279663375e-06, "loss": 0.0499, "step": 1527 }, { "epoch": 0.8312253501971985, "grad_norm": 0.9856419563293457, "learning_rate": 7.646625714637712e-06, "loss": 0.0342, "step": 1528 }, { "epoch": 0.8317693458452332, "grad_norm": 1.908694863319397, "learning_rate": 7.59865953431923e-06, "loss": 0.1142, "step": 1529 }, { "epoch": 0.8323133414932681, "grad_norm": 2.624952793121338, "learning_rate": 7.550831895431798e-06, "loss": 0.3445, "step": 1530 }, { "epoch": 0.8328573371413028, "grad_norm": 1.5002707242965698, "learning_rate": 7.503142954246695e-06, "loss": 0.0832, "step": 1531 }, { "epoch": 0.8334013327893377, "grad_norm": 0.8018018007278442, "learning_rate": 7.455592866581929e-06, "loss": 0.0316, "step": 1532 }, { "epoch": 0.8339453284373725, "grad_norm": 1.603079080581665, "learning_rate": 7.4081817878018825e-06, "loss": 0.122, "step": 1533 }, { "epoch": 0.8344893240854073, "grad_norm": 4.352491855621338, "learning_rate": 7.360909872816724e-06, "loss": 0.1148, "step": 1534 }, { "epoch": 0.8350333197334421, "grad_norm": 1.8911892175674438, "learning_rate": 7.313777276081879e-06, "loss": 0.1193, "step": 1535 }, { "epoch": 0.835577315381477, "grad_norm": 1.1270118951797485, "learning_rate": 7.266784151597667e-06, "loss": 0.0457, "step": 1536 }, { "epoch": 0.8361213110295118, "grad_norm": 2.0327908992767334, "learning_rate": 7.219930652908602e-06, "loss": 0.0786, "step": 1537 }, { "epoch": 0.8366653066775466, "grad_norm": 2.7913618087768555, "learning_rate": 7.173216933103022e-06, "loss": 0.2529, "step": 1538 }, { "epoch": 0.8372093023255814, "grad_norm": 1.7808500528335571, "learning_rate": 7.126643144812556e-06, "loss": 0.1105, "step": 1539 }, { "epoch": 0.8377532979736162, "grad_norm": 2.997976303100586, "learning_rate": 7.080209440211627e-06, "loss": 0.2288, "step": 1540 }, { "epoch": 0.8382972936216511, "grad_norm": 1.3418859243392944, "learning_rate": 7.033915971016952e-06, "loss": 0.0556, "step": 1541 }, { "epoch": 0.8388412892696858, "grad_norm": 2.080580234527588, "learning_rate": 6.9877628884870315e-06, "loss": 0.1315, "step": 1542 }, { "epoch": 0.8393852849177207, "grad_norm": 2.3703489303588867, "learning_rate": 6.941750343421655e-06, "loss": 0.162, "step": 1543 }, { "epoch": 0.8399292805657554, "grad_norm": 1.6766456365585327, "learning_rate": 6.895878486161483e-06, "loss": 0.086, "step": 1544 }, { "epoch": 0.8404732762137903, "grad_norm": 3.4833881855010986, "learning_rate": 6.850147466587437e-06, "loss": 0.3433, "step": 1545 }, { "epoch": 0.8410172718618251, "grad_norm": 3.487314462661743, "learning_rate": 6.804557434120268e-06, "loss": 0.1398, "step": 1546 }, { "epoch": 0.8415612675098599, "grad_norm": 2.505648612976074, "learning_rate": 6.759108537720104e-06, "loss": 0.1135, "step": 1547 }, { "epoch": 0.8421052631578947, "grad_norm": 0.9417368769645691, "learning_rate": 6.713800925885905e-06, "loss": 0.0266, "step": 1548 }, { "epoch": 0.8426492588059296, "grad_norm": 2.6039652824401855, "learning_rate": 6.668634746655023e-06, "loss": 0.1447, "step": 1549 }, { "epoch": 0.8431932544539643, "grad_norm": 0.7121729254722595, "learning_rate": 6.623610147602655e-06, "loss": 0.0089, "step": 1550 }, { "epoch": 0.8437372501019992, "grad_norm": 0.923882246017456, "learning_rate": 6.578727275841412e-06, "loss": 0.2194, "step": 1551 }, { "epoch": 0.844281245750034, "grad_norm": 0.7369918823242188, "learning_rate": 6.533986278020876e-06, "loss": 0.1086, "step": 1552 }, { "epoch": 0.8448252413980688, "grad_norm": 1.1742249727249146, "learning_rate": 6.489387300327016e-06, "loss": 0.1254, "step": 1553 }, { "epoch": 0.8453692370461037, "grad_norm": 1.4827868938446045, "learning_rate": 6.444930488481771e-06, "loss": 0.1951, "step": 1554 }, { "epoch": 0.8459132326941384, "grad_norm": 0.9186825752258301, "learning_rate": 6.400615987742603e-06, "loss": 0.0496, "step": 1555 }, { "epoch": 0.8464572283421733, "grad_norm": 1.2773072719573975, "learning_rate": 6.356443942901968e-06, "loss": 0.1205, "step": 1556 }, { "epoch": 0.847001223990208, "grad_norm": 1.5452393293380737, "learning_rate": 6.312414498286878e-06, "loss": 0.1829, "step": 1557 }, { "epoch": 0.8475452196382429, "grad_norm": 1.5889948606491089, "learning_rate": 6.2685277977583885e-06, "loss": 0.1293, "step": 1558 }, { "epoch": 0.8480892152862777, "grad_norm": 0.6459242105484009, "learning_rate": 6.2247839847111575e-06, "loss": 0.0508, "step": 1559 }, { "epoch": 0.8486332109343125, "grad_norm": 1.140173316001892, "learning_rate": 6.181183202073026e-06, "loss": 0.0899, "step": 1560 }, { "epoch": 0.8491772065823473, "grad_norm": 1.5784512758255005, "learning_rate": 6.137725592304444e-06, "loss": 0.1523, "step": 1561 }, { "epoch": 0.8497212022303822, "grad_norm": 2.3454713821411133, "learning_rate": 6.094411297398073e-06, "loss": 0.1878, "step": 1562 }, { "epoch": 0.8502651978784169, "grad_norm": 1.3627198934555054, "learning_rate": 6.051240458878315e-06, "loss": 0.1295, "step": 1563 }, { "epoch": 0.8508091935264518, "grad_norm": 2.2635772228240967, "learning_rate": 6.008213217800851e-06, "loss": 0.1888, "step": 1564 }, { "epoch": 0.8513531891744867, "grad_norm": 1.515532374382019, "learning_rate": 5.9653297147521884e-06, "loss": 0.0859, "step": 1565 }, { "epoch": 0.8518971848225214, "grad_norm": 2.1630115509033203, "learning_rate": 5.922590089849145e-06, "loss": 0.2151, "step": 1566 }, { "epoch": 0.8524411804705563, "grad_norm": 1.3857431411743164, "learning_rate": 5.879994482738443e-06, "loss": 0.1154, "step": 1567 }, { "epoch": 0.852985176118591, "grad_norm": 1.0906001329421997, "learning_rate": 5.837543032596293e-06, "loss": 0.0681, "step": 1568 }, { "epoch": 0.8535291717666259, "grad_norm": 1.3181507587432861, "learning_rate": 5.795235878127842e-06, "loss": 0.0724, "step": 1569 }, { "epoch": 0.8540731674146607, "grad_norm": 2.039085865020752, "learning_rate": 5.753073157566763e-06, "loss": 0.1496, "step": 1570 }, { "epoch": 0.8546171630626955, "grad_norm": 1.201068639755249, "learning_rate": 5.711055008674837e-06, "loss": 0.0706, "step": 1571 }, { "epoch": 0.8551611587107303, "grad_norm": 2.1874308586120605, "learning_rate": 5.66918156874146e-06, "loss": 0.2013, "step": 1572 }, { "epoch": 0.8557051543587652, "grad_norm": 2.9349591732025146, "learning_rate": 5.627452974583219e-06, "loss": 0.2482, "step": 1573 }, { "epoch": 0.8562491500067999, "grad_norm": 1.8207327127456665, "learning_rate": 5.585869362543416e-06, "loss": 0.107, "step": 1574 }, { "epoch": 0.8567931456548348, "grad_norm": 4.597315311431885, "learning_rate": 5.544430868491629e-06, "loss": 0.1733, "step": 1575 }, { "epoch": 0.8573371413028695, "grad_norm": 1.7532278299331665, "learning_rate": 5.503137627823341e-06, "loss": 0.1138, "step": 1576 }, { "epoch": 0.8578811369509044, "grad_norm": 1.958823561668396, "learning_rate": 5.461989775459381e-06, "loss": 0.1315, "step": 1577 }, { "epoch": 0.8584251325989392, "grad_norm": 1.6528518199920654, "learning_rate": 5.420987445845532e-06, "loss": 0.1118, "step": 1578 }, { "epoch": 0.858969128246974, "grad_norm": 1.382939338684082, "learning_rate": 5.380130772952147e-06, "loss": 0.0782, "step": 1579 }, { "epoch": 0.8595131238950089, "grad_norm": 3.080315113067627, "learning_rate": 5.339419890273622e-06, "loss": 0.3002, "step": 1580 }, { "epoch": 0.8600571195430436, "grad_norm": 2.677290678024292, "learning_rate": 5.298854930828029e-06, "loss": 0.1951, "step": 1581 }, { "epoch": 0.8606011151910785, "grad_norm": 2.089850664138794, "learning_rate": 5.258436027156632e-06, "loss": 0.1763, "step": 1582 }, { "epoch": 0.8611451108391133, "grad_norm": 1.5543582439422607, "learning_rate": 5.218163311323471e-06, "loss": 0.0863, "step": 1583 }, { "epoch": 0.8616891064871481, "grad_norm": 3.3499083518981934, "learning_rate": 5.1780369149149464e-06, "loss": 0.1678, "step": 1584 }, { "epoch": 0.8622331021351829, "grad_norm": 1.8637248277664185, "learning_rate": 5.1380569690393846e-06, "loss": 0.093, "step": 1585 }, { "epoch": 0.8627770977832178, "grad_norm": 3.096461296081543, "learning_rate": 5.098223604326597e-06, "loss": 0.1397, "step": 1586 }, { "epoch": 0.8633210934312525, "grad_norm": 3.825195074081421, "learning_rate": 5.058536950927445e-06, "loss": 0.3308, "step": 1587 }, { "epoch": 0.8638650890792874, "grad_norm": 1.1091595888137817, "learning_rate": 5.018997138513421e-06, "loss": 0.0389, "step": 1588 }, { "epoch": 0.8644090847273221, "grad_norm": 1.737465262413025, "learning_rate": 4.979604296276274e-06, "loss": 0.0799, "step": 1589 }, { "epoch": 0.864953080375357, "grad_norm": 2.6226816177368164, "learning_rate": 4.940358552927515e-06, "loss": 0.0915, "step": 1590 }, { "epoch": 0.8654970760233918, "grad_norm": 2.10581111907959, "learning_rate": 4.901260036698008e-06, "loss": 0.1097, "step": 1591 }, { "epoch": 0.8660410716714266, "grad_norm": 2.790656328201294, "learning_rate": 4.862308875337606e-06, "loss": 0.1321, "step": 1592 }, { "epoch": 0.8665850673194615, "grad_norm": 1.890804409980774, "learning_rate": 4.823505196114686e-06, "loss": 0.1058, "step": 1593 }, { "epoch": 0.8671290629674963, "grad_norm": 2.415065050125122, "learning_rate": 4.784849125815743e-06, "loss": 0.1026, "step": 1594 }, { "epoch": 0.8676730586155311, "grad_norm": 2.2980716228485107, "learning_rate": 4.746340790744969e-06, "loss": 0.0767, "step": 1595 }, { "epoch": 0.8682170542635659, "grad_norm": 2.5799341201782227, "learning_rate": 4.707980316723837e-06, "loss": 0.1155, "step": 1596 }, { "epoch": 0.8687610499116007, "grad_norm": 2.50065016746521, "learning_rate": 4.669767829090748e-06, "loss": 0.144, "step": 1597 }, { "epoch": 0.8693050455596355, "grad_norm": 1.8984285593032837, "learning_rate": 4.631703452700542e-06, "loss": 0.0851, "step": 1598 }, { "epoch": 0.8698490412076704, "grad_norm": 1.9078527688980103, "learning_rate": 4.593787311924103e-06, "loss": 0.0474, "step": 1599 }, { "epoch": 0.8703930368557051, "grad_norm": 2.077728271484375, "learning_rate": 4.556019530648009e-06, "loss": 0.0833, "step": 1600 }, { "epoch": 0.87093703250374, "grad_norm": 0.9673436880111694, "learning_rate": 4.5184002322740785e-06, "loss": 0.337, "step": 1601 }, { "epoch": 0.8714810281517748, "grad_norm": 0.8802148103713989, "learning_rate": 4.480929539718986e-06, "loss": 0.2077, "step": 1602 }, { "epoch": 0.8720250237998096, "grad_norm": 1.0483946800231934, "learning_rate": 4.4436075754138384e-06, "loss": 0.2234, "step": 1603 }, { "epoch": 0.8725690194478444, "grad_norm": 1.032644271850586, "learning_rate": 4.406434461303782e-06, "loss": 0.1506, "step": 1604 }, { "epoch": 0.8731130150958792, "grad_norm": 0.8552057147026062, "learning_rate": 4.369410318847661e-06, "loss": 0.0722, "step": 1605 }, { "epoch": 0.873657010743914, "grad_norm": 0.9704299569129944, "learning_rate": 4.332535269017518e-06, "loss": 0.064, "step": 1606 }, { "epoch": 0.8742010063919489, "grad_norm": 0.8319167494773865, "learning_rate": 4.29580943229827e-06, "loss": 0.0512, "step": 1607 }, { "epoch": 0.8747450020399837, "grad_norm": 0.9854416847229004, "learning_rate": 4.259232928687318e-06, "loss": 0.0557, "step": 1608 }, { "epoch": 0.8752889976880185, "grad_norm": 1.2387385368347168, "learning_rate": 4.2228058776941025e-06, "loss": 0.1301, "step": 1609 }, { "epoch": 0.8758329933360534, "grad_norm": 1.2359501123428345, "learning_rate": 4.186528398339784e-06, "loss": 0.1337, "step": 1610 }, { "epoch": 0.8763769889840881, "grad_norm": 0.8162487745285034, "learning_rate": 4.150400609156774e-06, "loss": 0.0688, "step": 1611 }, { "epoch": 0.876920984632123, "grad_norm": 2.2920849323272705, "learning_rate": 4.1144226281883965e-06, "loss": 0.2257, "step": 1612 }, { "epoch": 0.8774649802801577, "grad_norm": 0.9135187864303589, "learning_rate": 4.078594572988537e-06, "loss": 0.0623, "step": 1613 }, { "epoch": 0.8780089759281926, "grad_norm": 1.0081349611282349, "learning_rate": 4.042916560621163e-06, "loss": 0.0411, "step": 1614 }, { "epoch": 0.8785529715762274, "grad_norm": 2.3004424571990967, "learning_rate": 4.007388707660015e-06, "loss": 0.231, "step": 1615 }, { "epoch": 0.8790969672242622, "grad_norm": 1.449098825454712, "learning_rate": 3.972011130188208e-06, "loss": 0.0681, "step": 1616 }, { "epoch": 0.879640962872297, "grad_norm": 1.9416675567626953, "learning_rate": 3.93678394379785e-06, "loss": 0.0734, "step": 1617 }, { "epoch": 0.8801849585203319, "grad_norm": 2.1252365112304688, "learning_rate": 3.901707263589671e-06, "loss": 0.1672, "step": 1618 }, { "epoch": 0.8807289541683666, "grad_norm": 1.9904075860977173, "learning_rate": 3.866781204172615e-06, "loss": 0.1518, "step": 1619 }, { "epoch": 0.8812729498164015, "grad_norm": 1.4934519529342651, "learning_rate": 3.832005879663492e-06, "loss": 0.1005, "step": 1620 }, { "epoch": 0.8818169454644362, "grad_norm": 0.9808096885681152, "learning_rate": 3.797381403686656e-06, "loss": 0.0542, "step": 1621 }, { "epoch": 0.8823609411124711, "grad_norm": 1.6409112215042114, "learning_rate": 3.7629078893735124e-06, "loss": 0.1114, "step": 1622 }, { "epoch": 0.882904936760506, "grad_norm": 1.5050103664398193, "learning_rate": 3.7285854493622428e-06, "loss": 0.061, "step": 1623 }, { "epoch": 0.8834489324085407, "grad_norm": 1.97392737865448, "learning_rate": 3.6944141957974244e-06, "loss": 0.1131, "step": 1624 }, { "epoch": 0.8839929280565756, "grad_norm": 1.9159128665924072, "learning_rate": 3.6603942403296343e-06, "loss": 0.1297, "step": 1625 }, { "epoch": 0.8845369237046103, "grad_norm": 1.322691798210144, "learning_rate": 3.626525694115124e-06, "loss": 0.0346, "step": 1626 }, { "epoch": 0.8850809193526452, "grad_norm": 0.9318520426750183, "learning_rate": 3.592808667815395e-06, "loss": 0.044, "step": 1627 }, { "epoch": 0.88562491500068, "grad_norm": 2.2436294555664062, "learning_rate": 3.5592432715968902e-06, "loss": 0.141, "step": 1628 }, { "epoch": 0.8861689106487148, "grad_norm": 4.926000118255615, "learning_rate": 3.525829615130649e-06, "loss": 0.1275, "step": 1629 }, { "epoch": 0.8867129062967496, "grad_norm": 2.1689817905426025, "learning_rate": 3.4925678075918787e-06, "loss": 0.186, "step": 1630 }, { "epoch": 0.8872569019447845, "grad_norm": 1.034318447113037, "learning_rate": 3.459457957659651e-06, "loss": 0.0281, "step": 1631 }, { "epoch": 0.8878008975928192, "grad_norm": 1.549639105796814, "learning_rate": 3.426500173516539e-06, "loss": 0.0889, "step": 1632 }, { "epoch": 0.8883448932408541, "grad_norm": 1.6429660320281982, "learning_rate": 3.393694562848254e-06, "loss": 0.1069, "step": 1633 }, { "epoch": 0.8888888888888888, "grad_norm": 1.2664613723754883, "learning_rate": 3.361041232843315e-06, "loss": 0.054, "step": 1634 }, { "epoch": 0.8894328845369237, "grad_norm": 2.6217501163482666, "learning_rate": 3.32854029019265e-06, "loss": 0.1795, "step": 1635 }, { "epoch": 0.8899768801849586, "grad_norm": 2.5693042278289795, "learning_rate": 3.2961918410892966e-06, "loss": 0.1816, "step": 1636 }, { "epoch": 0.8905208758329933, "grad_norm": 1.3367880582809448, "learning_rate": 3.263995991228036e-06, "loss": 0.0832, "step": 1637 }, { "epoch": 0.8910648714810282, "grad_norm": 1.7241603136062622, "learning_rate": 3.2319528458050587e-06, "loss": 0.1019, "step": 1638 }, { "epoch": 0.891608867129063, "grad_norm": 2.629528760910034, "learning_rate": 3.200062509517604e-06, "loss": 0.2019, "step": 1639 }, { "epoch": 0.8921528627770978, "grad_norm": 1.4212032556533813, "learning_rate": 3.1683250865636114e-06, "loss": 0.077, "step": 1640 }, { "epoch": 0.8926968584251326, "grad_norm": 2.028712749481201, "learning_rate": 3.1367406806414036e-06, "loss": 0.091, "step": 1641 }, { "epoch": 0.8932408540731674, "grad_norm": 2.4595437049865723, "learning_rate": 3.1053093949493627e-06, "loss": 0.2446, "step": 1642 }, { "epoch": 0.8937848497212022, "grad_norm": 1.6820522546768188, "learning_rate": 3.074031332185534e-06, "loss": 0.0772, "step": 1643 }, { "epoch": 0.8943288453692371, "grad_norm": 3.932166814804077, "learning_rate": 3.042906594547329e-06, "loss": 0.3023, "step": 1644 }, { "epoch": 0.8948728410172718, "grad_norm": 2.2325611114501953, "learning_rate": 3.0119352837311988e-06, "loss": 0.1679, "step": 1645 }, { "epoch": 0.8954168366653067, "grad_norm": 1.5536819696426392, "learning_rate": 2.98111750093229e-06, "loss": 0.1157, "step": 1646 }, { "epoch": 0.8959608323133414, "grad_norm": 1.3292300701141357, "learning_rate": 2.9504533468441174e-06, "loss": 0.071, "step": 1647 }, { "epoch": 0.8965048279613763, "grad_norm": 1.8766157627105713, "learning_rate": 2.9199429216582195e-06, "loss": 0.1024, "step": 1648 }, { "epoch": 0.8970488236094111, "grad_norm": 1.938150405883789, "learning_rate": 2.8895863250638367e-06, "loss": 0.0493, "step": 1649 }, { "epoch": 0.8975928192574459, "grad_norm": 1.9663032293319702, "learning_rate": 2.8593836562476272e-06, "loss": 0.0837, "step": 1650 }, { "epoch": 0.8981368149054808, "grad_norm": 0.7369717359542847, "learning_rate": 2.8293350138932805e-06, "loss": 0.1826, "step": 1651 }, { "epoch": 0.8986808105535156, "grad_norm": 1.0037497282028198, "learning_rate": 2.799440496181216e-06, "loss": 0.2505, "step": 1652 }, { "epoch": 0.8992248062015504, "grad_norm": 1.3233697414398193, "learning_rate": 2.769700200788289e-06, "loss": 0.2648, "step": 1653 }, { "epoch": 0.8997688018495852, "grad_norm": 1.4565203189849854, "learning_rate": 2.7401142248874412e-06, "loss": 0.2416, "step": 1654 }, { "epoch": 0.90031279749762, "grad_norm": 1.293020248413086, "learning_rate": 2.7106826651474073e-06, "loss": 0.1664, "step": 1655 }, { "epoch": 0.9008567931456548, "grad_norm": 1.6501991748809814, "learning_rate": 2.681405617732363e-06, "loss": 0.157, "step": 1656 }, { "epoch": 0.9014007887936897, "grad_norm": 1.224089503288269, "learning_rate": 2.6522831783016345e-06, "loss": 0.0838, "step": 1657 }, { "epoch": 0.9019447844417244, "grad_norm": 1.2141624689102173, "learning_rate": 2.623315442009422e-06, "loss": 0.1234, "step": 1658 }, { "epoch": 0.9024887800897593, "grad_norm": 1.0604627132415771, "learning_rate": 2.594502503504398e-06, "loss": 0.1039, "step": 1659 }, { "epoch": 0.9030327757377941, "grad_norm": 1.0698707103729248, "learning_rate": 2.565844456929478e-06, "loss": 0.0927, "step": 1660 }, { "epoch": 0.9035767713858289, "grad_norm": 1.2410540580749512, "learning_rate": 2.537341395921494e-06, "loss": 0.094, "step": 1661 }, { "epoch": 0.9041207670338637, "grad_norm": 0.8609293103218079, "learning_rate": 2.5089934136108664e-06, "loss": 0.0744, "step": 1662 }, { "epoch": 0.9046647626818985, "grad_norm": 1.7044366598129272, "learning_rate": 2.480800602621325e-06, "loss": 0.1855, "step": 1663 }, { "epoch": 0.9052087583299333, "grad_norm": 1.3923779726028442, "learning_rate": 2.452763055069579e-06, "loss": 0.0876, "step": 1664 }, { "epoch": 0.9057527539779682, "grad_norm": 1.5099945068359375, "learning_rate": 2.4248808625650376e-06, "loss": 0.112, "step": 1665 }, { "epoch": 0.906296749626003, "grad_norm": 2.0889129638671875, "learning_rate": 2.3971541162095323e-06, "loss": 0.15, "step": 1666 }, { "epoch": 0.9068407452740378, "grad_norm": 1.6550538539886475, "learning_rate": 2.3695829065969623e-06, "loss": 0.1151, "step": 1667 }, { "epoch": 0.9073847409220727, "grad_norm": 1.2602660655975342, "learning_rate": 2.3421673238130215e-06, "loss": 0.0667, "step": 1668 }, { "epoch": 0.9079287365701074, "grad_norm": 2.6563076972961426, "learning_rate": 2.3149074574349395e-06, "loss": 0.2377, "step": 1669 }, { "epoch": 0.9084727322181423, "grad_norm": 1.3382247686386108, "learning_rate": 2.287803396531152e-06, "loss": 0.0886, "step": 1670 }, { "epoch": 0.909016727866177, "grad_norm": 1.7384536266326904, "learning_rate": 2.2608552296610075e-06, "loss": 0.0763, "step": 1671 }, { "epoch": 0.9095607235142119, "grad_norm": 1.6171133518218994, "learning_rate": 2.2340630448745015e-06, "loss": 0.1036, "step": 1672 }, { "epoch": 0.9101047191622467, "grad_norm": 1.374774694442749, "learning_rate": 2.2074269297119587e-06, "loss": 0.0765, "step": 1673 } ], "logging_steps": 1, "max_steps": 1838, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 239, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.6714266653551493e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }