{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1461, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006844626967830253, "grad_norm": 133.3943328857422, "learning_rate": 1.0000000000000002e-06, "loss": 8.6817, "step": 1 }, { "epoch": 0.0013689253935660506, "grad_norm": 134.5225372314453, "learning_rate": 2.0000000000000003e-06, "loss": 9.0925, "step": 2 }, { "epoch": 0.002053388090349076, "grad_norm": 137.5360565185547, "learning_rate": 3e-06, "loss": 8.8044, "step": 3 }, { "epoch": 0.0027378507871321013, "grad_norm": 130.8012237548828, "learning_rate": 4.000000000000001e-06, "loss": 9.1971, "step": 4 }, { "epoch": 0.0034223134839151265, "grad_norm": 140.36355590820312, "learning_rate": 5e-06, "loss": 8.7898, "step": 5 }, { "epoch": 0.004106776180698152, "grad_norm": 112.94204711914062, "learning_rate": 6e-06, "loss": 9.3121, "step": 6 }, { "epoch": 0.004791238877481177, "grad_norm": 116.05644226074219, "learning_rate": 7.000000000000001e-06, "loss": 9.6282, "step": 7 }, { "epoch": 0.0054757015742642025, "grad_norm": 109.55550384521484, "learning_rate": 8.000000000000001e-06, "loss": 9.2852, "step": 8 }, { "epoch": 0.006160164271047228, "grad_norm": 114.4007797241211, "learning_rate": 9e-06, "loss": 9.4568, "step": 9 }, { "epoch": 0.006844626967830253, "grad_norm": 109.41667175292969, "learning_rate": 1e-05, "loss": 9.7598, "step": 10 }, { "epoch": 0.007529089664613279, "grad_norm": 113.44347381591797, "learning_rate": 1.1000000000000001e-05, "loss": 9.265, "step": 11 }, { "epoch": 0.008213552361396304, "grad_norm": 113.23077392578125, "learning_rate": 1.2e-05, "loss": 9.6606, "step": 12 }, { "epoch": 0.00889801505817933, "grad_norm": 109.01148986816406, "learning_rate": 1.3000000000000001e-05, "loss": 8.8583, "step": 13 }, { "epoch": 0.009582477754962354, "grad_norm": 108.45953369140625, "learning_rate": 1.4000000000000001e-05, "loss": 9.1199, "step": 14 }, { "epoch": 0.01026694045174538, "grad_norm": 109.039306640625, "learning_rate": 1.5e-05, "loss": 9.3846, "step": 15 }, { "epoch": 0.010951403148528405, "grad_norm": 106.01548767089844, "learning_rate": 1.6000000000000003e-05, "loss": 8.7933, "step": 16 }, { "epoch": 0.01163586584531143, "grad_norm": 107.05581665039062, "learning_rate": 1.7000000000000003e-05, "loss": 8.433, "step": 17 }, { "epoch": 0.012320328542094456, "grad_norm": 107.55448150634766, "learning_rate": 1.8e-05, "loss": 7.8288, "step": 18 }, { "epoch": 0.013004791238877482, "grad_norm": 104.10095977783203, "learning_rate": 1.9e-05, "loss": 7.7064, "step": 19 }, { "epoch": 0.013689253935660506, "grad_norm": 104.03397369384766, "learning_rate": 2e-05, "loss": 7.2742, "step": 20 }, { "epoch": 0.014373716632443531, "grad_norm": 110.89691925048828, "learning_rate": 2.1e-05, "loss": 6.4904, "step": 21 }, { "epoch": 0.015058179329226557, "grad_norm": 91.96565246582031, "learning_rate": 2.2000000000000003e-05, "loss": 6.3405, "step": 22 }, { "epoch": 0.01574264202600958, "grad_norm": 84.62356567382812, "learning_rate": 2.3000000000000003e-05, "loss": 5.9611, "step": 23 }, { "epoch": 0.01642710472279261, "grad_norm": 80.46206665039062, "learning_rate": 2.4e-05, "loss": 5.4951, "step": 24 }, { "epoch": 0.017111567419575632, "grad_norm": 80.10145568847656, "learning_rate": 2.5e-05, "loss": 5.3007, "step": 25 }, { "epoch": 0.01779603011635866, "grad_norm": 77.08944702148438, "learning_rate": 2.6000000000000002e-05, "loss": 4.8051, "step": 26 }, { "epoch": 0.018480492813141684, "grad_norm": 96.69658660888672, "learning_rate": 2.7000000000000002e-05, "loss": 4.5556, "step": 27 }, { "epoch": 0.019164955509924708, "grad_norm": 82.76516723632812, "learning_rate": 2.8000000000000003e-05, "loss": 3.9982, "step": 28 }, { "epoch": 0.019849418206707735, "grad_norm": 82.83394622802734, "learning_rate": 2.9e-05, "loss": 3.5758, "step": 29 }, { "epoch": 0.02053388090349076, "grad_norm": 86.39913177490234, "learning_rate": 3e-05, "loss": 3.2238, "step": 30 }, { "epoch": 0.021218343600273786, "grad_norm": 84.46778869628906, "learning_rate": 3.1e-05, "loss": 2.8134, "step": 31 }, { "epoch": 0.02190280629705681, "grad_norm": 82.8138198852539, "learning_rate": 3.2000000000000005e-05, "loss": 2.1693, "step": 32 }, { "epoch": 0.022587268993839837, "grad_norm": 62.1871452331543, "learning_rate": 3.3e-05, "loss": 1.3688, "step": 33 }, { "epoch": 0.02327173169062286, "grad_norm": 43.18509292602539, "learning_rate": 3.4000000000000007e-05, "loss": 1.0285, "step": 34 }, { "epoch": 0.023956194387405885, "grad_norm": 34.47530746459961, "learning_rate": 3.5e-05, "loss": 0.6926, "step": 35 }, { "epoch": 0.024640657084188913, "grad_norm": 61.743656158447266, "learning_rate": 3.6e-05, "loss": 0.6794, "step": 36 }, { "epoch": 0.025325119780971937, "grad_norm": 58.102176666259766, "learning_rate": 3.7e-05, "loss": 0.6367, "step": 37 }, { "epoch": 0.026009582477754964, "grad_norm": 31.824047088623047, "learning_rate": 3.8e-05, "loss": 0.4706, "step": 38 }, { "epoch": 0.026694045174537988, "grad_norm": 14.520415306091309, "learning_rate": 3.9000000000000006e-05, "loss": 0.4119, "step": 39 }, { "epoch": 0.02737850787132101, "grad_norm": 12.21712875366211, "learning_rate": 4e-05, "loss": 0.3602, "step": 40 }, { "epoch": 0.02806297056810404, "grad_norm": 8.008808135986328, "learning_rate": 4.1e-05, "loss": 0.2863, "step": 41 }, { "epoch": 0.028747433264887063, "grad_norm": 12.321932792663574, "learning_rate": 4.2e-05, "loss": 0.2318, "step": 42 }, { "epoch": 0.02943189596167009, "grad_norm": 37.28017807006836, "learning_rate": 4.3e-05, "loss": 0.4295, "step": 43 }, { "epoch": 0.030116358658453114, "grad_norm": 18.66657829284668, "learning_rate": 4.4000000000000006e-05, "loss": 0.222, "step": 44 }, { "epoch": 0.030800821355236138, "grad_norm": 16.340362548828125, "learning_rate": 4.5e-05, "loss": 0.2629, "step": 45 }, { "epoch": 0.03148528405201916, "grad_norm": 18.81378936767578, "learning_rate": 4.600000000000001e-05, "loss": 0.2339, "step": 46 }, { "epoch": 0.03216974674880219, "grad_norm": 14.666397094726562, "learning_rate": 4.7e-05, "loss": 0.2562, "step": 47 }, { "epoch": 0.03285420944558522, "grad_norm": 12.222918510437012, "learning_rate": 4.8e-05, "loss": 0.0858, "step": 48 }, { "epoch": 0.03353867214236824, "grad_norm": 12.57537841796875, "learning_rate": 4.9e-05, "loss": 0.1305, "step": 49 }, { "epoch": 0.034223134839151265, "grad_norm": 12.034661293029785, "learning_rate": 5e-05, "loss": 0.2477, "step": 50 }, { "epoch": 0.03490759753593429, "grad_norm": 19.57526397705078, "learning_rate": 5.1000000000000006e-05, "loss": 0.7291, "step": 51 }, { "epoch": 0.03559206023271732, "grad_norm": 17.879213333129883, "learning_rate": 5.2000000000000004e-05, "loss": 0.7615, "step": 52 }, { "epoch": 0.03627652292950034, "grad_norm": 15.636523246765137, "learning_rate": 5.300000000000001e-05, "loss": 0.6893, "step": 53 }, { "epoch": 0.03696098562628337, "grad_norm": 10.58729076385498, "learning_rate": 5.4000000000000005e-05, "loss": 0.2366, "step": 54 }, { "epoch": 0.03764544832306639, "grad_norm": 20.624061584472656, "learning_rate": 5.500000000000001e-05, "loss": 0.4799, "step": 55 }, { "epoch": 0.038329911019849415, "grad_norm": 19.63909149169922, "learning_rate": 5.6000000000000006e-05, "loss": 0.404, "step": 56 }, { "epoch": 0.039014373716632446, "grad_norm": 14.183073043823242, "learning_rate": 5.6999999999999996e-05, "loss": 0.2917, "step": 57 }, { "epoch": 0.03969883641341547, "grad_norm": 8.666266441345215, "learning_rate": 5.8e-05, "loss": 0.1484, "step": 58 }, { "epoch": 0.040383299110198494, "grad_norm": 15.578598022460938, "learning_rate": 5.9e-05, "loss": 0.3546, "step": 59 }, { "epoch": 0.04106776180698152, "grad_norm": 15.352912902832031, "learning_rate": 6e-05, "loss": 0.2374, "step": 60 }, { "epoch": 0.04175222450376454, "grad_norm": 16.00190544128418, "learning_rate": 6.1e-05, "loss": 0.2996, "step": 61 }, { "epoch": 0.04243668720054757, "grad_norm": 18.2515926361084, "learning_rate": 6.2e-05, "loss": 0.4819, "step": 62 }, { "epoch": 0.043121149897330596, "grad_norm": 16.319528579711914, "learning_rate": 6.3e-05, "loss": 0.2373, "step": 63 }, { "epoch": 0.04380561259411362, "grad_norm": 12.565672874450684, "learning_rate": 6.400000000000001e-05, "loss": 0.1826, "step": 64 }, { "epoch": 0.044490075290896644, "grad_norm": 19.408931732177734, "learning_rate": 6.500000000000001e-05, "loss": 0.3449, "step": 65 }, { "epoch": 0.045174537987679675, "grad_norm": 11.87861156463623, "learning_rate": 6.6e-05, "loss": 0.2552, "step": 66 }, { "epoch": 0.0458590006844627, "grad_norm": 7.692336559295654, "learning_rate": 6.7e-05, "loss": 0.1735, "step": 67 }, { "epoch": 0.04654346338124572, "grad_norm": 11.235315322875977, "learning_rate": 6.800000000000001e-05, "loss": 0.2767, "step": 68 }, { "epoch": 0.04722792607802875, "grad_norm": 21.279800415039062, "learning_rate": 6.9e-05, "loss": 0.3201, "step": 69 }, { "epoch": 0.04791238877481177, "grad_norm": 13.131220817565918, "learning_rate": 7e-05, "loss": 0.225, "step": 70 }, { "epoch": 0.0485968514715948, "grad_norm": 18.59111785888672, "learning_rate": 7.1e-05, "loss": 0.2908, "step": 71 }, { "epoch": 0.049281314168377825, "grad_norm": 20.313138961791992, "learning_rate": 7.2e-05, "loss": 0.3744, "step": 72 }, { "epoch": 0.04996577686516085, "grad_norm": 7.861342906951904, "learning_rate": 7.3e-05, "loss": 0.2741, "step": 73 }, { "epoch": 0.05065023956194387, "grad_norm": 18.81517219543457, "learning_rate": 7.4e-05, "loss": 0.2744, "step": 74 }, { "epoch": 0.0513347022587269, "grad_norm": 19.289575576782227, "learning_rate": 7.500000000000001e-05, "loss": 0.2149, "step": 75 }, { "epoch": 0.05201916495550993, "grad_norm": 15.097339630126953, "learning_rate": 7.6e-05, "loss": 0.3198, "step": 76 }, { "epoch": 0.05270362765229295, "grad_norm": 11.796150207519531, "learning_rate": 7.7e-05, "loss": 0.1797, "step": 77 }, { "epoch": 0.053388090349075976, "grad_norm": 5.355785369873047, "learning_rate": 7.800000000000001e-05, "loss": 0.1396, "step": 78 }, { "epoch": 0.054072553045859, "grad_norm": 18.625282287597656, "learning_rate": 7.900000000000001e-05, "loss": 0.16, "step": 79 }, { "epoch": 0.05475701574264202, "grad_norm": 8.895524978637695, "learning_rate": 8e-05, "loss": 0.191, "step": 80 }, { "epoch": 0.055441478439425054, "grad_norm": 11.112372398376465, "learning_rate": 8.1e-05, "loss": 0.2302, "step": 81 }, { "epoch": 0.05612594113620808, "grad_norm": 20.419580459594727, "learning_rate": 8.2e-05, "loss": 0.3812, "step": 82 }, { "epoch": 0.0568104038329911, "grad_norm": 15.033398628234863, "learning_rate": 8.3e-05, "loss": 0.3122, "step": 83 }, { "epoch": 0.057494866529774126, "grad_norm": 8.075430870056152, "learning_rate": 8.4e-05, "loss": 0.2088, "step": 84 }, { "epoch": 0.05817932922655715, "grad_norm": 13.808077812194824, "learning_rate": 8.5e-05, "loss": 0.2681, "step": 85 }, { "epoch": 0.05886379192334018, "grad_norm": 6.71148157119751, "learning_rate": 8.6e-05, "loss": 0.1601, "step": 86 }, { "epoch": 0.059548254620123205, "grad_norm": 10.607872009277344, "learning_rate": 8.7e-05, "loss": 0.1628, "step": 87 }, { "epoch": 0.06023271731690623, "grad_norm": 11.25609016418457, "learning_rate": 8.800000000000001e-05, "loss": 0.2271, "step": 88 }, { "epoch": 0.06091718001368925, "grad_norm": 7.977912425994873, "learning_rate": 8.900000000000001e-05, "loss": 0.1158, "step": 89 }, { "epoch": 0.061601642710472276, "grad_norm": 9.722609519958496, "learning_rate": 9e-05, "loss": 0.1135, "step": 90 }, { "epoch": 0.06228610540725531, "grad_norm": 49.313777923583984, "learning_rate": 9.1e-05, "loss": 0.1674, "step": 91 }, { "epoch": 0.06297056810403832, "grad_norm": 22.01307487487793, "learning_rate": 9.200000000000001e-05, "loss": 0.3624, "step": 92 }, { "epoch": 0.06365503080082136, "grad_norm": 8.820087432861328, "learning_rate": 9.300000000000001e-05, "loss": 0.1757, "step": 93 }, { "epoch": 0.06433949349760439, "grad_norm": 8.204625129699707, "learning_rate": 9.4e-05, "loss": 0.128, "step": 94 }, { "epoch": 0.0650239561943874, "grad_norm": 4.797610759735107, "learning_rate": 9.5e-05, "loss": 0.0395, "step": 95 }, { "epoch": 0.06570841889117043, "grad_norm": 13.214459419250488, "learning_rate": 9.6e-05, "loss": 0.1358, "step": 96 }, { "epoch": 0.06639288158795345, "grad_norm": 15.513224601745605, "learning_rate": 9.7e-05, "loss": 0.139, "step": 97 }, { "epoch": 0.06707734428473648, "grad_norm": 6.744882583618164, "learning_rate": 9.8e-05, "loss": 0.054, "step": 98 }, { "epoch": 0.06776180698151951, "grad_norm": 13.277941703796387, "learning_rate": 9.900000000000001e-05, "loss": 0.2499, "step": 99 }, { "epoch": 0.06844626967830253, "grad_norm": 4.069041728973389, "learning_rate": 0.0001, "loss": 0.0213, "step": 100 }, { "epoch": 0.06913073237508556, "grad_norm": 9.01382827758789, "learning_rate": 9.999986679414612e-05, "loss": 0.3325, "step": 101 }, { "epoch": 0.06981519507186858, "grad_norm": 11.294988632202148, "learning_rate": 9.999946717729422e-05, "loss": 0.3006, "step": 102 }, { "epoch": 0.07049965776865161, "grad_norm": 16.158906936645508, "learning_rate": 9.999880115157356e-05, "loss": 0.4242, "step": 103 }, { "epoch": 0.07118412046543464, "grad_norm": 4.419376373291016, "learning_rate": 9.99978687205329e-05, "loss": 0.0377, "step": 104 }, { "epoch": 0.07186858316221766, "grad_norm": 18.51973533630371, "learning_rate": 9.99966698891404e-05, "loss": 0.3585, "step": 105 }, { "epoch": 0.07255304585900069, "grad_norm": 22.38385772705078, "learning_rate": 9.999520466378376e-05, "loss": 0.4242, "step": 106 }, { "epoch": 0.0732375085557837, "grad_norm": 16.08448028564453, "learning_rate": 9.999347305227002e-05, "loss": 0.2782, "step": 107 }, { "epoch": 0.07392197125256673, "grad_norm": 34.474430084228516, "learning_rate": 9.999147506382565e-05, "loss": 0.5906, "step": 108 }, { "epoch": 0.07460643394934977, "grad_norm": 26.33028793334961, "learning_rate": 9.998921070909634e-05, "loss": 0.3691, "step": 109 }, { "epoch": 0.07529089664613278, "grad_norm": 10.850238800048828, "learning_rate": 9.998668000014713e-05, "loss": 0.2587, "step": 110 }, { "epoch": 0.07597535934291581, "grad_norm": 6.580715656280518, "learning_rate": 9.998388295046226e-05, "loss": 0.1016, "step": 111 }, { "epoch": 0.07665982203969883, "grad_norm": 6.588092803955078, "learning_rate": 9.998081957494502e-05, "loss": 0.2169, "step": 112 }, { "epoch": 0.07734428473648186, "grad_norm": 12.05201530456543, "learning_rate": 9.997748988991781e-05, "loss": 0.2638, "step": 113 }, { "epoch": 0.07802874743326489, "grad_norm": 10.036999702453613, "learning_rate": 9.997389391312198e-05, "loss": 0.2016, "step": 114 }, { "epoch": 0.07871321013004791, "grad_norm": 10.636836051940918, "learning_rate": 9.997003166371773e-05, "loss": 0.3407, "step": 115 }, { "epoch": 0.07939767282683094, "grad_norm": 11.341875076293945, "learning_rate": 9.996590316228401e-05, "loss": 0.217, "step": 116 }, { "epoch": 0.08008213552361396, "grad_norm": 10.519463539123535, "learning_rate": 9.996150843081847e-05, "loss": 0.2689, "step": 117 }, { "epoch": 0.08076659822039699, "grad_norm": 14.48704719543457, "learning_rate": 9.995684749273726e-05, "loss": 0.2911, "step": 118 }, { "epoch": 0.08145106091718002, "grad_norm": 13.348938941955566, "learning_rate": 9.995192037287495e-05, "loss": 0.2273, "step": 119 }, { "epoch": 0.08213552361396304, "grad_norm": 10.226967811584473, "learning_rate": 9.994672709748439e-05, "loss": 0.2104, "step": 120 }, { "epoch": 0.08281998631074607, "grad_norm": 11.386143684387207, "learning_rate": 9.994126769423655e-05, "loss": 0.2069, "step": 121 }, { "epoch": 0.08350444900752908, "grad_norm": 8.216964721679688, "learning_rate": 9.993554219222043e-05, "loss": 0.2307, "step": 122 }, { "epoch": 0.08418891170431211, "grad_norm": 7.292871475219727, "learning_rate": 9.992955062194284e-05, "loss": 0.1484, "step": 123 }, { "epoch": 0.08487337440109514, "grad_norm": 18.092880249023438, "learning_rate": 9.992329301532825e-05, "loss": 0.426, "step": 124 }, { "epoch": 0.08555783709787816, "grad_norm": 9.72424030303955, "learning_rate": 9.99167694057187e-05, "loss": 0.2151, "step": 125 }, { "epoch": 0.08624229979466119, "grad_norm": 14.050273895263672, "learning_rate": 9.990997982787347e-05, "loss": 0.2601, "step": 126 }, { "epoch": 0.08692676249144421, "grad_norm": 37.251556396484375, "learning_rate": 9.990292431796901e-05, "loss": 0.321, "step": 127 }, { "epoch": 0.08761122518822724, "grad_norm": 10.154332160949707, "learning_rate": 9.989560291359877e-05, "loss": 0.2761, "step": 128 }, { "epoch": 0.08829568788501027, "grad_norm": 12.595226287841797, "learning_rate": 9.988801565377288e-05, "loss": 0.2028, "step": 129 }, { "epoch": 0.08898015058179329, "grad_norm": 16.50782012939453, "learning_rate": 9.988016257891805e-05, "loss": 0.1909, "step": 130 }, { "epoch": 0.08966461327857632, "grad_norm": 9.688196182250977, "learning_rate": 9.987204373087729e-05, "loss": 0.2251, "step": 131 }, { "epoch": 0.09034907597535935, "grad_norm": 6.656239032745361, "learning_rate": 9.986365915290972e-05, "loss": 0.1847, "step": 132 }, { "epoch": 0.09103353867214237, "grad_norm": 10.905471801757812, "learning_rate": 9.985500888969035e-05, "loss": 0.2188, "step": 133 }, { "epoch": 0.0917180013689254, "grad_norm": 5.864986896514893, "learning_rate": 9.984609298730981e-05, "loss": 0.0745, "step": 134 }, { "epoch": 0.09240246406570841, "grad_norm": 5.575222015380859, "learning_rate": 9.983691149327409e-05, "loss": 0.1826, "step": 135 }, { "epoch": 0.09308692676249145, "grad_norm": 6.736145973205566, "learning_rate": 9.982746445650436e-05, "loss": 0.1104, "step": 136 }, { "epoch": 0.09377138945927448, "grad_norm": 6.073530197143555, "learning_rate": 9.981775192733665e-05, "loss": 0.0968, "step": 137 }, { "epoch": 0.0944558521560575, "grad_norm": 7.234757900238037, "learning_rate": 9.980777395752157e-05, "loss": 0.082, "step": 138 }, { "epoch": 0.09514031485284052, "grad_norm": 11.460409164428711, "learning_rate": 9.97975306002241e-05, "loss": 0.2367, "step": 139 }, { "epoch": 0.09582477754962354, "grad_norm": 9.880468368530273, "learning_rate": 9.978702191002323e-05, "loss": 0.2203, "step": 140 }, { "epoch": 0.09650924024640657, "grad_norm": 5.276390552520752, "learning_rate": 9.977624794291171e-05, "loss": 0.0902, "step": 141 }, { "epoch": 0.0971937029431896, "grad_norm": 4.643181324005127, "learning_rate": 9.97652087562958e-05, "loss": 0.1501, "step": 142 }, { "epoch": 0.09787816563997262, "grad_norm": 10.8096923828125, "learning_rate": 9.975390440899484e-05, "loss": 0.2241, "step": 143 }, { "epoch": 0.09856262833675565, "grad_norm": 9.880041122436523, "learning_rate": 9.974233496124106e-05, "loss": 0.1017, "step": 144 }, { "epoch": 0.09924709103353867, "grad_norm": 7.668142795562744, "learning_rate": 9.973050047467916e-05, "loss": 0.0805, "step": 145 }, { "epoch": 0.0999315537303217, "grad_norm": 11.288379669189453, "learning_rate": 9.971840101236609e-05, "loss": 0.1733, "step": 146 }, { "epoch": 0.10061601642710473, "grad_norm": 16.269683837890625, "learning_rate": 9.970603663877059e-05, "loss": 0.1718, "step": 147 }, { "epoch": 0.10130047912388775, "grad_norm": 4.129180908203125, "learning_rate": 9.969340741977294e-05, "loss": 0.0836, "step": 148 }, { "epoch": 0.10198494182067078, "grad_norm": 6.336442470550537, "learning_rate": 9.968051342266458e-05, "loss": 0.0808, "step": 149 }, { "epoch": 0.1026694045174538, "grad_norm": 7.243290901184082, "learning_rate": 9.966735471614775e-05, "loss": 0.1736, "step": 150 }, { "epoch": 0.10335386721423682, "grad_norm": 15.34333610534668, "learning_rate": 9.965393137033512e-05, "loss": 0.4038, "step": 151 }, { "epoch": 0.10403832991101986, "grad_norm": 5.92156982421875, "learning_rate": 9.964024345674943e-05, "loss": 0.0742, "step": 152 }, { "epoch": 0.10472279260780287, "grad_norm": 22.832197189331055, "learning_rate": 9.962629104832307e-05, "loss": 0.4899, "step": 153 }, { "epoch": 0.1054072553045859, "grad_norm": 7.452009677886963, "learning_rate": 9.961207421939774e-05, "loss": 0.1396, "step": 154 }, { "epoch": 0.10609171800136892, "grad_norm": 19.482696533203125, "learning_rate": 9.959759304572402e-05, "loss": 0.5408, "step": 155 }, { "epoch": 0.10677618069815195, "grad_norm": 17.794151306152344, "learning_rate": 9.958284760446103e-05, "loss": 0.3932, "step": 156 }, { "epoch": 0.10746064339493498, "grad_norm": 18.70676040649414, "learning_rate": 9.956783797417592e-05, "loss": 0.3695, "step": 157 }, { "epoch": 0.108145106091718, "grad_norm": 9.102005958557129, "learning_rate": 9.955256423484351e-05, "loss": 0.231, "step": 158 }, { "epoch": 0.10882956878850103, "grad_norm": 12.468313217163086, "learning_rate": 9.953702646784587e-05, "loss": 0.285, "step": 159 }, { "epoch": 0.10951403148528405, "grad_norm": 5.375672817230225, "learning_rate": 9.952122475597183e-05, "loss": 0.145, "step": 160 }, { "epoch": 0.11019849418206708, "grad_norm": 12.612732887268066, "learning_rate": 9.950515918341666e-05, "loss": 0.2746, "step": 161 }, { "epoch": 0.11088295687885011, "grad_norm": 10.144058227539062, "learning_rate": 9.948882983578142e-05, "loss": 0.2173, "step": 162 }, { "epoch": 0.11156741957563313, "grad_norm": 8.544638633728027, "learning_rate": 9.947223680007278e-05, "loss": 0.1303, "step": 163 }, { "epoch": 0.11225188227241616, "grad_norm": 8.805436134338379, "learning_rate": 9.945538016470228e-05, "loss": 0.2202, "step": 164 }, { "epoch": 0.11293634496919917, "grad_norm": 13.094097137451172, "learning_rate": 9.943826001948602e-05, "loss": 0.3081, "step": 165 }, { "epoch": 0.1136208076659822, "grad_norm": 8.539441108703613, "learning_rate": 9.942087645564414e-05, "loss": 0.2226, "step": 166 }, { "epoch": 0.11430527036276524, "grad_norm": 16.653034210205078, "learning_rate": 9.940322956580035e-05, "loss": 0.3218, "step": 167 }, { "epoch": 0.11498973305954825, "grad_norm": 6.876267910003662, "learning_rate": 9.93853194439814e-05, "loss": 0.222, "step": 168 }, { "epoch": 0.11567419575633128, "grad_norm": 5.2602925300598145, "learning_rate": 9.936714618561662e-05, "loss": 0.0888, "step": 169 }, { "epoch": 0.1163586584531143, "grad_norm": 5.9378180503845215, "learning_rate": 9.934870988753737e-05, "loss": 0.1239, "step": 170 }, { "epoch": 0.11704312114989733, "grad_norm": 9.163177490234375, "learning_rate": 9.93300106479766e-05, "loss": 0.2723, "step": 171 }, { "epoch": 0.11772758384668036, "grad_norm": 5.671283721923828, "learning_rate": 9.93110485665682e-05, "loss": 0.1959, "step": 172 }, { "epoch": 0.11841204654346338, "grad_norm": 7.381270885467529, "learning_rate": 9.92918237443466e-05, "loss": 0.1516, "step": 173 }, { "epoch": 0.11909650924024641, "grad_norm": 11.422346115112305, "learning_rate": 9.927233628374615e-05, "loss": 0.2034, "step": 174 }, { "epoch": 0.11978097193702943, "grad_norm": 6.568750858306885, "learning_rate": 9.925258628860059e-05, "loss": 0.1378, "step": 175 }, { "epoch": 0.12046543463381246, "grad_norm": 14.708685874938965, "learning_rate": 9.923257386414253e-05, "loss": 0.2484, "step": 176 }, { "epoch": 0.12114989733059549, "grad_norm": 15.484065055847168, "learning_rate": 9.921229911700287e-05, "loss": 0.1921, "step": 177 }, { "epoch": 0.1218343600273785, "grad_norm": 15.454427719116211, "learning_rate": 9.919176215521018e-05, "loss": 0.2807, "step": 178 }, { "epoch": 0.12251882272416154, "grad_norm": 12.064361572265625, "learning_rate": 9.917096308819021e-05, "loss": 0.234, "step": 179 }, { "epoch": 0.12320328542094455, "grad_norm": 15.833438873291016, "learning_rate": 9.914990202676529e-05, "loss": 0.2672, "step": 180 }, { "epoch": 0.12388774811772758, "grad_norm": 12.431648254394531, "learning_rate": 9.912857908315363e-05, "loss": 0.2649, "step": 181 }, { "epoch": 0.12457221081451061, "grad_norm": 7.222988128662109, "learning_rate": 9.91069943709689e-05, "loss": 0.2092, "step": 182 }, { "epoch": 0.12525667351129363, "grad_norm": 5.122772693634033, "learning_rate": 9.908514800521953e-05, "loss": 0.1848, "step": 183 }, { "epoch": 0.12594113620807665, "grad_norm": 6.150139331817627, "learning_rate": 9.906304010230801e-05, "loss": 0.1568, "step": 184 }, { "epoch": 0.1266255989048597, "grad_norm": 11.600407600402832, "learning_rate": 9.904067078003048e-05, "loss": 0.2814, "step": 185 }, { "epoch": 0.1273100616016427, "grad_norm": 6.324848175048828, "learning_rate": 9.901804015757588e-05, "loss": 0.1516, "step": 186 }, { "epoch": 0.12799452429842573, "grad_norm": 8.196759223937988, "learning_rate": 9.899514835552548e-05, "loss": 0.1878, "step": 187 }, { "epoch": 0.12867898699520877, "grad_norm": 4.781862258911133, "learning_rate": 9.897199549585217e-05, "loss": 0.143, "step": 188 }, { "epoch": 0.1293634496919918, "grad_norm": 6.053997993469238, "learning_rate": 9.894858170191983e-05, "loss": 0.1772, "step": 189 }, { "epoch": 0.1300479123887748, "grad_norm": 9.608360290527344, "learning_rate": 9.89249070984826e-05, "loss": 0.2557, "step": 190 }, { "epoch": 0.13073237508555785, "grad_norm": 5.647324085235596, "learning_rate": 9.890097181168431e-05, "loss": 0.1323, "step": 191 }, { "epoch": 0.13141683778234087, "grad_norm": 5.115596771240234, "learning_rate": 9.887677596905779e-05, "loss": 0.1119, "step": 192 }, { "epoch": 0.13210130047912388, "grad_norm": 6.961941719055176, "learning_rate": 9.885231969952416e-05, "loss": 0.0931, "step": 193 }, { "epoch": 0.1327857631759069, "grad_norm": 4.01815938949585, "learning_rate": 9.882760313339213e-05, "loss": 0.0779, "step": 194 }, { "epoch": 0.13347022587268995, "grad_norm": 7.847511291503906, "learning_rate": 9.880262640235737e-05, "loss": 0.1161, "step": 195 }, { "epoch": 0.13415468856947296, "grad_norm": 5.6305036544799805, "learning_rate": 9.877738963950174e-05, "loss": 0.0932, "step": 196 }, { "epoch": 0.13483915126625598, "grad_norm": 5.9582977294921875, "learning_rate": 9.875189297929263e-05, "loss": 0.0746, "step": 197 }, { "epoch": 0.13552361396303902, "grad_norm": 4.453327178955078, "learning_rate": 9.872613655758221e-05, "loss": 0.0733, "step": 198 }, { "epoch": 0.13620807665982204, "grad_norm": 5.113376140594482, "learning_rate": 9.870012051160673e-05, "loss": 0.056, "step": 199 }, { "epoch": 0.13689253935660506, "grad_norm": 10.633220672607422, "learning_rate": 9.867384497998576e-05, "loss": 0.1667, "step": 200 }, { "epoch": 0.1375770020533881, "grad_norm": 21.0787296295166, "learning_rate": 9.864731010272152e-05, "loss": 0.3112, "step": 201 }, { "epoch": 0.13826146475017112, "grad_norm": 21.05179214477539, "learning_rate": 9.862051602119801e-05, "loss": 0.3141, "step": 202 }, { "epoch": 0.13894592744695414, "grad_norm": 3.9383182525634766, "learning_rate": 9.859346287818039e-05, "loss": 0.1115, "step": 203 }, { "epoch": 0.13963039014373715, "grad_norm": 16.245912551879883, "learning_rate": 9.856615081781413e-05, "loss": 0.317, "step": 204 }, { "epoch": 0.1403148528405202, "grad_norm": 8.814828872680664, "learning_rate": 9.85385799856243e-05, "loss": 0.1945, "step": 205 }, { "epoch": 0.14099931553730322, "grad_norm": 24.294551849365234, "learning_rate": 9.851075052851475e-05, "loss": 0.5082, "step": 206 }, { "epoch": 0.14168377823408623, "grad_norm": 10.484267234802246, "learning_rate": 9.848266259476733e-05, "loss": 0.241, "step": 207 }, { "epoch": 0.14236824093086928, "grad_norm": 7.027013778686523, "learning_rate": 9.845431633404112e-05, "loss": 0.2384, "step": 208 }, { "epoch": 0.1430527036276523, "grad_norm": 11.033212661743164, "learning_rate": 9.842571189737167e-05, "loss": 0.2902, "step": 209 }, { "epoch": 0.1437371663244353, "grad_norm": 7.913508415222168, "learning_rate": 9.839684943717007e-05, "loss": 0.2218, "step": 210 }, { "epoch": 0.14442162902121836, "grad_norm": 12.220626831054688, "learning_rate": 9.83677291072223e-05, "loss": 0.2234, "step": 211 }, { "epoch": 0.14510609171800137, "grad_norm": 5.488425254821777, "learning_rate": 9.833835106268827e-05, "loss": 0.2117, "step": 212 }, { "epoch": 0.1457905544147844, "grad_norm": 4.815133094787598, "learning_rate": 9.830871546010112e-05, "loss": 0.2499, "step": 213 }, { "epoch": 0.1464750171115674, "grad_norm": 3.710526943206787, "learning_rate": 9.827882245736625e-05, "loss": 0.2237, "step": 214 }, { "epoch": 0.14715947980835045, "grad_norm": 10.712550163269043, "learning_rate": 9.824867221376058e-05, "loss": 0.277, "step": 215 }, { "epoch": 0.14784394250513347, "grad_norm": 6.414258003234863, "learning_rate": 9.821826488993167e-05, "loss": 0.2393, "step": 216 }, { "epoch": 0.14852840520191649, "grad_norm": 8.124093055725098, "learning_rate": 9.818760064789687e-05, "loss": 0.3032, "step": 217 }, { "epoch": 0.14921286789869953, "grad_norm": 5.184533596038818, "learning_rate": 9.815667965104244e-05, "loss": 0.165, "step": 218 }, { "epoch": 0.14989733059548255, "grad_norm": 8.32724380493164, "learning_rate": 9.812550206412267e-05, "loss": 0.2076, "step": 219 }, { "epoch": 0.15058179329226556, "grad_norm": 7.017328262329102, "learning_rate": 9.809406805325909e-05, "loss": 0.1432, "step": 220 }, { "epoch": 0.1512662559890486, "grad_norm": 4.140676021575928, "learning_rate": 9.806237778593941e-05, "loss": 0.1137, "step": 221 }, { "epoch": 0.15195071868583163, "grad_norm": 12.404330253601074, "learning_rate": 9.803043143101684e-05, "loss": 0.2468, "step": 222 }, { "epoch": 0.15263518138261464, "grad_norm": 4.420513153076172, "learning_rate": 9.799822915870902e-05, "loss": 0.1629, "step": 223 }, { "epoch": 0.15331964407939766, "grad_norm": 3.874195098876953, "learning_rate": 9.796577114059722e-05, "loss": 0.1211, "step": 224 }, { "epoch": 0.1540041067761807, "grad_norm": 7.152524948120117, "learning_rate": 9.793305754962532e-05, "loss": 0.1511, "step": 225 }, { "epoch": 0.15468856947296372, "grad_norm": 5.158159255981445, "learning_rate": 9.790008856009903e-05, "loss": 0.1615, "step": 226 }, { "epoch": 0.15537303216974674, "grad_norm": 6.27839994430542, "learning_rate": 9.786686434768482e-05, "loss": 0.1189, "step": 227 }, { "epoch": 0.15605749486652978, "grad_norm": 3.47102689743042, "learning_rate": 9.783338508940909e-05, "loss": 0.0906, "step": 228 }, { "epoch": 0.1567419575633128, "grad_norm": 10.171895027160645, "learning_rate": 9.779965096365717e-05, "loss": 0.3156, "step": 229 }, { "epoch": 0.15742642026009582, "grad_norm": 12.853466033935547, "learning_rate": 9.776566215017238e-05, "loss": 0.2747, "step": 230 }, { "epoch": 0.15811088295687886, "grad_norm": 6.99425745010376, "learning_rate": 9.773141883005506e-05, "loss": 0.1196, "step": 231 }, { "epoch": 0.15879534565366188, "grad_norm": 3.012573003768921, "learning_rate": 9.769692118576163e-05, "loss": 0.0971, "step": 232 }, { "epoch": 0.1594798083504449, "grad_norm": 11.016730308532715, "learning_rate": 9.766216940110366e-05, "loss": 0.1904, "step": 233 }, { "epoch": 0.1601642710472279, "grad_norm": 3.567908525466919, "learning_rate": 9.762716366124675e-05, "loss": 0.11, "step": 234 }, { "epoch": 0.16084873374401096, "grad_norm": 7.971604824066162, "learning_rate": 9.759190415270973e-05, "loss": 0.1575, "step": 235 }, { "epoch": 0.16153319644079397, "grad_norm": 3.172243595123291, "learning_rate": 9.755639106336347e-05, "loss": 0.1336, "step": 236 }, { "epoch": 0.162217659137577, "grad_norm": 9.752211570739746, "learning_rate": 9.752062458243005e-05, "loss": 0.2015, "step": 237 }, { "epoch": 0.16290212183436004, "grad_norm": 6.6701812744140625, "learning_rate": 9.748460490048164e-05, "loss": 0.1061, "step": 238 }, { "epoch": 0.16358658453114305, "grad_norm": 7.871882915496826, "learning_rate": 9.744833220943954e-05, "loss": 0.1103, "step": 239 }, { "epoch": 0.16427104722792607, "grad_norm": 6.447455406188965, "learning_rate": 9.741180670257316e-05, "loss": 0.0805, "step": 240 }, { "epoch": 0.16495550992470911, "grad_norm": 8.00991439819336, "learning_rate": 9.737502857449894e-05, "loss": 0.062, "step": 241 }, { "epoch": 0.16563997262149213, "grad_norm": 7.861396789550781, "learning_rate": 9.733799802117936e-05, "loss": 0.195, "step": 242 }, { "epoch": 0.16632443531827515, "grad_norm": 11.979039192199707, "learning_rate": 9.730071523992189e-05, "loss": 0.2133, "step": 243 }, { "epoch": 0.16700889801505817, "grad_norm": 6.658477306365967, "learning_rate": 9.72631804293779e-05, "loss": 0.0547, "step": 244 }, { "epoch": 0.1676933607118412, "grad_norm": 5.327047348022461, "learning_rate": 9.722539378954166e-05, "loss": 0.0583, "step": 245 }, { "epoch": 0.16837782340862423, "grad_norm": 3.7219507694244385, "learning_rate": 9.718735552174924e-05, "loss": 0.0814, "step": 246 }, { "epoch": 0.16906228610540724, "grad_norm": 4.378869533538818, "learning_rate": 9.714906582867742e-05, "loss": 0.0669, "step": 247 }, { "epoch": 0.1697467488021903, "grad_norm": 4.225414276123047, "learning_rate": 9.711052491434266e-05, "loss": 0.0612, "step": 248 }, { "epoch": 0.1704312114989733, "grad_norm": 9.894330978393555, "learning_rate": 9.707173298409998e-05, "loss": 0.11, "step": 249 }, { "epoch": 0.17111567419575632, "grad_norm": 12.584293365478516, "learning_rate": 9.703269024464185e-05, "loss": 0.327, "step": 250 }, { "epoch": 0.17180013689253937, "grad_norm": 6.5966620445251465, "learning_rate": 9.699339690399717e-05, "loss": 0.1421, "step": 251 }, { "epoch": 0.17248459958932238, "grad_norm": 3.7039248943328857, "learning_rate": 9.695385317153002e-05, "loss": 0.2283, "step": 252 }, { "epoch": 0.1731690622861054, "grad_norm": 7.379708290100098, "learning_rate": 9.691405925793869e-05, "loss": 0.1258, "step": 253 }, { "epoch": 0.17385352498288842, "grad_norm": 6.9973554611206055, "learning_rate": 9.687401537525442e-05, "loss": 0.2022, "step": 254 }, { "epoch": 0.17453798767967146, "grad_norm": 7.060970306396484, "learning_rate": 9.683372173684047e-05, "loss": 0.2504, "step": 255 }, { "epoch": 0.17522245037645448, "grad_norm": 8.207862854003906, "learning_rate": 9.679317855739074e-05, "loss": 0.2002, "step": 256 }, { "epoch": 0.1759069130732375, "grad_norm": 5.57830286026001, "learning_rate": 9.675238605292876e-05, "loss": 0.1647, "step": 257 }, { "epoch": 0.17659137577002054, "grad_norm": 6.950638294219971, "learning_rate": 9.671134444080658e-05, "loss": 0.1531, "step": 258 }, { "epoch": 0.17727583846680356, "grad_norm": 10.646747589111328, "learning_rate": 9.667005393970351e-05, "loss": 0.2512, "step": 259 }, { "epoch": 0.17796030116358658, "grad_norm": 4.379300117492676, "learning_rate": 9.662851476962502e-05, "loss": 0.2591, "step": 260 }, { "epoch": 0.17864476386036962, "grad_norm": 3.725334644317627, "learning_rate": 9.658672715190152e-05, "loss": 0.1325, "step": 261 }, { "epoch": 0.17932922655715264, "grad_norm": 5.019538879394531, "learning_rate": 9.654469130918721e-05, "loss": 0.2968, "step": 262 }, { "epoch": 0.18001368925393565, "grad_norm": 5.770042896270752, "learning_rate": 9.650240746545893e-05, "loss": 0.1671, "step": 263 }, { "epoch": 0.1806981519507187, "grad_norm": 4.729129791259766, "learning_rate": 9.64598758460149e-05, "loss": 0.1455, "step": 264 }, { "epoch": 0.18138261464750172, "grad_norm": 8.625890731811523, "learning_rate": 9.641709667747352e-05, "loss": 0.1746, "step": 265 }, { "epoch": 0.18206707734428473, "grad_norm": 10.511664390563965, "learning_rate": 9.637407018777223e-05, "loss": 0.1579, "step": 266 }, { "epoch": 0.18275154004106775, "grad_norm": 5.0685296058654785, "learning_rate": 9.633079660616627e-05, "loss": 0.2056, "step": 267 }, { "epoch": 0.1834360027378508, "grad_norm": 4.9627227783203125, "learning_rate": 9.628727616322737e-05, "loss": 0.176, "step": 268 }, { "epoch": 0.1841204654346338, "grad_norm": 3.1265316009521484, "learning_rate": 9.624350909084265e-05, "loss": 0.1011, "step": 269 }, { "epoch": 0.18480492813141683, "grad_norm": 6.4602861404418945, "learning_rate": 9.619949562221336e-05, "loss": 0.1687, "step": 270 }, { "epoch": 0.18548939082819987, "grad_norm": 9.5740385055542, "learning_rate": 9.615523599185352e-05, "loss": 0.1607, "step": 271 }, { "epoch": 0.1861738535249829, "grad_norm": 8.600020408630371, "learning_rate": 9.611073043558882e-05, "loss": 0.2082, "step": 272 }, { "epoch": 0.1868583162217659, "grad_norm": 9.140114784240723, "learning_rate": 9.60659791905553e-05, "loss": 0.1734, "step": 273 }, { "epoch": 0.18754277891854895, "grad_norm": 5.175637722015381, "learning_rate": 9.602098249519804e-05, "loss": 0.1393, "step": 274 }, { "epoch": 0.18822724161533197, "grad_norm": 5.420800685882568, "learning_rate": 9.597574058926999e-05, "loss": 0.1491, "step": 275 }, { "epoch": 0.188911704312115, "grad_norm": 3.4305055141448975, "learning_rate": 9.593025371383064e-05, "loss": 0.0748, "step": 276 }, { "epoch": 0.189596167008898, "grad_norm": 11.01544189453125, "learning_rate": 9.588452211124467e-05, "loss": 0.2161, "step": 277 }, { "epoch": 0.19028062970568105, "grad_norm": 9.556683540344238, "learning_rate": 9.583854602518079e-05, "loss": 0.1777, "step": 278 }, { "epoch": 0.19096509240246407, "grad_norm": 4.741940021514893, "learning_rate": 9.579232570061036e-05, "loss": 0.1309, "step": 279 }, { "epoch": 0.19164955509924708, "grad_norm": 9.034591674804688, "learning_rate": 9.574586138380605e-05, "loss": 0.2269, "step": 280 }, { "epoch": 0.19233401779603013, "grad_norm": 9.210083961486816, "learning_rate": 9.569915332234068e-05, "loss": 0.149, "step": 281 }, { "epoch": 0.19301848049281314, "grad_norm": 6.298268795013428, "learning_rate": 9.565220176508572e-05, "loss": 0.1705, "step": 282 }, { "epoch": 0.19370294318959616, "grad_norm": 7.310756206512451, "learning_rate": 9.560500696221001e-05, "loss": 0.0719, "step": 283 }, { "epoch": 0.1943874058863792, "grad_norm": 10.808761596679688, "learning_rate": 9.555756916517859e-05, "loss": 0.1505, "step": 284 }, { "epoch": 0.19507186858316222, "grad_norm": 5.134876728057861, "learning_rate": 9.55098886267511e-05, "loss": 0.1511, "step": 285 }, { "epoch": 0.19575633127994524, "grad_norm": 10.443704605102539, "learning_rate": 9.546196560098062e-05, "loss": 0.2166, "step": 286 }, { "epoch": 0.19644079397672826, "grad_norm": 8.784150123596191, "learning_rate": 9.541380034321225e-05, "loss": 0.1403, "step": 287 }, { "epoch": 0.1971252566735113, "grad_norm": 6.043169021606445, "learning_rate": 9.536539311008178e-05, "loss": 0.0848, "step": 288 }, { "epoch": 0.19780971937029432, "grad_norm": 5.932885646820068, "learning_rate": 9.531674415951426e-05, "loss": 0.1213, "step": 289 }, { "epoch": 0.19849418206707733, "grad_norm": 5.305424213409424, "learning_rate": 9.526785375072271e-05, "loss": 0.1139, "step": 290 }, { "epoch": 0.19917864476386038, "grad_norm": 6.352259635925293, "learning_rate": 9.521872214420668e-05, "loss": 0.1073, "step": 291 }, { "epoch": 0.1998631074606434, "grad_norm": 4.961869716644287, "learning_rate": 9.516934960175085e-05, "loss": 0.0604, "step": 292 }, { "epoch": 0.2005475701574264, "grad_norm": 3.918891668319702, "learning_rate": 9.51197363864237e-05, "loss": 0.036, "step": 293 }, { "epoch": 0.20123203285420946, "grad_norm": 8.48746395111084, "learning_rate": 9.506988276257604e-05, "loss": 0.0901, "step": 294 }, { "epoch": 0.20191649555099248, "grad_norm": 6.941591262817383, "learning_rate": 9.50197889958397e-05, "loss": 0.1185, "step": 295 }, { "epoch": 0.2026009582477755, "grad_norm": 9.32638931274414, "learning_rate": 9.496945535312595e-05, "loss": 0.0996, "step": 296 }, { "epoch": 0.2032854209445585, "grad_norm": 3.447404384613037, "learning_rate": 9.491888210262425e-05, "loss": 0.0653, "step": 297 }, { "epoch": 0.20396988364134155, "grad_norm": 0.9872057437896729, "learning_rate": 9.486806951380071e-05, "loss": 0.0206, "step": 298 }, { "epoch": 0.20465434633812457, "grad_norm": 3.899648666381836, "learning_rate": 9.48170178573967e-05, "loss": 0.0686, "step": 299 }, { "epoch": 0.2053388090349076, "grad_norm": 4.332979679107666, "learning_rate": 9.47657274054274e-05, "loss": 0.0624, "step": 300 }, { "epoch": 0.20602327173169063, "grad_norm": 3.909208297729492, "learning_rate": 9.471419843118037e-05, "loss": 0.0629, "step": 301 }, { "epoch": 0.20670773442847365, "grad_norm": 7.688588619232178, "learning_rate": 9.4662431209214e-05, "loss": 0.1465, "step": 302 }, { "epoch": 0.20739219712525667, "grad_norm": 9.849617004394531, "learning_rate": 9.461042601535623e-05, "loss": 0.1975, "step": 303 }, { "epoch": 0.2080766598220397, "grad_norm": 11.084503173828125, "learning_rate": 9.455818312670287e-05, "loss": 0.4934, "step": 304 }, { "epoch": 0.20876112251882273, "grad_norm": 15.024002075195312, "learning_rate": 9.450570282161628e-05, "loss": 0.2254, "step": 305 }, { "epoch": 0.20944558521560575, "grad_norm": 24.196317672729492, "learning_rate": 9.44529853797238e-05, "loss": 0.3679, "step": 306 }, { "epoch": 0.21013004791238876, "grad_norm": 11.010361671447754, "learning_rate": 9.440003108191633e-05, "loss": 0.2525, "step": 307 }, { "epoch": 0.2108145106091718, "grad_norm": 7.085261821746826, "learning_rate": 9.434684021034674e-05, "loss": 0.1405, "step": 308 }, { "epoch": 0.21149897330595482, "grad_norm": 20.37203598022461, "learning_rate": 9.429341304842847e-05, "loss": 0.3387, "step": 309 }, { "epoch": 0.21218343600273784, "grad_norm": 13.335612297058105, "learning_rate": 9.423974988083395e-05, "loss": 0.2779, "step": 310 }, { "epoch": 0.21286789869952089, "grad_norm": 10.040803909301758, "learning_rate": 9.418585099349306e-05, "loss": 0.2638, "step": 311 }, { "epoch": 0.2135523613963039, "grad_norm": 10.201689720153809, "learning_rate": 9.413171667359175e-05, "loss": 0.3804, "step": 312 }, { "epoch": 0.21423682409308692, "grad_norm": 9.714926719665527, "learning_rate": 9.407734720957033e-05, "loss": 0.1542, "step": 313 }, { "epoch": 0.21492128678986996, "grad_norm": 6.409265518188477, "learning_rate": 9.402274289112203e-05, "loss": 0.1533, "step": 314 }, { "epoch": 0.21560574948665298, "grad_norm": 4.9183669090271, "learning_rate": 9.396790400919144e-05, "loss": 0.1713, "step": 315 }, { "epoch": 0.216290212183436, "grad_norm": 4.8765130043029785, "learning_rate": 9.391283085597298e-05, "loss": 0.1497, "step": 316 }, { "epoch": 0.21697467488021902, "grad_norm": 16.26625633239746, "learning_rate": 9.385752372490929e-05, "loss": 0.2835, "step": 317 }, { "epoch": 0.21765913757700206, "grad_norm": 5.757822036743164, "learning_rate": 9.380198291068971e-05, "loss": 0.0977, "step": 318 }, { "epoch": 0.21834360027378508, "grad_norm": 6.276159763336182, "learning_rate": 9.374620870924873e-05, "loss": 0.1954, "step": 319 }, { "epoch": 0.2190280629705681, "grad_norm": 10.392669677734375, "learning_rate": 9.369020141776435e-05, "loss": 0.2143, "step": 320 }, { "epoch": 0.21971252566735114, "grad_norm": 8.383529663085938, "learning_rate": 9.36339613346565e-05, "loss": 0.1434, "step": 321 }, { "epoch": 0.22039698836413416, "grad_norm": 6.541269779205322, "learning_rate": 9.357748875958554e-05, "loss": 0.1154, "step": 322 }, { "epoch": 0.22108145106091717, "grad_norm": 9.351251602172852, "learning_rate": 9.352078399345058e-05, "loss": 0.1998, "step": 323 }, { "epoch": 0.22176591375770022, "grad_norm": 9.786504745483398, "learning_rate": 9.346384733838787e-05, "loss": 0.2387, "step": 324 }, { "epoch": 0.22245037645448323, "grad_norm": 5.136743545532227, "learning_rate": 9.340667909776926e-05, "loss": 0.1026, "step": 325 }, { "epoch": 0.22313483915126625, "grad_norm": 6.150352954864502, "learning_rate": 9.33492795762005e-05, "loss": 0.1903, "step": 326 }, { "epoch": 0.22381930184804927, "grad_norm": 7.919428825378418, "learning_rate": 9.329164907951972e-05, "loss": 0.1781, "step": 327 }, { "epoch": 0.2245037645448323, "grad_norm": 5.007681369781494, "learning_rate": 9.323378791479564e-05, "loss": 0.1414, "step": 328 }, { "epoch": 0.22518822724161533, "grad_norm": 5.823683738708496, "learning_rate": 9.317569639032616e-05, "loss": 0.1639, "step": 329 }, { "epoch": 0.22587268993839835, "grad_norm": 7.860928535461426, "learning_rate": 9.311737481563648e-05, "loss": 0.1843, "step": 330 }, { "epoch": 0.2265571526351814, "grad_norm": 2.81413197517395, "learning_rate": 9.305882350147764e-05, "loss": 0.0749, "step": 331 }, { "epoch": 0.2272416153319644, "grad_norm": 5.665268421173096, "learning_rate": 9.30000427598247e-05, "loss": 0.104, "step": 332 }, { "epoch": 0.22792607802874743, "grad_norm": 8.203513145446777, "learning_rate": 9.294103290387525e-05, "loss": 0.104, "step": 333 }, { "epoch": 0.22861054072553047, "grad_norm": 5.774232387542725, "learning_rate": 9.288179424804764e-05, "loss": 0.1401, "step": 334 }, { "epoch": 0.2292950034223135, "grad_norm": 7.830986499786377, "learning_rate": 9.282232710797927e-05, "loss": 0.1864, "step": 335 }, { "epoch": 0.2299794661190965, "grad_norm": 5.251026153564453, "learning_rate": 9.276263180052497e-05, "loss": 0.1533, "step": 336 }, { "epoch": 0.23066392881587952, "grad_norm": 2.1413843631744385, "learning_rate": 9.270270864375536e-05, "loss": 0.0665, "step": 337 }, { "epoch": 0.23134839151266257, "grad_norm": 4.4466071128845215, "learning_rate": 9.264255795695505e-05, "loss": 0.131, "step": 338 }, { "epoch": 0.23203285420944558, "grad_norm": 9.100303649902344, "learning_rate": 9.258218006062094e-05, "loss": 0.1285, "step": 339 }, { "epoch": 0.2327173169062286, "grad_norm": 3.8086304664611816, "learning_rate": 9.252157527646061e-05, "loss": 0.074, "step": 340 }, { "epoch": 0.23340177960301164, "grad_norm": 12.530363082885742, "learning_rate": 9.246074392739057e-05, "loss": 0.2704, "step": 341 }, { "epoch": 0.23408624229979466, "grad_norm": 9.272818565368652, "learning_rate": 9.239968633753449e-05, "loss": 0.1317, "step": 342 }, { "epoch": 0.23477070499657768, "grad_norm": 6.09656286239624, "learning_rate": 9.233840283222147e-05, "loss": 0.054, "step": 343 }, { "epoch": 0.23545516769336072, "grad_norm": 3.7253122329711914, "learning_rate": 9.22768937379844e-05, "loss": 0.1026, "step": 344 }, { "epoch": 0.23613963039014374, "grad_norm": 3.813018321990967, "learning_rate": 9.221515938255816e-05, "loss": 0.0628, "step": 345 }, { "epoch": 0.23682409308692676, "grad_norm": 2.965759038925171, "learning_rate": 9.21532000948778e-05, "loss": 0.0536, "step": 346 }, { "epoch": 0.2375085557837098, "grad_norm": 4.392210006713867, "learning_rate": 9.209101620507695e-05, "loss": 0.0558, "step": 347 }, { "epoch": 0.23819301848049282, "grad_norm": 7.318861484527588, "learning_rate": 9.202860804448592e-05, "loss": 0.1252, "step": 348 }, { "epoch": 0.23887748117727584, "grad_norm": 2.9982407093048096, "learning_rate": 9.196597594563e-05, "loss": 0.0625, "step": 349 }, { "epoch": 0.23956194387405885, "grad_norm": 6.909542083740234, "learning_rate": 9.190312024222772e-05, "loss": 0.0897, "step": 350 }, { "epoch": 0.2402464065708419, "grad_norm": 8.191619873046875, "learning_rate": 9.184004126918891e-05, "loss": 0.1185, "step": 351 }, { "epoch": 0.24093086926762491, "grad_norm": 22.366313934326172, "learning_rate": 9.177673936261318e-05, "loss": 0.3351, "step": 352 }, { "epoch": 0.24161533196440793, "grad_norm": 20.09576988220215, "learning_rate": 9.171321485978786e-05, "loss": 0.1632, "step": 353 }, { "epoch": 0.24229979466119098, "grad_norm": 7.235936164855957, "learning_rate": 9.16494680991864e-05, "loss": 0.1609, "step": 354 }, { "epoch": 0.242984257357974, "grad_norm": 7.256474018096924, "learning_rate": 9.158549942046647e-05, "loss": 0.1788, "step": 355 }, { "epoch": 0.243668720054757, "grad_norm": 10.954352378845215, "learning_rate": 9.152130916446816e-05, "loss": 0.2973, "step": 356 }, { "epoch": 0.24435318275154005, "grad_norm": 9.992124557495117, "learning_rate": 9.14568976732122e-05, "loss": 0.2381, "step": 357 }, { "epoch": 0.24503764544832307, "grad_norm": 10.973164558410645, "learning_rate": 9.139226528989806e-05, "loss": 0.1372, "step": 358 }, { "epoch": 0.2457221081451061, "grad_norm": 11.364020347595215, "learning_rate": 9.132741235890225e-05, "loss": 0.2209, "step": 359 }, { "epoch": 0.2464065708418891, "grad_norm": 7.4936933517456055, "learning_rate": 9.126233922577636e-05, "loss": 0.2252, "step": 360 }, { "epoch": 0.24709103353867215, "grad_norm": 6.102821350097656, "learning_rate": 9.119704623724528e-05, "loss": 0.1561, "step": 361 }, { "epoch": 0.24777549623545517, "grad_norm": 11.34000015258789, "learning_rate": 9.113153374120533e-05, "loss": 0.1367, "step": 362 }, { "epoch": 0.24845995893223818, "grad_norm": 4.869993686676025, "learning_rate": 9.106580208672245e-05, "loss": 0.2181, "step": 363 }, { "epoch": 0.24914442162902123, "grad_norm": 8.996235847473145, "learning_rate": 9.099985162403028e-05, "loss": 0.2325, "step": 364 }, { "epoch": 0.24982888432580425, "grad_norm": 5.288308143615723, "learning_rate": 9.093368270452832e-05, "loss": 0.1276, "step": 365 }, { "epoch": 0.25051334702258726, "grad_norm": 5.552043437957764, "learning_rate": 9.086729568078005e-05, "loss": 0.0577, "step": 366 }, { "epoch": 0.2511978097193703, "grad_norm": 5.532169818878174, "learning_rate": 9.080069090651113e-05, "loss": 0.1753, "step": 367 }, { "epoch": 0.2518822724161533, "grad_norm": 10.411441802978516, "learning_rate": 9.073386873660734e-05, "loss": 0.2421, "step": 368 }, { "epoch": 0.25256673511293637, "grad_norm": 5.41740608215332, "learning_rate": 9.06668295271129e-05, "loss": 0.2285, "step": 369 }, { "epoch": 0.2532511978097194, "grad_norm": 8.328063011169434, "learning_rate": 9.059957363522835e-05, "loss": 0.2479, "step": 370 }, { "epoch": 0.2539356605065024, "grad_norm": 9.708357810974121, "learning_rate": 9.053210141930888e-05, "loss": 0.1788, "step": 371 }, { "epoch": 0.2546201232032854, "grad_norm": 6.649996757507324, "learning_rate": 9.046441323886226e-05, "loss": 0.1696, "step": 372 }, { "epoch": 0.25530458590006844, "grad_norm": 4.31477689743042, "learning_rate": 9.039650945454691e-05, "loss": 0.1443, "step": 373 }, { "epoch": 0.25598904859685145, "grad_norm": 3.574660539627075, "learning_rate": 9.032839042817016e-05, "loss": 0.1269, "step": 374 }, { "epoch": 0.25667351129363447, "grad_norm": 10.2943754196167, "learning_rate": 9.026005652268609e-05, "loss": 0.1874, "step": 375 }, { "epoch": 0.25735797399041754, "grad_norm": 4.354328155517578, "learning_rate": 9.019150810219376e-05, "loss": 0.1312, "step": 376 }, { "epoch": 0.25804243668720056, "grad_norm": 5.1734466552734375, "learning_rate": 9.012274553193521e-05, "loss": 0.167, "step": 377 }, { "epoch": 0.2587268993839836, "grad_norm": 7.708397388458252, "learning_rate": 9.005376917829351e-05, "loss": 0.2728, "step": 378 }, { "epoch": 0.2594113620807666, "grad_norm": 9.843606948852539, "learning_rate": 8.998457940879083e-05, "loss": 0.2208, "step": 379 }, { "epoch": 0.2600958247775496, "grad_norm": 3.4372477531433105, "learning_rate": 8.991517659208645e-05, "loss": 0.1214, "step": 380 }, { "epoch": 0.26078028747433263, "grad_norm": 11.82792854309082, "learning_rate": 8.984556109797484e-05, "loss": 0.3262, "step": 381 }, { "epoch": 0.2614647501711157, "grad_norm": 7.460376739501953, "learning_rate": 8.977573329738364e-05, "loss": 0.2047, "step": 382 }, { "epoch": 0.2621492128678987, "grad_norm": 5.150585174560547, "learning_rate": 8.970569356237176e-05, "loss": 0.1749, "step": 383 }, { "epoch": 0.26283367556468173, "grad_norm": 6.267769813537598, "learning_rate": 8.963544226612726e-05, "loss": 0.128, "step": 384 }, { "epoch": 0.26351813826146475, "grad_norm": 3.8863582611083984, "learning_rate": 8.956497978296552e-05, "loss": 0.1071, "step": 385 }, { "epoch": 0.26420260095824777, "grad_norm": 10.326163291931152, "learning_rate": 8.949430648832716e-05, "loss": 0.1837, "step": 386 }, { "epoch": 0.2648870636550308, "grad_norm": 5.120767593383789, "learning_rate": 8.9423422758776e-05, "loss": 0.1217, "step": 387 }, { "epoch": 0.2655715263518138, "grad_norm": 2.6918227672576904, "learning_rate": 8.935232897199721e-05, "loss": 0.0812, "step": 388 }, { "epoch": 0.2662559890485969, "grad_norm": 7.4609856605529785, "learning_rate": 8.928102550679508e-05, "loss": 0.1729, "step": 389 }, { "epoch": 0.2669404517453799, "grad_norm": 4.092256546020508, "learning_rate": 8.92095127430912e-05, "loss": 0.1063, "step": 390 }, { "epoch": 0.2676249144421629, "grad_norm": 5.677734851837158, "learning_rate": 8.913779106192229e-05, "loss": 0.0745, "step": 391 }, { "epoch": 0.2683093771389459, "grad_norm": 4.711282730102539, "learning_rate": 8.90658608454383e-05, "loss": 0.0737, "step": 392 }, { "epoch": 0.26899383983572894, "grad_norm": 5.147585868835449, "learning_rate": 8.899372247690023e-05, "loss": 0.1243, "step": 393 }, { "epoch": 0.26967830253251196, "grad_norm": 3.9375510215759277, "learning_rate": 8.892137634067824e-05, "loss": 0.1464, "step": 394 }, { "epoch": 0.270362765229295, "grad_norm": 8.383220672607422, "learning_rate": 8.884882282224945e-05, "loss": 0.1619, "step": 395 }, { "epoch": 0.27104722792607805, "grad_norm": 1.7835997343063354, "learning_rate": 8.877606230819599e-05, "loss": 0.0473, "step": 396 }, { "epoch": 0.27173169062286107, "grad_norm": 5.386960029602051, "learning_rate": 8.870309518620295e-05, "loss": 0.0673, "step": 397 }, { "epoch": 0.2724161533196441, "grad_norm": 1.7976478338241577, "learning_rate": 8.862992184505622e-05, "loss": 0.0164, "step": 398 }, { "epoch": 0.2731006160164271, "grad_norm": 0.7090882658958435, "learning_rate": 8.855654267464049e-05, "loss": 0.0046, "step": 399 }, { "epoch": 0.2737850787132101, "grad_norm": 9.480133056640625, "learning_rate": 8.848295806593718e-05, "loss": 0.0624, "step": 400 }, { "epoch": 0.27446954140999313, "grad_norm": 8.015402793884277, "learning_rate": 8.84091684110223e-05, "loss": 0.2206, "step": 401 }, { "epoch": 0.2751540041067762, "grad_norm": 9.978604316711426, "learning_rate": 8.833517410306442e-05, "loss": 0.0857, "step": 402 }, { "epoch": 0.2758384668035592, "grad_norm": 2.8666634559631348, "learning_rate": 8.826097553632254e-05, "loss": 0.1151, "step": 403 }, { "epoch": 0.27652292950034224, "grad_norm": 8.964261054992676, "learning_rate": 8.818657310614398e-05, "loss": 0.2188, "step": 404 }, { "epoch": 0.27720739219712526, "grad_norm": 8.314974784851074, "learning_rate": 8.81119672089623e-05, "loss": 0.2193, "step": 405 }, { "epoch": 0.2778918548939083, "grad_norm": 11.173494338989258, "learning_rate": 8.803715824229525e-05, "loss": 0.3094, "step": 406 }, { "epoch": 0.2785763175906913, "grad_norm": 5.407591819763184, "learning_rate": 8.796214660474247e-05, "loss": 0.147, "step": 407 }, { "epoch": 0.2792607802874743, "grad_norm": 10.84803295135498, "learning_rate": 8.788693269598353e-05, "loss": 0.1591, "step": 408 }, { "epoch": 0.2799452429842574, "grad_norm": 7.194460868835449, "learning_rate": 8.781151691677579e-05, "loss": 0.2252, "step": 409 }, { "epoch": 0.2806297056810404, "grad_norm": 8.20419979095459, "learning_rate": 8.773589966895213e-05, "loss": 0.1397, "step": 410 }, { "epoch": 0.2813141683778234, "grad_norm": 4.212666988372803, "learning_rate": 8.766008135541897e-05, "loss": 0.0999, "step": 411 }, { "epoch": 0.28199863107460643, "grad_norm": 6.373012542724609, "learning_rate": 8.758406238015404e-05, "loss": 0.1567, "step": 412 }, { "epoch": 0.28268309377138945, "grad_norm": 5.8929667472839355, "learning_rate": 8.750784314820424e-05, "loss": 0.1558, "step": 413 }, { "epoch": 0.28336755646817247, "grad_norm": 7.668067932128906, "learning_rate": 8.74314240656835e-05, "loss": 0.3548, "step": 414 }, { "epoch": 0.2840520191649555, "grad_norm": 7.340702533721924, "learning_rate": 8.735480553977055e-05, "loss": 0.1751, "step": 415 }, { "epoch": 0.28473648186173856, "grad_norm": 8.723352432250977, "learning_rate": 8.727798797870687e-05, "loss": 0.2283, "step": 416 }, { "epoch": 0.28542094455852157, "grad_norm": 3.5388987064361572, "learning_rate": 8.72009717917944e-05, "loss": 0.1659, "step": 417 }, { "epoch": 0.2861054072553046, "grad_norm": 4.754316329956055, "learning_rate": 8.712375738939343e-05, "loss": 0.084, "step": 418 }, { "epoch": 0.2867898699520876, "grad_norm": 9.748307228088379, "learning_rate": 8.704634518292034e-05, "loss": 0.1344, "step": 419 }, { "epoch": 0.2874743326488706, "grad_norm": 5.002340793609619, "learning_rate": 8.696873558484553e-05, "loss": 0.1555, "step": 420 }, { "epoch": 0.28815879534565364, "grad_norm": 4.200029373168945, "learning_rate": 8.689092900869112e-05, "loss": 0.0907, "step": 421 }, { "epoch": 0.2888432580424367, "grad_norm": 8.675992965698242, "learning_rate": 8.681292586902871e-05, "loss": 0.284, "step": 422 }, { "epoch": 0.28952772073921973, "grad_norm": 8.845063209533691, "learning_rate": 8.673472658147734e-05, "loss": 0.1543, "step": 423 }, { "epoch": 0.29021218343600275, "grad_norm": 10.090123176574707, "learning_rate": 8.665633156270111e-05, "loss": 0.1893, "step": 424 }, { "epoch": 0.29089664613278576, "grad_norm": 4.0277180671691895, "learning_rate": 8.657774123040704e-05, "loss": 0.07, "step": 425 }, { "epoch": 0.2915811088295688, "grad_norm": 4.663268089294434, "learning_rate": 8.649895600334284e-05, "loss": 0.1323, "step": 426 }, { "epoch": 0.2922655715263518, "grad_norm": 19.89546775817871, "learning_rate": 8.641997630129461e-05, "loss": 0.2509, "step": 427 }, { "epoch": 0.2929500342231348, "grad_norm": 10.179508209228516, "learning_rate": 8.634080254508473e-05, "loss": 0.1475, "step": 428 }, { "epoch": 0.2936344969199179, "grad_norm": 14.65198802947998, "learning_rate": 8.626143515656948e-05, "loss": 0.1908, "step": 429 }, { "epoch": 0.2943189596167009, "grad_norm": 6.354902267456055, "learning_rate": 8.618187455863692e-05, "loss": 0.1117, "step": 430 }, { "epoch": 0.2950034223134839, "grad_norm": 6.753427505493164, "learning_rate": 8.610212117520452e-05, "loss": 0.1767, "step": 431 }, { "epoch": 0.29568788501026694, "grad_norm": 8.440505027770996, "learning_rate": 8.602217543121702e-05, "loss": 0.2615, "step": 432 }, { "epoch": 0.29637234770704995, "grad_norm": 6.9934539794921875, "learning_rate": 8.594203775264403e-05, "loss": 0.1116, "step": 433 }, { "epoch": 0.29705681040383297, "grad_norm": 6.052974700927734, "learning_rate": 8.586170856647785e-05, "loss": 0.1432, "step": 434 }, { "epoch": 0.29774127310061604, "grad_norm": 7.290167808532715, "learning_rate": 8.578118830073125e-05, "loss": 0.1923, "step": 435 }, { "epoch": 0.29842573579739906, "grad_norm": 4.989604473114014, "learning_rate": 8.570047738443502e-05, "loss": 0.0899, "step": 436 }, { "epoch": 0.2991101984941821, "grad_norm": 3.2941253185272217, "learning_rate": 8.561957624763584e-05, "loss": 0.0475, "step": 437 }, { "epoch": 0.2997946611909651, "grad_norm": 5.998294353485107, "learning_rate": 8.553848532139388e-05, "loss": 0.0787, "step": 438 }, { "epoch": 0.3004791238877481, "grad_norm": 4.609109401702881, "learning_rate": 8.545720503778061e-05, "loss": 0.0683, "step": 439 }, { "epoch": 0.30116358658453113, "grad_norm": 1.4998338222503662, "learning_rate": 8.537573582987642e-05, "loss": 0.0505, "step": 440 }, { "epoch": 0.30184804928131415, "grad_norm": 2.164229393005371, "learning_rate": 8.529407813176828e-05, "loss": 0.0365, "step": 441 }, { "epoch": 0.3025325119780972, "grad_norm": 4.486866474151611, "learning_rate": 8.521223237854758e-05, "loss": 0.0601, "step": 442 }, { "epoch": 0.30321697467488024, "grad_norm": 2.9916493892669678, "learning_rate": 8.513019900630763e-05, "loss": 0.0215, "step": 443 }, { "epoch": 0.30390143737166325, "grad_norm": 4.101856231689453, "learning_rate": 8.504797845214145e-05, "loss": 0.0646, "step": 444 }, { "epoch": 0.30458590006844627, "grad_norm": 5.063677787780762, "learning_rate": 8.496557115413942e-05, "loss": 0.0588, "step": 445 }, { "epoch": 0.3052703627652293, "grad_norm": 0.1926337480545044, "learning_rate": 8.48829775513869e-05, "loss": 0.0019, "step": 446 }, { "epoch": 0.3059548254620123, "grad_norm": 10.875621795654297, "learning_rate": 8.480019808396194e-05, "loss": 0.1779, "step": 447 }, { "epoch": 0.3066392881587953, "grad_norm": 6.47944450378418, "learning_rate": 8.471723319293295e-05, "loss": 0.1211, "step": 448 }, { "epoch": 0.3073237508555784, "grad_norm": 3.386258363723755, "learning_rate": 8.463408332035629e-05, "loss": 0.0429, "step": 449 }, { "epoch": 0.3080082135523614, "grad_norm": 4.379806041717529, "learning_rate": 8.455074890927392e-05, "loss": 0.089, "step": 450 }, { "epoch": 0.3086926762491444, "grad_norm": 17.432636260986328, "learning_rate": 8.446723040371114e-05, "loss": 0.3679, "step": 451 }, { "epoch": 0.30937713894592744, "grad_norm": 27.23634147644043, "learning_rate": 8.438352824867409e-05, "loss": 0.3321, "step": 452 }, { "epoch": 0.31006160164271046, "grad_norm": 21.83364486694336, "learning_rate": 8.429964289014742e-05, "loss": 0.2993, "step": 453 }, { "epoch": 0.3107460643394935, "grad_norm": 12.754165649414062, "learning_rate": 8.421557477509202e-05, "loss": 0.2127, "step": 454 }, { "epoch": 0.31143052703627655, "grad_norm": 10.172953605651855, "learning_rate": 8.413132435144244e-05, "loss": 0.2041, "step": 455 }, { "epoch": 0.31211498973305957, "grad_norm": 5.436285972595215, "learning_rate": 8.404689206810469e-05, "loss": 0.1615, "step": 456 }, { "epoch": 0.3127994524298426, "grad_norm": 9.97593879699707, "learning_rate": 8.396227837495374e-05, "loss": 0.1738, "step": 457 }, { "epoch": 0.3134839151266256, "grad_norm": 11.709485054016113, "learning_rate": 8.387748372283116e-05, "loss": 0.291, "step": 458 }, { "epoch": 0.3141683778234086, "grad_norm": 7.218990802764893, "learning_rate": 8.37925085635427e-05, "loss": 0.1307, "step": 459 }, { "epoch": 0.31485284052019163, "grad_norm": 7.856277942657471, "learning_rate": 8.370735334985594e-05, "loss": 0.1498, "step": 460 }, { "epoch": 0.31553730321697465, "grad_norm": 4.054375171661377, "learning_rate": 8.362201853549777e-05, "loss": 0.0851, "step": 461 }, { "epoch": 0.3162217659137577, "grad_norm": 10.466679573059082, "learning_rate": 8.353650457515205e-05, "loss": 0.2075, "step": 462 }, { "epoch": 0.31690622861054074, "grad_norm": 7.8385820388793945, "learning_rate": 8.345081192445723e-05, "loss": 0.2198, "step": 463 }, { "epoch": 0.31759069130732376, "grad_norm": 13.153767585754395, "learning_rate": 8.336494104000377e-05, "loss": 0.2614, "step": 464 }, { "epoch": 0.3182751540041068, "grad_norm": 6.044870853424072, "learning_rate": 8.327889237933189e-05, "loss": 0.1907, "step": 465 }, { "epoch": 0.3189596167008898, "grad_norm": 7.637570858001709, "learning_rate": 8.319266640092897e-05, "loss": 0.2104, "step": 466 }, { "epoch": 0.3196440793976728, "grad_norm": 15.295083045959473, "learning_rate": 8.310626356422723e-05, "loss": 0.3622, "step": 467 }, { "epoch": 0.3203285420944558, "grad_norm": 8.386652946472168, "learning_rate": 8.30196843296012e-05, "loss": 0.2304, "step": 468 }, { "epoch": 0.3210130047912389, "grad_norm": 7.470604419708252, "learning_rate": 8.293292915836537e-05, "loss": 0.1586, "step": 469 }, { "epoch": 0.3216974674880219, "grad_norm": 10.790739059448242, "learning_rate": 8.284599851277153e-05, "loss": 0.2751, "step": 470 }, { "epoch": 0.32238193018480493, "grad_norm": 7.501530647277832, "learning_rate": 8.275889285600656e-05, "loss": 0.1502, "step": 471 }, { "epoch": 0.32306639288158795, "grad_norm": 4.665273666381836, "learning_rate": 8.267161265218977e-05, "loss": 0.1231, "step": 472 }, { "epoch": 0.32375085557837097, "grad_norm": 5.489170551300049, "learning_rate": 8.258415836637055e-05, "loss": 0.1841, "step": 473 }, { "epoch": 0.324435318275154, "grad_norm": 4.910449028015137, "learning_rate": 8.249653046452578e-05, "loss": 0.1715, "step": 474 }, { "epoch": 0.32511978097193706, "grad_norm": 8.837661743164062, "learning_rate": 8.24087294135575e-05, "loss": 0.2364, "step": 475 }, { "epoch": 0.3258042436687201, "grad_norm": 4.649965286254883, "learning_rate": 8.232075568129019e-05, "loss": 0.1836, "step": 476 }, { "epoch": 0.3264887063655031, "grad_norm": 4.730966091156006, "learning_rate": 8.223260973646856e-05, "loss": 0.1101, "step": 477 }, { "epoch": 0.3271731690622861, "grad_norm": 4.1306915283203125, "learning_rate": 8.21442920487548e-05, "loss": 0.1003, "step": 478 }, { "epoch": 0.3278576317590691, "grad_norm": 9.328051567077637, "learning_rate": 8.205580308872624e-05, "loss": 0.2109, "step": 479 }, { "epoch": 0.32854209445585214, "grad_norm": 2.9510762691497803, "learning_rate": 8.196714332787281e-05, "loss": 0.1642, "step": 480 }, { "epoch": 0.32922655715263516, "grad_norm": 6.893701076507568, "learning_rate": 8.187831323859445e-05, "loss": 0.2378, "step": 481 }, { "epoch": 0.32991101984941823, "grad_norm": 5.594333648681641, "learning_rate": 8.178931329419867e-05, "loss": 0.0913, "step": 482 }, { "epoch": 0.33059548254620125, "grad_norm": 4.886159896850586, "learning_rate": 8.170014396889802e-05, "loss": 0.0933, "step": 483 }, { "epoch": 0.33127994524298426, "grad_norm": 7.292411804199219, "learning_rate": 8.161080573780756e-05, "loss": 0.1664, "step": 484 }, { "epoch": 0.3319644079397673, "grad_norm": 6.235716819763184, "learning_rate": 8.152129907694228e-05, "loss": 0.1135, "step": 485 }, { "epoch": 0.3326488706365503, "grad_norm": 5.505281448364258, "learning_rate": 8.143162446321465e-05, "loss": 0.1084, "step": 486 }, { "epoch": 0.3333333333333333, "grad_norm": 4.087010860443115, "learning_rate": 8.134178237443198e-05, "loss": 0.094, "step": 487 }, { "epoch": 0.33401779603011633, "grad_norm": 2.3706865310668945, "learning_rate": 8.125177328929399e-05, "loss": 0.0362, "step": 488 }, { "epoch": 0.3347022587268994, "grad_norm": 2.32283091545105, "learning_rate": 8.116159768739013e-05, "loss": 0.0221, "step": 489 }, { "epoch": 0.3353867214236824, "grad_norm": 4.263980388641357, "learning_rate": 8.107125604919717e-05, "loss": 0.0632, "step": 490 }, { "epoch": 0.33607118412046544, "grad_norm": 5.4305572509765625, "learning_rate": 8.098074885607645e-05, "loss": 0.105, "step": 491 }, { "epoch": 0.33675564681724846, "grad_norm": 5.506404399871826, "learning_rate": 8.089007659027153e-05, "loss": 0.0782, "step": 492 }, { "epoch": 0.33744010951403147, "grad_norm": 6.501095294952393, "learning_rate": 8.079923973490545e-05, "loss": 0.2221, "step": 493 }, { "epoch": 0.3381245722108145, "grad_norm": 2.5577752590179443, "learning_rate": 8.070823877397826e-05, "loss": 0.0173, "step": 494 }, { "epoch": 0.33880903490759756, "grad_norm": 6.6881866455078125, "learning_rate": 8.06170741923644e-05, "loss": 0.2308, "step": 495 }, { "epoch": 0.3394934976043806, "grad_norm": 4.1703667640686035, "learning_rate": 8.052574647581009e-05, "loss": 0.1529, "step": 496 }, { "epoch": 0.3401779603011636, "grad_norm": 3.4506235122680664, "learning_rate": 8.043425611093077e-05, "loss": 0.0888, "step": 497 }, { "epoch": 0.3408624229979466, "grad_norm": 2.673396348953247, "learning_rate": 8.034260358520856e-05, "loss": 0.0307, "step": 498 }, { "epoch": 0.34154688569472963, "grad_norm": 7.297218322753906, "learning_rate": 8.025078938698957e-05, "loss": 0.1033, "step": 499 }, { "epoch": 0.34223134839151265, "grad_norm": 5.094481468200684, "learning_rate": 8.015881400548135e-05, "loss": 0.0522, "step": 500 }, { "epoch": 0.34291581108829566, "grad_norm": 8.042001724243164, "learning_rate": 8.006667793075025e-05, "loss": 0.1013, "step": 501 }, { "epoch": 0.34360027378507874, "grad_norm": 10.794388771057129, "learning_rate": 7.997438165371887e-05, "loss": 0.1566, "step": 502 }, { "epoch": 0.34428473648186175, "grad_norm": 7.1532392501831055, "learning_rate": 7.988192566616337e-05, "loss": 0.0814, "step": 503 }, { "epoch": 0.34496919917864477, "grad_norm": 12.906824111938477, "learning_rate": 7.978931046071092e-05, "loss": 0.1268, "step": 504 }, { "epoch": 0.3456536618754278, "grad_norm": 9.681212425231934, "learning_rate": 7.969653653083702e-05, "loss": 0.1055, "step": 505 }, { "epoch": 0.3463381245722108, "grad_norm": 3.9869563579559326, "learning_rate": 7.960360437086286e-05, "loss": 0.1422, "step": 506 }, { "epoch": 0.3470225872689938, "grad_norm": 5.587221145629883, "learning_rate": 7.951051447595278e-05, "loss": 0.0926, "step": 507 }, { "epoch": 0.34770704996577684, "grad_norm": 3.656465530395508, "learning_rate": 7.941726734211152e-05, "loss": 0.146, "step": 508 }, { "epoch": 0.3483915126625599, "grad_norm": 5.440763473510742, "learning_rate": 7.932386346618167e-05, "loss": 0.1168, "step": 509 }, { "epoch": 0.3490759753593429, "grad_norm": 6.135490417480469, "learning_rate": 7.923030334584092e-05, "loss": 0.112, "step": 510 }, { "epoch": 0.34976043805612594, "grad_norm": 6.686113357543945, "learning_rate": 7.913658747959951e-05, "loss": 0.1462, "step": 511 }, { "epoch": 0.35044490075290896, "grad_norm": 6.898252010345459, "learning_rate": 7.904271636679753e-05, "loss": 0.0787, "step": 512 }, { "epoch": 0.351129363449692, "grad_norm": 12.427412986755371, "learning_rate": 7.894869050760225e-05, "loss": 0.2214, "step": 513 }, { "epoch": 0.351813826146475, "grad_norm": 6.402753829956055, "learning_rate": 7.885451040300544e-05, "loss": 0.0809, "step": 514 }, { "epoch": 0.35249828884325807, "grad_norm": 8.768619537353516, "learning_rate": 7.876017655482077e-05, "loss": 0.253, "step": 515 }, { "epoch": 0.3531827515400411, "grad_norm": 6.811859607696533, "learning_rate": 7.866568946568107e-05, "loss": 0.1486, "step": 516 }, { "epoch": 0.3538672142368241, "grad_norm": 11.727694511413574, "learning_rate": 7.857104963903567e-05, "loss": 0.2448, "step": 517 }, { "epoch": 0.3545516769336071, "grad_norm": 5.048339366912842, "learning_rate": 7.847625757914773e-05, "loss": 0.1434, "step": 518 }, { "epoch": 0.35523613963039014, "grad_norm": 4.049713134765625, "learning_rate": 7.838131379109155e-05, "loss": 0.0861, "step": 519 }, { "epoch": 0.35592060232717315, "grad_norm": 4.734728813171387, "learning_rate": 7.828621878074985e-05, "loss": 0.107, "step": 520 }, { "epoch": 0.35660506502395617, "grad_norm": 3.0403077602386475, "learning_rate": 7.819097305481112e-05, "loss": 0.1679, "step": 521 }, { "epoch": 0.35728952772073924, "grad_norm": 7.152106285095215, "learning_rate": 7.80955771207669e-05, "loss": 0.1535, "step": 522 }, { "epoch": 0.35797399041752226, "grad_norm": 10.22497272491455, "learning_rate": 7.800003148690903e-05, "loss": 0.2093, "step": 523 }, { "epoch": 0.3586584531143053, "grad_norm": 8.764464378356934, "learning_rate": 7.790433666232706e-05, "loss": 0.1503, "step": 524 }, { "epoch": 0.3593429158110883, "grad_norm": 5.311343193054199, "learning_rate": 7.780849315690539e-05, "loss": 0.1369, "step": 525 }, { "epoch": 0.3600273785078713, "grad_norm": 2.448129653930664, "learning_rate": 7.771250148132068e-05, "loss": 0.0769, "step": 526 }, { "epoch": 0.3607118412046543, "grad_norm": 5.760735034942627, "learning_rate": 7.761636214703905e-05, "loss": 0.1368, "step": 527 }, { "epoch": 0.3613963039014374, "grad_norm": 13.602130889892578, "learning_rate": 7.752007566631337e-05, "loss": 0.2485, "step": 528 }, { "epoch": 0.3620807665982204, "grad_norm": 2.9826390743255615, "learning_rate": 7.742364255218058e-05, "loss": 0.0795, "step": 529 }, { "epoch": 0.36276522929500343, "grad_norm": 6.516305923461914, "learning_rate": 7.732706331845887e-05, "loss": 0.135, "step": 530 }, { "epoch": 0.36344969199178645, "grad_norm": 7.035853385925293, "learning_rate": 7.723033847974504e-05, "loss": 0.1681, "step": 531 }, { "epoch": 0.36413415468856947, "grad_norm": 6.953193187713623, "learning_rate": 7.713346855141165e-05, "loss": 0.1331, "step": 532 }, { "epoch": 0.3648186173853525, "grad_norm": 5.551963806152344, "learning_rate": 7.703645404960438e-05, "loss": 0.1458, "step": 533 }, { "epoch": 0.3655030800821355, "grad_norm": 7.135836601257324, "learning_rate": 7.69392954912392e-05, "loss": 0.2048, "step": 534 }, { "epoch": 0.3661875427789186, "grad_norm": 6.385347366333008, "learning_rate": 7.684199339399967e-05, "loss": 0.1743, "step": 535 }, { "epoch": 0.3668720054757016, "grad_norm": 8.454021453857422, "learning_rate": 7.674454827633413e-05, "loss": 0.2083, "step": 536 }, { "epoch": 0.3675564681724846, "grad_norm": 4.653712272644043, "learning_rate": 7.6646960657453e-05, "loss": 0.0734, "step": 537 }, { "epoch": 0.3682409308692676, "grad_norm": 9.211790084838867, "learning_rate": 7.654923105732597e-05, "loss": 0.1648, "step": 538 }, { "epoch": 0.36892539356605064, "grad_norm": 4.986487865447998, "learning_rate": 7.645135999667921e-05, "loss": 0.1222, "step": 539 }, { "epoch": 0.36960985626283366, "grad_norm": 3.7793068885803223, "learning_rate": 7.635334799699266e-05, "loss": 0.0995, "step": 540 }, { "epoch": 0.3702943189596167, "grad_norm": 5.969120502471924, "learning_rate": 7.625519558049721e-05, "loss": 0.1975, "step": 541 }, { "epoch": 0.37097878165639975, "grad_norm": 2.04508900642395, "learning_rate": 7.615690327017194e-05, "loss": 0.075, "step": 542 }, { "epoch": 0.37166324435318276, "grad_norm": 5.6740803718566895, "learning_rate": 7.605847158974123e-05, "loss": 0.1362, "step": 543 }, { "epoch": 0.3723477070499658, "grad_norm": 1.0581105947494507, "learning_rate": 7.595990106367217e-05, "loss": 0.0093, "step": 544 }, { "epoch": 0.3730321697467488, "grad_norm": 5.604482650756836, "learning_rate": 7.586119221717161e-05, "loss": 0.1112, "step": 545 }, { "epoch": 0.3737166324435318, "grad_norm": 3.4312081336975098, "learning_rate": 7.576234557618336e-05, "loss": 0.0485, "step": 546 }, { "epoch": 0.37440109514031483, "grad_norm": 2.75480580329895, "learning_rate": 7.56633616673855e-05, "loss": 0.0772, "step": 547 }, { "epoch": 0.3750855578370979, "grad_norm": 2.1551015377044678, "learning_rate": 7.556424101818746e-05, "loss": 0.0235, "step": 548 }, { "epoch": 0.3757700205338809, "grad_norm": 2.1761882305145264, "learning_rate": 7.546498415672727e-05, "loss": 0.0292, "step": 549 }, { "epoch": 0.37645448323066394, "grad_norm": 3.735642433166504, "learning_rate": 7.536559161186874e-05, "loss": 0.0592, "step": 550 }, { "epoch": 0.37713894592744696, "grad_norm": 4.763843536376953, "learning_rate": 7.526606391319862e-05, "loss": 0.1114, "step": 551 }, { "epoch": 0.37782340862423, "grad_norm": 4.60253381729126, "learning_rate": 7.516640159102377e-05, "loss": 0.1757, "step": 552 }, { "epoch": 0.378507871321013, "grad_norm": 8.288164138793945, "learning_rate": 7.506660517636841e-05, "loss": 0.1474, "step": 553 }, { "epoch": 0.379192334017796, "grad_norm": 2.126084804534912, "learning_rate": 7.49666752009712e-05, "loss": 0.0911, "step": 554 }, { "epoch": 0.3798767967145791, "grad_norm": 4.20339822769165, "learning_rate": 7.486661219728242e-05, "loss": 0.1044, "step": 555 }, { "epoch": 0.3805612594113621, "grad_norm": 9.82011890411377, "learning_rate": 7.47664166984612e-05, "loss": 0.1468, "step": 556 }, { "epoch": 0.3812457221081451, "grad_norm": 4.054382801055908, "learning_rate": 7.466608923837265e-05, "loss": 0.028, "step": 557 }, { "epoch": 0.38193018480492813, "grad_norm": 9.398476600646973, "learning_rate": 7.456563035158492e-05, "loss": 0.136, "step": 558 }, { "epoch": 0.38261464750171115, "grad_norm": 7.4469804763793945, "learning_rate": 7.446504057336652e-05, "loss": 0.1324, "step": 559 }, { "epoch": 0.38329911019849416, "grad_norm": 10.356621742248535, "learning_rate": 7.436432043968331e-05, "loss": 0.1858, "step": 560 }, { "epoch": 0.3839835728952772, "grad_norm": 7.800370216369629, "learning_rate": 7.426347048719577e-05, "loss": 0.3151, "step": 561 }, { "epoch": 0.38466803559206025, "grad_norm": 7.713531017303467, "learning_rate": 7.416249125325606e-05, "loss": 0.1733, "step": 562 }, { "epoch": 0.38535249828884327, "grad_norm": 6.9704365730285645, "learning_rate": 7.406138327590516e-05, "loss": 0.1383, "step": 563 }, { "epoch": 0.3860369609856263, "grad_norm": 6.985771656036377, "learning_rate": 7.396014709387007e-05, "loss": 0.2384, "step": 564 }, { "epoch": 0.3867214236824093, "grad_norm": 8.908336639404297, "learning_rate": 7.385878324656088e-05, "loss": 0.3974, "step": 565 }, { "epoch": 0.3874058863791923, "grad_norm": 7.679532527923584, "learning_rate": 7.37572922740679e-05, "loss": 0.1529, "step": 566 }, { "epoch": 0.38809034907597534, "grad_norm": 6.032624244689941, "learning_rate": 7.365567471715876e-05, "loss": 0.1583, "step": 567 }, { "epoch": 0.3887748117727584, "grad_norm": 11.032501220703125, "learning_rate": 7.355393111727564e-05, "loss": 0.3033, "step": 568 }, { "epoch": 0.3894592744695414, "grad_norm": 5.357724666595459, "learning_rate": 7.345206201653224e-05, "loss": 0.1082, "step": 569 }, { "epoch": 0.39014373716632444, "grad_norm": 4.176272392272949, "learning_rate": 7.3350067957711e-05, "loss": 0.1087, "step": 570 }, { "epoch": 0.39082819986310746, "grad_norm": 6.165141582489014, "learning_rate": 7.324794948426015e-05, "loss": 0.0991, "step": 571 }, { "epoch": 0.3915126625598905, "grad_norm": 6.76448392868042, "learning_rate": 7.314570714029081e-05, "loss": 0.1821, "step": 572 }, { "epoch": 0.3921971252566735, "grad_norm": 9.525832176208496, "learning_rate": 7.304334147057412e-05, "loss": 0.2249, "step": 573 }, { "epoch": 0.3928815879534565, "grad_norm": 7.214599132537842, "learning_rate": 7.294085302053836e-05, "loss": 0.2052, "step": 574 }, { "epoch": 0.3935660506502396, "grad_norm": 6.892542362213135, "learning_rate": 7.2838242336266e-05, "loss": 0.1513, "step": 575 }, { "epoch": 0.3942505133470226, "grad_norm": 6.3590593338012695, "learning_rate": 7.273550996449076e-05, "loss": 0.1288, "step": 576 }, { "epoch": 0.3949349760438056, "grad_norm": 9.475593566894531, "learning_rate": 7.263265645259479e-05, "loss": 0.226, "step": 577 }, { "epoch": 0.39561943874058864, "grad_norm": 3.26088809967041, "learning_rate": 7.25296823486057e-05, "loss": 0.0437, "step": 578 }, { "epoch": 0.39630390143737165, "grad_norm": 2.9698143005371094, "learning_rate": 7.242658820119359e-05, "loss": 0.0825, "step": 579 }, { "epoch": 0.39698836413415467, "grad_norm": 3.6840758323669434, "learning_rate": 7.232337455966825e-05, "loss": 0.1219, "step": 580 }, { "epoch": 0.3976728268309377, "grad_norm": 5.516478061676025, "learning_rate": 7.222004197397613e-05, "loss": 0.1444, "step": 581 }, { "epoch": 0.39835728952772076, "grad_norm": 6.521612644195557, "learning_rate": 7.21165909946974e-05, "loss": 0.1098, "step": 582 }, { "epoch": 0.3990417522245038, "grad_norm": 8.887290954589844, "learning_rate": 7.201302217304318e-05, "loss": 0.1683, "step": 583 }, { "epoch": 0.3997262149212868, "grad_norm": 6.482909679412842, "learning_rate": 7.190933606085233e-05, "loss": 0.1782, "step": 584 }, { "epoch": 0.4004106776180698, "grad_norm": 8.721263885498047, "learning_rate": 7.180553321058875e-05, "loss": 0.1285, "step": 585 }, { "epoch": 0.4010951403148528, "grad_norm": 1.3197485208511353, "learning_rate": 7.170161417533836e-05, "loss": 0.0223, "step": 586 }, { "epoch": 0.40177960301163584, "grad_norm": 5.509128570556641, "learning_rate": 7.15975795088061e-05, "loss": 0.1364, "step": 587 }, { "epoch": 0.4024640657084189, "grad_norm": 3.0783398151397705, "learning_rate": 7.149342976531302e-05, "loss": 0.1096, "step": 588 }, { "epoch": 0.40314852840520193, "grad_norm": 6.428481578826904, "learning_rate": 7.138916549979333e-05, "loss": 0.1891, "step": 589 }, { "epoch": 0.40383299110198495, "grad_norm": 3.7200963497161865, "learning_rate": 7.12847872677915e-05, "loss": 0.1065, "step": 590 }, { "epoch": 0.40451745379876797, "grad_norm": 3.6621577739715576, "learning_rate": 7.118029562545915e-05, "loss": 0.064, "step": 591 }, { "epoch": 0.405201916495551, "grad_norm": 1.9148918390274048, "learning_rate": 7.107569112955224e-05, "loss": 0.0344, "step": 592 }, { "epoch": 0.405886379192334, "grad_norm": 2.9361677169799805, "learning_rate": 7.097097433742799e-05, "loss": 0.0973, "step": 593 }, { "epoch": 0.406570841889117, "grad_norm": 4.974667549133301, "learning_rate": 7.0866145807042e-05, "loss": 0.0863, "step": 594 }, { "epoch": 0.4072553045859001, "grad_norm": 5.509044647216797, "learning_rate": 7.076120609694525e-05, "loss": 0.0819, "step": 595 }, { "epoch": 0.4079397672826831, "grad_norm": 1.6528812646865845, "learning_rate": 7.065615576628107e-05, "loss": 0.0077, "step": 596 }, { "epoch": 0.4086242299794661, "grad_norm": 4.979884624481201, "learning_rate": 7.05509953747822e-05, "loss": 0.0509, "step": 597 }, { "epoch": 0.40930869267624914, "grad_norm": 2.1077194213867188, "learning_rate": 7.044572548276785e-05, "loss": 0.0272, "step": 598 }, { "epoch": 0.40999315537303216, "grad_norm": 6.067802906036377, "learning_rate": 7.034034665114066e-05, "loss": 0.097, "step": 599 }, { "epoch": 0.4106776180698152, "grad_norm": 6.381669521331787, "learning_rate": 7.023485944138372e-05, "loss": 0.0782, "step": 600 }, { "epoch": 0.4113620807665982, "grad_norm": 2.264425039291382, "learning_rate": 7.012926441555758e-05, "loss": 0.0855, "step": 601 }, { "epoch": 0.41204654346338127, "grad_norm": 5.149122714996338, "learning_rate": 7.002356213629724e-05, "loss": 0.1395, "step": 602 }, { "epoch": 0.4127310061601643, "grad_norm": 4.293155670166016, "learning_rate": 6.991775316680924e-05, "loss": 0.1228, "step": 603 }, { "epoch": 0.4134154688569473, "grad_norm": 3.085103988647461, "learning_rate": 6.981183807086851e-05, "loss": 0.1353, "step": 604 }, { "epoch": 0.4140999315537303, "grad_norm": 5.4214653968811035, "learning_rate": 6.970581741281548e-05, "loss": 0.1251, "step": 605 }, { "epoch": 0.41478439425051333, "grad_norm": 5.045284748077393, "learning_rate": 6.959969175755305e-05, "loss": 0.1403, "step": 606 }, { "epoch": 0.41546885694729635, "grad_norm": 8.736660957336426, "learning_rate": 6.949346167054358e-05, "loss": 0.1527, "step": 607 }, { "epoch": 0.4161533196440794, "grad_norm": 8.21678352355957, "learning_rate": 6.938712771780582e-05, "loss": 0.1643, "step": 608 }, { "epoch": 0.41683778234086244, "grad_norm": 5.674940586090088, "learning_rate": 6.928069046591199e-05, "loss": 0.0958, "step": 609 }, { "epoch": 0.41752224503764546, "grad_norm": 3.9406206607818604, "learning_rate": 6.917415048198467e-05, "loss": 0.096, "step": 610 }, { "epoch": 0.4182067077344285, "grad_norm": 9.412683486938477, "learning_rate": 6.906750833369386e-05, "loss": 0.2008, "step": 611 }, { "epoch": 0.4188911704312115, "grad_norm": 4.048202991485596, "learning_rate": 6.89607645892539e-05, "loss": 0.12, "step": 612 }, { "epoch": 0.4195756331279945, "grad_norm": 5.833441734313965, "learning_rate": 6.885391981742043e-05, "loss": 0.2254, "step": 613 }, { "epoch": 0.4202600958247775, "grad_norm": 3.19614315032959, "learning_rate": 6.874697458748744e-05, "loss": 0.1028, "step": 614 }, { "epoch": 0.4209445585215606, "grad_norm": 6.89042854309082, "learning_rate": 6.863992946928412e-05, "loss": 0.1039, "step": 615 }, { "epoch": 0.4216290212183436, "grad_norm": 9.974446296691895, "learning_rate": 6.853278503317197e-05, "loss": 0.1658, "step": 616 }, { "epoch": 0.42231348391512663, "grad_norm": 5.457868576049805, "learning_rate": 6.842554185004162e-05, "loss": 0.0908, "step": 617 }, { "epoch": 0.42299794661190965, "grad_norm": 10.526845932006836, "learning_rate": 6.831820049130985e-05, "loss": 0.3783, "step": 618 }, { "epoch": 0.42368240930869266, "grad_norm": 9.580425262451172, "learning_rate": 6.821076152891654e-05, "loss": 0.1303, "step": 619 }, { "epoch": 0.4243668720054757, "grad_norm": 6.656687259674072, "learning_rate": 6.810322553532167e-05, "loss": 0.1665, "step": 620 }, { "epoch": 0.42505133470225875, "grad_norm": 4.369217395782471, "learning_rate": 6.799559308350218e-05, "loss": 0.0898, "step": 621 }, { "epoch": 0.42573579739904177, "grad_norm": 6.606124401092529, "learning_rate": 6.7887864746949e-05, "loss": 0.1103, "step": 622 }, { "epoch": 0.4264202600958248, "grad_norm": 3.831536293029785, "learning_rate": 6.778004109966387e-05, "loss": 0.0785, "step": 623 }, { "epoch": 0.4271047227926078, "grad_norm": 5.25603723526001, "learning_rate": 6.767212271615649e-05, "loss": 0.1562, "step": 624 }, { "epoch": 0.4277891854893908, "grad_norm": 5.623077392578125, "learning_rate": 6.756411017144126e-05, "loss": 0.1467, "step": 625 }, { "epoch": 0.42847364818617384, "grad_norm": 2.9501168727874756, "learning_rate": 6.74560040410343e-05, "loss": 0.0988, "step": 626 }, { "epoch": 0.42915811088295686, "grad_norm": 6.494692325592041, "learning_rate": 6.734780490095039e-05, "loss": 0.1735, "step": 627 }, { "epoch": 0.42984257357973993, "grad_norm": 4.433006286621094, "learning_rate": 6.72395133276999e-05, "loss": 0.086, "step": 628 }, { "epoch": 0.43052703627652295, "grad_norm": 6.381512641906738, "learning_rate": 6.713112989828567e-05, "loss": 0.1017, "step": 629 }, { "epoch": 0.43121149897330596, "grad_norm": 4.848477840423584, "learning_rate": 6.70226551902e-05, "loss": 0.1397, "step": 630 }, { "epoch": 0.431895961670089, "grad_norm": 5.080401420593262, "learning_rate": 6.691408978142153e-05, "loss": 0.1294, "step": 631 }, { "epoch": 0.432580424366872, "grad_norm": 6.366408824920654, "learning_rate": 6.680543425041217e-05, "loss": 0.1479, "step": 632 }, { "epoch": 0.433264887063655, "grad_norm": 5.9943037033081055, "learning_rate": 6.669668917611404e-05, "loss": 0.1705, "step": 633 }, { "epoch": 0.43394934976043803, "grad_norm": 4.0005784034729, "learning_rate": 6.65878551379464e-05, "loss": 0.0673, "step": 634 }, { "epoch": 0.4346338124572211, "grad_norm": 3.0548219680786133, "learning_rate": 6.64789327158024e-05, "loss": 0.0821, "step": 635 }, { "epoch": 0.4353182751540041, "grad_norm": 3.663999557495117, "learning_rate": 6.63699224900463e-05, "loss": 0.0444, "step": 636 }, { "epoch": 0.43600273785078714, "grad_norm": 3.711920976638794, "learning_rate": 6.626082504151004e-05, "loss": 0.0894, "step": 637 }, { "epoch": 0.43668720054757015, "grad_norm": 7.126670837402344, "learning_rate": 6.615164095149042e-05, "loss": 0.1621, "step": 638 }, { "epoch": 0.43737166324435317, "grad_norm": 2.8638012409210205, "learning_rate": 6.60423708017458e-05, "loss": 0.0272, "step": 639 }, { "epoch": 0.4380561259411362, "grad_norm": 3.5263171195983887, "learning_rate": 6.593301517449317e-05, "loss": 0.0912, "step": 640 }, { "epoch": 0.43874058863791926, "grad_norm": 3.85530161857605, "learning_rate": 6.582357465240487e-05, "loss": 0.085, "step": 641 }, { "epoch": 0.4394250513347023, "grad_norm": 5.0717644691467285, "learning_rate": 6.571404981860565e-05, "loss": 0.0828, "step": 642 }, { "epoch": 0.4401095140314853, "grad_norm": 2.5986831188201904, "learning_rate": 6.560444125666947e-05, "loss": 0.0647, "step": 643 }, { "epoch": 0.4407939767282683, "grad_norm": 3.8211185932159424, "learning_rate": 6.54947495506164e-05, "loss": 0.167, "step": 644 }, { "epoch": 0.4414784394250513, "grad_norm": 3.512300491333008, "learning_rate": 6.538497528490956e-05, "loss": 0.0477, "step": 645 }, { "epoch": 0.44216290212183434, "grad_norm": 3.763268232345581, "learning_rate": 6.527511904445194e-05, "loss": 0.0469, "step": 646 }, { "epoch": 0.44284736481861736, "grad_norm": 3.623258113861084, "learning_rate": 6.516518141458327e-05, "loss": 0.0828, "step": 647 }, { "epoch": 0.44353182751540043, "grad_norm": 4.421901226043701, "learning_rate": 6.505516298107703e-05, "loss": 0.056, "step": 648 }, { "epoch": 0.44421629021218345, "grad_norm": 0.3914797008037567, "learning_rate": 6.494506433013719e-05, "loss": 0.0024, "step": 649 }, { "epoch": 0.44490075290896647, "grad_norm": 2.4494991302490234, "learning_rate": 6.483488604839513e-05, "loss": 0.0227, "step": 650 }, { "epoch": 0.4455852156057495, "grad_norm": 3.0195860862731934, "learning_rate": 6.472462872290653e-05, "loss": 0.0604, "step": 651 }, { "epoch": 0.4462696783025325, "grad_norm": 2.999337911605835, "learning_rate": 6.461429294114824e-05, "loss": 0.0777, "step": 652 }, { "epoch": 0.4469541409993155, "grad_norm": 3.339648723602295, "learning_rate": 6.450387929101515e-05, "loss": 0.0928, "step": 653 }, { "epoch": 0.44763860369609854, "grad_norm": 4.1887335777282715, "learning_rate": 6.439338836081705e-05, "loss": 0.0561, "step": 654 }, { "epoch": 0.4483230663928816, "grad_norm": 7.507865905761719, "learning_rate": 6.428282073927547e-05, "loss": 0.1471, "step": 655 }, { "epoch": 0.4490075290896646, "grad_norm": 2.752976655960083, "learning_rate": 6.417217701552059e-05, "loss": 0.0815, "step": 656 }, { "epoch": 0.44969199178644764, "grad_norm": 4.312994480133057, "learning_rate": 6.406145777908807e-05, "loss": 0.109, "step": 657 }, { "epoch": 0.45037645448323066, "grad_norm": 3.1656267642974854, "learning_rate": 6.395066361991595e-05, "loss": 0.1206, "step": 658 }, { "epoch": 0.4510609171800137, "grad_norm": 2.383669137954712, "learning_rate": 6.383979512834143e-05, "loss": 0.0511, "step": 659 }, { "epoch": 0.4517453798767967, "grad_norm": 4.5521345138549805, "learning_rate": 6.372885289509782e-05, "loss": 0.1255, "step": 660 }, { "epoch": 0.45242984257357977, "grad_norm": 4.706148147583008, "learning_rate": 6.361783751131129e-05, "loss": 0.1008, "step": 661 }, { "epoch": 0.4531143052703628, "grad_norm": 3.0235910415649414, "learning_rate": 6.350674956849783e-05, "loss": 0.1056, "step": 662 }, { "epoch": 0.4537987679671458, "grad_norm": 3.969320297241211, "learning_rate": 6.339558965855996e-05, "loss": 0.0636, "step": 663 }, { "epoch": 0.4544832306639288, "grad_norm": 3.3739521503448486, "learning_rate": 6.328435837378377e-05, "loss": 0.1169, "step": 664 }, { "epoch": 0.45516769336071183, "grad_norm": 4.904560565948486, "learning_rate": 6.317305630683554e-05, "loss": 0.1391, "step": 665 }, { "epoch": 0.45585215605749485, "grad_norm": 5.358192443847656, "learning_rate": 6.306168405075877e-05, "loss": 0.128, "step": 666 }, { "epoch": 0.45653661875427787, "grad_norm": 3.473628044128418, "learning_rate": 6.295024219897093e-05, "loss": 0.0797, "step": 667 }, { "epoch": 0.45722108145106094, "grad_norm": 10.266050338745117, "learning_rate": 6.283873134526028e-05, "loss": 0.1744, "step": 668 }, { "epoch": 0.45790554414784396, "grad_norm": 13.234397888183594, "learning_rate": 6.272715208378275e-05, "loss": 0.2535, "step": 669 }, { "epoch": 0.458590006844627, "grad_norm": 5.1101884841918945, "learning_rate": 6.26155050090588e-05, "loss": 0.1211, "step": 670 }, { "epoch": 0.45927446954141, "grad_norm": 9.990983009338379, "learning_rate": 6.250379071597018e-05, "loss": 0.3491, "step": 671 }, { "epoch": 0.459958932238193, "grad_norm": 4.402804851531982, "learning_rate": 6.239200979975679e-05, "loss": 0.0882, "step": 672 }, { "epoch": 0.460643394934976, "grad_norm": 3.7745416164398193, "learning_rate": 6.228016285601353e-05, "loss": 0.1489, "step": 673 }, { "epoch": 0.46132785763175904, "grad_norm": 6.636841297149658, "learning_rate": 6.21682504806871e-05, "loss": 0.2105, "step": 674 }, { "epoch": 0.4620123203285421, "grad_norm": 7.525152683258057, "learning_rate": 6.205627327007287e-05, "loss": 0.2167, "step": 675 }, { "epoch": 0.46269678302532513, "grad_norm": 5.288642883300781, "learning_rate": 6.19442318208116e-05, "loss": 0.1198, "step": 676 }, { "epoch": 0.46338124572210815, "grad_norm": 9.156393051147461, "learning_rate": 6.183212672988639e-05, "loss": 0.2037, "step": 677 }, { "epoch": 0.46406570841889117, "grad_norm": 13.297460556030273, "learning_rate": 6.171995859461941e-05, "loss": 0.2201, "step": 678 }, { "epoch": 0.4647501711156742, "grad_norm": 4.868882656097412, "learning_rate": 6.160772801266874e-05, "loss": 0.1272, "step": 679 }, { "epoch": 0.4654346338124572, "grad_norm": 1.959247350692749, "learning_rate": 6.149543558202521e-05, "loss": 0.0393, "step": 680 }, { "epoch": 0.46611909650924027, "grad_norm": 6.215037822723389, "learning_rate": 6.138308190100917e-05, "loss": 0.1391, "step": 681 }, { "epoch": 0.4668035592060233, "grad_norm": 7.679835319519043, "learning_rate": 6.127066756826738e-05, "loss": 0.2223, "step": 682 }, { "epoch": 0.4674880219028063, "grad_norm": 4.923123359680176, "learning_rate": 6.115819318276968e-05, "loss": 0.0976, "step": 683 }, { "epoch": 0.4681724845995893, "grad_norm": 4.207338333129883, "learning_rate": 6.104565934380596e-05, "loss": 0.0965, "step": 684 }, { "epoch": 0.46885694729637234, "grad_norm": 5.4363789558410645, "learning_rate": 6.0933066650982836e-05, "loss": 0.1019, "step": 685 }, { "epoch": 0.46954140999315536, "grad_norm": 3.468738555908203, "learning_rate": 6.082041570422059e-05, "loss": 0.1203, "step": 686 }, { "epoch": 0.4702258726899384, "grad_norm": 4.9951372146606445, "learning_rate": 6.07077071037498e-05, "loss": 0.083, "step": 687 }, { "epoch": 0.47091033538672145, "grad_norm": 8.549434661865234, "learning_rate": 6.05949414501083e-05, "loss": 0.192, "step": 688 }, { "epoch": 0.47159479808350446, "grad_norm": 4.898756504058838, "learning_rate": 6.048211934413788e-05, "loss": 0.0417, "step": 689 }, { "epoch": 0.4722792607802875, "grad_norm": 4.137899398803711, "learning_rate": 6.036924138698117e-05, "loss": 0.0852, "step": 690 }, { "epoch": 0.4729637234770705, "grad_norm": 1.9068682193756104, "learning_rate": 6.025630818007833e-05, "loss": 0.021, "step": 691 }, { "epoch": 0.4736481861738535, "grad_norm": 3.344748020172119, "learning_rate": 6.014332032516393e-05, "loss": 0.0429, "step": 692 }, { "epoch": 0.47433264887063653, "grad_norm": 1.7427654266357422, "learning_rate": 6.003027842426372e-05, "loss": 0.023, "step": 693 }, { "epoch": 0.4750171115674196, "grad_norm": 2.4438343048095703, "learning_rate": 5.991718307969143e-05, "loss": 0.0505, "step": 694 }, { "epoch": 0.4757015742642026, "grad_norm": 3.002808094024658, "learning_rate": 5.980403489404554e-05, "loss": 0.023, "step": 695 }, { "epoch": 0.47638603696098564, "grad_norm": 3.3711915016174316, "learning_rate": 5.9690834470206055e-05, "loss": 0.049, "step": 696 }, { "epoch": 0.47707049965776865, "grad_norm": 2.2680435180664062, "learning_rate": 5.957758241133136e-05, "loss": 0.0287, "step": 697 }, { "epoch": 0.47775496235455167, "grad_norm": 4.09075403213501, "learning_rate": 5.9464279320854923e-05, "loss": 0.0547, "step": 698 }, { "epoch": 0.4784394250513347, "grad_norm": 2.7409632205963135, "learning_rate": 5.935092580248216e-05, "loss": 0.0226, "step": 699 }, { "epoch": 0.4791238877481177, "grad_norm": 0.051277488470077515, "learning_rate": 5.923752246018717e-05, "loss": 0.0004, "step": 700 }, { "epoch": 0.4798083504449008, "grad_norm": 7.835565090179443, "learning_rate": 5.912406989820948e-05, "loss": 0.2243, "step": 701 }, { "epoch": 0.4804928131416838, "grad_norm": 2.837891101837158, "learning_rate": 5.901056872105092e-05, "loss": 0.0391, "step": 702 }, { "epoch": 0.4811772758384668, "grad_norm": 3.1586685180664062, "learning_rate": 5.889701953347234e-05, "loss": 0.0688, "step": 703 }, { "epoch": 0.48186173853524983, "grad_norm": 4.481542110443115, "learning_rate": 5.878342294049042e-05, "loss": 0.1417, "step": 704 }, { "epoch": 0.48254620123203285, "grad_norm": 4.772768974304199, "learning_rate": 5.866977954737438e-05, "loss": 0.0614, "step": 705 }, { "epoch": 0.48323066392881586, "grad_norm": 6.27142333984375, "learning_rate": 5.855608995964282e-05, "loss": 0.1423, "step": 706 }, { "epoch": 0.4839151266255989, "grad_norm": 3.500842571258545, "learning_rate": 5.8442354783060515e-05, "loss": 0.051, "step": 707 }, { "epoch": 0.48459958932238195, "grad_norm": 1.6878178119659424, "learning_rate": 5.8328574623635114e-05, "loss": 0.0542, "step": 708 }, { "epoch": 0.48528405201916497, "grad_norm": 3.8363585472106934, "learning_rate": 5.8214750087613924e-05, "loss": 0.1011, "step": 709 }, { "epoch": 0.485968514715948, "grad_norm": 8.47804069519043, "learning_rate": 5.810088178148074e-05, "loss": 0.141, "step": 710 }, { "epoch": 0.486652977412731, "grad_norm": 6.544390678405762, "learning_rate": 5.798697031195257e-05, "loss": 0.1418, "step": 711 }, { "epoch": 0.487337440109514, "grad_norm": 7.7474517822265625, "learning_rate": 5.787301628597638e-05, "loss": 0.1312, "step": 712 }, { "epoch": 0.48802190280629704, "grad_norm": 7.021209239959717, "learning_rate": 5.775902031072591e-05, "loss": 0.1317, "step": 713 }, { "epoch": 0.4887063655030801, "grad_norm": 7.520637512207031, "learning_rate": 5.76449829935984e-05, "loss": 0.2666, "step": 714 }, { "epoch": 0.4893908281998631, "grad_norm": 10.08985424041748, "learning_rate": 5.753090494221138e-05, "loss": 0.1912, "step": 715 }, { "epoch": 0.49007529089664614, "grad_norm": 9.202937126159668, "learning_rate": 5.741678676439945e-05, "loss": 0.0715, "step": 716 }, { "epoch": 0.49075975359342916, "grad_norm": 5.043891906738281, "learning_rate": 5.7302629068210955e-05, "loss": 0.0821, "step": 717 }, { "epoch": 0.4914442162902122, "grad_norm": 6.774025917053223, "learning_rate": 5.718843246190484e-05, "loss": 0.1178, "step": 718 }, { "epoch": 0.4921286789869952, "grad_norm": 4.967996120452881, "learning_rate": 5.707419755394734e-05, "loss": 0.1147, "step": 719 }, { "epoch": 0.4928131416837782, "grad_norm": 5.476380348205566, "learning_rate": 5.695992495300881e-05, "loss": 0.0833, "step": 720 }, { "epoch": 0.4934976043805613, "grad_norm": 5.351373672485352, "learning_rate": 5.684561526796045e-05, "loss": 0.093, "step": 721 }, { "epoch": 0.4941820670773443, "grad_norm": 10.904254913330078, "learning_rate": 5.6731269107871e-05, "loss": 0.1462, "step": 722 }, { "epoch": 0.4948665297741273, "grad_norm": 8.17954158782959, "learning_rate": 5.6616887082003565e-05, "loss": 0.1796, "step": 723 }, { "epoch": 0.49555099247091033, "grad_norm": 5.251359939575195, "learning_rate": 5.6502469799812383e-05, "loss": 0.1643, "step": 724 }, { "epoch": 0.49623545516769335, "grad_norm": 4.029839038848877, "learning_rate": 5.638801787093953e-05, "loss": 0.1441, "step": 725 }, { "epoch": 0.49691991786447637, "grad_norm": 3.5853965282440186, "learning_rate": 5.627353190521168e-05, "loss": 0.108, "step": 726 }, { "epoch": 0.4976043805612594, "grad_norm": 4.687831878662109, "learning_rate": 5.615901251263682e-05, "loss": 0.1313, "step": 727 }, { "epoch": 0.49828884325804246, "grad_norm": 5.8699774742126465, "learning_rate": 5.6044460303401156e-05, "loss": 0.1395, "step": 728 }, { "epoch": 0.4989733059548255, "grad_norm": 6.8251633644104, "learning_rate": 5.5929875887865654e-05, "loss": 0.1349, "step": 729 }, { "epoch": 0.4996577686516085, "grad_norm": 3.7832653522491455, "learning_rate": 5.581525987656291e-05, "loss": 0.1569, "step": 730 }, { "epoch": 0.5003422313483915, "grad_norm": 3.226412534713745, "learning_rate": 5.5700612880193846e-05, "loss": 0.0698, "step": 731 }, { "epoch": 0.5010266940451745, "grad_norm": 5.204970836639404, "learning_rate": 5.5585935509624544e-05, "loss": 0.1031, "step": 732 }, { "epoch": 0.5017111567419575, "grad_norm": 8.392780303955078, "learning_rate": 5.547122837588288e-05, "loss": 0.3057, "step": 733 }, { "epoch": 0.5023956194387406, "grad_norm": 4.304141044616699, "learning_rate": 5.535649209015531e-05, "loss": 0.1174, "step": 734 }, { "epoch": 0.5030800821355236, "grad_norm": 0.9668604731559753, "learning_rate": 5.5241727263783614e-05, "loss": 0.0372, "step": 735 }, { "epoch": 0.5037645448323066, "grad_norm": 6.357860088348389, "learning_rate": 5.5126934508261695e-05, "loss": 0.1685, "step": 736 }, { "epoch": 0.5044490075290896, "grad_norm": 5.614017009735107, "learning_rate": 5.5012114435232206e-05, "loss": 0.159, "step": 737 }, { "epoch": 0.5051334702258727, "grad_norm": 5.565310001373291, "learning_rate": 5.489726765648341e-05, "loss": 0.0552, "step": 738 }, { "epoch": 0.5058179329226558, "grad_norm": 3.3780341148376465, "learning_rate": 5.47823947839458e-05, "loss": 0.0704, "step": 739 }, { "epoch": 0.5065023956194388, "grad_norm": 4.278140544891357, "learning_rate": 5.466749642968897e-05, "loss": 0.0814, "step": 740 }, { "epoch": 0.5071868583162218, "grad_norm": 2.6938846111297607, "learning_rate": 5.4552573205918246e-05, "loss": 0.0703, "step": 741 }, { "epoch": 0.5078713210130048, "grad_norm": 6.953071117401123, "learning_rate": 5.4437625724971465e-05, "loss": 0.1198, "step": 742 }, { "epoch": 0.5085557837097878, "grad_norm": 3.365001678466797, "learning_rate": 5.4322654599315724e-05, "loss": 0.0516, "step": 743 }, { "epoch": 0.5092402464065708, "grad_norm": 2.691910982131958, "learning_rate": 5.4207660441544116e-05, "loss": 0.0279, "step": 744 }, { "epoch": 0.5099247091033539, "grad_norm": 4.718718528747559, "learning_rate": 5.4092643864372425e-05, "loss": 0.0892, "step": 745 }, { "epoch": 0.5106091718001369, "grad_norm": 4.679147243499756, "learning_rate": 5.397760548063591e-05, "loss": 0.0844, "step": 746 }, { "epoch": 0.5112936344969199, "grad_norm": 3.144240617752075, "learning_rate": 5.386254590328601e-05, "loss": 0.0351, "step": 747 }, { "epoch": 0.5119780971937029, "grad_norm": 5.339894771575928, "learning_rate": 5.3747465745387094e-05, "loss": 0.072, "step": 748 }, { "epoch": 0.5126625598904859, "grad_norm": 3.566080093383789, "learning_rate": 5.363236562011321e-05, "loss": 0.0333, "step": 749 }, { "epoch": 0.5133470225872689, "grad_norm": 6.320427894592285, "learning_rate": 5.351724614074477e-05, "loss": 0.0738, "step": 750 }, { "epoch": 0.5140314852840521, "grad_norm": 6.906569957733154, "learning_rate": 5.340210792066531e-05, "loss": 0.1429, "step": 751 }, { "epoch": 0.5147159479808351, "grad_norm": 6.978096008300781, "learning_rate": 5.3286951573358214e-05, "loss": 0.0798, "step": 752 }, { "epoch": 0.5154004106776181, "grad_norm": 10.238512992858887, "learning_rate": 5.317177771240349e-05, "loss": 0.0976, "step": 753 }, { "epoch": 0.5160848733744011, "grad_norm": 8.852508544921875, "learning_rate": 5.305658695147443e-05, "loss": 0.1056, "step": 754 }, { "epoch": 0.5167693360711841, "grad_norm": 8.208331108093262, "learning_rate": 5.2941379904334364e-05, "loss": 0.1638, "step": 755 }, { "epoch": 0.5174537987679672, "grad_norm": 4.964343547821045, "learning_rate": 5.282615718483343e-05, "loss": 0.06, "step": 756 }, { "epoch": 0.5181382614647502, "grad_norm": 4.8337554931640625, "learning_rate": 5.2710919406905266e-05, "loss": 0.088, "step": 757 }, { "epoch": 0.5188227241615332, "grad_norm": 4.182374000549316, "learning_rate": 5.2595667184563714e-05, "loss": 0.0772, "step": 758 }, { "epoch": 0.5195071868583162, "grad_norm": 4.3997416496276855, "learning_rate": 5.248040113189963e-05, "loss": 0.1505, "step": 759 }, { "epoch": 0.5201916495550992, "grad_norm": 5.846405029296875, "learning_rate": 5.2365121863077495e-05, "loss": 0.1146, "step": 760 }, { "epoch": 0.5208761122518822, "grad_norm": 5.653942108154297, "learning_rate": 5.2249829992332276e-05, "loss": 0.1639, "step": 761 }, { "epoch": 0.5215605749486653, "grad_norm": 2.020703077316284, "learning_rate": 5.213452613396606e-05, "loss": 0.0392, "step": 762 }, { "epoch": 0.5222450376454483, "grad_norm": 4.763345718383789, "learning_rate": 5.2019210902344774e-05, "loss": 0.1128, "step": 763 }, { "epoch": 0.5229295003422314, "grad_norm": 6.672398567199707, "learning_rate": 5.190388491189498e-05, "loss": 0.1032, "step": 764 }, { "epoch": 0.5236139630390144, "grad_norm": 5.990591526031494, "learning_rate": 5.1788548777100584e-05, "loss": 0.1571, "step": 765 }, { "epoch": 0.5242984257357974, "grad_norm": 6.739293098449707, "learning_rate": 5.1673203112499514e-05, "loss": 0.0905, "step": 766 }, { "epoch": 0.5249828884325805, "grad_norm": 6.739519119262695, "learning_rate": 5.1557848532680475e-05, "loss": 0.2011, "step": 767 }, { "epoch": 0.5256673511293635, "grad_norm": 6.0303215980529785, "learning_rate": 5.144248565227967e-05, "loss": 0.0854, "step": 768 }, { "epoch": 0.5263518138261465, "grad_norm": 4.493467330932617, "learning_rate": 5.132711508597753e-05, "loss": 0.1491, "step": 769 }, { "epoch": 0.5270362765229295, "grad_norm": 5.181914806365967, "learning_rate": 5.121173744849548e-05, "loss": 0.1318, "step": 770 }, { "epoch": 0.5277207392197125, "grad_norm": 4.230520725250244, "learning_rate": 5.109635335459256e-05, "loss": 0.098, "step": 771 }, { "epoch": 0.5284052019164955, "grad_norm": 5.118719577789307, "learning_rate": 5.098096341906226e-05, "loss": 0.1393, "step": 772 }, { "epoch": 0.5290896646132786, "grad_norm": 5.9567646980285645, "learning_rate": 5.086556825672914e-05, "loss": 0.1463, "step": 773 }, { "epoch": 0.5297741273100616, "grad_norm": 3.912684440612793, "learning_rate": 5.0750168482445694e-05, "loss": 0.0962, "step": 774 }, { "epoch": 0.5304585900068446, "grad_norm": 7.363780498504639, "learning_rate": 5.063476471108891e-05, "loss": 0.1693, "step": 775 }, { "epoch": 0.5311430527036276, "grad_norm": 3.4148874282836914, "learning_rate": 5.0519357557557125e-05, "loss": 0.1644, "step": 776 }, { "epoch": 0.5318275154004107, "grad_norm": 11.272842407226562, "learning_rate": 5.0403947636766636e-05, "loss": 0.1916, "step": 777 }, { "epoch": 0.5325119780971937, "grad_norm": 3.1405372619628906, "learning_rate": 5.0288535563648575e-05, "loss": 0.045, "step": 778 }, { "epoch": 0.5331964407939768, "grad_norm": 8.825194358825684, "learning_rate": 5.017312195314546e-05, "loss": 0.1822, "step": 779 }, { "epoch": 0.5338809034907598, "grad_norm": 7.6387715339660645, "learning_rate": 5.0057707420208044e-05, "loss": 0.1448, "step": 780 }, { "epoch": 0.5345653661875428, "grad_norm": 6.639828681945801, "learning_rate": 4.994229257979196e-05, "loss": 0.1557, "step": 781 }, { "epoch": 0.5352498288843258, "grad_norm": 9.324431419372559, "learning_rate": 4.982687804685456e-05, "loss": 0.2023, "step": 782 }, { "epoch": 0.5359342915811088, "grad_norm": 4.798715114593506, "learning_rate": 4.971146443635144e-05, "loss": 0.0487, "step": 783 }, { "epoch": 0.5366187542778919, "grad_norm": 5.285682201385498, "learning_rate": 4.959605236323338e-05, "loss": 0.1406, "step": 784 }, { "epoch": 0.5373032169746749, "grad_norm": 9.87006664276123, "learning_rate": 4.9480642442442886e-05, "loss": 0.0927, "step": 785 }, { "epoch": 0.5379876796714579, "grad_norm": 2.7305855751037598, "learning_rate": 4.93652352889111e-05, "loss": 0.0464, "step": 786 }, { "epoch": 0.5386721423682409, "grad_norm": 6.669300556182861, "learning_rate": 4.924983151755432e-05, "loss": 0.0802, "step": 787 }, { "epoch": 0.5393566050650239, "grad_norm": 5.478473663330078, "learning_rate": 4.913443174327086e-05, "loss": 0.0963, "step": 788 }, { "epoch": 0.5400410677618069, "grad_norm": 2.5427608489990234, "learning_rate": 4.9019036580937755e-05, "loss": 0.0707, "step": 789 }, { "epoch": 0.54072553045859, "grad_norm": 5.0992231369018555, "learning_rate": 4.8903646645407437e-05, "loss": 0.1154, "step": 790 }, { "epoch": 0.5414099931553731, "grad_norm": 7.1983642578125, "learning_rate": 4.878826255150453e-05, "loss": 0.1659, "step": 791 }, { "epoch": 0.5420944558521561, "grad_norm": 4.861118316650391, "learning_rate": 4.867288491402247e-05, "loss": 0.0779, "step": 792 }, { "epoch": 0.5427789185489391, "grad_norm": 5.645224094390869, "learning_rate": 4.855751434772034e-05, "loss": 0.0949, "step": 793 }, { "epoch": 0.5434633812457221, "grad_norm": 2.6943721771240234, "learning_rate": 4.844215146731952e-05, "loss": 0.0226, "step": 794 }, { "epoch": 0.5441478439425051, "grad_norm": 4.790472984313965, "learning_rate": 4.832679688750049e-05, "loss": 0.0975, "step": 795 }, { "epoch": 0.5448323066392882, "grad_norm": 4.523157119750977, "learning_rate": 4.8211451222899414e-05, "loss": 0.0679, "step": 796 }, { "epoch": 0.5455167693360712, "grad_norm": 2.8285012245178223, "learning_rate": 4.809611508810502e-05, "loss": 0.0597, "step": 797 }, { "epoch": 0.5462012320328542, "grad_norm": 2.938415765762329, "learning_rate": 4.798078909765524e-05, "loss": 0.0418, "step": 798 }, { "epoch": 0.5468856947296372, "grad_norm": 1.7410691976547241, "learning_rate": 4.7865473866033974e-05, "loss": 0.0078, "step": 799 }, { "epoch": 0.5475701574264202, "grad_norm": 2.4399964809417725, "learning_rate": 4.7750170007667736e-05, "loss": 0.0247, "step": 800 }, { "epoch": 0.5482546201232033, "grad_norm": 4.801215648651123, "learning_rate": 4.7634878136922524e-05, "loss": 0.1229, "step": 801 }, { "epoch": 0.5489390828199863, "grad_norm": 1.1044646501541138, "learning_rate": 4.7519598868100384e-05, "loss": 0.0072, "step": 802 }, { "epoch": 0.5496235455167693, "grad_norm": 2.399359703063965, "learning_rate": 4.7404332815436305e-05, "loss": 0.0546, "step": 803 }, { "epoch": 0.5503080082135524, "grad_norm": 3.4866628646850586, "learning_rate": 4.7289080593094746e-05, "loss": 0.0515, "step": 804 }, { "epoch": 0.5509924709103354, "grad_norm": 6.381715774536133, "learning_rate": 4.717384281516658e-05, "loss": 0.1577, "step": 805 }, { "epoch": 0.5516769336071184, "grad_norm": 3.3924477100372314, "learning_rate": 4.705862009566564e-05, "loss": 0.0642, "step": 806 }, { "epoch": 0.5523613963039015, "grad_norm": 3.914611577987671, "learning_rate": 4.694341304852557e-05, "loss": 0.067, "step": 807 }, { "epoch": 0.5530458590006845, "grad_norm": 5.13940954208374, "learning_rate": 4.682822228759652e-05, "loss": 0.0859, "step": 808 }, { "epoch": 0.5537303216974675, "grad_norm": 5.011077404022217, "learning_rate": 4.6713048426641784e-05, "loss": 0.0968, "step": 809 }, { "epoch": 0.5544147843942505, "grad_norm": 3.9265668392181396, "learning_rate": 4.65978920793347e-05, "loss": 0.0491, "step": 810 }, { "epoch": 0.5550992470910335, "grad_norm": 9.224593162536621, "learning_rate": 4.6482753859255226e-05, "loss": 0.1788, "step": 811 }, { "epoch": 0.5557837097878165, "grad_norm": 6.449541091918945, "learning_rate": 4.6367634379886795e-05, "loss": 0.1087, "step": 812 }, { "epoch": 0.5564681724845996, "grad_norm": 4.7026214599609375, "learning_rate": 4.62525342546129e-05, "loss": 0.206, "step": 813 }, { "epoch": 0.5571526351813826, "grad_norm": 4.3106279373168945, "learning_rate": 4.6137454096714e-05, "loss": 0.1007, "step": 814 }, { "epoch": 0.5578370978781656, "grad_norm": 8.00778865814209, "learning_rate": 4.60223945193641e-05, "loss": 0.1657, "step": 815 }, { "epoch": 0.5585215605749486, "grad_norm": 3.604494333267212, "learning_rate": 4.59073561356276e-05, "loss": 0.04, "step": 816 }, { "epoch": 0.5592060232717317, "grad_norm": 5.229626178741455, "learning_rate": 4.57923395584559e-05, "loss": 0.1531, "step": 817 }, { "epoch": 0.5598904859685148, "grad_norm": 7.218879222869873, "learning_rate": 4.567734540068429e-05, "loss": 0.1938, "step": 818 }, { "epoch": 0.5605749486652978, "grad_norm": 6.35554313659668, "learning_rate": 4.556237427502854e-05, "loss": 0.1211, "step": 819 }, { "epoch": 0.5612594113620808, "grad_norm": 5.144163131713867, "learning_rate": 4.544742679408178e-05, "loss": 0.1667, "step": 820 }, { "epoch": 0.5619438740588638, "grad_norm": 5.5775580406188965, "learning_rate": 4.533250357031104e-05, "loss": 0.0783, "step": 821 }, { "epoch": 0.5626283367556468, "grad_norm": 3.1538820266723633, "learning_rate": 4.521760521605421e-05, "loss": 0.0894, "step": 822 }, { "epoch": 0.5633127994524298, "grad_norm": 5.745071887969971, "learning_rate": 4.51027323435166e-05, "loss": 0.0641, "step": 823 }, { "epoch": 0.5639972621492129, "grad_norm": 5.599588394165039, "learning_rate": 4.4987885564767785e-05, "loss": 0.1987, "step": 824 }, { "epoch": 0.5646817248459959, "grad_norm": 6.593318462371826, "learning_rate": 4.4873065491738316e-05, "loss": 0.2226, "step": 825 }, { "epoch": 0.5653661875427789, "grad_norm": 5.574217796325684, "learning_rate": 4.475827273621639e-05, "loss": 0.1276, "step": 826 }, { "epoch": 0.5660506502395619, "grad_norm": 4.178391933441162, "learning_rate": 4.464350790984471e-05, "loss": 0.1097, "step": 827 }, { "epoch": 0.5667351129363449, "grad_norm": 12.578929901123047, "learning_rate": 4.452877162411712e-05, "loss": 0.3352, "step": 828 }, { "epoch": 0.567419575633128, "grad_norm": 6.773298740386963, "learning_rate": 4.441406449037546e-05, "loss": 0.1516, "step": 829 }, { "epoch": 0.568104038329911, "grad_norm": 4.792623519897461, "learning_rate": 4.429938711980616e-05, "loss": 0.2125, "step": 830 }, { "epoch": 0.5687885010266941, "grad_norm": 3.901940107345581, "learning_rate": 4.4184740123437115e-05, "loss": 0.0887, "step": 831 }, { "epoch": 0.5694729637234771, "grad_norm": 4.643041610717773, "learning_rate": 4.407012411213435e-05, "loss": 0.053, "step": 832 }, { "epoch": 0.5701574264202601, "grad_norm": 7.668546199798584, "learning_rate": 4.3955539696598855e-05, "loss": 0.11, "step": 833 }, { "epoch": 0.5708418891170431, "grad_norm": 9.03883171081543, "learning_rate": 4.384098748736318e-05, "loss": 0.1887, "step": 834 }, { "epoch": 0.5715263518138262, "grad_norm": 6.325910568237305, "learning_rate": 4.3726468094788355e-05, "loss": 0.2264, "step": 835 }, { "epoch": 0.5722108145106092, "grad_norm": 5.792248249053955, "learning_rate": 4.361198212906048e-05, "loss": 0.1828, "step": 836 }, { "epoch": 0.5728952772073922, "grad_norm": 2.6224963665008545, "learning_rate": 4.349753020018763e-05, "loss": 0.0559, "step": 837 }, { "epoch": 0.5735797399041752, "grad_norm": 5.736154079437256, "learning_rate": 4.338311291799645e-05, "loss": 0.1071, "step": 838 }, { "epoch": 0.5742642026009582, "grad_norm": 2.6493375301361084, "learning_rate": 4.3268730892129025e-05, "loss": 0.0352, "step": 839 }, { "epoch": 0.5749486652977412, "grad_norm": 5.429224491119385, "learning_rate": 4.315438473203956e-05, "loss": 0.1522, "step": 840 }, { "epoch": 0.5756331279945243, "grad_norm": 5.712803840637207, "learning_rate": 4.304007504699118e-05, "loss": 0.1814, "step": 841 }, { "epoch": 0.5763175906913073, "grad_norm": 1.9854469299316406, "learning_rate": 4.292580244605267e-05, "loss": 0.0449, "step": 842 }, { "epoch": 0.5770020533880903, "grad_norm": 3.262310266494751, "learning_rate": 4.2811567538095174e-05, "loss": 0.0285, "step": 843 }, { "epoch": 0.5776865160848734, "grad_norm": 7.155121326446533, "learning_rate": 4.2697370931789056e-05, "loss": 0.1346, "step": 844 }, { "epoch": 0.5783709787816564, "grad_norm": 1.3192006349563599, "learning_rate": 4.258321323560055e-05, "loss": 0.0363, "step": 845 }, { "epoch": 0.5790554414784395, "grad_norm": 4.3121771812438965, "learning_rate": 4.246909505778862e-05, "loss": 0.0753, "step": 846 }, { "epoch": 0.5797399041752225, "grad_norm": 1.3382045030593872, "learning_rate": 4.235501700640161e-05, "loss": 0.0324, "step": 847 }, { "epoch": 0.5804243668720055, "grad_norm": 4.946536540985107, "learning_rate": 4.224097968927411e-05, "loss": 0.0766, "step": 848 }, { "epoch": 0.5811088295687885, "grad_norm": 2.1124062538146973, "learning_rate": 4.212698371402363e-05, "loss": 0.0168, "step": 849 }, { "epoch": 0.5817932922655715, "grad_norm": 2.6053342819213867, "learning_rate": 4.201302968804746e-05, "loss": 0.0432, "step": 850 }, { "epoch": 0.5824777549623545, "grad_norm": 3.059352159500122, "learning_rate": 4.189911821851927e-05, "loss": 0.0814, "step": 851 }, { "epoch": 0.5831622176591376, "grad_norm": 1.5620914697647095, "learning_rate": 4.1785249912386095e-05, "loss": 0.0104, "step": 852 }, { "epoch": 0.5838466803559206, "grad_norm": 6.243402481079102, "learning_rate": 4.1671425376364905e-05, "loss": 0.0623, "step": 853 }, { "epoch": 0.5845311430527036, "grad_norm": 1.784867763519287, "learning_rate": 4.15576452169395e-05, "loss": 0.0185, "step": 854 }, { "epoch": 0.5852156057494866, "grad_norm": 2.747887134552002, "learning_rate": 4.144391004035719e-05, "loss": 0.1163, "step": 855 }, { "epoch": 0.5859000684462696, "grad_norm": 0.7318354845046997, "learning_rate": 4.133022045262564e-05, "loss": 0.0051, "step": 856 }, { "epoch": 0.5865845311430528, "grad_norm": 4.84039831161499, "learning_rate": 4.121657705950959e-05, "loss": 0.1267, "step": 857 }, { "epoch": 0.5872689938398358, "grad_norm": 3.8754220008850098, "learning_rate": 4.110298046652765e-05, "loss": 0.1779, "step": 858 }, { "epoch": 0.5879534565366188, "grad_norm": 7.82292366027832, "learning_rate": 4.09894312789491e-05, "loss": 0.1853, "step": 859 }, { "epoch": 0.5886379192334018, "grad_norm": 9.495680809020996, "learning_rate": 4.087593010179053e-05, "loss": 0.2598, "step": 860 }, { "epoch": 0.5893223819301848, "grad_norm": 4.373050212860107, "learning_rate": 4.076247753981285e-05, "loss": 0.0486, "step": 861 }, { "epoch": 0.5900068446269678, "grad_norm": 4.443106174468994, "learning_rate": 4.0649074197517837e-05, "loss": 0.1849, "step": 862 }, { "epoch": 0.5906913073237509, "grad_norm": 6.144136905670166, "learning_rate": 4.053572067914509e-05, "loss": 0.1026, "step": 863 }, { "epoch": 0.5913757700205339, "grad_norm": 5.361873626708984, "learning_rate": 4.0422417588668655e-05, "loss": 0.134, "step": 864 }, { "epoch": 0.5920602327173169, "grad_norm": 3.41215181350708, "learning_rate": 4.0309165529793956e-05, "loss": 0.0954, "step": 865 }, { "epoch": 0.5927446954140999, "grad_norm": 7.5463433265686035, "learning_rate": 4.019596510595447e-05, "loss": 0.1487, "step": 866 }, { "epoch": 0.5934291581108829, "grad_norm": 2.0411925315856934, "learning_rate": 4.008281692030859e-05, "loss": 0.1331, "step": 867 }, { "epoch": 0.5941136208076659, "grad_norm": 8.925078392028809, "learning_rate": 3.996972157573629e-05, "loss": 0.1741, "step": 868 }, { "epoch": 0.594798083504449, "grad_norm": 9.078567504882812, "learning_rate": 3.985667967483609e-05, "loss": 0.2276, "step": 869 }, { "epoch": 0.5954825462012321, "grad_norm": 3.8749094009399414, "learning_rate": 3.974369181992169e-05, "loss": 0.0809, "step": 870 }, { "epoch": 0.5961670088980151, "grad_norm": 4.615741729736328, "learning_rate": 3.963075861301886e-05, "loss": 0.0915, "step": 871 }, { "epoch": 0.5968514715947981, "grad_norm": 5.139390468597412, "learning_rate": 3.9517880655862126e-05, "loss": 0.1603, "step": 872 }, { "epoch": 0.5975359342915811, "grad_norm": 2.5255730152130127, "learning_rate": 3.94050585498917e-05, "loss": 0.069, "step": 873 }, { "epoch": 0.5982203969883642, "grad_norm": 6.3253045082092285, "learning_rate": 3.929229289625021e-05, "loss": 0.0509, "step": 874 }, { "epoch": 0.5989048596851472, "grad_norm": 4.453798770904541, "learning_rate": 3.9179584295779416e-05, "loss": 0.0868, "step": 875 }, { "epoch": 0.5995893223819302, "grad_norm": 4.584982395172119, "learning_rate": 3.906693334901717e-05, "loss": 0.1112, "step": 876 }, { "epoch": 0.6002737850787132, "grad_norm": 6.146342754364014, "learning_rate": 3.895434065619404e-05, "loss": 0.2032, "step": 877 }, { "epoch": 0.6009582477754962, "grad_norm": 7.576272964477539, "learning_rate": 3.884180681723033e-05, "loss": 0.0995, "step": 878 }, { "epoch": 0.6016427104722792, "grad_norm": 5.567491054534912, "learning_rate": 3.872933243173263e-05, "loss": 0.0793, "step": 879 }, { "epoch": 0.6023271731690623, "grad_norm": 3.418834686279297, "learning_rate": 3.861691809899084e-05, "loss": 0.0644, "step": 880 }, { "epoch": 0.6030116358658453, "grad_norm": 7.1508684158325195, "learning_rate": 3.850456441797479e-05, "loss": 0.1896, "step": 881 }, { "epoch": 0.6036960985626283, "grad_norm": 11.652444839477539, "learning_rate": 3.839227198733127e-05, "loss": 0.3148, "step": 882 }, { "epoch": 0.6043805612594113, "grad_norm": 2.5976359844207764, "learning_rate": 3.82800414053806e-05, "loss": 0.0178, "step": 883 }, { "epoch": 0.6050650239561944, "grad_norm": 6.422081470489502, "learning_rate": 3.816787327011362e-05, "loss": 0.1186, "step": 884 }, { "epoch": 0.6057494866529775, "grad_norm": 3.251467227935791, "learning_rate": 3.80557681791884e-05, "loss": 0.0737, "step": 885 }, { "epoch": 0.6064339493497605, "grad_norm": 8.956872940063477, "learning_rate": 3.794372672992715e-05, "loss": 0.18, "step": 886 }, { "epoch": 0.6071184120465435, "grad_norm": 8.867412567138672, "learning_rate": 3.78317495193129e-05, "loss": 0.1854, "step": 887 }, { "epoch": 0.6078028747433265, "grad_norm": 2.4603326320648193, "learning_rate": 3.771983714398648e-05, "loss": 0.047, "step": 888 }, { "epoch": 0.6084873374401095, "grad_norm": 2.327357769012451, "learning_rate": 3.760799020024321e-05, "loss": 0.0402, "step": 889 }, { "epoch": 0.6091718001368925, "grad_norm": 6.514804840087891, "learning_rate": 3.749620928402982e-05, "loss": 0.1179, "step": 890 }, { "epoch": 0.6098562628336756, "grad_norm": 9.8722505569458, "learning_rate": 3.738449499094121e-05, "loss": 0.1711, "step": 891 }, { "epoch": 0.6105407255304586, "grad_norm": 5.733887195587158, "learning_rate": 3.7272847916217245e-05, "loss": 0.0902, "step": 892 }, { "epoch": 0.6112251882272416, "grad_norm": 5.701957702636719, "learning_rate": 3.7161268654739736e-05, "loss": 0.0818, "step": 893 }, { "epoch": 0.6119096509240246, "grad_norm": 5.558452606201172, "learning_rate": 3.704975780102907e-05, "loss": 0.0631, "step": 894 }, { "epoch": 0.6125941136208076, "grad_norm": 1.623458981513977, "learning_rate": 3.6938315949241234e-05, "loss": 0.0143, "step": 895 }, { "epoch": 0.6132785763175906, "grad_norm": 6.29697322845459, "learning_rate": 3.682694369316446e-05, "loss": 0.0952, "step": 896 }, { "epoch": 0.6139630390143738, "grad_norm": 0.03705253452062607, "learning_rate": 3.6715641626216245e-05, "loss": 0.0002, "step": 897 }, { "epoch": 0.6146475017111568, "grad_norm": 11.599061012268066, "learning_rate": 3.660441034144003e-05, "loss": 0.1614, "step": 898 }, { "epoch": 0.6153319644079398, "grad_norm": 4.641750812530518, "learning_rate": 3.6493250431502193e-05, "loss": 0.1127, "step": 899 }, { "epoch": 0.6160164271047228, "grad_norm": 5.760473728179932, "learning_rate": 3.638216248868871e-05, "loss": 0.149, "step": 900 }, { "epoch": 0.6167008898015058, "grad_norm": 3.3628504276275635, "learning_rate": 3.627114710490219e-05, "loss": 0.0603, "step": 901 }, { "epoch": 0.6173853524982889, "grad_norm": 2.6191816329956055, "learning_rate": 3.6160204871658564e-05, "loss": 0.0508, "step": 902 }, { "epoch": 0.6180698151950719, "grad_norm": 1.12338387966156, "learning_rate": 3.6049336380084065e-05, "loss": 0.0073, "step": 903 }, { "epoch": 0.6187542778918549, "grad_norm": 2.9982059001922607, "learning_rate": 3.5938542220911935e-05, "loss": 0.0326, "step": 904 }, { "epoch": 0.6194387405886379, "grad_norm": 4.436798095703125, "learning_rate": 3.582782298447943e-05, "loss": 0.0797, "step": 905 }, { "epoch": 0.6201232032854209, "grad_norm": 3.636549234390259, "learning_rate": 3.571717926072454e-05, "loss": 0.0864, "step": 906 }, { "epoch": 0.6208076659822039, "grad_norm": 2.164902687072754, "learning_rate": 3.560661163918294e-05, "loss": 0.0332, "step": 907 }, { "epoch": 0.621492128678987, "grad_norm": 4.381768703460693, "learning_rate": 3.549612070898485e-05, "loss": 0.0581, "step": 908 }, { "epoch": 0.62217659137577, "grad_norm": 5.737043857574463, "learning_rate": 3.5385707058851756e-05, "loss": 0.1442, "step": 909 }, { "epoch": 0.6228610540725531, "grad_norm": 5.50530481338501, "learning_rate": 3.527537127709348e-05, "loss": 0.13, "step": 910 }, { "epoch": 0.6235455167693361, "grad_norm": 8.580819129943848, "learning_rate": 3.516511395160487e-05, "loss": 0.1087, "step": 911 }, { "epoch": 0.6242299794661191, "grad_norm": 4.168587684631348, "learning_rate": 3.505493566986282e-05, "loss": 0.0955, "step": 912 }, { "epoch": 0.6249144421629022, "grad_norm": 3.920548439025879, "learning_rate": 3.4944837018922966e-05, "loss": 0.0966, "step": 913 }, { "epoch": 0.6255989048596852, "grad_norm": 4.267676830291748, "learning_rate": 3.4834818585416735e-05, "loss": 0.0697, "step": 914 }, { "epoch": 0.6262833675564682, "grad_norm": 6.670324325561523, "learning_rate": 3.4724880955548075e-05, "loss": 0.1379, "step": 915 }, { "epoch": 0.6269678302532512, "grad_norm": 6.005004405975342, "learning_rate": 3.461502471509045e-05, "loss": 0.1776, "step": 916 }, { "epoch": 0.6276522929500342, "grad_norm": 4.418410778045654, "learning_rate": 3.4505250449383606e-05, "loss": 0.0327, "step": 917 }, { "epoch": 0.6283367556468172, "grad_norm": 3.255706548690796, "learning_rate": 3.4395558743330546e-05, "loss": 0.1183, "step": 918 }, { "epoch": 0.6290212183436003, "grad_norm": 6.7315449714660645, "learning_rate": 3.428595018139436e-05, "loss": 0.1098, "step": 919 }, { "epoch": 0.6297056810403833, "grad_norm": 9.334846496582031, "learning_rate": 3.417642534759515e-05, "loss": 0.1016, "step": 920 }, { "epoch": 0.6303901437371663, "grad_norm": 4.70497989654541, "learning_rate": 3.406698482550685e-05, "loss": 0.1446, "step": 921 }, { "epoch": 0.6310746064339493, "grad_norm": 3.5420589447021484, "learning_rate": 3.39576291982542e-05, "loss": 0.0789, "step": 922 }, { "epoch": 0.6317590691307323, "grad_norm": 6.970898628234863, "learning_rate": 3.384835904850959e-05, "loss": 0.0578, "step": 923 }, { "epoch": 0.6324435318275154, "grad_norm": 4.871261119842529, "learning_rate": 3.373917495848995e-05, "loss": 0.0765, "step": 924 }, { "epoch": 0.6331279945242985, "grad_norm": 3.662017822265625, "learning_rate": 3.3630077509953725e-05, "loss": 0.1466, "step": 925 }, { "epoch": 0.6338124572210815, "grad_norm": 4.384666442871094, "learning_rate": 3.35210672841976e-05, "loss": 0.1115, "step": 926 }, { "epoch": 0.6344969199178645, "grad_norm": 5.4521989822387695, "learning_rate": 3.3412144862053625e-05, "loss": 0.0739, "step": 927 }, { "epoch": 0.6351813826146475, "grad_norm": 4.014865398406982, "learning_rate": 3.3303310823885947e-05, "loss": 0.0491, "step": 928 }, { "epoch": 0.6358658453114305, "grad_norm": 9.829865455627441, "learning_rate": 3.3194565749587844e-05, "loss": 0.1266, "step": 929 }, { "epoch": 0.6365503080082136, "grad_norm": 2.621732473373413, "learning_rate": 3.308591021857849e-05, "loss": 0.074, "step": 930 }, { "epoch": 0.6372347707049966, "grad_norm": 2.080385684967041, "learning_rate": 3.2977344809800015e-05, "loss": 0.017, "step": 931 }, { "epoch": 0.6379192334017796, "grad_norm": 4.572232723236084, "learning_rate": 3.2868870101714336e-05, "loss": 0.0696, "step": 932 }, { "epoch": 0.6386036960985626, "grad_norm": 4.5740790367126465, "learning_rate": 3.2760486672300114e-05, "loss": 0.1619, "step": 933 }, { "epoch": 0.6392881587953456, "grad_norm": 3.539198398590088, "learning_rate": 3.265219509904961e-05, "loss": 0.0425, "step": 934 }, { "epoch": 0.6399726214921286, "grad_norm": 4.358295917510986, "learning_rate": 3.2543995958965714e-05, "loss": 0.1817, "step": 935 }, { "epoch": 0.6406570841889117, "grad_norm": 4.029088020324707, "learning_rate": 3.2435889828558754e-05, "loss": 0.1733, "step": 936 }, { "epoch": 0.6413415468856948, "grad_norm": 9.92212200164795, "learning_rate": 3.232787728384353e-05, "loss": 0.2783, "step": 937 }, { "epoch": 0.6420260095824778, "grad_norm": 3.882117986679077, "learning_rate": 3.221995890033614e-05, "loss": 0.0739, "step": 938 }, { "epoch": 0.6427104722792608, "grad_norm": 7.787561893463135, "learning_rate": 3.2112135253051035e-05, "loss": 0.1561, "step": 939 }, { "epoch": 0.6433949349760438, "grad_norm": 2.385794162750244, "learning_rate": 3.200440691649782e-05, "loss": 0.0368, "step": 940 }, { "epoch": 0.6440793976728268, "grad_norm": 11.048721313476562, "learning_rate": 3.189677446467832e-05, "loss": 0.1243, "step": 941 }, { "epoch": 0.6447638603696099, "grad_norm": 2.5859270095825195, "learning_rate": 3.178923847108347e-05, "loss": 0.0677, "step": 942 }, { "epoch": 0.6454483230663929, "grad_norm": 2.070570230484009, "learning_rate": 3.1681799508690166e-05, "loss": 0.0514, "step": 943 }, { "epoch": 0.6461327857631759, "grad_norm": 1.8898471593856812, "learning_rate": 3.1574458149958394e-05, "loss": 0.0134, "step": 944 }, { "epoch": 0.6468172484599589, "grad_norm": 2.7468764781951904, "learning_rate": 3.1467214966828025e-05, "loss": 0.0298, "step": 945 }, { "epoch": 0.6475017111567419, "grad_norm": 2.0681867599487305, "learning_rate": 3.136007053071588e-05, "loss": 0.0214, "step": 946 }, { "epoch": 0.648186173853525, "grad_norm": 2.8178818225860596, "learning_rate": 3.125302541251257e-05, "loss": 0.033, "step": 947 }, { "epoch": 0.648870636550308, "grad_norm": 2.2889244556427, "learning_rate": 3.114608018257958e-05, "loss": 0.0297, "step": 948 }, { "epoch": 0.649555099247091, "grad_norm": 3.091566562652588, "learning_rate": 3.1039235410746105e-05, "loss": 0.0267, "step": 949 }, { "epoch": 0.6502395619438741, "grad_norm": 4.0854363441467285, "learning_rate": 3.093249166630615e-05, "loss": 0.0361, "step": 950 }, { "epoch": 0.6509240246406571, "grad_norm": 3.438784122467041, "learning_rate": 3.082584951801533e-05, "loss": 0.0529, "step": 951 }, { "epoch": 0.6516084873374401, "grad_norm": 3.7417876720428467, "learning_rate": 3.0719309534088026e-05, "loss": 0.0543, "step": 952 }, { "epoch": 0.6522929500342232, "grad_norm": 2.690965414047241, "learning_rate": 3.061287228219418e-05, "loss": 0.0197, "step": 953 }, { "epoch": 0.6529774127310062, "grad_norm": 4.358261585235596, "learning_rate": 3.050653832945644e-05, "loss": 0.0297, "step": 954 }, { "epoch": 0.6536618754277892, "grad_norm": 5.5358123779296875, "learning_rate": 3.0400308242446955e-05, "loss": 0.1232, "step": 955 }, { "epoch": 0.6543463381245722, "grad_norm": 5.305462837219238, "learning_rate": 3.029418258718454e-05, "loss": 0.0557, "step": 956 }, { "epoch": 0.6550308008213552, "grad_norm": 2.531890392303467, "learning_rate": 3.0188161929131507e-05, "loss": 0.0626, "step": 957 }, { "epoch": 0.6557152635181382, "grad_norm": 3.8367786407470703, "learning_rate": 3.0082246833190763e-05, "loss": 0.0483, "step": 958 }, { "epoch": 0.6563997262149213, "grad_norm": 4.207793235778809, "learning_rate": 2.9976437863702767e-05, "loss": 0.1237, "step": 959 }, { "epoch": 0.6570841889117043, "grad_norm": 14.015584945678711, "learning_rate": 2.9870735584442435e-05, "loss": 0.1962, "step": 960 }, { "epoch": 0.6577686516084873, "grad_norm": 5.048133373260498, "learning_rate": 2.9765140558616284e-05, "loss": 0.1009, "step": 961 }, { "epoch": 0.6584531143052703, "grad_norm": 4.3501787185668945, "learning_rate": 2.965965334885933e-05, "loss": 0.1151, "step": 962 }, { "epoch": 0.6591375770020534, "grad_norm": 7.980549335479736, "learning_rate": 2.9554274517232156e-05, "loss": 0.1547, "step": 963 }, { "epoch": 0.6598220396988365, "grad_norm": 8.582379341125488, "learning_rate": 2.9449004625217807e-05, "loss": 0.1074, "step": 964 }, { "epoch": 0.6605065023956195, "grad_norm": 3.6728739738464355, "learning_rate": 2.9343844233718954e-05, "loss": 0.0852, "step": 965 }, { "epoch": 0.6611909650924025, "grad_norm": 3.8935229778289795, "learning_rate": 2.9238793903054757e-05, "loss": 0.1484, "step": 966 }, { "epoch": 0.6618754277891855, "grad_norm": 4.079407215118408, "learning_rate": 2.9133854192957998e-05, "loss": 0.0866, "step": 967 }, { "epoch": 0.6625598904859685, "grad_norm": 2.4043686389923096, "learning_rate": 2.902902566257203e-05, "loss": 0.0252, "step": 968 }, { "epoch": 0.6632443531827515, "grad_norm": 3.8497068881988525, "learning_rate": 2.8924308870447786e-05, "loss": 0.1846, "step": 969 }, { "epoch": 0.6639288158795346, "grad_norm": 3.248539447784424, "learning_rate": 2.8819704374540868e-05, "loss": 0.1458, "step": 970 }, { "epoch": 0.6646132785763176, "grad_norm": 5.920546054840088, "learning_rate": 2.8715212732208523e-05, "loss": 0.1163, "step": 971 }, { "epoch": 0.6652977412731006, "grad_norm": 4.5991082191467285, "learning_rate": 2.8610834500206664e-05, "loss": 0.1627, "step": 972 }, { "epoch": 0.6659822039698836, "grad_norm": 3.6878907680511475, "learning_rate": 2.8506570234687013e-05, "loss": 0.1529, "step": 973 }, { "epoch": 0.6666666666666666, "grad_norm": 7.269820213317871, "learning_rate": 2.840242049119392e-05, "loss": 0.1554, "step": 974 }, { "epoch": 0.6673511293634496, "grad_norm": 7.909129619598389, "learning_rate": 2.8298385824661634e-05, "loss": 0.1401, "step": 975 }, { "epoch": 0.6680355920602327, "grad_norm": 4.850073337554932, "learning_rate": 2.8194466789411257e-05, "loss": 0.0896, "step": 976 }, { "epoch": 0.6687200547570158, "grad_norm": 6.143699645996094, "learning_rate": 2.8090663939147676e-05, "loss": 0.1103, "step": 977 }, { "epoch": 0.6694045174537988, "grad_norm": 6.36089563369751, "learning_rate": 2.798697782695683e-05, "loss": 0.1617, "step": 978 }, { "epoch": 0.6700889801505818, "grad_norm": 5.60001277923584, "learning_rate": 2.7883409005302585e-05, "loss": 0.1814, "step": 979 }, { "epoch": 0.6707734428473648, "grad_norm": 3.621752977371216, "learning_rate": 2.7779958026023878e-05, "loss": 0.0709, "step": 980 }, { "epoch": 0.6714579055441479, "grad_norm": 5.735018730163574, "learning_rate": 2.7676625440331755e-05, "loss": 0.1375, "step": 981 }, { "epoch": 0.6721423682409309, "grad_norm": 6.533940315246582, "learning_rate": 2.7573411798806414e-05, "loss": 0.1244, "step": 982 }, { "epoch": 0.6728268309377139, "grad_norm": 4.205816268920898, "learning_rate": 2.74703176513943e-05, "loss": 0.0406, "step": 983 }, { "epoch": 0.6735112936344969, "grad_norm": 3.6509592533111572, "learning_rate": 2.7367343547405222e-05, "loss": 0.1004, "step": 984 }, { "epoch": 0.6741957563312799, "grad_norm": 4.641470432281494, "learning_rate": 2.726449003550924e-05, "loss": 0.1116, "step": 985 }, { "epoch": 0.6748802190280629, "grad_norm": 3.8386964797973633, "learning_rate": 2.716175766373401e-05, "loss": 0.0559, "step": 986 }, { "epoch": 0.675564681724846, "grad_norm": 5.10567045211792, "learning_rate": 2.705914697946164e-05, "loss": 0.1083, "step": 987 }, { "epoch": 0.676249144421629, "grad_norm": 4.513882637023926, "learning_rate": 2.6956658529425893e-05, "loss": 0.1128, "step": 988 }, { "epoch": 0.676933607118412, "grad_norm": 2.410376787185669, "learning_rate": 2.6854292859709218e-05, "loss": 0.0932, "step": 989 }, { "epoch": 0.6776180698151951, "grad_norm": 4.988163471221924, "learning_rate": 2.6752050515739873e-05, "loss": 0.0754, "step": 990 }, { "epoch": 0.6783025325119781, "grad_norm": 4.610032558441162, "learning_rate": 2.6649932042288995e-05, "loss": 0.0944, "step": 991 }, { "epoch": 0.6789869952087612, "grad_norm": 1.7913050651550293, "learning_rate": 2.654793798346775e-05, "loss": 0.0337, "step": 992 }, { "epoch": 0.6796714579055442, "grad_norm": 2.1391892433166504, "learning_rate": 2.6446068882724368e-05, "loss": 0.0313, "step": 993 }, { "epoch": 0.6803559206023272, "grad_norm": 1.932131052017212, "learning_rate": 2.6344325282841248e-05, "loss": 0.0876, "step": 994 }, { "epoch": 0.6810403832991102, "grad_norm": 1.5800524950027466, "learning_rate": 2.6242707725932126e-05, "loss": 0.0296, "step": 995 }, { "epoch": 0.6817248459958932, "grad_norm": 1.7992970943450928, "learning_rate": 2.6141216753439114e-05, "loss": 0.0132, "step": 996 }, { "epoch": 0.6824093086926762, "grad_norm": 8.021644592285156, "learning_rate": 2.6039852906129934e-05, "loss": 0.1523, "step": 997 }, { "epoch": 0.6830937713894593, "grad_norm": 0.016920212656259537, "learning_rate": 2.593861672409484e-05, "loss": 0.0001, "step": 998 }, { "epoch": 0.6837782340862423, "grad_norm": 0.19122359156608582, "learning_rate": 2.5837508746743955e-05, "loss": 0.0013, "step": 999 }, { "epoch": 0.6844626967830253, "grad_norm": 0.31367063522338867, "learning_rate": 2.5736529512804235e-05, "loss": 0.0021, "step": 1000 }, { "epoch": 0.6851471594798083, "grad_norm": 6.027254581451416, "learning_rate": 2.56356795603167e-05, "loss": 0.0294, "step": 1001 }, { "epoch": 0.6858316221765913, "grad_norm": 7.237560749053955, "learning_rate": 2.5534959426633497e-05, "loss": 0.1166, "step": 1002 }, { "epoch": 0.6865160848733745, "grad_norm": 2.598755359649658, "learning_rate": 2.5434369648415092e-05, "loss": 0.0681, "step": 1003 }, { "epoch": 0.6872005475701575, "grad_norm": 6.923910617828369, "learning_rate": 2.5333910761627356e-05, "loss": 0.0651, "step": 1004 }, { "epoch": 0.6878850102669405, "grad_norm": 2.905336380004883, "learning_rate": 2.523358330153881e-05, "loss": 0.0735, "step": 1005 }, { "epoch": 0.6885694729637235, "grad_norm": 3.7238333225250244, "learning_rate": 2.5133387802717583e-05, "loss": 0.0651, "step": 1006 }, { "epoch": 0.6892539356605065, "grad_norm": 4.9099884033203125, "learning_rate": 2.5033324799028814e-05, "loss": 0.0637, "step": 1007 }, { "epoch": 0.6899383983572895, "grad_norm": 3.0908186435699463, "learning_rate": 2.4933394823631596e-05, "loss": 0.0529, "step": 1008 }, { "epoch": 0.6906228610540726, "grad_norm": 6.304473400115967, "learning_rate": 2.4833598408976218e-05, "loss": 0.0896, "step": 1009 }, { "epoch": 0.6913073237508556, "grad_norm": 5.631266117095947, "learning_rate": 2.4733936086801394e-05, "loss": 0.1191, "step": 1010 }, { "epoch": 0.6919917864476386, "grad_norm": 4.170117378234863, "learning_rate": 2.4634408388131253e-05, "loss": 0.1422, "step": 1011 }, { "epoch": 0.6926762491444216, "grad_norm": 7.255228042602539, "learning_rate": 2.4535015843272723e-05, "loss": 0.1914, "step": 1012 }, { "epoch": 0.6933607118412046, "grad_norm": 5.757086277008057, "learning_rate": 2.4435758981812544e-05, "loss": 0.0696, "step": 1013 }, { "epoch": 0.6940451745379876, "grad_norm": 4.210714817047119, "learning_rate": 2.4336638332614515e-05, "loss": 0.1669, "step": 1014 }, { "epoch": 0.6947296372347707, "grad_norm": 8.958159446716309, "learning_rate": 2.423765442381666e-05, "loss": 0.1769, "step": 1015 }, { "epoch": 0.6954140999315537, "grad_norm": 3.73427677154541, "learning_rate": 2.413880778282842e-05, "loss": 0.0795, "step": 1016 }, { "epoch": 0.6960985626283368, "grad_norm": 7.747519016265869, "learning_rate": 2.4040098936327833e-05, "loss": 0.193, "step": 1017 }, { "epoch": 0.6967830253251198, "grad_norm": 4.145031929016113, "learning_rate": 2.3941528410258796e-05, "loss": 0.0895, "step": 1018 }, { "epoch": 0.6974674880219028, "grad_norm": 6.655278205871582, "learning_rate": 2.3843096729828085e-05, "loss": 0.0917, "step": 1019 }, { "epoch": 0.6981519507186859, "grad_norm": 4.262378215789795, "learning_rate": 2.3744804419502792e-05, "loss": 0.1147, "step": 1020 }, { "epoch": 0.6988364134154689, "grad_norm": 7.860165119171143, "learning_rate": 2.3646652003007347e-05, "loss": 0.1795, "step": 1021 }, { "epoch": 0.6995208761122519, "grad_norm": 3.5030057430267334, "learning_rate": 2.3548640003320806e-05, "loss": 0.1269, "step": 1022 }, { "epoch": 0.7002053388090349, "grad_norm": 2.6681675910949707, "learning_rate": 2.345076894267405e-05, "loss": 0.1093, "step": 1023 }, { "epoch": 0.7008898015058179, "grad_norm": 11.209885597229004, "learning_rate": 2.3353039342547017e-05, "loss": 0.1214, "step": 1024 }, { "epoch": 0.7015742642026009, "grad_norm": 3.799726724624634, "learning_rate": 2.3255451723665872e-05, "loss": 0.055, "step": 1025 }, { "epoch": 0.702258726899384, "grad_norm": 7.8830766677856445, "learning_rate": 2.3158006606000344e-05, "loss": 0.1769, "step": 1026 }, { "epoch": 0.702943189596167, "grad_norm": 11.71888256072998, "learning_rate": 2.3060704508760804e-05, "loss": 0.3058, "step": 1027 }, { "epoch": 0.70362765229295, "grad_norm": 5.45119047164917, "learning_rate": 2.2963545950395633e-05, "loss": 0.1329, "step": 1028 }, { "epoch": 0.704312114989733, "grad_norm": 4.824770927429199, "learning_rate": 2.2866531448588358e-05, "loss": 0.0794, "step": 1029 }, { "epoch": 0.7049965776865161, "grad_norm": 4.187606334686279, "learning_rate": 2.2769661520254954e-05, "loss": 0.0984, "step": 1030 }, { "epoch": 0.7056810403832992, "grad_norm": 2.6799237728118896, "learning_rate": 2.267293668154114e-05, "loss": 0.0299, "step": 1031 }, { "epoch": 0.7063655030800822, "grad_norm": 2.5534913539886475, "learning_rate": 2.2576357447819423e-05, "loss": 0.0476, "step": 1032 }, { "epoch": 0.7070499657768652, "grad_norm": 4.994041442871094, "learning_rate": 2.247992433368663e-05, "loss": 0.0562, "step": 1033 }, { "epoch": 0.7077344284736482, "grad_norm": 5.017084121704102, "learning_rate": 2.2383637852960963e-05, "loss": 0.0732, "step": 1034 }, { "epoch": 0.7084188911704312, "grad_norm": 6.5374369621276855, "learning_rate": 2.228749851867934e-05, "loss": 0.1232, "step": 1035 }, { "epoch": 0.7091033538672142, "grad_norm": 5.451394557952881, "learning_rate": 2.219150684309463e-05, "loss": 0.0667, "step": 1036 }, { "epoch": 0.7097878165639973, "grad_norm": 2.7644152641296387, "learning_rate": 2.2095663337672965e-05, "loss": 0.0768, "step": 1037 }, { "epoch": 0.7104722792607803, "grad_norm": 3.3176193237304688, "learning_rate": 2.1999968513090975e-05, "loss": 0.0381, "step": 1038 }, { "epoch": 0.7111567419575633, "grad_norm": 2.2982053756713867, "learning_rate": 2.1904422879233132e-05, "loss": 0.073, "step": 1039 }, { "epoch": 0.7118412046543463, "grad_norm": 2.5246763229370117, "learning_rate": 2.1809026945188882e-05, "loss": 0.0547, "step": 1040 }, { "epoch": 0.7125256673511293, "grad_norm": 3.3582746982574463, "learning_rate": 2.1713781219250155e-05, "loss": 0.0867, "step": 1041 }, { "epoch": 0.7132101300479123, "grad_norm": 6.711006164550781, "learning_rate": 2.161868620890846e-05, "loss": 0.239, "step": 1042 }, { "epoch": 0.7138945927446955, "grad_norm": 2.486917018890381, "learning_rate": 2.152374242085226e-05, "loss": 0.025, "step": 1043 }, { "epoch": 0.7145790554414785, "grad_norm": 8.284025192260742, "learning_rate": 2.1428950360964345e-05, "loss": 0.1523, "step": 1044 }, { "epoch": 0.7152635181382615, "grad_norm": 5.581010818481445, "learning_rate": 2.133431053431893e-05, "loss": 0.0925, "step": 1045 }, { "epoch": 0.7159479808350445, "grad_norm": 2.743018627166748, "learning_rate": 2.1239823445179235e-05, "loss": 0.0502, "step": 1046 }, { "epoch": 0.7166324435318275, "grad_norm": 2.6280624866485596, "learning_rate": 2.1145489596994574e-05, "loss": 0.0375, "step": 1047 }, { "epoch": 0.7173169062286106, "grad_norm": 3.1133267879486084, "learning_rate": 2.105130949239777e-05, "loss": 0.0344, "step": 1048 }, { "epoch": 0.7180013689253936, "grad_norm": 1.8953214883804321, "learning_rate": 2.095728363320248e-05, "loss": 0.0112, "step": 1049 }, { "epoch": 0.7186858316221766, "grad_norm": 2.414328098297119, "learning_rate": 2.08634125204005e-05, "loss": 0.0153, "step": 1050 }, { "epoch": 0.7193702943189596, "grad_norm": 3.7400143146514893, "learning_rate": 2.076969665415908e-05, "loss": 0.0502, "step": 1051 }, { "epoch": 0.7200547570157426, "grad_norm": 5.873078346252441, "learning_rate": 2.0676136533818356e-05, "loss": 0.144, "step": 1052 }, { "epoch": 0.7207392197125256, "grad_norm": 2.2231340408325195, "learning_rate": 2.058273265788848e-05, "loss": 0.0143, "step": 1053 }, { "epoch": 0.7214236824093087, "grad_norm": 2.044334650039673, "learning_rate": 2.048948552404723e-05, "loss": 0.032, "step": 1054 }, { "epoch": 0.7221081451060917, "grad_norm": 3.9261324405670166, "learning_rate": 2.0396395629137153e-05, "loss": 0.0521, "step": 1055 }, { "epoch": 0.7227926078028748, "grad_norm": 2.169109344482422, "learning_rate": 2.0303463469163003e-05, "loss": 0.0313, "step": 1056 }, { "epoch": 0.7234770704996578, "grad_norm": 5.896158695220947, "learning_rate": 2.0210689539289092e-05, "loss": 0.076, "step": 1057 }, { "epoch": 0.7241615331964408, "grad_norm": 5.363951206207275, "learning_rate": 2.0118074333836622e-05, "loss": 0.0411, "step": 1058 }, { "epoch": 0.7248459958932238, "grad_norm": 4.067416667938232, "learning_rate": 2.0025618346281132e-05, "loss": 0.0584, "step": 1059 }, { "epoch": 0.7255304585900069, "grad_norm": 4.5005717277526855, "learning_rate": 1.9933322069249756e-05, "loss": 0.1307, "step": 1060 }, { "epoch": 0.7262149212867899, "grad_norm": 3.6448659896850586, "learning_rate": 1.9841185994518657e-05, "loss": 0.1009, "step": 1061 }, { "epoch": 0.7268993839835729, "grad_norm": 4.878888130187988, "learning_rate": 1.9749210613010434e-05, "loss": 0.0891, "step": 1062 }, { "epoch": 0.7275838466803559, "grad_norm": 5.274069309234619, "learning_rate": 1.9657396414791446e-05, "loss": 0.1982, "step": 1063 }, { "epoch": 0.7282683093771389, "grad_norm": 6.308216571807861, "learning_rate": 1.9565743889069226e-05, "loss": 0.1653, "step": 1064 }, { "epoch": 0.728952772073922, "grad_norm": 10.600144386291504, "learning_rate": 1.947425352418994e-05, "loss": 0.1699, "step": 1065 }, { "epoch": 0.729637234770705, "grad_norm": 3.1227612495422363, "learning_rate": 1.938292580763561e-05, "loss": 0.0779, "step": 1066 }, { "epoch": 0.730321697467488, "grad_norm": 12.401570320129395, "learning_rate": 1.9291761226021744e-05, "loss": 0.1915, "step": 1067 }, { "epoch": 0.731006160164271, "grad_norm": 4.102994441986084, "learning_rate": 1.9200760265094558e-05, "loss": 0.084, "step": 1068 }, { "epoch": 0.731690622861054, "grad_norm": 4.432417392730713, "learning_rate": 1.9109923409728493e-05, "loss": 0.1245, "step": 1069 }, { "epoch": 0.7323750855578371, "grad_norm": 6.921361923217773, "learning_rate": 1.901925114392357e-05, "loss": 0.1767, "step": 1070 }, { "epoch": 0.7330595482546202, "grad_norm": 6.960573673248291, "learning_rate": 1.8928743950802864e-05, "loss": 0.1388, "step": 1071 }, { "epoch": 0.7337440109514032, "grad_norm": 8.243216514587402, "learning_rate": 1.8838402312609864e-05, "loss": 0.2951, "step": 1072 }, { "epoch": 0.7344284736481862, "grad_norm": 3.1551766395568848, "learning_rate": 1.8748226710706036e-05, "loss": 0.0381, "step": 1073 }, { "epoch": 0.7351129363449692, "grad_norm": 1.6338300704956055, "learning_rate": 1.8658217625568025e-05, "loss": 0.0174, "step": 1074 }, { "epoch": 0.7357973990417522, "grad_norm": 10.761858940124512, "learning_rate": 1.856837553678535e-05, "loss": 0.259, "step": 1075 }, { "epoch": 0.7364818617385352, "grad_norm": 5.4559478759765625, "learning_rate": 1.847870092305773e-05, "loss": 0.1336, "step": 1076 }, { "epoch": 0.7371663244353183, "grad_norm": 3.761915445327759, "learning_rate": 1.838919426219244e-05, "loss": 0.0442, "step": 1077 }, { "epoch": 0.7378507871321013, "grad_norm": 2.5736138820648193, "learning_rate": 1.8299856031101976e-05, "loss": 0.0654, "step": 1078 }, { "epoch": 0.7385352498288843, "grad_norm": 9.834482192993164, "learning_rate": 1.8210686705801333e-05, "loss": 0.148, "step": 1079 }, { "epoch": 0.7392197125256673, "grad_norm": 5.04593563079834, "learning_rate": 1.8121686761405555e-05, "loss": 0.1248, "step": 1080 }, { "epoch": 0.7399041752224503, "grad_norm": 6.0493316650390625, "learning_rate": 1.803285667212719e-05, "loss": 0.0628, "step": 1081 }, { "epoch": 0.7405886379192334, "grad_norm": 6.558478832244873, "learning_rate": 1.7944196911273758e-05, "loss": 0.1408, "step": 1082 }, { "epoch": 0.7412731006160165, "grad_norm": 4.060093879699707, "learning_rate": 1.7855707951245198e-05, "loss": 0.0814, "step": 1083 }, { "epoch": 0.7419575633127995, "grad_norm": 3.5769429206848145, "learning_rate": 1.7767390263531463e-05, "loss": 0.0793, "step": 1084 }, { "epoch": 0.7426420260095825, "grad_norm": 6.162482261657715, "learning_rate": 1.767924431870981e-05, "loss": 0.0872, "step": 1085 }, { "epoch": 0.7433264887063655, "grad_norm": 1.0451451539993286, "learning_rate": 1.7591270586442516e-05, "loss": 0.0078, "step": 1086 }, { "epoch": 0.7440109514031485, "grad_norm": 5.467753887176514, "learning_rate": 1.7503469535474213e-05, "loss": 0.0998, "step": 1087 }, { "epoch": 0.7446954140999316, "grad_norm": 7.59031867980957, "learning_rate": 1.7415841633629467e-05, "loss": 0.1567, "step": 1088 }, { "epoch": 0.7453798767967146, "grad_norm": 5.697047233581543, "learning_rate": 1.7328387347810244e-05, "loss": 0.114, "step": 1089 }, { "epoch": 0.7460643394934976, "grad_norm": 5.989294052124023, "learning_rate": 1.724110714399347e-05, "loss": 0.0908, "step": 1090 }, { "epoch": 0.7467488021902806, "grad_norm": 4.094980716705322, "learning_rate": 1.7154001487228477e-05, "loss": 0.1092, "step": 1091 }, { "epoch": 0.7474332648870636, "grad_norm": 4.660505294799805, "learning_rate": 1.7067070841634646e-05, "loss": 0.0734, "step": 1092 }, { "epoch": 0.7481177275838466, "grad_norm": 3.5471725463867188, "learning_rate": 1.698031567039879e-05, "loss": 0.0566, "step": 1093 }, { "epoch": 0.7488021902806297, "grad_norm": 4.905546188354492, "learning_rate": 1.6893736435772772e-05, "loss": 0.0851, "step": 1094 }, { "epoch": 0.7494866529774127, "grad_norm": 2.9793856143951416, "learning_rate": 1.680733359907104e-05, "loss": 0.0648, "step": 1095 }, { "epoch": 0.7501711156741958, "grad_norm": 3.7390031814575195, "learning_rate": 1.672110762066811e-05, "loss": 0.0508, "step": 1096 }, { "epoch": 0.7508555783709788, "grad_norm": 2.6512210369110107, "learning_rate": 1.6635058959996237e-05, "loss": 0.0164, "step": 1097 }, { "epoch": 0.7515400410677618, "grad_norm": 8.550832748413086, "learning_rate": 1.654918807554277e-05, "loss": 0.1032, "step": 1098 }, { "epoch": 0.7522245037645449, "grad_norm": 2.9233238697052, "learning_rate": 1.6463495424847947e-05, "loss": 0.0241, "step": 1099 }, { "epoch": 0.7529089664613279, "grad_norm": 3.0056545734405518, "learning_rate": 1.637798146450224e-05, "loss": 0.0108, "step": 1100 }, { "epoch": 0.7535934291581109, "grad_norm": 3.3842074871063232, "learning_rate": 1.6292646650144072e-05, "loss": 0.0436, "step": 1101 }, { "epoch": 0.7542778918548939, "grad_norm": 2.765627145767212, "learning_rate": 1.62074914364573e-05, "loss": 0.0259, "step": 1102 }, { "epoch": 0.7549623545516769, "grad_norm": 6.259449005126953, "learning_rate": 1.612251627716886e-05, "loss": 0.0458, "step": 1103 }, { "epoch": 0.75564681724846, "grad_norm": 3.271040916442871, "learning_rate": 1.6037721625046264e-05, "loss": 0.0179, "step": 1104 }, { "epoch": 0.756331279945243, "grad_norm": 2.5520167350769043, "learning_rate": 1.5953107931895332e-05, "loss": 0.0172, "step": 1105 }, { "epoch": 0.757015742642026, "grad_norm": 1.6350395679473877, "learning_rate": 1.5868675648557567e-05, "loss": 0.0092, "step": 1106 }, { "epoch": 0.757700205338809, "grad_norm": 2.5286407470703125, "learning_rate": 1.578442522490799e-05, "loss": 0.0212, "step": 1107 }, { "epoch": 0.758384668035592, "grad_norm": 2.6816582679748535, "learning_rate": 1.5700357109852576e-05, "loss": 0.0473, "step": 1108 }, { "epoch": 0.759069130732375, "grad_norm": 5.646703243255615, "learning_rate": 1.561647175132591e-05, "loss": 0.1707, "step": 1109 }, { "epoch": 0.7597535934291582, "grad_norm": 2.7315192222595215, "learning_rate": 1.553276959628887e-05, "loss": 0.0802, "step": 1110 }, { "epoch": 0.7604380561259412, "grad_norm": 4.032069683074951, "learning_rate": 1.544925109072607e-05, "loss": 0.0981, "step": 1111 }, { "epoch": 0.7611225188227242, "grad_norm": 4.174098968505859, "learning_rate": 1.536591667964372e-05, "loss": 0.0809, "step": 1112 }, { "epoch": 0.7618069815195072, "grad_norm": 7.409214496612549, "learning_rate": 1.5282766807067055e-05, "loss": 0.1933, "step": 1113 }, { "epoch": 0.7624914442162902, "grad_norm": 4.312411308288574, "learning_rate": 1.5199801916038064e-05, "loss": 0.1276, "step": 1114 }, { "epoch": 0.7631759069130732, "grad_norm": 3.014047384262085, "learning_rate": 1.5117022448613116e-05, "loss": 0.0735, "step": 1115 }, { "epoch": 0.7638603696098563, "grad_norm": 5.251741409301758, "learning_rate": 1.5034428845860599e-05, "loss": 0.1973, "step": 1116 }, { "epoch": 0.7645448323066393, "grad_norm": 4.350397109985352, "learning_rate": 1.4952021547858546e-05, "loss": 0.0377, "step": 1117 }, { "epoch": 0.7652292950034223, "grad_norm": 4.746190547943115, "learning_rate": 1.4869800993692385e-05, "loss": 0.1168, "step": 1118 }, { "epoch": 0.7659137577002053, "grad_norm": 7.891136169433594, "learning_rate": 1.4787767621452426e-05, "loss": 0.086, "step": 1119 }, { "epoch": 0.7665982203969883, "grad_norm": 3.1727960109710693, "learning_rate": 1.4705921868231726e-05, "loss": 0.0537, "step": 1120 }, { "epoch": 0.7672826830937713, "grad_norm": 4.352911949157715, "learning_rate": 1.4624264170123608e-05, "loss": 0.0967, "step": 1121 }, { "epoch": 0.7679671457905544, "grad_norm": 5.02450704574585, "learning_rate": 1.4542794962219402e-05, "loss": 0.2083, "step": 1122 }, { "epoch": 0.7686516084873375, "grad_norm": 6.078307628631592, "learning_rate": 1.4461514678606136e-05, "loss": 0.1215, "step": 1123 }, { "epoch": 0.7693360711841205, "grad_norm": 2.62972354888916, "learning_rate": 1.4380423752364186e-05, "loss": 0.0674, "step": 1124 }, { "epoch": 0.7700205338809035, "grad_norm": 5.867610454559326, "learning_rate": 1.4299522615564982e-05, "loss": 0.1102, "step": 1125 }, { "epoch": 0.7707049965776865, "grad_norm": 3.369626045227051, "learning_rate": 1.4218811699268753e-05, "loss": 0.0734, "step": 1126 }, { "epoch": 0.7713894592744696, "grad_norm": 2.6011111736297607, "learning_rate": 1.4138291433522144e-05, "loss": 0.0254, "step": 1127 }, { "epoch": 0.7720739219712526, "grad_norm": 6.0411787033081055, "learning_rate": 1.4057962247355988e-05, "loss": 0.0795, "step": 1128 }, { "epoch": 0.7727583846680356, "grad_norm": 4.8614397048950195, "learning_rate": 1.3977824568782993e-05, "loss": 0.1084, "step": 1129 }, { "epoch": 0.7734428473648186, "grad_norm": 5.7235846519470215, "learning_rate": 1.3897878824795469e-05, "loss": 0.1288, "step": 1130 }, { "epoch": 0.7741273100616016, "grad_norm": 8.10129165649414, "learning_rate": 1.38181254413631e-05, "loss": 0.1466, "step": 1131 }, { "epoch": 0.7748117727583846, "grad_norm": 4.349442958831787, "learning_rate": 1.3738564843430524e-05, "loss": 0.1028, "step": 1132 }, { "epoch": 0.7754962354551677, "grad_norm": 2.4699525833129883, "learning_rate": 1.3659197454915285e-05, "loss": 0.1179, "step": 1133 }, { "epoch": 0.7761806981519507, "grad_norm": 2.8967196941375732, "learning_rate": 1.3580023698705401e-05, "loss": 0.0415, "step": 1134 }, { "epoch": 0.7768651608487337, "grad_norm": 2.4821252822875977, "learning_rate": 1.3501043996657176e-05, "loss": 0.0853, "step": 1135 }, { "epoch": 0.7775496235455168, "grad_norm": 6.767683029174805, "learning_rate": 1.3422258769592965e-05, "loss": 0.0802, "step": 1136 }, { "epoch": 0.7782340862422998, "grad_norm": 5.227594375610352, "learning_rate": 1.33436684372989e-05, "loss": 0.1193, "step": 1137 }, { "epoch": 0.7789185489390829, "grad_norm": 2.671928882598877, "learning_rate": 1.3265273418522661e-05, "loss": 0.0402, "step": 1138 }, { "epoch": 0.7796030116358659, "grad_norm": 3.206073045730591, "learning_rate": 1.318707413097131e-05, "loss": 0.0632, "step": 1139 }, { "epoch": 0.7802874743326489, "grad_norm": 3.4504432678222656, "learning_rate": 1.3109070991308903e-05, "loss": 0.0317, "step": 1140 }, { "epoch": 0.7809719370294319, "grad_norm": 6.2530293464660645, "learning_rate": 1.3031264415154476e-05, "loss": 0.1623, "step": 1141 }, { "epoch": 0.7816563997262149, "grad_norm": 3.5220930576324463, "learning_rate": 1.2953654817079669e-05, "loss": 0.0261, "step": 1142 }, { "epoch": 0.7823408624229979, "grad_norm": 1.8763870000839233, "learning_rate": 1.2876242610606575e-05, "loss": 0.0414, "step": 1143 }, { "epoch": 0.783025325119781, "grad_norm": 2.2394182682037354, "learning_rate": 1.2799028208205615e-05, "loss": 0.0133, "step": 1144 }, { "epoch": 0.783709787816564, "grad_norm": 7.824526309967041, "learning_rate": 1.2722012021293133e-05, "loss": 0.1506, "step": 1145 }, { "epoch": 0.784394250513347, "grad_norm": 3.7418553829193115, "learning_rate": 1.2645194460229454e-05, "loss": 0.0329, "step": 1146 }, { "epoch": 0.78507871321013, "grad_norm": 1.701297402381897, "learning_rate": 1.2568575934316518e-05, "loss": 0.0087, "step": 1147 }, { "epoch": 0.785763175906913, "grad_norm": 5.602641582489014, "learning_rate": 1.2492156851795766e-05, "loss": 0.113, "step": 1148 }, { "epoch": 0.7864476386036962, "grad_norm": 3.2019312381744385, "learning_rate": 1.2415937619845974e-05, "loss": 0.0473, "step": 1149 }, { "epoch": 0.7871321013004792, "grad_norm": 2.445730447769165, "learning_rate": 1.233991864458105e-05, "loss": 0.031, "step": 1150 }, { "epoch": 0.7878165639972622, "grad_norm": 5.247284889221191, "learning_rate": 1.2264100331047879e-05, "loss": 0.1012, "step": 1151 }, { "epoch": 0.7885010266940452, "grad_norm": 1.6202764511108398, "learning_rate": 1.2188483083224238e-05, "loss": 0.0139, "step": 1152 }, { "epoch": 0.7891854893908282, "grad_norm": 8.060858726501465, "learning_rate": 1.2113067304016468e-05, "loss": 0.1643, "step": 1153 }, { "epoch": 0.7898699520876112, "grad_norm": 6.43533992767334, "learning_rate": 1.2037853395257537e-05, "loss": 0.1167, "step": 1154 }, { "epoch": 0.7905544147843943, "grad_norm": 2.413408041000366, "learning_rate": 1.1962841757704757e-05, "loss": 0.0147, "step": 1155 }, { "epoch": 0.7912388774811773, "grad_norm": 3.278813600540161, "learning_rate": 1.1888032791037696e-05, "loss": 0.018, "step": 1156 }, { "epoch": 0.7919233401779603, "grad_norm": 1.5799492597579956, "learning_rate": 1.1813426893856045e-05, "loss": 0.0101, "step": 1157 }, { "epoch": 0.7926078028747433, "grad_norm": 2.6024584770202637, "learning_rate": 1.173902446367749e-05, "loss": 0.0756, "step": 1158 }, { "epoch": 0.7932922655715263, "grad_norm": 8.351832389831543, "learning_rate": 1.1664825896935589e-05, "loss": 0.2167, "step": 1159 }, { "epoch": 0.7939767282683093, "grad_norm": 4.845625877380371, "learning_rate": 1.1590831588977708e-05, "loss": 0.0723, "step": 1160 }, { "epoch": 0.7946611909650924, "grad_norm": 4.913459777832031, "learning_rate": 1.1517041934062834e-05, "loss": 0.0375, "step": 1161 }, { "epoch": 0.7953456536618754, "grad_norm": 7.155594348907471, "learning_rate": 1.144345732535952e-05, "loss": 0.0995, "step": 1162 }, { "epoch": 0.7960301163586585, "grad_norm": 9.2155122756958, "learning_rate": 1.1370078154943798e-05, "loss": 0.0956, "step": 1163 }, { "epoch": 0.7967145790554415, "grad_norm": 2.4341330528259277, "learning_rate": 1.129690481379705e-05, "loss": 0.0602, "step": 1164 }, { "epoch": 0.7973990417522245, "grad_norm": 6.648326396942139, "learning_rate": 1.1223937691804021e-05, "loss": 0.0846, "step": 1165 }, { "epoch": 0.7980835044490076, "grad_norm": 4.48230504989624, "learning_rate": 1.115117717775056e-05, "loss": 0.0909, "step": 1166 }, { "epoch": 0.7987679671457906, "grad_norm": 7.736631870269775, "learning_rate": 1.1078623659321768e-05, "loss": 0.0734, "step": 1167 }, { "epoch": 0.7994524298425736, "grad_norm": 9.086450576782227, "learning_rate": 1.1006277523099768e-05, "loss": 0.134, "step": 1168 }, { "epoch": 0.8001368925393566, "grad_norm": 3.147892713546753, "learning_rate": 1.0934139154561713e-05, "loss": 0.1121, "step": 1169 }, { "epoch": 0.8008213552361396, "grad_norm": 7.7356038093566895, "learning_rate": 1.0862208938077706e-05, "loss": 0.1176, "step": 1170 }, { "epoch": 0.8015058179329226, "grad_norm": 7.178472995758057, "learning_rate": 1.0790487256908827e-05, "loss": 0.1662, "step": 1171 }, { "epoch": 0.8021902806297057, "grad_norm": 8.287479400634766, "learning_rate": 1.0718974493204926e-05, "loss": 0.1814, "step": 1172 }, { "epoch": 0.8028747433264887, "grad_norm": 3.273765802383423, "learning_rate": 1.064767102800282e-05, "loss": 0.1138, "step": 1173 }, { "epoch": 0.8035592060232717, "grad_norm": 2.807312488555908, "learning_rate": 1.0576577241224e-05, "loss": 0.1179, "step": 1174 }, { "epoch": 0.8042436687200547, "grad_norm": 5.2055792808532715, "learning_rate": 1.050569351167286e-05, "loss": 0.1018, "step": 1175 }, { "epoch": 0.8049281314168378, "grad_norm": 4.341903209686279, "learning_rate": 1.0435020217034492e-05, "loss": 0.0624, "step": 1176 }, { "epoch": 0.8056125941136209, "grad_norm": 6.928586483001709, "learning_rate": 1.0364557733872744e-05, "loss": 0.1406, "step": 1177 }, { "epoch": 0.8062970568104039, "grad_norm": 7.298317909240723, "learning_rate": 1.0294306437628248e-05, "loss": 0.1091, "step": 1178 }, { "epoch": 0.8069815195071869, "grad_norm": 2.441014051437378, "learning_rate": 1.0224266702616352e-05, "loss": 0.0193, "step": 1179 }, { "epoch": 0.8076659822039699, "grad_norm": 5.274850845336914, "learning_rate": 1.015443890202517e-05, "loss": 0.0786, "step": 1180 }, { "epoch": 0.8083504449007529, "grad_norm": 8.402663230895996, "learning_rate": 1.0084823407913563e-05, "loss": 0.1163, "step": 1181 }, { "epoch": 0.8090349075975359, "grad_norm": 4.211559772491455, "learning_rate": 1.001542059120919e-05, "loss": 0.0896, "step": 1182 }, { "epoch": 0.809719370294319, "grad_norm": 8.29397964477539, "learning_rate": 9.946230821706493e-06, "loss": 0.1418, "step": 1183 }, { "epoch": 0.810403832991102, "grad_norm": 4.9345574378967285, "learning_rate": 9.877254468064806e-06, "loss": 0.1191, "step": 1184 }, { "epoch": 0.811088295687885, "grad_norm": 4.524342060089111, "learning_rate": 9.808491897806243e-06, "loss": 0.1272, "step": 1185 }, { "epoch": 0.811772758384668, "grad_norm": 8.642858505249023, "learning_rate": 9.739943477313917e-06, "loss": 0.1638, "step": 1186 }, { "epoch": 0.812457221081451, "grad_norm": 6.554798126220703, "learning_rate": 9.671609571829854e-06, "loss": 0.1698, "step": 1187 }, { "epoch": 0.813141683778234, "grad_norm": 4.798029899597168, "learning_rate": 9.603490545453092e-06, "loss": 0.1778, "step": 1188 }, { "epoch": 0.8138261464750172, "grad_norm": 0.3875316083431244, "learning_rate": 9.535586761137761e-06, "loss": 0.0022, "step": 1189 }, { "epoch": 0.8145106091718002, "grad_norm": 2.557605743408203, "learning_rate": 9.467898580691125e-06, "loss": 0.0629, "step": 1190 }, { "epoch": 0.8151950718685832, "grad_norm": 3.230031967163086, "learning_rate": 9.400426364771647e-06, "loss": 0.0558, "step": 1191 }, { "epoch": 0.8158795345653662, "grad_norm": 0.030096231028437614, "learning_rate": 9.333170472887126e-06, "loss": 0.0002, "step": 1192 }, { "epoch": 0.8165639972621492, "grad_norm": 6.761434078216553, "learning_rate": 9.266131263392658e-06, "loss": 0.0807, "step": 1193 }, { "epoch": 0.8172484599589322, "grad_norm": 4.370100498199463, "learning_rate": 9.199309093488878e-06, "loss": 0.0687, "step": 1194 }, { "epoch": 0.8179329226557153, "grad_norm": 4.744657516479492, "learning_rate": 9.132704319219947e-06, "loss": 0.104, "step": 1195 }, { "epoch": 0.8186173853524983, "grad_norm": 3.3174638748168945, "learning_rate": 9.066317295471689e-06, "loss": 0.0443, "step": 1196 }, { "epoch": 0.8193018480492813, "grad_norm": 4.970818042755127, "learning_rate": 9.000148375969741e-06, "loss": 0.1029, "step": 1197 }, { "epoch": 0.8199863107460643, "grad_norm": 2.5866239070892334, "learning_rate": 8.934197913277553e-06, "loss": 0.014, "step": 1198 }, { "epoch": 0.8206707734428473, "grad_norm": 3.277988910675049, "learning_rate": 8.868466258794673e-06, "loss": 0.0407, "step": 1199 }, { "epoch": 0.8213552361396304, "grad_norm": 5.0530805587768555, "learning_rate": 8.802953762754734e-06, "loss": 0.0898, "step": 1200 }, { "epoch": 0.8220396988364134, "grad_norm": 8.34996223449707, "learning_rate": 8.737660774223655e-06, "loss": 0.1077, "step": 1201 }, { "epoch": 0.8227241615331964, "grad_norm": 0.2956388294696808, "learning_rate": 8.672587641097762e-06, "loss": 0.002, "step": 1202 }, { "epoch": 0.8234086242299795, "grad_norm": 1.9702069759368896, "learning_rate": 8.60773471010195e-06, "loss": 0.0157, "step": 1203 }, { "epoch": 0.8240930869267625, "grad_norm": 4.010120391845703, "learning_rate": 8.543102326787812e-06, "loss": 0.0611, "step": 1204 }, { "epoch": 0.8247775496235455, "grad_norm": 2.6192243099212646, "learning_rate": 8.478690835531854e-06, "loss": 0.0222, "step": 1205 }, { "epoch": 0.8254620123203286, "grad_norm": 3.2189998626708984, "learning_rate": 8.414500579533536e-06, "loss": 0.0518, "step": 1206 }, { "epoch": 0.8261464750171116, "grad_norm": 2.314542293548584, "learning_rate": 8.350531900813602e-06, "loss": 0.0174, "step": 1207 }, { "epoch": 0.8268309377138946, "grad_norm": 5.6548686027526855, "learning_rate": 8.28678514021215e-06, "loss": 0.1047, "step": 1208 }, { "epoch": 0.8275154004106776, "grad_norm": 3.949129581451416, "learning_rate": 8.223260637386826e-06, "loss": 0.0263, "step": 1209 }, { "epoch": 0.8281998631074606, "grad_norm": 3.4275383949279785, "learning_rate": 8.159958730811096e-06, "loss": 0.0895, "step": 1210 }, { "epoch": 0.8288843258042436, "grad_norm": 7.329921722412109, "learning_rate": 8.096879757772296e-06, "loss": 0.1155, "step": 1211 }, { "epoch": 0.8295687885010267, "grad_norm": 6.971200466156006, "learning_rate": 8.034024054369993e-06, "loss": 0.1258, "step": 1212 }, { "epoch": 0.8302532511978097, "grad_norm": 4.545697212219238, "learning_rate": 7.971391955514085e-06, "loss": 0.1202, "step": 1213 }, { "epoch": 0.8309377138945927, "grad_norm": 9.90560245513916, "learning_rate": 7.908983794923059e-06, "loss": 0.1371, "step": 1214 }, { "epoch": 0.8316221765913757, "grad_norm": 4.395772933959961, "learning_rate": 7.846799905122204e-06, "loss": 0.0554, "step": 1215 }, { "epoch": 0.8323066392881588, "grad_norm": 5.212583065032959, "learning_rate": 7.784840617441858e-06, "loss": 0.1375, "step": 1216 }, { "epoch": 0.8329911019849419, "grad_norm": 5.26801872253418, "learning_rate": 7.723106262015589e-06, "loss": 0.115, "step": 1217 }, { "epoch": 0.8336755646817249, "grad_norm": 5.760983467102051, "learning_rate": 7.66159716777854e-06, "loss": 0.0691, "step": 1218 }, { "epoch": 0.8343600273785079, "grad_norm": 7.9489874839782715, "learning_rate": 7.600313662465519e-06, "loss": 0.0771, "step": 1219 }, { "epoch": 0.8350444900752909, "grad_norm": 8.685220718383789, "learning_rate": 7.53925607260943e-06, "loss": 0.1579, "step": 1220 }, { "epoch": 0.8357289527720739, "grad_norm": 4.481861114501953, "learning_rate": 7.47842472353939e-06, "loss": 0.134, "step": 1221 }, { "epoch": 0.836413415468857, "grad_norm": 3.617269992828369, "learning_rate": 7.4178199393790785e-06, "loss": 0.0269, "step": 1222 }, { "epoch": 0.83709787816564, "grad_norm": 11.743636131286621, "learning_rate": 7.357442043044977e-06, "loss": 0.1661, "step": 1223 }, { "epoch": 0.837782340862423, "grad_norm": 5.093001365661621, "learning_rate": 7.297291356244645e-06, "loss": 0.106, "step": 1224 }, { "epoch": 0.838466803559206, "grad_norm": 5.526715278625488, "learning_rate": 7.237368199475025e-06, "loss": 0.126, "step": 1225 }, { "epoch": 0.839151266255989, "grad_norm": 3.8150475025177, "learning_rate": 7.177672892020742e-06, "loss": 0.0992, "step": 1226 }, { "epoch": 0.839835728952772, "grad_norm": 4.728708267211914, "learning_rate": 7.118205751952373e-06, "loss": 0.126, "step": 1227 }, { "epoch": 0.840520191649555, "grad_norm": 1.6052964925765991, "learning_rate": 7.0589670961247524e-06, "loss": 0.0272, "step": 1228 }, { "epoch": 0.8412046543463382, "grad_norm": 7.084295272827148, "learning_rate": 6.9999572401753144e-06, "loss": 0.0772, "step": 1229 }, { "epoch": 0.8418891170431212, "grad_norm": 3.3350062370300293, "learning_rate": 6.9411764985223786e-06, "loss": 0.0513, "step": 1230 }, { "epoch": 0.8425735797399042, "grad_norm": 9.144457817077637, "learning_rate": 6.882625184363534e-06, "loss": 0.1462, "step": 1231 }, { "epoch": 0.8432580424366872, "grad_norm": 4.1398024559021, "learning_rate": 6.824303609673843e-06, "loss": 0.0565, "step": 1232 }, { "epoch": 0.8439425051334702, "grad_norm": 6.477858066558838, "learning_rate": 6.7662120852043545e-06, "loss": 0.1274, "step": 1233 }, { "epoch": 0.8446269678302533, "grad_norm": 1.8654345273971558, "learning_rate": 6.708350920480294e-06, "loss": 0.0177, "step": 1234 }, { "epoch": 0.8453114305270363, "grad_norm": 3.078305721282959, "learning_rate": 6.650720423799501e-06, "loss": 0.0944, "step": 1235 }, { "epoch": 0.8459958932238193, "grad_norm": 5.185081481933594, "learning_rate": 6.5933209022307486e-06, "loss": 0.1332, "step": 1236 }, { "epoch": 0.8466803559206023, "grad_norm": 2.1968178749084473, "learning_rate": 6.536152661612138e-06, "loss": 0.054, "step": 1237 }, { "epoch": 0.8473648186173853, "grad_norm": 4.308799743652344, "learning_rate": 6.4792160065494216e-06, "loss": 0.1043, "step": 1238 }, { "epoch": 0.8480492813141683, "grad_norm": 1.641650676727295, "learning_rate": 6.422511240414469e-06, "loss": 0.028, "step": 1239 }, { "epoch": 0.8487337440109514, "grad_norm": 5.506241798400879, "learning_rate": 6.36603866534351e-06, "loss": 0.1871, "step": 1240 }, { "epoch": 0.8494182067077344, "grad_norm": 6.009611129760742, "learning_rate": 6.309798582235671e-06, "loss": 0.0932, "step": 1241 }, { "epoch": 0.8501026694045175, "grad_norm": 3.9081287384033203, "learning_rate": 6.2537912907512815e-06, "loss": 0.0664, "step": 1242 }, { "epoch": 0.8507871321013005, "grad_norm": 1.3740370273590088, "learning_rate": 6.198017089310287e-06, "loss": 0.0169, "step": 1243 }, { "epoch": 0.8514715947980835, "grad_norm": 2.6452078819274902, "learning_rate": 6.1424762750907304e-06, "loss": 0.0235, "step": 1244 }, { "epoch": 0.8521560574948666, "grad_norm": 5.416455268859863, "learning_rate": 6.087169144027033e-06, "loss": 0.0827, "step": 1245 }, { "epoch": 0.8528405201916496, "grad_norm": 3.9231443405151367, "learning_rate": 6.032095990808567e-06, "loss": 0.025, "step": 1246 }, { "epoch": 0.8535249828884326, "grad_norm": 2.054417371749878, "learning_rate": 5.977257108877982e-06, "loss": 0.0179, "step": 1247 }, { "epoch": 0.8542094455852156, "grad_norm": 3.5200412273406982, "learning_rate": 5.922652790429678e-06, "loss": 0.0306, "step": 1248 }, { "epoch": 0.8548939082819986, "grad_norm": 4.1857171058654785, "learning_rate": 5.868283326408253e-06, "loss": 0.0457, "step": 1249 }, { "epoch": 0.8555783709787816, "grad_norm": 2.303556203842163, "learning_rate": 5.814149006506942e-06, "loss": 0.0199, "step": 1250 }, { "epoch": 0.8562628336755647, "grad_norm": 3.58789324760437, "learning_rate": 5.7602501191660605e-06, "loss": 0.0596, "step": 1251 }, { "epoch": 0.8569472963723477, "grad_norm": 1.7570583820343018, "learning_rate": 5.706586951571541e-06, "loss": 0.0162, "step": 1252 }, { "epoch": 0.8576317590691307, "grad_norm": 0.9937019348144531, "learning_rate": 5.6531597896532615e-06, "loss": 0.0061, "step": 1253 }, { "epoch": 0.8583162217659137, "grad_norm": 3.4211318492889404, "learning_rate": 5.599968918083676e-06, "loss": 0.0265, "step": 1254 }, { "epoch": 0.8590006844626967, "grad_norm": 0.22060324251651764, "learning_rate": 5.547014620276203e-06, "loss": 0.0026, "step": 1255 }, { "epoch": 0.8596851471594799, "grad_norm": 2.6318535804748535, "learning_rate": 5.4942971783837286e-06, "loss": 0.0324, "step": 1256 }, { "epoch": 0.8603696098562629, "grad_norm": 6.467898845672607, "learning_rate": 5.44181687329714e-06, "loss": 0.0924, "step": 1257 }, { "epoch": 0.8610540725530459, "grad_norm": 5.078124046325684, "learning_rate": 5.389573984643786e-06, "loss": 0.1602, "step": 1258 }, { "epoch": 0.8617385352498289, "grad_norm": 5.606751441955566, "learning_rate": 5.337568790785996e-06, "loss": 0.1447, "step": 1259 }, { "epoch": 0.8624229979466119, "grad_norm": 6.647364139556885, "learning_rate": 5.285801568819648e-06, "loss": 0.0701, "step": 1260 }, { "epoch": 0.8631074606433949, "grad_norm": 5.2311859130859375, "learning_rate": 5.234272594572603e-06, "loss": 0.0525, "step": 1261 }, { "epoch": 0.863791923340178, "grad_norm": 4.615151405334473, "learning_rate": 5.182982142603298e-06, "loss": 0.1179, "step": 1262 }, { "epoch": 0.864476386036961, "grad_norm": 6.6702470779418945, "learning_rate": 5.131930486199304e-06, "loss": 0.1369, "step": 1263 }, { "epoch": 0.865160848733744, "grad_norm": 4.919125080108643, "learning_rate": 5.0811178973757535e-06, "loss": 0.0962, "step": 1264 }, { "epoch": 0.865845311430527, "grad_norm": 5.890649795532227, "learning_rate": 5.030544646874063e-06, "loss": 0.1152, "step": 1265 }, { "epoch": 0.86652977412731, "grad_norm": 8.196867942810059, "learning_rate": 4.980211004160307e-06, "loss": 0.1421, "step": 1266 }, { "epoch": 0.867214236824093, "grad_norm": 5.347480773925781, "learning_rate": 4.930117237423954e-06, "loss": 0.1066, "step": 1267 }, { "epoch": 0.8678986995208761, "grad_norm": 4.288012504577637, "learning_rate": 4.880263613576313e-06, "loss": 0.0375, "step": 1268 }, { "epoch": 0.8685831622176592, "grad_norm": 4.05540132522583, "learning_rate": 4.830650398249165e-06, "loss": 0.1275, "step": 1269 }, { "epoch": 0.8692676249144422, "grad_norm": 5.343271255493164, "learning_rate": 4.781277855793326e-06, "loss": 0.1165, "step": 1270 }, { "epoch": 0.8699520876112252, "grad_norm": 9.227301597595215, "learning_rate": 4.732146249277297e-06, "loss": 0.1656, "step": 1271 }, { "epoch": 0.8706365503080082, "grad_norm": 7.519947528839111, "learning_rate": 4.68325584048574e-06, "loss": 0.1245, "step": 1272 }, { "epoch": 0.8713210130047913, "grad_norm": 7.988525390625, "learning_rate": 4.634606889918231e-06, "loss": 0.1358, "step": 1273 }, { "epoch": 0.8720054757015743, "grad_norm": 2.133824586868286, "learning_rate": 4.586199656787754e-06, "loss": 0.0335, "step": 1274 }, { "epoch": 0.8726899383983573, "grad_norm": 3.6323320865631104, "learning_rate": 4.5380343990193974e-06, "loss": 0.1015, "step": 1275 }, { "epoch": 0.8733744010951403, "grad_norm": 6.27833890914917, "learning_rate": 4.490111373248918e-06, "loss": 0.1573, "step": 1276 }, { "epoch": 0.8740588637919233, "grad_norm": 4.010247707366943, "learning_rate": 4.442430834821415e-06, "loss": 0.1087, "step": 1277 }, { "epoch": 0.8747433264887063, "grad_norm": 7.817002296447754, "learning_rate": 4.39499303778998e-06, "loss": 0.1406, "step": 1278 }, { "epoch": 0.8754277891854894, "grad_norm": 3.466080665588379, "learning_rate": 4.347798234914296e-06, "loss": 0.1211, "step": 1279 }, { "epoch": 0.8761122518822724, "grad_norm": 7.224392414093018, "learning_rate": 4.3008466776593205e-06, "loss": 0.0955, "step": 1280 }, { "epoch": 0.8767967145790554, "grad_norm": 4.414182186126709, "learning_rate": 4.2541386161939426e-06, "loss": 0.1146, "step": 1281 }, { "epoch": 0.8774811772758385, "grad_norm": 4.7707085609436035, "learning_rate": 4.207674299389658e-06, "loss": 0.0971, "step": 1282 }, { "epoch": 0.8781656399726215, "grad_norm": 6.178817272186279, "learning_rate": 4.161453974819213e-06, "loss": 0.0848, "step": 1283 }, { "epoch": 0.8788501026694046, "grad_norm": 4.172438144683838, "learning_rate": 4.1154778887553425e-06, "loss": 0.0628, "step": 1284 }, { "epoch": 0.8795345653661876, "grad_norm": 2.7260279655456543, "learning_rate": 4.069746286169373e-06, "loss": 0.0228, "step": 1285 }, { "epoch": 0.8802190280629706, "grad_norm": 1.9490443468093872, "learning_rate": 4.024259410730008e-06, "loss": 0.0251, "step": 1286 }, { "epoch": 0.8809034907597536, "grad_norm": 8.672955513000488, "learning_rate": 3.979017504801969e-06, "loss": 0.0946, "step": 1287 }, { "epoch": 0.8815879534565366, "grad_norm": 4.4492597579956055, "learning_rate": 3.9340208094447215e-06, "loss": 0.0818, "step": 1288 }, { "epoch": 0.8822724161533196, "grad_norm": 4.287400245666504, "learning_rate": 3.889269564411191e-06, "loss": 0.034, "step": 1289 }, { "epoch": 0.8829568788501027, "grad_norm": 4.263371467590332, "learning_rate": 3.844764008146501e-06, "loss": 0.0844, "step": 1290 }, { "epoch": 0.8836413415468857, "grad_norm": 2.1707305908203125, "learning_rate": 3.8005043777866513e-06, "loss": 0.0466, "step": 1291 }, { "epoch": 0.8843258042436687, "grad_norm": 1.354305624961853, "learning_rate": 3.756490909157356e-06, "loss": 0.0135, "step": 1292 }, { "epoch": 0.8850102669404517, "grad_norm": 4.980593681335449, "learning_rate": 3.712723836772647e-06, "loss": 0.0613, "step": 1293 }, { "epoch": 0.8856947296372347, "grad_norm": 4.083224296569824, "learning_rate": 3.669203393833748e-06, "loss": 0.0466, "step": 1294 }, { "epoch": 0.8863791923340179, "grad_norm": 2.5384273529052734, "learning_rate": 3.6259298122277685e-06, "loss": 0.0228, "step": 1295 }, { "epoch": 0.8870636550308009, "grad_norm": 4.565820217132568, "learning_rate": 3.582903322526482e-06, "loss": 0.0708, "step": 1296 }, { "epoch": 0.8877481177275839, "grad_norm": 1.421690583229065, "learning_rate": 3.5401241539851138e-06, "loss": 0.017, "step": 1297 }, { "epoch": 0.8884325804243669, "grad_norm": 2.3901710510253906, "learning_rate": 3.4975925345410664e-06, "loss": 0.0166, "step": 1298 }, { "epoch": 0.8891170431211499, "grad_norm": 3.700740337371826, "learning_rate": 3.4553086908127863e-06, "loss": 0.0722, "step": 1299 }, { "epoch": 0.8898015058179329, "grad_norm": 3.197348117828369, "learning_rate": 3.4132728480984864e-06, "loss": 0.0266, "step": 1300 }, { "epoch": 0.890485968514716, "grad_norm": 3.4714057445526123, "learning_rate": 3.3714852303749877e-06, "loss": 0.0617, "step": 1301 }, { "epoch": 0.891170431211499, "grad_norm": 2.990771770477295, "learning_rate": 3.3299460602964906e-06, "loss": 0.0522, "step": 1302 }, { "epoch": 0.891854893908282, "grad_norm": 1.3805162906646729, "learning_rate": 3.2886555591934265e-06, "loss": 0.009, "step": 1303 }, { "epoch": 0.892539356605065, "grad_norm": 4.852303504943848, "learning_rate": 3.247613947071243e-06, "loss": 0.0693, "step": 1304 }, { "epoch": 0.893223819301848, "grad_norm": 5.726833820343018, "learning_rate": 3.206821442609287e-06, "loss": 0.0626, "step": 1305 }, { "epoch": 0.893908281998631, "grad_norm": 4.184272289276123, "learning_rate": 3.1662782631595388e-06, "loss": 0.0416, "step": 1306 }, { "epoch": 0.894592744695414, "grad_norm": 2.2505366802215576, "learning_rate": 3.125984624745576e-06, "loss": 0.0238, "step": 1307 }, { "epoch": 0.8952772073921971, "grad_norm": 2.2352890968322754, "learning_rate": 3.085940742061333e-06, "loss": 0.0268, "step": 1308 }, { "epoch": 0.8959616700889802, "grad_norm": 2.3092434406280518, "learning_rate": 3.0461468284699923e-06, "loss": 0.0294, "step": 1309 }, { "epoch": 0.8966461327857632, "grad_norm": 3.8010802268981934, "learning_rate": 3.0066030960028413e-06, "loss": 0.0803, "step": 1310 }, { "epoch": 0.8973305954825462, "grad_norm": 3.804884195327759, "learning_rate": 2.9673097553581385e-06, "loss": 0.0864, "step": 1311 }, { "epoch": 0.8980150581793293, "grad_norm": 3.6318366527557373, "learning_rate": 2.928267015900027e-06, "loss": 0.0652, "step": 1312 }, { "epoch": 0.8986995208761123, "grad_norm": 3.874138355255127, "learning_rate": 2.8894750856573507e-06, "loss": 0.0555, "step": 1313 }, { "epoch": 0.8993839835728953, "grad_norm": 7.086063861846924, "learning_rate": 2.850934171322589e-06, "loss": 0.1675, "step": 1314 }, { "epoch": 0.9000684462696783, "grad_norm": 6.144205093383789, "learning_rate": 2.8126444782507753e-06, "loss": 0.117, "step": 1315 }, { "epoch": 0.9007529089664613, "grad_norm": 6.633049964904785, "learning_rate": 2.774606210458347e-06, "loss": 0.0781, "step": 1316 }, { "epoch": 0.9014373716632443, "grad_norm": 2.1156790256500244, "learning_rate": 2.736819570622101e-06, "loss": 0.0916, "step": 1317 }, { "epoch": 0.9021218343600274, "grad_norm": 2.3888399600982666, "learning_rate": 2.6992847600781192e-06, "loss": 0.0858, "step": 1318 }, { "epoch": 0.9028062970568104, "grad_norm": 6.681771278381348, "learning_rate": 2.6620019788206384e-06, "loss": 0.1496, "step": 1319 }, { "epoch": 0.9034907597535934, "grad_norm": 4.452361106872559, "learning_rate": 2.6249714255010605e-06, "loss": 0.1181, "step": 1320 }, { "epoch": 0.9041752224503764, "grad_norm": 7.078209400177002, "learning_rate": 2.5881932974268442e-06, "loss": 0.1403, "step": 1321 }, { "epoch": 0.9048596851471595, "grad_norm": 6.734772205352783, "learning_rate": 2.5516677905604634e-06, "loss": 0.1183, "step": 1322 }, { "epoch": 0.9055441478439425, "grad_norm": 1.9680936336517334, "learning_rate": 2.515395099518375e-06, "loss": 0.0747, "step": 1323 }, { "epoch": 0.9062286105407256, "grad_norm": 4.753897190093994, "learning_rate": 2.479375417569968e-06, "loss": 0.0378, "step": 1324 }, { "epoch": 0.9069130732375086, "grad_norm": 3.660781145095825, "learning_rate": 2.4436089366365376e-06, "loss": 0.0579, "step": 1325 }, { "epoch": 0.9075975359342916, "grad_norm": 4.698939800262451, "learning_rate": 2.4080958472902872e-06, "loss": 0.0969, "step": 1326 }, { "epoch": 0.9082819986310746, "grad_norm": 4.52027702331543, "learning_rate": 2.3728363387532425e-06, "loss": 0.0818, "step": 1327 }, { "epoch": 0.9089664613278576, "grad_norm": 3.1826400756835938, "learning_rate": 2.337830598896351e-06, "loss": 0.0732, "step": 1328 }, { "epoch": 0.9096509240246407, "grad_norm": 6.043643951416016, "learning_rate": 2.303078814238374e-06, "loss": 0.1012, "step": 1329 }, { "epoch": 0.9103353867214237, "grad_norm": 8.07490348815918, "learning_rate": 2.2685811699449554e-06, "loss": 0.0942, "step": 1330 }, { "epoch": 0.9110198494182067, "grad_norm": 8.63931941986084, "learning_rate": 2.234337849827639e-06, "loss": 0.1383, "step": 1331 }, { "epoch": 0.9117043121149897, "grad_norm": 3.7313835620880127, "learning_rate": 2.2003490363428357e-06, "loss": 0.1426, "step": 1332 }, { "epoch": 0.9123887748117727, "grad_norm": 4.644273281097412, "learning_rate": 2.166614910590908e-06, "loss": 0.1148, "step": 1333 }, { "epoch": 0.9130732375085557, "grad_norm": 6.328210830688477, "learning_rate": 2.1331356523151823e-06, "loss": 0.0914, "step": 1334 }, { "epoch": 0.9137577002053389, "grad_norm": 4.468108654022217, "learning_rate": 2.099911439900981e-06, "loss": 0.0513, "step": 1335 }, { "epoch": 0.9144421629021219, "grad_norm": 2.367246627807617, "learning_rate": 2.066942450374687e-06, "loss": 0.0632, "step": 1336 }, { "epoch": 0.9151266255989049, "grad_norm": 5.087795734405518, "learning_rate": 2.034228859402798e-06, "loss": 0.0861, "step": 1337 }, { "epoch": 0.9158110882956879, "grad_norm": 4.036954402923584, "learning_rate": 2.0017708412909787e-06, "loss": 0.1058, "step": 1338 }, { "epoch": 0.9164955509924709, "grad_norm": 4.227419376373291, "learning_rate": 1.9695685689831656e-06, "loss": 0.0766, "step": 1339 }, { "epoch": 0.917180013689254, "grad_norm": 2.160324811935425, "learning_rate": 1.9376222140605914e-06, "loss": 0.0461, "step": 1340 }, { "epoch": 0.917864476386037, "grad_norm": 4.940765857696533, "learning_rate": 1.9059319467409242e-06, "loss": 0.1125, "step": 1341 }, { "epoch": 0.91854893908282, "grad_norm": 3.540290117263794, "learning_rate": 1.8744979358773284e-06, "loss": 0.05, "step": 1342 }, { "epoch": 0.919233401779603, "grad_norm": 1.3912843465805054, "learning_rate": 1.8433203489575723e-06, "loss": 0.0046, "step": 1343 }, { "epoch": 0.919917864476386, "grad_norm": 2.680098056793213, "learning_rate": 1.8123993521031334e-06, "loss": 0.0292, "step": 1344 }, { "epoch": 0.920602327173169, "grad_norm": 5.626460552215576, "learning_rate": 1.7817351100683277e-06, "loss": 0.0461, "step": 1345 }, { "epoch": 0.921286789869952, "grad_norm": 5.09351110458374, "learning_rate": 1.7513277862394262e-06, "loss": 0.0523, "step": 1346 }, { "epoch": 0.9219712525667351, "grad_norm": 2.5888571739196777, "learning_rate": 1.7211775426337562e-06, "loss": 0.0223, "step": 1347 }, { "epoch": 0.9226557152635181, "grad_norm": 1.9445496797561646, "learning_rate": 1.6912845398988853e-06, "loss": 0.0128, "step": 1348 }, { "epoch": 0.9233401779603012, "grad_norm": 2.9751782417297363, "learning_rate": 1.6616489373117273e-06, "loss": 0.0199, "step": 1349 }, { "epoch": 0.9240246406570842, "grad_norm": 1.747573733329773, "learning_rate": 1.6322708927777098e-06, "loss": 0.012, "step": 1350 }, { "epoch": 0.9247091033538672, "grad_norm": 4.274850368499756, "learning_rate": 1.603150562829936e-06, "loss": 0.0514, "step": 1351 }, { "epoch": 0.9253935660506503, "grad_norm": 1.4837676286697388, "learning_rate": 1.5742881026283519e-06, "loss": 0.0089, "step": 1352 }, { "epoch": 0.9260780287474333, "grad_norm": 4.931734085083008, "learning_rate": 1.5456836659588803e-06, "loss": 0.069, "step": 1353 }, { "epoch": 0.9267624914442163, "grad_norm": 3.928190231323242, "learning_rate": 1.5173374052326771e-06, "loss": 0.0655, "step": 1354 }, { "epoch": 0.9274469541409993, "grad_norm": 2.626641035079956, "learning_rate": 1.4892494714852544e-06, "loss": 0.0274, "step": 1355 }, { "epoch": 0.9281314168377823, "grad_norm": 3.005250930786133, "learning_rate": 1.4614200143756973e-06, "loss": 0.0407, "step": 1356 }, { "epoch": 0.9288158795345653, "grad_norm": 0.8168134093284607, "learning_rate": 1.4338491821858702e-06, "loss": 0.0046, "step": 1357 }, { "epoch": 0.9295003422313484, "grad_norm": 4.171207904815674, "learning_rate": 1.4065371218196178e-06, "loss": 0.0677, "step": 1358 }, { "epoch": 0.9301848049281314, "grad_norm": 2.4291889667510986, "learning_rate": 1.3794839788019987e-06, "loss": 0.0237, "step": 1359 }, { "epoch": 0.9308692676249144, "grad_norm": 5.774150371551514, "learning_rate": 1.3526898972784973e-06, "loss": 0.0787, "step": 1360 }, { "epoch": 0.9315537303216974, "grad_norm": 3.9753997325897217, "learning_rate": 1.3261550200142414e-06, "loss": 0.0786, "step": 1361 }, { "epoch": 0.9322381930184805, "grad_norm": 3.591026782989502, "learning_rate": 1.2998794883932797e-06, "loss": 0.1196, "step": 1362 }, { "epoch": 0.9329226557152636, "grad_norm": 7.167970180511475, "learning_rate": 1.2738634424177998e-06, "loss": 0.0602, "step": 1363 }, { "epoch": 0.9336071184120466, "grad_norm": 8.780488014221191, "learning_rate": 1.2481070207073786e-06, "loss": 0.0878, "step": 1364 }, { "epoch": 0.9342915811088296, "grad_norm": 6.992488861083984, "learning_rate": 1.2226103604982718e-06, "loss": 0.1404, "step": 1365 }, { "epoch": 0.9349760438056126, "grad_norm": 3.035493850708008, "learning_rate": 1.197373597642637e-06, "loss": 0.0515, "step": 1366 }, { "epoch": 0.9356605065023956, "grad_norm": 6.414846420288086, "learning_rate": 1.1723968666078777e-06, "loss": 0.143, "step": 1367 }, { "epoch": 0.9363449691991786, "grad_norm": 5.241714954376221, "learning_rate": 1.1476803004758508e-06, "loss": 0.1307, "step": 1368 }, { "epoch": 0.9370294318959617, "grad_norm": 2.6015050411224365, "learning_rate": 1.123224030942216e-06, "loss": 0.0283, "step": 1369 }, { "epoch": 0.9377138945927447, "grad_norm": 4.854617118835449, "learning_rate": 1.0990281883156982e-06, "loss": 0.0655, "step": 1370 }, { "epoch": 0.9383983572895277, "grad_norm": 6.307740688323975, "learning_rate": 1.0750929015174216e-06, "loss": 0.0963, "step": 1371 }, { "epoch": 0.9390828199863107, "grad_norm": 4.626981735229492, "learning_rate": 1.0514182980801813e-06, "loss": 0.0648, "step": 1372 }, { "epoch": 0.9397672826830937, "grad_norm": 8.870864868164062, "learning_rate": 1.0280045041478282e-06, "loss": 0.1671, "step": 1373 }, { "epoch": 0.9404517453798767, "grad_norm": 5.165658473968506, "learning_rate": 1.0048516444745248e-06, "loss": 0.0779, "step": 1374 }, { "epoch": 0.9411362080766599, "grad_norm": 2.9996368885040283, "learning_rate": 9.81959842424135e-07, "loss": 0.0718, "step": 1375 }, { "epoch": 0.9418206707734429, "grad_norm": 4.682353973388672, "learning_rate": 9.593292199695402e-07, "loss": 0.0602, "step": 1376 }, { "epoch": 0.9425051334702259, "grad_norm": 7.107354640960693, "learning_rate": 9.369598976919913e-07, "loss": 0.0992, "step": 1377 }, { "epoch": 0.9431895961670089, "grad_norm": 3.286827325820923, "learning_rate": 9.148519947804801e-07, "loss": 0.0232, "step": 1378 }, { "epoch": 0.9438740588637919, "grad_norm": 5.353722095489502, "learning_rate": 8.930056290310906e-07, "loss": 0.0851, "step": 1379 }, { "epoch": 0.944558521560575, "grad_norm": 4.995370864868164, "learning_rate": 8.714209168463716e-07, "loss": 0.106, "step": 1380 }, { "epoch": 0.945242984257358, "grad_norm": 8.396575927734375, "learning_rate": 8.500979732347258e-07, "loss": 0.1224, "step": 1381 }, { "epoch": 0.945927446954141, "grad_norm": 3.076171875, "learning_rate": 8.290369118097885e-07, "loss": 0.1004, "step": 1382 }, { "epoch": 0.946611909650924, "grad_norm": 9.175048828125, "learning_rate": 8.08237844789822e-07, "loss": 0.1662, "step": 1383 }, { "epoch": 0.947296372347707, "grad_norm": 6.039389610290527, "learning_rate": 7.877008829971388e-07, "loss": 0.0957, "step": 1384 }, { "epoch": 0.94798083504449, "grad_norm": 4.737037658691406, "learning_rate": 7.674261358574686e-07, "loss": 0.058, "step": 1385 }, { "epoch": 0.9486652977412731, "grad_norm": 7.839178562164307, "learning_rate": 7.47413711399414e-07, "loss": 0.0961, "step": 1386 }, { "epoch": 0.9493497604380561, "grad_norm": 6.584840774536133, "learning_rate": 7.276637162538625e-07, "loss": 0.1076, "step": 1387 }, { "epoch": 0.9500342231348392, "grad_norm": 8.81566333770752, "learning_rate": 7.081762556534088e-07, "loss": 0.2085, "step": 1388 }, { "epoch": 0.9507186858316222, "grad_norm": 3.18674373626709, "learning_rate": 6.889514334318059e-07, "loss": 0.0573, "step": 1389 }, { "epoch": 0.9514031485284052, "grad_norm": 2.0325143337249756, "learning_rate": 6.699893520234091e-07, "loss": 0.0709, "step": 1390 }, { "epoch": 0.9520876112251883, "grad_norm": 5.399415016174316, "learning_rate": 6.51290112462627e-07, "loss": 0.0572, "step": 1391 }, { "epoch": 0.9527720739219713, "grad_norm": 3.468303918838501, "learning_rate": 6.32853814383394e-07, "loss": 0.0346, "step": 1392 }, { "epoch": 0.9534565366187543, "grad_norm": 3.1648104190826416, "learning_rate": 6.146805560186097e-07, "loss": 0.0331, "step": 1393 }, { "epoch": 0.9541409993155373, "grad_norm": 2.0967931747436523, "learning_rate": 5.967704341996616e-07, "loss": 0.015, "step": 1394 }, { "epoch": 0.9548254620123203, "grad_norm": 0.4599141776561737, "learning_rate": 5.79123544355864e-07, "loss": 0.0025, "step": 1395 }, { "epoch": 0.9555099247091033, "grad_norm": 8.466023445129395, "learning_rate": 5.617399805139867e-07, "loss": 0.1385, "step": 1396 }, { "epoch": 0.9561943874058864, "grad_norm": 6.917329788208008, "learning_rate": 5.446198352977272e-07, "loss": 0.0552, "step": 1397 }, { "epoch": 0.9568788501026694, "grad_norm": 4.484160423278809, "learning_rate": 5.27763199927217e-07, "loss": 0.0448, "step": 1398 }, { "epoch": 0.9575633127994524, "grad_norm": 4.956172943115234, "learning_rate": 5.111701642185718e-07, "loss": 0.0427, "step": 1399 }, { "epoch": 0.9582477754962354, "grad_norm": 0.3013894259929657, "learning_rate": 4.948408165833584e-07, "loss": 0.0012, "step": 1400 }, { "epoch": 0.9589322381930184, "grad_norm": 6.551176071166992, "learning_rate": 4.787752440281734e-07, "loss": 0.0766, "step": 1401 }, { "epoch": 0.9596167008898016, "grad_norm": 0.5637772679328918, "learning_rate": 4.629735321541484e-07, "loss": 0.0033, "step": 1402 }, { "epoch": 0.9603011635865846, "grad_norm": 6.791173934936523, "learning_rate": 4.474357651564953e-07, "loss": 0.1144, "step": 1403 }, { "epoch": 0.9609856262833676, "grad_norm": 4.32711935043335, "learning_rate": 4.321620258240844e-07, "loss": 0.1274, "step": 1404 }, { "epoch": 0.9616700889801506, "grad_norm": 2.397711992263794, "learning_rate": 4.171523955389722e-07, "loss": 0.022, "step": 1405 }, { "epoch": 0.9623545516769336, "grad_norm": 2.0217554569244385, "learning_rate": 4.024069542759801e-07, "loss": 0.02, "step": 1406 }, { "epoch": 0.9630390143737166, "grad_norm": 1.8484454154968262, "learning_rate": 3.879257806022774e-07, "loss": 0.0312, "step": 1407 }, { "epoch": 0.9637234770704997, "grad_norm": 2.3836281299591064, "learning_rate": 3.7370895167694877e-07, "loss": 0.0227, "step": 1408 }, { "epoch": 0.9644079397672827, "grad_norm": 7.112321376800537, "learning_rate": 3.597565432505834e-07, "loss": 0.1548, "step": 1409 }, { "epoch": 0.9650924024640657, "grad_norm": 5.822161674499512, "learning_rate": 3.460686296648807e-07, "loss": 0.1358, "step": 1410 }, { "epoch": 0.9657768651608487, "grad_norm": 5.20412015914917, "learning_rate": 3.3264528385225093e-07, "loss": 0.1561, "step": 1411 }, { "epoch": 0.9664613278576317, "grad_norm": 5.528499126434326, "learning_rate": 3.194865773354261e-07, "loss": 0.14, "step": 1412 }, { "epoch": 0.9671457905544147, "grad_norm": 2.8952019214630127, "learning_rate": 3.0659258022707194e-07, "loss": 0.0528, "step": 1413 }, { "epoch": 0.9678302532511978, "grad_norm": 9.746217727661133, "learning_rate": 2.9396336122942124e-07, "loss": 0.1759, "step": 1414 }, { "epoch": 0.9685147159479809, "grad_norm": 7.285802364349365, "learning_rate": 2.815989876339187e-07, "loss": 0.1615, "step": 1415 }, { "epoch": 0.9691991786447639, "grad_norm": 3.064793348312378, "learning_rate": 2.6949952532083765e-07, "loss": 0.0779, "step": 1416 }, { "epoch": 0.9698836413415469, "grad_norm": 3.797314405441284, "learning_rate": 2.576650387589419e-07, "loss": 0.1399, "step": 1417 }, { "epoch": 0.9705681040383299, "grad_norm": 3.434673547744751, "learning_rate": 2.4609559100515764e-07, "loss": 0.0267, "step": 1418 }, { "epoch": 0.971252566735113, "grad_norm": 3.7085533142089844, "learning_rate": 2.3479124370419657e-07, "loss": 0.0942, "step": 1419 }, { "epoch": 0.971937029431896, "grad_norm": 4.564137935638428, "learning_rate": 2.2375205708828338e-07, "loss": 0.1456, "step": 1420 }, { "epoch": 0.972621492128679, "grad_norm": 7.3797078132629395, "learning_rate": 2.1297808997678415e-07, "loss": 0.1273, "step": 1421 }, { "epoch": 0.973305954825462, "grad_norm": 5.065762519836426, "learning_rate": 2.0246939977591196e-07, "loss": 0.1321, "step": 1422 }, { "epoch": 0.973990417522245, "grad_norm": 6.011019706726074, "learning_rate": 1.9222604247843834e-07, "loss": 0.0684, "step": 1423 }, { "epoch": 0.974674880219028, "grad_norm": 5.845041751861572, "learning_rate": 1.8224807266336018e-07, "loss": 0.1291, "step": 1424 }, { "epoch": 0.9753593429158111, "grad_norm": 6.961440086364746, "learning_rate": 1.725355434956444e-07, "loss": 0.143, "step": 1425 }, { "epoch": 0.9760438056125941, "grad_norm": 3.862208604812622, "learning_rate": 1.6308850672591692e-07, "loss": 0.108, "step": 1426 }, { "epoch": 0.9767282683093771, "grad_norm": 7.996629238128662, "learning_rate": 1.53907012690202e-07, "loss": 0.1056, "step": 1427 }, { "epoch": 0.9774127310061602, "grad_norm": 5.006128787994385, "learning_rate": 1.4499111030965574e-07, "loss": 0.1256, "step": 1428 }, { "epoch": 0.9780971937029432, "grad_norm": 4.146512985229492, "learning_rate": 1.3634084709028272e-07, "loss": 0.089, "step": 1429 }, { "epoch": 0.9787816563997263, "grad_norm": 4.4759955406188965, "learning_rate": 1.2795626912271986e-07, "loss": 0.0976, "step": 1430 }, { "epoch": 0.9794661190965093, "grad_norm": 3.621474266052246, "learning_rate": 1.1983742108195862e-07, "loss": 0.0654, "step": 1431 }, { "epoch": 0.9801505817932923, "grad_norm": 5.536753177642822, "learning_rate": 1.1198434622712306e-07, "loss": 0.0654, "step": 1432 }, { "epoch": 0.9808350444900753, "grad_norm": 3.996016263961792, "learning_rate": 1.0439708640123114e-07, "loss": 0.1034, "step": 1433 }, { "epoch": 0.9815195071868583, "grad_norm": 4.767721652984619, "learning_rate": 9.707568203098927e-08, "loss": 0.101, "step": 1434 }, { "epoch": 0.9822039698836413, "grad_norm": 5.450812816619873, "learning_rate": 9.002017212654256e-08, "loss": 0.1114, "step": 1435 }, { "epoch": 0.9828884325804244, "grad_norm": 6.155400276184082, "learning_rate": 8.32305942813083e-08, "loss": 0.1511, "step": 1436 }, { "epoch": 0.9835728952772074, "grad_norm": 2.8270835876464844, "learning_rate": 7.670698467174276e-08, "loss": 0.1228, "step": 1437 }, { "epoch": 0.9842573579739904, "grad_norm": 4.037028789520264, "learning_rate": 7.044937805716356e-08, "loss": 0.0387, "step": 1438 }, { "epoch": 0.9849418206707734, "grad_norm": 4.5118231773376465, "learning_rate": 6.445780777957212e-08, "loss": 0.0882, "step": 1439 }, { "epoch": 0.9856262833675564, "grad_norm": 3.140321969985962, "learning_rate": 5.873230576345368e-08, "loss": 0.0577, "step": 1440 }, { "epoch": 0.9863107460643394, "grad_norm": 5.758755683898926, "learning_rate": 5.3272902515622e-08, "loss": 0.1024, "step": 1441 }, { "epoch": 0.9869952087611226, "grad_norm": 3.8980624675750732, "learning_rate": 4.807962712505276e-08, "loss": 0.0456, "step": 1442 }, { "epoch": 0.9876796714579056, "grad_norm": 2.870884418487549, "learning_rate": 4.315250726274478e-08, "loss": 0.0301, "step": 1443 }, { "epoch": 0.9883641341546886, "grad_norm": 3.6751551628112793, "learning_rate": 3.8491569181531293e-08, "loss": 0.0501, "step": 1444 }, { "epoch": 0.9890485968514716, "grad_norm": 1.2044950723648071, "learning_rate": 3.409683771599115e-08, "loss": 0.008, "step": 1445 }, { "epoch": 0.9897330595482546, "grad_norm": 2.276776075363159, "learning_rate": 2.996833628228779e-08, "loss": 0.0245, "step": 1446 }, { "epoch": 0.9904175222450377, "grad_norm": 3.170147657394409, "learning_rate": 2.610608687803051e-08, "loss": 0.0717, "step": 1447 }, { "epoch": 0.9911019849418207, "grad_norm": 4.859658241271973, "learning_rate": 2.2510110082196722e-08, "loss": 0.0491, "step": 1448 }, { "epoch": 0.9917864476386037, "grad_norm": 1.459184169769287, "learning_rate": 1.918042505498763e-08, "loss": 0.012, "step": 1449 }, { "epoch": 0.9924709103353867, "grad_norm": 3.944338083267212, "learning_rate": 1.6117049537750507e-08, "loss": 0.0303, "step": 1450 }, { "epoch": 0.9931553730321697, "grad_norm": 6.976804733276367, "learning_rate": 1.3319999852867693e-08, "loss": 0.1195, "step": 1451 }, { "epoch": 0.9938398357289527, "grad_norm": 8.14913272857666, "learning_rate": 1.0789290903667758e-08, "loss": 0.1702, "step": 1452 }, { "epoch": 0.9945242984257358, "grad_norm": 4.773228168487549, "learning_rate": 8.52493617436445e-09, "loss": 0.0661, "step": 1453 }, { "epoch": 0.9952087611225188, "grad_norm": 5.04503870010376, "learning_rate": 6.526947729973421e-09, "loss": 0.0937, "step": 1454 }, { "epoch": 0.9958932238193019, "grad_norm": 5.2035956382751465, "learning_rate": 4.795336216240065e-09, "loss": 0.1382, "step": 1455 }, { "epoch": 0.9965776865160849, "grad_norm": 3.6034438610076904, "learning_rate": 3.3301108596006656e-09, "loss": 0.0522, "step": 1456 }, { "epoch": 0.9972621492128679, "grad_norm": 5.288224220275879, "learning_rate": 2.1312794671157763e-09, "loss": 0.0916, "step": 1457 }, { "epoch": 0.997946611909651, "grad_norm": 3.3839728832244873, "learning_rate": 1.1988484264369159e-09, "loss": 0.0178, "step": 1458 }, { "epoch": 0.998631074606434, "grad_norm": 4.3025383949279785, "learning_rate": 5.328227057788127e-10, "loss": 0.1126, "step": 1459 }, { "epoch": 0.999315537303217, "grad_norm": 5.363297462463379, "learning_rate": 1.3320585388054697e-10, "loss": 0.1048, "step": 1460 }, { "epoch": 1.0, "grad_norm": 1.1440646648406982, "learning_rate": 0.0, "loss": 0.008, "step": 1461 } ], "logging_steps": 1, "max_steps": 1461, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 239, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.526363504818258e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }