{ "best_metric": 1.602339506149292, "best_model_checkpoint": "miner_id_24/checkpoint-2200", "epoch": 0.13363400985550822, "eval_steps": 200, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.772643209125294e-05, "grad_norm": 8.156052589416504, "learning_rate": 6.666666666666667e-06, "loss": 12.5501, "step": 1 }, { "epoch": 4.772643209125294e-05, "eval_loss": 3.1170473098754883, "eval_runtime": 95.9454, "eval_samples_per_second": 8.786, "eval_steps_per_second": 4.398, "step": 1 }, { "epoch": 9.545286418250588e-05, "grad_norm": 9.274253845214844, "learning_rate": 1.3333333333333333e-05, "loss": 12.0242, "step": 2 }, { "epoch": 0.00014317929627375881, "grad_norm": 7.303795337677002, "learning_rate": 2e-05, "loss": 12.9485, "step": 3 }, { "epoch": 0.00019090572836501175, "grad_norm": 8.46444320678711, "learning_rate": 2.6666666666666667e-05, "loss": 13.4995, "step": 4 }, { "epoch": 0.0002386321604562647, "grad_norm": 15.278627395629883, "learning_rate": 3.3333333333333335e-05, "loss": 12.9846, "step": 5 }, { "epoch": 0.00028635859254751763, "grad_norm": 13.401786804199219, "learning_rate": 4e-05, "loss": 13.6205, "step": 6 }, { "epoch": 0.00033408502463877054, "grad_norm": 11.049997329711914, "learning_rate": 4.666666666666667e-05, "loss": 10.501, "step": 7 }, { "epoch": 0.0003818114567300235, "grad_norm": 7.455288887023926, "learning_rate": 5.333333333333333e-05, "loss": 9.7858, "step": 8 }, { "epoch": 0.0004295378888212764, "grad_norm": 8.7234468460083, "learning_rate": 6e-05, "loss": 14.5011, "step": 9 }, { "epoch": 0.0004772643209125294, "grad_norm": 12.033916473388672, "learning_rate": 6.666666666666667e-05, "loss": 14.7143, "step": 10 }, { "epoch": 0.0005249907530037823, "grad_norm": 8.46088981628418, "learning_rate": 7.333333333333333e-05, "loss": 10.654, "step": 11 }, { "epoch": 0.0005727171850950353, "grad_norm": 6.730998516082764, "learning_rate": 8e-05, "loss": 9.5019, "step": 12 }, { "epoch": 0.0006204436171862882, "grad_norm": 9.581258773803711, "learning_rate": 8.666666666666667e-05, "loss": 13.3483, "step": 13 }, { "epoch": 0.0006681700492775411, "grad_norm": 9.198789596557617, "learning_rate": 9.333333333333334e-05, "loss": 11.0057, "step": 14 }, { "epoch": 0.000715896481368794, "grad_norm": 9.850581169128418, "learning_rate": 0.0001, "loss": 9.8186, "step": 15 }, { "epoch": 0.000763622913460047, "grad_norm": 6.305893898010254, "learning_rate": 0.00010666666666666667, "loss": 8.5701, "step": 16 }, { "epoch": 0.0008113493455513, "grad_norm": 13.963780403137207, "learning_rate": 0.00011333333333333334, "loss": 12.3859, "step": 17 }, { "epoch": 0.0008590757776425528, "grad_norm": 9.068361282348633, "learning_rate": 0.00012, "loss": 9.8092, "step": 18 }, { "epoch": 0.0009068022097338058, "grad_norm": 7.555756092071533, "learning_rate": 0.00012666666666666666, "loss": 8.3725, "step": 19 }, { "epoch": 0.0009545286418250588, "grad_norm": 8.059484481811523, "learning_rate": 0.00013333333333333334, "loss": 9.078, "step": 20 }, { "epoch": 0.0010022550739163117, "grad_norm": 8.002023696899414, "learning_rate": 0.00014, "loss": 8.0143, "step": 21 }, { "epoch": 0.0010499815060075646, "grad_norm": 7.968870162963867, "learning_rate": 0.00014666666666666666, "loss": 9.9062, "step": 22 }, { "epoch": 0.0010977079380988177, "grad_norm": 7.092574119567871, "learning_rate": 0.00015333333333333334, "loss": 8.0895, "step": 23 }, { "epoch": 0.0011454343701900705, "grad_norm": 6.445763111114502, "learning_rate": 0.00016, "loss": 8.1857, "step": 24 }, { "epoch": 0.0011931608022813234, "grad_norm": 6.455519676208496, "learning_rate": 0.0001666666666666667, "loss": 7.3903, "step": 25 }, { "epoch": 0.0012408872343725764, "grad_norm": 5.985039234161377, "learning_rate": 0.00017333333333333334, "loss": 7.787, "step": 26 }, { "epoch": 0.0012886136664638293, "grad_norm": 5.788380146026611, "learning_rate": 0.00018, "loss": 7.7579, "step": 27 }, { "epoch": 0.0013363400985550822, "grad_norm": 7.366934299468994, "learning_rate": 0.0001866666666666667, "loss": 8.845, "step": 28 }, { "epoch": 0.0013840665306463352, "grad_norm": 6.39143180847168, "learning_rate": 0.00019333333333333333, "loss": 7.9672, "step": 29 }, { "epoch": 0.001431792962737588, "grad_norm": 5.3974289894104, "learning_rate": 0.0002, "loss": 6.9981, "step": 30 }, { "epoch": 0.0014795193948288412, "grad_norm": 9.380810737609863, "learning_rate": 0.00019999999987497673, "loss": 7.2882, "step": 31 }, { "epoch": 0.001527245826920094, "grad_norm": 6.234286785125732, "learning_rate": 0.00019999999949990685, "loss": 7.2575, "step": 32 }, { "epoch": 0.0015749722590113469, "grad_norm": 7.635664463043213, "learning_rate": 0.00019999999887479038, "loss": 9.1833, "step": 33 }, { "epoch": 0.0016226986911026, "grad_norm": 5.845251083374023, "learning_rate": 0.00019999999799962735, "loss": 6.3639, "step": 34 }, { "epoch": 0.0016704251231938528, "grad_norm": 6.020627021789551, "learning_rate": 0.00019999999687441772, "loss": 8.5162, "step": 35 }, { "epoch": 0.0017181515552851057, "grad_norm": 7.508661270141602, "learning_rate": 0.00019999999549916153, "loss": 7.2014, "step": 36 }, { "epoch": 0.0017658779873763587, "grad_norm": 63.37688446044922, "learning_rate": 0.00019999999387385878, "loss": 8.6904, "step": 37 }, { "epoch": 0.0018136044194676116, "grad_norm": 6.846808433532715, "learning_rate": 0.00019999999199850943, "loss": 9.6905, "step": 38 }, { "epoch": 0.0018613308515588647, "grad_norm": 7.53931999206543, "learning_rate": 0.0001999999898731135, "loss": 7.0634, "step": 39 }, { "epoch": 0.0019090572836501175, "grad_norm": 7.517793655395508, "learning_rate": 0.00019999998749767108, "loss": 8.7579, "step": 40 }, { "epoch": 0.0019567837157413704, "grad_norm": 6.793584823608398, "learning_rate": 0.00019999998487218207, "loss": 6.5392, "step": 41 }, { "epoch": 0.0020045101478326235, "grad_norm": 5.361453056335449, "learning_rate": 0.0001999999819966465, "loss": 7.2578, "step": 42 }, { "epoch": 0.0020522365799238765, "grad_norm": 5.260591506958008, "learning_rate": 0.00019999997887106444, "loss": 6.9882, "step": 43 }, { "epoch": 0.002099963012015129, "grad_norm": 5.026567459106445, "learning_rate": 0.0001999999754954358, "loss": 6.6926, "step": 44 }, { "epoch": 0.0021476894441063822, "grad_norm": 4.541498184204102, "learning_rate": 0.00019999997186976066, "loss": 5.9309, "step": 45 }, { "epoch": 0.0021954158761976353, "grad_norm": 4.459763050079346, "learning_rate": 0.000199999967994039, "loss": 5.5164, "step": 46 }, { "epoch": 0.002243142308288888, "grad_norm": 5.727092266082764, "learning_rate": 0.00019999996386827084, "loss": 6.5041, "step": 47 }, { "epoch": 0.002290868740380141, "grad_norm": 5.828995227813721, "learning_rate": 0.00019999995949245616, "loss": 8.1646, "step": 48 }, { "epoch": 0.002338595172471394, "grad_norm": 5.327517509460449, "learning_rate": 0.00019999995486659505, "loss": 5.8227, "step": 49 }, { "epoch": 0.0023863216045626467, "grad_norm": 6.896716117858887, "learning_rate": 0.0001999999499906874, "loss": 7.6883, "step": 50 }, { "epoch": 0.0024340480366539, "grad_norm": 7.029616832733154, "learning_rate": 0.00019999994486473335, "loss": 8.2224, "step": 51 }, { "epoch": 0.002481774468745153, "grad_norm": 8.25517749786377, "learning_rate": 0.00019999993948873284, "loss": 8.5333, "step": 52 }, { "epoch": 0.0025295009008364055, "grad_norm": 7.740312576293945, "learning_rate": 0.0001999999338626859, "loss": 9.7768, "step": 53 }, { "epoch": 0.0025772273329276586, "grad_norm": 8.219902992248535, "learning_rate": 0.00019999992798659254, "loss": 9.2211, "step": 54 }, { "epoch": 0.0026249537650189117, "grad_norm": 5.783295154571533, "learning_rate": 0.00019999992186045277, "loss": 5.6035, "step": 55 }, { "epoch": 0.0026726801971101643, "grad_norm": 6.954888820648193, "learning_rate": 0.0001999999154842666, "loss": 7.6935, "step": 56 }, { "epoch": 0.0027204066292014174, "grad_norm": 4.878210544586182, "learning_rate": 0.00019999990885803407, "loss": 4.9927, "step": 57 }, { "epoch": 0.0027681330612926705, "grad_norm": 8.59322738647461, "learning_rate": 0.00019999990198175518, "loss": 8.2478, "step": 58 }, { "epoch": 0.0028158594933839235, "grad_norm": 7.328903675079346, "learning_rate": 0.00019999989485542996, "loss": 7.7432, "step": 59 }, { "epoch": 0.002863585925475176, "grad_norm": 7.1839599609375, "learning_rate": 0.00019999988747905842, "loss": 8.6058, "step": 60 }, { "epoch": 0.0029113123575664293, "grad_norm": 6.993515491485596, "learning_rate": 0.00019999987985264058, "loss": 8.6523, "step": 61 }, { "epoch": 0.0029590387896576823, "grad_norm": 7.754829406738281, "learning_rate": 0.00019999987197617643, "loss": 7.7147, "step": 62 }, { "epoch": 0.003006765221748935, "grad_norm": 6.434046268463135, "learning_rate": 0.00019999986384966603, "loss": 6.4865, "step": 63 }, { "epoch": 0.003054491653840188, "grad_norm": 4.396024703979492, "learning_rate": 0.0001999998554731094, "loss": 5.6103, "step": 64 }, { "epoch": 0.003102218085931441, "grad_norm": 5.07852840423584, "learning_rate": 0.00019999984684650654, "loss": 7.4121, "step": 65 }, { "epoch": 0.0031499445180226938, "grad_norm": 6.753673076629639, "learning_rate": 0.00019999983796985748, "loss": 7.207, "step": 66 }, { "epoch": 0.003197670950113947, "grad_norm": 5.469264030456543, "learning_rate": 0.00019999982884316223, "loss": 5.161, "step": 67 }, { "epoch": 0.0032453973822052, "grad_norm": 6.778441905975342, "learning_rate": 0.00019999981946642085, "loss": 6.7959, "step": 68 }, { "epoch": 0.0032931238142964525, "grad_norm": 5.161728382110596, "learning_rate": 0.0001999998098396333, "loss": 6.7924, "step": 69 }, { "epoch": 0.0033408502463877056, "grad_norm": 6.251170635223389, "learning_rate": 0.0001999997999627997, "loss": 7.2183, "step": 70 }, { "epoch": 0.0033885766784789587, "grad_norm": 6.930290222167969, "learning_rate": 0.00019999978983591997, "loss": 7.5263, "step": 71 }, { "epoch": 0.0034363031105702113, "grad_norm": 7.092405796051025, "learning_rate": 0.0001999997794589942, "loss": 9.01, "step": 72 }, { "epoch": 0.0034840295426614644, "grad_norm": 6.196310997009277, "learning_rate": 0.00019999976883202237, "loss": 7.5049, "step": 73 }, { "epoch": 0.0035317559747527175, "grad_norm": 5.769287586212158, "learning_rate": 0.00019999975795500455, "loss": 7.797, "step": 74 }, { "epoch": 0.0035794824068439706, "grad_norm": 5.971030235290527, "learning_rate": 0.00019999974682794076, "loss": 6.4961, "step": 75 }, { "epoch": 0.003627208838935223, "grad_norm": 4.6939697265625, "learning_rate": 0.000199999735450831, "loss": 6.9824, "step": 76 }, { "epoch": 0.0036749352710264763, "grad_norm": 7.828939437866211, "learning_rate": 0.00019999972382367536, "loss": 10.0984, "step": 77 }, { "epoch": 0.0037226617031177293, "grad_norm": 5.222005367279053, "learning_rate": 0.00019999971194647377, "loss": 7.0103, "step": 78 }, { "epoch": 0.003770388135208982, "grad_norm": 5.610576152801514, "learning_rate": 0.00019999969981922638, "loss": 7.0152, "step": 79 }, { "epoch": 0.003818114567300235, "grad_norm": 7.603675842285156, "learning_rate": 0.00019999968744193312, "loss": 6.8389, "step": 80 }, { "epoch": 0.003865840999391488, "grad_norm": 5.232549667358398, "learning_rate": 0.00019999967481459406, "loss": 6.354, "step": 81 }, { "epoch": 0.003913567431482741, "grad_norm": 6.516291618347168, "learning_rate": 0.00019999966193720922, "loss": 7.2243, "step": 82 }, { "epoch": 0.003961293863573994, "grad_norm": 5.813276290893555, "learning_rate": 0.00019999964880977866, "loss": 6.343, "step": 83 }, { "epoch": 0.004009020295665247, "grad_norm": 4.524149417877197, "learning_rate": 0.0001999996354323024, "loss": 7.7151, "step": 84 }, { "epoch": 0.0040567467277565, "grad_norm": 4.268386363983154, "learning_rate": 0.00019999962180478043, "loss": 6.0883, "step": 85 }, { "epoch": 0.004104473159847753, "grad_norm": 16.34364891052246, "learning_rate": 0.00019999960792721285, "loss": 10.0296, "step": 86 }, { "epoch": 0.004152199591939005, "grad_norm": 4.498778820037842, "learning_rate": 0.00019999959379959968, "loss": 7.1151, "step": 87 }, { "epoch": 0.004199926024030258, "grad_norm": 5.004024028778076, "learning_rate": 0.00019999957942194092, "loss": 6.8547, "step": 88 }, { "epoch": 0.004247652456121511, "grad_norm": 5.410999298095703, "learning_rate": 0.00019999956479423663, "loss": 7.0691, "step": 89 }, { "epoch": 0.0042953788882127645, "grad_norm": 5.083662033081055, "learning_rate": 0.00019999954991648683, "loss": 7.1524, "step": 90 }, { "epoch": 0.004343105320304018, "grad_norm": 4.258586406707764, "learning_rate": 0.0001999995347886916, "loss": 6.7408, "step": 91 }, { "epoch": 0.004390831752395271, "grad_norm": 3.56187105178833, "learning_rate": 0.00019999951941085097, "loss": 4.324, "step": 92 }, { "epoch": 0.004438558184486523, "grad_norm": 5.012781143188477, "learning_rate": 0.0001999995037829649, "loss": 6.2041, "step": 93 }, { "epoch": 0.004486284616577776, "grad_norm": 6.942680835723877, "learning_rate": 0.0001999994879050335, "loss": 5.8268, "step": 94 }, { "epoch": 0.004534011048669029, "grad_norm": 5.844181060791016, "learning_rate": 0.0001999994717770568, "loss": 6.4547, "step": 95 }, { "epoch": 0.004581737480760282, "grad_norm": 5.277218341827393, "learning_rate": 0.00019999945539903486, "loss": 6.5379, "step": 96 }, { "epoch": 0.004629463912851535, "grad_norm": 6.048219680786133, "learning_rate": 0.0001999994387709677, "loss": 6.9474, "step": 97 }, { "epoch": 0.004677190344942788, "grad_norm": 10.516984939575195, "learning_rate": 0.0001999994218928553, "loss": 6.5548, "step": 98 }, { "epoch": 0.004724916777034041, "grad_norm": 4.922785758972168, "learning_rate": 0.00019999940476469785, "loss": 6.4903, "step": 99 }, { "epoch": 0.0047726432091252935, "grad_norm": 5.574140548706055, "learning_rate": 0.00019999938738649524, "loss": 6.753, "step": 100 }, { "epoch": 0.004820369641216547, "grad_norm": 5.692989349365234, "learning_rate": 0.00019999936975824761, "loss": 7.8935, "step": 101 }, { "epoch": 0.0048680960733078, "grad_norm": 3.9527132511138916, "learning_rate": 0.00019999935187995496, "loss": 5.6235, "step": 102 }, { "epoch": 0.004915822505399053, "grad_norm": 4.653379440307617, "learning_rate": 0.00019999933375161737, "loss": 6.1317, "step": 103 }, { "epoch": 0.004963548937490306, "grad_norm": 5.225219249725342, "learning_rate": 0.00019999931537323482, "loss": 7.7239, "step": 104 }, { "epoch": 0.005011275369581559, "grad_norm": 5.7530517578125, "learning_rate": 0.00019999929674480743, "loss": 9.6706, "step": 105 }, { "epoch": 0.005059001801672811, "grad_norm": 4.033590316772461, "learning_rate": 0.0001999992778663352, "loss": 5.506, "step": 106 }, { "epoch": 0.005106728233764064, "grad_norm": 6.715490341186523, "learning_rate": 0.0001999992587378182, "loss": 7.9386, "step": 107 }, { "epoch": 0.005154454665855317, "grad_norm": 4.433899402618408, "learning_rate": 0.00019999923935925648, "loss": 7.3948, "step": 108 }, { "epoch": 0.00520218109794657, "grad_norm": 4.419589519500732, "learning_rate": 0.00019999921973065004, "loss": 5.1343, "step": 109 }, { "epoch": 0.005249907530037823, "grad_norm": 3.680023193359375, "learning_rate": 0.00019999919985199898, "loss": 5.7496, "step": 110 }, { "epoch": 0.0052976339621290764, "grad_norm": 4.177944660186768, "learning_rate": 0.00019999917972330334, "loss": 5.9936, "step": 111 }, { "epoch": 0.005345360394220329, "grad_norm": 4.320446014404297, "learning_rate": 0.0001999991593445632, "loss": 7.4622, "step": 112 }, { "epoch": 0.005393086826311582, "grad_norm": 3.077852725982666, "learning_rate": 0.00019999913871577855, "loss": 3.8202, "step": 113 }, { "epoch": 0.005440813258402835, "grad_norm": 4.988670825958252, "learning_rate": 0.0001999991178369495, "loss": 6.3419, "step": 114 }, { "epoch": 0.005488539690494088, "grad_norm": 6.100823879241943, "learning_rate": 0.00019999909670807603, "loss": 7.4201, "step": 115 }, { "epoch": 0.005536266122585341, "grad_norm": 5.46522331237793, "learning_rate": 0.00019999907532915824, "loss": 7.8764, "step": 116 }, { "epoch": 0.005583992554676594, "grad_norm": 5.026033878326416, "learning_rate": 0.0001999990537001962, "loss": 8.8585, "step": 117 }, { "epoch": 0.005631718986767847, "grad_norm": 5.422964096069336, "learning_rate": 0.0001999990318211899, "loss": 7.0363, "step": 118 }, { "epoch": 0.005679445418859099, "grad_norm": 5.562786102294922, "learning_rate": 0.0001999990096921395, "loss": 8.6947, "step": 119 }, { "epoch": 0.005727171850950352, "grad_norm": 4.488889217376709, "learning_rate": 0.00019999898731304496, "loss": 6.6352, "step": 120 }, { "epoch": 0.0057748982830416054, "grad_norm": 6.206261157989502, "learning_rate": 0.00019999896468390636, "loss": 8.9197, "step": 121 }, { "epoch": 0.0058226247151328585, "grad_norm": 3.885154962539673, "learning_rate": 0.00019999894180472376, "loss": 6.5824, "step": 122 }, { "epoch": 0.005870351147224112, "grad_norm": 4.657262325286865, "learning_rate": 0.00019999891867549725, "loss": 7.4743, "step": 123 }, { "epoch": 0.005918077579315365, "grad_norm": 4.30035924911499, "learning_rate": 0.00019999889529622684, "loss": 7.2783, "step": 124 }, { "epoch": 0.005965804011406617, "grad_norm": 9.447369575500488, "learning_rate": 0.00019999887166691262, "loss": 7.4739, "step": 125 }, { "epoch": 0.00601353044349787, "grad_norm": 4.820959568023682, "learning_rate": 0.00019999884778755465, "loss": 6.1332, "step": 126 }, { "epoch": 0.006061256875589123, "grad_norm": 6.934280872344971, "learning_rate": 0.00019999882365815294, "loss": 6.8292, "step": 127 }, { "epoch": 0.006108983307680376, "grad_norm": 4.936176776885986, "learning_rate": 0.00019999879927870762, "loss": 8.7501, "step": 128 }, { "epoch": 0.006156709739771629, "grad_norm": 5.515492916107178, "learning_rate": 0.0001999987746492187, "loss": 7.3772, "step": 129 }, { "epoch": 0.006204436171862882, "grad_norm": 5.130324363708496, "learning_rate": 0.00019999874976968626, "loss": 9.1506, "step": 130 }, { "epoch": 0.006252162603954135, "grad_norm": 4.996119976043701, "learning_rate": 0.00019999872464011037, "loss": 6.213, "step": 131 }, { "epoch": 0.0062998890360453875, "grad_norm": 4.5611114501953125, "learning_rate": 0.0001999986992604911, "loss": 7.0564, "step": 132 }, { "epoch": 0.006347615468136641, "grad_norm": 12.032221794128418, "learning_rate": 0.0001999986736308285, "loss": 4.7228, "step": 133 }, { "epoch": 0.006395341900227894, "grad_norm": 4.067997455596924, "learning_rate": 0.0001999986477511226, "loss": 6.1123, "step": 134 }, { "epoch": 0.006443068332319147, "grad_norm": 5.277679920196533, "learning_rate": 0.0001999986216213735, "loss": 8.8908, "step": 135 }, { "epoch": 0.0064907947644104, "grad_norm": 6.415206432342529, "learning_rate": 0.0001999985952415813, "loss": 7.0391, "step": 136 }, { "epoch": 0.006538521196501653, "grad_norm": 6.522372245788574, "learning_rate": 0.000199998568611746, "loss": 8.9838, "step": 137 }, { "epoch": 0.006586247628592905, "grad_norm": 5.012221336364746, "learning_rate": 0.0001999985417318677, "loss": 7.661, "step": 138 }, { "epoch": 0.006633974060684158, "grad_norm": 5.40250825881958, "learning_rate": 0.00019999851460194646, "loss": 7.2033, "step": 139 }, { "epoch": 0.006681700492775411, "grad_norm": 5.069441795349121, "learning_rate": 0.00019999848722198236, "loss": 6.6148, "step": 140 }, { "epoch": 0.006729426924866664, "grad_norm": 5.3148064613342285, "learning_rate": 0.00019999845959197546, "loss": 7.3913, "step": 141 }, { "epoch": 0.006777153356957917, "grad_norm": 4.556090354919434, "learning_rate": 0.00019999843171192582, "loss": 4.5144, "step": 142 }, { "epoch": 0.0068248797890491705, "grad_norm": 4.365043640136719, "learning_rate": 0.0001999984035818335, "loss": 6.4603, "step": 143 }, { "epoch": 0.006872606221140423, "grad_norm": 7.352154731750488, "learning_rate": 0.0001999983752016986, "loss": 6.9395, "step": 144 }, { "epoch": 0.006920332653231676, "grad_norm": 5.900674819946289, "learning_rate": 0.00019999834657152122, "loss": 5.3865, "step": 145 }, { "epoch": 0.006968059085322929, "grad_norm": 4.4542670249938965, "learning_rate": 0.00019999831769130133, "loss": 5.7073, "step": 146 }, { "epoch": 0.007015785517414182, "grad_norm": 4.517548561096191, "learning_rate": 0.0001999982885610391, "loss": 5.8635, "step": 147 }, { "epoch": 0.007063511949505435, "grad_norm": 4.733120441436768, "learning_rate": 0.00019999825918073455, "loss": 7.162, "step": 148 }, { "epoch": 0.007111238381596688, "grad_norm": 5.98193359375, "learning_rate": 0.00019999822955038776, "loss": 8.3258, "step": 149 }, { "epoch": 0.007158964813687941, "grad_norm": 5.3873610496521, "learning_rate": 0.00019999819966999883, "loss": 7.9001, "step": 150 }, { "epoch": 0.007206691245779193, "grad_norm": 8.724983215332031, "learning_rate": 0.00019999816953956783, "loss": 8.8156, "step": 151 }, { "epoch": 0.007254417677870446, "grad_norm": 5.680585861206055, "learning_rate": 0.0001999981391590948, "loss": 6.7082, "step": 152 }, { "epoch": 0.0073021441099616995, "grad_norm": 6.000738620758057, "learning_rate": 0.00019999810852857985, "loss": 7.2123, "step": 153 }, { "epoch": 0.0073498705420529525, "grad_norm": 5.984689712524414, "learning_rate": 0.00019999807764802303, "loss": 6.9426, "step": 154 }, { "epoch": 0.007397596974144206, "grad_norm": 6.926332473754883, "learning_rate": 0.00019999804651742447, "loss": 7.4237, "step": 155 }, { "epoch": 0.007445323406235459, "grad_norm": 4.661206245422363, "learning_rate": 0.00019999801513678417, "loss": 7.7377, "step": 156 }, { "epoch": 0.007493049838326711, "grad_norm": 4.511832237243652, "learning_rate": 0.00019999798350610229, "loss": 5.908, "step": 157 }, { "epoch": 0.007540776270417964, "grad_norm": 4.867547512054443, "learning_rate": 0.00019999795162537886, "loss": 6.5205, "step": 158 }, { "epoch": 0.007588502702509217, "grad_norm": 6.225820541381836, "learning_rate": 0.00019999791949461393, "loss": 7.0262, "step": 159 }, { "epoch": 0.00763622913460047, "grad_norm": 5.769981384277344, "learning_rate": 0.00019999788711380764, "loss": 6.9616, "step": 160 }, { "epoch": 0.007683955566691723, "grad_norm": 5.535105228424072, "learning_rate": 0.00019999785448296005, "loss": 7.2659, "step": 161 }, { "epoch": 0.007731681998782976, "grad_norm": 4.4427642822265625, "learning_rate": 0.00019999782160207125, "loss": 5.8697, "step": 162 }, { "epoch": 0.007779408430874229, "grad_norm": 7.328268527984619, "learning_rate": 0.0001999977884711413, "loss": 6.6268, "step": 163 }, { "epoch": 0.007827134862965482, "grad_norm": 6.006573677062988, "learning_rate": 0.00019999775509017028, "loss": 9.1153, "step": 164 }, { "epoch": 0.007874861295056735, "grad_norm": 4.760677814483643, "learning_rate": 0.00019999772145915833, "loss": 6.9565, "step": 165 }, { "epoch": 0.007922587727147988, "grad_norm": 4.592026233673096, "learning_rate": 0.00019999768757810547, "loss": 5.6614, "step": 166 }, { "epoch": 0.00797031415923924, "grad_norm": 4.596296310424805, "learning_rate": 0.00019999765344701182, "loss": 6.0717, "step": 167 }, { "epoch": 0.008018040591330494, "grad_norm": 5.7095417976379395, "learning_rate": 0.00019999761906587743, "loss": 8.2684, "step": 168 }, { "epoch": 0.008065767023421747, "grad_norm": 5.1179986000061035, "learning_rate": 0.00019999758443470245, "loss": 6.6392, "step": 169 }, { "epoch": 0.008113493455513, "grad_norm": 5.473196983337402, "learning_rate": 0.00019999754955348688, "loss": 5.9417, "step": 170 }, { "epoch": 0.008161219887604253, "grad_norm": 4.3178324699401855, "learning_rate": 0.0001999975144222309, "loss": 6.3279, "step": 171 }, { "epoch": 0.008208946319695506, "grad_norm": 6.818999767303467, "learning_rate": 0.00019999747904093451, "loss": 7.9789, "step": 172 }, { "epoch": 0.008256672751786757, "grad_norm": 4.3126983642578125, "learning_rate": 0.00019999744340959787, "loss": 6.9174, "step": 173 }, { "epoch": 0.00830439918387801, "grad_norm": 5.491178512573242, "learning_rate": 0.00019999740752822104, "loss": 7.5142, "step": 174 }, { "epoch": 0.008352125615969264, "grad_norm": 6.331181526184082, "learning_rate": 0.0001999973713968041, "loss": 7.3434, "step": 175 }, { "epoch": 0.008399852048060517, "grad_norm": 5.401471138000488, "learning_rate": 0.00019999733501534713, "loss": 6.7435, "step": 176 }, { "epoch": 0.00844757848015177, "grad_norm": 4.26669979095459, "learning_rate": 0.00019999729838385025, "loss": 5.7267, "step": 177 }, { "epoch": 0.008495304912243023, "grad_norm": 3.7150814533233643, "learning_rate": 0.00019999726150231355, "loss": 5.5032, "step": 178 }, { "epoch": 0.008543031344334276, "grad_norm": 5.715814113616943, "learning_rate": 0.00019999722437073712, "loss": 7.3256, "step": 179 }, { "epoch": 0.008590757776425529, "grad_norm": 4.25509786605835, "learning_rate": 0.000199997186989121, "loss": 5.542, "step": 180 }, { "epoch": 0.008638484208516782, "grad_norm": 4.596546173095703, "learning_rate": 0.0001999971493574654, "loss": 6.845, "step": 181 }, { "epoch": 0.008686210640608035, "grad_norm": 5.3774309158325195, "learning_rate": 0.00019999711147577028, "loss": 7.6096, "step": 182 }, { "epoch": 0.008733937072699288, "grad_norm": 5.344915390014648, "learning_rate": 0.00019999707334403586, "loss": 7.2455, "step": 183 }, { "epoch": 0.008781663504790541, "grad_norm": 5.402298450469971, "learning_rate": 0.00019999703496226213, "loss": 7.7967, "step": 184 }, { "epoch": 0.008829389936881794, "grad_norm": 3.6488380432128906, "learning_rate": 0.00019999699633044927, "loss": 4.383, "step": 185 }, { "epoch": 0.008877116368973046, "grad_norm": 6.237378120422363, "learning_rate": 0.0001999969574485973, "loss": 8.9452, "step": 186 }, { "epoch": 0.008924842801064299, "grad_norm": 6.196676731109619, "learning_rate": 0.00019999691831670634, "loss": 5.9055, "step": 187 }, { "epoch": 0.008972569233155552, "grad_norm": 5.825589656829834, "learning_rate": 0.00019999687893477655, "loss": 7.6183, "step": 188 }, { "epoch": 0.009020295665246805, "grad_norm": 4.964183807373047, "learning_rate": 0.00019999683930280795, "loss": 6.7619, "step": 189 }, { "epoch": 0.009068022097338058, "grad_norm": 5.618199825286865, "learning_rate": 0.0001999967994208007, "loss": 8.0128, "step": 190 }, { "epoch": 0.009115748529429311, "grad_norm": 4.945575714111328, "learning_rate": 0.00019999675928875484, "loss": 5.723, "step": 191 }, { "epoch": 0.009163474961520564, "grad_norm": 3.8588993549346924, "learning_rate": 0.0001999967189066705, "loss": 6.2638, "step": 192 }, { "epoch": 0.009211201393611817, "grad_norm": 5.020188331604004, "learning_rate": 0.0001999966782745478, "loss": 7.6009, "step": 193 }, { "epoch": 0.00925892782570307, "grad_norm": 4.792932033538818, "learning_rate": 0.0001999966373923868, "loss": 6.7725, "step": 194 }, { "epoch": 0.009306654257794323, "grad_norm": 4.707699775695801, "learning_rate": 0.00019999659626018763, "loss": 6.275, "step": 195 }, { "epoch": 0.009354380689885576, "grad_norm": 4.85133171081543, "learning_rate": 0.0001999965548779504, "loss": 8.7349, "step": 196 }, { "epoch": 0.00940210712197683, "grad_norm": 5.022207260131836, "learning_rate": 0.00019999651324567518, "loss": 7.265, "step": 197 }, { "epoch": 0.009449833554068083, "grad_norm": 6.977590560913086, "learning_rate": 0.0001999964713633621, "loss": 6.1847, "step": 198 }, { "epoch": 0.009497559986159334, "grad_norm": 4.008247375488281, "learning_rate": 0.00019999642923101128, "loss": 5.4763, "step": 199 }, { "epoch": 0.009545286418250587, "grad_norm": 5.237358093261719, "learning_rate": 0.0001999963868486228, "loss": 8.1989, "step": 200 }, { "epoch": 0.009545286418250587, "eval_loss": 1.6985455751419067, "eval_runtime": 96.4787, "eval_samples_per_second": 8.738, "eval_steps_per_second": 4.374, "step": 200 }, { "epoch": 0.00959301285034184, "grad_norm": 3.706386089324951, "learning_rate": 0.00019999634421619673, "loss": 5.2003, "step": 201 }, { "epoch": 0.009640739282433093, "grad_norm": 4.2931294441223145, "learning_rate": 0.00019999630133373325, "loss": 5.9312, "step": 202 }, { "epoch": 0.009688465714524346, "grad_norm": 4.851586818695068, "learning_rate": 0.00019999625820123245, "loss": 6.7144, "step": 203 }, { "epoch": 0.0097361921466156, "grad_norm": 5.349869728088379, "learning_rate": 0.0001999962148186944, "loss": 6.6918, "step": 204 }, { "epoch": 0.009783918578706852, "grad_norm": 5.367208003997803, "learning_rate": 0.00019999617118611924, "loss": 7.5381, "step": 205 }, { "epoch": 0.009831645010798105, "grad_norm": 4.719504356384277, "learning_rate": 0.00019999612730350707, "loss": 6.669, "step": 206 }, { "epoch": 0.009879371442889359, "grad_norm": 5.995404243469238, "learning_rate": 0.000199996083170858, "loss": 7.8411, "step": 207 }, { "epoch": 0.009927097874980612, "grad_norm": 5.326706409454346, "learning_rate": 0.0001999960387881721, "loss": 6.2895, "step": 208 }, { "epoch": 0.009974824307071865, "grad_norm": 4.975574016571045, "learning_rate": 0.00019999599415544957, "loss": 5.5179, "step": 209 }, { "epoch": 0.010022550739163118, "grad_norm": 5.673884391784668, "learning_rate": 0.00019999594927269047, "loss": 6.7112, "step": 210 }, { "epoch": 0.01007027717125437, "grad_norm": 5.526071071624756, "learning_rate": 0.0001999959041398949, "loss": 6.4561, "step": 211 }, { "epoch": 0.010118003603345622, "grad_norm": 5.906366348266602, "learning_rate": 0.000199995858757063, "loss": 6.8685, "step": 212 }, { "epoch": 0.010165730035436875, "grad_norm": 5.4884257316589355, "learning_rate": 0.0001999958131241949, "loss": 8.4239, "step": 213 }, { "epoch": 0.010213456467528128, "grad_norm": 3.8919754028320312, "learning_rate": 0.0001999957672412906, "loss": 6.4004, "step": 214 }, { "epoch": 0.010261182899619381, "grad_norm": 5.335618495941162, "learning_rate": 0.0001999957211083504, "loss": 5.9267, "step": 215 }, { "epoch": 0.010308909331710634, "grad_norm": 5.428693771362305, "learning_rate": 0.00019999567472537424, "loss": 7.1497, "step": 216 }, { "epoch": 0.010356635763801888, "grad_norm": 3.8885908126831055, "learning_rate": 0.0001999956280923623, "loss": 5.886, "step": 217 }, { "epoch": 0.01040436219589314, "grad_norm": 4.225867748260498, "learning_rate": 0.0001999955812093148, "loss": 7.0125, "step": 218 }, { "epoch": 0.010452088627984394, "grad_norm": 4.775414943695068, "learning_rate": 0.0001999955340762317, "loss": 6.0621, "step": 219 }, { "epoch": 0.010499815060075647, "grad_norm": 4.572904586791992, "learning_rate": 0.00019999548669311318, "loss": 5.5764, "step": 220 }, { "epoch": 0.0105475414921669, "grad_norm": 5.60729455947876, "learning_rate": 0.00019999543905995938, "loss": 8.5128, "step": 221 }, { "epoch": 0.010595267924258153, "grad_norm": 5.341472148895264, "learning_rate": 0.0001999953911767704, "loss": 7.4026, "step": 222 }, { "epoch": 0.010642994356349406, "grad_norm": 3.933389663696289, "learning_rate": 0.00019999534304354635, "loss": 4.8309, "step": 223 }, { "epoch": 0.010690720788440657, "grad_norm": 5.747697830200195, "learning_rate": 0.00019999529466028737, "loss": 7.3435, "step": 224 }, { "epoch": 0.01073844722053191, "grad_norm": 4.927333831787109, "learning_rate": 0.00019999524602699358, "loss": 6.4677, "step": 225 }, { "epoch": 0.010786173652623163, "grad_norm": 3.5867760181427, "learning_rate": 0.0001999951971436651, "loss": 5.4795, "step": 226 }, { "epoch": 0.010833900084714417, "grad_norm": 4.812954425811768, "learning_rate": 0.000199995148010302, "loss": 6.1549, "step": 227 }, { "epoch": 0.01088162651680567, "grad_norm": 6.369991302490234, "learning_rate": 0.00019999509862690448, "loss": 7.4918, "step": 228 }, { "epoch": 0.010929352948896923, "grad_norm": 5.212950229644775, "learning_rate": 0.00019999504899347262, "loss": 5.5875, "step": 229 }, { "epoch": 0.010977079380988176, "grad_norm": 4.195418357849121, "learning_rate": 0.00019999499911000656, "loss": 5.2198, "step": 230 }, { "epoch": 0.011024805813079429, "grad_norm": 4.676207065582275, "learning_rate": 0.00019999494897650645, "loss": 6.0417, "step": 231 }, { "epoch": 0.011072532245170682, "grad_norm": 4.369086265563965, "learning_rate": 0.00019999489859297235, "loss": 4.7328, "step": 232 }, { "epoch": 0.011120258677261935, "grad_norm": 5.925018310546875, "learning_rate": 0.00019999484795940443, "loss": 6.0987, "step": 233 }, { "epoch": 0.011167985109353188, "grad_norm": 4.76014518737793, "learning_rate": 0.00019999479707580282, "loss": 6.571, "step": 234 }, { "epoch": 0.011215711541444441, "grad_norm": 5.120473861694336, "learning_rate": 0.00019999474594216762, "loss": 7.5342, "step": 235 }, { "epoch": 0.011263437973535694, "grad_norm": 5.6909027099609375, "learning_rate": 0.000199994694558499, "loss": 8.6062, "step": 236 }, { "epoch": 0.011311164405626946, "grad_norm": 8.290360450744629, "learning_rate": 0.00019999464292479703, "loss": 7.7553, "step": 237 }, { "epoch": 0.011358890837718199, "grad_norm": 5.976494312286377, "learning_rate": 0.00019999459104106187, "loss": 8.6744, "step": 238 }, { "epoch": 0.011406617269809452, "grad_norm": 10.440987586975098, "learning_rate": 0.0001999945389072937, "loss": 8.9354, "step": 239 }, { "epoch": 0.011454343701900705, "grad_norm": 4.592039585113525, "learning_rate": 0.00019999448652349258, "loss": 7.377, "step": 240 }, { "epoch": 0.011502070133991958, "grad_norm": 4.670874118804932, "learning_rate": 0.00019999443388965863, "loss": 7.5659, "step": 241 }, { "epoch": 0.011549796566083211, "grad_norm": 4.679845809936523, "learning_rate": 0.00019999438100579204, "loss": 6.6256, "step": 242 }, { "epoch": 0.011597522998174464, "grad_norm": 4.715898036956787, "learning_rate": 0.0001999943278718929, "loss": 6.702, "step": 243 }, { "epoch": 0.011645249430265717, "grad_norm": 7.395649433135986, "learning_rate": 0.0001999942744879614, "loss": 7.1643, "step": 244 }, { "epoch": 0.01169297586235697, "grad_norm": 4.151764392852783, "learning_rate": 0.0001999942208539976, "loss": 5.9845, "step": 245 }, { "epoch": 0.011740702294448223, "grad_norm": 5.3397345542907715, "learning_rate": 0.00019999416697000165, "loss": 8.978, "step": 246 }, { "epoch": 0.011788428726539476, "grad_norm": 5.556086540222168, "learning_rate": 0.00019999411283597374, "loss": 7.4888, "step": 247 }, { "epoch": 0.01183615515863073, "grad_norm": 4.385659217834473, "learning_rate": 0.00019999405845191393, "loss": 6.167, "step": 248 }, { "epoch": 0.011883881590721982, "grad_norm": 4.701001167297363, "learning_rate": 0.00019999400381782244, "loss": 6.2042, "step": 249 }, { "epoch": 0.011931608022813234, "grad_norm": 4.775174140930176, "learning_rate": 0.0001999939489336993, "loss": 7.4549, "step": 250 }, { "epoch": 0.011979334454904487, "grad_norm": 5.019827365875244, "learning_rate": 0.00019999389379954477, "loss": 5.8486, "step": 251 }, { "epoch": 0.01202706088699574, "grad_norm": 5.453334331512451, "learning_rate": 0.00019999383841535888, "loss": 7.1586, "step": 252 }, { "epoch": 0.012074787319086993, "grad_norm": 4.455638408660889, "learning_rate": 0.00019999378278114183, "loss": 5.3958, "step": 253 }, { "epoch": 0.012122513751178246, "grad_norm": 4.937852382659912, "learning_rate": 0.00019999372689689376, "loss": 7.0083, "step": 254 }, { "epoch": 0.012170240183269499, "grad_norm": 4.422460079193115, "learning_rate": 0.00019999367076261476, "loss": 6.4996, "step": 255 }, { "epoch": 0.012217966615360752, "grad_norm": 3.780395269393921, "learning_rate": 0.000199993614378305, "loss": 6.0372, "step": 256 }, { "epoch": 0.012265693047452005, "grad_norm": 5.636499404907227, "learning_rate": 0.00019999355774396465, "loss": 6.3515, "step": 257 }, { "epoch": 0.012313419479543258, "grad_norm": 5.77448844909668, "learning_rate": 0.0001999935008595938, "loss": 8.0423, "step": 258 }, { "epoch": 0.012361145911634511, "grad_norm": 4.834639549255371, "learning_rate": 0.00019999344372519264, "loss": 5.8949, "step": 259 }, { "epoch": 0.012408872343725764, "grad_norm": 6.200581073760986, "learning_rate": 0.00019999338634076126, "loss": 8.6784, "step": 260 }, { "epoch": 0.012456598775817018, "grad_norm": 5.017780303955078, "learning_rate": 0.00019999332870629987, "loss": 5.4993, "step": 261 }, { "epoch": 0.01250432520790827, "grad_norm": 4.737110137939453, "learning_rate": 0.00019999327082180854, "loss": 6.2233, "step": 262 }, { "epoch": 0.012552051639999522, "grad_norm": 4.9615278244018555, "learning_rate": 0.00019999321268728747, "loss": 6.5653, "step": 263 }, { "epoch": 0.012599778072090775, "grad_norm": 4.462545871734619, "learning_rate": 0.00019999315430273683, "loss": 6.815, "step": 264 }, { "epoch": 0.012647504504182028, "grad_norm": 5.788268566131592, "learning_rate": 0.00019999309566815665, "loss": 5.2953, "step": 265 }, { "epoch": 0.012695230936273281, "grad_norm": 5.634494304656982, "learning_rate": 0.0001999930367835472, "loss": 6.1009, "step": 266 }, { "epoch": 0.012742957368364534, "grad_norm": 6.640275955200195, "learning_rate": 0.00019999297764890854, "loss": 6.454, "step": 267 }, { "epoch": 0.012790683800455787, "grad_norm": 5.759767055511475, "learning_rate": 0.0001999929182642409, "loss": 8.0766, "step": 268 }, { "epoch": 0.01283841023254704, "grad_norm": 4.624085426330566, "learning_rate": 0.00019999285862954436, "loss": 6.848, "step": 269 }, { "epoch": 0.012886136664638293, "grad_norm": 3.952364206314087, "learning_rate": 0.00019999279874481908, "loss": 5.7864, "step": 270 }, { "epoch": 0.012933863096729547, "grad_norm": 5.424389839172363, "learning_rate": 0.00019999273861006525, "loss": 6.1687, "step": 271 }, { "epoch": 0.0129815895288208, "grad_norm": 4.6196088790893555, "learning_rate": 0.00019999267822528297, "loss": 6.6814, "step": 272 }, { "epoch": 0.013029315960912053, "grad_norm": 4.935333251953125, "learning_rate": 0.00019999261759047243, "loss": 5.5486, "step": 273 }, { "epoch": 0.013077042393003306, "grad_norm": 5.395534038543701, "learning_rate": 0.00019999255670563376, "loss": 6.8173, "step": 274 }, { "epoch": 0.013124768825094559, "grad_norm": 6.942729949951172, "learning_rate": 0.0001999924955707671, "loss": 8.094, "step": 275 }, { "epoch": 0.01317249525718581, "grad_norm": 5.219112396240234, "learning_rate": 0.00019999243418587266, "loss": 6.5134, "step": 276 }, { "epoch": 0.013220221689277063, "grad_norm": 4.332059383392334, "learning_rate": 0.00019999237255095053, "loss": 6.2888, "step": 277 }, { "epoch": 0.013267948121368316, "grad_norm": 5.0153679847717285, "learning_rate": 0.0001999923106660009, "loss": 6.3642, "step": 278 }, { "epoch": 0.01331567455345957, "grad_norm": 3.461423873901367, "learning_rate": 0.0001999922485310239, "loss": 5.3825, "step": 279 }, { "epoch": 0.013363400985550822, "grad_norm": 4.702268600463867, "learning_rate": 0.0001999921861460197, "loss": 5.971, "step": 280 }, { "epoch": 0.013411127417642076, "grad_norm": 4.084397792816162, "learning_rate": 0.00019999212351098846, "loss": 6.3295, "step": 281 }, { "epoch": 0.013458853849733329, "grad_norm": 6.114694118499756, "learning_rate": 0.00019999206062593032, "loss": 7.4146, "step": 282 }, { "epoch": 0.013506580281824582, "grad_norm": 6.846054553985596, "learning_rate": 0.00019999199749084546, "loss": 6.3659, "step": 283 }, { "epoch": 0.013554306713915835, "grad_norm": 4.534778594970703, "learning_rate": 0.00019999193410573404, "loss": 6.4923, "step": 284 }, { "epoch": 0.013602033146007088, "grad_norm": 8.201200485229492, "learning_rate": 0.0001999918704705962, "loss": 8.3697, "step": 285 }, { "epoch": 0.013649759578098341, "grad_norm": 4.86665153503418, "learning_rate": 0.00019999180658543207, "loss": 6.4496, "step": 286 }, { "epoch": 0.013697486010189594, "grad_norm": 5.482726573944092, "learning_rate": 0.00019999174245024186, "loss": 6.7978, "step": 287 }, { "epoch": 0.013745212442280845, "grad_norm": 5.493237018585205, "learning_rate": 0.00019999167806502573, "loss": 7.1984, "step": 288 }, { "epoch": 0.013792938874372098, "grad_norm": 5.953492641448975, "learning_rate": 0.0001999916134297838, "loss": 7.8598, "step": 289 }, { "epoch": 0.013840665306463351, "grad_norm": 6.032693386077881, "learning_rate": 0.0001999915485445163, "loss": 9.7618, "step": 290 }, { "epoch": 0.013888391738554605, "grad_norm": 5.2243733406066895, "learning_rate": 0.00019999148340922333, "loss": 8.777, "step": 291 }, { "epoch": 0.013936118170645858, "grad_norm": 4.763432025909424, "learning_rate": 0.00019999141802390505, "loss": 8.0905, "step": 292 }, { "epoch": 0.01398384460273711, "grad_norm": 4.205863952636719, "learning_rate": 0.00019999135238856164, "loss": 4.598, "step": 293 }, { "epoch": 0.014031571034828364, "grad_norm": 4.153672218322754, "learning_rate": 0.0001999912865031933, "loss": 7.0555, "step": 294 }, { "epoch": 0.014079297466919617, "grad_norm": 4.621336460113525, "learning_rate": 0.00019999122036780013, "loss": 5.8933, "step": 295 }, { "epoch": 0.01412702389901087, "grad_norm": 4.004661560058594, "learning_rate": 0.00019999115398238235, "loss": 7.7509, "step": 296 }, { "epoch": 0.014174750331102123, "grad_norm": 4.914793014526367, "learning_rate": 0.00019999108734694012, "loss": 7.7317, "step": 297 }, { "epoch": 0.014222476763193376, "grad_norm": 5.124854564666748, "learning_rate": 0.00019999102046147358, "loss": 7.1103, "step": 298 }, { "epoch": 0.01427020319528463, "grad_norm": 4.222902297973633, "learning_rate": 0.0001999909533259829, "loss": 6.7856, "step": 299 }, { "epoch": 0.014317929627375882, "grad_norm": 3.855794668197632, "learning_rate": 0.00019999088594046827, "loss": 7.6669, "step": 300 }, { "epoch": 0.014365656059467134, "grad_norm": 4.996542453765869, "learning_rate": 0.00019999081830492983, "loss": 7.5863, "step": 301 }, { "epoch": 0.014413382491558387, "grad_norm": 3.9295811653137207, "learning_rate": 0.0001999907504193678, "loss": 6.4964, "step": 302 }, { "epoch": 0.01446110892364964, "grad_norm": 5.005653381347656, "learning_rate": 0.00019999068228378225, "loss": 7.0248, "step": 303 }, { "epoch": 0.014508835355740893, "grad_norm": 5.204959392547607, "learning_rate": 0.00019999061389817347, "loss": 6.8771, "step": 304 }, { "epoch": 0.014556561787832146, "grad_norm": 5.9217681884765625, "learning_rate": 0.00019999054526254154, "loss": 6.7342, "step": 305 }, { "epoch": 0.014604288219923399, "grad_norm": 6.10482931137085, "learning_rate": 0.0001999904763768867, "loss": 7.8614, "step": 306 }, { "epoch": 0.014652014652014652, "grad_norm": 4.919510841369629, "learning_rate": 0.00019999040724120909, "loss": 6.5849, "step": 307 }, { "epoch": 0.014699741084105905, "grad_norm": 3.9289333820343018, "learning_rate": 0.00019999033785550886, "loss": 4.5725, "step": 308 }, { "epoch": 0.014747467516197158, "grad_norm": 5.739213943481445, "learning_rate": 0.0001999902682197862, "loss": 7.8427, "step": 309 }, { "epoch": 0.014795193948288411, "grad_norm": 3.7051210403442383, "learning_rate": 0.00019999019833404132, "loss": 6.2502, "step": 310 }, { "epoch": 0.014842920380379664, "grad_norm": 4.647250175476074, "learning_rate": 0.00019999012819827433, "loss": 6.5101, "step": 311 }, { "epoch": 0.014890646812470917, "grad_norm": 3.8817758560180664, "learning_rate": 0.0001999900578124855, "loss": 4.8714, "step": 312 }, { "epoch": 0.01493837324456217, "grad_norm": 4.8853607177734375, "learning_rate": 0.0001999899871766749, "loss": 7.3876, "step": 313 }, { "epoch": 0.014986099676653422, "grad_norm": 5.011137962341309, "learning_rate": 0.00019998991629084276, "loss": 5.4221, "step": 314 }, { "epoch": 0.015033826108744675, "grad_norm": 4.79543399810791, "learning_rate": 0.00019998984515498926, "loss": 5.3682, "step": 315 }, { "epoch": 0.015081552540835928, "grad_norm": 4.967861652374268, "learning_rate": 0.00019998977376911454, "loss": 6.0652, "step": 316 }, { "epoch": 0.015129278972927181, "grad_norm": 5.12961483001709, "learning_rate": 0.00019998970213321883, "loss": 6.722, "step": 317 }, { "epoch": 0.015177005405018434, "grad_norm": 5.37574577331543, "learning_rate": 0.0001999896302473023, "loss": 6.4294, "step": 318 }, { "epoch": 0.015224731837109687, "grad_norm": 4.319894790649414, "learning_rate": 0.0001999895581113651, "loss": 6.6173, "step": 319 }, { "epoch": 0.01527245826920094, "grad_norm": 4.902976989746094, "learning_rate": 0.0001999894857254074, "loss": 6.5298, "step": 320 }, { "epoch": 0.015320184701292193, "grad_norm": 5.665980815887451, "learning_rate": 0.00019998941308942944, "loss": 6.9341, "step": 321 }, { "epoch": 0.015367911133383446, "grad_norm": 4.651304721832275, "learning_rate": 0.00019998934020343137, "loss": 5.3411, "step": 322 }, { "epoch": 0.0154156375654747, "grad_norm": 5.079586982727051, "learning_rate": 0.00019998926706741335, "loss": 5.7383, "step": 323 }, { "epoch": 0.015463363997565953, "grad_norm": 5.262585163116455, "learning_rate": 0.0001999891936813756, "loss": 7.7724, "step": 324 }, { "epoch": 0.015511090429657206, "grad_norm": 4.025634288787842, "learning_rate": 0.00019998912004531828, "loss": 4.6858, "step": 325 }, { "epoch": 0.015558816861748459, "grad_norm": 4.3046393394470215, "learning_rate": 0.00019998904615924158, "loss": 5.6154, "step": 326 }, { "epoch": 0.01560654329383971, "grad_norm": 5.861588478088379, "learning_rate": 0.00019998897202314569, "loss": 7.1517, "step": 327 }, { "epoch": 0.015654269725930963, "grad_norm": 4.287512302398682, "learning_rate": 0.00019998889763703077, "loss": 5.763, "step": 328 }, { "epoch": 0.015701996158022216, "grad_norm": 5.883302211761475, "learning_rate": 0.00019998882300089704, "loss": 7.2864, "step": 329 }, { "epoch": 0.01574972259011347, "grad_norm": 5.099184989929199, "learning_rate": 0.0001999887481147447, "loss": 7.4756, "step": 330 }, { "epoch": 0.015797449022204722, "grad_norm": 4.795839309692383, "learning_rate": 0.00019998867297857387, "loss": 7.5285, "step": 331 }, { "epoch": 0.015845175454295975, "grad_norm": 4.008845329284668, "learning_rate": 0.0001999885975923848, "loss": 5.1818, "step": 332 }, { "epoch": 0.01589290188638723, "grad_norm": 5.107858180999756, "learning_rate": 0.00019998852195617767, "loss": 7.3373, "step": 333 }, { "epoch": 0.01594062831847848, "grad_norm": 4.369069576263428, "learning_rate": 0.00019998844606995263, "loss": 6.1056, "step": 334 }, { "epoch": 0.015988354750569735, "grad_norm": 4.465891361236572, "learning_rate": 0.0001999883699337099, "loss": 6.0833, "step": 335 }, { "epoch": 0.016036081182660988, "grad_norm": 4.340141773223877, "learning_rate": 0.0001999882935474497, "loss": 7.0717, "step": 336 }, { "epoch": 0.01608380761475224, "grad_norm": 4.589889049530029, "learning_rate": 0.00019998821691117217, "loss": 5.5123, "step": 337 }, { "epoch": 0.016131534046843494, "grad_norm": 4.784783840179443, "learning_rate": 0.0001999881400248775, "loss": 7.857, "step": 338 }, { "epoch": 0.016179260478934747, "grad_norm": 5.159041881561279, "learning_rate": 0.00019998806288856592, "loss": 6.3091, "step": 339 }, { "epoch": 0.016226986911026, "grad_norm": 4.404861927032471, "learning_rate": 0.00019998798550223762, "loss": 7.1938, "step": 340 }, { "epoch": 0.016274713343117253, "grad_norm": 4.286370277404785, "learning_rate": 0.00019998790786589275, "loss": 7.7826, "step": 341 }, { "epoch": 0.016322439775208506, "grad_norm": 4.845062732696533, "learning_rate": 0.00019998782997953155, "loss": 7.5879, "step": 342 }, { "epoch": 0.01637016620729976, "grad_norm": 4.884746074676514, "learning_rate": 0.0001999877518431542, "loss": 6.0634, "step": 343 }, { "epoch": 0.016417892639391012, "grad_norm": 5.123991012573242, "learning_rate": 0.00019998767345676087, "loss": 7.1257, "step": 344 }, { "epoch": 0.016465619071482265, "grad_norm": 5.908779144287109, "learning_rate": 0.0001999875948203518, "loss": 9.0678, "step": 345 }, { "epoch": 0.016513345503573515, "grad_norm": 4.132058143615723, "learning_rate": 0.00019998751593392714, "loss": 5.7171, "step": 346 }, { "epoch": 0.016561071935664768, "grad_norm": 3.8147695064544678, "learning_rate": 0.00019998743679748712, "loss": 5.4728, "step": 347 }, { "epoch": 0.01660879836775602, "grad_norm": 3.7608211040496826, "learning_rate": 0.00019998735741103194, "loss": 5.5926, "step": 348 }, { "epoch": 0.016656524799847274, "grad_norm": 5.518613815307617, "learning_rate": 0.00019998727777456178, "loss": 7.8168, "step": 349 }, { "epoch": 0.016704251231938527, "grad_norm": 5.009797096252441, "learning_rate": 0.00019998719788807688, "loss": 7.069, "step": 350 }, { "epoch": 0.01675197766402978, "grad_norm": 4.233580589294434, "learning_rate": 0.00019998711775157734, "loss": 6.5305, "step": 351 }, { "epoch": 0.016799704096121033, "grad_norm": 3.6126842498779297, "learning_rate": 0.0001999870373650635, "loss": 4.6525, "step": 352 }, { "epoch": 0.016847430528212286, "grad_norm": 5.284352779388428, "learning_rate": 0.00019998695672853546, "loss": 7.4143, "step": 353 }, { "epoch": 0.01689515696030354, "grad_norm": 5.172549724578857, "learning_rate": 0.00019998687584199342, "loss": 7.0653, "step": 354 }, { "epoch": 0.016942883392394793, "grad_norm": 4.566983699798584, "learning_rate": 0.00019998679470543764, "loss": 7.1398, "step": 355 }, { "epoch": 0.016990609824486046, "grad_norm": 6.174628734588623, "learning_rate": 0.0001999867133188683, "loss": 6.8418, "step": 356 }, { "epoch": 0.0170383362565773, "grad_norm": 6.573761463165283, "learning_rate": 0.0001999866316822856, "loss": 7.3079, "step": 357 }, { "epoch": 0.017086062688668552, "grad_norm": 4.254095554351807, "learning_rate": 0.00019998654979568975, "loss": 5.5805, "step": 358 }, { "epoch": 0.017133789120759805, "grad_norm": 5.071292400360107, "learning_rate": 0.00019998646765908093, "loss": 7.3112, "step": 359 }, { "epoch": 0.017181515552851058, "grad_norm": 6.035824298858643, "learning_rate": 0.00019998638527245937, "loss": 7.6335, "step": 360 }, { "epoch": 0.01722924198494231, "grad_norm": 4.634344577789307, "learning_rate": 0.00019998630263582526, "loss": 6.9934, "step": 361 }, { "epoch": 0.017276968417033564, "grad_norm": 4.159761428833008, "learning_rate": 0.00019998621974917885, "loss": 5.933, "step": 362 }, { "epoch": 0.017324694849124817, "grad_norm": 5.788492679595947, "learning_rate": 0.0001999861366125203, "loss": 9.5469, "step": 363 }, { "epoch": 0.01737242128121607, "grad_norm": 5.0350542068481445, "learning_rate": 0.0001999860532258498, "loss": 7.0258, "step": 364 }, { "epoch": 0.017420147713307323, "grad_norm": 3.9429879188537598, "learning_rate": 0.00019998596958916763, "loss": 6.1842, "step": 365 }, { "epoch": 0.017467874145398576, "grad_norm": 4.744482517242432, "learning_rate": 0.00019998588570247395, "loss": 6.4803, "step": 366 }, { "epoch": 0.01751560057748983, "grad_norm": 3.6584131717681885, "learning_rate": 0.00019998580156576894, "loss": 4.6629, "step": 367 }, { "epoch": 0.017563327009581083, "grad_norm": 4.291102409362793, "learning_rate": 0.00019998571717905287, "loss": 6.2968, "step": 368 }, { "epoch": 0.017611053441672336, "grad_norm": 4.7433247566223145, "learning_rate": 0.00019998563254232594, "loss": 5.3619, "step": 369 }, { "epoch": 0.01765877987376359, "grad_norm": 7.7083048820495605, "learning_rate": 0.00019998554765558835, "loss": 7.1767, "step": 370 }, { "epoch": 0.017706506305854842, "grad_norm": 6.283130168914795, "learning_rate": 0.00019998546251884033, "loss": 5.8467, "step": 371 }, { "epoch": 0.01775423273794609, "grad_norm": 5.176546573638916, "learning_rate": 0.00019998537713208206, "loss": 7.7189, "step": 372 }, { "epoch": 0.017801959170037344, "grad_norm": 4.718236446380615, "learning_rate": 0.00019998529149531374, "loss": 7.1753, "step": 373 }, { "epoch": 0.017849685602128598, "grad_norm": 4.217880725860596, "learning_rate": 0.00019998520560853567, "loss": 6.6313, "step": 374 }, { "epoch": 0.01789741203421985, "grad_norm": 4.156486511230469, "learning_rate": 0.00019998511947174797, "loss": 6.8616, "step": 375 }, { "epoch": 0.017945138466311104, "grad_norm": 5.970060348510742, "learning_rate": 0.00019998503308495092, "loss": 8.0937, "step": 376 }, { "epoch": 0.017992864898402357, "grad_norm": 5.400389671325684, "learning_rate": 0.00019998494644814468, "loss": 7.7833, "step": 377 }, { "epoch": 0.01804059133049361, "grad_norm": 6.085134983062744, "learning_rate": 0.00019998485956132953, "loss": 8.5722, "step": 378 }, { "epoch": 0.018088317762584863, "grad_norm": 4.581125736236572, "learning_rate": 0.00019998477242450565, "loss": 4.9362, "step": 379 }, { "epoch": 0.018136044194676116, "grad_norm": 4.854187965393066, "learning_rate": 0.00019998468503767324, "loss": 5.7646, "step": 380 }, { "epoch": 0.01818377062676737, "grad_norm": 5.279097080230713, "learning_rate": 0.00019998459740083255, "loss": 5.9699, "step": 381 }, { "epoch": 0.018231497058858622, "grad_norm": 6.0616655349731445, "learning_rate": 0.0001999845095139838, "loss": 6.7272, "step": 382 }, { "epoch": 0.018279223490949875, "grad_norm": 4.58502197265625, "learning_rate": 0.00019998442137712718, "loss": 5.8863, "step": 383 }, { "epoch": 0.01832694992304113, "grad_norm": 5.454066753387451, "learning_rate": 0.00019998433299026297, "loss": 7.9997, "step": 384 }, { "epoch": 0.01837467635513238, "grad_norm": 5.5297017097473145, "learning_rate": 0.00019998424435339128, "loss": 9.2768, "step": 385 }, { "epoch": 0.018422402787223634, "grad_norm": 6.259234428405762, "learning_rate": 0.00019998415546651247, "loss": 7.4493, "step": 386 }, { "epoch": 0.018470129219314887, "grad_norm": 7.039416790008545, "learning_rate": 0.00019998406632962666, "loss": 9.3582, "step": 387 }, { "epoch": 0.01851785565140614, "grad_norm": 5.677418231964111, "learning_rate": 0.0001999839769427341, "loss": 6.5286, "step": 388 }, { "epoch": 0.018565582083497394, "grad_norm": 5.97587776184082, "learning_rate": 0.00019998388730583505, "loss": 7.9887, "step": 389 }, { "epoch": 0.018613308515588647, "grad_norm": 20.873300552368164, "learning_rate": 0.00019998379741892973, "loss": 8.7424, "step": 390 }, { "epoch": 0.0186610349476799, "grad_norm": 5.740731239318848, "learning_rate": 0.00019998370728201828, "loss": 8.7783, "step": 391 }, { "epoch": 0.018708761379771153, "grad_norm": 6.921543121337891, "learning_rate": 0.00019998361689510104, "loss": 7.8717, "step": 392 }, { "epoch": 0.018756487811862406, "grad_norm": 4.221953868865967, "learning_rate": 0.00019998352625817813, "loss": 6.5247, "step": 393 }, { "epoch": 0.01880421424395366, "grad_norm": 4.8449249267578125, "learning_rate": 0.00019998343537124986, "loss": 6.8002, "step": 394 }, { "epoch": 0.018851940676044912, "grad_norm": 4.915500164031982, "learning_rate": 0.0001999833442343164, "loss": 6.2396, "step": 395 }, { "epoch": 0.018899667108136165, "grad_norm": 3.3595244884490967, "learning_rate": 0.00019998325284737806, "loss": 4.481, "step": 396 }, { "epoch": 0.018947393540227415, "grad_norm": 4.997402667999268, "learning_rate": 0.00019998316121043493, "loss": 5.8485, "step": 397 }, { "epoch": 0.018995119972318668, "grad_norm": 5.331072807312012, "learning_rate": 0.0001999830693234874, "loss": 6.5482, "step": 398 }, { "epoch": 0.01904284640440992, "grad_norm": 5.49896764755249, "learning_rate": 0.00019998297718653557, "loss": 7.7411, "step": 399 }, { "epoch": 0.019090572836501174, "grad_norm": 5.086883068084717, "learning_rate": 0.0001999828847995797, "loss": 6.3793, "step": 400 }, { "epoch": 0.019090572836501174, "eval_loss": 1.665988564491272, "eval_runtime": 96.4558, "eval_samples_per_second": 8.74, "eval_steps_per_second": 4.375, "step": 400 }, { "epoch": 0.019138299268592427, "grad_norm": 4.632482528686523, "learning_rate": 0.0001999827921626201, "loss": 7.3474, "step": 401 }, { "epoch": 0.01918602570068368, "grad_norm": 4.1303629875183105, "learning_rate": 0.00019998269927565693, "loss": 5.0883, "step": 402 }, { "epoch": 0.019233752132774933, "grad_norm": 5.4048871994018555, "learning_rate": 0.00019998260613869043, "loss": 7.5351, "step": 403 }, { "epoch": 0.019281478564866186, "grad_norm": 4.152419567108154, "learning_rate": 0.00019998251275172085, "loss": 5.6643, "step": 404 }, { "epoch": 0.01932920499695744, "grad_norm": 4.1763763427734375, "learning_rate": 0.0001999824191147484, "loss": 5.9305, "step": 405 }, { "epoch": 0.019376931429048692, "grad_norm": 4.289309978485107, "learning_rate": 0.0001999823252277733, "loss": 4.8992, "step": 406 }, { "epoch": 0.019424657861139945, "grad_norm": 4.502870559692383, "learning_rate": 0.00019998223109079586, "loss": 5.3549, "step": 407 }, { "epoch": 0.0194723842932312, "grad_norm": 5.1425652503967285, "learning_rate": 0.00019998213670381626, "loss": 7.7652, "step": 408 }, { "epoch": 0.01952011072532245, "grad_norm": 6.1564531326293945, "learning_rate": 0.0001999820420668347, "loss": 7.4905, "step": 409 }, { "epoch": 0.019567837157413705, "grad_norm": 5.1370697021484375, "learning_rate": 0.00019998194717985151, "loss": 7.4923, "step": 410 }, { "epoch": 0.019615563589504958, "grad_norm": 4.588619709014893, "learning_rate": 0.00019998185204286683, "loss": 5.8113, "step": 411 }, { "epoch": 0.01966329002159621, "grad_norm": 6.019281387329102, "learning_rate": 0.000199981756655881, "loss": 9.552, "step": 412 }, { "epoch": 0.019711016453687464, "grad_norm": 4.031830310821533, "learning_rate": 0.00019998166101889414, "loss": 4.7074, "step": 413 }, { "epoch": 0.019758742885778717, "grad_norm": 7.4313459396362305, "learning_rate": 0.00019998156513190658, "loss": 7.329, "step": 414 }, { "epoch": 0.01980646931786997, "grad_norm": 4.952702522277832, "learning_rate": 0.00019998146899491857, "loss": 7.7571, "step": 415 }, { "epoch": 0.019854195749961223, "grad_norm": 4.877037048339844, "learning_rate": 0.00019998137260793025, "loss": 7.4194, "step": 416 }, { "epoch": 0.019901922182052476, "grad_norm": 5.064944744110107, "learning_rate": 0.00019998127597094196, "loss": 6.7546, "step": 417 }, { "epoch": 0.01994964861414373, "grad_norm": 8.260854721069336, "learning_rate": 0.00019998117908395392, "loss": 8.0974, "step": 418 }, { "epoch": 0.019997375046234982, "grad_norm": 6.553653240203857, "learning_rate": 0.0001999810819469663, "loss": 9.2404, "step": 419 }, { "epoch": 0.020045101478326235, "grad_norm": 4.128756999969482, "learning_rate": 0.00019998098455997944, "loss": 6.5102, "step": 420 }, { "epoch": 0.02009282791041749, "grad_norm": 7.040201663970947, "learning_rate": 0.00019998088692299354, "loss": 6.4265, "step": 421 }, { "epoch": 0.02014055434250874, "grad_norm": 5.790428161621094, "learning_rate": 0.00019998078903600886, "loss": 7.5237, "step": 422 }, { "epoch": 0.02018828077459999, "grad_norm": 4.903696537017822, "learning_rate": 0.0001999806908990256, "loss": 6.8793, "step": 423 }, { "epoch": 0.020236007206691244, "grad_norm": 5.0177483558654785, "learning_rate": 0.00019998059251204406, "loss": 7.4763, "step": 424 }, { "epoch": 0.020283733638782497, "grad_norm": 5.022401809692383, "learning_rate": 0.00019998049387506445, "loss": 8.9361, "step": 425 }, { "epoch": 0.02033146007087375, "grad_norm": 5.030542373657227, "learning_rate": 0.00019998039498808704, "loss": 6.4885, "step": 426 }, { "epoch": 0.020379186502965004, "grad_norm": 4.300105094909668, "learning_rate": 0.00019998029585111207, "loss": 6.0779, "step": 427 }, { "epoch": 0.020426912935056257, "grad_norm": 5.635427951812744, "learning_rate": 0.00019998019646413982, "loss": 9.2002, "step": 428 }, { "epoch": 0.02047463936714751, "grad_norm": 4.9848504066467285, "learning_rate": 0.00019998009682717046, "loss": 7.4362, "step": 429 }, { "epoch": 0.020522365799238763, "grad_norm": 5.126598834991455, "learning_rate": 0.00019997999694020428, "loss": 5.8864, "step": 430 }, { "epoch": 0.020570092231330016, "grad_norm": 5.999664783477783, "learning_rate": 0.00019997989680324154, "loss": 6.3613, "step": 431 }, { "epoch": 0.02061781866342127, "grad_norm": 4.934080600738525, "learning_rate": 0.0001999797964162825, "loss": 6.5829, "step": 432 }, { "epoch": 0.020665545095512522, "grad_norm": 4.960004806518555, "learning_rate": 0.00019997969577932741, "loss": 5.974, "step": 433 }, { "epoch": 0.020713271527603775, "grad_norm": 3.4501640796661377, "learning_rate": 0.00019997959489237647, "loss": 5.0535, "step": 434 }, { "epoch": 0.020760997959695028, "grad_norm": 5.115245342254639, "learning_rate": 0.00019997949375542997, "loss": 7.8514, "step": 435 }, { "epoch": 0.02080872439178628, "grad_norm": 4.631134033203125, "learning_rate": 0.00019997939236848818, "loss": 6.3987, "step": 436 }, { "epoch": 0.020856450823877534, "grad_norm": 5.120911598205566, "learning_rate": 0.00019997929073155133, "loss": 7.2442, "step": 437 }, { "epoch": 0.020904177255968787, "grad_norm": 4.703468322753906, "learning_rate": 0.00019997918884461966, "loss": 6.2793, "step": 438 }, { "epoch": 0.02095190368806004, "grad_norm": 4.391363143920898, "learning_rate": 0.00019997908670769347, "loss": 6.1586, "step": 439 }, { "epoch": 0.020999630120151293, "grad_norm": 6.092804908752441, "learning_rate": 0.000199978984320773, "loss": 7.2347, "step": 440 }, { "epoch": 0.021047356552242547, "grad_norm": 3.80682110786438, "learning_rate": 0.00019997888168385845, "loss": 4.6474, "step": 441 }, { "epoch": 0.0210950829843338, "grad_norm": 5.044892311096191, "learning_rate": 0.00019997877879695012, "loss": 7.3003, "step": 442 }, { "epoch": 0.021142809416425053, "grad_norm": 5.314664363861084, "learning_rate": 0.0001999786756600483, "loss": 5.7482, "step": 443 }, { "epoch": 0.021190535848516306, "grad_norm": 5.538875579833984, "learning_rate": 0.00019997857227315322, "loss": 7.7144, "step": 444 }, { "epoch": 0.02123826228060756, "grad_norm": 3.887810707092285, "learning_rate": 0.00019997846863626512, "loss": 4.8105, "step": 445 }, { "epoch": 0.021285988712698812, "grad_norm": 7.227574825286865, "learning_rate": 0.0001999783647493843, "loss": 9.3088, "step": 446 }, { "epoch": 0.021333715144790065, "grad_norm": 5.310603618621826, "learning_rate": 0.00019997826061251097, "loss": 6.6601, "step": 447 }, { "epoch": 0.021381441576881315, "grad_norm": 4.4993414878845215, "learning_rate": 0.00019997815622564542, "loss": 6.5389, "step": 448 }, { "epoch": 0.021429168008972568, "grad_norm": 4.070158958435059, "learning_rate": 0.00019997805158878792, "loss": 5.4385, "step": 449 }, { "epoch": 0.02147689444106382, "grad_norm": 4.426395893096924, "learning_rate": 0.0001999779467019387, "loss": 6.8071, "step": 450 }, { "epoch": 0.021524620873155074, "grad_norm": 5.009427547454834, "learning_rate": 0.00019997784156509804, "loss": 6.598, "step": 451 }, { "epoch": 0.021572347305246327, "grad_norm": 3.518096685409546, "learning_rate": 0.00019997773617826622, "loss": 5.3972, "step": 452 }, { "epoch": 0.02162007373733758, "grad_norm": 5.800788402557373, "learning_rate": 0.00019997763054144344, "loss": 8.1586, "step": 453 }, { "epoch": 0.021667800169428833, "grad_norm": 5.225189685821533, "learning_rate": 0.00019997752465463004, "loss": 8.0787, "step": 454 }, { "epoch": 0.021715526601520086, "grad_norm": 4.707450866699219, "learning_rate": 0.00019997741851782627, "loss": 7.1508, "step": 455 }, { "epoch": 0.02176325303361134, "grad_norm": 5.306134223937988, "learning_rate": 0.0001999773121310324, "loss": 8.0307, "step": 456 }, { "epoch": 0.021810979465702592, "grad_norm": 5.296722888946533, "learning_rate": 0.00019997720549424862, "loss": 5.7943, "step": 457 }, { "epoch": 0.021858705897793845, "grad_norm": 4.953795909881592, "learning_rate": 0.0001999770986074753, "loss": 7.4629, "step": 458 }, { "epoch": 0.0219064323298851, "grad_norm": 3.7820422649383545, "learning_rate": 0.00019997699147071262, "loss": 6.0438, "step": 459 }, { "epoch": 0.02195415876197635, "grad_norm": 4.647518157958984, "learning_rate": 0.00019997688408396092, "loss": 7.0204, "step": 460 }, { "epoch": 0.022001885194067605, "grad_norm": 3.9241981506347656, "learning_rate": 0.0001999767764472204, "loss": 6.2746, "step": 461 }, { "epoch": 0.022049611626158858, "grad_norm": 6.767636775970459, "learning_rate": 0.0001999766685604914, "loss": 9.0934, "step": 462 }, { "epoch": 0.02209733805825011, "grad_norm": 5.826857566833496, "learning_rate": 0.00019997656042377416, "loss": 7.2791, "step": 463 }, { "epoch": 0.022145064490341364, "grad_norm": 6.055583953857422, "learning_rate": 0.00019997645203706892, "loss": 8.3504, "step": 464 }, { "epoch": 0.022192790922432617, "grad_norm": 4.4791669845581055, "learning_rate": 0.000199976343400376, "loss": 6.8317, "step": 465 }, { "epoch": 0.02224051735452387, "grad_norm": 4.7874298095703125, "learning_rate": 0.00019997623451369564, "loss": 6.4061, "step": 466 }, { "epoch": 0.022288243786615123, "grad_norm": 4.830522537231445, "learning_rate": 0.00019997612537702813, "loss": 6.1932, "step": 467 }, { "epoch": 0.022335970218706376, "grad_norm": 10.66057300567627, "learning_rate": 0.0001999760159903737, "loss": 6.3374, "step": 468 }, { "epoch": 0.02238369665079763, "grad_norm": 5.479240894317627, "learning_rate": 0.0001999759063537327, "loss": 7.609, "step": 469 }, { "epoch": 0.022431423082888882, "grad_norm": 5.752477645874023, "learning_rate": 0.00019997579646710532, "loss": 7.4818, "step": 470 }, { "epoch": 0.022479149514980135, "grad_norm": 5.4071364402771, "learning_rate": 0.00019997568633049192, "loss": 8.3156, "step": 471 }, { "epoch": 0.02252687594707139, "grad_norm": 3.381730556488037, "learning_rate": 0.0001999755759438927, "loss": 5.8338, "step": 472 }, { "epoch": 0.02257460237916264, "grad_norm": 5.019676208496094, "learning_rate": 0.00019997546530730798, "loss": 7.5769, "step": 473 }, { "epoch": 0.02262232881125389, "grad_norm": 4.502069473266602, "learning_rate": 0.00019997535442073803, "loss": 6.1705, "step": 474 }, { "epoch": 0.022670055243345144, "grad_norm": 4.003905296325684, "learning_rate": 0.00019997524328418308, "loss": 6.4133, "step": 475 }, { "epoch": 0.022717781675436397, "grad_norm": 4.445511817932129, "learning_rate": 0.0001999751318976435, "loss": 6.4901, "step": 476 }, { "epoch": 0.02276550810752765, "grad_norm": 5.4654059410095215, "learning_rate": 0.00019997502026111952, "loss": 7.7037, "step": 477 }, { "epoch": 0.022813234539618903, "grad_norm": 5.104085445404053, "learning_rate": 0.0001999749083746114, "loss": 5.0008, "step": 478 }, { "epoch": 0.022860960971710156, "grad_norm": 4.8859357833862305, "learning_rate": 0.00019997479623811942, "loss": 6.8619, "step": 479 }, { "epoch": 0.02290868740380141, "grad_norm": 4.528769493103027, "learning_rate": 0.0001999746838516439, "loss": 5.5094, "step": 480 }, { "epoch": 0.022956413835892663, "grad_norm": 146.96981811523438, "learning_rate": 0.00019997457121518508, "loss": 9.347, "step": 481 }, { "epoch": 0.023004140267983916, "grad_norm": 5.23137903213501, "learning_rate": 0.0001999744583287433, "loss": 7.1352, "step": 482 }, { "epoch": 0.02305186670007517, "grad_norm": 6.624316692352295, "learning_rate": 0.00019997434519231875, "loss": 10.408, "step": 483 }, { "epoch": 0.023099593132166422, "grad_norm": 4.508331775665283, "learning_rate": 0.00019997423180591182, "loss": 5.9288, "step": 484 }, { "epoch": 0.023147319564257675, "grad_norm": 4.060499668121338, "learning_rate": 0.00019997411816952268, "loss": 6.0798, "step": 485 }, { "epoch": 0.023195045996348928, "grad_norm": 4.18237829208374, "learning_rate": 0.00019997400428315172, "loss": 5.4661, "step": 486 }, { "epoch": 0.02324277242844018, "grad_norm": 4.785374641418457, "learning_rate": 0.00019997389014679914, "loss": 6.6623, "step": 487 }, { "epoch": 0.023290498860531434, "grad_norm": 4.686644554138184, "learning_rate": 0.0001999737757604653, "loss": 5.8418, "step": 488 }, { "epoch": 0.023338225292622687, "grad_norm": 5.85218620300293, "learning_rate": 0.00019997366112415042, "loss": 7.4947, "step": 489 }, { "epoch": 0.02338595172471394, "grad_norm": 4.505075931549072, "learning_rate": 0.00019997354623785483, "loss": 6.1, "step": 490 }, { "epoch": 0.023433678156805193, "grad_norm": 4.551297664642334, "learning_rate": 0.0001999734311015788, "loss": 6.1406, "step": 491 }, { "epoch": 0.023481404588896446, "grad_norm": 5.397796630859375, "learning_rate": 0.0001999733157153226, "loss": 6.647, "step": 492 }, { "epoch": 0.0235291310209877, "grad_norm": 5.0919389724731445, "learning_rate": 0.0001999732000790866, "loss": 7.104, "step": 493 }, { "epoch": 0.023576857453078953, "grad_norm": 4.778090476989746, "learning_rate": 0.00019997308419287098, "loss": 5.9611, "step": 494 }, { "epoch": 0.023624583885170206, "grad_norm": 5.284306049346924, "learning_rate": 0.00019997296805667606, "loss": 7.7483, "step": 495 }, { "epoch": 0.02367231031726146, "grad_norm": 5.61501407623291, "learning_rate": 0.0001999728516705022, "loss": 7.5037, "step": 496 }, { "epoch": 0.02372003674935271, "grad_norm": 3.628138780593872, "learning_rate": 0.0001999727350343496, "loss": 4.2402, "step": 497 }, { "epoch": 0.023767763181443965, "grad_norm": 4.0569610595703125, "learning_rate": 0.00019997261814821862, "loss": 5.6909, "step": 498 }, { "epoch": 0.023815489613535218, "grad_norm": 6.206594944000244, "learning_rate": 0.00019997250101210951, "loss": 7.2116, "step": 499 }, { "epoch": 0.023863216045626467, "grad_norm": 4.913888931274414, "learning_rate": 0.00019997238362602258, "loss": 7.5945, "step": 500 }, { "epoch": 0.02391094247771772, "grad_norm": 4.092263698577881, "learning_rate": 0.00019997226598995812, "loss": 5.8894, "step": 501 }, { "epoch": 0.023958668909808974, "grad_norm": 5.271243572235107, "learning_rate": 0.00019997214810391642, "loss": 6.3197, "step": 502 }, { "epoch": 0.024006395341900227, "grad_norm": 4.649320125579834, "learning_rate": 0.00019997202996789777, "loss": 6.1021, "step": 503 }, { "epoch": 0.02405412177399148, "grad_norm": 4.871811866760254, "learning_rate": 0.0001999719115819025, "loss": 6.8396, "step": 504 }, { "epoch": 0.024101848206082733, "grad_norm": 4.113831520080566, "learning_rate": 0.00019997179294593088, "loss": 5.3965, "step": 505 }, { "epoch": 0.024149574638173986, "grad_norm": 7.8706464767456055, "learning_rate": 0.00019997167405998316, "loss": 7.6885, "step": 506 }, { "epoch": 0.02419730107026524, "grad_norm": 5.635750770568848, "learning_rate": 0.00019997155492405973, "loss": 7.599, "step": 507 }, { "epoch": 0.024245027502356492, "grad_norm": 4.669138431549072, "learning_rate": 0.00019997143553816083, "loss": 7.1527, "step": 508 }, { "epoch": 0.024292753934447745, "grad_norm": 5.8009934425354, "learning_rate": 0.00019997131590228677, "loss": 7.3978, "step": 509 }, { "epoch": 0.024340480366538998, "grad_norm": 5.105638027191162, "learning_rate": 0.00019997119601643782, "loss": 6.5293, "step": 510 }, { "epoch": 0.02438820679863025, "grad_norm": 3.7635374069213867, "learning_rate": 0.00019997107588061436, "loss": 5.0004, "step": 511 }, { "epoch": 0.024435933230721504, "grad_norm": 5.513189792633057, "learning_rate": 0.00019997095549481665, "loss": 6.0412, "step": 512 }, { "epoch": 0.024483659662812757, "grad_norm": 5.24607515335083, "learning_rate": 0.00019997083485904493, "loss": 5.6153, "step": 513 }, { "epoch": 0.02453138609490401, "grad_norm": 5.752498626708984, "learning_rate": 0.0001999707139732996, "loss": 8.8367, "step": 514 }, { "epoch": 0.024579112526995264, "grad_norm": 4.793280601501465, "learning_rate": 0.00019997059283758086, "loss": 6.8156, "step": 515 }, { "epoch": 0.024626838959086517, "grad_norm": 5.600396156311035, "learning_rate": 0.0001999704714518891, "loss": 6.5203, "step": 516 }, { "epoch": 0.02467456539117777, "grad_norm": 5.5849080085754395, "learning_rate": 0.0001999703498162246, "loss": 6.501, "step": 517 }, { "epoch": 0.024722291823269023, "grad_norm": 5.489221096038818, "learning_rate": 0.00019997022793058765, "loss": 8.4928, "step": 518 }, { "epoch": 0.024770018255360276, "grad_norm": 4.656881332397461, "learning_rate": 0.00019997010579497854, "loss": 6.5956, "step": 519 }, { "epoch": 0.02481774468745153, "grad_norm": 5.225973606109619, "learning_rate": 0.0001999699834093976, "loss": 6.2432, "step": 520 }, { "epoch": 0.024865471119542782, "grad_norm": 4.841419696807861, "learning_rate": 0.00019996986077384514, "loss": 6.3632, "step": 521 }, { "epoch": 0.024913197551634035, "grad_norm": 4.086205959320068, "learning_rate": 0.00019996973788832144, "loss": 6.4561, "step": 522 }, { "epoch": 0.024960923983725288, "grad_norm": 4.372692108154297, "learning_rate": 0.00019996961475282686, "loss": 6.1433, "step": 523 }, { "epoch": 0.02500865041581654, "grad_norm": 4.497690677642822, "learning_rate": 0.00019996949136736168, "loss": 6.7989, "step": 524 }, { "epoch": 0.02505637684790779, "grad_norm": 5.1385931968688965, "learning_rate": 0.0001999693677319262, "loss": 6.9944, "step": 525 }, { "epoch": 0.025104103279999044, "grad_norm": 4.803904056549072, "learning_rate": 0.0001999692438465207, "loss": 5.1257, "step": 526 }, { "epoch": 0.025151829712090297, "grad_norm": 5.663443088531494, "learning_rate": 0.00019996911971114552, "loss": 5.7775, "step": 527 }, { "epoch": 0.02519955614418155, "grad_norm": 6.631006240844727, "learning_rate": 0.00019996899532580097, "loss": 8.0359, "step": 528 }, { "epoch": 0.025247282576272803, "grad_norm": 5.446874618530273, "learning_rate": 0.00019996887069048737, "loss": 6.8043, "step": 529 }, { "epoch": 0.025295009008364056, "grad_norm": 5.09302282333374, "learning_rate": 0.00019996874580520505, "loss": 7.806, "step": 530 }, { "epoch": 0.02534273544045531, "grad_norm": 4.973886013031006, "learning_rate": 0.00019996862066995428, "loss": 6.8326, "step": 531 }, { "epoch": 0.025390461872546562, "grad_norm": 4.286750316619873, "learning_rate": 0.0001999684952847354, "loss": 6.4884, "step": 532 }, { "epoch": 0.025438188304637815, "grad_norm": 4.771728038787842, "learning_rate": 0.00019996836964954868, "loss": 6.2068, "step": 533 }, { "epoch": 0.02548591473672907, "grad_norm": 5.859272003173828, "learning_rate": 0.00019996824376439446, "loss": 6.1175, "step": 534 }, { "epoch": 0.02553364116882032, "grad_norm": 3.806405782699585, "learning_rate": 0.0001999681176292731, "loss": 5.7413, "step": 535 }, { "epoch": 0.025581367600911575, "grad_norm": 5.5705108642578125, "learning_rate": 0.00019996799124418488, "loss": 5.7925, "step": 536 }, { "epoch": 0.025629094033002828, "grad_norm": 4.552614688873291, "learning_rate": 0.0001999678646091301, "loss": 7.3086, "step": 537 }, { "epoch": 0.02567682046509408, "grad_norm": 4.104206562042236, "learning_rate": 0.00019996773772410907, "loss": 6.6072, "step": 538 }, { "epoch": 0.025724546897185334, "grad_norm": 4.340860843658447, "learning_rate": 0.00019996761058912216, "loss": 6.6063, "step": 539 }, { "epoch": 0.025772273329276587, "grad_norm": 4.34968900680542, "learning_rate": 0.00019996748320416963, "loss": 5.9908, "step": 540 }, { "epoch": 0.02581999976136784, "grad_norm": 4.876851558685303, "learning_rate": 0.00019996735556925184, "loss": 6.4033, "step": 541 }, { "epoch": 0.025867726193459093, "grad_norm": 4.923406600952148, "learning_rate": 0.00019996722768436908, "loss": 6.6859, "step": 542 }, { "epoch": 0.025915452625550346, "grad_norm": 4.746583938598633, "learning_rate": 0.00019996709954952168, "loss": 6.2897, "step": 543 }, { "epoch": 0.0259631790576416, "grad_norm": 4.503627300262451, "learning_rate": 0.00019996697116471, "loss": 6.0742, "step": 544 }, { "epoch": 0.026010905489732852, "grad_norm": 4.217231750488281, "learning_rate": 0.00019996684252993426, "loss": 5.3997, "step": 545 }, { "epoch": 0.026058631921824105, "grad_norm": 4.351018905639648, "learning_rate": 0.0001999667136451949, "loss": 6.0516, "step": 546 }, { "epoch": 0.02610635835391536, "grad_norm": 5.6705708503723145, "learning_rate": 0.00019996658451049216, "loss": 7.6904, "step": 547 }, { "epoch": 0.02615408478600661, "grad_norm": 4.298531532287598, "learning_rate": 0.00019996645512582642, "loss": 5.7593, "step": 548 }, { "epoch": 0.026201811218097865, "grad_norm": 6.38975715637207, "learning_rate": 0.00019996632549119795, "loss": 5.3676, "step": 549 }, { "epoch": 0.026249537650189118, "grad_norm": 4.405651569366455, "learning_rate": 0.0001999661956066071, "loss": 5.1219, "step": 550 }, { "epoch": 0.026297264082280367, "grad_norm": 5.271700382232666, "learning_rate": 0.0001999660654720542, "loss": 6.8602, "step": 551 }, { "epoch": 0.02634499051437162, "grad_norm": 5.63363790512085, "learning_rate": 0.00019996593508753955, "loss": 8.6695, "step": 552 }, { "epoch": 0.026392716946462873, "grad_norm": 5.4059295654296875, "learning_rate": 0.0001999658044530635, "loss": 7.9904, "step": 553 }, { "epoch": 0.026440443378554127, "grad_norm": 4.494891166687012, "learning_rate": 0.00019996567356862636, "loss": 5.1617, "step": 554 }, { "epoch": 0.02648816981064538, "grad_norm": 5.45679235458374, "learning_rate": 0.0001999655424342285, "loss": 6.1232, "step": 555 }, { "epoch": 0.026535896242736633, "grad_norm": 5.063662528991699, "learning_rate": 0.0001999654110498702, "loss": 6.7984, "step": 556 }, { "epoch": 0.026583622674827886, "grad_norm": 4.864143371582031, "learning_rate": 0.0001999652794155518, "loss": 6.4253, "step": 557 }, { "epoch": 0.02663134910691914, "grad_norm": 5.1836724281311035, "learning_rate": 0.00019996514753127366, "loss": 6.4102, "step": 558 }, { "epoch": 0.026679075539010392, "grad_norm": 4.303294658660889, "learning_rate": 0.00019996501539703605, "loss": 5.2869, "step": 559 }, { "epoch": 0.026726801971101645, "grad_norm": 4.715828895568848, "learning_rate": 0.00019996488301283932, "loss": 6.1933, "step": 560 }, { "epoch": 0.026774528403192898, "grad_norm": 4.3456196784973145, "learning_rate": 0.00019996475037868387, "loss": 5.3839, "step": 561 }, { "epoch": 0.02682225483528415, "grad_norm": 5.75535774230957, "learning_rate": 0.00019996461749456997, "loss": 7.2851, "step": 562 }, { "epoch": 0.026869981267375404, "grad_norm": 3.417879343032837, "learning_rate": 0.00019996448436049793, "loss": 5.374, "step": 563 }, { "epoch": 0.026917707699466657, "grad_norm": 3.585200309753418, "learning_rate": 0.00019996435097646812, "loss": 4.1998, "step": 564 }, { "epoch": 0.02696543413155791, "grad_norm": 3.3503780364990234, "learning_rate": 0.00019996421734248086, "loss": 3.9994, "step": 565 }, { "epoch": 0.027013160563649163, "grad_norm": 5.972406387329102, "learning_rate": 0.00019996408345853648, "loss": 8.5356, "step": 566 }, { "epoch": 0.027060886995740416, "grad_norm": 4.54366397857666, "learning_rate": 0.00019996394932463534, "loss": 6.4764, "step": 567 }, { "epoch": 0.02710861342783167, "grad_norm": 3.9604554176330566, "learning_rate": 0.00019996381494077778, "loss": 5.0276, "step": 568 }, { "epoch": 0.027156339859922923, "grad_norm": 5.076655864715576, "learning_rate": 0.00019996368030696408, "loss": 6.8535, "step": 569 }, { "epoch": 0.027204066292014176, "grad_norm": 3.42767071723938, "learning_rate": 0.00019996354542319464, "loss": 4.5279, "step": 570 }, { "epoch": 0.02725179272410543, "grad_norm": 5.719819068908691, "learning_rate": 0.00019996341028946977, "loss": 5.8306, "step": 571 }, { "epoch": 0.027299519156196682, "grad_norm": 4.957878589630127, "learning_rate": 0.00019996327490578978, "loss": 6.6808, "step": 572 }, { "epoch": 0.027347245588287935, "grad_norm": 4.991243362426758, "learning_rate": 0.00019996313927215503, "loss": 6.0187, "step": 573 }, { "epoch": 0.027394972020379188, "grad_norm": 4.724966526031494, "learning_rate": 0.0001999630033885659, "loss": 6.0801, "step": 574 }, { "epoch": 0.02744269845247044, "grad_norm": 5.420577526092529, "learning_rate": 0.0001999628672550227, "loss": 6.5073, "step": 575 }, { "epoch": 0.02749042488456169, "grad_norm": 6.489767074584961, "learning_rate": 0.00019996273087152572, "loss": 7.7804, "step": 576 }, { "epoch": 0.027538151316652944, "grad_norm": 5.218099117279053, "learning_rate": 0.0001999625942380754, "loss": 5.6838, "step": 577 }, { "epoch": 0.027585877748744197, "grad_norm": 3.9558331966400146, "learning_rate": 0.000199962457354672, "loss": 6.1443, "step": 578 }, { "epoch": 0.02763360418083545, "grad_norm": 4.843764781951904, "learning_rate": 0.0001999623202213159, "loss": 7.0784, "step": 579 }, { "epoch": 0.027681330612926703, "grad_norm": 6.000890254974365, "learning_rate": 0.0001999621828380074, "loss": 8.8311, "step": 580 }, { "epoch": 0.027729057045017956, "grad_norm": 7.207266807556152, "learning_rate": 0.0001999620452047469, "loss": 7.2073, "step": 581 }, { "epoch": 0.02777678347710921, "grad_norm": 4.971728801727295, "learning_rate": 0.00019996190732153474, "loss": 8.4234, "step": 582 }, { "epoch": 0.027824509909200462, "grad_norm": 4.149250030517578, "learning_rate": 0.00019996176918837125, "loss": 5.3555, "step": 583 }, { "epoch": 0.027872236341291715, "grad_norm": 4.7893829345703125, "learning_rate": 0.0001999616308052567, "loss": 5.8994, "step": 584 }, { "epoch": 0.02791996277338297, "grad_norm": 5.384006023406982, "learning_rate": 0.00019996149217219158, "loss": 6.4517, "step": 585 }, { "epoch": 0.02796768920547422, "grad_norm": 4.54136323928833, "learning_rate": 0.00019996135328917614, "loss": 6.695, "step": 586 }, { "epoch": 0.028015415637565474, "grad_norm": 6.044737815856934, "learning_rate": 0.00019996121415621074, "loss": 6.5002, "step": 587 }, { "epoch": 0.028063142069656728, "grad_norm": 4.089444160461426, "learning_rate": 0.00019996107477329576, "loss": 5.1134, "step": 588 }, { "epoch": 0.02811086850174798, "grad_norm": 6.6287312507629395, "learning_rate": 0.00019996093514043154, "loss": 7.6076, "step": 589 }, { "epoch": 0.028158594933839234, "grad_norm": 5.980465412139893, "learning_rate": 0.0001999607952576184, "loss": 7.8736, "step": 590 }, { "epoch": 0.028206321365930487, "grad_norm": 5.567748546600342, "learning_rate": 0.00019996065512485668, "loss": 7.7547, "step": 591 }, { "epoch": 0.02825404779802174, "grad_norm": 5.045673847198486, "learning_rate": 0.00019996051474214678, "loss": 6.2608, "step": 592 }, { "epoch": 0.028301774230112993, "grad_norm": 4.954861640930176, "learning_rate": 0.00019996037410948902, "loss": 7.2595, "step": 593 }, { "epoch": 0.028349500662204246, "grad_norm": 7.233619689941406, "learning_rate": 0.00019996023322688377, "loss": 6.9448, "step": 594 }, { "epoch": 0.0283972270942955, "grad_norm": 4.880596160888672, "learning_rate": 0.00019996009209433138, "loss": 7.9134, "step": 595 }, { "epoch": 0.028444953526386752, "grad_norm": 6.014434814453125, "learning_rate": 0.0001999599507118322, "loss": 7.1068, "step": 596 }, { "epoch": 0.028492679958478005, "grad_norm": 6.972095012664795, "learning_rate": 0.0001999598090793865, "loss": 8.4543, "step": 597 }, { "epoch": 0.02854040639056926, "grad_norm": 7.001162528991699, "learning_rate": 0.00019995966719699482, "loss": 9.9176, "step": 598 }, { "epoch": 0.02858813282266051, "grad_norm": 4.132091045379639, "learning_rate": 0.00019995952506465735, "loss": 6.7161, "step": 599 }, { "epoch": 0.028635859254751764, "grad_norm": 6.05724573135376, "learning_rate": 0.00019995938268237452, "loss": 8.2906, "step": 600 }, { "epoch": 0.028635859254751764, "eval_loss": 1.6445622444152832, "eval_runtime": 96.5062, "eval_samples_per_second": 8.735, "eval_steps_per_second": 4.373, "step": 600 }, { "epoch": 0.028683585686843018, "grad_norm": 4.136114597320557, "learning_rate": 0.00019995924005014666, "loss": 6.7033, "step": 601 }, { "epoch": 0.028731312118934267, "grad_norm": 4.15371561050415, "learning_rate": 0.00019995909716797412, "loss": 6.3375, "step": 602 }, { "epoch": 0.02877903855102552, "grad_norm": 5.778688430786133, "learning_rate": 0.00019995895403585728, "loss": 6.9708, "step": 603 }, { "epoch": 0.028826764983116773, "grad_norm": 4.43817663192749, "learning_rate": 0.0001999588106537965, "loss": 5.7535, "step": 604 }, { "epoch": 0.028874491415208026, "grad_norm": 4.867424011230469, "learning_rate": 0.00019995866702179213, "loss": 6.9147, "step": 605 }, { "epoch": 0.02892221784729928, "grad_norm": 4.96653938293457, "learning_rate": 0.00019995852313984454, "loss": 6.6769, "step": 606 }, { "epoch": 0.028969944279390532, "grad_norm": 4.451474189758301, "learning_rate": 0.00019995837900795403, "loss": 5.7634, "step": 607 }, { "epoch": 0.029017670711481786, "grad_norm": 7.053049087524414, "learning_rate": 0.00019995823462612108, "loss": 6.8795, "step": 608 }, { "epoch": 0.02906539714357304, "grad_norm": 4.5043206214904785, "learning_rate": 0.00019995808999434593, "loss": 5.9499, "step": 609 }, { "epoch": 0.02911312357566429, "grad_norm": 5.097994804382324, "learning_rate": 0.000199957945112629, "loss": 6.8594, "step": 610 }, { "epoch": 0.029160850007755545, "grad_norm": 6.725499153137207, "learning_rate": 0.00019995779998097064, "loss": 9.5979, "step": 611 }, { "epoch": 0.029208576439846798, "grad_norm": 4.658621788024902, "learning_rate": 0.00019995765459937125, "loss": 5.9278, "step": 612 }, { "epoch": 0.02925630287193805, "grad_norm": 4.636320114135742, "learning_rate": 0.0001999575089678311, "loss": 6.0713, "step": 613 }, { "epoch": 0.029304029304029304, "grad_norm": 4.8657050132751465, "learning_rate": 0.00019995736308635065, "loss": 7.4898, "step": 614 }, { "epoch": 0.029351755736120557, "grad_norm": 4.6084089279174805, "learning_rate": 0.00019995721695493021, "loss": 5.9211, "step": 615 }, { "epoch": 0.02939948216821181, "grad_norm": 5.468587875366211, "learning_rate": 0.0001999570705735702, "loss": 6.9393, "step": 616 }, { "epoch": 0.029447208600303063, "grad_norm": 5.4288458824157715, "learning_rate": 0.0001999569239422709, "loss": 6.7944, "step": 617 }, { "epoch": 0.029494935032394316, "grad_norm": 3.9888031482696533, "learning_rate": 0.00019995677706103276, "loss": 5.9556, "step": 618 }, { "epoch": 0.02954266146448557, "grad_norm": 3.941678047180176, "learning_rate": 0.0001999566299298561, "loss": 4.7205, "step": 619 }, { "epoch": 0.029590387896576822, "grad_norm": 4.810477256774902, "learning_rate": 0.0001999564825487413, "loss": 7.2574, "step": 620 }, { "epoch": 0.029638114328668076, "grad_norm": 5.277927875518799, "learning_rate": 0.00019995633491768874, "loss": 6.5919, "step": 621 }, { "epoch": 0.02968584076075933, "grad_norm": 5.063652038574219, "learning_rate": 0.00019995618703669879, "loss": 6.0338, "step": 622 }, { "epoch": 0.02973356719285058, "grad_norm": 4.282129764556885, "learning_rate": 0.0001999560389057718, "loss": 5.717, "step": 623 }, { "epoch": 0.029781293624941835, "grad_norm": 8.708517074584961, "learning_rate": 0.00019995589052490816, "loss": 7.7195, "step": 624 }, { "epoch": 0.029829020057033088, "grad_norm": 4.4216718673706055, "learning_rate": 0.0001999557418941082, "loss": 7.554, "step": 625 }, { "epoch": 0.02987674648912434, "grad_norm": 5.973806858062744, "learning_rate": 0.00019995559301337235, "loss": 8.7278, "step": 626 }, { "epoch": 0.02992447292121559, "grad_norm": 4.879183769226074, "learning_rate": 0.00019995544388270092, "loss": 5.2387, "step": 627 }, { "epoch": 0.029972199353306844, "grad_norm": 6.435269355773926, "learning_rate": 0.00019995529450209435, "loss": 7.2195, "step": 628 }, { "epoch": 0.030019925785398097, "grad_norm": 3.8631818294525146, "learning_rate": 0.00019995514487155297, "loss": 5.7062, "step": 629 }, { "epoch": 0.03006765221748935, "grad_norm": 4.90740966796875, "learning_rate": 0.00019995499499107715, "loss": 6.0653, "step": 630 }, { "epoch": 0.030115378649580603, "grad_norm": 6.804110527038574, "learning_rate": 0.0001999548448606673, "loss": 6.5563, "step": 631 }, { "epoch": 0.030163105081671856, "grad_norm": 4.8841729164123535, "learning_rate": 0.00019995469448032377, "loss": 7.3101, "step": 632 }, { "epoch": 0.03021083151376311, "grad_norm": 4.9309258460998535, "learning_rate": 0.00019995454385004694, "loss": 7.8652, "step": 633 }, { "epoch": 0.030258557945854362, "grad_norm": 5.22358512878418, "learning_rate": 0.0001999543929698372, "loss": 7.4832, "step": 634 }, { "epoch": 0.030306284377945615, "grad_norm": 5.597085952758789, "learning_rate": 0.0001999542418396949, "loss": 5.0475, "step": 635 }, { "epoch": 0.030354010810036868, "grad_norm": 6.181306838989258, "learning_rate": 0.00019995409045962044, "loss": 7.2928, "step": 636 }, { "epoch": 0.03040173724212812, "grad_norm": 5.379262924194336, "learning_rate": 0.0001999539388296142, "loss": 7.6771, "step": 637 }, { "epoch": 0.030449463674219374, "grad_norm": 4.6227288246154785, "learning_rate": 0.00019995378694967654, "loss": 6.2112, "step": 638 }, { "epoch": 0.030497190106310627, "grad_norm": 4.964150428771973, "learning_rate": 0.00019995363481980783, "loss": 6.7917, "step": 639 }, { "epoch": 0.03054491653840188, "grad_norm": 5.433777809143066, "learning_rate": 0.0001999534824400085, "loss": 6.1934, "step": 640 }, { "epoch": 0.030592642970493134, "grad_norm": 4.705436706542969, "learning_rate": 0.0001999533298102789, "loss": 6.2307, "step": 641 }, { "epoch": 0.030640369402584387, "grad_norm": 4.595578670501709, "learning_rate": 0.0001999531769306194, "loss": 6.5358, "step": 642 }, { "epoch": 0.03068809583467564, "grad_norm": 8.22408676147461, "learning_rate": 0.0001999530238010304, "loss": 7.9383, "step": 643 }, { "epoch": 0.030735822266766893, "grad_norm": 4.760887622833252, "learning_rate": 0.0001999528704215123, "loss": 6.5925, "step": 644 }, { "epoch": 0.030783548698858146, "grad_norm": 5.1042799949646, "learning_rate": 0.00019995271679206544, "loss": 6.2162, "step": 645 }, { "epoch": 0.0308312751309494, "grad_norm": 4.76911735534668, "learning_rate": 0.00019995256291269021, "loss": 6.9028, "step": 646 }, { "epoch": 0.030879001563040652, "grad_norm": 4.064404487609863, "learning_rate": 0.00019995240878338704, "loss": 5.6423, "step": 647 }, { "epoch": 0.030926727995131905, "grad_norm": 5.8814592361450195, "learning_rate": 0.00019995225440415628, "loss": 7.1905, "step": 648 }, { "epoch": 0.030974454427223158, "grad_norm": 3.990440845489502, "learning_rate": 0.0001999520997749983, "loss": 5.9157, "step": 649 }, { "epoch": 0.03102218085931441, "grad_norm": 4.870462417602539, "learning_rate": 0.00019995194489591354, "loss": 5.8078, "step": 650 }, { "epoch": 0.031069907291405664, "grad_norm": 4.1922407150268555, "learning_rate": 0.00019995178976690237, "loss": 5.4015, "step": 651 }, { "epoch": 0.031117633723496917, "grad_norm": 4.662567615509033, "learning_rate": 0.00019995163438796512, "loss": 6.4601, "step": 652 }, { "epoch": 0.031165360155588167, "grad_norm": 3.675523281097412, "learning_rate": 0.00019995147875910224, "loss": 4.5766, "step": 653 }, { "epoch": 0.03121308658767942, "grad_norm": 4.701218128204346, "learning_rate": 0.00019995132288031412, "loss": 6.9513, "step": 654 }, { "epoch": 0.03126081301977068, "grad_norm": 7.849576950073242, "learning_rate": 0.0001999511667516011, "loss": 7.7071, "step": 655 }, { "epoch": 0.031308539451861926, "grad_norm": 4.583592891693115, "learning_rate": 0.00019995101037296363, "loss": 6.1542, "step": 656 }, { "epoch": 0.03135626588395318, "grad_norm": 6.747661590576172, "learning_rate": 0.00019995085374440206, "loss": 5.995, "step": 657 }, { "epoch": 0.03140399231604443, "grad_norm": 5.124732971191406, "learning_rate": 0.0001999506968659168, "loss": 6.9491, "step": 658 }, { "epoch": 0.03145171874813569, "grad_norm": 5.057847023010254, "learning_rate": 0.00019995053973750825, "loss": 5.5953, "step": 659 }, { "epoch": 0.03149944518022694, "grad_norm": 5.604503154754639, "learning_rate": 0.00019995038235917675, "loss": 6.1385, "step": 660 }, { "epoch": 0.031547171612318195, "grad_norm": 4.473517894744873, "learning_rate": 0.00019995022473092276, "loss": 5.679, "step": 661 }, { "epoch": 0.031594898044409445, "grad_norm": 5.037128448486328, "learning_rate": 0.00019995006685274663, "loss": 7.2914, "step": 662 }, { "epoch": 0.0316426244765007, "grad_norm": 4.848421096801758, "learning_rate": 0.0001999499087246488, "loss": 6.4146, "step": 663 }, { "epoch": 0.03169035090859195, "grad_norm": 5.241678237915039, "learning_rate": 0.0001999497503466296, "loss": 5.7625, "step": 664 }, { "epoch": 0.03173807734068321, "grad_norm": 4.927211284637451, "learning_rate": 0.0001999495917186895, "loss": 7.3099, "step": 665 }, { "epoch": 0.03178580377277446, "grad_norm": 4.775562763214111, "learning_rate": 0.00019994943284082885, "loss": 6.2814, "step": 666 }, { "epoch": 0.031833530204865707, "grad_norm": 4.626981735229492, "learning_rate": 0.00019994927371304807, "loss": 5.8708, "step": 667 }, { "epoch": 0.03188125663695696, "grad_norm": 4.377954483032227, "learning_rate": 0.00019994911433534752, "loss": 6.8265, "step": 668 }, { "epoch": 0.03192898306904821, "grad_norm": 5.780749797821045, "learning_rate": 0.00019994895470772765, "loss": 7.0482, "step": 669 }, { "epoch": 0.03197670950113947, "grad_norm": 3.679264783859253, "learning_rate": 0.0001999487948301888, "loss": 5.0797, "step": 670 }, { "epoch": 0.03202443593323072, "grad_norm": 4.032260417938232, "learning_rate": 0.00019994863470273143, "loss": 5.2758, "step": 671 }, { "epoch": 0.032072162365321975, "grad_norm": 5.540952205657959, "learning_rate": 0.0001999484743253559, "loss": 6.8454, "step": 672 }, { "epoch": 0.032119888797413225, "grad_norm": 7.048995494842529, "learning_rate": 0.00019994831369806265, "loss": 9.0686, "step": 673 }, { "epoch": 0.03216761522950448, "grad_norm": 5.23916482925415, "learning_rate": 0.00019994815282085203, "loss": 5.9261, "step": 674 }, { "epoch": 0.03221534166159573, "grad_norm": 5.443724632263184, "learning_rate": 0.00019994799169372447, "loss": 6.905, "step": 675 }, { "epoch": 0.03226306809368699, "grad_norm": 9.625133514404297, "learning_rate": 0.00019994783031668036, "loss": 8.9539, "step": 676 }, { "epoch": 0.03231079452577824, "grad_norm": 4.8145575523376465, "learning_rate": 0.00019994766868972015, "loss": 7.8852, "step": 677 }, { "epoch": 0.032358520957869494, "grad_norm": 6.9294939041137695, "learning_rate": 0.0001999475068128442, "loss": 8.2033, "step": 678 }, { "epoch": 0.03240624738996074, "grad_norm": 5.0194220542907715, "learning_rate": 0.0001999473446860529, "loss": 7.4619, "step": 679 }, { "epoch": 0.032453973822052, "grad_norm": 5.095991134643555, "learning_rate": 0.0001999471823093467, "loss": 6.8233, "step": 680 }, { "epoch": 0.03250170025414325, "grad_norm": 6.12284517288208, "learning_rate": 0.00019994701968272595, "loss": 8.0903, "step": 681 }, { "epoch": 0.032549426686234506, "grad_norm": 4.131306171417236, "learning_rate": 0.00019994685680619113, "loss": 5.7543, "step": 682 }, { "epoch": 0.032597153118325756, "grad_norm": 4.519933700561523, "learning_rate": 0.00019994669367974258, "loss": 5.817, "step": 683 }, { "epoch": 0.03264487955041701, "grad_norm": 4.053531169891357, "learning_rate": 0.00019994653030338077, "loss": 5.69, "step": 684 }, { "epoch": 0.03269260598250826, "grad_norm": 5.133871078491211, "learning_rate": 0.00019994636667710608, "loss": 6.8428, "step": 685 }, { "epoch": 0.03274033241459952, "grad_norm": 5.971726417541504, "learning_rate": 0.00019994620280091888, "loss": 6.5619, "step": 686 }, { "epoch": 0.03278805884669077, "grad_norm": 6.018060684204102, "learning_rate": 0.0001999460386748196, "loss": 5.9128, "step": 687 }, { "epoch": 0.032835785278782025, "grad_norm": 6.530747890472412, "learning_rate": 0.00019994587429880874, "loss": 8.0376, "step": 688 }, { "epoch": 0.032883511710873274, "grad_norm": 5.91585636138916, "learning_rate": 0.0001999457096728866, "loss": 8.2237, "step": 689 }, { "epoch": 0.03293123814296453, "grad_norm": 5.142551898956299, "learning_rate": 0.00019994554479705362, "loss": 7.2366, "step": 690 }, { "epoch": 0.03297896457505578, "grad_norm": 5.149393081665039, "learning_rate": 0.0001999453796713102, "loss": 7.3637, "step": 691 }, { "epoch": 0.03302669100714703, "grad_norm": 5.167393684387207, "learning_rate": 0.00019994521429565678, "loss": 7.2236, "step": 692 }, { "epoch": 0.033074417439238286, "grad_norm": 6.182661533355713, "learning_rate": 0.0001999450486700938, "loss": 5.949, "step": 693 }, { "epoch": 0.033122143871329536, "grad_norm": 7.329122066497803, "learning_rate": 0.0001999448827946216, "loss": 6.4546, "step": 694 }, { "epoch": 0.03316987030342079, "grad_norm": 8.433281898498535, "learning_rate": 0.00019994471666924066, "loss": 6.7018, "step": 695 }, { "epoch": 0.03321759673551204, "grad_norm": 5.727491855621338, "learning_rate": 0.00019994455029395135, "loss": 6.2153, "step": 696 }, { "epoch": 0.0332653231676033, "grad_norm": 7.6868133544921875, "learning_rate": 0.00019994438366875412, "loss": 7.0914, "step": 697 }, { "epoch": 0.03331304959969455, "grad_norm": 6.715877532958984, "learning_rate": 0.00019994421679364938, "loss": 8.2489, "step": 698 }, { "epoch": 0.033360776031785805, "grad_norm": 4.513460636138916, "learning_rate": 0.00019994404966863752, "loss": 6.3282, "step": 699 }, { "epoch": 0.033408502463877054, "grad_norm": 5.645149230957031, "learning_rate": 0.00019994388229371902, "loss": 8.0029, "step": 700 }, { "epoch": 0.03345622889596831, "grad_norm": 6.176321029663086, "learning_rate": 0.0001999437146688942, "loss": 4.6995, "step": 701 }, { "epoch": 0.03350395532805956, "grad_norm": 6.801309108734131, "learning_rate": 0.00019994354679416356, "loss": 11.1667, "step": 702 }, { "epoch": 0.03355168176015082, "grad_norm": 6.066688060760498, "learning_rate": 0.0001999433786695275, "loss": 7.2694, "step": 703 }, { "epoch": 0.03359940819224207, "grad_norm": 6.609181880950928, "learning_rate": 0.00019994321029498643, "loss": 7.733, "step": 704 }, { "epoch": 0.03364713462433332, "grad_norm": 4.822350025177002, "learning_rate": 0.00019994304167054078, "loss": 6.0339, "step": 705 }, { "epoch": 0.03369486105642457, "grad_norm": 5.123159408569336, "learning_rate": 0.00019994287279619097, "loss": 8.0052, "step": 706 }, { "epoch": 0.03374258748851583, "grad_norm": 5.20865535736084, "learning_rate": 0.00019994270367193741, "loss": 6.5163, "step": 707 }, { "epoch": 0.03379031392060708, "grad_norm": 6.101910591125488, "learning_rate": 0.00019994253429778057, "loss": 8.3072, "step": 708 }, { "epoch": 0.033838040352698336, "grad_norm": 4.972561359405518, "learning_rate": 0.00019994236467372082, "loss": 6.0224, "step": 709 }, { "epoch": 0.033885766784789585, "grad_norm": 3.8703560829162598, "learning_rate": 0.00019994219479975856, "loss": 4.6104, "step": 710 }, { "epoch": 0.03393349321688084, "grad_norm": 5.2776265144348145, "learning_rate": 0.0001999420246758943, "loss": 6.9, "step": 711 }, { "epoch": 0.03398121964897209, "grad_norm": 5.502066612243652, "learning_rate": 0.0001999418543021284, "loss": 6.6334, "step": 712 }, { "epoch": 0.03402894608106335, "grad_norm": 4.197521209716797, "learning_rate": 0.00019994168367846134, "loss": 5.2104, "step": 713 }, { "epoch": 0.0340766725131546, "grad_norm": 6.760063648223877, "learning_rate": 0.0001999415128048935, "loss": 6.3762, "step": 714 }, { "epoch": 0.034124398945245854, "grad_norm": 4.419248104095459, "learning_rate": 0.00019994134168142532, "loss": 5.4752, "step": 715 }, { "epoch": 0.034172125377337104, "grad_norm": 4.384754180908203, "learning_rate": 0.00019994117030805718, "loss": 5.7785, "step": 716 }, { "epoch": 0.03421985180942835, "grad_norm": 8.127161026000977, "learning_rate": 0.0001999409986847896, "loss": 10.4695, "step": 717 }, { "epoch": 0.03426757824151961, "grad_norm": 6.404677867889404, "learning_rate": 0.00019994082681162298, "loss": 8.2666, "step": 718 }, { "epoch": 0.03431530467361086, "grad_norm": 4.863952159881592, "learning_rate": 0.00019994065468855775, "loss": 6.4182, "step": 719 }, { "epoch": 0.034363031105702116, "grad_norm": 6.348741054534912, "learning_rate": 0.00019994048231559427, "loss": 6.3373, "step": 720 }, { "epoch": 0.034410757537793366, "grad_norm": 6.7471699714660645, "learning_rate": 0.0001999403096927331, "loss": 7.206, "step": 721 }, { "epoch": 0.03445848396988462, "grad_norm": 9.817275047302246, "learning_rate": 0.00019994013681997453, "loss": 9.551, "step": 722 }, { "epoch": 0.03450621040197587, "grad_norm": 5.279532432556152, "learning_rate": 0.0001999399636973191, "loss": 6.3958, "step": 723 }, { "epoch": 0.03455393683406713, "grad_norm": 4.0019683837890625, "learning_rate": 0.0001999397903247672, "loss": 5.3662, "step": 724 }, { "epoch": 0.03460166326615838, "grad_norm": 3.9332621097564697, "learning_rate": 0.00019993961670231925, "loss": 4.4706, "step": 725 }, { "epoch": 0.034649389698249634, "grad_norm": 4.0796284675598145, "learning_rate": 0.00019993944282997573, "loss": 5.3288, "step": 726 }, { "epoch": 0.034697116130340884, "grad_norm": 4.95109748840332, "learning_rate": 0.00019993926870773705, "loss": 6.365, "step": 727 }, { "epoch": 0.03474484256243214, "grad_norm": 4.6030144691467285, "learning_rate": 0.00019993909433560362, "loss": 5.1978, "step": 728 }, { "epoch": 0.03479256899452339, "grad_norm": 4.214527130126953, "learning_rate": 0.0001999389197135759, "loss": 3.751, "step": 729 }, { "epoch": 0.03484029542661465, "grad_norm": 158.9529266357422, "learning_rate": 0.00019993874484165435, "loss": 7.7418, "step": 730 }, { "epoch": 0.034888021858705896, "grad_norm": 5.610410690307617, "learning_rate": 0.00019993856971983935, "loss": 7.2712, "step": 731 }, { "epoch": 0.03493574829079715, "grad_norm": 5.346312522888184, "learning_rate": 0.0001999383943481314, "loss": 7.0588, "step": 732 }, { "epoch": 0.0349834747228884, "grad_norm": 5.416896343231201, "learning_rate": 0.0001999382187265309, "loss": 6.0247, "step": 733 }, { "epoch": 0.03503120115497966, "grad_norm": 5.815309524536133, "learning_rate": 0.00019993804285503827, "loss": 9.2724, "step": 734 }, { "epoch": 0.03507892758707091, "grad_norm": 5.113542079925537, "learning_rate": 0.000199937866733654, "loss": 5.1882, "step": 735 }, { "epoch": 0.035126654019162165, "grad_norm": 5.898989677429199, "learning_rate": 0.0001999376903623785, "loss": 6.3857, "step": 736 }, { "epoch": 0.035174380451253415, "grad_norm": 6.181110382080078, "learning_rate": 0.00019993751374121224, "loss": 7.5154, "step": 737 }, { "epoch": 0.03522210688334467, "grad_norm": 4.788928508758545, "learning_rate": 0.00019993733687015562, "loss": 5.4642, "step": 738 }, { "epoch": 0.03526983331543592, "grad_norm": 4.074676513671875, "learning_rate": 0.00019993715974920913, "loss": 7.1135, "step": 739 }, { "epoch": 0.03531755974752718, "grad_norm": 5.06754207611084, "learning_rate": 0.00019993698237837318, "loss": 7.0229, "step": 740 }, { "epoch": 0.03536528617961843, "grad_norm": 4.875411033630371, "learning_rate": 0.0001999368047576482, "loss": 6.1099, "step": 741 }, { "epoch": 0.035413012611709684, "grad_norm": 5.274248123168945, "learning_rate": 0.00019993662688703466, "loss": 5.5344, "step": 742 }, { "epoch": 0.03546073904380093, "grad_norm": 5.2172770500183105, "learning_rate": 0.000199936448766533, "loss": 6.4255, "step": 743 }, { "epoch": 0.03550846547589218, "grad_norm": 5.027593612670898, "learning_rate": 0.00019993627039614366, "loss": 7.6667, "step": 744 }, { "epoch": 0.03555619190798344, "grad_norm": 5.327010631561279, "learning_rate": 0.00019993609177586713, "loss": 7.2605, "step": 745 }, { "epoch": 0.03560391834007469, "grad_norm": 5.417986869812012, "learning_rate": 0.00019993591290570377, "loss": 7.9533, "step": 746 }, { "epoch": 0.035651644772165945, "grad_norm": 4.68738317489624, "learning_rate": 0.00019993573378565408, "loss": 5.2312, "step": 747 }, { "epoch": 0.035699371204257195, "grad_norm": 6.335091590881348, "learning_rate": 0.0001999355544157185, "loss": 7.1459, "step": 748 }, { "epoch": 0.03574709763634845, "grad_norm": 4.362058162689209, "learning_rate": 0.0001999353747958975, "loss": 5.6782, "step": 749 }, { "epoch": 0.0357948240684397, "grad_norm": 3.864102363586426, "learning_rate": 0.00019993519492619147, "loss": 5.3774, "step": 750 }, { "epoch": 0.03584255050053096, "grad_norm": 5.608258247375488, "learning_rate": 0.00019993501480660095, "loss": 7.2503, "step": 751 }, { "epoch": 0.03589027693262221, "grad_norm": 5.483639240264893, "learning_rate": 0.0001999348344371263, "loss": 9.4644, "step": 752 }, { "epoch": 0.035938003364713464, "grad_norm": 4.786868095397949, "learning_rate": 0.00019993465381776805, "loss": 5.9619, "step": 753 }, { "epoch": 0.035985729796804714, "grad_norm": 7.74579381942749, "learning_rate": 0.0001999344729485266, "loss": 8.1731, "step": 754 }, { "epoch": 0.03603345622889597, "grad_norm": 5.121527194976807, "learning_rate": 0.0001999342918294024, "loss": 6.6349, "step": 755 }, { "epoch": 0.03608118266098722, "grad_norm": 4.717724323272705, "learning_rate": 0.00019993411046039593, "loss": 7.4566, "step": 756 }, { "epoch": 0.036128909093078476, "grad_norm": 4.772352695465088, "learning_rate": 0.0001999339288415076, "loss": 6.6679, "step": 757 }, { "epoch": 0.036176635525169726, "grad_norm": 5.122435092926025, "learning_rate": 0.0001999337469727379, "loss": 6.1561, "step": 758 }, { "epoch": 0.03622436195726098, "grad_norm": 4.617373466491699, "learning_rate": 0.0001999335648540873, "loss": 6.7975, "step": 759 }, { "epoch": 0.03627208838935223, "grad_norm": 5.723831653594971, "learning_rate": 0.00019993338248555623, "loss": 6.6176, "step": 760 }, { "epoch": 0.03631981482144349, "grad_norm": 4.896640300750732, "learning_rate": 0.00019993319986714517, "loss": 6.3736, "step": 761 }, { "epoch": 0.03636754125353474, "grad_norm": 3.952786922454834, "learning_rate": 0.0001999330169988545, "loss": 4.5168, "step": 762 }, { "epoch": 0.036415267685625995, "grad_norm": 9.36917781829834, "learning_rate": 0.00019993283388068482, "loss": 10.9873, "step": 763 }, { "epoch": 0.036462994117717244, "grad_norm": 4.090841293334961, "learning_rate": 0.00019993265051263643, "loss": 5.6641, "step": 764 }, { "epoch": 0.0365107205498085, "grad_norm": 5.485438823699951, "learning_rate": 0.0001999324668947099, "loss": 5.6358, "step": 765 }, { "epoch": 0.03655844698189975, "grad_norm": 3.9010746479034424, "learning_rate": 0.00019993228302690565, "loss": 5.9309, "step": 766 }, { "epoch": 0.03660617341399101, "grad_norm": 5.523644924163818, "learning_rate": 0.0001999320989092241, "loss": 6.3502, "step": 767 }, { "epoch": 0.03665389984608226, "grad_norm": 4.358931541442871, "learning_rate": 0.00019993191454166582, "loss": 5.2274, "step": 768 }, { "epoch": 0.036701626278173506, "grad_norm": 4.744372844696045, "learning_rate": 0.00019993172992423115, "loss": 6.9873, "step": 769 }, { "epoch": 0.03674935271026476, "grad_norm": 5.708926677703857, "learning_rate": 0.0001999315450569206, "loss": 7.2152, "step": 770 }, { "epoch": 0.03679707914235601, "grad_norm": 5.504681587219238, "learning_rate": 0.00019993135993973465, "loss": 8.1215, "step": 771 }, { "epoch": 0.03684480557444727, "grad_norm": 33.9450798034668, "learning_rate": 0.00019993117457267376, "loss": 5.6863, "step": 772 }, { "epoch": 0.03689253200653852, "grad_norm": 3.4234366416931152, "learning_rate": 0.00019993098895573836, "loss": 4.3959, "step": 773 }, { "epoch": 0.036940258438629775, "grad_norm": 4.7213969230651855, "learning_rate": 0.00019993080308892895, "loss": 5.3723, "step": 774 }, { "epoch": 0.036987984870721025, "grad_norm": 5.803373336791992, "learning_rate": 0.000199930616972246, "loss": 6.8033, "step": 775 }, { "epoch": 0.03703571130281228, "grad_norm": 5.85810661315918, "learning_rate": 0.00019993043060568993, "loss": 6.7726, "step": 776 }, { "epoch": 0.03708343773490353, "grad_norm": 4.8375067710876465, "learning_rate": 0.00019993024398926124, "loss": 5.5222, "step": 777 }, { "epoch": 0.03713116416699479, "grad_norm": 5.148411750793457, "learning_rate": 0.00019993005712296038, "loss": 8.0447, "step": 778 }, { "epoch": 0.03717889059908604, "grad_norm": 4.476199626922607, "learning_rate": 0.00019992987000678783, "loss": 5.5438, "step": 779 }, { "epoch": 0.03722661703117729, "grad_norm": 4.82620096206665, "learning_rate": 0.00019992968264074407, "loss": 6.1392, "step": 780 }, { "epoch": 0.03727434346326854, "grad_norm": 6.114699840545654, "learning_rate": 0.0001999294950248295, "loss": 6.3438, "step": 781 }, { "epoch": 0.0373220698953598, "grad_norm": 5.124344825744629, "learning_rate": 0.00019992930715904473, "loss": 7.2955, "step": 782 }, { "epoch": 0.03736979632745105, "grad_norm": 5.396039962768555, "learning_rate": 0.00019992911904339006, "loss": 5.4887, "step": 783 }, { "epoch": 0.037417522759542306, "grad_norm": 5.533827304840088, "learning_rate": 0.0001999289306778661, "loss": 7.0651, "step": 784 }, { "epoch": 0.037465249191633555, "grad_norm": 6.619016647338867, "learning_rate": 0.00019992874206247322, "loss": 6.436, "step": 785 }, { "epoch": 0.03751297562372481, "grad_norm": 4.265638828277588, "learning_rate": 0.00019992855319721193, "loss": 4.9468, "step": 786 }, { "epoch": 0.03756070205581606, "grad_norm": 6.250490665435791, "learning_rate": 0.00019992836408208272, "loss": 5.6211, "step": 787 }, { "epoch": 0.03760842848790732, "grad_norm": 6.974722385406494, "learning_rate": 0.00019992817471708608, "loss": 7.2161, "step": 788 }, { "epoch": 0.03765615491999857, "grad_norm": 4.346407890319824, "learning_rate": 0.00019992798510222245, "loss": 5.7933, "step": 789 }, { "epoch": 0.037703881352089824, "grad_norm": 5.807689666748047, "learning_rate": 0.00019992779523749226, "loss": 7.0523, "step": 790 }, { "epoch": 0.037751607784181074, "grad_norm": 5.892138957977295, "learning_rate": 0.00019992760512289604, "loss": 5.9393, "step": 791 }, { "epoch": 0.03779933421627233, "grad_norm": 5.707152366638184, "learning_rate": 0.00019992741475843427, "loss": 5.5115, "step": 792 }, { "epoch": 0.03784706064836358, "grad_norm": 6.103639602661133, "learning_rate": 0.00019992722414410743, "loss": 6.5975, "step": 793 }, { "epoch": 0.03789478708045483, "grad_norm": 4.646749496459961, "learning_rate": 0.00019992703327991599, "loss": 5.1983, "step": 794 }, { "epoch": 0.037942513512546086, "grad_norm": 4.501780033111572, "learning_rate": 0.0001999268421658604, "loss": 6.1563, "step": 795 }, { "epoch": 0.037990239944637336, "grad_norm": 4.784836769104004, "learning_rate": 0.00019992665080194113, "loss": 6.4041, "step": 796 }, { "epoch": 0.03803796637672859, "grad_norm": 3.8780477046966553, "learning_rate": 0.0001999264591881587, "loss": 5.4452, "step": 797 }, { "epoch": 0.03808569280881984, "grad_norm": 3.4854323863983154, "learning_rate": 0.00019992626732451358, "loss": 3.6153, "step": 798 }, { "epoch": 0.0381334192409111, "grad_norm": 4.899331569671631, "learning_rate": 0.00019992607521100624, "loss": 5.5189, "step": 799 }, { "epoch": 0.03818114567300235, "grad_norm": 4.477373123168945, "learning_rate": 0.00019992588284763717, "loss": 5.6457, "step": 800 }, { "epoch": 0.03818114567300235, "eval_loss": 1.6340223550796509, "eval_runtime": 96.465, "eval_samples_per_second": 8.739, "eval_steps_per_second": 4.375, "step": 800 }, { "epoch": 0.038228872105093605, "grad_norm": 5.421757221221924, "learning_rate": 0.00019992569023440684, "loss": 6.1111, "step": 801 }, { "epoch": 0.038276598537184854, "grad_norm": 4.366212844848633, "learning_rate": 0.00019992549737131573, "loss": 4.8041, "step": 802 }, { "epoch": 0.03832432496927611, "grad_norm": 10.853283882141113, "learning_rate": 0.00019992530425836432, "loss": 8.4207, "step": 803 }, { "epoch": 0.03837205140136736, "grad_norm": 7.737257957458496, "learning_rate": 0.0001999251108955531, "loss": 7.22, "step": 804 }, { "epoch": 0.03841977783345862, "grad_norm": 8.10062026977539, "learning_rate": 0.00019992491728288255, "loss": 7.6541, "step": 805 }, { "epoch": 0.038467504265549866, "grad_norm": 8.392425537109375, "learning_rate": 0.00019992472342035317, "loss": 5.9934, "step": 806 }, { "epoch": 0.03851523069764112, "grad_norm": 4.973547458648682, "learning_rate": 0.00019992452930796544, "loss": 5.6888, "step": 807 }, { "epoch": 0.03856295712973237, "grad_norm": 5.641502380371094, "learning_rate": 0.00019992433494571982, "loss": 6.3525, "step": 808 }, { "epoch": 0.03861068356182363, "grad_norm": 4.655037879943848, "learning_rate": 0.00019992414033361683, "loss": 7.5261, "step": 809 }, { "epoch": 0.03865840999391488, "grad_norm": 4.71242094039917, "learning_rate": 0.00019992394547165693, "loss": 7.0859, "step": 810 }, { "epoch": 0.038706136426006135, "grad_norm": 6.890414237976074, "learning_rate": 0.00019992375035984063, "loss": 8.5151, "step": 811 }, { "epoch": 0.038753862858097385, "grad_norm": 5.453627109527588, "learning_rate": 0.0001999235549981684, "loss": 8.0741, "step": 812 }, { "epoch": 0.03880158929018864, "grad_norm": 4.234425067901611, "learning_rate": 0.00019992335938664074, "loss": 5.458, "step": 813 }, { "epoch": 0.03884931572227989, "grad_norm": 5.465920925140381, "learning_rate": 0.00019992316352525813, "loss": 7.1554, "step": 814 }, { "epoch": 0.03889704215437115, "grad_norm": 4.905524730682373, "learning_rate": 0.00019992296741402104, "loss": 5.5541, "step": 815 }, { "epoch": 0.0389447685864624, "grad_norm": 4.786214351654053, "learning_rate": 0.00019992277105293003, "loss": 7.1823, "step": 816 }, { "epoch": 0.038992495018553654, "grad_norm": 5.380709171295166, "learning_rate": 0.00019992257444198552, "loss": 5.869, "step": 817 }, { "epoch": 0.0390402214506449, "grad_norm": 4.273427486419678, "learning_rate": 0.00019992237758118802, "loss": 6.3333, "step": 818 }, { "epoch": 0.03908794788273615, "grad_norm": 5.503867149353027, "learning_rate": 0.000199922180470538, "loss": 4.9706, "step": 819 }, { "epoch": 0.03913567431482741, "grad_norm": 18.185144424438477, "learning_rate": 0.00019992198311003605, "loss": 8.2761, "step": 820 }, { "epoch": 0.03918340074691866, "grad_norm": 4.861197471618652, "learning_rate": 0.00019992178549968256, "loss": 6.0344, "step": 821 }, { "epoch": 0.039231127179009916, "grad_norm": 5.1487956047058105, "learning_rate": 0.00019992158763947802, "loss": 6.7587, "step": 822 }, { "epoch": 0.039278853611101165, "grad_norm": 5.880436897277832, "learning_rate": 0.00019992138952942303, "loss": 6.2498, "step": 823 }, { "epoch": 0.03932658004319242, "grad_norm": 4.6059956550598145, "learning_rate": 0.000199921191169518, "loss": 5.7605, "step": 824 }, { "epoch": 0.03937430647528367, "grad_norm": 4.659731388092041, "learning_rate": 0.0001999209925597634, "loss": 7.4061, "step": 825 }, { "epoch": 0.03942203290737493, "grad_norm": 6.699813365936279, "learning_rate": 0.00019992079370015985, "loss": 7.8969, "step": 826 }, { "epoch": 0.03946975933946618, "grad_norm": 5.710276126861572, "learning_rate": 0.00019992059459070772, "loss": 6.3136, "step": 827 }, { "epoch": 0.039517485771557434, "grad_norm": 5.763699531555176, "learning_rate": 0.00019992039523140758, "loss": 6.5576, "step": 828 }, { "epoch": 0.039565212203648684, "grad_norm": 5.533675670623779, "learning_rate": 0.00019992019562225987, "loss": 6.3175, "step": 829 }, { "epoch": 0.03961293863573994, "grad_norm": 8.701111793518066, "learning_rate": 0.00019991999576326515, "loss": 10.4301, "step": 830 }, { "epoch": 0.03966066506783119, "grad_norm": 6.8197150230407715, "learning_rate": 0.00019991979565442386, "loss": 6.0745, "step": 831 }, { "epoch": 0.039708391499922446, "grad_norm": 6.417983055114746, "learning_rate": 0.0001999195952957366, "loss": 6.7477, "step": 832 }, { "epoch": 0.039756117932013696, "grad_norm": 6.684429168701172, "learning_rate": 0.00019991939468720374, "loss": 7.9432, "step": 833 }, { "epoch": 0.03980384436410495, "grad_norm": 7.779618740081787, "learning_rate": 0.00019991919382882588, "loss": 8.8549, "step": 834 }, { "epoch": 0.0398515707961962, "grad_norm": 4.943531513214111, "learning_rate": 0.0001999189927206035, "loss": 5.1178, "step": 835 }, { "epoch": 0.03989929722828746, "grad_norm": 7.583806991577148, "learning_rate": 0.00019991879136253707, "loss": 5.8212, "step": 836 }, { "epoch": 0.03994702366037871, "grad_norm": 4.468966960906982, "learning_rate": 0.00019991858975462712, "loss": 4.3436, "step": 837 }, { "epoch": 0.039994750092469965, "grad_norm": 3.747352123260498, "learning_rate": 0.00019991838789687415, "loss": 5.5256, "step": 838 }, { "epoch": 0.040042476524561214, "grad_norm": 6.284273624420166, "learning_rate": 0.00019991818578927868, "loss": 6.8324, "step": 839 }, { "epoch": 0.04009020295665247, "grad_norm": 5.346858978271484, "learning_rate": 0.00019991798343184117, "loss": 6.477, "step": 840 }, { "epoch": 0.04013792938874372, "grad_norm": 6.024190902709961, "learning_rate": 0.00019991778082456216, "loss": 8.2267, "step": 841 }, { "epoch": 0.04018565582083498, "grad_norm": 5.223745346069336, "learning_rate": 0.00019991757796744217, "loss": 6.6074, "step": 842 }, { "epoch": 0.04023338225292623, "grad_norm": 5.898392200469971, "learning_rate": 0.0001999173748604817, "loss": 6.3628, "step": 843 }, { "epoch": 0.04028110868501748, "grad_norm": 4.170562267303467, "learning_rate": 0.00019991717150368124, "loss": 5.0088, "step": 844 }, { "epoch": 0.04032883511710873, "grad_norm": 4.563225269317627, "learning_rate": 0.00019991696789704128, "loss": 5.5918, "step": 845 }, { "epoch": 0.04037656154919998, "grad_norm": 4.833973407745361, "learning_rate": 0.00019991676404056236, "loss": 7.3408, "step": 846 }, { "epoch": 0.04042428798129124, "grad_norm": 4.248901844024658, "learning_rate": 0.000199916559934245, "loss": 6.3985, "step": 847 }, { "epoch": 0.04047201441338249, "grad_norm": 5.424138069152832, "learning_rate": 0.00019991635557808972, "loss": 7.6996, "step": 848 }, { "epoch": 0.040519740845473745, "grad_norm": 7.372026443481445, "learning_rate": 0.00019991615097209698, "loss": 6.8191, "step": 849 }, { "epoch": 0.040567467277564995, "grad_norm": 5.958218574523926, "learning_rate": 0.0001999159461162673, "loss": 5.6678, "step": 850 }, { "epoch": 0.04061519370965625, "grad_norm": 4.4531402587890625, "learning_rate": 0.00019991574101060124, "loss": 5.6079, "step": 851 }, { "epoch": 0.0406629201417475, "grad_norm": 3.585167646408081, "learning_rate": 0.0001999155356550993, "loss": 5.8294, "step": 852 }, { "epoch": 0.04071064657383876, "grad_norm": 4.364635467529297, "learning_rate": 0.0001999153300497619, "loss": 6.0434, "step": 853 }, { "epoch": 0.04075837300593001, "grad_norm": 7.368249893188477, "learning_rate": 0.0001999151241945897, "loss": 8.1558, "step": 854 }, { "epoch": 0.040806099438021264, "grad_norm": 5.28629207611084, "learning_rate": 0.00019991491808958313, "loss": 6.326, "step": 855 }, { "epoch": 0.04085382587011251, "grad_norm": 6.350910663604736, "learning_rate": 0.0001999147117347427, "loss": 8.4199, "step": 856 }, { "epoch": 0.04090155230220377, "grad_norm": 3.3403286933898926, "learning_rate": 0.00019991450513006894, "loss": 4.2916, "step": 857 }, { "epoch": 0.04094927873429502, "grad_norm": 4.892611980438232, "learning_rate": 0.0001999142982755624, "loss": 6.651, "step": 858 }, { "epoch": 0.040997005166386276, "grad_norm": 6.889688968658447, "learning_rate": 0.00019991409117122358, "loss": 9.8606, "step": 859 }, { "epoch": 0.041044731598477525, "grad_norm": 4.9244818687438965, "learning_rate": 0.00019991388381705299, "loss": 6.7834, "step": 860 }, { "epoch": 0.04109245803056878, "grad_norm": 5.194118022918701, "learning_rate": 0.00019991367621305113, "loss": 5.6759, "step": 861 }, { "epoch": 0.04114018446266003, "grad_norm": 3.4580252170562744, "learning_rate": 0.00019991346835921854, "loss": 5.3421, "step": 862 }, { "epoch": 0.04118791089475129, "grad_norm": 4.443639755249023, "learning_rate": 0.0001999132602555557, "loss": 5.6086, "step": 863 }, { "epoch": 0.04123563732684254, "grad_norm": 4.395709037780762, "learning_rate": 0.0001999130519020632, "loss": 6.5863, "step": 864 }, { "epoch": 0.041283363758933794, "grad_norm": 4.652389049530029, "learning_rate": 0.00019991284329874153, "loss": 5.7735, "step": 865 }, { "epoch": 0.041331090191025044, "grad_norm": 6.625052452087402, "learning_rate": 0.0001999126344455912, "loss": 7.4755, "step": 866 }, { "epoch": 0.0413788166231163, "grad_norm": 4.4573798179626465, "learning_rate": 0.00019991242534261275, "loss": 6.2518, "step": 867 }, { "epoch": 0.04142654305520755, "grad_norm": 4.410787105560303, "learning_rate": 0.0001999122159898067, "loss": 5.1946, "step": 868 }, { "epoch": 0.04147426948729881, "grad_norm": 4.330233573913574, "learning_rate": 0.00019991200638717357, "loss": 5.605, "step": 869 }, { "epoch": 0.041521995919390056, "grad_norm": 6.827779293060303, "learning_rate": 0.00019991179653471383, "loss": 7.47, "step": 870 }, { "epoch": 0.041569722351481306, "grad_norm": 5.49586296081543, "learning_rate": 0.0001999115864324281, "loss": 7.0643, "step": 871 }, { "epoch": 0.04161744878357256, "grad_norm": 5.633253574371338, "learning_rate": 0.00019991137608031684, "loss": 6.3781, "step": 872 }, { "epoch": 0.04166517521566381, "grad_norm": 4.588703632354736, "learning_rate": 0.00019991116547838064, "loss": 6.8861, "step": 873 }, { "epoch": 0.04171290164775507, "grad_norm": 5.573362827301025, "learning_rate": 0.00019991095462661994, "loss": 5.9548, "step": 874 }, { "epoch": 0.04176062807984632, "grad_norm": 5.8958306312561035, "learning_rate": 0.00019991074352503533, "loss": 7.5237, "step": 875 }, { "epoch": 0.041808354511937575, "grad_norm": 6.3074951171875, "learning_rate": 0.00019991053217362732, "loss": 7.616, "step": 876 }, { "epoch": 0.041856080944028824, "grad_norm": 4.953910827636719, "learning_rate": 0.00019991032057239646, "loss": 6.6202, "step": 877 }, { "epoch": 0.04190380737612008, "grad_norm": 5.442967891693115, "learning_rate": 0.0001999101087213432, "loss": 7.0154, "step": 878 }, { "epoch": 0.04195153380821133, "grad_norm": 3.637193441390991, "learning_rate": 0.00019990989662046818, "loss": 4.9516, "step": 879 }, { "epoch": 0.04199926024030259, "grad_norm": 5.736254692077637, "learning_rate": 0.00019990968426977183, "loss": 8.772, "step": 880 }, { "epoch": 0.042046986672393837, "grad_norm": 5.409998893737793, "learning_rate": 0.00019990947166925475, "loss": 5.7412, "step": 881 }, { "epoch": 0.04209471310448509, "grad_norm": 5.611230850219727, "learning_rate": 0.00019990925881891746, "loss": 7.5416, "step": 882 }, { "epoch": 0.04214243953657634, "grad_norm": 4.482954502105713, "learning_rate": 0.00019990904571876047, "loss": 4.7959, "step": 883 }, { "epoch": 0.0421901659686676, "grad_norm": 4.531114101409912, "learning_rate": 0.00019990883236878433, "loss": 5.7608, "step": 884 }, { "epoch": 0.04223789240075885, "grad_norm": 4.574626445770264, "learning_rate": 0.00019990861876898955, "loss": 6.4336, "step": 885 }, { "epoch": 0.042285618832850105, "grad_norm": 5.010381698608398, "learning_rate": 0.00019990840491937667, "loss": 5.5343, "step": 886 }, { "epoch": 0.042333345264941355, "grad_norm": 23.135805130004883, "learning_rate": 0.00019990819081994627, "loss": 6.8274, "step": 887 }, { "epoch": 0.04238107169703261, "grad_norm": 4.994808197021484, "learning_rate": 0.00019990797647069885, "loss": 6.755, "step": 888 }, { "epoch": 0.04242879812912386, "grad_norm": 4.2555131912231445, "learning_rate": 0.00019990776187163496, "loss": 6.2608, "step": 889 }, { "epoch": 0.04247652456121512, "grad_norm": 4.021069526672363, "learning_rate": 0.0001999075470227551, "loss": 6.5349, "step": 890 }, { "epoch": 0.04252425099330637, "grad_norm": 5.096982002258301, "learning_rate": 0.00019990733192405983, "loss": 6.2902, "step": 891 }, { "epoch": 0.042571977425397624, "grad_norm": 5.149520397186279, "learning_rate": 0.0001999071165755497, "loss": 5.9141, "step": 892 }, { "epoch": 0.04261970385748887, "grad_norm": 5.362805366516113, "learning_rate": 0.00019990690097722525, "loss": 6.2891, "step": 893 }, { "epoch": 0.04266743028958013, "grad_norm": 5.558497905731201, "learning_rate": 0.000199906685129087, "loss": 7.4599, "step": 894 }, { "epoch": 0.04271515672167138, "grad_norm": 7.813801288604736, "learning_rate": 0.00019990646903113546, "loss": 8.3155, "step": 895 }, { "epoch": 0.04276288315376263, "grad_norm": 5.941169738769531, "learning_rate": 0.00019990625268337126, "loss": 8.2724, "step": 896 }, { "epoch": 0.042810609585853886, "grad_norm": 5.919430732727051, "learning_rate": 0.00019990603608579485, "loss": 8.8818, "step": 897 }, { "epoch": 0.042858336017945135, "grad_norm": 6.019927501678467, "learning_rate": 0.00019990581923840683, "loss": 8.1603, "step": 898 }, { "epoch": 0.04290606245003639, "grad_norm": 4.297407627105713, "learning_rate": 0.0001999056021412077, "loss": 5.4535, "step": 899 }, { "epoch": 0.04295378888212764, "grad_norm": 5.540119171142578, "learning_rate": 0.00019990538479419808, "loss": 6.8169, "step": 900 }, { "epoch": 0.0430015153142189, "grad_norm": 7.282078742980957, "learning_rate": 0.0001999051671973784, "loss": 10.3661, "step": 901 }, { "epoch": 0.04304924174631015, "grad_norm": 4.648987770080566, "learning_rate": 0.0001999049493507493, "loss": 6.2884, "step": 902 }, { "epoch": 0.043096968178401404, "grad_norm": 4.584688663482666, "learning_rate": 0.00019990473125431124, "loss": 6.1958, "step": 903 }, { "epoch": 0.043144694610492654, "grad_norm": 6.671578884124756, "learning_rate": 0.00019990451290806486, "loss": 8.1228, "step": 904 }, { "epoch": 0.04319242104258391, "grad_norm": 5.0437211990356445, "learning_rate": 0.00019990429431201062, "loss": 6.8539, "step": 905 }, { "epoch": 0.04324014747467516, "grad_norm": 5.622092247009277, "learning_rate": 0.00019990407546614916, "loss": 6.2136, "step": 906 }, { "epoch": 0.043287873906766416, "grad_norm": 4.784860134124756, "learning_rate": 0.00019990385637048093, "loss": 5.7546, "step": 907 }, { "epoch": 0.043335600338857666, "grad_norm": 5.893241882324219, "learning_rate": 0.00019990363702500653, "loss": 7.4963, "step": 908 }, { "epoch": 0.04338332677094892, "grad_norm": 5.539127826690674, "learning_rate": 0.0001999034174297265, "loss": 5.9528, "step": 909 }, { "epoch": 0.04343105320304017, "grad_norm": 6.872004508972168, "learning_rate": 0.00019990319758464138, "loss": 8.8821, "step": 910 }, { "epoch": 0.04347877963513143, "grad_norm": 5.204216480255127, "learning_rate": 0.00019990297748975172, "loss": 7.1393, "step": 911 }, { "epoch": 0.04352650606722268, "grad_norm": 4.861927032470703, "learning_rate": 0.00019990275714505811, "loss": 7.1279, "step": 912 }, { "epoch": 0.043574232499313935, "grad_norm": 5.202857494354248, "learning_rate": 0.00019990253655056105, "loss": 6.6672, "step": 913 }, { "epoch": 0.043621958931405184, "grad_norm": 5.327061653137207, "learning_rate": 0.00019990231570626109, "loss": 7.5628, "step": 914 }, { "epoch": 0.04366968536349644, "grad_norm": 4.384208679199219, "learning_rate": 0.00019990209461215883, "loss": 5.9165, "step": 915 }, { "epoch": 0.04371741179558769, "grad_norm": 5.1692728996276855, "learning_rate": 0.00019990187326825477, "loss": 5.2629, "step": 916 }, { "epoch": 0.04376513822767895, "grad_norm": 4.899837493896484, "learning_rate": 0.0001999016516745495, "loss": 6.8963, "step": 917 }, { "epoch": 0.0438128646597702, "grad_norm": 5.127400875091553, "learning_rate": 0.00019990142983104358, "loss": 7.155, "step": 918 }, { "epoch": 0.04386059109186145, "grad_norm": 5.6775031089782715, "learning_rate": 0.00019990120773773752, "loss": 5.7734, "step": 919 }, { "epoch": 0.0439083175239527, "grad_norm": 5.566905975341797, "learning_rate": 0.00019990098539463195, "loss": 5.6039, "step": 920 }, { "epoch": 0.04395604395604396, "grad_norm": 4.917365074157715, "learning_rate": 0.00019990076280172733, "loss": 6.3656, "step": 921 }, { "epoch": 0.04400377038813521, "grad_norm": 5.078327655792236, "learning_rate": 0.0001999005399590243, "loss": 5.5302, "step": 922 }, { "epoch": 0.04405149682022646, "grad_norm": 6.031999588012695, "learning_rate": 0.00019990031686652334, "loss": 8.0072, "step": 923 }, { "epoch": 0.044099223252317715, "grad_norm": 5.797032833099365, "learning_rate": 0.0001999000935242251, "loss": 7.0327, "step": 924 }, { "epoch": 0.044146949684408965, "grad_norm": 4.609327793121338, "learning_rate": 0.00019989986993213007, "loss": 7.5469, "step": 925 }, { "epoch": 0.04419467611650022, "grad_norm": 4.5047478675842285, "learning_rate": 0.00019989964609023884, "loss": 4.9482, "step": 926 }, { "epoch": 0.04424240254859147, "grad_norm": 5.294336318969727, "learning_rate": 0.00019989942199855193, "loss": 5.931, "step": 927 }, { "epoch": 0.04429012898068273, "grad_norm": 4.31606388092041, "learning_rate": 0.00019989919765706993, "loss": 5.3772, "step": 928 }, { "epoch": 0.04433785541277398, "grad_norm": 5.5156354904174805, "learning_rate": 0.00019989897306579342, "loss": 9.1919, "step": 929 }, { "epoch": 0.044385581844865234, "grad_norm": 4.627865791320801, "learning_rate": 0.00019989874822472293, "loss": 6.6124, "step": 930 }, { "epoch": 0.04443330827695648, "grad_norm": 6.759988307952881, "learning_rate": 0.00019989852313385904, "loss": 7.2521, "step": 931 }, { "epoch": 0.04448103470904774, "grad_norm": 3.7612102031707764, "learning_rate": 0.0001998982977932023, "loss": 3.9385, "step": 932 }, { "epoch": 0.04452876114113899, "grad_norm": 6.557289123535156, "learning_rate": 0.00019989807220275325, "loss": 7.3376, "step": 933 }, { "epoch": 0.044576487573230246, "grad_norm": 4.596192836761475, "learning_rate": 0.00019989784636251253, "loss": 6.0196, "step": 934 }, { "epoch": 0.044624214005321496, "grad_norm": 7.797136306762695, "learning_rate": 0.00019989762027248064, "loss": 6.6297, "step": 935 }, { "epoch": 0.04467194043741275, "grad_norm": 4.064505100250244, "learning_rate": 0.00019989739393265813, "loss": 5.1576, "step": 936 }, { "epoch": 0.044719666869504, "grad_norm": 7.547177791595459, "learning_rate": 0.00019989716734304563, "loss": 6.4417, "step": 937 }, { "epoch": 0.04476739330159526, "grad_norm": 4.869728088378906, "learning_rate": 0.0001998969405036437, "loss": 6.3855, "step": 938 }, { "epoch": 0.04481511973368651, "grad_norm": 5.0461955070495605, "learning_rate": 0.00019989671341445286, "loss": 5.2461, "step": 939 }, { "epoch": 0.044862846165777764, "grad_norm": 4.460671901702881, "learning_rate": 0.00019989648607547368, "loss": 6.5, "step": 940 }, { "epoch": 0.044910572597869014, "grad_norm": 4.058277130126953, "learning_rate": 0.00019989625848670678, "loss": 6.2384, "step": 941 }, { "epoch": 0.04495829902996027, "grad_norm": 4.937887191772461, "learning_rate": 0.0001998960306481527, "loss": 6.4741, "step": 942 }, { "epoch": 0.04500602546205152, "grad_norm": 6.590853214263916, "learning_rate": 0.000199895802559812, "loss": 7.4206, "step": 943 }, { "epoch": 0.04505375189414278, "grad_norm": 5.189913749694824, "learning_rate": 0.00019989557422168527, "loss": 6.6971, "step": 944 }, { "epoch": 0.045101478326234026, "grad_norm": 4.0167036056518555, "learning_rate": 0.00019989534563377302, "loss": 5.0494, "step": 945 }, { "epoch": 0.04514920475832528, "grad_norm": 4.717167854309082, "learning_rate": 0.0001998951167960759, "loss": 6.0042, "step": 946 }, { "epoch": 0.04519693119041653, "grad_norm": 4.401949882507324, "learning_rate": 0.00019989488770859448, "loss": 6.4458, "step": 947 }, { "epoch": 0.04524465762250778, "grad_norm": 5.108852863311768, "learning_rate": 0.0001998946583713293, "loss": 6.7377, "step": 948 }, { "epoch": 0.04529238405459904, "grad_norm": 4.5663743019104, "learning_rate": 0.00019989442878428092, "loss": 7.5957, "step": 949 }, { "epoch": 0.04534011048669029, "grad_norm": 4.989386081695557, "learning_rate": 0.00019989419894744993, "loss": 5.3255, "step": 950 }, { "epoch": 0.045387836918781545, "grad_norm": 6.568408012390137, "learning_rate": 0.00019989396886083691, "loss": 6.8902, "step": 951 }, { "epoch": 0.045435563350872794, "grad_norm": 5.707376956939697, "learning_rate": 0.00019989373852444245, "loss": 6.609, "step": 952 }, { "epoch": 0.04548328978296405, "grad_norm": 5.835626125335693, "learning_rate": 0.00019989350793826708, "loss": 7.1086, "step": 953 }, { "epoch": 0.0455310162150553, "grad_norm": 4.273617744445801, "learning_rate": 0.00019989327710231143, "loss": 6.5601, "step": 954 }, { "epoch": 0.04557874264714656, "grad_norm": 6.64673376083374, "learning_rate": 0.00019989304601657605, "loss": 7.3089, "step": 955 }, { "epoch": 0.04562646907923781, "grad_norm": 5.273048400878906, "learning_rate": 0.0001998928146810615, "loss": 6.6003, "step": 956 }, { "epoch": 0.04567419551132906, "grad_norm": 4.514017105102539, "learning_rate": 0.00019989258309576844, "loss": 6.05, "step": 957 }, { "epoch": 0.04572192194342031, "grad_norm": 5.334542751312256, "learning_rate": 0.00019989235126069733, "loss": 6.179, "step": 958 }, { "epoch": 0.04576964837551157, "grad_norm": 4.8062639236450195, "learning_rate": 0.0001998921191758488, "loss": 6.4569, "step": 959 }, { "epoch": 0.04581737480760282, "grad_norm": 5.592841148376465, "learning_rate": 0.00019989188684122348, "loss": 5.5724, "step": 960 }, { "epoch": 0.045865101239694075, "grad_norm": 3.947697639465332, "learning_rate": 0.0001998916542568219, "loss": 4.4471, "step": 961 }, { "epoch": 0.045912827671785325, "grad_norm": 4.99360466003418, "learning_rate": 0.00019989142142264467, "loss": 6.0034, "step": 962 }, { "epoch": 0.04596055410387658, "grad_norm": 4.887674808502197, "learning_rate": 0.0001998911883386923, "loss": 5.6171, "step": 963 }, { "epoch": 0.04600828053596783, "grad_norm": 4.912217140197754, "learning_rate": 0.00019989095500496547, "loss": 6.0814, "step": 964 }, { "epoch": 0.04605600696805909, "grad_norm": 4.825184345245361, "learning_rate": 0.0001998907214214647, "loss": 6.7967, "step": 965 }, { "epoch": 0.04610373340015034, "grad_norm": 6.812923431396484, "learning_rate": 0.00019989048758819057, "loss": 8.6487, "step": 966 }, { "epoch": 0.046151459832241594, "grad_norm": 5.65037202835083, "learning_rate": 0.00019989025350514376, "loss": 6.7521, "step": 967 }, { "epoch": 0.046199186264332844, "grad_norm": 6.486409664154053, "learning_rate": 0.0001998900191723247, "loss": 7.4413, "step": 968 }, { "epoch": 0.0462469126964241, "grad_norm": 4.401943683624268, "learning_rate": 0.0001998897845897341, "loss": 5.6543, "step": 969 }, { "epoch": 0.04629463912851535, "grad_norm": 5.280481815338135, "learning_rate": 0.00019988954975737252, "loss": 7.0099, "step": 970 }, { "epoch": 0.046342365560606606, "grad_norm": 13.325774192810059, "learning_rate": 0.00019988931467524053, "loss": 6.0848, "step": 971 }, { "epoch": 0.046390091992697856, "grad_norm": 4.264022350311279, "learning_rate": 0.0001998890793433387, "loss": 5.356, "step": 972 }, { "epoch": 0.046437818424789105, "grad_norm": 6.167986869812012, "learning_rate": 0.00019988884376166764, "loss": 7.761, "step": 973 }, { "epoch": 0.04648554485688036, "grad_norm": 6.079741477966309, "learning_rate": 0.00019988860793022798, "loss": 7.7704, "step": 974 }, { "epoch": 0.04653327128897161, "grad_norm": 8.012140274047852, "learning_rate": 0.00019988837184902022, "loss": 7.1931, "step": 975 }, { "epoch": 0.04658099772106287, "grad_norm": 6.5676984786987305, "learning_rate": 0.00019988813551804498, "loss": 9.78, "step": 976 }, { "epoch": 0.04662872415315412, "grad_norm": 6.935944557189941, "learning_rate": 0.00019988789893730292, "loss": 8.1894, "step": 977 }, { "epoch": 0.046676450585245374, "grad_norm": 7.147305488586426, "learning_rate": 0.00019988766210679457, "loss": 8.4201, "step": 978 }, { "epoch": 0.046724177017336624, "grad_norm": 5.459933280944824, "learning_rate": 0.00019988742502652054, "loss": 6.5252, "step": 979 }, { "epoch": 0.04677190344942788, "grad_norm": 5.302978992462158, "learning_rate": 0.0001998871876964814, "loss": 7.2742, "step": 980 }, { "epoch": 0.04681962988151913, "grad_norm": 6.157651424407959, "learning_rate": 0.00019988695011667775, "loss": 6.1179, "step": 981 }, { "epoch": 0.04686735631361039, "grad_norm": 6.638370037078857, "learning_rate": 0.0001998867122871102, "loss": 6.6726, "step": 982 }, { "epoch": 0.046915082745701636, "grad_norm": 5.652102947235107, "learning_rate": 0.00019988647420777938, "loss": 6.8927, "step": 983 }, { "epoch": 0.04696280917779289, "grad_norm": 9.53808307647705, "learning_rate": 0.0001998862358786858, "loss": 6.3198, "step": 984 }, { "epoch": 0.04701053560988414, "grad_norm": 4.738818645477295, "learning_rate": 0.0001998859972998301, "loss": 5.0822, "step": 985 }, { "epoch": 0.0470582620419754, "grad_norm": 7.873166084289551, "learning_rate": 0.00019988575847121288, "loss": 9.3843, "step": 986 }, { "epoch": 0.04710598847406665, "grad_norm": 5.429816246032715, "learning_rate": 0.00019988551939283475, "loss": 6.1037, "step": 987 }, { "epoch": 0.047153714906157905, "grad_norm": 4.602708339691162, "learning_rate": 0.0001998852800646963, "loss": 6.6626, "step": 988 }, { "epoch": 0.047201441338249155, "grad_norm": 4.438327789306641, "learning_rate": 0.0001998850404867981, "loss": 6.1862, "step": 989 }, { "epoch": 0.04724916777034041, "grad_norm": 5.587163925170898, "learning_rate": 0.00019988480065914077, "loss": 6.062, "step": 990 }, { "epoch": 0.04729689420243166, "grad_norm": 6.313799858093262, "learning_rate": 0.0001998845605817249, "loss": 6.9393, "step": 991 }, { "epoch": 0.04734462063452292, "grad_norm": 6.336730480194092, "learning_rate": 0.0001998843202545511, "loss": 5.6876, "step": 992 }, { "epoch": 0.04739234706661417, "grad_norm": 4.389042854309082, "learning_rate": 0.00019988407967761997, "loss": 6.2301, "step": 993 }, { "epoch": 0.04744007349870542, "grad_norm": 4.943663120269775, "learning_rate": 0.00019988383885093211, "loss": 7.6153, "step": 994 }, { "epoch": 0.04748779993079667, "grad_norm": 4.959845542907715, "learning_rate": 0.00019988359777448812, "loss": 6.4427, "step": 995 }, { "epoch": 0.04753552636288793, "grad_norm": 4.92530632019043, "learning_rate": 0.00019988335644828862, "loss": 5.6377, "step": 996 }, { "epoch": 0.04758325279497918, "grad_norm": 4.722359657287598, "learning_rate": 0.00019988311487233423, "loss": 5.3645, "step": 997 }, { "epoch": 0.047630979227070436, "grad_norm": 5.177328109741211, "learning_rate": 0.00019988287304662547, "loss": 6.5359, "step": 998 }, { "epoch": 0.047678705659161685, "grad_norm": 5.293278217315674, "learning_rate": 0.000199882630971163, "loss": 5.9198, "step": 999 }, { "epoch": 0.047726432091252935, "grad_norm": 6.693418502807617, "learning_rate": 0.00019988238864594745, "loss": 7.2242, "step": 1000 }, { "epoch": 0.047726432091252935, "eval_loss": 1.6225401163101196, "eval_runtime": 96.5212, "eval_samples_per_second": 8.734, "eval_steps_per_second": 4.372, "step": 1000 }, { "epoch": 0.04777415852334419, "grad_norm": 3.982450008392334, "learning_rate": 0.0001998821460709794, "loss": 4.7282, "step": 1001 }, { "epoch": 0.04782188495543544, "grad_norm": 4.383751392364502, "learning_rate": 0.00019988190324625945, "loss": 5.3222, "step": 1002 }, { "epoch": 0.0478696113875267, "grad_norm": 3.7811129093170166, "learning_rate": 0.0001998816601717882, "loss": 4.7759, "step": 1003 }, { "epoch": 0.04791733781961795, "grad_norm": 4.680543422698975, "learning_rate": 0.00019988141684756626, "loss": 6.2151, "step": 1004 }, { "epoch": 0.047965064251709204, "grad_norm": 5.8284783363342285, "learning_rate": 0.0001998811732735943, "loss": 6.5275, "step": 1005 }, { "epoch": 0.04801279068380045, "grad_norm": 5.888545989990234, "learning_rate": 0.00019988092944987282, "loss": 5.9785, "step": 1006 }, { "epoch": 0.04806051711589171, "grad_norm": 4.542748928070068, "learning_rate": 0.00019988068537640254, "loss": 5.8873, "step": 1007 }, { "epoch": 0.04810824354798296, "grad_norm": 6.352034091949463, "learning_rate": 0.000199880441053184, "loss": 7.8587, "step": 1008 }, { "epoch": 0.048155969980074216, "grad_norm": 4.588389873504639, "learning_rate": 0.0001998801964802178, "loss": 6.3699, "step": 1009 }, { "epoch": 0.048203696412165466, "grad_norm": 5.194941520690918, "learning_rate": 0.00019987995165750462, "loss": 6.2729, "step": 1010 }, { "epoch": 0.04825142284425672, "grad_norm": 4.829345703125, "learning_rate": 0.00019987970658504503, "loss": 6.0763, "step": 1011 }, { "epoch": 0.04829914927634797, "grad_norm": 5.661465644836426, "learning_rate": 0.00019987946126283964, "loss": 7.2686, "step": 1012 }, { "epoch": 0.04834687570843923, "grad_norm": 4.653143882751465, "learning_rate": 0.00019987921569088903, "loss": 6.0923, "step": 1013 }, { "epoch": 0.04839460214053048, "grad_norm": 4.28420352935791, "learning_rate": 0.0001998789698691939, "loss": 5.5359, "step": 1014 }, { "epoch": 0.048442328572621735, "grad_norm": 6.598835468292236, "learning_rate": 0.00019987872379775482, "loss": 8.1344, "step": 1015 }, { "epoch": 0.048490055004712984, "grad_norm": 5.757112979888916, "learning_rate": 0.00019987847747657235, "loss": 6.1497, "step": 1016 }, { "epoch": 0.04853778143680424, "grad_norm": 5.738983154296875, "learning_rate": 0.0001998782309056472, "loss": 5.8063, "step": 1017 }, { "epoch": 0.04858550786889549, "grad_norm": 4.124687194824219, "learning_rate": 0.00019987798408497995, "loss": 4.9015, "step": 1018 }, { "epoch": 0.04863323430098675, "grad_norm": 6.530104160308838, "learning_rate": 0.0001998777370145712, "loss": 7.4121, "step": 1019 }, { "epoch": 0.048680960733077996, "grad_norm": 4.622954368591309, "learning_rate": 0.0001998774896944216, "loss": 5.9544, "step": 1020 }, { "epoch": 0.04872868716516925, "grad_norm": 4.708102703094482, "learning_rate": 0.0001998772421245317, "loss": 6.3914, "step": 1021 }, { "epoch": 0.0487764135972605, "grad_norm": 4.251955032348633, "learning_rate": 0.00019987699430490223, "loss": 6.951, "step": 1022 }, { "epoch": 0.04882414002935176, "grad_norm": 4.7548604011535645, "learning_rate": 0.00019987674623553372, "loss": 5.3874, "step": 1023 }, { "epoch": 0.04887186646144301, "grad_norm": 6.010146141052246, "learning_rate": 0.00019987649791642682, "loss": 7.6085, "step": 1024 }, { "epoch": 0.04891959289353426, "grad_norm": 4.291179656982422, "learning_rate": 0.00019987624934758214, "loss": 6.2565, "step": 1025 }, { "epoch": 0.048967319325625515, "grad_norm": 4.319268703460693, "learning_rate": 0.00019987600052900033, "loss": 5.2744, "step": 1026 }, { "epoch": 0.049015045757716764, "grad_norm": 4.772900581359863, "learning_rate": 0.00019987575146068198, "loss": 5.7683, "step": 1027 }, { "epoch": 0.04906277218980802, "grad_norm": 4.654770374298096, "learning_rate": 0.00019987550214262774, "loss": 4.5878, "step": 1028 }, { "epoch": 0.04911049862189927, "grad_norm": 5.684296607971191, "learning_rate": 0.00019987525257483818, "loss": 6.8889, "step": 1029 }, { "epoch": 0.04915822505399053, "grad_norm": 5.211754322052002, "learning_rate": 0.000199875002757314, "loss": 6.9047, "step": 1030 }, { "epoch": 0.04920595148608178, "grad_norm": 4.064318656921387, "learning_rate": 0.00019987475269005577, "loss": 6.1023, "step": 1031 }, { "epoch": 0.04925367791817303, "grad_norm": 3.863576889038086, "learning_rate": 0.00019987450237306416, "loss": 5.1943, "step": 1032 }, { "epoch": 0.04930140435026428, "grad_norm": 11.64242172241211, "learning_rate": 0.00019987425180633973, "loss": 6.6168, "step": 1033 }, { "epoch": 0.04934913078235554, "grad_norm": 4.990972518920898, "learning_rate": 0.00019987400098988315, "loss": 5.7149, "step": 1034 }, { "epoch": 0.04939685721444679, "grad_norm": 5.527631759643555, "learning_rate": 0.00019987374992369507, "loss": 7.013, "step": 1035 }, { "epoch": 0.049444583646538046, "grad_norm": 4.625354290008545, "learning_rate": 0.00019987349860777607, "loss": 6.0626, "step": 1036 }, { "epoch": 0.049492310078629295, "grad_norm": 3.995800495147705, "learning_rate": 0.00019987324704212682, "loss": 4.5538, "step": 1037 }, { "epoch": 0.04954003651072055, "grad_norm": 5.268013954162598, "learning_rate": 0.0001998729952267479, "loss": 6.4556, "step": 1038 }, { "epoch": 0.0495877629428118, "grad_norm": 6.213385105133057, "learning_rate": 0.00019987274316163997, "loss": 7.9345, "step": 1039 }, { "epoch": 0.04963548937490306, "grad_norm": 4.998571872711182, "learning_rate": 0.00019987249084680369, "loss": 6.3542, "step": 1040 }, { "epoch": 0.04968321580699431, "grad_norm": 5.2931809425354, "learning_rate": 0.00019987223828223962, "loss": 6.7211, "step": 1041 }, { "epoch": 0.049730942239085564, "grad_norm": 5.426149845123291, "learning_rate": 0.00019987198546794844, "loss": 6.8105, "step": 1042 }, { "epoch": 0.049778668671176814, "grad_norm": 5.814045429229736, "learning_rate": 0.0001998717324039308, "loss": 6.9914, "step": 1043 }, { "epoch": 0.04982639510326807, "grad_norm": 6.0917134284973145, "learning_rate": 0.00019987147909018727, "loss": 7.0058, "step": 1044 }, { "epoch": 0.04987412153535932, "grad_norm": 4.31552791595459, "learning_rate": 0.00019987122552671854, "loss": 4.9764, "step": 1045 }, { "epoch": 0.049921847967450576, "grad_norm": 5.409708499908447, "learning_rate": 0.0001998709717135252, "loss": 6.4111, "step": 1046 }, { "epoch": 0.049969574399541826, "grad_norm": 5.168500900268555, "learning_rate": 0.0001998707176506079, "loss": 7.1758, "step": 1047 }, { "epoch": 0.05001730083163308, "grad_norm": 6.697951793670654, "learning_rate": 0.00019987046333796733, "loss": 9.1732, "step": 1048 }, { "epoch": 0.05006502726372433, "grad_norm": 4.68121862411499, "learning_rate": 0.00019987020877560406, "loss": 5.0227, "step": 1049 }, { "epoch": 0.05011275369581558, "grad_norm": 6.713630676269531, "learning_rate": 0.00019986995396351873, "loss": 5.6485, "step": 1050 }, { "epoch": 0.05016048012790684, "grad_norm": 5.136265277862549, "learning_rate": 0.000199869698901712, "loss": 6.246, "step": 1051 }, { "epoch": 0.05020820655999809, "grad_norm": 4.445631980895996, "learning_rate": 0.0001998694435901845, "loss": 6.1888, "step": 1052 }, { "epoch": 0.050255932992089344, "grad_norm": 5.714282512664795, "learning_rate": 0.00019986918802893686, "loss": 8.6711, "step": 1053 }, { "epoch": 0.050303659424180594, "grad_norm": 6.590182781219482, "learning_rate": 0.00019986893221796975, "loss": 7.7472, "step": 1054 }, { "epoch": 0.05035138585627185, "grad_norm": 4.867012023925781, "learning_rate": 0.00019986867615728375, "loss": 6.1684, "step": 1055 }, { "epoch": 0.0503991122883631, "grad_norm": 5.920507907867432, "learning_rate": 0.00019986841984687956, "loss": 5.7659, "step": 1056 }, { "epoch": 0.05044683872045436, "grad_norm": 5.206281661987305, "learning_rate": 0.0001998681632867578, "loss": 6.1972, "step": 1057 }, { "epoch": 0.050494565152545606, "grad_norm": 7.144778251647949, "learning_rate": 0.0001998679064769191, "loss": 7.4238, "step": 1058 }, { "epoch": 0.05054229158463686, "grad_norm": 6.907451152801514, "learning_rate": 0.00019986764941736413, "loss": 6.7202, "step": 1059 }, { "epoch": 0.05059001801672811, "grad_norm": 4.997003078460693, "learning_rate": 0.00019986739210809347, "loss": 6.538, "step": 1060 }, { "epoch": 0.05063774444881937, "grad_norm": 4.924596786499023, "learning_rate": 0.00019986713454910786, "loss": 6.1636, "step": 1061 }, { "epoch": 0.05068547088091062, "grad_norm": 4.833995819091797, "learning_rate": 0.00019986687674040788, "loss": 5.5879, "step": 1062 }, { "epoch": 0.050733197313001875, "grad_norm": 6.053867816925049, "learning_rate": 0.00019986661868199416, "loss": 7.3043, "step": 1063 }, { "epoch": 0.050780923745093125, "grad_norm": 5.307857990264893, "learning_rate": 0.0001998663603738674, "loss": 6.9255, "step": 1064 }, { "epoch": 0.05082865017718438, "grad_norm": 5.850740432739258, "learning_rate": 0.0001998661018160282, "loss": 7.1374, "step": 1065 }, { "epoch": 0.05087637660927563, "grad_norm": 4.8014326095581055, "learning_rate": 0.00019986584300847723, "loss": 6.1535, "step": 1066 }, { "epoch": 0.05092410304136689, "grad_norm": 6.172316074371338, "learning_rate": 0.00019986558395121513, "loss": 5.6689, "step": 1067 }, { "epoch": 0.05097182947345814, "grad_norm": 5.152455806732178, "learning_rate": 0.00019986532464424256, "loss": 5.4864, "step": 1068 }, { "epoch": 0.051019555905549394, "grad_norm": 3.440164089202881, "learning_rate": 0.00019986506508756015, "loss": 4.5576, "step": 1069 }, { "epoch": 0.05106728233764064, "grad_norm": 5.030889511108398, "learning_rate": 0.00019986480528116855, "loss": 6.2356, "step": 1070 }, { "epoch": 0.0511150087697319, "grad_norm": 4.914235591888428, "learning_rate": 0.00019986454522506845, "loss": 6.1619, "step": 1071 }, { "epoch": 0.05116273520182315, "grad_norm": 5.566851615905762, "learning_rate": 0.00019986428491926042, "loss": 7.0477, "step": 1072 }, { "epoch": 0.051210461633914406, "grad_norm": 4.793924331665039, "learning_rate": 0.00019986402436374516, "loss": 5.3626, "step": 1073 }, { "epoch": 0.051258188066005655, "grad_norm": 6.499633312225342, "learning_rate": 0.00019986376355852334, "loss": 5.851, "step": 1074 }, { "epoch": 0.051305914498096905, "grad_norm": 7.014519691467285, "learning_rate": 0.00019986350250359559, "loss": 8.3398, "step": 1075 }, { "epoch": 0.05135364093018816, "grad_norm": 7.875018119812012, "learning_rate": 0.00019986324119896254, "loss": 8.6885, "step": 1076 }, { "epoch": 0.05140136736227941, "grad_norm": 7.861631393432617, "learning_rate": 0.0001998629796446249, "loss": 6.8349, "step": 1077 }, { "epoch": 0.05144909379437067, "grad_norm": 5.950527667999268, "learning_rate": 0.00019986271784058329, "loss": 6.3815, "step": 1078 }, { "epoch": 0.05149682022646192, "grad_norm": 5.826357364654541, "learning_rate": 0.00019986245578683834, "loss": 7.1728, "step": 1079 }, { "epoch": 0.051544546658553174, "grad_norm": 6.646771430969238, "learning_rate": 0.00019986219348339072, "loss": 5.9697, "step": 1080 }, { "epoch": 0.051592273090644424, "grad_norm": 4.821702003479004, "learning_rate": 0.00019986193093024111, "loss": 6.7591, "step": 1081 }, { "epoch": 0.05163999952273568, "grad_norm": 5.375950336456299, "learning_rate": 0.00019986166812739018, "loss": 5.3714, "step": 1082 }, { "epoch": 0.05168772595482693, "grad_norm": 5.14496374130249, "learning_rate": 0.00019986140507483854, "loss": 6.2763, "step": 1083 }, { "epoch": 0.051735452386918186, "grad_norm": 6.648619174957275, "learning_rate": 0.00019986114177258685, "loss": 7.1802, "step": 1084 }, { "epoch": 0.051783178819009436, "grad_norm": 4.715754508972168, "learning_rate": 0.00019986087822063584, "loss": 5.4824, "step": 1085 }, { "epoch": 0.05183090525110069, "grad_norm": 5.885828495025635, "learning_rate": 0.00019986061441898607, "loss": 6.5953, "step": 1086 }, { "epoch": 0.05187863168319194, "grad_norm": 4.4462738037109375, "learning_rate": 0.00019986035036763825, "loss": 6.1619, "step": 1087 }, { "epoch": 0.0519263581152832, "grad_norm": 5.530253887176514, "learning_rate": 0.00019986008606659306, "loss": 4.9084, "step": 1088 }, { "epoch": 0.05197408454737445, "grad_norm": 5.773584365844727, "learning_rate": 0.0001998598215158511, "loss": 7.1593, "step": 1089 }, { "epoch": 0.052021810979465705, "grad_norm": 6.453237056732178, "learning_rate": 0.0001998595567154131, "loss": 7.2424, "step": 1090 }, { "epoch": 0.052069537411556954, "grad_norm": 4.998067855834961, "learning_rate": 0.00019985929166527967, "loss": 5.9513, "step": 1091 }, { "epoch": 0.05211726384364821, "grad_norm": 5.26843786239624, "learning_rate": 0.0001998590263654515, "loss": 6.1763, "step": 1092 }, { "epoch": 0.05216499027573946, "grad_norm": 5.641318321228027, "learning_rate": 0.00019985876081592924, "loss": 7.0811, "step": 1093 }, { "epoch": 0.05221271670783072, "grad_norm": 5.9788055419921875, "learning_rate": 0.0001998584950167136, "loss": 6.0497, "step": 1094 }, { "epoch": 0.05226044313992197, "grad_norm": 5.755883693695068, "learning_rate": 0.00019985822896780516, "loss": 6.8116, "step": 1095 }, { "epoch": 0.05230816957201322, "grad_norm": 5.647629261016846, "learning_rate": 0.0001998579626692046, "loss": 5.991, "step": 1096 }, { "epoch": 0.05235589600410447, "grad_norm": 4.610182762145996, "learning_rate": 0.00019985769612091268, "loss": 6.1028, "step": 1097 }, { "epoch": 0.05240362243619573, "grad_norm": 4.9788498878479, "learning_rate": 0.00019985742932292997, "loss": 6.8862, "step": 1098 }, { "epoch": 0.05245134886828698, "grad_norm": 7.177347183227539, "learning_rate": 0.0001998571622752572, "loss": 7.3631, "step": 1099 }, { "epoch": 0.052499075300378235, "grad_norm": 4.670802593231201, "learning_rate": 0.00019985689497789498, "loss": 5.1865, "step": 1100 }, { "epoch": 0.052546801732469485, "grad_norm": 9.47829532623291, "learning_rate": 0.00019985662743084398, "loss": 7.9464, "step": 1101 }, { "epoch": 0.052594528164560735, "grad_norm": 7.647347450256348, "learning_rate": 0.00019985635963410493, "loss": 9.2026, "step": 1102 }, { "epoch": 0.05264225459665199, "grad_norm": 5.180408954620361, "learning_rate": 0.00019985609158767846, "loss": 6.4986, "step": 1103 }, { "epoch": 0.05268998102874324, "grad_norm": 5.6756911277771, "learning_rate": 0.00019985582329156527, "loss": 6.0234, "step": 1104 }, { "epoch": 0.0527377074608345, "grad_norm": 4.4249348640441895, "learning_rate": 0.00019985555474576598, "loss": 6.6278, "step": 1105 }, { "epoch": 0.05278543389292575, "grad_norm": 6.0559492111206055, "learning_rate": 0.00019985528595028128, "loss": 6.9083, "step": 1106 }, { "epoch": 0.052833160325017, "grad_norm": 4.614184379577637, "learning_rate": 0.00019985501690511184, "loss": 6.6513, "step": 1107 }, { "epoch": 0.05288088675710825, "grad_norm": 6.088013172149658, "learning_rate": 0.00019985474761025836, "loss": 7.4275, "step": 1108 }, { "epoch": 0.05292861318919951, "grad_norm": 4.85730504989624, "learning_rate": 0.0001998544780657215, "loss": 5.7579, "step": 1109 }, { "epoch": 0.05297633962129076, "grad_norm": 4.3683085441589355, "learning_rate": 0.0001998542082715019, "loss": 4.6668, "step": 1110 }, { "epoch": 0.053024066053382016, "grad_norm": 7.553590297698975, "learning_rate": 0.0001998539382276003, "loss": 7.65, "step": 1111 }, { "epoch": 0.053071792485473265, "grad_norm": 4.7064714431762695, "learning_rate": 0.0001998536679340173, "loss": 4.9334, "step": 1112 }, { "epoch": 0.05311951891756452, "grad_norm": 5.9359660148620605, "learning_rate": 0.00019985339739075366, "loss": 7.1572, "step": 1113 }, { "epoch": 0.05316724534965577, "grad_norm": 7.483563423156738, "learning_rate": 0.00019985312659780998, "loss": 8.4957, "step": 1114 }, { "epoch": 0.05321497178174703, "grad_norm": 4.971370220184326, "learning_rate": 0.00019985285555518698, "loss": 6.7571, "step": 1115 }, { "epoch": 0.05326269821383828, "grad_norm": 5.1564741134643555, "learning_rate": 0.00019985258426288532, "loss": 6.1666, "step": 1116 }, { "epoch": 0.053310424645929534, "grad_norm": 6.064185619354248, "learning_rate": 0.00019985231272090569, "loss": 7.203, "step": 1117 }, { "epoch": 0.053358151078020784, "grad_norm": 5.504134178161621, "learning_rate": 0.00019985204092924877, "loss": 7.003, "step": 1118 }, { "epoch": 0.05340587751011204, "grad_norm": 4.722824573516846, "learning_rate": 0.00019985176888791522, "loss": 6.7303, "step": 1119 }, { "epoch": 0.05345360394220329, "grad_norm": 5.977590560913086, "learning_rate": 0.0001998514965969057, "loss": 6.732, "step": 1120 }, { "epoch": 0.053501330374294546, "grad_norm": 5.4381937980651855, "learning_rate": 0.00019985122405622097, "loss": 6.71, "step": 1121 }, { "epoch": 0.053549056806385796, "grad_norm": 6.722708225250244, "learning_rate": 0.00019985095126586166, "loss": 7.0432, "step": 1122 }, { "epoch": 0.05359678323847705, "grad_norm": 4.659569263458252, "learning_rate": 0.00019985067822582845, "loss": 4.4901, "step": 1123 }, { "epoch": 0.0536445096705683, "grad_norm": 4.766648292541504, "learning_rate": 0.00019985040493612205, "loss": 7.1615, "step": 1124 }, { "epoch": 0.05369223610265956, "grad_norm": 6.026020050048828, "learning_rate": 0.0001998501313967431, "loss": 6.1998, "step": 1125 }, { "epoch": 0.05373996253475081, "grad_norm": 5.1706647872924805, "learning_rate": 0.0001998498576076923, "loss": 5.0663, "step": 1126 }, { "epoch": 0.05378768896684206, "grad_norm": 6.402740001678467, "learning_rate": 0.00019984958356897035, "loss": 6.8489, "step": 1127 }, { "epoch": 0.053835415398933315, "grad_norm": 5.805992126464844, "learning_rate": 0.00019984930928057794, "loss": 6.5616, "step": 1128 }, { "epoch": 0.053883141831024564, "grad_norm": 4.082311630249023, "learning_rate": 0.00019984903474251574, "loss": 5.2591, "step": 1129 }, { "epoch": 0.05393086826311582, "grad_norm": 4.216063022613525, "learning_rate": 0.00019984875995478443, "loss": 4.9084, "step": 1130 }, { "epoch": 0.05397859469520707, "grad_norm": 4.6913065910339355, "learning_rate": 0.00019984848491738473, "loss": 6.2884, "step": 1131 }, { "epoch": 0.05402632112729833, "grad_norm": 6.930257320404053, "learning_rate": 0.00019984820963031728, "loss": 8.423, "step": 1132 }, { "epoch": 0.054074047559389576, "grad_norm": 4.628183841705322, "learning_rate": 0.0001998479340935828, "loss": 4.594, "step": 1133 }, { "epoch": 0.05412177399148083, "grad_norm": 5.1324567794799805, "learning_rate": 0.00019984765830718197, "loss": 5.9156, "step": 1134 }, { "epoch": 0.05416950042357208, "grad_norm": 5.622409343719482, "learning_rate": 0.0001998473822711155, "loss": 6.8418, "step": 1135 }, { "epoch": 0.05421722685566334, "grad_norm": 5.3281378746032715, "learning_rate": 0.00019984710598538403, "loss": 6.8281, "step": 1136 }, { "epoch": 0.05426495328775459, "grad_norm": 5.083497524261475, "learning_rate": 0.0001998468294499883, "loss": 6.7846, "step": 1137 }, { "epoch": 0.054312679719845845, "grad_norm": 4.499049186706543, "learning_rate": 0.00019984655266492898, "loss": 5.9195, "step": 1138 }, { "epoch": 0.054360406151937095, "grad_norm": 5.126265048980713, "learning_rate": 0.00019984627563020678, "loss": 4.735, "step": 1139 }, { "epoch": 0.05440813258402835, "grad_norm": 6.623693466186523, "learning_rate": 0.00019984599834582233, "loss": 7.9586, "step": 1140 }, { "epoch": 0.0544558590161196, "grad_norm": 5.693083763122559, "learning_rate": 0.0001998457208117764, "loss": 6.8345, "step": 1141 }, { "epoch": 0.05450358544821086, "grad_norm": 7.059975624084473, "learning_rate": 0.00019984544302806972, "loss": 5.5992, "step": 1142 }, { "epoch": 0.05455131188030211, "grad_norm": 4.278986930847168, "learning_rate": 0.00019984516499470286, "loss": 4.7835, "step": 1143 }, { "epoch": 0.054599038312393364, "grad_norm": 6.068154811859131, "learning_rate": 0.00019984488671167657, "loss": 6.6768, "step": 1144 }, { "epoch": 0.05464676474448461, "grad_norm": 8.686722755432129, "learning_rate": 0.00019984460817899156, "loss": 7.1451, "step": 1145 }, { "epoch": 0.05469449117657587, "grad_norm": 5.142326831817627, "learning_rate": 0.0001998443293966485, "loss": 7.0095, "step": 1146 }, { "epoch": 0.05474221760866712, "grad_norm": 5.32951021194458, "learning_rate": 0.00019984405036464815, "loss": 7.2642, "step": 1147 }, { "epoch": 0.054789944040758376, "grad_norm": 4.901379585266113, "learning_rate": 0.00019984377108299114, "loss": 6.3059, "step": 1148 }, { "epoch": 0.054837670472849626, "grad_norm": 4.03926420211792, "learning_rate": 0.0001998434915516782, "loss": 5.6945, "step": 1149 }, { "epoch": 0.05488539690494088, "grad_norm": 3.957887887954712, "learning_rate": 0.00019984321177071, "loss": 4.7959, "step": 1150 }, { "epoch": 0.05493312333703213, "grad_norm": 3.7819643020629883, "learning_rate": 0.00019984293174008725, "loss": 4.8499, "step": 1151 }, { "epoch": 0.05498084976912338, "grad_norm": 8.007612228393555, "learning_rate": 0.00019984265145981073, "loss": 6.9894, "step": 1152 }, { "epoch": 0.05502857620121464, "grad_norm": 4.035107135772705, "learning_rate": 0.00019984237092988098, "loss": 4.3952, "step": 1153 }, { "epoch": 0.05507630263330589, "grad_norm": 4.920530319213867, "learning_rate": 0.00019984209015029886, "loss": 6.7073, "step": 1154 }, { "epoch": 0.055124029065397144, "grad_norm": 5.93609619140625, "learning_rate": 0.000199841809121065, "loss": 7.668, "step": 1155 }, { "epoch": 0.055171755497488394, "grad_norm": 5.07185173034668, "learning_rate": 0.00019984152784218004, "loss": 6.1871, "step": 1156 }, { "epoch": 0.05521948192957965, "grad_norm": 6.424498081207275, "learning_rate": 0.0001998412463136448, "loss": 5.6898, "step": 1157 }, { "epoch": 0.0552672083616709, "grad_norm": 5.224612236022949, "learning_rate": 0.00019984096453545995, "loss": 4.8568, "step": 1158 }, { "epoch": 0.055314934793762156, "grad_norm": 7.0444536209106445, "learning_rate": 0.00019984068250762616, "loss": 7.2326, "step": 1159 }, { "epoch": 0.055362661225853406, "grad_norm": 7.36407995223999, "learning_rate": 0.00019984040023014418, "loss": 6.8475, "step": 1160 }, { "epoch": 0.05541038765794466, "grad_norm": 12.85987663269043, "learning_rate": 0.00019984011770301466, "loss": 8.9236, "step": 1161 }, { "epoch": 0.05545811409003591, "grad_norm": 6.9210124015808105, "learning_rate": 0.00019983983492623833, "loss": 6.9435, "step": 1162 }, { "epoch": 0.05550584052212717, "grad_norm": 3.869915246963501, "learning_rate": 0.0001998395518998159, "loss": 4.6063, "step": 1163 }, { "epoch": 0.05555356695421842, "grad_norm": 5.981685161590576, "learning_rate": 0.0001998392686237481, "loss": 7.7304, "step": 1164 }, { "epoch": 0.055601293386309675, "grad_norm": 3.4333200454711914, "learning_rate": 0.0001998389850980356, "loss": 4.9103, "step": 1165 }, { "epoch": 0.055649019818400924, "grad_norm": 4.8307600021362305, "learning_rate": 0.00019983870132267914, "loss": 4.6898, "step": 1166 }, { "epoch": 0.05569674625049218, "grad_norm": 5.47720193862915, "learning_rate": 0.00019983841729767944, "loss": 7.3213, "step": 1167 }, { "epoch": 0.05574447268258343, "grad_norm": 5.845385551452637, "learning_rate": 0.00019983813302303717, "loss": 5.8094, "step": 1168 }, { "epoch": 0.05579219911467469, "grad_norm": 5.838316440582275, "learning_rate": 0.00019983784849875307, "loss": 6.9801, "step": 1169 }, { "epoch": 0.05583992554676594, "grad_norm": 6.361464023590088, "learning_rate": 0.00019983756372482782, "loss": 7.5671, "step": 1170 }, { "epoch": 0.05588765197885719, "grad_norm": 3.902689218521118, "learning_rate": 0.00019983727870126217, "loss": 6.4714, "step": 1171 }, { "epoch": 0.05593537841094844, "grad_norm": 4.72988748550415, "learning_rate": 0.0001998369934280568, "loss": 6.121, "step": 1172 }, { "epoch": 0.0559831048430397, "grad_norm": 4.2512335777282715, "learning_rate": 0.00019983670790521245, "loss": 4.8785, "step": 1173 }, { "epoch": 0.05603083127513095, "grad_norm": 5.300204277038574, "learning_rate": 0.00019983642213272982, "loss": 6.5145, "step": 1174 }, { "epoch": 0.056078557707222206, "grad_norm": 8.376568794250488, "learning_rate": 0.00019983613611060963, "loss": 6.9734, "step": 1175 }, { "epoch": 0.056126284139313455, "grad_norm": 4.279684543609619, "learning_rate": 0.00019983584983885257, "loss": 5.4829, "step": 1176 }, { "epoch": 0.05617401057140471, "grad_norm": 4.573862075805664, "learning_rate": 0.00019983556331745942, "loss": 6.2195, "step": 1177 }, { "epoch": 0.05622173700349596, "grad_norm": 8.839829444885254, "learning_rate": 0.00019983527654643082, "loss": 7.4778, "step": 1178 }, { "epoch": 0.05626946343558721, "grad_norm": 5.623775005340576, "learning_rate": 0.00019983498952576753, "loss": 6.9196, "step": 1179 }, { "epoch": 0.05631718986767847, "grad_norm": 6.764212608337402, "learning_rate": 0.00019983470225547027, "loss": 7.9646, "step": 1180 }, { "epoch": 0.05636491629976972, "grad_norm": 5.477482795715332, "learning_rate": 0.00019983441473553975, "loss": 7.1554, "step": 1181 }, { "epoch": 0.056412642731860974, "grad_norm": 5.284195899963379, "learning_rate": 0.00019983412696597666, "loss": 6.0003, "step": 1182 }, { "epoch": 0.05646036916395222, "grad_norm": 4.522648811340332, "learning_rate": 0.00019983383894678176, "loss": 6.7204, "step": 1183 }, { "epoch": 0.05650809559604348, "grad_norm": 5.639135837554932, "learning_rate": 0.00019983355067795577, "loss": 6.8315, "step": 1184 }, { "epoch": 0.05655582202813473, "grad_norm": 7.610204696655273, "learning_rate": 0.00019983326215949939, "loss": 7.7124, "step": 1185 }, { "epoch": 0.056603548460225986, "grad_norm": 5.576025009155273, "learning_rate": 0.00019983297339141336, "loss": 7.4557, "step": 1186 }, { "epoch": 0.056651274892317235, "grad_norm": 5.472518444061279, "learning_rate": 0.00019983268437369836, "loss": 7.1166, "step": 1187 }, { "epoch": 0.05669900132440849, "grad_norm": 3.9628682136535645, "learning_rate": 0.00019983239510635515, "loss": 4.9649, "step": 1188 }, { "epoch": 0.05674672775649974, "grad_norm": 4.020157337188721, "learning_rate": 0.00019983210558938445, "loss": 5.6404, "step": 1189 }, { "epoch": 0.056794454188591, "grad_norm": 5.640921115875244, "learning_rate": 0.00019983181582278702, "loss": 6.9914, "step": 1190 }, { "epoch": 0.05684218062068225, "grad_norm": 6.915165901184082, "learning_rate": 0.0001998315258065635, "loss": 7.7097, "step": 1191 }, { "epoch": 0.056889907052773504, "grad_norm": 5.602634906768799, "learning_rate": 0.00019983123554071468, "loss": 7.2969, "step": 1192 }, { "epoch": 0.056937633484864754, "grad_norm": 5.144901275634766, "learning_rate": 0.00019983094502524124, "loss": 6.5638, "step": 1193 }, { "epoch": 0.05698535991695601, "grad_norm": 4.76383638381958, "learning_rate": 0.00019983065426014394, "loss": 5.7529, "step": 1194 }, { "epoch": 0.05703308634904726, "grad_norm": 7.6515116691589355, "learning_rate": 0.00019983036324542353, "loss": 6.5345, "step": 1195 }, { "epoch": 0.05708081278113852, "grad_norm": 5.488112926483154, "learning_rate": 0.00019983007198108067, "loss": 6.4554, "step": 1196 }, { "epoch": 0.057128539213229766, "grad_norm": 5.051981449127197, "learning_rate": 0.00019982978046711615, "loss": 7.3895, "step": 1197 }, { "epoch": 0.05717626564532102, "grad_norm": 5.33380651473999, "learning_rate": 0.00019982948870353066, "loss": 6.2182, "step": 1198 }, { "epoch": 0.05722399207741227, "grad_norm": 6.080343723297119, "learning_rate": 0.00019982919669032497, "loss": 6.4017, "step": 1199 }, { "epoch": 0.05727171850950353, "grad_norm": 5.60158634185791, "learning_rate": 0.00019982890442749976, "loss": 6.4631, "step": 1200 }, { "epoch": 0.05727171850950353, "eval_loss": 1.628016710281372, "eval_runtime": 96.4699, "eval_samples_per_second": 8.738, "eval_steps_per_second": 4.374, "step": 1200 }, { "epoch": 0.05731944494159478, "grad_norm": 5.992093086242676, "learning_rate": 0.00019982861191505577, "loss": 6.8994, "step": 1201 }, { "epoch": 0.057367171373686035, "grad_norm": 5.991743564605713, "learning_rate": 0.00019982831915299379, "loss": 6.6767, "step": 1202 }, { "epoch": 0.057414897805777285, "grad_norm": 5.2606377601623535, "learning_rate": 0.0001998280261413145, "loss": 6.6699, "step": 1203 }, { "epoch": 0.057462624237868534, "grad_norm": 6.123988628387451, "learning_rate": 0.0001998277328800186, "loss": 7.6825, "step": 1204 }, { "epoch": 0.05751035066995979, "grad_norm": 4.713397026062012, "learning_rate": 0.00019982743936910688, "loss": 6.376, "step": 1205 }, { "epoch": 0.05755807710205104, "grad_norm": 4.936553955078125, "learning_rate": 0.00019982714560858007, "loss": 4.5528, "step": 1206 }, { "epoch": 0.0576058035341423, "grad_norm": 4.821127414703369, "learning_rate": 0.0001998268515984389, "loss": 5.7587, "step": 1207 }, { "epoch": 0.05765352996623355, "grad_norm": 4.191883087158203, "learning_rate": 0.00019982655733868408, "loss": 5.9239, "step": 1208 }, { "epoch": 0.0577012563983248, "grad_norm": 4.648068428039551, "learning_rate": 0.00019982626282931637, "loss": 5.6163, "step": 1209 }, { "epoch": 0.05774898283041605, "grad_norm": 4.575096607208252, "learning_rate": 0.0001998259680703365, "loss": 5.5589, "step": 1210 }, { "epoch": 0.05779670926250731, "grad_norm": 11.58159351348877, "learning_rate": 0.00019982567306174522, "loss": 7.6307, "step": 1211 }, { "epoch": 0.05784443569459856, "grad_norm": 5.328958511352539, "learning_rate": 0.00019982537780354324, "loss": 6.6109, "step": 1212 }, { "epoch": 0.057892162126689815, "grad_norm": 4.02738618850708, "learning_rate": 0.0001998250822957313, "loss": 4.1032, "step": 1213 }, { "epoch": 0.057939888558781065, "grad_norm": 5.261392116546631, "learning_rate": 0.00019982478653831018, "loss": 5.9951, "step": 1214 }, { "epoch": 0.05798761499087232, "grad_norm": 5.21045446395874, "learning_rate": 0.00019982449053128058, "loss": 6.1101, "step": 1215 }, { "epoch": 0.05803534142296357, "grad_norm": 3.9710166454315186, "learning_rate": 0.00019982419427464328, "loss": 4.5463, "step": 1216 }, { "epoch": 0.05808306785505483, "grad_norm": 5.389290809631348, "learning_rate": 0.00019982389776839897, "loss": 6.1384, "step": 1217 }, { "epoch": 0.05813079428714608, "grad_norm": 5.931976795196533, "learning_rate": 0.0001998236010125484, "loss": 6.8934, "step": 1218 }, { "epoch": 0.058178520719237334, "grad_norm": 6.911122798919678, "learning_rate": 0.00019982330400709233, "loss": 6.5987, "step": 1219 }, { "epoch": 0.05822624715132858, "grad_norm": 5.2879204750061035, "learning_rate": 0.00019982300675203148, "loss": 5.2397, "step": 1220 }, { "epoch": 0.05827397358341984, "grad_norm": 5.441773414611816, "learning_rate": 0.00019982270924736665, "loss": 6.2828, "step": 1221 }, { "epoch": 0.05832170001551109, "grad_norm": 6.332236289978027, "learning_rate": 0.00019982241149309852, "loss": 6.7718, "step": 1222 }, { "epoch": 0.058369426447602346, "grad_norm": 3.1186656951904297, "learning_rate": 0.0001998221134892279, "loss": 3.687, "step": 1223 }, { "epoch": 0.058417152879693596, "grad_norm": 5.341313362121582, "learning_rate": 0.00019982181523575546, "loss": 5.9183, "step": 1224 }, { "epoch": 0.05846487931178485, "grad_norm": 9.742603302001953, "learning_rate": 0.00019982151673268196, "loss": 11.1658, "step": 1225 }, { "epoch": 0.0585126057438761, "grad_norm": 5.633368492126465, "learning_rate": 0.0001998212179800082, "loss": 6.2966, "step": 1226 }, { "epoch": 0.05856033217596736, "grad_norm": 6.390228748321533, "learning_rate": 0.00019982091897773491, "loss": 7.3505, "step": 1227 }, { "epoch": 0.05860805860805861, "grad_norm": 4.819535732269287, "learning_rate": 0.0001998206197258628, "loss": 6.8309, "step": 1228 }, { "epoch": 0.05865578504014986, "grad_norm": 5.8090105056762695, "learning_rate": 0.00019982032022439262, "loss": 6.8552, "step": 1229 }, { "epoch": 0.058703511472241114, "grad_norm": 4.38669490814209, "learning_rate": 0.0001998200204733252, "loss": 4.5287, "step": 1230 }, { "epoch": 0.058751237904332364, "grad_norm": 5.424665451049805, "learning_rate": 0.00019981972047266116, "loss": 6.465, "step": 1231 }, { "epoch": 0.05879896433642362, "grad_norm": 5.625690460205078, "learning_rate": 0.00019981942022240137, "loss": 7.6902, "step": 1232 }, { "epoch": 0.05884669076851487, "grad_norm": 4.135089874267578, "learning_rate": 0.00019981911972254653, "loss": 5.1217, "step": 1233 }, { "epoch": 0.058894417200606126, "grad_norm": 4.584601402282715, "learning_rate": 0.00019981881897309736, "loss": 5.1911, "step": 1234 }, { "epoch": 0.058942143632697376, "grad_norm": 5.869520664215088, "learning_rate": 0.00019981851797405464, "loss": 6.6174, "step": 1235 }, { "epoch": 0.05898987006478863, "grad_norm": 5.562112331390381, "learning_rate": 0.00019981821672541915, "loss": 6.5435, "step": 1236 }, { "epoch": 0.05903759649687988, "grad_norm": 6.489696025848389, "learning_rate": 0.0001998179152271916, "loss": 6.7296, "step": 1237 }, { "epoch": 0.05908532292897114, "grad_norm": 4.990871429443359, "learning_rate": 0.00019981761347937277, "loss": 6.63, "step": 1238 }, { "epoch": 0.05913304936106239, "grad_norm": 6.288900375366211, "learning_rate": 0.00019981731148196343, "loss": 8.4076, "step": 1239 }, { "epoch": 0.059180775793153645, "grad_norm": 6.3664350509643555, "learning_rate": 0.00019981700923496428, "loss": 5.8255, "step": 1240 }, { "epoch": 0.059228502225244894, "grad_norm": 4.019838333129883, "learning_rate": 0.00019981670673837614, "loss": 6.0219, "step": 1241 }, { "epoch": 0.05927622865733615, "grad_norm": 6.570740222930908, "learning_rate": 0.0001998164039921997, "loss": 7.1613, "step": 1242 }, { "epoch": 0.0593239550894274, "grad_norm": 6.6033806800842285, "learning_rate": 0.00019981610099643576, "loss": 6.3113, "step": 1243 }, { "epoch": 0.05937168152151866, "grad_norm": 4.821962833404541, "learning_rate": 0.0001998157977510851, "loss": 5.7839, "step": 1244 }, { "epoch": 0.05941940795360991, "grad_norm": 6.395529747009277, "learning_rate": 0.0001998154942561484, "loss": 7.8184, "step": 1245 }, { "epoch": 0.05946713438570116, "grad_norm": 5.567226886749268, "learning_rate": 0.00019981519051162652, "loss": 6.5943, "step": 1246 }, { "epoch": 0.05951486081779241, "grad_norm": 5.136303901672363, "learning_rate": 0.00019981488651752013, "loss": 5.8387, "step": 1247 }, { "epoch": 0.05956258724988367, "grad_norm": 5.914418697357178, "learning_rate": 0.00019981458227383002, "loss": 6.1116, "step": 1248 }, { "epoch": 0.05961031368197492, "grad_norm": 5.672010898590088, "learning_rate": 0.000199814277780557, "loss": 6.7764, "step": 1249 }, { "epoch": 0.059658040114066176, "grad_norm": 4.827054023742676, "learning_rate": 0.00019981397303770173, "loss": 6.8793, "step": 1250 }, { "epoch": 0.059705766546157425, "grad_norm": 5.450680732727051, "learning_rate": 0.00019981366804526507, "loss": 6.828, "step": 1251 }, { "epoch": 0.05975349297824868, "grad_norm": 4.682491779327393, "learning_rate": 0.00019981336280324775, "loss": 5.7658, "step": 1252 }, { "epoch": 0.05980121941033993, "grad_norm": 6.666014671325684, "learning_rate": 0.00019981305731165048, "loss": 8.9666, "step": 1253 }, { "epoch": 0.05984894584243118, "grad_norm": 5.8116230964660645, "learning_rate": 0.0001998127515704741, "loss": 7.0605, "step": 1254 }, { "epoch": 0.05989667227452244, "grad_norm": 5.112065315246582, "learning_rate": 0.00019981244557971937, "loss": 6.0437, "step": 1255 }, { "epoch": 0.05994439870661369, "grad_norm": 4.358669757843018, "learning_rate": 0.00019981213933938698, "loss": 5.5774, "step": 1256 }, { "epoch": 0.059992125138704944, "grad_norm": 6.462079048156738, "learning_rate": 0.00019981183284947777, "loss": 7.9854, "step": 1257 }, { "epoch": 0.06003985157079619, "grad_norm": 5.645269393920898, "learning_rate": 0.00019981152610999247, "loss": 6.2153, "step": 1258 }, { "epoch": 0.06008757800288745, "grad_norm": 6.452097415924072, "learning_rate": 0.0001998112191209319, "loss": 7.2683, "step": 1259 }, { "epoch": 0.0601353044349787, "grad_norm": 3.697091579437256, "learning_rate": 0.00019981091188229675, "loss": 4.8166, "step": 1260 }, { "epoch": 0.060183030867069956, "grad_norm": 6.5822882652282715, "learning_rate": 0.0001998106043940878, "loss": 8.7029, "step": 1261 }, { "epoch": 0.060230757299161206, "grad_norm": 4.781540393829346, "learning_rate": 0.00019981029665630588, "loss": 5.3068, "step": 1262 }, { "epoch": 0.06027848373125246, "grad_norm": 5.348550319671631, "learning_rate": 0.00019980998866895173, "loss": 6.506, "step": 1263 }, { "epoch": 0.06032621016334371, "grad_norm": 5.933366298675537, "learning_rate": 0.0001998096804320261, "loss": 6.8603, "step": 1264 }, { "epoch": 0.06037393659543497, "grad_norm": 9.509913444519043, "learning_rate": 0.00019980937194552978, "loss": 6.3672, "step": 1265 }, { "epoch": 0.06042166302752622, "grad_norm": 4.7004523277282715, "learning_rate": 0.00019980906320946353, "loss": 6.0443, "step": 1266 }, { "epoch": 0.060469389459617474, "grad_norm": 3.8409836292266846, "learning_rate": 0.0001998087542238281, "loss": 4.7681, "step": 1267 }, { "epoch": 0.060517115891708724, "grad_norm": 7.4877119064331055, "learning_rate": 0.00019980844498862434, "loss": 6.577, "step": 1268 }, { "epoch": 0.06056484232379998, "grad_norm": 7.089905738830566, "learning_rate": 0.00019980813550385293, "loss": 5.6053, "step": 1269 }, { "epoch": 0.06061256875589123, "grad_norm": 4.442049980163574, "learning_rate": 0.0001998078257695147, "loss": 5.3552, "step": 1270 }, { "epoch": 0.06066029518798249, "grad_norm": 5.32235860824585, "learning_rate": 0.00019980751578561044, "loss": 6.5989, "step": 1271 }, { "epoch": 0.060708021620073736, "grad_norm": 4.274033069610596, "learning_rate": 0.00019980720555214088, "loss": 4.482, "step": 1272 }, { "epoch": 0.06075574805216499, "grad_norm": 6.038319110870361, "learning_rate": 0.00019980689506910682, "loss": 6.9905, "step": 1273 }, { "epoch": 0.06080347448425624, "grad_norm": 5.794177532196045, "learning_rate": 0.000199806584336509, "loss": 6.2485, "step": 1274 }, { "epoch": 0.0608512009163475, "grad_norm": 6.8565449714660645, "learning_rate": 0.00019980627335434826, "loss": 7.6865, "step": 1275 }, { "epoch": 0.06089892734843875, "grad_norm": 5.700689792633057, "learning_rate": 0.0001998059621226253, "loss": 7.3154, "step": 1276 }, { "epoch": 0.060946653780530005, "grad_norm": 6.456467628479004, "learning_rate": 0.00019980565064134096, "loss": 7.4391, "step": 1277 }, { "epoch": 0.060994380212621255, "grad_norm": 5.394682884216309, "learning_rate": 0.000199805338910496, "loss": 6.9588, "step": 1278 }, { "epoch": 0.06104210664471251, "grad_norm": 5.933255672454834, "learning_rate": 0.0001998050269300912, "loss": 6.5465, "step": 1279 }, { "epoch": 0.06108983307680376, "grad_norm": 4.815224647521973, "learning_rate": 0.00019980471470012735, "loss": 7.2611, "step": 1280 }, { "epoch": 0.06113755950889501, "grad_norm": 5.374650478363037, "learning_rate": 0.00019980440222060522, "loss": 7.2072, "step": 1281 }, { "epoch": 0.06118528594098627, "grad_norm": 5.975259780883789, "learning_rate": 0.00019980408949152559, "loss": 8.692, "step": 1282 }, { "epoch": 0.06123301237307752, "grad_norm": 4.682956218719482, "learning_rate": 0.00019980377651288925, "loss": 5.0153, "step": 1283 }, { "epoch": 0.06128073880516877, "grad_norm": 5.41241979598999, "learning_rate": 0.00019980346328469698, "loss": 5.9428, "step": 1284 }, { "epoch": 0.06132846523726002, "grad_norm": 6.177036762237549, "learning_rate": 0.00019980314980694953, "loss": 5.7881, "step": 1285 }, { "epoch": 0.06137619166935128, "grad_norm": 5.134255409240723, "learning_rate": 0.00019980283607964774, "loss": 6.4698, "step": 1286 }, { "epoch": 0.06142391810144253, "grad_norm": 6.0259504318237305, "learning_rate": 0.00019980252210279232, "loss": 7.9555, "step": 1287 }, { "epoch": 0.061471644533533785, "grad_norm": 5.946018695831299, "learning_rate": 0.00019980220787638414, "loss": 7.192, "step": 1288 }, { "epoch": 0.061519370965625035, "grad_norm": 4.8159403800964355, "learning_rate": 0.000199801893400424, "loss": 6.3315, "step": 1289 }, { "epoch": 0.06156709739771629, "grad_norm": 4.71375036239624, "learning_rate": 0.00019980157867491256, "loss": 6.2069, "step": 1290 }, { "epoch": 0.06161482382980754, "grad_norm": 4.72582483291626, "learning_rate": 0.0001998012636998507, "loss": 5.256, "step": 1291 }, { "epoch": 0.0616625502618988, "grad_norm": 5.787103652954102, "learning_rate": 0.0001998009484752392, "loss": 6.4886, "step": 1292 }, { "epoch": 0.06171027669399005, "grad_norm": 5.104354381561279, "learning_rate": 0.00019980063300107883, "loss": 6.2915, "step": 1293 }, { "epoch": 0.061758003126081304, "grad_norm": 5.608123779296875, "learning_rate": 0.0001998003172773704, "loss": 7.4317, "step": 1294 }, { "epoch": 0.061805729558172554, "grad_norm": 3.760908603668213, "learning_rate": 0.00019980000130411467, "loss": 4.5233, "step": 1295 }, { "epoch": 0.06185345599026381, "grad_norm": 5.665719032287598, "learning_rate": 0.00019979968508131247, "loss": 7.585, "step": 1296 }, { "epoch": 0.06190118242235506, "grad_norm": 5.46486234664917, "learning_rate": 0.00019979936860896452, "loss": 6.8773, "step": 1297 }, { "epoch": 0.061948908854446316, "grad_norm": 6.026041507720947, "learning_rate": 0.0001997990518870717, "loss": 7.4133, "step": 1298 }, { "epoch": 0.061996635286537566, "grad_norm": 10.97952651977539, "learning_rate": 0.00019979873491563478, "loss": 6.351, "step": 1299 }, { "epoch": 0.06204436171862882, "grad_norm": 5.1238884925842285, "learning_rate": 0.00019979841769465447, "loss": 4.966, "step": 1300 }, { "epoch": 0.06209208815072007, "grad_norm": 6.978937149047852, "learning_rate": 0.00019979810022413167, "loss": 7.2696, "step": 1301 }, { "epoch": 0.06213981458281133, "grad_norm": 5.799099922180176, "learning_rate": 0.00019979778250406712, "loss": 7.4118, "step": 1302 }, { "epoch": 0.06218754101490258, "grad_norm": 4.965507984161377, "learning_rate": 0.0001997974645344616, "loss": 6.2402, "step": 1303 }, { "epoch": 0.062235267446993835, "grad_norm": 4.546393394470215, "learning_rate": 0.00019979714631531597, "loss": 5.8729, "step": 1304 }, { "epoch": 0.062282993879085084, "grad_norm": 4.928406715393066, "learning_rate": 0.00019979682784663095, "loss": 6.2708, "step": 1305 }, { "epoch": 0.062330720311176334, "grad_norm": 4.03574800491333, "learning_rate": 0.0001997965091284074, "loss": 4.8473, "step": 1306 }, { "epoch": 0.06237844674326759, "grad_norm": 4.830216407775879, "learning_rate": 0.00019979619016064607, "loss": 6.8637, "step": 1307 }, { "epoch": 0.06242617317535884, "grad_norm": 4.492814540863037, "learning_rate": 0.0001997958709433478, "loss": 5.8724, "step": 1308 }, { "epoch": 0.0624738996074501, "grad_norm": 6.305149555206299, "learning_rate": 0.00019979555147651333, "loss": 8.2697, "step": 1309 }, { "epoch": 0.06252162603954135, "grad_norm": 6.380387306213379, "learning_rate": 0.00019979523176014353, "loss": 8.0075, "step": 1310 }, { "epoch": 0.0625693524716326, "grad_norm": 6.43091344833374, "learning_rate": 0.00019979491179423912, "loss": 7.7513, "step": 1311 }, { "epoch": 0.06261707890372385, "grad_norm": 6.313225269317627, "learning_rate": 0.00019979459157880097, "loss": 5.9832, "step": 1312 }, { "epoch": 0.0626648053358151, "grad_norm": 5.914539337158203, "learning_rate": 0.0001997942711138298, "loss": 7.2379, "step": 1313 }, { "epoch": 0.06271253176790637, "grad_norm": 7.0986456871032715, "learning_rate": 0.00019979395039932657, "loss": 6.8586, "step": 1314 }, { "epoch": 0.06276025819999762, "grad_norm": 4.56702995300293, "learning_rate": 0.0001997936294352919, "loss": 5.3618, "step": 1315 }, { "epoch": 0.06280798463208886, "grad_norm": 4.9112982749938965, "learning_rate": 0.00019979330822172665, "loss": 6.8347, "step": 1316 }, { "epoch": 0.06285571106418011, "grad_norm": 6.451658725738525, "learning_rate": 0.00019979298675863171, "loss": 7.2375, "step": 1317 }, { "epoch": 0.06290343749627138, "grad_norm": 4.130640983581543, "learning_rate": 0.00019979266504600778, "loss": 5.3879, "step": 1318 }, { "epoch": 0.06295116392836263, "grad_norm": 5.467159748077393, "learning_rate": 0.0001997923430838557, "loss": 6.5589, "step": 1319 }, { "epoch": 0.06299889036045388, "grad_norm": 5.687652111053467, "learning_rate": 0.0001997920208721763, "loss": 6.8836, "step": 1320 }, { "epoch": 0.06304661679254513, "grad_norm": 5.277466297149658, "learning_rate": 0.00019979169841097033, "loss": 5.4351, "step": 1321 }, { "epoch": 0.06309434322463639, "grad_norm": 5.857608318328857, "learning_rate": 0.00019979137570023863, "loss": 6.6218, "step": 1322 }, { "epoch": 0.06314206965672764, "grad_norm": 4.951214790344238, "learning_rate": 0.000199791052739982, "loss": 6.5093, "step": 1323 }, { "epoch": 0.06318979608881889, "grad_norm": 5.142601490020752, "learning_rate": 0.00019979072953020126, "loss": 7.2203, "step": 1324 }, { "epoch": 0.06323752252091014, "grad_norm": 5.16283655166626, "learning_rate": 0.00019979040607089724, "loss": 6.2853, "step": 1325 }, { "epoch": 0.0632852489530014, "grad_norm": 5.099846363067627, "learning_rate": 0.00019979008236207065, "loss": 5.6786, "step": 1326 }, { "epoch": 0.06333297538509265, "grad_norm": 7.235396385192871, "learning_rate": 0.00019978975840372244, "loss": 8.6704, "step": 1327 }, { "epoch": 0.0633807018171839, "grad_norm": 5.552468299865723, "learning_rate": 0.0001997894341958533, "loss": 5.565, "step": 1328 }, { "epoch": 0.06342842824927515, "grad_norm": 5.986726760864258, "learning_rate": 0.0001997891097384641, "loss": 6.1292, "step": 1329 }, { "epoch": 0.06347615468136641, "grad_norm": 4.076527118682861, "learning_rate": 0.00019978878503155566, "loss": 5.8267, "step": 1330 }, { "epoch": 0.06352388111345766, "grad_norm": 5.784380912780762, "learning_rate": 0.00019978846007512874, "loss": 6.8164, "step": 1331 }, { "epoch": 0.06357160754554891, "grad_norm": 7.183407783508301, "learning_rate": 0.0001997881348691842, "loss": 7.157, "step": 1332 }, { "epoch": 0.06361933397764016, "grad_norm": 5.831700801849365, "learning_rate": 0.00019978780941372286, "loss": 4.9682, "step": 1333 }, { "epoch": 0.06366706040973141, "grad_norm": 5.133748531341553, "learning_rate": 0.0001997874837087455, "loss": 5.6342, "step": 1334 }, { "epoch": 0.06371478684182268, "grad_norm": 4.338518142700195, "learning_rate": 0.00019978715775425293, "loss": 5.527, "step": 1335 }, { "epoch": 0.06376251327391393, "grad_norm": 6.827880859375, "learning_rate": 0.00019978683155024598, "loss": 6.2537, "step": 1336 }, { "epoch": 0.06381023970600518, "grad_norm": 6.917072296142578, "learning_rate": 0.0001997865050967255, "loss": 5.8059, "step": 1337 }, { "epoch": 0.06385796613809643, "grad_norm": 6.251923084259033, "learning_rate": 0.00019978617839369223, "loss": 6.4509, "step": 1338 }, { "epoch": 0.06390569257018769, "grad_norm": 5.752988815307617, "learning_rate": 0.00019978585144114706, "loss": 6.9817, "step": 1339 }, { "epoch": 0.06395341900227894, "grad_norm": 5.287779331207275, "learning_rate": 0.00019978552423909076, "loss": 6.5855, "step": 1340 }, { "epoch": 0.06400114543437019, "grad_norm": 8.172158241271973, "learning_rate": 0.00019978519678752417, "loss": 7.3372, "step": 1341 }, { "epoch": 0.06404887186646144, "grad_norm": 6.428116798400879, "learning_rate": 0.00019978486908644812, "loss": 8.8865, "step": 1342 }, { "epoch": 0.0640965982985527, "grad_norm": 4.930361270904541, "learning_rate": 0.00019978454113586338, "loss": 6.0703, "step": 1343 }, { "epoch": 0.06414432473064395, "grad_norm": 5.518083095550537, "learning_rate": 0.00019978421293577085, "loss": 4.9527, "step": 1344 }, { "epoch": 0.0641920511627352, "grad_norm": 6.320931434631348, "learning_rate": 0.00019978388448617127, "loss": 7.5066, "step": 1345 }, { "epoch": 0.06423977759482645, "grad_norm": 5.729351997375488, "learning_rate": 0.0001997835557870655, "loss": 6.0256, "step": 1346 }, { "epoch": 0.06428750402691771, "grad_norm": 6.066036701202393, "learning_rate": 0.00019978322683845436, "loss": 7.5009, "step": 1347 }, { "epoch": 0.06433523045900896, "grad_norm": 7.021487712860107, "learning_rate": 0.00019978289764033868, "loss": 6.1152, "step": 1348 }, { "epoch": 0.06438295689110021, "grad_norm": 5.401805400848389, "learning_rate": 0.00019978256819271927, "loss": 7.1905, "step": 1349 }, { "epoch": 0.06443068332319146, "grad_norm": 4.905795097351074, "learning_rate": 0.00019978223849559693, "loss": 5.5385, "step": 1350 }, { "epoch": 0.06447840975528273, "grad_norm": 6.608560085296631, "learning_rate": 0.00019978190854897255, "loss": 9.2393, "step": 1351 }, { "epoch": 0.06452613618737398, "grad_norm": 6.524369716644287, "learning_rate": 0.0001997815783528469, "loss": 7.999, "step": 1352 }, { "epoch": 0.06457386261946522, "grad_norm": 4.459353446960449, "learning_rate": 0.00019978124790722082, "loss": 5.2927, "step": 1353 }, { "epoch": 0.06462158905155647, "grad_norm": 4.674695014953613, "learning_rate": 0.00019978091721209516, "loss": 5.6534, "step": 1354 }, { "epoch": 0.06466931548364774, "grad_norm": 6.882339954376221, "learning_rate": 0.0001997805862674707, "loss": 7.8705, "step": 1355 }, { "epoch": 0.06471704191573899, "grad_norm": 4.643222808837891, "learning_rate": 0.0001997802550733483, "loss": 6.0668, "step": 1356 }, { "epoch": 0.06476476834783024, "grad_norm": 5.594811916351318, "learning_rate": 0.00019977992362972877, "loss": 6.2129, "step": 1357 }, { "epoch": 0.06481249477992149, "grad_norm": 7.571159362792969, "learning_rate": 0.00019977959193661295, "loss": 6.5168, "step": 1358 }, { "epoch": 0.06486022121201274, "grad_norm": 7.705883979797363, "learning_rate": 0.0001997792599940017, "loss": 5.5792, "step": 1359 }, { "epoch": 0.064907947644104, "grad_norm": 4.479014873504639, "learning_rate": 0.0001997789278018958, "loss": 5.6164, "step": 1360 }, { "epoch": 0.06495567407619525, "grad_norm": 5.1996307373046875, "learning_rate": 0.0001997785953602961, "loss": 6.5921, "step": 1361 }, { "epoch": 0.0650034005082865, "grad_norm": 5.447461128234863, "learning_rate": 0.0001997782626692034, "loss": 6.7676, "step": 1362 }, { "epoch": 0.06505112694037775, "grad_norm": 5.4039201736450195, "learning_rate": 0.0001997779297286186, "loss": 6.8753, "step": 1363 }, { "epoch": 0.06509885337246901, "grad_norm": 5.607146739959717, "learning_rate": 0.0001997775965385425, "loss": 7.2069, "step": 1364 }, { "epoch": 0.06514657980456026, "grad_norm": 7.765103340148926, "learning_rate": 0.0001997772630989759, "loss": 10.0938, "step": 1365 }, { "epoch": 0.06519430623665151, "grad_norm": 3.9354865550994873, "learning_rate": 0.0001997769294099197, "loss": 5.0037, "step": 1366 }, { "epoch": 0.06524203266874276, "grad_norm": 6.80724573135376, "learning_rate": 0.00019977659547137466, "loss": 7.5893, "step": 1367 }, { "epoch": 0.06528975910083402, "grad_norm": 5.386005401611328, "learning_rate": 0.00019977626128334168, "loss": 4.8811, "step": 1368 }, { "epoch": 0.06533748553292527, "grad_norm": 5.199729919433594, "learning_rate": 0.00019977592684582154, "loss": 6.1969, "step": 1369 }, { "epoch": 0.06538521196501652, "grad_norm": 5.6323323249816895, "learning_rate": 0.0001997755921588151, "loss": 6.8672, "step": 1370 }, { "epoch": 0.06543293839710777, "grad_norm": 5.429327487945557, "learning_rate": 0.0001997752572223232, "loss": 6.6176, "step": 1371 }, { "epoch": 0.06548066482919904, "grad_norm": 5.106516361236572, "learning_rate": 0.00019977492203634674, "loss": 6.1367, "step": 1372 }, { "epoch": 0.06552839126129029, "grad_norm": 5.300068378448486, "learning_rate": 0.00019977458660088644, "loss": 5.856, "step": 1373 }, { "epoch": 0.06557611769338154, "grad_norm": 5.450684070587158, "learning_rate": 0.00019977425091594322, "loss": 7.2261, "step": 1374 }, { "epoch": 0.06562384412547279, "grad_norm": 4.138898849487305, "learning_rate": 0.00019977391498151787, "loss": 5.2684, "step": 1375 }, { "epoch": 0.06567157055756405, "grad_norm": 5.977757453918457, "learning_rate": 0.00019977357879761125, "loss": 7.4852, "step": 1376 }, { "epoch": 0.0657192969896553, "grad_norm": 6.0743842124938965, "learning_rate": 0.00019977324236422424, "loss": 7.6254, "step": 1377 }, { "epoch": 0.06576702342174655, "grad_norm": 5.307611465454102, "learning_rate": 0.00019977290568135763, "loss": 6.7658, "step": 1378 }, { "epoch": 0.0658147498538378, "grad_norm": 6.397800922393799, "learning_rate": 0.00019977256874901227, "loss": 7.3642, "step": 1379 }, { "epoch": 0.06586247628592906, "grad_norm": 5.224310398101807, "learning_rate": 0.000199772231567189, "loss": 6.3156, "step": 1380 }, { "epoch": 0.06591020271802031, "grad_norm": 6.8444037437438965, "learning_rate": 0.00019977189413588868, "loss": 7.9758, "step": 1381 }, { "epoch": 0.06595792915011156, "grad_norm": 9.323966979980469, "learning_rate": 0.00019977155645511217, "loss": 7.0848, "step": 1382 }, { "epoch": 0.06600565558220281, "grad_norm": 5.162333965301514, "learning_rate": 0.00019977121852486027, "loss": 5.7988, "step": 1383 }, { "epoch": 0.06605338201429406, "grad_norm": 5.020596027374268, "learning_rate": 0.00019977088034513386, "loss": 6.0843, "step": 1384 }, { "epoch": 0.06610110844638532, "grad_norm": 4.701834201812744, "learning_rate": 0.00019977054191593375, "loss": 5.587, "step": 1385 }, { "epoch": 0.06614883487847657, "grad_norm": 6.176102638244629, "learning_rate": 0.00019977020323726084, "loss": 6.592, "step": 1386 }, { "epoch": 0.06619656131056782, "grad_norm": 5.295979976654053, "learning_rate": 0.00019976986430911593, "loss": 6.2341, "step": 1387 }, { "epoch": 0.06624428774265907, "grad_norm": 6.3301310539245605, "learning_rate": 0.00019976952513149987, "loss": 6.6459, "step": 1388 }, { "epoch": 0.06629201417475034, "grad_norm": 5.851332187652588, "learning_rate": 0.00019976918570441355, "loss": 6.9627, "step": 1389 }, { "epoch": 0.06633974060684159, "grad_norm": 4.749933242797852, "learning_rate": 0.00019976884602785776, "loss": 5.397, "step": 1390 }, { "epoch": 0.06638746703893283, "grad_norm": 6.7949066162109375, "learning_rate": 0.00019976850610183336, "loss": 7.0712, "step": 1391 }, { "epoch": 0.06643519347102408, "grad_norm": 5.44989538192749, "learning_rate": 0.00019976816592634129, "loss": 5.2793, "step": 1392 }, { "epoch": 0.06648291990311535, "grad_norm": 4.744762420654297, "learning_rate": 0.00019976782550138226, "loss": 5.546, "step": 1393 }, { "epoch": 0.0665306463352066, "grad_norm": 5.614182472229004, "learning_rate": 0.00019976748482695725, "loss": 6.0618, "step": 1394 }, { "epoch": 0.06657837276729785, "grad_norm": 5.821581840515137, "learning_rate": 0.000199767143903067, "loss": 5.4909, "step": 1395 }, { "epoch": 0.0666260991993891, "grad_norm": 6.319411754608154, "learning_rate": 0.00019976680272971244, "loss": 7.4397, "step": 1396 }, { "epoch": 0.06667382563148036, "grad_norm": 7.183498382568359, "learning_rate": 0.00019976646130689442, "loss": 7.0303, "step": 1397 }, { "epoch": 0.06672155206357161, "grad_norm": 6.496625900268555, "learning_rate": 0.00019976611963461372, "loss": 6.9933, "step": 1398 }, { "epoch": 0.06676927849566286, "grad_norm": 5.2109375, "learning_rate": 0.00019976577771287128, "loss": 6.2678, "step": 1399 }, { "epoch": 0.06681700492775411, "grad_norm": 4.8720011711120605, "learning_rate": 0.0001997654355416679, "loss": 5.7667, "step": 1400 }, { "epoch": 0.06681700492775411, "eval_loss": 1.6251323223114014, "eval_runtime": 96.5448, "eval_samples_per_second": 8.732, "eval_steps_per_second": 4.371, "step": 1400 }, { "epoch": 0.06686473135984537, "grad_norm": 8.0670747756958, "learning_rate": 0.00019976509312100447, "loss": 8.2399, "step": 1401 }, { "epoch": 0.06691245779193662, "grad_norm": 5.263416290283203, "learning_rate": 0.00019976475045088186, "loss": 6.591, "step": 1402 }, { "epoch": 0.06696018422402787, "grad_norm": 6.180189609527588, "learning_rate": 0.00019976440753130087, "loss": 7.0425, "step": 1403 }, { "epoch": 0.06700791065611912, "grad_norm": 5.058129787445068, "learning_rate": 0.00019976406436226238, "loss": 5.4545, "step": 1404 }, { "epoch": 0.06705563708821038, "grad_norm": 6.61360502243042, "learning_rate": 0.00019976372094376727, "loss": 6.7202, "step": 1405 }, { "epoch": 0.06710336352030163, "grad_norm": 6.124049663543701, "learning_rate": 0.00019976337727581637, "loss": 6.3471, "step": 1406 }, { "epoch": 0.06715108995239288, "grad_norm": 5.526522636413574, "learning_rate": 0.0001997630333584106, "loss": 6.0526, "step": 1407 }, { "epoch": 0.06719881638448413, "grad_norm": 5.516505718231201, "learning_rate": 0.0001997626891915507, "loss": 7.9752, "step": 1408 }, { "epoch": 0.06724654281657538, "grad_norm": 5.651198387145996, "learning_rate": 0.00019976234477523766, "loss": 6.0345, "step": 1409 }, { "epoch": 0.06729426924866665, "grad_norm": 5.454300880432129, "learning_rate": 0.00019976200010947225, "loss": 6.1546, "step": 1410 }, { "epoch": 0.0673419956807579, "grad_norm": 5.49661111831665, "learning_rate": 0.0001997616551942554, "loss": 6.1562, "step": 1411 }, { "epoch": 0.06738972211284915, "grad_norm": 5.8716721534729, "learning_rate": 0.0001997613100295879, "loss": 7.2321, "step": 1412 }, { "epoch": 0.0674374485449404, "grad_norm": 7.179393768310547, "learning_rate": 0.00019976096461547067, "loss": 8.2223, "step": 1413 }, { "epoch": 0.06748517497703166, "grad_norm": 5.213138580322266, "learning_rate": 0.00019976061895190457, "loss": 5.653, "step": 1414 }, { "epoch": 0.06753290140912291, "grad_norm": 5.578834056854248, "learning_rate": 0.00019976027303889043, "loss": 6.5287, "step": 1415 }, { "epoch": 0.06758062784121416, "grad_norm": 5.943813323974609, "learning_rate": 0.00019975992687642916, "loss": 7.4855, "step": 1416 }, { "epoch": 0.06762835427330541, "grad_norm": 5.0130534172058105, "learning_rate": 0.0001997595804645216, "loss": 5.5635, "step": 1417 }, { "epoch": 0.06767608070539667, "grad_norm": 8.088458061218262, "learning_rate": 0.0001997592338031686, "loss": 6.5269, "step": 1418 }, { "epoch": 0.06772380713748792, "grad_norm": 6.006148815155029, "learning_rate": 0.00019975888689237103, "loss": 7.4442, "step": 1419 }, { "epoch": 0.06777153356957917, "grad_norm": 5.695216178894043, "learning_rate": 0.00019975853973212978, "loss": 7.1272, "step": 1420 }, { "epoch": 0.06781926000167042, "grad_norm": 4.421334743499756, "learning_rate": 0.00019975819232244572, "loss": 4.794, "step": 1421 }, { "epoch": 0.06786698643376168, "grad_norm": 6.380979061126709, "learning_rate": 0.0001997578446633197, "loss": 6.7469, "step": 1422 }, { "epoch": 0.06791471286585293, "grad_norm": 10.048431396484375, "learning_rate": 0.00019975749675475263, "loss": 4.5735, "step": 1423 }, { "epoch": 0.06796243929794418, "grad_norm": 6.169594764709473, "learning_rate": 0.0001997571485967453, "loss": 7.5358, "step": 1424 }, { "epoch": 0.06801016573003543, "grad_norm": 7.54771089553833, "learning_rate": 0.00019975680018929865, "loss": 6.2074, "step": 1425 }, { "epoch": 0.0680578921621267, "grad_norm": 6.739799499511719, "learning_rate": 0.00019975645153241353, "loss": 7.5895, "step": 1426 }, { "epoch": 0.06810561859421795, "grad_norm": 8.054227828979492, "learning_rate": 0.0001997561026260908, "loss": 6.0915, "step": 1427 }, { "epoch": 0.0681533450263092, "grad_norm": 6.955227375030518, "learning_rate": 0.00019975575347033138, "loss": 6.7571, "step": 1428 }, { "epoch": 0.06820107145840044, "grad_norm": 5.498645305633545, "learning_rate": 0.00019975540406513606, "loss": 6.2918, "step": 1429 }, { "epoch": 0.06824879789049171, "grad_norm": 5.576145172119141, "learning_rate": 0.0001997550544105058, "loss": 8.0397, "step": 1430 }, { "epoch": 0.06829652432258296, "grad_norm": 6.459205627441406, "learning_rate": 0.0001997547045064414, "loss": 5.7634, "step": 1431 }, { "epoch": 0.06834425075467421, "grad_norm": 4.438475131988525, "learning_rate": 0.00019975435435294378, "loss": 5.5416, "step": 1432 }, { "epoch": 0.06839197718676546, "grad_norm": 7.885876178741455, "learning_rate": 0.0001997540039500138, "loss": 6.5277, "step": 1433 }, { "epoch": 0.0684397036188567, "grad_norm": 6.090540885925293, "learning_rate": 0.00019975365329765235, "loss": 7.2471, "step": 1434 }, { "epoch": 0.06848743005094797, "grad_norm": 4.9131035804748535, "learning_rate": 0.0001997533023958603, "loss": 5.5133, "step": 1435 }, { "epoch": 0.06853515648303922, "grad_norm": 6.626755714416504, "learning_rate": 0.0001997529512446385, "loss": 7.9222, "step": 1436 }, { "epoch": 0.06858288291513047, "grad_norm": 5.282534599304199, "learning_rate": 0.00019975259984398788, "loss": 6.8754, "step": 1437 }, { "epoch": 0.06863060934722172, "grad_norm": 4.99605655670166, "learning_rate": 0.0001997522481939093, "loss": 6.4879, "step": 1438 }, { "epoch": 0.06867833577931298, "grad_norm": 7.873188018798828, "learning_rate": 0.0001997518962944036, "loss": 7.5573, "step": 1439 }, { "epoch": 0.06872606221140423, "grad_norm": 4.471059799194336, "learning_rate": 0.00019975154414547173, "loss": 5.72, "step": 1440 }, { "epoch": 0.06877378864349548, "grad_norm": 3.9933717250823975, "learning_rate": 0.00019975119174711452, "loss": 5.9306, "step": 1441 }, { "epoch": 0.06882151507558673, "grad_norm": 6.150444507598877, "learning_rate": 0.00019975083909933283, "loss": 5.8745, "step": 1442 }, { "epoch": 0.068869241507678, "grad_norm": 5.360218524932861, "learning_rate": 0.0001997504862021276, "loss": 6.8853, "step": 1443 }, { "epoch": 0.06891696793976924, "grad_norm": 3.830113649368286, "learning_rate": 0.0001997501330554997, "loss": 4.4829, "step": 1444 }, { "epoch": 0.0689646943718605, "grad_norm": 5.9787187576293945, "learning_rate": 0.00019974977965945, "loss": 5.6516, "step": 1445 }, { "epoch": 0.06901242080395174, "grad_norm": 5.175154209136963, "learning_rate": 0.00019974942601397934, "loss": 7.1329, "step": 1446 }, { "epoch": 0.069060147236043, "grad_norm": 6.442656517028809, "learning_rate": 0.00019974907211908868, "loss": 7.3565, "step": 1447 }, { "epoch": 0.06910787366813426, "grad_norm": 5.78922700881958, "learning_rate": 0.00019974871797477889, "loss": 7.5712, "step": 1448 }, { "epoch": 0.0691556001002255, "grad_norm": 5.0412445068359375, "learning_rate": 0.0001997483635810508, "loss": 7.355, "step": 1449 }, { "epoch": 0.06920332653231676, "grad_norm": 5.201817035675049, "learning_rate": 0.00019974800893790535, "loss": 6.7539, "step": 1450 }, { "epoch": 0.06925105296440802, "grad_norm": 4.882143020629883, "learning_rate": 0.00019974765404534345, "loss": 5.7016, "step": 1451 }, { "epoch": 0.06929877939649927, "grad_norm": 6.180375099182129, "learning_rate": 0.00019974729890336588, "loss": 6.508, "step": 1452 }, { "epoch": 0.06934650582859052, "grad_norm": 6.680655002593994, "learning_rate": 0.00019974694351197362, "loss": 6.9433, "step": 1453 }, { "epoch": 0.06939423226068177, "grad_norm": 4.165004730224609, "learning_rate": 0.00019974658787116757, "loss": 4.8608, "step": 1454 }, { "epoch": 0.06944195869277303, "grad_norm": 4.066885471343994, "learning_rate": 0.00019974623198094852, "loss": 5.2078, "step": 1455 }, { "epoch": 0.06948968512486428, "grad_norm": 8.530899047851562, "learning_rate": 0.0001997458758413175, "loss": 6.3692, "step": 1456 }, { "epoch": 0.06953741155695553, "grad_norm": 7.534850597381592, "learning_rate": 0.00019974551945227525, "loss": 7.4906, "step": 1457 }, { "epoch": 0.06958513798904678, "grad_norm": 4.971254825592041, "learning_rate": 0.00019974516281382281, "loss": 5.8827, "step": 1458 }, { "epoch": 0.06963286442113803, "grad_norm": 6.5846357345581055, "learning_rate": 0.00019974480592596097, "loss": 4.9457, "step": 1459 }, { "epoch": 0.0696805908532293, "grad_norm": 4.665902614593506, "learning_rate": 0.00019974444878869064, "loss": 6.0893, "step": 1460 }, { "epoch": 0.06972831728532054, "grad_norm": 4.834763526916504, "learning_rate": 0.0001997440914020127, "loss": 5.5494, "step": 1461 }, { "epoch": 0.06977604371741179, "grad_norm": 4.592804431915283, "learning_rate": 0.00019974373376592812, "loss": 5.968, "step": 1462 }, { "epoch": 0.06982377014950304, "grad_norm": 8.394420623779297, "learning_rate": 0.0001997433758804377, "loss": 7.9709, "step": 1463 }, { "epoch": 0.0698714965815943, "grad_norm": 4.9166789054870605, "learning_rate": 0.00019974301774554237, "loss": 6.7768, "step": 1464 }, { "epoch": 0.06991922301368556, "grad_norm": 5.937588691711426, "learning_rate": 0.00019974265936124308, "loss": 6.5277, "step": 1465 }, { "epoch": 0.0699669494457768, "grad_norm": 6.530873775482178, "learning_rate": 0.00019974230072754065, "loss": 7.4278, "step": 1466 }, { "epoch": 0.07001467587786805, "grad_norm": 4.817973613739014, "learning_rate": 0.000199741941844436, "loss": 6.0094, "step": 1467 }, { "epoch": 0.07006240230995932, "grad_norm": 4.916870594024658, "learning_rate": 0.00019974158271193003, "loss": 6.6554, "step": 1468 }, { "epoch": 0.07011012874205057, "grad_norm": 5.593127250671387, "learning_rate": 0.00019974122333002366, "loss": 6.6662, "step": 1469 }, { "epoch": 0.07015785517414182, "grad_norm": 9.2138090133667, "learning_rate": 0.00019974086369871775, "loss": 7.0618, "step": 1470 }, { "epoch": 0.07020558160623307, "grad_norm": 5.314058780670166, "learning_rate": 0.00019974050381801322, "loss": 6.1235, "step": 1471 }, { "epoch": 0.07025330803832433, "grad_norm": 7.747195720672607, "learning_rate": 0.000199740143687911, "loss": 8.2476, "step": 1472 }, { "epoch": 0.07030103447041558, "grad_norm": 5.9003753662109375, "learning_rate": 0.0001997397833084119, "loss": 6.4312, "step": 1473 }, { "epoch": 0.07034876090250683, "grad_norm": 6.365148067474365, "learning_rate": 0.0001997394226795169, "loss": 8.2969, "step": 1474 }, { "epoch": 0.07039648733459808, "grad_norm": 7.798591613769531, "learning_rate": 0.0001997390618012269, "loss": 8.7275, "step": 1475 }, { "epoch": 0.07044421376668934, "grad_norm": 10.233098983764648, "learning_rate": 0.00019973870067354276, "loss": 7.1471, "step": 1476 }, { "epoch": 0.07049194019878059, "grad_norm": 6.749266624450684, "learning_rate": 0.00019973833929646544, "loss": 7.8638, "step": 1477 }, { "epoch": 0.07053966663087184, "grad_norm": 6.408066272735596, "learning_rate": 0.00019973797766999578, "loss": 6.9021, "step": 1478 }, { "epoch": 0.07058739306296309, "grad_norm": 4.665059566497803, "learning_rate": 0.00019973761579413472, "loss": 5.5684, "step": 1479 }, { "epoch": 0.07063511949505435, "grad_norm": 6.971848011016846, "learning_rate": 0.00019973725366888317, "loss": 5.2145, "step": 1480 }, { "epoch": 0.0706828459271456, "grad_norm": 4.309101104736328, "learning_rate": 0.00019973689129424202, "loss": 4.9203, "step": 1481 }, { "epoch": 0.07073057235923685, "grad_norm": 7.202592849731445, "learning_rate": 0.00019973652867021215, "loss": 7.9743, "step": 1482 }, { "epoch": 0.0707782987913281, "grad_norm": 4.574654579162598, "learning_rate": 0.00019973616579679453, "loss": 5.9004, "step": 1483 }, { "epoch": 0.07082602522341937, "grad_norm": 5.718390941619873, "learning_rate": 0.00019973580267399003, "loss": 5.6975, "step": 1484 }, { "epoch": 0.07087375165551062, "grad_norm": 4.803091526031494, "learning_rate": 0.00019973543930179957, "loss": 5.7176, "step": 1485 }, { "epoch": 0.07092147808760187, "grad_norm": 7.671842098236084, "learning_rate": 0.00019973507568022406, "loss": 6.7297, "step": 1486 }, { "epoch": 0.07096920451969312, "grad_norm": 5.249096393585205, "learning_rate": 0.00019973471180926439, "loss": 5.6079, "step": 1487 }, { "epoch": 0.07101693095178437, "grad_norm": 5.604092597961426, "learning_rate": 0.00019973434768892146, "loss": 7.1322, "step": 1488 }, { "epoch": 0.07106465738387563, "grad_norm": 6.230264186859131, "learning_rate": 0.0001997339833191962, "loss": 6.978, "step": 1489 }, { "epoch": 0.07111238381596688, "grad_norm": 6.016617298126221, "learning_rate": 0.00019973361870008954, "loss": 5.5847, "step": 1490 }, { "epoch": 0.07116011024805813, "grad_norm": 4.400451183319092, "learning_rate": 0.00019973325383160236, "loss": 5.1666, "step": 1491 }, { "epoch": 0.07120783668014938, "grad_norm": 5.012749671936035, "learning_rate": 0.00019973288871373562, "loss": 6.4622, "step": 1492 }, { "epoch": 0.07125556311224064, "grad_norm": 5.776968479156494, "learning_rate": 0.00019973252334649019, "loss": 5.9394, "step": 1493 }, { "epoch": 0.07130328954433189, "grad_norm": 5.619912624359131, "learning_rate": 0.00019973215772986692, "loss": 6.6817, "step": 1494 }, { "epoch": 0.07135101597642314, "grad_norm": 6.103733539581299, "learning_rate": 0.00019973179186386688, "loss": 5.7278, "step": 1495 }, { "epoch": 0.07139874240851439, "grad_norm": 8.222773551940918, "learning_rate": 0.00019973142574849088, "loss": 7.8202, "step": 1496 }, { "epoch": 0.07144646884060565, "grad_norm": 8.36824893951416, "learning_rate": 0.00019973105938373987, "loss": 9.5946, "step": 1497 }, { "epoch": 0.0714941952726969, "grad_norm": 6.333368301391602, "learning_rate": 0.00019973069276961472, "loss": 7.2274, "step": 1498 }, { "epoch": 0.07154192170478815, "grad_norm": 6.297748565673828, "learning_rate": 0.00019973032590611637, "loss": 6.6262, "step": 1499 }, { "epoch": 0.0715896481368794, "grad_norm": 6.146022319793701, "learning_rate": 0.00019972995879324578, "loss": 7.1535, "step": 1500 }, { "epoch": 0.07163737456897067, "grad_norm": 5.654027462005615, "learning_rate": 0.00019972959143100383, "loss": 6.98, "step": 1501 }, { "epoch": 0.07168510100106192, "grad_norm": 4.936557769775391, "learning_rate": 0.00019972922381939146, "loss": 4.4652, "step": 1502 }, { "epoch": 0.07173282743315317, "grad_norm": 5.2737202644348145, "learning_rate": 0.00019972885595840952, "loss": 5.6898, "step": 1503 }, { "epoch": 0.07178055386524441, "grad_norm": 5.161301136016846, "learning_rate": 0.00019972848784805905, "loss": 5.1067, "step": 1504 }, { "epoch": 0.07182828029733568, "grad_norm": 4.05790376663208, "learning_rate": 0.00019972811948834086, "loss": 4.7801, "step": 1505 }, { "epoch": 0.07187600672942693, "grad_norm": 5.834397792816162, "learning_rate": 0.00019972775087925593, "loss": 6.0723, "step": 1506 }, { "epoch": 0.07192373316151818, "grad_norm": 5.8512187004089355, "learning_rate": 0.00019972738202080517, "loss": 7.2836, "step": 1507 }, { "epoch": 0.07197145959360943, "grad_norm": 4.914852142333984, "learning_rate": 0.00019972701291298946, "loss": 4.8328, "step": 1508 }, { "epoch": 0.07201918602570069, "grad_norm": 6.523605823516846, "learning_rate": 0.00019972664355580978, "loss": 6.8695, "step": 1509 }, { "epoch": 0.07206691245779194, "grad_norm": 4.680057525634766, "learning_rate": 0.00019972627394926706, "loss": 5.7238, "step": 1510 }, { "epoch": 0.07211463888988319, "grad_norm": 5.73003625869751, "learning_rate": 0.0001997259040933622, "loss": 6.6803, "step": 1511 }, { "epoch": 0.07216236532197444, "grad_norm": 6.248278617858887, "learning_rate": 0.00019972553398809608, "loss": 7.2845, "step": 1512 }, { "epoch": 0.07221009175406569, "grad_norm": 5.356723308563232, "learning_rate": 0.00019972516363346972, "loss": 5.5901, "step": 1513 }, { "epoch": 0.07225781818615695, "grad_norm": 5.030494213104248, "learning_rate": 0.00019972479302948396, "loss": 4.7535, "step": 1514 }, { "epoch": 0.0723055446182482, "grad_norm": 6.608799934387207, "learning_rate": 0.00019972442217613978, "loss": 7.3457, "step": 1515 }, { "epoch": 0.07235327105033945, "grad_norm": 4.198172569274902, "learning_rate": 0.00019972405107343807, "loss": 4.8257, "step": 1516 }, { "epoch": 0.0724009974824307, "grad_norm": 4.48665714263916, "learning_rate": 0.0001997236797213798, "loss": 5.4951, "step": 1517 }, { "epoch": 0.07244872391452196, "grad_norm": 3.513526201248169, "learning_rate": 0.00019972330811996586, "loss": 3.8787, "step": 1518 }, { "epoch": 0.07249645034661321, "grad_norm": 4.193220615386963, "learning_rate": 0.0001997229362691972, "loss": 5.3623, "step": 1519 }, { "epoch": 0.07254417677870446, "grad_norm": 5.500551223754883, "learning_rate": 0.00019972256416907476, "loss": 6.0362, "step": 1520 }, { "epoch": 0.07259190321079571, "grad_norm": 4.897032737731934, "learning_rate": 0.00019972219181959943, "loss": 5.5378, "step": 1521 }, { "epoch": 0.07263962964288698, "grad_norm": 5.901869773864746, "learning_rate": 0.00019972181922077217, "loss": 5.3082, "step": 1522 }, { "epoch": 0.07268735607497823, "grad_norm": 4.50872278213501, "learning_rate": 0.00019972144637259392, "loss": 5.4624, "step": 1523 }, { "epoch": 0.07273508250706948, "grad_norm": 4.8166584968566895, "learning_rate": 0.0001997210732750656, "loss": 6.3738, "step": 1524 }, { "epoch": 0.07278280893916073, "grad_norm": 5.151186466217041, "learning_rate": 0.00019972069992818813, "loss": 4.9422, "step": 1525 }, { "epoch": 0.07283053537125199, "grad_norm": 5.420198440551758, "learning_rate": 0.00019972032633196248, "loss": 5.4735, "step": 1526 }, { "epoch": 0.07287826180334324, "grad_norm": 5.823660850524902, "learning_rate": 0.00019971995248638951, "loss": 6.4576, "step": 1527 }, { "epoch": 0.07292598823543449, "grad_norm": 5.933914661407471, "learning_rate": 0.00019971957839147027, "loss": 5.964, "step": 1528 }, { "epoch": 0.07297371466752574, "grad_norm": 5.879248142242432, "learning_rate": 0.0001997192040472056, "loss": 7.0123, "step": 1529 }, { "epoch": 0.073021441099617, "grad_norm": 9.106918334960938, "learning_rate": 0.00019971882945359645, "loss": 7.333, "step": 1530 }, { "epoch": 0.07306916753170825, "grad_norm": 7.2231364250183105, "learning_rate": 0.00019971845461064377, "loss": 5.4108, "step": 1531 }, { "epoch": 0.0731168939637995, "grad_norm": 6.6595940589904785, "learning_rate": 0.00019971807951834854, "loss": 7.8109, "step": 1532 }, { "epoch": 0.07316462039589075, "grad_norm": 6.513481616973877, "learning_rate": 0.00019971770417671165, "loss": 6.4648, "step": 1533 }, { "epoch": 0.07321234682798201, "grad_norm": 6.308335781097412, "learning_rate": 0.00019971732858573403, "loss": 8.0733, "step": 1534 }, { "epoch": 0.07326007326007326, "grad_norm": 4.566835403442383, "learning_rate": 0.00019971695274541667, "loss": 6.4237, "step": 1535 }, { "epoch": 0.07330779969216451, "grad_norm": 5.003836631774902, "learning_rate": 0.00019971657665576045, "loss": 6.5042, "step": 1536 }, { "epoch": 0.07335552612425576, "grad_norm": 6.262314319610596, "learning_rate": 0.00019971620031676633, "loss": 7.0278, "step": 1537 }, { "epoch": 0.07340325255634701, "grad_norm": 5.076964855194092, "learning_rate": 0.00019971582372843526, "loss": 5.2852, "step": 1538 }, { "epoch": 0.07345097898843828, "grad_norm": 4.98058557510376, "learning_rate": 0.00019971544689076817, "loss": 6.0187, "step": 1539 }, { "epoch": 0.07349870542052953, "grad_norm": 5.334786415100098, "learning_rate": 0.00019971506980376603, "loss": 6.4198, "step": 1540 }, { "epoch": 0.07354643185262077, "grad_norm": 6.519253730773926, "learning_rate": 0.00019971469246742976, "loss": 8.0925, "step": 1541 }, { "epoch": 0.07359415828471202, "grad_norm": 8.862279891967773, "learning_rate": 0.00019971431488176032, "loss": 8.9549, "step": 1542 }, { "epoch": 0.07364188471680329, "grad_norm": 5.169224262237549, "learning_rate": 0.00019971393704675862, "loss": 6.789, "step": 1543 }, { "epoch": 0.07368961114889454, "grad_norm": 5.402736663818359, "learning_rate": 0.00019971355896242565, "loss": 6.5456, "step": 1544 }, { "epoch": 0.07373733758098579, "grad_norm": 5.247530460357666, "learning_rate": 0.00019971318062876232, "loss": 7.5086, "step": 1545 }, { "epoch": 0.07378506401307704, "grad_norm": 5.178173542022705, "learning_rate": 0.0001997128020457696, "loss": 6.2893, "step": 1546 }, { "epoch": 0.0738327904451683, "grad_norm": 5.8277435302734375, "learning_rate": 0.00019971242321344842, "loss": 7.3786, "step": 1547 }, { "epoch": 0.07388051687725955, "grad_norm": 4.098021984100342, "learning_rate": 0.00019971204413179972, "loss": 5.3967, "step": 1548 }, { "epoch": 0.0739282433093508, "grad_norm": 5.216212272644043, "learning_rate": 0.00019971166480082446, "loss": 6.2666, "step": 1549 }, { "epoch": 0.07397596974144205, "grad_norm": 4.811190605163574, "learning_rate": 0.0001997112852205236, "loss": 5.4698, "step": 1550 }, { "epoch": 0.07402369617353331, "grad_norm": 6.3235650062561035, "learning_rate": 0.00019971090539089805, "loss": 7.6886, "step": 1551 }, { "epoch": 0.07407142260562456, "grad_norm": 6.371283054351807, "learning_rate": 0.0001997105253119488, "loss": 9.2504, "step": 1552 }, { "epoch": 0.07411914903771581, "grad_norm": 5.656994819641113, "learning_rate": 0.0001997101449836768, "loss": 6.1135, "step": 1553 }, { "epoch": 0.07416687546980706, "grad_norm": 6.661611080169678, "learning_rate": 0.000199709764406083, "loss": 7.0288, "step": 1554 }, { "epoch": 0.07421460190189832, "grad_norm": 6.038095474243164, "learning_rate": 0.00019970938357916833, "loss": 7.948, "step": 1555 }, { "epoch": 0.07426232833398957, "grad_norm": 4.901935577392578, "learning_rate": 0.00019970900250293374, "loss": 6.2219, "step": 1556 }, { "epoch": 0.07431005476608082, "grad_norm": 4.575124263763428, "learning_rate": 0.0001997086211773802, "loss": 6.3292, "step": 1557 }, { "epoch": 0.07435778119817207, "grad_norm": 6.391813278198242, "learning_rate": 0.0001997082396025087, "loss": 7.0486, "step": 1558 }, { "epoch": 0.07440550763026334, "grad_norm": 5.125641345977783, "learning_rate": 0.0001997078577783201, "loss": 6.0365, "step": 1559 }, { "epoch": 0.07445323406235459, "grad_norm": 6.541107654571533, "learning_rate": 0.00019970747570481541, "loss": 6.5239, "step": 1560 }, { "epoch": 0.07450096049444584, "grad_norm": 11.997361183166504, "learning_rate": 0.0001997070933819956, "loss": 5.7399, "step": 1561 }, { "epoch": 0.07454868692653709, "grad_norm": 6.662923336029053, "learning_rate": 0.00019970671080986162, "loss": 9.2848, "step": 1562 }, { "epoch": 0.07459641335862834, "grad_norm": 5.5470123291015625, "learning_rate": 0.0001997063279884144, "loss": 6.3059, "step": 1563 }, { "epoch": 0.0746441397907196, "grad_norm": 5.866000652313232, "learning_rate": 0.00019970594491765492, "loss": 6.4263, "step": 1564 }, { "epoch": 0.07469186622281085, "grad_norm": 6.036199569702148, "learning_rate": 0.0001997055615975841, "loss": 7.1695, "step": 1565 }, { "epoch": 0.0747395926549021, "grad_norm": 5.312992572784424, "learning_rate": 0.00019970517802820296, "loss": 6.341, "step": 1566 }, { "epoch": 0.07478731908699335, "grad_norm": 4.94517183303833, "learning_rate": 0.00019970479420951246, "loss": 6.6993, "step": 1567 }, { "epoch": 0.07483504551908461, "grad_norm": 6.285000801086426, "learning_rate": 0.00019970441014151348, "loss": 7.5511, "step": 1568 }, { "epoch": 0.07488277195117586, "grad_norm": 5.202957630157471, "learning_rate": 0.00019970402582420704, "loss": 6.5146, "step": 1569 }, { "epoch": 0.07493049838326711, "grad_norm": 6.017721652984619, "learning_rate": 0.00019970364125759409, "loss": 7.7479, "step": 1570 }, { "epoch": 0.07497822481535836, "grad_norm": 6.5648064613342285, "learning_rate": 0.0001997032564416756, "loss": 7.2589, "step": 1571 }, { "epoch": 0.07502595124744962, "grad_norm": 5.027810096740723, "learning_rate": 0.00019970287137645253, "loss": 6.6274, "step": 1572 }, { "epoch": 0.07507367767954087, "grad_norm": 5.900641918182373, "learning_rate": 0.0001997024860619258, "loss": 5.9374, "step": 1573 }, { "epoch": 0.07512140411163212, "grad_norm": 5.835719585418701, "learning_rate": 0.00019970210049809642, "loss": 6.6491, "step": 1574 }, { "epoch": 0.07516913054372337, "grad_norm": 5.864403247833252, "learning_rate": 0.0001997017146849654, "loss": 5.9716, "step": 1575 }, { "epoch": 0.07521685697581464, "grad_norm": 7.781457901000977, "learning_rate": 0.00019970132862253358, "loss": 8.2885, "step": 1576 }, { "epoch": 0.07526458340790589, "grad_norm": 7.193665981292725, "learning_rate": 0.000199700942310802, "loss": 8.9653, "step": 1577 }, { "epoch": 0.07531230983999714, "grad_norm": 3.9959962368011475, "learning_rate": 0.00019970055574977167, "loss": 4.4899, "step": 1578 }, { "epoch": 0.07536003627208838, "grad_norm": 4.9402031898498535, "learning_rate": 0.00019970016893944346, "loss": 6.2323, "step": 1579 }, { "epoch": 0.07540776270417965, "grad_norm": 7.627409934997559, "learning_rate": 0.0001996997818798184, "loss": 9.2865, "step": 1580 }, { "epoch": 0.0754554891362709, "grad_norm": 4.46121883392334, "learning_rate": 0.00019969939457089745, "loss": 6.3595, "step": 1581 }, { "epoch": 0.07550321556836215, "grad_norm": 5.427103042602539, "learning_rate": 0.00019969900701268155, "loss": 6.2139, "step": 1582 }, { "epoch": 0.0755509420004534, "grad_norm": 4.491297245025635, "learning_rate": 0.0001996986192051717, "loss": 4.9951, "step": 1583 }, { "epoch": 0.07559866843254466, "grad_norm": 8.137874603271484, "learning_rate": 0.00019969823114836886, "loss": 7.5933, "step": 1584 }, { "epoch": 0.07564639486463591, "grad_norm": 5.7112274169921875, "learning_rate": 0.00019969784284227398, "loss": 7.3291, "step": 1585 }, { "epoch": 0.07569412129672716, "grad_norm": 4.706667423248291, "learning_rate": 0.0001996974542868881, "loss": 6.1936, "step": 1586 }, { "epoch": 0.07574184772881841, "grad_norm": 4.729734420776367, "learning_rate": 0.00019969706548221208, "loss": 5.7359, "step": 1587 }, { "epoch": 0.07578957416090966, "grad_norm": 5.268073558807373, "learning_rate": 0.00019969667642824699, "loss": 5.6506, "step": 1588 }, { "epoch": 0.07583730059300092, "grad_norm": 6.599660873413086, "learning_rate": 0.00019969628712499375, "loss": 7.9805, "step": 1589 }, { "epoch": 0.07588502702509217, "grad_norm": 4.655593395233154, "learning_rate": 0.00019969589757245338, "loss": 5.4669, "step": 1590 }, { "epoch": 0.07593275345718342, "grad_norm": 6.783434867858887, "learning_rate": 0.0001996955077706268, "loss": 7.8227, "step": 1591 }, { "epoch": 0.07598047988927467, "grad_norm": 5.285609722137451, "learning_rate": 0.000199695117719515, "loss": 5.5403, "step": 1592 }, { "epoch": 0.07602820632136593, "grad_norm": 5.0504560470581055, "learning_rate": 0.000199694727419119, "loss": 5.58, "step": 1593 }, { "epoch": 0.07607593275345718, "grad_norm": 5.632745265960693, "learning_rate": 0.00019969433686943967, "loss": 5.9426, "step": 1594 }, { "epoch": 0.07612365918554843, "grad_norm": 6.45082426071167, "learning_rate": 0.00019969394607047814, "loss": 5.2393, "step": 1595 }, { "epoch": 0.07617138561763968, "grad_norm": 5.252044677734375, "learning_rate": 0.00019969355502223523, "loss": 7.4354, "step": 1596 }, { "epoch": 0.07621911204973095, "grad_norm": 4.865884304046631, "learning_rate": 0.00019969316372471204, "loss": 5.6465, "step": 1597 }, { "epoch": 0.0762668384818222, "grad_norm": 5.0108160972595215, "learning_rate": 0.0001996927721779095, "loss": 6.4014, "step": 1598 }, { "epoch": 0.07631456491391345, "grad_norm": 5.163329124450684, "learning_rate": 0.00019969238038182856, "loss": 5.9127, "step": 1599 }, { "epoch": 0.0763622913460047, "grad_norm": 8.200749397277832, "learning_rate": 0.00019969198833647025, "loss": 8.727, "step": 1600 }, { "epoch": 0.0763622913460047, "eval_loss": 1.6216099262237549, "eval_runtime": 96.5666, "eval_samples_per_second": 8.73, "eval_steps_per_second": 4.37, "step": 1600 }, { "epoch": 0.07641001777809596, "grad_norm": 6.213042736053467, "learning_rate": 0.0001996915960418355, "loss": 6.8067, "step": 1601 }, { "epoch": 0.07645774421018721, "grad_norm": 6.56706428527832, "learning_rate": 0.00019969120349792535, "loss": 6.0904, "step": 1602 }, { "epoch": 0.07650547064227846, "grad_norm": 6.016430377960205, "learning_rate": 0.00019969081070474076, "loss": 6.6328, "step": 1603 }, { "epoch": 0.07655319707436971, "grad_norm": 5.029018402099609, "learning_rate": 0.0001996904176622827, "loss": 6.2398, "step": 1604 }, { "epoch": 0.07660092350646097, "grad_norm": 5.487245559692383, "learning_rate": 0.00019969002437055213, "loss": 6.3851, "step": 1605 }, { "epoch": 0.07664864993855222, "grad_norm": 6.892983436584473, "learning_rate": 0.00019968963082955007, "loss": 6.9439, "step": 1606 }, { "epoch": 0.07669637637064347, "grad_norm": 9.006431579589844, "learning_rate": 0.00019968923703927752, "loss": 6.6237, "step": 1607 }, { "epoch": 0.07674410280273472, "grad_norm": 6.387834548950195, "learning_rate": 0.0001996888429997354, "loss": 5.2966, "step": 1608 }, { "epoch": 0.07679182923482598, "grad_norm": 6.041240692138672, "learning_rate": 0.00019968844871092473, "loss": 5.8461, "step": 1609 }, { "epoch": 0.07683955566691723, "grad_norm": 6.499271869659424, "learning_rate": 0.00019968805417284652, "loss": 8.0893, "step": 1610 }, { "epoch": 0.07688728209900848, "grad_norm": 4.994762420654297, "learning_rate": 0.00019968765938550175, "loss": 6.4665, "step": 1611 }, { "epoch": 0.07693500853109973, "grad_norm": 5.472882270812988, "learning_rate": 0.00019968726434889137, "loss": 4.9967, "step": 1612 }, { "epoch": 0.07698273496319098, "grad_norm": 4.433282375335693, "learning_rate": 0.0001996868690630164, "loss": 5.9305, "step": 1613 }, { "epoch": 0.07703046139528225, "grad_norm": 4.604747772216797, "learning_rate": 0.00019968647352787785, "loss": 4.8833, "step": 1614 }, { "epoch": 0.0770781878273735, "grad_norm": 4.545809268951416, "learning_rate": 0.00019968607774347664, "loss": 5.9685, "step": 1615 }, { "epoch": 0.07712591425946475, "grad_norm": 6.776806354522705, "learning_rate": 0.0001996856817098138, "loss": 7.1538, "step": 1616 }, { "epoch": 0.077173640691556, "grad_norm": 6.553951740264893, "learning_rate": 0.00019968528542689033, "loss": 6.2756, "step": 1617 }, { "epoch": 0.07722136712364726, "grad_norm": 4.915081977844238, "learning_rate": 0.0001996848888947072, "loss": 5.231, "step": 1618 }, { "epoch": 0.07726909355573851, "grad_norm": 11.090683937072754, "learning_rate": 0.00019968449211326542, "loss": 7.9175, "step": 1619 }, { "epoch": 0.07731681998782976, "grad_norm": 6.923368453979492, "learning_rate": 0.00019968409508256595, "loss": 7.9178, "step": 1620 }, { "epoch": 0.07736454641992101, "grad_norm": 6.173829555511475, "learning_rate": 0.00019968369780260984, "loss": 7.1636, "step": 1621 }, { "epoch": 0.07741227285201227, "grad_norm": 6.82475471496582, "learning_rate": 0.00019968330027339803, "loss": 5.5989, "step": 1622 }, { "epoch": 0.07745999928410352, "grad_norm": 6.1241350173950195, "learning_rate": 0.0001996829024949315, "loss": 7.517, "step": 1623 }, { "epoch": 0.07750772571619477, "grad_norm": 6.248012065887451, "learning_rate": 0.00019968250446721135, "loss": 6.6465, "step": 1624 }, { "epoch": 0.07755545214828602, "grad_norm": 5.67265510559082, "learning_rate": 0.00019968210619023845, "loss": 6.2676, "step": 1625 }, { "epoch": 0.07760317858037728, "grad_norm": 8.361661911010742, "learning_rate": 0.0001996817076640139, "loss": 7.6919, "step": 1626 }, { "epoch": 0.07765090501246853, "grad_norm": 6.589196681976318, "learning_rate": 0.00019968130888853858, "loss": 7.2639, "step": 1627 }, { "epoch": 0.07769863144455978, "grad_norm": 5.206403732299805, "learning_rate": 0.0001996809098638136, "loss": 7.0795, "step": 1628 }, { "epoch": 0.07774635787665103, "grad_norm": 8.287991523742676, "learning_rate": 0.00019968051058983987, "loss": 7.9166, "step": 1629 }, { "epoch": 0.0777940843087423, "grad_norm": 6.763979434967041, "learning_rate": 0.00019968011106661845, "loss": 5.7029, "step": 1630 }, { "epoch": 0.07784181074083354, "grad_norm": 5.0751471519470215, "learning_rate": 0.00019967971129415034, "loss": 5.3303, "step": 1631 }, { "epoch": 0.0778895371729248, "grad_norm": 7.753993988037109, "learning_rate": 0.00019967931127243648, "loss": 6.9684, "step": 1632 }, { "epoch": 0.07793726360501604, "grad_norm": 6.455916881561279, "learning_rate": 0.00019967891100147793, "loss": 6.4729, "step": 1633 }, { "epoch": 0.07798499003710731, "grad_norm": 5.973205089569092, "learning_rate": 0.00019967851048127567, "loss": 5.9516, "step": 1634 }, { "epoch": 0.07803271646919856, "grad_norm": 4.762160301208496, "learning_rate": 0.00019967810971183067, "loss": 5.0758, "step": 1635 }, { "epoch": 0.0780804429012898, "grad_norm": 6.6044087409973145, "learning_rate": 0.000199677708693144, "loss": 7.6978, "step": 1636 }, { "epoch": 0.07812816933338106, "grad_norm": 5.287650108337402, "learning_rate": 0.0001996773074252166, "loss": 5.7193, "step": 1637 }, { "epoch": 0.0781758957654723, "grad_norm": 4.72916841506958, "learning_rate": 0.00019967690590804953, "loss": 4.87, "step": 1638 }, { "epoch": 0.07822362219756357, "grad_norm": 6.024513244628906, "learning_rate": 0.00019967650414164375, "loss": 7.0115, "step": 1639 }, { "epoch": 0.07827134862965482, "grad_norm": 5.97337532043457, "learning_rate": 0.00019967610212600026, "loss": 6.5577, "step": 1640 }, { "epoch": 0.07831907506174607, "grad_norm": 5.233748435974121, "learning_rate": 0.00019967569986112008, "loss": 5.8396, "step": 1641 }, { "epoch": 0.07836680149383732, "grad_norm": 4.639688491821289, "learning_rate": 0.00019967529734700424, "loss": 5.8632, "step": 1642 }, { "epoch": 0.07841452792592858, "grad_norm": 8.295655250549316, "learning_rate": 0.00019967489458365368, "loss": 8.4641, "step": 1643 }, { "epoch": 0.07846225435801983, "grad_norm": 6.5101399421691895, "learning_rate": 0.0001996744915710695, "loss": 6.4926, "step": 1644 }, { "epoch": 0.07850998079011108, "grad_norm": 4.7523193359375, "learning_rate": 0.00019967408830925262, "loss": 5.0963, "step": 1645 }, { "epoch": 0.07855770722220233, "grad_norm": 4.455989837646484, "learning_rate": 0.0001996736847982041, "loss": 5.0967, "step": 1646 }, { "epoch": 0.0786054336542936, "grad_norm": 6.464187145233154, "learning_rate": 0.00019967328103792496, "loss": 7.484, "step": 1647 }, { "epoch": 0.07865316008638484, "grad_norm": 4.563752174377441, "learning_rate": 0.00019967287702841614, "loss": 5.4025, "step": 1648 }, { "epoch": 0.0787008865184761, "grad_norm": 7.606485366821289, "learning_rate": 0.00019967247276967876, "loss": 8.581, "step": 1649 }, { "epoch": 0.07874861295056734, "grad_norm": 4.93491792678833, "learning_rate": 0.00019967206826171371, "loss": 5.2992, "step": 1650 }, { "epoch": 0.0787963393826586, "grad_norm": 6.944345951080322, "learning_rate": 0.0001996716635045221, "loss": 7.3936, "step": 1651 }, { "epoch": 0.07884406581474986, "grad_norm": 7.332148551940918, "learning_rate": 0.00019967125849810487, "loss": 7.6212, "step": 1652 }, { "epoch": 0.0788917922468411, "grad_norm": 7.307640552520752, "learning_rate": 0.00019967085324246308, "loss": 5.5633, "step": 1653 }, { "epoch": 0.07893951867893235, "grad_norm": 5.081273078918457, "learning_rate": 0.00019967044773759772, "loss": 5.6814, "step": 1654 }, { "epoch": 0.07898724511102362, "grad_norm": 12.49409008026123, "learning_rate": 0.0001996700419835098, "loss": 9.3296, "step": 1655 }, { "epoch": 0.07903497154311487, "grad_norm": 5.087846755981445, "learning_rate": 0.00019966963598020036, "loss": 6.4591, "step": 1656 }, { "epoch": 0.07908269797520612, "grad_norm": 6.591655254364014, "learning_rate": 0.00019966922972767043, "loss": 7.3023, "step": 1657 }, { "epoch": 0.07913042440729737, "grad_norm": 6.3594441413879395, "learning_rate": 0.00019966882322592096, "loss": 5.6784, "step": 1658 }, { "epoch": 0.07917815083938863, "grad_norm": 6.709249973297119, "learning_rate": 0.00019966841647495298, "loss": 8.6046, "step": 1659 }, { "epoch": 0.07922587727147988, "grad_norm": 6.185741424560547, "learning_rate": 0.00019966800947476756, "loss": 6.3217, "step": 1660 }, { "epoch": 0.07927360370357113, "grad_norm": 6.148744583129883, "learning_rate": 0.00019966760222536566, "loss": 7.7432, "step": 1661 }, { "epoch": 0.07932133013566238, "grad_norm": 5.742559909820557, "learning_rate": 0.00019966719472674837, "loss": 6.3385, "step": 1662 }, { "epoch": 0.07936905656775364, "grad_norm": 4.77824592590332, "learning_rate": 0.00019966678697891664, "loss": 4.7605, "step": 1663 }, { "epoch": 0.07941678299984489, "grad_norm": 6.681817054748535, "learning_rate": 0.00019966637898187153, "loss": 8.3296, "step": 1664 }, { "epoch": 0.07946450943193614, "grad_norm": 4.378421783447266, "learning_rate": 0.000199665970735614, "loss": 5.6909, "step": 1665 }, { "epoch": 0.07951223586402739, "grad_norm": 4.756327152252197, "learning_rate": 0.00019966556224014517, "loss": 5.3174, "step": 1666 }, { "epoch": 0.07955996229611864, "grad_norm": 10.603726387023926, "learning_rate": 0.00019966515349546598, "loss": 7.3794, "step": 1667 }, { "epoch": 0.0796076887282099, "grad_norm": 4.203999042510986, "learning_rate": 0.00019966474450157747, "loss": 6.4418, "step": 1668 }, { "epoch": 0.07965541516030115, "grad_norm": 6.3584794998168945, "learning_rate": 0.0001996643352584807, "loss": 8.7569, "step": 1669 }, { "epoch": 0.0797031415923924, "grad_norm": 4.668309688568115, "learning_rate": 0.00019966392576617662, "loss": 5.0782, "step": 1670 }, { "epoch": 0.07975086802448365, "grad_norm": 6.633731842041016, "learning_rate": 0.00019966351602466632, "loss": 6.8295, "step": 1671 }, { "epoch": 0.07979859445657492, "grad_norm": 6.814727783203125, "learning_rate": 0.00019966310603395081, "loss": 8.2306, "step": 1672 }, { "epoch": 0.07984632088866617, "grad_norm": 7.060058116912842, "learning_rate": 0.00019966269579403112, "loss": 7.1187, "step": 1673 }, { "epoch": 0.07989404732075742, "grad_norm": 5.988522052764893, "learning_rate": 0.00019966228530490825, "loss": 6.0699, "step": 1674 }, { "epoch": 0.07994177375284867, "grad_norm": 4.418022632598877, "learning_rate": 0.00019966187456658325, "loss": 5.6178, "step": 1675 }, { "epoch": 0.07998950018493993, "grad_norm": 5.4556355476379395, "learning_rate": 0.0001996614635790571, "loss": 5.6792, "step": 1676 }, { "epoch": 0.08003722661703118, "grad_norm": 6.298896789550781, "learning_rate": 0.0001996610523423309, "loss": 6.8607, "step": 1677 }, { "epoch": 0.08008495304912243, "grad_norm": 6.532053470611572, "learning_rate": 0.00019966064085640564, "loss": 7.807, "step": 1678 }, { "epoch": 0.08013267948121368, "grad_norm": 5.308156967163086, "learning_rate": 0.00019966022912128235, "loss": 7.2996, "step": 1679 }, { "epoch": 0.08018040591330494, "grad_norm": 6.2061591148376465, "learning_rate": 0.00019965981713696205, "loss": 6.8429, "step": 1680 }, { "epoch": 0.08022813234539619, "grad_norm": 5.631930351257324, "learning_rate": 0.0001996594049034458, "loss": 6.3054, "step": 1681 }, { "epoch": 0.08027585877748744, "grad_norm": 5.632227897644043, "learning_rate": 0.0001996589924207346, "loss": 6.1, "step": 1682 }, { "epoch": 0.08032358520957869, "grad_norm": 4.874825477600098, "learning_rate": 0.0001996585796888295, "loss": 5.9334, "step": 1683 }, { "epoch": 0.08037131164166995, "grad_norm": 6.659267425537109, "learning_rate": 0.00019965816670773151, "loss": 8.0067, "step": 1684 }, { "epoch": 0.0804190380737612, "grad_norm": 5.430305480957031, "learning_rate": 0.0001996577534774417, "loss": 5.7937, "step": 1685 }, { "epoch": 0.08046676450585245, "grad_norm": 11.987308502197266, "learning_rate": 0.00019965733999796108, "loss": 8.7384, "step": 1686 }, { "epoch": 0.0805144909379437, "grad_norm": 5.439146041870117, "learning_rate": 0.00019965692626929067, "loss": 6.6936, "step": 1687 }, { "epoch": 0.08056221737003497, "grad_norm": 5.855020523071289, "learning_rate": 0.00019965651229143154, "loss": 6.0076, "step": 1688 }, { "epoch": 0.08060994380212622, "grad_norm": 4.129716396331787, "learning_rate": 0.0001996560980643847, "loss": 4.4198, "step": 1689 }, { "epoch": 0.08065767023421747, "grad_norm": 4.787317752838135, "learning_rate": 0.0001996556835881512, "loss": 6.0638, "step": 1690 }, { "epoch": 0.08070539666630872, "grad_norm": 7.317868709564209, "learning_rate": 0.00019965526886273204, "loss": 7.1752, "step": 1691 }, { "epoch": 0.08075312309839996, "grad_norm": 5.041870594024658, "learning_rate": 0.0001996548538881283, "loss": 6.0734, "step": 1692 }, { "epoch": 0.08080084953049123, "grad_norm": 7.702842712402344, "learning_rate": 0.00019965443866434103, "loss": 6.5332, "step": 1693 }, { "epoch": 0.08084857596258248, "grad_norm": 6.717932224273682, "learning_rate": 0.0001996540231913712, "loss": 6.4044, "step": 1694 }, { "epoch": 0.08089630239467373, "grad_norm": 6.184224605560303, "learning_rate": 0.0001996536074692199, "loss": 5.8096, "step": 1695 }, { "epoch": 0.08094402882676498, "grad_norm": 5.836258411407471, "learning_rate": 0.00019965319149788818, "loss": 7.3853, "step": 1696 }, { "epoch": 0.08099175525885624, "grad_norm": 4.996598243713379, "learning_rate": 0.00019965277527737702, "loss": 5.4813, "step": 1697 }, { "epoch": 0.08103948169094749, "grad_norm": 7.573056221008301, "learning_rate": 0.0001996523588076875, "loss": 5.7797, "step": 1698 }, { "epoch": 0.08108720812303874, "grad_norm": 7.517128944396973, "learning_rate": 0.0001996519420888207, "loss": 6.7109, "step": 1699 }, { "epoch": 0.08113493455512999, "grad_norm": 4.552103519439697, "learning_rate": 0.0001996515251207776, "loss": 5.3078, "step": 1700 }, { "epoch": 0.08118266098722125, "grad_norm": 4.873751163482666, "learning_rate": 0.0001996511079035593, "loss": 6.6249, "step": 1701 }, { "epoch": 0.0812303874193125, "grad_norm": 5.868348598480225, "learning_rate": 0.00019965069043716675, "loss": 7.0252, "step": 1702 }, { "epoch": 0.08127811385140375, "grad_norm": 6.060241222381592, "learning_rate": 0.0001996502727216011, "loss": 6.6664, "step": 1703 }, { "epoch": 0.081325840283495, "grad_norm": 4.212329387664795, "learning_rate": 0.0001996498547568633, "loss": 5.2279, "step": 1704 }, { "epoch": 0.08137356671558627, "grad_norm": 5.285454750061035, "learning_rate": 0.0001996494365429545, "loss": 5.3834, "step": 1705 }, { "epoch": 0.08142129314767751, "grad_norm": 5.365895748138428, "learning_rate": 0.00019964901807987566, "loss": 5.8543, "step": 1706 }, { "epoch": 0.08146901957976876, "grad_norm": 5.965190887451172, "learning_rate": 0.00019964859936762785, "loss": 6.0896, "step": 1707 }, { "epoch": 0.08151674601186001, "grad_norm": 7.844050884246826, "learning_rate": 0.00019964818040621213, "loss": 6.1424, "step": 1708 }, { "epoch": 0.08156447244395128, "grad_norm": 4.484588146209717, "learning_rate": 0.00019964776119562954, "loss": 5.0725, "step": 1709 }, { "epoch": 0.08161219887604253, "grad_norm": 5.158018589019775, "learning_rate": 0.0001996473417358811, "loss": 6.2989, "step": 1710 }, { "epoch": 0.08165992530813378, "grad_norm": 7.060210227966309, "learning_rate": 0.00019964692202696792, "loss": 6.3938, "step": 1711 }, { "epoch": 0.08170765174022503, "grad_norm": 6.1060404777526855, "learning_rate": 0.000199646502068891, "loss": 7.282, "step": 1712 }, { "epoch": 0.08175537817231629, "grad_norm": 5.346414566040039, "learning_rate": 0.0001996460818616514, "loss": 5.1761, "step": 1713 }, { "epoch": 0.08180310460440754, "grad_norm": 5.801202297210693, "learning_rate": 0.0001996456614052502, "loss": 6.3118, "step": 1714 }, { "epoch": 0.08185083103649879, "grad_norm": 4.125235557556152, "learning_rate": 0.00019964524069968843, "loss": 4.2174, "step": 1715 }, { "epoch": 0.08189855746859004, "grad_norm": 3.6682240962982178, "learning_rate": 0.00019964481974496713, "loss": 4.262, "step": 1716 }, { "epoch": 0.08194628390068129, "grad_norm": 4.852508544921875, "learning_rate": 0.00019964439854108735, "loss": 6.0469, "step": 1717 }, { "epoch": 0.08199401033277255, "grad_norm": 5.197348594665527, "learning_rate": 0.00019964397708805017, "loss": 5.5976, "step": 1718 }, { "epoch": 0.0820417367648638, "grad_norm": 5.768982887268066, "learning_rate": 0.0001996435553858566, "loss": 7.2279, "step": 1719 }, { "epoch": 0.08208946319695505, "grad_norm": 13.2932710647583, "learning_rate": 0.00019964313343450777, "loss": 8.2807, "step": 1720 }, { "epoch": 0.0821371896290463, "grad_norm": 5.955008506774902, "learning_rate": 0.00019964271123400467, "loss": 6.6238, "step": 1721 }, { "epoch": 0.08218491606113756, "grad_norm": 6.39152193069458, "learning_rate": 0.00019964228878434836, "loss": 7.3249, "step": 1722 }, { "epoch": 0.08223264249322881, "grad_norm": 4.231929779052734, "learning_rate": 0.00019964186608553992, "loss": 5.2923, "step": 1723 }, { "epoch": 0.08228036892532006, "grad_norm": 7.347591400146484, "learning_rate": 0.00019964144313758042, "loss": 6.4519, "step": 1724 }, { "epoch": 0.08232809535741131, "grad_norm": 6.886537075042725, "learning_rate": 0.00019964101994047085, "loss": 8.6672, "step": 1725 }, { "epoch": 0.08237582178950258, "grad_norm": 5.32847261428833, "learning_rate": 0.00019964059649421235, "loss": 6.2811, "step": 1726 }, { "epoch": 0.08242354822159383, "grad_norm": 8.488326072692871, "learning_rate": 0.00019964017279880592, "loss": 7.7289, "step": 1727 }, { "epoch": 0.08247127465368508, "grad_norm": 6.963995933532715, "learning_rate": 0.00019963974885425266, "loss": 7.5388, "step": 1728 }, { "epoch": 0.08251900108577633, "grad_norm": 5.534013748168945, "learning_rate": 0.00019963932466055359, "loss": 6.1878, "step": 1729 }, { "epoch": 0.08256672751786759, "grad_norm": 4.852373123168945, "learning_rate": 0.00019963890021770984, "loss": 6.2748, "step": 1730 }, { "epoch": 0.08261445394995884, "grad_norm": 5.445532321929932, "learning_rate": 0.00019963847552572237, "loss": 6.2058, "step": 1731 }, { "epoch": 0.08266218038205009, "grad_norm": 8.185537338256836, "learning_rate": 0.00019963805058459232, "loss": 9.7164, "step": 1732 }, { "epoch": 0.08270990681414134, "grad_norm": 5.265599727630615, "learning_rate": 0.0001996376253943207, "loss": 5.992, "step": 1733 }, { "epoch": 0.0827576332462326, "grad_norm": 5.291799068450928, "learning_rate": 0.00019963719995490866, "loss": 6.5815, "step": 1734 }, { "epoch": 0.08280535967832385, "grad_norm": 5.858906269073486, "learning_rate": 0.00019963677426635716, "loss": 6.2855, "step": 1735 }, { "epoch": 0.0828530861104151, "grad_norm": 6.700840950012207, "learning_rate": 0.00019963634832866734, "loss": 6.4154, "step": 1736 }, { "epoch": 0.08290081254250635, "grad_norm": 6.953587532043457, "learning_rate": 0.0001996359221418402, "loss": 7.558, "step": 1737 }, { "epoch": 0.08294853897459761, "grad_norm": 5.477334976196289, "learning_rate": 0.00019963549570587685, "loss": 8.7402, "step": 1738 }, { "epoch": 0.08299626540668886, "grad_norm": 5.381014347076416, "learning_rate": 0.00019963506902077836, "loss": 6.7167, "step": 1739 }, { "epoch": 0.08304399183878011, "grad_norm": 5.538295269012451, "learning_rate": 0.0001996346420865458, "loss": 6.491, "step": 1740 }, { "epoch": 0.08309171827087136, "grad_norm": 4.111212253570557, "learning_rate": 0.0001996342149031802, "loss": 4.9609, "step": 1741 }, { "epoch": 0.08313944470296261, "grad_norm": 4.362302303314209, "learning_rate": 0.00019963378747068263, "loss": 5.5623, "step": 1742 }, { "epoch": 0.08318717113505388, "grad_norm": 5.191958904266357, "learning_rate": 0.00019963335978905419, "loss": 7.2734, "step": 1743 }, { "epoch": 0.08323489756714512, "grad_norm": 6.065234661102295, "learning_rate": 0.00019963293185829595, "loss": 5.2144, "step": 1744 }, { "epoch": 0.08328262399923637, "grad_norm": 4.423040866851807, "learning_rate": 0.00019963250367840897, "loss": 5.6843, "step": 1745 }, { "epoch": 0.08333035043132762, "grad_norm": 5.322823524475098, "learning_rate": 0.00019963207524939432, "loss": 5.6057, "step": 1746 }, { "epoch": 0.08337807686341889, "grad_norm": 5.804923057556152, "learning_rate": 0.00019963164657125305, "loss": 6.1089, "step": 1747 }, { "epoch": 0.08342580329551014, "grad_norm": 5.805313587188721, "learning_rate": 0.00019963121764398625, "loss": 7.0777, "step": 1748 }, { "epoch": 0.08347352972760139, "grad_norm": 6.270987033843994, "learning_rate": 0.00019963078846759502, "loss": 6.6001, "step": 1749 }, { "epoch": 0.08352125615969264, "grad_norm": 3.6206235885620117, "learning_rate": 0.0001996303590420804, "loss": 4.5162, "step": 1750 }, { "epoch": 0.0835689825917839, "grad_norm": 4.857881546020508, "learning_rate": 0.00019962992936744346, "loss": 6.1566, "step": 1751 }, { "epoch": 0.08361670902387515, "grad_norm": 5.4148993492126465, "learning_rate": 0.00019962949944368527, "loss": 6.1801, "step": 1752 }, { "epoch": 0.0836644354559664, "grad_norm": 6.691871643066406, "learning_rate": 0.00019962906927080694, "loss": 7.4375, "step": 1753 }, { "epoch": 0.08371216188805765, "grad_norm": 6.246443271636963, "learning_rate": 0.00019962863884880953, "loss": 7.5509, "step": 1754 }, { "epoch": 0.08375988832014891, "grad_norm": 5.4471540451049805, "learning_rate": 0.0001996282081776941, "loss": 5.3964, "step": 1755 }, { "epoch": 0.08380761475224016, "grad_norm": 4.935588836669922, "learning_rate": 0.00019962777725746175, "loss": 5.5563, "step": 1756 }, { "epoch": 0.08385534118433141, "grad_norm": 7.437018394470215, "learning_rate": 0.00019962734608811356, "loss": 8.1651, "step": 1757 }, { "epoch": 0.08390306761642266, "grad_norm": 6.402498245239258, "learning_rate": 0.00019962691466965054, "loss": 8.032, "step": 1758 }, { "epoch": 0.08395079404851392, "grad_norm": 8.192234992980957, "learning_rate": 0.00019962648300207389, "loss": 8.3023, "step": 1759 }, { "epoch": 0.08399852048060517, "grad_norm": 5.692000389099121, "learning_rate": 0.0001996260510853846, "loss": 5.6597, "step": 1760 }, { "epoch": 0.08404624691269642, "grad_norm": 4.441336631774902, "learning_rate": 0.00019962561891958376, "loss": 6.0446, "step": 1761 }, { "epoch": 0.08409397334478767, "grad_norm": 5.4050445556640625, "learning_rate": 0.00019962518650467245, "loss": 7.0277, "step": 1762 }, { "epoch": 0.08414169977687894, "grad_norm": 4.252866744995117, "learning_rate": 0.0001996247538406518, "loss": 4.4678, "step": 1763 }, { "epoch": 0.08418942620897019, "grad_norm": 6.836278438568115, "learning_rate": 0.00019962432092752286, "loss": 7.2563, "step": 1764 }, { "epoch": 0.08423715264106144, "grad_norm": 5.331582546234131, "learning_rate": 0.0001996238877652867, "loss": 6.1467, "step": 1765 }, { "epoch": 0.08428487907315269, "grad_norm": 5.630874156951904, "learning_rate": 0.00019962345435394437, "loss": 6.5396, "step": 1766 }, { "epoch": 0.08433260550524393, "grad_norm": 5.042309284210205, "learning_rate": 0.00019962302069349703, "loss": 5.5076, "step": 1767 }, { "epoch": 0.0843803319373352, "grad_norm": 4.426672458648682, "learning_rate": 0.00019962258678394573, "loss": 5.928, "step": 1768 }, { "epoch": 0.08442805836942645, "grad_norm": 8.48932933807373, "learning_rate": 0.00019962215262529155, "loss": 8.2468, "step": 1769 }, { "epoch": 0.0844757848015177, "grad_norm": 6.2159857749938965, "learning_rate": 0.00019962171821753558, "loss": 6.9449, "step": 1770 }, { "epoch": 0.08452351123360895, "grad_norm": 4.8556437492370605, "learning_rate": 0.0001996212835606789, "loss": 4.4424, "step": 1771 }, { "epoch": 0.08457123766570021, "grad_norm": 7.196907997131348, "learning_rate": 0.00019962084865472263, "loss": 6.151, "step": 1772 }, { "epoch": 0.08461896409779146, "grad_norm": 5.671611785888672, "learning_rate": 0.0001996204134996678, "loss": 6.6122, "step": 1773 }, { "epoch": 0.08466669052988271, "grad_norm": 6.0633463859558105, "learning_rate": 0.00019961997809551555, "loss": 7.7069, "step": 1774 }, { "epoch": 0.08471441696197396, "grad_norm": 5.805813312530518, "learning_rate": 0.00019961954244226697, "loss": 6.1099, "step": 1775 }, { "epoch": 0.08476214339406522, "grad_norm": 5.567056655883789, "learning_rate": 0.0001996191065399231, "loss": 6.1179, "step": 1776 }, { "epoch": 0.08480986982615647, "grad_norm": 5.7736968994140625, "learning_rate": 0.00019961867038848503, "loss": 7.0391, "step": 1777 }, { "epoch": 0.08485759625824772, "grad_norm": 4.204695224761963, "learning_rate": 0.00019961823398795392, "loss": 5.4167, "step": 1778 }, { "epoch": 0.08490532269033897, "grad_norm": 5.957277297973633, "learning_rate": 0.00019961779733833083, "loss": 6.36, "step": 1779 }, { "epoch": 0.08495304912243024, "grad_norm": 5.479842185974121, "learning_rate": 0.0001996173604396168, "loss": 5.9074, "step": 1780 }, { "epoch": 0.08500077555452148, "grad_norm": 4.836981296539307, "learning_rate": 0.000199616923291813, "loss": 6.2541, "step": 1781 }, { "epoch": 0.08504850198661273, "grad_norm": 3.9009861946105957, "learning_rate": 0.00019961648589492044, "loss": 4.0949, "step": 1782 }, { "epoch": 0.08509622841870398, "grad_norm": 4.950165271759033, "learning_rate": 0.00019961604824894028, "loss": 6.0995, "step": 1783 }, { "epoch": 0.08514395485079525, "grad_norm": 5.362955570220947, "learning_rate": 0.00019961561035387358, "loss": 6.5055, "step": 1784 }, { "epoch": 0.0851916812828865, "grad_norm": 5.2055344581604, "learning_rate": 0.0001996151722097215, "loss": 4.8912, "step": 1785 }, { "epoch": 0.08523940771497775, "grad_norm": 6.469151496887207, "learning_rate": 0.00019961473381648507, "loss": 8.8181, "step": 1786 }, { "epoch": 0.085287134147069, "grad_norm": 5.428609371185303, "learning_rate": 0.00019961429517416536, "loss": 6.4427, "step": 1787 }, { "epoch": 0.08533486057916026, "grad_norm": 5.883609294891357, "learning_rate": 0.00019961385628276354, "loss": 5.7509, "step": 1788 }, { "epoch": 0.08538258701125151, "grad_norm": 6.005609512329102, "learning_rate": 0.00019961341714228067, "loss": 5.9459, "step": 1789 }, { "epoch": 0.08543031344334276, "grad_norm": 8.431653022766113, "learning_rate": 0.00019961297775271785, "loss": 7.3227, "step": 1790 }, { "epoch": 0.08547803987543401, "grad_norm": 6.327171802520752, "learning_rate": 0.0001996125381140762, "loss": 7.7006, "step": 1791 }, { "epoch": 0.08552576630752526, "grad_norm": 6.21482515335083, "learning_rate": 0.0001996120982263568, "loss": 6.8783, "step": 1792 }, { "epoch": 0.08557349273961652, "grad_norm": 5.2386698722839355, "learning_rate": 0.0001996116580895607, "loss": 6.3512, "step": 1793 }, { "epoch": 0.08562121917170777, "grad_norm": 4.963709354400635, "learning_rate": 0.00019961121770368913, "loss": 5.4235, "step": 1794 }, { "epoch": 0.08566894560379902, "grad_norm": 8.258075714111328, "learning_rate": 0.00019961077706874303, "loss": 8.6651, "step": 1795 }, { "epoch": 0.08571667203589027, "grad_norm": 6.397151947021484, "learning_rate": 0.00019961033618472364, "loss": 8.7071, "step": 1796 }, { "epoch": 0.08576439846798153, "grad_norm": 5.547435760498047, "learning_rate": 0.000199609895051632, "loss": 5.663, "step": 1797 }, { "epoch": 0.08581212490007278, "grad_norm": 5.095963478088379, "learning_rate": 0.0001996094536694692, "loss": 4.9001, "step": 1798 }, { "epoch": 0.08585985133216403, "grad_norm": 6.133388519287109, "learning_rate": 0.00019960901203823638, "loss": 8.1782, "step": 1799 }, { "epoch": 0.08590757776425528, "grad_norm": 5.368918418884277, "learning_rate": 0.00019960857015793464, "loss": 5.2037, "step": 1800 }, { "epoch": 0.08590757776425528, "eval_loss": 1.6195329427719116, "eval_runtime": 96.4627, "eval_samples_per_second": 8.739, "eval_steps_per_second": 4.375, "step": 1800 }, { "epoch": 0.08595530419634655, "grad_norm": 5.692762851715088, "learning_rate": 0.00019960812802856506, "loss": 7.2335, "step": 1801 }, { "epoch": 0.0860030306284378, "grad_norm": 6.827969551086426, "learning_rate": 0.00019960768565012877, "loss": 8.5713, "step": 1802 }, { "epoch": 0.08605075706052905, "grad_norm": 6.382897853851318, "learning_rate": 0.00019960724302262685, "loss": 6.7326, "step": 1803 }, { "epoch": 0.0860984834926203, "grad_norm": 5.170256614685059, "learning_rate": 0.00019960680014606042, "loss": 5.69, "step": 1804 }, { "epoch": 0.08614620992471156, "grad_norm": 8.245781898498535, "learning_rate": 0.0001996063570204306, "loss": 8.1054, "step": 1805 }, { "epoch": 0.08619393635680281, "grad_norm": 5.627835750579834, "learning_rate": 0.0001996059136457385, "loss": 5.8392, "step": 1806 }, { "epoch": 0.08624166278889406, "grad_norm": 5.2213454246521, "learning_rate": 0.0001996054700219852, "loss": 5.9661, "step": 1807 }, { "epoch": 0.08628938922098531, "grad_norm": 4.8643364906311035, "learning_rate": 0.0001996050261491718, "loss": 5.7056, "step": 1808 }, { "epoch": 0.08633711565307657, "grad_norm": 5.94075345993042, "learning_rate": 0.00019960458202729947, "loss": 6.1795, "step": 1809 }, { "epoch": 0.08638484208516782, "grad_norm": 5.838682651519775, "learning_rate": 0.00019960413765636928, "loss": 6.0042, "step": 1810 }, { "epoch": 0.08643256851725907, "grad_norm": 10.372759819030762, "learning_rate": 0.00019960369303638234, "loss": 9.3382, "step": 1811 }, { "epoch": 0.08648029494935032, "grad_norm": 6.304983139038086, "learning_rate": 0.00019960324816733978, "loss": 8.4037, "step": 1812 }, { "epoch": 0.08652802138144158, "grad_norm": 5.898202419281006, "learning_rate": 0.0001996028030492427, "loss": 6.2126, "step": 1813 }, { "epoch": 0.08657574781353283, "grad_norm": 5.635178089141846, "learning_rate": 0.00019960235768209217, "loss": 5.6331, "step": 1814 }, { "epoch": 0.08662347424562408, "grad_norm": 5.929975509643555, "learning_rate": 0.00019960191206588938, "loss": 6.8255, "step": 1815 }, { "epoch": 0.08667120067771533, "grad_norm": 6.191317558288574, "learning_rate": 0.0001996014662006354, "loss": 5.8182, "step": 1816 }, { "epoch": 0.08671892710980658, "grad_norm": 9.138450622558594, "learning_rate": 0.00019960102008633138, "loss": 5.9147, "step": 1817 }, { "epoch": 0.08676665354189785, "grad_norm": 6.335710048675537, "learning_rate": 0.0001996005737229784, "loss": 7.0813, "step": 1818 }, { "epoch": 0.0868143799739891, "grad_norm": 89.27218627929688, "learning_rate": 0.0001996001271105776, "loss": 5.9644, "step": 1819 }, { "epoch": 0.08686210640608034, "grad_norm": 9.937189102172852, "learning_rate": 0.00019959968024913004, "loss": 10.5711, "step": 1820 }, { "epoch": 0.0869098328381716, "grad_norm": 5.75858736038208, "learning_rate": 0.00019959923313863693, "loss": 6.4231, "step": 1821 }, { "epoch": 0.08695755927026286, "grad_norm": 4.982675075531006, "learning_rate": 0.00019959878577909934, "loss": 5.5205, "step": 1822 }, { "epoch": 0.08700528570235411, "grad_norm": 4.038936614990234, "learning_rate": 0.00019959833817051838, "loss": 4.4054, "step": 1823 }, { "epoch": 0.08705301213444536, "grad_norm": 5.798765182495117, "learning_rate": 0.00019959789031289516, "loss": 5.8708, "step": 1824 }, { "epoch": 0.0871007385665366, "grad_norm": 4.251887321472168, "learning_rate": 0.00019959744220623081, "loss": 4.7339, "step": 1825 }, { "epoch": 0.08714846499862787, "grad_norm": 4.055886745452881, "learning_rate": 0.0001995969938505265, "loss": 5.0028, "step": 1826 }, { "epoch": 0.08719619143071912, "grad_norm": 4.766815662384033, "learning_rate": 0.0001995965452457833, "loss": 6.0082, "step": 1827 }, { "epoch": 0.08724391786281037, "grad_norm": 7.274682998657227, "learning_rate": 0.0001995960963920023, "loss": 7.61, "step": 1828 }, { "epoch": 0.08729164429490162, "grad_norm": 5.725744247436523, "learning_rate": 0.0001995956472891847, "loss": 6.2951, "step": 1829 }, { "epoch": 0.08733937072699288, "grad_norm": 5.3603057861328125, "learning_rate": 0.0001995951979373316, "loss": 5.8247, "step": 1830 }, { "epoch": 0.08738709715908413, "grad_norm": 7.089607238769531, "learning_rate": 0.00019959474833644407, "loss": 7.8775, "step": 1831 }, { "epoch": 0.08743482359117538, "grad_norm": 7.481447219848633, "learning_rate": 0.0001995942984865233, "loss": 7.8192, "step": 1832 }, { "epoch": 0.08748255002326663, "grad_norm": 4.901138782501221, "learning_rate": 0.00019959384838757037, "loss": 5.063, "step": 1833 }, { "epoch": 0.0875302764553579, "grad_norm": 7.431210517883301, "learning_rate": 0.00019959339803958645, "loss": 6.5834, "step": 1834 }, { "epoch": 0.08757800288744914, "grad_norm": 4.83966588973999, "learning_rate": 0.00019959294744257262, "loss": 4.7621, "step": 1835 }, { "epoch": 0.0876257293195404, "grad_norm": 6.232669830322266, "learning_rate": 0.00019959249659653002, "loss": 5.9189, "step": 1836 }, { "epoch": 0.08767345575163164, "grad_norm": 4.409353733062744, "learning_rate": 0.00019959204550145981, "loss": 4.7726, "step": 1837 }, { "epoch": 0.0877211821837229, "grad_norm": 6.254725933074951, "learning_rate": 0.0001995915941573631, "loss": 6.5493, "step": 1838 }, { "epoch": 0.08776890861581416, "grad_norm": 6.882847309112549, "learning_rate": 0.00019959114256424098, "loss": 6.6558, "step": 1839 }, { "epoch": 0.0878166350479054, "grad_norm": 6.962137222290039, "learning_rate": 0.00019959069072209462, "loss": 7.2759, "step": 1840 }, { "epoch": 0.08786436147999666, "grad_norm": 3.9503977298736572, "learning_rate": 0.00019959023863092514, "loss": 4.7519, "step": 1841 }, { "epoch": 0.08791208791208792, "grad_norm": 5.693809509277344, "learning_rate": 0.00019958978629073368, "loss": 6.7759, "step": 1842 }, { "epoch": 0.08795981434417917, "grad_norm": 8.888816833496094, "learning_rate": 0.00019958933370152137, "loss": 7.0524, "step": 1843 }, { "epoch": 0.08800754077627042, "grad_norm": 6.7101593017578125, "learning_rate": 0.00019958888086328932, "loss": 7.5252, "step": 1844 }, { "epoch": 0.08805526720836167, "grad_norm": 5.2643351554870605, "learning_rate": 0.00019958842777603866, "loss": 5.6273, "step": 1845 }, { "epoch": 0.08810299364045292, "grad_norm": 5.582153797149658, "learning_rate": 0.00019958797443977056, "loss": 6.2019, "step": 1846 }, { "epoch": 0.08815072007254418, "grad_norm": 5.084579944610596, "learning_rate": 0.00019958752085448613, "loss": 4.7064, "step": 1847 }, { "epoch": 0.08819844650463543, "grad_norm": 7.017921447753906, "learning_rate": 0.00019958706702018648, "loss": 7.6752, "step": 1848 }, { "epoch": 0.08824617293672668, "grad_norm": 6.079553604125977, "learning_rate": 0.00019958661293687283, "loss": 7.112, "step": 1849 }, { "epoch": 0.08829389936881793, "grad_norm": 4.764186382293701, "learning_rate": 0.0001995861586045462, "loss": 5.1878, "step": 1850 }, { "epoch": 0.0883416258009092, "grad_norm": 6.2909016609191895, "learning_rate": 0.0001995857040232078, "loss": 7.2509, "step": 1851 }, { "epoch": 0.08838935223300044, "grad_norm": 23.84864616394043, "learning_rate": 0.00019958524919285874, "loss": 6.5176, "step": 1852 }, { "epoch": 0.08843707866509169, "grad_norm": 6.480355739593506, "learning_rate": 0.00019958479411350018, "loss": 5.6499, "step": 1853 }, { "epoch": 0.08848480509718294, "grad_norm": 4.547000408172607, "learning_rate": 0.00019958433878513324, "loss": 5.2339, "step": 1854 }, { "epoch": 0.0885325315292742, "grad_norm": 5.786708354949951, "learning_rate": 0.00019958388320775907, "loss": 5.1756, "step": 1855 }, { "epoch": 0.08858025796136546, "grad_norm": 6.517292499542236, "learning_rate": 0.0001995834273813788, "loss": 7.7226, "step": 1856 }, { "epoch": 0.0886279843934567, "grad_norm": 8.183725357055664, "learning_rate": 0.00019958297130599358, "loss": 7.9364, "step": 1857 }, { "epoch": 0.08867571082554795, "grad_norm": 5.43164587020874, "learning_rate": 0.00019958251498160452, "loss": 6.165, "step": 1858 }, { "epoch": 0.08872343725763922, "grad_norm": 5.762732028961182, "learning_rate": 0.00019958205840821277, "loss": 6.0718, "step": 1859 }, { "epoch": 0.08877116368973047, "grad_norm": 9.716732025146484, "learning_rate": 0.00019958160158581954, "loss": 8.6889, "step": 1860 }, { "epoch": 0.08881889012182172, "grad_norm": 5.996732234954834, "learning_rate": 0.00019958114451442587, "loss": 7.4132, "step": 1861 }, { "epoch": 0.08886661655391297, "grad_norm": 6.845658779144287, "learning_rate": 0.00019958068719403297, "loss": 7.4148, "step": 1862 }, { "epoch": 0.08891434298600423, "grad_norm": 7.86606502532959, "learning_rate": 0.00019958022962464196, "loss": 6.448, "step": 1863 }, { "epoch": 0.08896206941809548, "grad_norm": 7.205526828765869, "learning_rate": 0.00019957977180625397, "loss": 6.8785, "step": 1864 }, { "epoch": 0.08900979585018673, "grad_norm": 5.968493461608887, "learning_rate": 0.0001995793137388702, "loss": 4.9836, "step": 1865 }, { "epoch": 0.08905752228227798, "grad_norm": 4.983180046081543, "learning_rate": 0.00019957885542249172, "loss": 5.9704, "step": 1866 }, { "epoch": 0.08910524871436924, "grad_norm": 5.562103748321533, "learning_rate": 0.00019957839685711974, "loss": 6.1199, "step": 1867 }, { "epoch": 0.08915297514646049, "grad_norm": 4.140066146850586, "learning_rate": 0.0001995779380427554, "loss": 5.4426, "step": 1868 }, { "epoch": 0.08920070157855174, "grad_norm": 5.0160956382751465, "learning_rate": 0.00019957747897939978, "loss": 7.9901, "step": 1869 }, { "epoch": 0.08924842801064299, "grad_norm": 7.082386016845703, "learning_rate": 0.0001995770196670541, "loss": 5.9645, "step": 1870 }, { "epoch": 0.08929615444273424, "grad_norm": 5.607995510101318, "learning_rate": 0.00019957656010571948, "loss": 6.0904, "step": 1871 }, { "epoch": 0.0893438808748255, "grad_norm": 5.770166397094727, "learning_rate": 0.00019957610029539707, "loss": 6.5684, "step": 1872 }, { "epoch": 0.08939160730691675, "grad_norm": 5.108577251434326, "learning_rate": 0.00019957564023608804, "loss": 7.4545, "step": 1873 }, { "epoch": 0.089439333739008, "grad_norm": 5.625748634338379, "learning_rate": 0.0001995751799277935, "loss": 5.0555, "step": 1874 }, { "epoch": 0.08948706017109925, "grad_norm": 5.752185821533203, "learning_rate": 0.00019957471937051465, "loss": 7.1917, "step": 1875 }, { "epoch": 0.08953478660319052, "grad_norm": 7.390456676483154, "learning_rate": 0.00019957425856425258, "loss": 9.0817, "step": 1876 }, { "epoch": 0.08958251303528177, "grad_norm": 5.426296710968018, "learning_rate": 0.0001995737975090085, "loss": 4.8511, "step": 1877 }, { "epoch": 0.08963023946737302, "grad_norm": 6.285407543182373, "learning_rate": 0.00019957333620478353, "loss": 7.2764, "step": 1878 }, { "epoch": 0.08967796589946427, "grad_norm": 5.6253132820129395, "learning_rate": 0.00019957287465157886, "loss": 6.6716, "step": 1879 }, { "epoch": 0.08972569233155553, "grad_norm": 5.893998146057129, "learning_rate": 0.0001995724128493956, "loss": 6.3348, "step": 1880 }, { "epoch": 0.08977341876364678, "grad_norm": 4.910081386566162, "learning_rate": 0.00019957195079823493, "loss": 6.034, "step": 1881 }, { "epoch": 0.08982114519573803, "grad_norm": 5.80413293838501, "learning_rate": 0.000199571488498098, "loss": 7.409, "step": 1882 }, { "epoch": 0.08986887162782928, "grad_norm": 5.987128257751465, "learning_rate": 0.00019957102594898594, "loss": 6.761, "step": 1883 }, { "epoch": 0.08991659805992054, "grad_norm": 6.473491191864014, "learning_rate": 0.00019957056315089995, "loss": 6.6613, "step": 1884 }, { "epoch": 0.08996432449201179, "grad_norm": 4.1360626220703125, "learning_rate": 0.00019957010010384117, "loss": 4.245, "step": 1885 }, { "epoch": 0.09001205092410304, "grad_norm": 5.108883857727051, "learning_rate": 0.00019956963680781077, "loss": 5.9881, "step": 1886 }, { "epoch": 0.09005977735619429, "grad_norm": 7.545982837677002, "learning_rate": 0.00019956917326280986, "loss": 7.1406, "step": 1887 }, { "epoch": 0.09010750378828555, "grad_norm": 4.158173561096191, "learning_rate": 0.00019956870946883965, "loss": 5.9806, "step": 1888 }, { "epoch": 0.0901552302203768, "grad_norm": 6.711853981018066, "learning_rate": 0.00019956824542590132, "loss": 8.3325, "step": 1889 }, { "epoch": 0.09020295665246805, "grad_norm": 4.975286960601807, "learning_rate": 0.00019956778113399594, "loss": 6.141, "step": 1890 }, { "epoch": 0.0902506830845593, "grad_norm": 4.7780914306640625, "learning_rate": 0.00019956731659312476, "loss": 5.8485, "step": 1891 }, { "epoch": 0.09029840951665057, "grad_norm": 6.617183685302734, "learning_rate": 0.00019956685180328888, "loss": 7.666, "step": 1892 }, { "epoch": 0.09034613594874182, "grad_norm": 4.6189727783203125, "learning_rate": 0.0001995663867644895, "loss": 4.7532, "step": 1893 }, { "epoch": 0.09039386238083306, "grad_norm": 4.73331356048584, "learning_rate": 0.00019956592147672776, "loss": 5.3607, "step": 1894 }, { "epoch": 0.09044158881292431, "grad_norm": 4.980668067932129, "learning_rate": 0.00019956545594000487, "loss": 6.2638, "step": 1895 }, { "epoch": 0.09048931524501556, "grad_norm": 5.799066543579102, "learning_rate": 0.00019956499015432193, "loss": 6.4386, "step": 1896 }, { "epoch": 0.09053704167710683, "grad_norm": 6.795571804046631, "learning_rate": 0.00019956452411968014, "loss": 5.6267, "step": 1897 }, { "epoch": 0.09058476810919808, "grad_norm": 7.541502475738525, "learning_rate": 0.00019956405783608065, "loss": 6.1345, "step": 1898 }, { "epoch": 0.09063249454128933, "grad_norm": 9.937294006347656, "learning_rate": 0.00019956359130352466, "loss": 9.0167, "step": 1899 }, { "epoch": 0.09068022097338058, "grad_norm": 5.588807582855225, "learning_rate": 0.0001995631245220133, "loss": 6.3056, "step": 1900 }, { "epoch": 0.09072794740547184, "grad_norm": 5.208680152893066, "learning_rate": 0.00019956265749154772, "loss": 6.5704, "step": 1901 }, { "epoch": 0.09077567383756309, "grad_norm": 5.525402545928955, "learning_rate": 0.00019956219021212914, "loss": 8.073, "step": 1902 }, { "epoch": 0.09082340026965434, "grad_norm": 8.202049255371094, "learning_rate": 0.0001995617226837587, "loss": 6.4539, "step": 1903 }, { "epoch": 0.09087112670174559, "grad_norm": 6.243727207183838, "learning_rate": 0.00019956125490643757, "loss": 6.6853, "step": 1904 }, { "epoch": 0.09091885313383685, "grad_norm": 6.45194149017334, "learning_rate": 0.00019956078688016695, "loss": 7.5694, "step": 1905 }, { "epoch": 0.0909665795659281, "grad_norm": 4.42805290222168, "learning_rate": 0.00019956031860494795, "loss": 5.4368, "step": 1906 }, { "epoch": 0.09101430599801935, "grad_norm": 5.469226837158203, "learning_rate": 0.0001995598500807818, "loss": 5.0051, "step": 1907 }, { "epoch": 0.0910620324301106, "grad_norm": 7.494277000427246, "learning_rate": 0.00019955938130766963, "loss": 7.8122, "step": 1908 }, { "epoch": 0.09110975886220186, "grad_norm": 3.8653011322021484, "learning_rate": 0.00019955891228561266, "loss": 4.9454, "step": 1909 }, { "epoch": 0.09115748529429311, "grad_norm": 5.886962413787842, "learning_rate": 0.00019955844301461196, "loss": 7.0929, "step": 1910 }, { "epoch": 0.09120521172638436, "grad_norm": 5.907853603363037, "learning_rate": 0.00019955797349466886, "loss": 7.1583, "step": 1911 }, { "epoch": 0.09125293815847561, "grad_norm": 7.710178852081299, "learning_rate": 0.0001995575037257844, "loss": 8.1009, "step": 1912 }, { "epoch": 0.09130066459056688, "grad_norm": 5.069637775421143, "learning_rate": 0.0001995570337079598, "loss": 5.0842, "step": 1913 }, { "epoch": 0.09134839102265813, "grad_norm": 6.01732063293457, "learning_rate": 0.00019955656344119627, "loss": 6.9818, "step": 1914 }, { "epoch": 0.09139611745474938, "grad_norm": 7.960158348083496, "learning_rate": 0.00019955609292549492, "loss": 7.4926, "step": 1915 }, { "epoch": 0.09144384388684063, "grad_norm": 5.53605842590332, "learning_rate": 0.00019955562216085698, "loss": 6.2172, "step": 1916 }, { "epoch": 0.09149157031893189, "grad_norm": 5.594004154205322, "learning_rate": 0.0001995551511472836, "loss": 6.1891, "step": 1917 }, { "epoch": 0.09153929675102314, "grad_norm": 4.7342209815979, "learning_rate": 0.00019955467988477595, "loss": 5.0942, "step": 1918 }, { "epoch": 0.09158702318311439, "grad_norm": 5.738880157470703, "learning_rate": 0.00019955420837333525, "loss": 6.6871, "step": 1919 }, { "epoch": 0.09163474961520564, "grad_norm": 4.85321044921875, "learning_rate": 0.00019955373661296265, "loss": 6.2562, "step": 1920 }, { "epoch": 0.09168247604729689, "grad_norm": 4.387369155883789, "learning_rate": 0.00019955326460365931, "loss": 4.2293, "step": 1921 }, { "epoch": 0.09173020247938815, "grad_norm": 6.184238910675049, "learning_rate": 0.00019955279234542647, "loss": 7.6299, "step": 1922 }, { "epoch": 0.0917779289114794, "grad_norm": 6.43262243270874, "learning_rate": 0.00019955231983826525, "loss": 7.3633, "step": 1923 }, { "epoch": 0.09182565534357065, "grad_norm": 6.625996112823486, "learning_rate": 0.00019955184708217685, "loss": 6.2032, "step": 1924 }, { "epoch": 0.0918733817756619, "grad_norm": 5.289299011230469, "learning_rate": 0.00019955137407716246, "loss": 7.2631, "step": 1925 }, { "epoch": 0.09192110820775316, "grad_norm": 4.64791202545166, "learning_rate": 0.00019955090082322328, "loss": 5.4584, "step": 1926 }, { "epoch": 0.09196883463984441, "grad_norm": 5.630763053894043, "learning_rate": 0.00019955042732036046, "loss": 5.98, "step": 1927 }, { "epoch": 0.09201656107193566, "grad_norm": 5.336659908294678, "learning_rate": 0.00019954995356857519, "loss": 5.2926, "step": 1928 }, { "epoch": 0.09206428750402691, "grad_norm": 5.44187593460083, "learning_rate": 0.00019954947956786866, "loss": 4.9369, "step": 1929 }, { "epoch": 0.09211201393611818, "grad_norm": 4.850311279296875, "learning_rate": 0.00019954900531824209, "loss": 4.9112, "step": 1930 }, { "epoch": 0.09215974036820943, "grad_norm": 6.510289669036865, "learning_rate": 0.0001995485308196966, "loss": 6.6208, "step": 1931 }, { "epoch": 0.09220746680030067, "grad_norm": 4.996893882751465, "learning_rate": 0.0001995480560722334, "loss": 5.4653, "step": 1932 }, { "epoch": 0.09225519323239192, "grad_norm": 6.860300540924072, "learning_rate": 0.00019954758107585373, "loss": 6.2089, "step": 1933 }, { "epoch": 0.09230291966448319, "grad_norm": 8.323232650756836, "learning_rate": 0.00019954710583055868, "loss": 8.1626, "step": 1934 }, { "epoch": 0.09235064609657444, "grad_norm": 8.373363494873047, "learning_rate": 0.00019954663033634954, "loss": 9.0978, "step": 1935 }, { "epoch": 0.09239837252866569, "grad_norm": 5.7876787185668945, "learning_rate": 0.00019954615459322742, "loss": 7.0643, "step": 1936 }, { "epoch": 0.09244609896075694, "grad_norm": 4.430604457855225, "learning_rate": 0.00019954567860119358, "loss": 5.2645, "step": 1937 }, { "epoch": 0.0924938253928482, "grad_norm": 6.38831090927124, "learning_rate": 0.00019954520236024914, "loss": 6.1311, "step": 1938 }, { "epoch": 0.09254155182493945, "grad_norm": 6.169930934906006, "learning_rate": 0.00019954472587039535, "loss": 5.4872, "step": 1939 }, { "epoch": 0.0925892782570307, "grad_norm": 6.449937343597412, "learning_rate": 0.00019954424913163333, "loss": 6.8896, "step": 1940 }, { "epoch": 0.09263700468912195, "grad_norm": 7.309608459472656, "learning_rate": 0.00019954377214396435, "loss": 6.7205, "step": 1941 }, { "epoch": 0.09268473112121321, "grad_norm": 6.497426986694336, "learning_rate": 0.00019954329490738955, "loss": 6.1321, "step": 1942 }, { "epoch": 0.09273245755330446, "grad_norm": 5.323448657989502, "learning_rate": 0.00019954281742191014, "loss": 5.6016, "step": 1943 }, { "epoch": 0.09278018398539571, "grad_norm": 5.627899169921875, "learning_rate": 0.00019954233968752733, "loss": 6.8229, "step": 1944 }, { "epoch": 0.09282791041748696, "grad_norm": 6.3683342933654785, "learning_rate": 0.0001995418617042423, "loss": 7.9162, "step": 1945 }, { "epoch": 0.09287563684957821, "grad_norm": 7.388428211212158, "learning_rate": 0.00019954138347205624, "loss": 6.7901, "step": 1946 }, { "epoch": 0.09292336328166947, "grad_norm": 4.738420486450195, "learning_rate": 0.00019954090499097035, "loss": 5.4793, "step": 1947 }, { "epoch": 0.09297108971376072, "grad_norm": 7.107654571533203, "learning_rate": 0.00019954042626098582, "loss": 6.6402, "step": 1948 }, { "epoch": 0.09301881614585197, "grad_norm": 6.25970458984375, "learning_rate": 0.00019953994728210387, "loss": 6.7944, "step": 1949 }, { "epoch": 0.09306654257794322, "grad_norm": 6.2929463386535645, "learning_rate": 0.00019953946805432567, "loss": 7.0768, "step": 1950 }, { "epoch": 0.09311426901003449, "grad_norm": 6.515363693237305, "learning_rate": 0.00019953898857765246, "loss": 7.382, "step": 1951 }, { "epoch": 0.09316199544212574, "grad_norm": 6.0229082107543945, "learning_rate": 0.00019953850885208536, "loss": 6.16, "step": 1952 }, { "epoch": 0.09320972187421699, "grad_norm": 6.195054054260254, "learning_rate": 0.00019953802887762566, "loss": 6.7316, "step": 1953 }, { "epoch": 0.09325744830630824, "grad_norm": 5.1811017990112305, "learning_rate": 0.0001995375486542745, "loss": 5.7193, "step": 1954 }, { "epoch": 0.0933051747383995, "grad_norm": 6.671079158782959, "learning_rate": 0.0001995370681820331, "loss": 6.9041, "step": 1955 }, { "epoch": 0.09335290117049075, "grad_norm": 5.995284080505371, "learning_rate": 0.00019953658746090267, "loss": 6.5472, "step": 1956 }, { "epoch": 0.093400627602582, "grad_norm": 5.943747520446777, "learning_rate": 0.0001995361064908844, "loss": 7.415, "step": 1957 }, { "epoch": 0.09344835403467325, "grad_norm": 6.161198139190674, "learning_rate": 0.00019953562527197952, "loss": 6.6886, "step": 1958 }, { "epoch": 0.09349608046676451, "grad_norm": 6.798264503479004, "learning_rate": 0.00019953514380418916, "loss": 6.8366, "step": 1959 }, { "epoch": 0.09354380689885576, "grad_norm": 7.221126079559326, "learning_rate": 0.0001995346620875146, "loss": 7.722, "step": 1960 }, { "epoch": 0.09359153333094701, "grad_norm": 6.91715145111084, "learning_rate": 0.00019953418012195703, "loss": 8.2421, "step": 1961 }, { "epoch": 0.09363925976303826, "grad_norm": 5.927742004394531, "learning_rate": 0.00019953369790751761, "loss": 6.1009, "step": 1962 }, { "epoch": 0.09368698619512952, "grad_norm": 10.192344665527344, "learning_rate": 0.0001995332154441976, "loss": 8.3961, "step": 1963 }, { "epoch": 0.09373471262722077, "grad_norm": 5.9493513107299805, "learning_rate": 0.0001995327327319982, "loss": 7.5684, "step": 1964 }, { "epoch": 0.09378243905931202, "grad_norm": 8.660958290100098, "learning_rate": 0.00019953224977092057, "loss": 9.5475, "step": 1965 }, { "epoch": 0.09383016549140327, "grad_norm": 6.687934398651123, "learning_rate": 0.00019953176656096596, "loss": 5.7409, "step": 1966 }, { "epoch": 0.09387789192349454, "grad_norm": 3.778975009918213, "learning_rate": 0.0001995312831021356, "loss": 4.9604, "step": 1967 }, { "epoch": 0.09392561835558579, "grad_norm": 5.175545692443848, "learning_rate": 0.00019953079939443062, "loss": 6.0746, "step": 1968 }, { "epoch": 0.09397334478767704, "grad_norm": 5.146554470062256, "learning_rate": 0.0001995303154378523, "loss": 5.4301, "step": 1969 }, { "epoch": 0.09402107121976828, "grad_norm": 5.948916912078857, "learning_rate": 0.00019952983123240183, "loss": 5.4946, "step": 1970 }, { "epoch": 0.09406879765185953, "grad_norm": 7.415115833282471, "learning_rate": 0.00019952934677808042, "loss": 8.3154, "step": 1971 }, { "epoch": 0.0941165240839508, "grad_norm": 4.534360885620117, "learning_rate": 0.00019952886207488926, "loss": 6.1824, "step": 1972 }, { "epoch": 0.09416425051604205, "grad_norm": 4.519402027130127, "learning_rate": 0.00019952837712282958, "loss": 4.7616, "step": 1973 }, { "epoch": 0.0942119769481333, "grad_norm": 5.43019962310791, "learning_rate": 0.00019952789192190262, "loss": 7.4646, "step": 1974 }, { "epoch": 0.09425970338022455, "grad_norm": 5.494196891784668, "learning_rate": 0.00019952740647210954, "loss": 7.0449, "step": 1975 }, { "epoch": 0.09430742981231581, "grad_norm": 7.097334384918213, "learning_rate": 0.0001995269207734516, "loss": 5.7563, "step": 1976 }, { "epoch": 0.09435515624440706, "grad_norm": 6.139416694641113, "learning_rate": 0.00019952643482592996, "loss": 6.3506, "step": 1977 }, { "epoch": 0.09440288267649831, "grad_norm": 5.325318813323975, "learning_rate": 0.0001995259486295459, "loss": 6.6352, "step": 1978 }, { "epoch": 0.09445060910858956, "grad_norm": 6.176486492156982, "learning_rate": 0.0001995254621843006, "loss": 6.4361, "step": 1979 }, { "epoch": 0.09449833554068082, "grad_norm": 6.585480690002441, "learning_rate": 0.0001995249754901953, "loss": 6.18, "step": 1980 }, { "epoch": 0.09454606197277207, "grad_norm": 5.279582977294922, "learning_rate": 0.00019952448854723115, "loss": 7.2575, "step": 1981 }, { "epoch": 0.09459378840486332, "grad_norm": 6.4810895919799805, "learning_rate": 0.00019952400135540946, "loss": 7.4695, "step": 1982 }, { "epoch": 0.09464151483695457, "grad_norm": 6.0961809158325195, "learning_rate": 0.0001995235139147314, "loss": 6.5435, "step": 1983 }, { "epoch": 0.09468924126904583, "grad_norm": 6.2828779220581055, "learning_rate": 0.00019952302622519818, "loss": 6.8209, "step": 1984 }, { "epoch": 0.09473696770113708, "grad_norm": 5.865154266357422, "learning_rate": 0.00019952253828681103, "loss": 6.9568, "step": 1985 }, { "epoch": 0.09478469413322833, "grad_norm": 6.766358375549316, "learning_rate": 0.00019952205009957118, "loss": 8.9635, "step": 1986 }, { "epoch": 0.09483242056531958, "grad_norm": 6.596803188323975, "learning_rate": 0.00019952156166347983, "loss": 7.5744, "step": 1987 }, { "epoch": 0.09488014699741085, "grad_norm": 5.979830265045166, "learning_rate": 0.00019952107297853825, "loss": 6.4619, "step": 1988 }, { "epoch": 0.0949278734295021, "grad_norm": 8.215448379516602, "learning_rate": 0.0001995205840447476, "loss": 9.5849, "step": 1989 }, { "epoch": 0.09497559986159335, "grad_norm": 5.243971824645996, "learning_rate": 0.00019952009486210912, "loss": 5.5483, "step": 1990 }, { "epoch": 0.0950233262936846, "grad_norm": 6.68547248840332, "learning_rate": 0.0001995196054306241, "loss": 7.9443, "step": 1991 }, { "epoch": 0.09507105272577586, "grad_norm": 8.142982482910156, "learning_rate": 0.00019951911575029363, "loss": 7.9948, "step": 1992 }, { "epoch": 0.09511877915786711, "grad_norm": 6.444507598876953, "learning_rate": 0.00019951862582111906, "loss": 6.4369, "step": 1993 }, { "epoch": 0.09516650558995836, "grad_norm": 7.554701805114746, "learning_rate": 0.00019951813564310155, "loss": 7.4948, "step": 1994 }, { "epoch": 0.09521423202204961, "grad_norm": 7.4234442710876465, "learning_rate": 0.00019951764521624235, "loss": 7.6692, "step": 1995 }, { "epoch": 0.09526195845414087, "grad_norm": 5.237441539764404, "learning_rate": 0.00019951715454054268, "loss": 6.6965, "step": 1996 }, { "epoch": 0.09530968488623212, "grad_norm": 13.602616310119629, "learning_rate": 0.00019951666361600373, "loss": 6.7896, "step": 1997 }, { "epoch": 0.09535741131832337, "grad_norm": 5.892328262329102, "learning_rate": 0.00019951617244262678, "loss": 6.1477, "step": 1998 }, { "epoch": 0.09540513775041462, "grad_norm": 4.428401470184326, "learning_rate": 0.00019951568102041307, "loss": 5.4074, "step": 1999 }, { "epoch": 0.09545286418250587, "grad_norm": 3.492969512939453, "learning_rate": 0.00019951518934936376, "loss": 3.9859, "step": 2000 }, { "epoch": 0.09545286418250587, "eval_loss": 1.605623722076416, "eval_runtime": 96.5966, "eval_samples_per_second": 8.727, "eval_steps_per_second": 4.369, "step": 2000 }, { "epoch": 0.09550059061459713, "grad_norm": 6.175414562225342, "learning_rate": 0.00019951469742948014, "loss": 6.5288, "step": 2001 }, { "epoch": 0.09554831704668838, "grad_norm": 3.6939139366149902, "learning_rate": 0.00019951420526076344, "loss": 4.756, "step": 2002 }, { "epoch": 0.09559604347877963, "grad_norm": 9.499626159667969, "learning_rate": 0.00019951371284321483, "loss": 6.3431, "step": 2003 }, { "epoch": 0.09564376991087088, "grad_norm": 6.353444576263428, "learning_rate": 0.00019951322017683557, "loss": 7.0988, "step": 2004 }, { "epoch": 0.09569149634296215, "grad_norm": 7.806198596954346, "learning_rate": 0.00019951272726162695, "loss": 7.7317, "step": 2005 }, { "epoch": 0.0957392227750534, "grad_norm": 7.30381965637207, "learning_rate": 0.00019951223409759013, "loss": 7.5044, "step": 2006 }, { "epoch": 0.09578694920714464, "grad_norm": 8.295380592346191, "learning_rate": 0.00019951174068472637, "loss": 10.5317, "step": 2007 }, { "epoch": 0.0958346756392359, "grad_norm": 5.466734886169434, "learning_rate": 0.0001995112470230369, "loss": 5.154, "step": 2008 }, { "epoch": 0.09588240207132716, "grad_norm": 7.218965530395508, "learning_rate": 0.00019951075311252297, "loss": 6.9654, "step": 2009 }, { "epoch": 0.09593012850341841, "grad_norm": 5.146548271179199, "learning_rate": 0.0001995102589531858, "loss": 6.8214, "step": 2010 }, { "epoch": 0.09597785493550966, "grad_norm": 5.846188068389893, "learning_rate": 0.0001995097645450266, "loss": 6.4063, "step": 2011 }, { "epoch": 0.0960255813676009, "grad_norm": 7.430274963378906, "learning_rate": 0.00019950926988804665, "loss": 6.7679, "step": 2012 }, { "epoch": 0.09607330779969217, "grad_norm": 6.761413097381592, "learning_rate": 0.00019950877498224717, "loss": 6.8477, "step": 2013 }, { "epoch": 0.09612103423178342, "grad_norm": 9.278961181640625, "learning_rate": 0.0001995082798276294, "loss": 6.6772, "step": 2014 }, { "epoch": 0.09616876066387467, "grad_norm": 6.324934482574463, "learning_rate": 0.0001995077844241946, "loss": 6.6185, "step": 2015 }, { "epoch": 0.09621648709596592, "grad_norm": 6.153281211853027, "learning_rate": 0.00019950728877194396, "loss": 6.4164, "step": 2016 }, { "epoch": 0.09626421352805718, "grad_norm": 7.29350471496582, "learning_rate": 0.00019950679287087874, "loss": 7.576, "step": 2017 }, { "epoch": 0.09631193996014843, "grad_norm": 5.512648582458496, "learning_rate": 0.00019950629672100018, "loss": 6.5212, "step": 2018 }, { "epoch": 0.09635966639223968, "grad_norm": 9.72251033782959, "learning_rate": 0.00019950580032230953, "loss": 7.4948, "step": 2019 }, { "epoch": 0.09640739282433093, "grad_norm": 5.254001140594482, "learning_rate": 0.00019950530367480803, "loss": 6.4021, "step": 2020 }, { "epoch": 0.0964551192564222, "grad_norm": 5.06203556060791, "learning_rate": 0.00019950480677849693, "loss": 5.5241, "step": 2021 }, { "epoch": 0.09650284568851344, "grad_norm": 5.806079864501953, "learning_rate": 0.00019950430963337743, "loss": 5.3812, "step": 2022 }, { "epoch": 0.0965505721206047, "grad_norm": 5.14645528793335, "learning_rate": 0.00019950381223945082, "loss": 6.2341, "step": 2023 }, { "epoch": 0.09659829855269594, "grad_norm": 6.698554515838623, "learning_rate": 0.00019950331459671832, "loss": 7.1556, "step": 2024 }, { "epoch": 0.0966460249847872, "grad_norm": 6.099356174468994, "learning_rate": 0.00019950281670518121, "loss": 5.2903, "step": 2025 }, { "epoch": 0.09669375141687846, "grad_norm": 5.152215480804443, "learning_rate": 0.0001995023185648407, "loss": 7.7764, "step": 2026 }, { "epoch": 0.0967414778489697, "grad_norm": 6.293337821960449, "learning_rate": 0.00019950182017569801, "loss": 7.0009, "step": 2027 }, { "epoch": 0.09678920428106096, "grad_norm": 6.451601982116699, "learning_rate": 0.00019950132153775442, "loss": 7.1087, "step": 2028 }, { "epoch": 0.0968369307131522, "grad_norm": 6.628176689147949, "learning_rate": 0.00019950082265101117, "loss": 6.388, "step": 2029 }, { "epoch": 0.09688465714524347, "grad_norm": 5.797245979309082, "learning_rate": 0.00019950032351546954, "loss": 6.0733, "step": 2030 }, { "epoch": 0.09693238357733472, "grad_norm": 6.363885402679443, "learning_rate": 0.00019949982413113073, "loss": 6.7292, "step": 2031 }, { "epoch": 0.09698011000942597, "grad_norm": 6.42457914352417, "learning_rate": 0.00019949932449799606, "loss": 9.8553, "step": 2032 }, { "epoch": 0.09702783644151722, "grad_norm": 6.3178911209106445, "learning_rate": 0.00019949882461606667, "loss": 7.8743, "step": 2033 }, { "epoch": 0.09707556287360848, "grad_norm": 5.86131477355957, "learning_rate": 0.00019949832448534386, "loss": 7.3333, "step": 2034 }, { "epoch": 0.09712328930569973, "grad_norm": 6.166070461273193, "learning_rate": 0.0001994978241058289, "loss": 6.6477, "step": 2035 }, { "epoch": 0.09717101573779098, "grad_norm": 5.363022804260254, "learning_rate": 0.0001994973234775231, "loss": 5.2925, "step": 2036 }, { "epoch": 0.09721874216988223, "grad_norm": 4.887767791748047, "learning_rate": 0.00019949682260042754, "loss": 5.3932, "step": 2037 }, { "epoch": 0.0972664686019735, "grad_norm": 6.711620807647705, "learning_rate": 0.00019949632147454364, "loss": 7.4646, "step": 2038 }, { "epoch": 0.09731419503406474, "grad_norm": 5.595021724700928, "learning_rate": 0.00019949582009987256, "loss": 5.8875, "step": 2039 }, { "epoch": 0.09736192146615599, "grad_norm": 5.815378189086914, "learning_rate": 0.0001994953184764156, "loss": 6.7538, "step": 2040 }, { "epoch": 0.09740964789824724, "grad_norm": 6.263134956359863, "learning_rate": 0.00019949481660417397, "loss": 5.0031, "step": 2041 }, { "epoch": 0.0974573743303385, "grad_norm": 5.8824591636657715, "learning_rate": 0.00019949431448314898, "loss": 5.6186, "step": 2042 }, { "epoch": 0.09750510076242976, "grad_norm": 5.6964192390441895, "learning_rate": 0.00019949381211334182, "loss": 7.5536, "step": 2043 }, { "epoch": 0.097552827194521, "grad_norm": 6.4975128173828125, "learning_rate": 0.0001994933094947538, "loss": 6.4729, "step": 2044 }, { "epoch": 0.09760055362661225, "grad_norm": 5.330821990966797, "learning_rate": 0.00019949280662738618, "loss": 6.2096, "step": 2045 }, { "epoch": 0.09764828005870352, "grad_norm": 5.2428436279296875, "learning_rate": 0.00019949230351124015, "loss": 5.8797, "step": 2046 }, { "epoch": 0.09769600649079477, "grad_norm": 4.79240083694458, "learning_rate": 0.00019949180014631703, "loss": 5.0223, "step": 2047 }, { "epoch": 0.09774373292288602, "grad_norm": 7.743186950683594, "learning_rate": 0.00019949129653261809, "loss": 8.6794, "step": 2048 }, { "epoch": 0.09779145935497727, "grad_norm": 6.97572135925293, "learning_rate": 0.00019949079267014453, "loss": 7.7431, "step": 2049 }, { "epoch": 0.09783918578706852, "grad_norm": 7.532467365264893, "learning_rate": 0.00019949028855889767, "loss": 7.8343, "step": 2050 }, { "epoch": 0.09788691221915978, "grad_norm": 5.043698787689209, "learning_rate": 0.00019948978419887873, "loss": 5.2176, "step": 2051 }, { "epoch": 0.09793463865125103, "grad_norm": 5.074033737182617, "learning_rate": 0.000199489279590089, "loss": 5.4742, "step": 2052 }, { "epoch": 0.09798236508334228, "grad_norm": 5.588888168334961, "learning_rate": 0.00019948877473252967, "loss": 6.0821, "step": 2053 }, { "epoch": 0.09803009151543353, "grad_norm": 8.261739730834961, "learning_rate": 0.00019948826962620211, "loss": 7.2484, "step": 2054 }, { "epoch": 0.09807781794752479, "grad_norm": 6.825746536254883, "learning_rate": 0.0001994877642711075, "loss": 7.4662, "step": 2055 }, { "epoch": 0.09812554437961604, "grad_norm": 4.981271266937256, "learning_rate": 0.00019948725866724716, "loss": 5.9239, "step": 2056 }, { "epoch": 0.09817327081170729, "grad_norm": 5.583786964416504, "learning_rate": 0.0001994867528146223, "loss": 6.12, "step": 2057 }, { "epoch": 0.09822099724379854, "grad_norm": 5.576869964599609, "learning_rate": 0.00019948624671323425, "loss": 5.2695, "step": 2058 }, { "epoch": 0.0982687236758898, "grad_norm": 5.743818283081055, "learning_rate": 0.00019948574036308424, "loss": 6.0713, "step": 2059 }, { "epoch": 0.09831645010798105, "grad_norm": 9.865864753723145, "learning_rate": 0.0001994852337641735, "loss": 7.1039, "step": 2060 }, { "epoch": 0.0983641765400723, "grad_norm": 7.344568252563477, "learning_rate": 0.00019948472691650333, "loss": 8.4153, "step": 2061 }, { "epoch": 0.09841190297216355, "grad_norm": 5.147101402282715, "learning_rate": 0.000199484219820075, "loss": 6.9787, "step": 2062 }, { "epoch": 0.09845962940425482, "grad_norm": 4.541990280151367, "learning_rate": 0.0001994837124748898, "loss": 5.766, "step": 2063 }, { "epoch": 0.09850735583634607, "grad_norm": 5.624611854553223, "learning_rate": 0.00019948320488094899, "loss": 6.2582, "step": 2064 }, { "epoch": 0.09855508226843732, "grad_norm": 5.886934757232666, "learning_rate": 0.00019948269703825376, "loss": 6.9005, "step": 2065 }, { "epoch": 0.09860280870052857, "grad_norm": 5.615740776062012, "learning_rate": 0.00019948218894680553, "loss": 8.559, "step": 2066 }, { "epoch": 0.09865053513261983, "grad_norm": 4.974883556365967, "learning_rate": 0.0001994816806066054, "loss": 5.7551, "step": 2067 }, { "epoch": 0.09869826156471108, "grad_norm": 4.981917381286621, "learning_rate": 0.00019948117201765478, "loss": 6.3539, "step": 2068 }, { "epoch": 0.09874598799680233, "grad_norm": 3.6366353034973145, "learning_rate": 0.0001994806631799549, "loss": 4.5631, "step": 2069 }, { "epoch": 0.09879371442889358, "grad_norm": 9.334731101989746, "learning_rate": 0.000199480154093507, "loss": 6.5904, "step": 2070 }, { "epoch": 0.09884144086098484, "grad_norm": 3.803725004196167, "learning_rate": 0.00019947964475831235, "loss": 4.1213, "step": 2071 }, { "epoch": 0.09888916729307609, "grad_norm": 7.621596336364746, "learning_rate": 0.00019947913517437228, "loss": 8.0286, "step": 2072 }, { "epoch": 0.09893689372516734, "grad_norm": 5.940497875213623, "learning_rate": 0.00019947862534168802, "loss": 6.0544, "step": 2073 }, { "epoch": 0.09898462015725859, "grad_norm": 5.207281589508057, "learning_rate": 0.00019947811526026087, "loss": 5.8265, "step": 2074 }, { "epoch": 0.09903234658934984, "grad_norm": 5.425878047943115, "learning_rate": 0.00019947760493009206, "loss": 6.9139, "step": 2075 }, { "epoch": 0.0990800730214411, "grad_norm": 5.618648052215576, "learning_rate": 0.00019947709435118291, "loss": 6.3085, "step": 2076 }, { "epoch": 0.09912779945353235, "grad_norm": 4.416502952575684, "learning_rate": 0.0001994765835235347, "loss": 4.3363, "step": 2077 }, { "epoch": 0.0991755258856236, "grad_norm": 7.534506797790527, "learning_rate": 0.00019947607244714865, "loss": 8.9734, "step": 2078 }, { "epoch": 0.09922325231771485, "grad_norm": 5.209206581115723, "learning_rate": 0.00019947556112202613, "loss": 5.7788, "step": 2079 }, { "epoch": 0.09927097874980612, "grad_norm": 5.866442680358887, "learning_rate": 0.0001994750495481683, "loss": 5.8366, "step": 2080 }, { "epoch": 0.09931870518189737, "grad_norm": 6.299295902252197, "learning_rate": 0.00019947453772557657, "loss": 6.2577, "step": 2081 }, { "epoch": 0.09936643161398862, "grad_norm": 6.085508823394775, "learning_rate": 0.00019947402565425215, "loss": 7.109, "step": 2082 }, { "epoch": 0.09941415804607986, "grad_norm": 7.182192325592041, "learning_rate": 0.00019947351333419634, "loss": 7.2283, "step": 2083 }, { "epoch": 0.09946188447817113, "grad_norm": 8.089944839477539, "learning_rate": 0.00019947300076541036, "loss": 8.4512, "step": 2084 }, { "epoch": 0.09950961091026238, "grad_norm": 4.942322731018066, "learning_rate": 0.00019947248794789557, "loss": 4.1914, "step": 2085 }, { "epoch": 0.09955733734235363, "grad_norm": 7.140364170074463, "learning_rate": 0.0001994719748816532, "loss": 6.6143, "step": 2086 }, { "epoch": 0.09960506377444488, "grad_norm": 4.702012062072754, "learning_rate": 0.00019947146156668456, "loss": 5.1365, "step": 2087 }, { "epoch": 0.09965279020653614, "grad_norm": 5.454802513122559, "learning_rate": 0.00019947094800299095, "loss": 5.274, "step": 2088 }, { "epoch": 0.09970051663862739, "grad_norm": 5.532293319702148, "learning_rate": 0.00019947043419057362, "loss": 6.0414, "step": 2089 }, { "epoch": 0.09974824307071864, "grad_norm": 8.889829635620117, "learning_rate": 0.00019946992012943388, "loss": 9.5325, "step": 2090 }, { "epoch": 0.09979596950280989, "grad_norm": 4.8973588943481445, "learning_rate": 0.00019946940581957298, "loss": 4.8087, "step": 2091 }, { "epoch": 0.09984369593490115, "grad_norm": 5.1472249031066895, "learning_rate": 0.00019946889126099224, "loss": 6.6267, "step": 2092 }, { "epoch": 0.0998914223669924, "grad_norm": 6.497985363006592, "learning_rate": 0.00019946837645369293, "loss": 6.9945, "step": 2093 }, { "epoch": 0.09993914879908365, "grad_norm": 5.6702985763549805, "learning_rate": 0.00019946786139767638, "loss": 5.914, "step": 2094 }, { "epoch": 0.0999868752311749, "grad_norm": 6.481780529022217, "learning_rate": 0.0001994673460929438, "loss": 6.2933, "step": 2095 }, { "epoch": 0.10003460166326616, "grad_norm": 6.166791915893555, "learning_rate": 0.00019946683053949653, "loss": 7.3421, "step": 2096 }, { "epoch": 0.10008232809535741, "grad_norm": 5.381234169006348, "learning_rate": 0.00019946631473733583, "loss": 6.4301, "step": 2097 }, { "epoch": 0.10013005452744866, "grad_norm": 6.188833236694336, "learning_rate": 0.000199465798686463, "loss": 6.0311, "step": 2098 }, { "epoch": 0.10017778095953991, "grad_norm": 6.2230753898620605, "learning_rate": 0.0001994652823868794, "loss": 8.7762, "step": 2099 }, { "epoch": 0.10022550739163116, "grad_norm": 4.895851135253906, "learning_rate": 0.00019946476583858623, "loss": 4.9342, "step": 2100 }, { "epoch": 0.10027323382372243, "grad_norm": 7.687273025512695, "learning_rate": 0.00019946424904158477, "loss": 7.0908, "step": 2101 }, { "epoch": 0.10032096025581368, "grad_norm": 7.630702018737793, "learning_rate": 0.0001994637319958764, "loss": 8.7147, "step": 2102 }, { "epoch": 0.10036868668790493, "grad_norm": 8.54239559173584, "learning_rate": 0.00019946321470146238, "loss": 7.34, "step": 2103 }, { "epoch": 0.10041641311999618, "grad_norm": 5.114380836486816, "learning_rate": 0.00019946269715834393, "loss": 6.422, "step": 2104 }, { "epoch": 0.10046413955208744, "grad_norm": 5.070735454559326, "learning_rate": 0.00019946217936652244, "loss": 5.4877, "step": 2105 }, { "epoch": 0.10051186598417869, "grad_norm": 6.23119592666626, "learning_rate": 0.0001994616613259992, "loss": 6.7999, "step": 2106 }, { "epoch": 0.10055959241626994, "grad_norm": 5.303956508636475, "learning_rate": 0.00019946114303677544, "loss": 6.5712, "step": 2107 }, { "epoch": 0.10060731884836119, "grad_norm": 6.202952861785889, "learning_rate": 0.00019946062449885247, "loss": 5.7484, "step": 2108 }, { "epoch": 0.10065504528045245, "grad_norm": 4.8048200607299805, "learning_rate": 0.0001994601057122316, "loss": 5.6592, "step": 2109 }, { "epoch": 0.1007027717125437, "grad_norm": 4.856410026550293, "learning_rate": 0.0001994595866769142, "loss": 5.738, "step": 2110 }, { "epoch": 0.10075049814463495, "grad_norm": 6.121587753295898, "learning_rate": 0.00019945906739290146, "loss": 6.0993, "step": 2111 }, { "epoch": 0.1007982245767262, "grad_norm": 6.890287399291992, "learning_rate": 0.0001994585478601947, "loss": 9.0528, "step": 2112 }, { "epoch": 0.10084595100881746, "grad_norm": 6.7539825439453125, "learning_rate": 0.00019945802807879524, "loss": 7.7864, "step": 2113 }, { "epoch": 0.10089367744090871, "grad_norm": 5.973094463348389, "learning_rate": 0.0001994575080487044, "loss": 7.6062, "step": 2114 }, { "epoch": 0.10094140387299996, "grad_norm": 6.893575191497803, "learning_rate": 0.00019945698776992345, "loss": 5.6347, "step": 2115 }, { "epoch": 0.10098913030509121, "grad_norm": 6.671051979064941, "learning_rate": 0.00019945646724245372, "loss": 8.4864, "step": 2116 }, { "epoch": 0.10103685673718248, "grad_norm": 5.2045464515686035, "learning_rate": 0.00019945594646629647, "loss": 5.7913, "step": 2117 }, { "epoch": 0.10108458316927373, "grad_norm": 4.425787448883057, "learning_rate": 0.00019945542544145303, "loss": 5.3547, "step": 2118 }, { "epoch": 0.10113230960136498, "grad_norm": 7.290124893188477, "learning_rate": 0.0001994549041679247, "loss": 7.322, "step": 2119 }, { "epoch": 0.10118003603345622, "grad_norm": 6.588491439819336, "learning_rate": 0.00019945438264571278, "loss": 7.923, "step": 2120 }, { "epoch": 0.10122776246554749, "grad_norm": 5.309898376464844, "learning_rate": 0.00019945386087481858, "loss": 6.6395, "step": 2121 }, { "epoch": 0.10127548889763874, "grad_norm": 5.862447261810303, "learning_rate": 0.0001994533388552434, "loss": 6.8281, "step": 2122 }, { "epoch": 0.10132321532972999, "grad_norm": 6.614034175872803, "learning_rate": 0.00019945281658698854, "loss": 6.0992, "step": 2123 }, { "epoch": 0.10137094176182124, "grad_norm": 7.342194080352783, "learning_rate": 0.00019945229407005528, "loss": 6.3548, "step": 2124 }, { "epoch": 0.10141866819391249, "grad_norm": 5.4828972816467285, "learning_rate": 0.000199451771304445, "loss": 7.2609, "step": 2125 }, { "epoch": 0.10146639462600375, "grad_norm": 5.1916375160217285, "learning_rate": 0.00019945124829015893, "loss": 6.0348, "step": 2126 }, { "epoch": 0.101514121058095, "grad_norm": 5.027164459228516, "learning_rate": 0.0001994507250271984, "loss": 5.6735, "step": 2127 }, { "epoch": 0.10156184749018625, "grad_norm": 6.761245250701904, "learning_rate": 0.00019945020151556478, "loss": 6.0822, "step": 2128 }, { "epoch": 0.1016095739222775, "grad_norm": 6.794353008270264, "learning_rate": 0.00019944967775525928, "loss": 7.4609, "step": 2129 }, { "epoch": 0.10165730035436876, "grad_norm": 6.08527946472168, "learning_rate": 0.0001994491537462833, "loss": 7.3312, "step": 2130 }, { "epoch": 0.10170502678646001, "grad_norm": 5.508772373199463, "learning_rate": 0.00019944862948863807, "loss": 5.5693, "step": 2131 }, { "epoch": 0.10175275321855126, "grad_norm": 7.736018180847168, "learning_rate": 0.00019944810498232496, "loss": 5.0115, "step": 2132 }, { "epoch": 0.10180047965064251, "grad_norm": 6.23877477645874, "learning_rate": 0.00019944758022734526, "loss": 6.9576, "step": 2133 }, { "epoch": 0.10184820608273377, "grad_norm": 6.963664531707764, "learning_rate": 0.0001994470552237003, "loss": 7.1944, "step": 2134 }, { "epoch": 0.10189593251482502, "grad_norm": 6.2649335861206055, "learning_rate": 0.00019944652997139133, "loss": 6.067, "step": 2135 }, { "epoch": 0.10194365894691627, "grad_norm": 6.799932956695557, "learning_rate": 0.00019944600447041974, "loss": 6.3921, "step": 2136 }, { "epoch": 0.10199138537900752, "grad_norm": 4.08432149887085, "learning_rate": 0.0001994454787207868, "loss": 6.538, "step": 2137 }, { "epoch": 0.10203911181109879, "grad_norm": 9.47099494934082, "learning_rate": 0.00019944495272249387, "loss": 9.0004, "step": 2138 }, { "epoch": 0.10208683824319004, "grad_norm": 6.31077241897583, "learning_rate": 0.0001994444264755422, "loss": 7.2668, "step": 2139 }, { "epoch": 0.10213456467528129, "grad_norm": 4.733635902404785, "learning_rate": 0.00019944389997993316, "loss": 4.791, "step": 2140 }, { "epoch": 0.10218229110737254, "grad_norm": 6.712369918823242, "learning_rate": 0.000199443373235668, "loss": 6.9912, "step": 2141 }, { "epoch": 0.1022300175394638, "grad_norm": 4.643430233001709, "learning_rate": 0.00019944284624274814, "loss": 4.4897, "step": 2142 }, { "epoch": 0.10227774397155505, "grad_norm": 7.026204586029053, "learning_rate": 0.00019944231900117482, "loss": 7.5701, "step": 2143 }, { "epoch": 0.1023254704036463, "grad_norm": 4.825094699859619, "learning_rate": 0.00019944179151094938, "loss": 5.7908, "step": 2144 }, { "epoch": 0.10237319683573755, "grad_norm": 6.208577632904053, "learning_rate": 0.00019944126377207316, "loss": 7.3274, "step": 2145 }, { "epoch": 0.10242092326782881, "grad_norm": 6.465273857116699, "learning_rate": 0.0001994407357845474, "loss": 5.8288, "step": 2146 }, { "epoch": 0.10246864969992006, "grad_norm": 6.087522983551025, "learning_rate": 0.0001994402075483735, "loss": 5.9612, "step": 2147 }, { "epoch": 0.10251637613201131, "grad_norm": 6.790709018707275, "learning_rate": 0.0001994396790635528, "loss": 7.7314, "step": 2148 }, { "epoch": 0.10256410256410256, "grad_norm": 5.373871326446533, "learning_rate": 0.00019943915033008653, "loss": 5.8133, "step": 2149 }, { "epoch": 0.10261182899619381, "grad_norm": 7.719859600067139, "learning_rate": 0.00019943862134797605, "loss": 8.025, "step": 2150 }, { "epoch": 0.10265955542828507, "grad_norm": 6.4228010177612305, "learning_rate": 0.00019943809211722276, "loss": 6.3181, "step": 2151 }, { "epoch": 0.10270728186037632, "grad_norm": 5.982853412628174, "learning_rate": 0.00019943756263782786, "loss": 6.9432, "step": 2152 }, { "epoch": 0.10275500829246757, "grad_norm": 5.685730934143066, "learning_rate": 0.00019943703290979276, "loss": 6.5969, "step": 2153 }, { "epoch": 0.10280273472455882, "grad_norm": 6.125352382659912, "learning_rate": 0.00019943650293311875, "loss": 7.0158, "step": 2154 }, { "epoch": 0.10285046115665009, "grad_norm": 6.5551557540893555, "learning_rate": 0.00019943597270780713, "loss": 6.8842, "step": 2155 }, { "epoch": 0.10289818758874134, "grad_norm": 6.908018589019775, "learning_rate": 0.0001994354422338593, "loss": 6.9893, "step": 2156 }, { "epoch": 0.10294591402083259, "grad_norm": 6.3062543869018555, "learning_rate": 0.00019943491151127652, "loss": 5.89, "step": 2157 }, { "epoch": 0.10299364045292383, "grad_norm": 5.956306457519531, "learning_rate": 0.00019943438054006017, "loss": 6.4246, "step": 2158 }, { "epoch": 0.1030413668850151, "grad_norm": 5.519827842712402, "learning_rate": 0.00019943384932021152, "loss": 5.9419, "step": 2159 }, { "epoch": 0.10308909331710635, "grad_norm": 8.265746116638184, "learning_rate": 0.00019943331785173194, "loss": 8.0928, "step": 2160 }, { "epoch": 0.1031368197491976, "grad_norm": 6.37744140625, "learning_rate": 0.00019943278613462274, "loss": 7.7536, "step": 2161 }, { "epoch": 0.10318454618128885, "grad_norm": 6.88592529296875, "learning_rate": 0.00019943225416888528, "loss": 8.2723, "step": 2162 }, { "epoch": 0.10323227261338011, "grad_norm": 6.681483745574951, "learning_rate": 0.00019943172195452085, "loss": 6.5794, "step": 2163 }, { "epoch": 0.10327999904547136, "grad_norm": 4.668801784515381, "learning_rate": 0.00019943118949153078, "loss": 5.0583, "step": 2164 }, { "epoch": 0.10332772547756261, "grad_norm": 6.50665283203125, "learning_rate": 0.0001994306567799164, "loss": 6.9696, "step": 2165 }, { "epoch": 0.10337545190965386, "grad_norm": 5.758954048156738, "learning_rate": 0.0001994301238196791, "loss": 6.8225, "step": 2166 }, { "epoch": 0.10342317834174512, "grad_norm": 6.344405651092529, "learning_rate": 0.00019942959061082017, "loss": 6.8602, "step": 2167 }, { "epoch": 0.10347090477383637, "grad_norm": 4.859897136688232, "learning_rate": 0.00019942905715334092, "loss": 5.5967, "step": 2168 }, { "epoch": 0.10351863120592762, "grad_norm": 6.460178375244141, "learning_rate": 0.00019942852344724272, "loss": 6.411, "step": 2169 }, { "epoch": 0.10356635763801887, "grad_norm": 6.044944763183594, "learning_rate": 0.0001994279894925269, "loss": 6.2701, "step": 2170 }, { "epoch": 0.10361408407011014, "grad_norm": 6.786638259887695, "learning_rate": 0.00019942745528919479, "loss": 9.3, "step": 2171 }, { "epoch": 0.10366181050220138, "grad_norm": 5.898018836975098, "learning_rate": 0.00019942692083724773, "loss": 6.6145, "step": 2172 }, { "epoch": 0.10370953693429263, "grad_norm": 6.186138153076172, "learning_rate": 0.00019942638613668702, "loss": 7.112, "step": 2173 }, { "epoch": 0.10375726336638388, "grad_norm": 6.5253167152404785, "learning_rate": 0.00019942585118751404, "loss": 7.813, "step": 2174 }, { "epoch": 0.10380498979847515, "grad_norm": 5.430375099182129, "learning_rate": 0.00019942531598973013, "loss": 7.708, "step": 2175 }, { "epoch": 0.1038527162305664, "grad_norm": 4.2034783363342285, "learning_rate": 0.0001994247805433366, "loss": 5.349, "step": 2176 }, { "epoch": 0.10390044266265765, "grad_norm": 6.661571025848389, "learning_rate": 0.00019942424484833482, "loss": 6.2961, "step": 2177 }, { "epoch": 0.1039481690947489, "grad_norm": 5.960770606994629, "learning_rate": 0.00019942370890472613, "loss": 6.9406, "step": 2178 }, { "epoch": 0.10399589552684015, "grad_norm": 4.570981979370117, "learning_rate": 0.0001994231727125118, "loss": 5.307, "step": 2179 }, { "epoch": 0.10404362195893141, "grad_norm": 6.911619663238525, "learning_rate": 0.00019942263627169325, "loss": 6.447, "step": 2180 }, { "epoch": 0.10409134839102266, "grad_norm": 3.8423147201538086, "learning_rate": 0.0001994220995822718, "loss": 4.22, "step": 2181 }, { "epoch": 0.10413907482311391, "grad_norm": 8.009982109069824, "learning_rate": 0.0001994215626442488, "loss": 7.8943, "step": 2182 }, { "epoch": 0.10418680125520516, "grad_norm": 6.329585075378418, "learning_rate": 0.00019942102545762556, "loss": 5.2294, "step": 2183 }, { "epoch": 0.10423452768729642, "grad_norm": 5.874599456787109, "learning_rate": 0.00019942048802240342, "loss": 5.9641, "step": 2184 }, { "epoch": 0.10428225411938767, "grad_norm": 4.815300464630127, "learning_rate": 0.00019941995033858376, "loss": 5.7496, "step": 2185 }, { "epoch": 0.10432998055147892, "grad_norm": 7.155482292175293, "learning_rate": 0.00019941941240616794, "loss": 6.9402, "step": 2186 }, { "epoch": 0.10437770698357017, "grad_norm": 5.28063440322876, "learning_rate": 0.00019941887422515725, "loss": 6.2889, "step": 2187 }, { "epoch": 0.10442543341566143, "grad_norm": 5.873574733734131, "learning_rate": 0.00019941833579555307, "loss": 5.9383, "step": 2188 }, { "epoch": 0.10447315984775268, "grad_norm": 4.884374141693115, "learning_rate": 0.00019941779711735673, "loss": 5.8587, "step": 2189 }, { "epoch": 0.10452088627984393, "grad_norm": 6.578730583190918, "learning_rate": 0.0001994172581905696, "loss": 7.5547, "step": 2190 }, { "epoch": 0.10456861271193518, "grad_norm": 7.629443168640137, "learning_rate": 0.00019941671901519298, "loss": 7.4847, "step": 2191 }, { "epoch": 0.10461633914402645, "grad_norm": 5.76410436630249, "learning_rate": 0.00019941617959122828, "loss": 5.1354, "step": 2192 }, { "epoch": 0.1046640655761177, "grad_norm": 4.090086936950684, "learning_rate": 0.0001994156399186768, "loss": 4.3761, "step": 2193 }, { "epoch": 0.10471179200820895, "grad_norm": 5.655608177185059, "learning_rate": 0.00019941509999753994, "loss": 6.8013, "step": 2194 }, { "epoch": 0.1047595184403002, "grad_norm": 4.7437920570373535, "learning_rate": 0.00019941455982781897, "loss": 5.6816, "step": 2195 }, { "epoch": 0.10480724487239146, "grad_norm": 7.205836772918701, "learning_rate": 0.00019941401940951533, "loss": 7.0836, "step": 2196 }, { "epoch": 0.10485497130448271, "grad_norm": 6.238199234008789, "learning_rate": 0.00019941347874263032, "loss": 7.8366, "step": 2197 }, { "epoch": 0.10490269773657396, "grad_norm": 6.134603500366211, "learning_rate": 0.0001994129378271653, "loss": 6.6753, "step": 2198 }, { "epoch": 0.10495042416866521, "grad_norm": 7.363043308258057, "learning_rate": 0.00019941239666312164, "loss": 7.2709, "step": 2199 }, { "epoch": 0.10499815060075647, "grad_norm": 5.145401477813721, "learning_rate": 0.00019941185525050066, "loss": 5.4473, "step": 2200 }, { "epoch": 0.10499815060075647, "eval_loss": 1.602339506149292, "eval_runtime": 96.5325, "eval_samples_per_second": 8.733, "eval_steps_per_second": 4.372, "step": 2200 }, { "epoch": 0.10504587703284772, "grad_norm": 5.450164318084717, "learning_rate": 0.00019941131358930373, "loss": 6.1929, "step": 2201 }, { "epoch": 0.10509360346493897, "grad_norm": 5.591408729553223, "learning_rate": 0.00019941077167953225, "loss": 6.1391, "step": 2202 }, { "epoch": 0.10514132989703022, "grad_norm": 5.992808818817139, "learning_rate": 0.00019941022952118747, "loss": 4.8693, "step": 2203 }, { "epoch": 0.10518905632912147, "grad_norm": 4.80279016494751, "learning_rate": 0.0001994096871142708, "loss": 5.2619, "step": 2204 }, { "epoch": 0.10523678276121273, "grad_norm": 5.691819667816162, "learning_rate": 0.00019940914445878367, "loss": 5.3011, "step": 2205 }, { "epoch": 0.10528450919330398, "grad_norm": 5.115384578704834, "learning_rate": 0.00019940860155472732, "loss": 4.359, "step": 2206 }, { "epoch": 0.10533223562539523, "grad_norm": 5.720730781555176, "learning_rate": 0.00019940805840210318, "loss": 5.9681, "step": 2207 }, { "epoch": 0.10537996205748648, "grad_norm": 4.984778881072998, "learning_rate": 0.00019940751500091257, "loss": 5.6348, "step": 2208 }, { "epoch": 0.10542768848957774, "grad_norm": 5.787764072418213, "learning_rate": 0.00019940697135115688, "loss": 6.2302, "step": 2209 }, { "epoch": 0.105475414921669, "grad_norm": 5.652523040771484, "learning_rate": 0.00019940642745283746, "loss": 5.879, "step": 2210 }, { "epoch": 0.10552314135376024, "grad_norm": 7.101466655731201, "learning_rate": 0.00019940588330595563, "loss": 7.963, "step": 2211 }, { "epoch": 0.1055708677858515, "grad_norm": 6.2562713623046875, "learning_rate": 0.00019940533891051282, "loss": 6.9716, "step": 2212 }, { "epoch": 0.10561859421794276, "grad_norm": 5.532405376434326, "learning_rate": 0.00019940479426651034, "loss": 6.9063, "step": 2213 }, { "epoch": 0.105666320650034, "grad_norm": 6.955551624298096, "learning_rate": 0.00019940424937394958, "loss": 6.0961, "step": 2214 }, { "epoch": 0.10571404708212526, "grad_norm": 8.936443328857422, "learning_rate": 0.00019940370423283187, "loss": 8.8187, "step": 2215 }, { "epoch": 0.1057617735142165, "grad_norm": 5.99800443649292, "learning_rate": 0.00019940315884315864, "loss": 6.5717, "step": 2216 }, { "epoch": 0.10580949994630777, "grad_norm": 6.92852783203125, "learning_rate": 0.00019940261320493114, "loss": 6.509, "step": 2217 }, { "epoch": 0.10585722637839902, "grad_norm": 6.274646282196045, "learning_rate": 0.00019940206731815086, "loss": 7.7704, "step": 2218 }, { "epoch": 0.10590495281049027, "grad_norm": 6.350011348724365, "learning_rate": 0.00019940152118281908, "loss": 7.1323, "step": 2219 }, { "epoch": 0.10595267924258152, "grad_norm": 5.615455150604248, "learning_rate": 0.00019940097479893718, "loss": 6.8173, "step": 2220 }, { "epoch": 0.10600040567467278, "grad_norm": 6.136722087860107, "learning_rate": 0.00019940042816650656, "loss": 7.6495, "step": 2221 }, { "epoch": 0.10604813210676403, "grad_norm": 6.436617374420166, "learning_rate": 0.00019939988128552853, "loss": 6.2319, "step": 2222 }, { "epoch": 0.10609585853885528, "grad_norm": 5.910126686096191, "learning_rate": 0.00019939933415600452, "loss": 6.2024, "step": 2223 }, { "epoch": 0.10614358497094653, "grad_norm": 6.70936393737793, "learning_rate": 0.0001993987867779359, "loss": 7.4669, "step": 2224 }, { "epoch": 0.1061913114030378, "grad_norm": 6.343003273010254, "learning_rate": 0.00019939823915132395, "loss": 6.854, "step": 2225 }, { "epoch": 0.10623903783512904, "grad_norm": 5.298125743865967, "learning_rate": 0.00019939769127617015, "loss": 5.9439, "step": 2226 }, { "epoch": 0.1062867642672203, "grad_norm": 5.3317108154296875, "learning_rate": 0.0001993971431524758, "loss": 5.5212, "step": 2227 }, { "epoch": 0.10633449069931154, "grad_norm": 7.667644500732422, "learning_rate": 0.00019939659478024227, "loss": 8.6898, "step": 2228 }, { "epoch": 0.10638221713140279, "grad_norm": 5.852370262145996, "learning_rate": 0.00019939604615947095, "loss": 5.7408, "step": 2229 }, { "epoch": 0.10642994356349406, "grad_norm": 5.767739772796631, "learning_rate": 0.00019939549729016326, "loss": 6.2188, "step": 2230 }, { "epoch": 0.1064776699955853, "grad_norm": 24.669878005981445, "learning_rate": 0.00019939494817232048, "loss": 5.4811, "step": 2231 }, { "epoch": 0.10652539642767656, "grad_norm": 6.24949836730957, "learning_rate": 0.00019939439880594403, "loss": 6.7981, "step": 2232 }, { "epoch": 0.1065731228597678, "grad_norm": 4.919851303100586, "learning_rate": 0.0001993938491910353, "loss": 5.7305, "step": 2233 }, { "epoch": 0.10662084929185907, "grad_norm": 4.707479000091553, "learning_rate": 0.00019939329932759566, "loss": 4.4757, "step": 2234 }, { "epoch": 0.10666857572395032, "grad_norm": 5.307102680206299, "learning_rate": 0.00019939274921562645, "loss": 6.0117, "step": 2235 }, { "epoch": 0.10671630215604157, "grad_norm": 5.719334602355957, "learning_rate": 0.00019939219885512906, "loss": 5.803, "step": 2236 }, { "epoch": 0.10676402858813282, "grad_norm": 4.507400035858154, "learning_rate": 0.00019939164824610487, "loss": 4.8236, "step": 2237 }, { "epoch": 0.10681175502022408, "grad_norm": 5.215142726898193, "learning_rate": 0.00019939109738855527, "loss": 6.0824, "step": 2238 }, { "epoch": 0.10685948145231533, "grad_norm": 5.201887130737305, "learning_rate": 0.00019939054628248163, "loss": 5.8014, "step": 2239 }, { "epoch": 0.10690720788440658, "grad_norm": 5.401710510253906, "learning_rate": 0.00019938999492788532, "loss": 7.0493, "step": 2240 }, { "epoch": 0.10695493431649783, "grad_norm": 6.155087471008301, "learning_rate": 0.0001993894433247677, "loss": 7.3769, "step": 2241 }, { "epoch": 0.10700266074858909, "grad_norm": 5.142300128936768, "learning_rate": 0.0001993888914731302, "loss": 5.3895, "step": 2242 }, { "epoch": 0.10705038718068034, "grad_norm": 8.86355972290039, "learning_rate": 0.00019938833937297416, "loss": 8.0173, "step": 2243 }, { "epoch": 0.10709811361277159, "grad_norm": 5.095742225646973, "learning_rate": 0.00019938778702430096, "loss": 5.1925, "step": 2244 }, { "epoch": 0.10714584004486284, "grad_norm": 6.408175945281982, "learning_rate": 0.00019938723442711203, "loss": 6.9968, "step": 2245 }, { "epoch": 0.1071935664769541, "grad_norm": 7.479118824005127, "learning_rate": 0.0001993866815814087, "loss": 7.813, "step": 2246 }, { "epoch": 0.10724129290904535, "grad_norm": 5.595974922180176, "learning_rate": 0.00019938612848719237, "loss": 5.5132, "step": 2247 }, { "epoch": 0.1072890193411366, "grad_norm": 5.417313575744629, "learning_rate": 0.0001993855751444644, "loss": 5.7089, "step": 2248 }, { "epoch": 0.10733674577322785, "grad_norm": 5.91009521484375, "learning_rate": 0.0001993850215532262, "loss": 6.8627, "step": 2249 }, { "epoch": 0.10738447220531912, "grad_norm": 6.116936683654785, "learning_rate": 0.00019938446771347915, "loss": 7.3076, "step": 2250 }, { "epoch": 0.10743219863741037, "grad_norm": 5.593759059906006, "learning_rate": 0.00019938391362522468, "loss": 6.352, "step": 2251 }, { "epoch": 0.10747992506950162, "grad_norm": 6.780364513397217, "learning_rate": 0.00019938335928846408, "loss": 7.6296, "step": 2252 }, { "epoch": 0.10752765150159287, "grad_norm": 7.030237674713135, "learning_rate": 0.00019938280470319878, "loss": 7.8474, "step": 2253 }, { "epoch": 0.10757537793368412, "grad_norm": 8.07816219329834, "learning_rate": 0.00019938224986943022, "loss": 7.5247, "step": 2254 }, { "epoch": 0.10762310436577538, "grad_norm": 7.469743728637695, "learning_rate": 0.0001993816947871597, "loss": 8.2702, "step": 2255 }, { "epoch": 0.10767083079786663, "grad_norm": 5.907149791717529, "learning_rate": 0.00019938113945638865, "loss": 6.131, "step": 2256 }, { "epoch": 0.10771855722995788, "grad_norm": 6.085369110107422, "learning_rate": 0.00019938058387711845, "loss": 5.569, "step": 2257 }, { "epoch": 0.10776628366204913, "grad_norm": 5.916553497314453, "learning_rate": 0.00019938002804935053, "loss": 6.8827, "step": 2258 }, { "epoch": 0.10781401009414039, "grad_norm": 5.946355819702148, "learning_rate": 0.00019937947197308623, "loss": 6.3029, "step": 2259 }, { "epoch": 0.10786173652623164, "grad_norm": 5.058459281921387, "learning_rate": 0.00019937891564832694, "loss": 6.0389, "step": 2260 }, { "epoch": 0.10790946295832289, "grad_norm": 5.020669460296631, "learning_rate": 0.00019937835907507404, "loss": 7.4807, "step": 2261 }, { "epoch": 0.10795718939041414, "grad_norm": 5.872036933898926, "learning_rate": 0.000199377802253329, "loss": 7.3551, "step": 2262 }, { "epoch": 0.1080049158225054, "grad_norm": 6.965964317321777, "learning_rate": 0.00019937724518309316, "loss": 5.4708, "step": 2263 }, { "epoch": 0.10805264225459665, "grad_norm": 5.607543468475342, "learning_rate": 0.00019937668786436787, "loss": 7.897, "step": 2264 }, { "epoch": 0.1081003686866879, "grad_norm": 5.302212715148926, "learning_rate": 0.0001993761302971546, "loss": 5.4608, "step": 2265 }, { "epoch": 0.10814809511877915, "grad_norm": 4.950501918792725, "learning_rate": 0.00019937557248145472, "loss": 5.9577, "step": 2266 }, { "epoch": 0.10819582155087042, "grad_norm": 6.807582855224609, "learning_rate": 0.0001993750144172696, "loss": 9.1126, "step": 2267 }, { "epoch": 0.10824354798296167, "grad_norm": 4.571179389953613, "learning_rate": 0.00019937445610460067, "loss": 5.1932, "step": 2268 }, { "epoch": 0.10829127441505292, "grad_norm": 6.727152347564697, "learning_rate": 0.0001993738975434493, "loss": 7.5464, "step": 2269 }, { "epoch": 0.10833900084714417, "grad_norm": 6.7499871253967285, "learning_rate": 0.00019937333873381685, "loss": 7.0233, "step": 2270 }, { "epoch": 0.10838672727923543, "grad_norm": 6.338409900665283, "learning_rate": 0.0001993727796757048, "loss": 7.9402, "step": 2271 }, { "epoch": 0.10843445371132668, "grad_norm": 5.187378406524658, "learning_rate": 0.0001993722203691145, "loss": 6.3124, "step": 2272 }, { "epoch": 0.10848218014341793, "grad_norm": 6.8389997482299805, "learning_rate": 0.00019937166081404736, "loss": 5.4769, "step": 2273 }, { "epoch": 0.10852990657550918, "grad_norm": 6.336965560913086, "learning_rate": 0.00019937110101050478, "loss": 7.4738, "step": 2274 }, { "epoch": 0.10857763300760044, "grad_norm": 5.328413963317871, "learning_rate": 0.00019937054095848814, "loss": 5.5843, "step": 2275 }, { "epoch": 0.10862535943969169, "grad_norm": 6.723316669464111, "learning_rate": 0.0001993699806579989, "loss": 7.3747, "step": 2276 }, { "epoch": 0.10867308587178294, "grad_norm": 8.220144271850586, "learning_rate": 0.00019936942010903837, "loss": 7.368, "step": 2277 }, { "epoch": 0.10872081230387419, "grad_norm": 5.896597862243652, "learning_rate": 0.00019936885931160801, "loss": 7.4897, "step": 2278 }, { "epoch": 0.10876853873596544, "grad_norm": 5.127669334411621, "learning_rate": 0.00019936829826570923, "loss": 5.2628, "step": 2279 }, { "epoch": 0.1088162651680567, "grad_norm": 5.313838481903076, "learning_rate": 0.0001993677369713434, "loss": 5.643, "step": 2280 }, { "epoch": 0.10886399160014795, "grad_norm": 5.881006240844727, "learning_rate": 0.00019936717542851199, "loss": 6.6248, "step": 2281 }, { "epoch": 0.1089117180322392, "grad_norm": 4.9843244552612305, "learning_rate": 0.00019936661363721626, "loss": 5.4418, "step": 2282 }, { "epoch": 0.10895944446433045, "grad_norm": 6.7661943435668945, "learning_rate": 0.00019936605159745778, "loss": 7.1359, "step": 2283 }, { "epoch": 0.10900717089642172, "grad_norm": 4.885817527770996, "learning_rate": 0.00019936548930923786, "loss": 6.0781, "step": 2284 }, { "epoch": 0.10905489732851296, "grad_norm": 6.349702835083008, "learning_rate": 0.00019936492677255792, "loss": 6.7283, "step": 2285 }, { "epoch": 0.10910262376060421, "grad_norm": 7.6741461753845215, "learning_rate": 0.00019936436398741939, "loss": 6.876, "step": 2286 }, { "epoch": 0.10915035019269546, "grad_norm": 4.658005714416504, "learning_rate": 0.00019936380095382365, "loss": 5.0804, "step": 2287 }, { "epoch": 0.10919807662478673, "grad_norm": 6.007474899291992, "learning_rate": 0.00019936323767177214, "loss": 5.9884, "step": 2288 }, { "epoch": 0.10924580305687798, "grad_norm": 7.069979190826416, "learning_rate": 0.00019936267414126621, "loss": 5.87, "step": 2289 }, { "epoch": 0.10929352948896923, "grad_norm": 7.070275783538818, "learning_rate": 0.00019936211036230734, "loss": 7.335, "step": 2290 }, { "epoch": 0.10934125592106048, "grad_norm": 5.51244592666626, "learning_rate": 0.0001993615463348969, "loss": 5.12, "step": 2291 }, { "epoch": 0.10938898235315174, "grad_norm": 6.348170757293701, "learning_rate": 0.0001993609820590363, "loss": 6.1544, "step": 2292 }, { "epoch": 0.10943670878524299, "grad_norm": 7.591483116149902, "learning_rate": 0.00019936041753472697, "loss": 7.2094, "step": 2293 }, { "epoch": 0.10948443521733424, "grad_norm": 6.927452564239502, "learning_rate": 0.0001993598527619703, "loss": 7.4317, "step": 2294 }, { "epoch": 0.10953216164942549, "grad_norm": 6.805290699005127, "learning_rate": 0.00019935928774076774, "loss": 7.6707, "step": 2295 }, { "epoch": 0.10957988808151675, "grad_norm": 4.375084400177002, "learning_rate": 0.00019935872247112065, "loss": 4.3418, "step": 2296 }, { "epoch": 0.109627614513608, "grad_norm": 5.1622633934021, "learning_rate": 0.00019935815695303047, "loss": 6.8221, "step": 2297 }, { "epoch": 0.10967534094569925, "grad_norm": 4.2949066162109375, "learning_rate": 0.00019935759118649862, "loss": 6.0277, "step": 2298 }, { "epoch": 0.1097230673777905, "grad_norm": 7.153820037841797, "learning_rate": 0.00019935702517152648, "loss": 6.82, "step": 2299 }, { "epoch": 0.10977079380988176, "grad_norm": 5.745052814483643, "learning_rate": 0.0001993564589081155, "loss": 6.2553, "step": 2300 }, { "epoch": 0.10981852024197301, "grad_norm": 6.87201452255249, "learning_rate": 0.00019935589239626712, "loss": 7.7752, "step": 2301 }, { "epoch": 0.10986624667406426, "grad_norm": 5.777369022369385, "learning_rate": 0.00019935532563598272, "loss": 6.3414, "step": 2302 }, { "epoch": 0.10991397310615551, "grad_norm": 7.546662330627441, "learning_rate": 0.0001993547586272637, "loss": 8.5535, "step": 2303 }, { "epoch": 0.10996169953824676, "grad_norm": 6.240353584289551, "learning_rate": 0.00019935419137011152, "loss": 5.8689, "step": 2304 }, { "epoch": 0.11000942597033803, "grad_norm": 6.683908939361572, "learning_rate": 0.00019935362386452757, "loss": 6.5744, "step": 2305 }, { "epoch": 0.11005715240242928, "grad_norm": 6.688528060913086, "learning_rate": 0.0001993530561105133, "loss": 6.5041, "step": 2306 }, { "epoch": 0.11010487883452053, "grad_norm": 6.669954776763916, "learning_rate": 0.00019935248810807007, "loss": 8.4592, "step": 2307 }, { "epoch": 0.11015260526661177, "grad_norm": 5.248333930969238, "learning_rate": 0.00019935191985719937, "loss": 5.163, "step": 2308 }, { "epoch": 0.11020033169870304, "grad_norm": 6.443370342254639, "learning_rate": 0.00019935135135790258, "loss": 7.1714, "step": 2309 }, { "epoch": 0.11024805813079429, "grad_norm": 5.921457290649414, "learning_rate": 0.0001993507826101811, "loss": 6.3727, "step": 2310 }, { "epoch": 0.11029578456288554, "grad_norm": 5.955488204956055, "learning_rate": 0.00019935021361403642, "loss": 6.4379, "step": 2311 }, { "epoch": 0.11034351099497679, "grad_norm": 5.717413902282715, "learning_rate": 0.00019934964436946987, "loss": 6.7091, "step": 2312 }, { "epoch": 0.11039123742706805, "grad_norm": 6.291435718536377, "learning_rate": 0.00019934907487648298, "loss": 7.9312, "step": 2313 }, { "epoch": 0.1104389638591593, "grad_norm": 6.957582950592041, "learning_rate": 0.00019934850513507712, "loss": 8.0919, "step": 2314 }, { "epoch": 0.11048669029125055, "grad_norm": 7.492276191711426, "learning_rate": 0.00019934793514525371, "loss": 7.1227, "step": 2315 }, { "epoch": 0.1105344167233418, "grad_norm": 6.058623313903809, "learning_rate": 0.00019934736490701417, "loss": 6.6442, "step": 2316 }, { "epoch": 0.11058214315543306, "grad_norm": 7.71964693069458, "learning_rate": 0.00019934679442035997, "loss": 8.5892, "step": 2317 }, { "epoch": 0.11062986958752431, "grad_norm": 6.302182197570801, "learning_rate": 0.00019934622368529244, "loss": 6.2054, "step": 2318 }, { "epoch": 0.11067759601961556, "grad_norm": 5.68353271484375, "learning_rate": 0.00019934565270181317, "loss": 5.8535, "step": 2319 }, { "epoch": 0.11072532245170681, "grad_norm": 13.180729866027832, "learning_rate": 0.00019934508146992341, "loss": 5.0452, "step": 2320 }, { "epoch": 0.11077304888379808, "grad_norm": 6.95402193069458, "learning_rate": 0.00019934450998962468, "loss": 6.5574, "step": 2321 }, { "epoch": 0.11082077531588932, "grad_norm": 4.98831033706665, "learning_rate": 0.00019934393826091841, "loss": 4.8938, "step": 2322 }, { "epoch": 0.11086850174798057, "grad_norm": 6.9891510009765625, "learning_rate": 0.000199343366283806, "loss": 5.9829, "step": 2323 }, { "epoch": 0.11091622818007182, "grad_norm": 5.644136905670166, "learning_rate": 0.00019934279405828893, "loss": 6.7659, "step": 2324 }, { "epoch": 0.11096395461216309, "grad_norm": 5.159231662750244, "learning_rate": 0.00019934222158436856, "loss": 5.5547, "step": 2325 }, { "epoch": 0.11101168104425434, "grad_norm": 5.834488391876221, "learning_rate": 0.0001993416488620464, "loss": 7.0846, "step": 2326 }, { "epoch": 0.11105940747634559, "grad_norm": 6.442501544952393, "learning_rate": 0.0001993410758913238, "loss": 6.5023, "step": 2327 }, { "epoch": 0.11110713390843684, "grad_norm": 6.493159294128418, "learning_rate": 0.00019934050267220226, "loss": 6.8592, "step": 2328 }, { "epoch": 0.11115486034052809, "grad_norm": 5.3274712562561035, "learning_rate": 0.00019933992920468318, "loss": 5.79, "step": 2329 }, { "epoch": 0.11120258677261935, "grad_norm": 7.033855438232422, "learning_rate": 0.00019933935548876802, "loss": 6.8646, "step": 2330 }, { "epoch": 0.1112503132047106, "grad_norm": 7.091256618499756, "learning_rate": 0.00019933878152445815, "loss": 8.0637, "step": 2331 }, { "epoch": 0.11129803963680185, "grad_norm": 5.766811370849609, "learning_rate": 0.00019933820731175508, "loss": 6.6315, "step": 2332 }, { "epoch": 0.1113457660688931, "grad_norm": 6.875061511993408, "learning_rate": 0.0001993376328506602, "loss": 7.5878, "step": 2333 }, { "epoch": 0.11139349250098436, "grad_norm": 6.680387496948242, "learning_rate": 0.00019933705814117496, "loss": 5.9366, "step": 2334 }, { "epoch": 0.11144121893307561, "grad_norm": 6.255440711975098, "learning_rate": 0.00019933648318330081, "loss": 6.6987, "step": 2335 }, { "epoch": 0.11148894536516686, "grad_norm": 6.5973711013793945, "learning_rate": 0.0001993359079770392, "loss": 6.6335, "step": 2336 }, { "epoch": 0.11153667179725811, "grad_norm": 7.028340816497803, "learning_rate": 0.00019933533252239151, "loss": 6.9884, "step": 2337 }, { "epoch": 0.11158439822934937, "grad_norm": 5.201953411102295, "learning_rate": 0.00019933475681935923, "loss": 6.3241, "step": 2338 }, { "epoch": 0.11163212466144062, "grad_norm": 6.204700469970703, "learning_rate": 0.0001993341808679438, "loss": 7.4391, "step": 2339 }, { "epoch": 0.11167985109353187, "grad_norm": 6.416488170623779, "learning_rate": 0.0001993336046681466, "loss": 7.4533, "step": 2340 }, { "epoch": 0.11172757752562312, "grad_norm": 4.511433124542236, "learning_rate": 0.00019933302821996916, "loss": 4.9382, "step": 2341 }, { "epoch": 0.11177530395771439, "grad_norm": 4.712959289550781, "learning_rate": 0.00019933245152341288, "loss": 4.7801, "step": 2342 }, { "epoch": 0.11182303038980564, "grad_norm": 4.192983627319336, "learning_rate": 0.00019933187457847918, "loss": 6.1418, "step": 2343 }, { "epoch": 0.11187075682189689, "grad_norm": 8.828011512756348, "learning_rate": 0.0001993312973851695, "loss": 7.6646, "step": 2344 }, { "epoch": 0.11191848325398814, "grad_norm": 8.318449020385742, "learning_rate": 0.00019933071994348534, "loss": 9.4005, "step": 2345 }, { "epoch": 0.1119662096860794, "grad_norm": 5.3740386962890625, "learning_rate": 0.00019933014225342806, "loss": 5.7934, "step": 2346 }, { "epoch": 0.11201393611817065, "grad_norm": 6.731147766113281, "learning_rate": 0.0001993295643149992, "loss": 6.8445, "step": 2347 }, { "epoch": 0.1120616625502619, "grad_norm": 4.421756267547607, "learning_rate": 0.00019932898612820015, "loss": 4.8008, "step": 2348 }, { "epoch": 0.11210938898235315, "grad_norm": 8.543274879455566, "learning_rate": 0.00019932840769303236, "loss": 5.5817, "step": 2349 }, { "epoch": 0.11215711541444441, "grad_norm": 7.155072212219238, "learning_rate": 0.00019932782900949726, "loss": 6.2773, "step": 2350 }, { "epoch": 0.11220484184653566, "grad_norm": 9.269936561584473, "learning_rate": 0.00019932725007759634, "loss": 8.2326, "step": 2351 }, { "epoch": 0.11225256827862691, "grad_norm": 7.568048477172852, "learning_rate": 0.000199326670897331, "loss": 7.7525, "step": 2352 }, { "epoch": 0.11230029471071816, "grad_norm": 5.6857991218566895, "learning_rate": 0.00019932609146870272, "loss": 6.557, "step": 2353 }, { "epoch": 0.11234802114280942, "grad_norm": 8.984227180480957, "learning_rate": 0.000199325511791713, "loss": 7.3694, "step": 2354 }, { "epoch": 0.11239574757490067, "grad_norm": 5.334461212158203, "learning_rate": 0.00019932493186636317, "loss": 6.5095, "step": 2355 }, { "epoch": 0.11244347400699192, "grad_norm": 4.5501251220703125, "learning_rate": 0.00019932435169265475, "loss": 5.0719, "step": 2356 }, { "epoch": 0.11249120043908317, "grad_norm": 6.914137363433838, "learning_rate": 0.0001993237712705892, "loss": 8.3949, "step": 2357 }, { "epoch": 0.11253892687117442, "grad_norm": 6.041475296020508, "learning_rate": 0.00019932319060016792, "loss": 6.8818, "step": 2358 }, { "epoch": 0.11258665330326569, "grad_norm": 4.7477803230285645, "learning_rate": 0.0001993226096813924, "loss": 6.2313, "step": 2359 }, { "epoch": 0.11263437973535693, "grad_norm": 7.208108901977539, "learning_rate": 0.00019932202851426412, "loss": 7.2228, "step": 2360 }, { "epoch": 0.11268210616744818, "grad_norm": 6.3531670570373535, "learning_rate": 0.00019932144709878448, "loss": 6.8451, "step": 2361 }, { "epoch": 0.11272983259953943, "grad_norm": 5.461609840393066, "learning_rate": 0.00019932086543495496, "loss": 7.7151, "step": 2362 }, { "epoch": 0.1127775590316307, "grad_norm": 5.988696575164795, "learning_rate": 0.00019932028352277702, "loss": 6.6906, "step": 2363 }, { "epoch": 0.11282528546372195, "grad_norm": 4.52699613571167, "learning_rate": 0.0001993197013622521, "loss": 5.4023, "step": 2364 }, { "epoch": 0.1128730118958132, "grad_norm": 6.659121036529541, "learning_rate": 0.00019931911895338162, "loss": 6.355, "step": 2365 }, { "epoch": 0.11292073832790445, "grad_norm": 4.363926887512207, "learning_rate": 0.0001993185362961671, "loss": 4.5778, "step": 2366 }, { "epoch": 0.11296846475999571, "grad_norm": 5.3649115562438965, "learning_rate": 0.00019931795339061, "loss": 4.2701, "step": 2367 }, { "epoch": 0.11301619119208696, "grad_norm": 6.213987827301025, "learning_rate": 0.00019931737023671172, "loss": 5.5372, "step": 2368 }, { "epoch": 0.11306391762417821, "grad_norm": 5.593975067138672, "learning_rate": 0.00019931678683447377, "loss": 6.3435, "step": 2369 }, { "epoch": 0.11311164405626946, "grad_norm": 4.41213846206665, "learning_rate": 0.00019931620318389755, "loss": 5.769, "step": 2370 }, { "epoch": 0.11315937048836072, "grad_norm": 6.6761627197265625, "learning_rate": 0.00019931561928498458, "loss": 6.7361, "step": 2371 }, { "epoch": 0.11320709692045197, "grad_norm": 5.262270927429199, "learning_rate": 0.0001993150351377363, "loss": 7.0394, "step": 2372 }, { "epoch": 0.11325482335254322, "grad_norm": 5.07050895690918, "learning_rate": 0.00019931445074215418, "loss": 5.0472, "step": 2373 }, { "epoch": 0.11330254978463447, "grad_norm": 6.702690124511719, "learning_rate": 0.00019931386609823966, "loss": 7.3641, "step": 2374 }, { "epoch": 0.11335027621672573, "grad_norm": 6.738053798675537, "learning_rate": 0.0001993132812059942, "loss": 6.3667, "step": 2375 }, { "epoch": 0.11339800264881698, "grad_norm": 6.065093517303467, "learning_rate": 0.0001993126960654193, "loss": 6.2547, "step": 2376 }, { "epoch": 0.11344572908090823, "grad_norm": 11.0866060256958, "learning_rate": 0.00019931211067651634, "loss": 6.9068, "step": 2377 }, { "epoch": 0.11349345551299948, "grad_norm": 6.7252197265625, "learning_rate": 0.00019931152503928687, "loss": 5.4051, "step": 2378 }, { "epoch": 0.11354118194509075, "grad_norm": 6.592766284942627, "learning_rate": 0.00019931093915373236, "loss": 5.6256, "step": 2379 }, { "epoch": 0.113588908377182, "grad_norm": 5.081091403961182, "learning_rate": 0.00019931035301985422, "loss": 5.7141, "step": 2380 }, { "epoch": 0.11363663480927325, "grad_norm": 4.645683288574219, "learning_rate": 0.0001993097666376539, "loss": 5.7581, "step": 2381 }, { "epoch": 0.1136843612413645, "grad_norm": 9.910128593444824, "learning_rate": 0.00019930918000713292, "loss": 7.054, "step": 2382 }, { "epoch": 0.11373208767345575, "grad_norm": 5.0828447341918945, "learning_rate": 0.00019930859312829275, "loss": 6.108, "step": 2383 }, { "epoch": 0.11377981410554701, "grad_norm": 5.93762731552124, "learning_rate": 0.0001993080060011348, "loss": 6.5221, "step": 2384 }, { "epoch": 0.11382754053763826, "grad_norm": 5.379035949707031, "learning_rate": 0.0001993074186256606, "loss": 5.5101, "step": 2385 }, { "epoch": 0.11387526696972951, "grad_norm": 7.046672344207764, "learning_rate": 0.00019930683100187157, "loss": 6.0654, "step": 2386 }, { "epoch": 0.11392299340182076, "grad_norm": 5.484672546386719, "learning_rate": 0.00019930624312976925, "loss": 6.6674, "step": 2387 }, { "epoch": 0.11397071983391202, "grad_norm": 5.460198879241943, "learning_rate": 0.000199305655009355, "loss": 6.2622, "step": 2388 }, { "epoch": 0.11401844626600327, "grad_norm": 6.094839096069336, "learning_rate": 0.0001993050666406304, "loss": 6.5156, "step": 2389 }, { "epoch": 0.11406617269809452, "grad_norm": 6.646234035491943, "learning_rate": 0.00019930447802359686, "loss": 7.9221, "step": 2390 }, { "epoch": 0.11411389913018577, "grad_norm": 7.640484809875488, "learning_rate": 0.00019930388915825585, "loss": 8.555, "step": 2391 }, { "epoch": 0.11416162556227703, "grad_norm": 5.273614406585693, "learning_rate": 0.00019930330004460885, "loss": 6.1841, "step": 2392 }, { "epoch": 0.11420935199436828, "grad_norm": 5.593787670135498, "learning_rate": 0.00019930271068265736, "loss": 5.6689, "step": 2393 }, { "epoch": 0.11425707842645953, "grad_norm": 7.496115684509277, "learning_rate": 0.00019930212107240286, "loss": 7.1425, "step": 2394 }, { "epoch": 0.11430480485855078, "grad_norm": 4.734747409820557, "learning_rate": 0.00019930153121384676, "loss": 4.4183, "step": 2395 }, { "epoch": 0.11435253129064205, "grad_norm": 5.430137634277344, "learning_rate": 0.0001993009411069906, "loss": 5.1923, "step": 2396 }, { "epoch": 0.1144002577227333, "grad_norm": 5.818052291870117, "learning_rate": 0.0001993003507518358, "loss": 6.9959, "step": 2397 }, { "epoch": 0.11444798415482454, "grad_norm": 8.082784652709961, "learning_rate": 0.00019929976014838389, "loss": 6.6515, "step": 2398 }, { "epoch": 0.1144957105869158, "grad_norm": 5.44492244720459, "learning_rate": 0.0001992991692966363, "loss": 5.7843, "step": 2399 }, { "epoch": 0.11454343701900706, "grad_norm": 6.5203447341918945, "learning_rate": 0.00019929857819659454, "loss": 6.4552, "step": 2400 }, { "epoch": 0.11454343701900706, "eval_loss": 1.6121469736099243, "eval_runtime": 96.4975, "eval_samples_per_second": 8.736, "eval_steps_per_second": 4.373, "step": 2400 }, { "epoch": 0.11459116345109831, "grad_norm": 6.055756092071533, "learning_rate": 0.00019929798684826006, "loss": 7.0865, "step": 2401 }, { "epoch": 0.11463888988318956, "grad_norm": 7.1241044998168945, "learning_rate": 0.00019929739525163437, "loss": 6.9193, "step": 2402 }, { "epoch": 0.1146866163152808, "grad_norm": 5.732908725738525, "learning_rate": 0.00019929680340671896, "loss": 6.8239, "step": 2403 }, { "epoch": 0.11473434274737207, "grad_norm": 6.3990278244018555, "learning_rate": 0.00019929621131351525, "loss": 7.5857, "step": 2404 }, { "epoch": 0.11478206917946332, "grad_norm": 6.51461124420166, "learning_rate": 0.00019929561897202478, "loss": 7.5126, "step": 2405 }, { "epoch": 0.11482979561155457, "grad_norm": 4.333196640014648, "learning_rate": 0.000199295026382249, "loss": 5.407, "step": 2406 }, { "epoch": 0.11487752204364582, "grad_norm": 8.812541007995605, "learning_rate": 0.0001992944335441894, "loss": 7.8813, "step": 2407 }, { "epoch": 0.11492524847573707, "grad_norm": 6.706300735473633, "learning_rate": 0.00019929384045784746, "loss": 7.3434, "step": 2408 }, { "epoch": 0.11497297490782833, "grad_norm": 5.9792375564575195, "learning_rate": 0.00019929324712322465, "loss": 6.588, "step": 2409 }, { "epoch": 0.11502070133991958, "grad_norm": 5.693054676055908, "learning_rate": 0.00019929265354032247, "loss": 6.9219, "step": 2410 }, { "epoch": 0.11506842777201083, "grad_norm": 6.320680141448975, "learning_rate": 0.00019929205970914242, "loss": 6.0497, "step": 2411 }, { "epoch": 0.11511615420410208, "grad_norm": 7.449224948883057, "learning_rate": 0.00019929146562968594, "loss": 6.0061, "step": 2412 }, { "epoch": 0.11516388063619334, "grad_norm": 5.945936679840088, "learning_rate": 0.00019929087130195457, "loss": 6.7282, "step": 2413 }, { "epoch": 0.1152116070682846, "grad_norm": 5.9501752853393555, "learning_rate": 0.00019929027672594977, "loss": 6.9589, "step": 2414 }, { "epoch": 0.11525933350037584, "grad_norm": 5.984325408935547, "learning_rate": 0.00019928968190167302, "loss": 6.0555, "step": 2415 }, { "epoch": 0.1153070599324671, "grad_norm": 5.344329833984375, "learning_rate": 0.00019928908682912582, "loss": 6.0593, "step": 2416 }, { "epoch": 0.11535478636455836, "grad_norm": 7.7345709800720215, "learning_rate": 0.0001992884915083096, "loss": 7.2564, "step": 2417 }, { "epoch": 0.1154025127966496, "grad_norm": 6.209690570831299, "learning_rate": 0.00019928789593922597, "loss": 6.8254, "step": 2418 }, { "epoch": 0.11545023922874086, "grad_norm": 7.814291477203369, "learning_rate": 0.00019928730012187628, "loss": 6.2167, "step": 2419 }, { "epoch": 0.1154979656608321, "grad_norm": 7.086947917938232, "learning_rate": 0.00019928670405626212, "loss": 6.982, "step": 2420 }, { "epoch": 0.11554569209292337, "grad_norm": 5.73114538192749, "learning_rate": 0.00019928610774238496, "loss": 6.7713, "step": 2421 }, { "epoch": 0.11559341852501462, "grad_norm": 6.70130729675293, "learning_rate": 0.00019928551118024627, "loss": 6.346, "step": 2422 }, { "epoch": 0.11564114495710587, "grad_norm": 6.0923662185668945, "learning_rate": 0.00019928491436984756, "loss": 7.3247, "step": 2423 }, { "epoch": 0.11568887138919712, "grad_norm": 5.1853861808776855, "learning_rate": 0.0001992843173111903, "loss": 5.7999, "step": 2424 }, { "epoch": 0.11573659782128838, "grad_norm": 6.79941463470459, "learning_rate": 0.000199283720004276, "loss": 5.7225, "step": 2425 }, { "epoch": 0.11578432425337963, "grad_norm": 6.806683540344238, "learning_rate": 0.00019928312244910616, "loss": 8.1717, "step": 2426 }, { "epoch": 0.11583205068547088, "grad_norm": 6.361385822296143, "learning_rate": 0.00019928252464568224, "loss": 7.508, "step": 2427 }, { "epoch": 0.11587977711756213, "grad_norm": 7.511900424957275, "learning_rate": 0.0001992819265940058, "loss": 7.6279, "step": 2428 }, { "epoch": 0.1159275035496534, "grad_norm": 6.408151149749756, "learning_rate": 0.00019928132829407825, "loss": 6.464, "step": 2429 }, { "epoch": 0.11597522998174464, "grad_norm": 6.161413192749023, "learning_rate": 0.00019928072974590116, "loss": 7.1698, "step": 2430 }, { "epoch": 0.11602295641383589, "grad_norm": 6.958288669586182, "learning_rate": 0.00019928013094947598, "loss": 8.821, "step": 2431 }, { "epoch": 0.11607068284592714, "grad_norm": 4.9188127517700195, "learning_rate": 0.00019927953190480423, "loss": 5.397, "step": 2432 }, { "epoch": 0.11611840927801839, "grad_norm": 5.682304382324219, "learning_rate": 0.0001992789326118874, "loss": 7.3442, "step": 2433 }, { "epoch": 0.11616613571010966, "grad_norm": 5.912992000579834, "learning_rate": 0.00019927833307072698, "loss": 6.825, "step": 2434 }, { "epoch": 0.1162138621422009, "grad_norm": 5.315921783447266, "learning_rate": 0.00019927773328132448, "loss": 4.3194, "step": 2435 }, { "epoch": 0.11626158857429215, "grad_norm": 6.76012659072876, "learning_rate": 0.00019927713324368145, "loss": 6.5639, "step": 2436 }, { "epoch": 0.1163093150063834, "grad_norm": 6.277029037475586, "learning_rate": 0.00019927653295779928, "loss": 6.0674, "step": 2437 }, { "epoch": 0.11635704143847467, "grad_norm": 5.538710117340088, "learning_rate": 0.00019927593242367956, "loss": 7.0892, "step": 2438 }, { "epoch": 0.11640476787056592, "grad_norm": 6.0107245445251465, "learning_rate": 0.00019927533164132376, "loss": 6.2243, "step": 2439 }, { "epoch": 0.11645249430265717, "grad_norm": 8.592703819274902, "learning_rate": 0.0001992747306107334, "loss": 5.2615, "step": 2440 }, { "epoch": 0.11650022073474842, "grad_norm": 5.739934921264648, "learning_rate": 0.00019927412933190992, "loss": 5.8984, "step": 2441 }, { "epoch": 0.11654794716683968, "grad_norm": 5.398654460906982, "learning_rate": 0.0001992735278048549, "loss": 5.6214, "step": 2442 }, { "epoch": 0.11659567359893093, "grad_norm": 6.197778224945068, "learning_rate": 0.00019927292602956982, "loss": 6.3904, "step": 2443 }, { "epoch": 0.11664340003102218, "grad_norm": 5.15541410446167, "learning_rate": 0.00019927232400605616, "loss": 5.1736, "step": 2444 }, { "epoch": 0.11669112646311343, "grad_norm": 7.815628528594971, "learning_rate": 0.00019927172173431546, "loss": 7.705, "step": 2445 }, { "epoch": 0.11673885289520469, "grad_norm": 6.1312642097473145, "learning_rate": 0.00019927111921434922, "loss": 6.9547, "step": 2446 }, { "epoch": 0.11678657932729594, "grad_norm": 5.695800304412842, "learning_rate": 0.00019927051644615893, "loss": 5.9635, "step": 2447 }, { "epoch": 0.11683430575938719, "grad_norm": 7.247018814086914, "learning_rate": 0.00019926991342974609, "loss": 6.8249, "step": 2448 }, { "epoch": 0.11688203219147844, "grad_norm": 6.522912979125977, "learning_rate": 0.00019926931016511223, "loss": 6.7136, "step": 2449 }, { "epoch": 0.1169297586235697, "grad_norm": 5.864326477050781, "learning_rate": 0.00019926870665225885, "loss": 6.6803, "step": 2450 }, { "epoch": 0.11697748505566095, "grad_norm": 6.620709419250488, "learning_rate": 0.0001992681028911875, "loss": 7.1, "step": 2451 }, { "epoch": 0.1170252114877522, "grad_norm": 6.445756435394287, "learning_rate": 0.0001992674988818996, "loss": 6.0606, "step": 2452 }, { "epoch": 0.11707293791984345, "grad_norm": 5.338548183441162, "learning_rate": 0.00019926689462439673, "loss": 4.951, "step": 2453 }, { "epoch": 0.11712066435193472, "grad_norm": 5.401038646697998, "learning_rate": 0.00019926629011868035, "loss": 5.7122, "step": 2454 }, { "epoch": 0.11716839078402597, "grad_norm": 7.513417720794678, "learning_rate": 0.00019926568536475205, "loss": 7.0187, "step": 2455 }, { "epoch": 0.11721611721611722, "grad_norm": 7.435956954956055, "learning_rate": 0.00019926508036261328, "loss": 6.212, "step": 2456 }, { "epoch": 0.11726384364820847, "grad_norm": 6.338344097137451, "learning_rate": 0.00019926447511226555, "loss": 7.8148, "step": 2457 }, { "epoch": 0.11731157008029972, "grad_norm": 6.532636642456055, "learning_rate": 0.00019926386961371042, "loss": 7.2181, "step": 2458 }, { "epoch": 0.11735929651239098, "grad_norm": 5.906259536743164, "learning_rate": 0.00019926326386694935, "loss": 7.3303, "step": 2459 }, { "epoch": 0.11740702294448223, "grad_norm": 5.578773498535156, "learning_rate": 0.00019926265787198388, "loss": 5.9464, "step": 2460 }, { "epoch": 0.11745474937657348, "grad_norm": 6.0162272453308105, "learning_rate": 0.00019926205162881555, "loss": 6.7526, "step": 2461 }, { "epoch": 0.11750247580866473, "grad_norm": 4.859315395355225, "learning_rate": 0.00019926144513744584, "loss": 5.4383, "step": 2462 }, { "epoch": 0.11755020224075599, "grad_norm": 5.332747936248779, "learning_rate": 0.00019926083839787628, "loss": 6.8147, "step": 2463 }, { "epoch": 0.11759792867284724, "grad_norm": 6.401904106140137, "learning_rate": 0.00019926023141010837, "loss": 5.5658, "step": 2464 }, { "epoch": 0.11764565510493849, "grad_norm": 8.178959846496582, "learning_rate": 0.00019925962417414366, "loss": 5.4159, "step": 2465 }, { "epoch": 0.11769338153702974, "grad_norm": 4.893832683563232, "learning_rate": 0.00019925901668998362, "loss": 6.2554, "step": 2466 }, { "epoch": 0.117741107969121, "grad_norm": 6.215792655944824, "learning_rate": 0.00019925840895762984, "loss": 5.6131, "step": 2467 }, { "epoch": 0.11778883440121225, "grad_norm": 103.55281829833984, "learning_rate": 0.00019925780097708378, "loss": 10.3719, "step": 2468 }, { "epoch": 0.1178365608333035, "grad_norm": 6.744437217712402, "learning_rate": 0.00019925719274834695, "loss": 4.8652, "step": 2469 }, { "epoch": 0.11788428726539475, "grad_norm": 6.799299240112305, "learning_rate": 0.00019925658427142096, "loss": 6.5246, "step": 2470 }, { "epoch": 0.11793201369748602, "grad_norm": 6.361636161804199, "learning_rate": 0.00019925597554630722, "loss": 7.6175, "step": 2471 }, { "epoch": 0.11797974012957727, "grad_norm": 4.605724334716797, "learning_rate": 0.00019925536657300734, "loss": 5.1498, "step": 2472 }, { "epoch": 0.11802746656166851, "grad_norm": 5.402807235717773, "learning_rate": 0.0001992547573515228, "loss": 6.0529, "step": 2473 }, { "epoch": 0.11807519299375976, "grad_norm": 8.144805908203125, "learning_rate": 0.00019925414788185512, "loss": 8.934, "step": 2474 }, { "epoch": 0.11812291942585103, "grad_norm": 10.172551155090332, "learning_rate": 0.00019925353816400583, "loss": 7.2408, "step": 2475 }, { "epoch": 0.11817064585794228, "grad_norm": 9.197928428649902, "learning_rate": 0.00019925292819797648, "loss": 6.8291, "step": 2476 }, { "epoch": 0.11821837229003353, "grad_norm": 606.15283203125, "learning_rate": 0.00019925231798376854, "loss": 8.0119, "step": 2477 }, { "epoch": 0.11826609872212478, "grad_norm": 16.890789031982422, "learning_rate": 0.0001992517075213836, "loss": 7.1259, "step": 2478 }, { "epoch": 0.11831382515421604, "grad_norm": 44.77732467651367, "learning_rate": 0.00019925109681082315, "loss": 7.1127, "step": 2479 }, { "epoch": 0.11836155158630729, "grad_norm": 278.022705078125, "learning_rate": 0.00019925048585208872, "loss": 6.3405, "step": 2480 }, { "epoch": 0.11840927801839854, "grad_norm": 9.286020278930664, "learning_rate": 0.0001992498746451818, "loss": 8.3945, "step": 2481 }, { "epoch": 0.11845700445048979, "grad_norm": 4.785390853881836, "learning_rate": 0.00019924926319010404, "loss": 5.2656, "step": 2482 }, { "epoch": 0.11850473088258104, "grad_norm": 5.759562969207764, "learning_rate": 0.00019924865148685683, "loss": 6.6575, "step": 2483 }, { "epoch": 0.1185524573146723, "grad_norm": 5.401790142059326, "learning_rate": 0.00019924803953544177, "loss": 6.0652, "step": 2484 }, { "epoch": 0.11860018374676355, "grad_norm": 6.856231212615967, "learning_rate": 0.00019924742733586038, "loss": 7.3388, "step": 2485 }, { "epoch": 0.1186479101788548, "grad_norm": 4.967160701751709, "learning_rate": 0.0001992468148881142, "loss": 6.0109, "step": 2486 }, { "epoch": 0.11869563661094605, "grad_norm": 7.75101900100708, "learning_rate": 0.00019924620219220472, "loss": 6.6262, "step": 2487 }, { "epoch": 0.11874336304303731, "grad_norm": 8.207610130310059, "learning_rate": 0.00019924558924813352, "loss": 7.5386, "step": 2488 }, { "epoch": 0.11879108947512856, "grad_norm": 6.139005661010742, "learning_rate": 0.00019924497605590212, "loss": 6.0931, "step": 2489 }, { "epoch": 0.11883881590721981, "grad_norm": 5.760986328125, "learning_rate": 0.00019924436261551206, "loss": 5.4283, "step": 2490 }, { "epoch": 0.11888654233931106, "grad_norm": 6.050114631652832, "learning_rate": 0.00019924374892696484, "loss": 6.4558, "step": 2491 }, { "epoch": 0.11893426877140233, "grad_norm": 6.406091690063477, "learning_rate": 0.00019924313499026201, "loss": 6.0646, "step": 2492 }, { "epoch": 0.11898199520349358, "grad_norm": 5.947465419769287, "learning_rate": 0.00019924252080540512, "loss": 7.7813, "step": 2493 }, { "epoch": 0.11902972163558483, "grad_norm": 6.351478576660156, "learning_rate": 0.00019924190637239571, "loss": 6.2155, "step": 2494 }, { "epoch": 0.11907744806767608, "grad_norm": 5.51107931137085, "learning_rate": 0.0001992412916912353, "loss": 5.8992, "step": 2495 }, { "epoch": 0.11912517449976734, "grad_norm": 5.350305080413818, "learning_rate": 0.00019924067676192544, "loss": 5.2331, "step": 2496 }, { "epoch": 0.11917290093185859, "grad_norm": 5.645650386810303, "learning_rate": 0.0001992400615844676, "loss": 5.9623, "step": 2497 }, { "epoch": 0.11922062736394984, "grad_norm": 5.326278209686279, "learning_rate": 0.00019923944615886344, "loss": 5.8638, "step": 2498 }, { "epoch": 0.11926835379604109, "grad_norm": 5.611948490142822, "learning_rate": 0.00019923883048511443, "loss": 6.6996, "step": 2499 }, { "epoch": 0.11931608022813235, "grad_norm": 5.947208404541016, "learning_rate": 0.00019923821456322208, "loss": 5.2053, "step": 2500 }, { "epoch": 0.1193638066602236, "grad_norm": 6.137307167053223, "learning_rate": 0.000199237598393188, "loss": 6.1276, "step": 2501 }, { "epoch": 0.11941153309231485, "grad_norm": 7.9021525382995605, "learning_rate": 0.0001992369819750137, "loss": 8.74, "step": 2502 }, { "epoch": 0.1194592595244061, "grad_norm": 8.133113861083984, "learning_rate": 0.00019923636530870068, "loss": 8.2945, "step": 2503 }, { "epoch": 0.11950698595649736, "grad_norm": 4.672719478607178, "learning_rate": 0.00019923574839425054, "loss": 5.661, "step": 2504 }, { "epoch": 0.11955471238858861, "grad_norm": 5.1231513023376465, "learning_rate": 0.0001992351312316648, "loss": 5.0149, "step": 2505 }, { "epoch": 0.11960243882067986, "grad_norm": 6.5395121574401855, "learning_rate": 0.00019923451382094498, "loss": 6.6384, "step": 2506 }, { "epoch": 0.11965016525277111, "grad_norm": 6.963812828063965, "learning_rate": 0.00019923389616209266, "loss": 7.6418, "step": 2507 }, { "epoch": 0.11969789168486236, "grad_norm": 5.314905643463135, "learning_rate": 0.0001992332782551094, "loss": 5.357, "step": 2508 }, { "epoch": 0.11974561811695363, "grad_norm": 6.953101634979248, "learning_rate": 0.0001992326600999967, "loss": 7.4247, "step": 2509 }, { "epoch": 0.11979334454904488, "grad_norm": 5.65818977355957, "learning_rate": 0.0001992320416967561, "loss": 6.9697, "step": 2510 }, { "epoch": 0.11984107098113612, "grad_norm": 5.882226467132568, "learning_rate": 0.0001992314230453892, "loss": 6.6894, "step": 2511 }, { "epoch": 0.11988879741322737, "grad_norm": 7.456608295440674, "learning_rate": 0.00019923080414589752, "loss": 6.5442, "step": 2512 }, { "epoch": 0.11993652384531864, "grad_norm": 5.068438529968262, "learning_rate": 0.0001992301849982826, "loss": 5.6924, "step": 2513 }, { "epoch": 0.11998425027740989, "grad_norm": 5.8276519775390625, "learning_rate": 0.00019922956560254599, "loss": 4.5949, "step": 2514 }, { "epoch": 0.12003197670950114, "grad_norm": 7.3889617919921875, "learning_rate": 0.00019922894595868923, "loss": 7.027, "step": 2515 }, { "epoch": 0.12007970314159239, "grad_norm": 5.222685813903809, "learning_rate": 0.00019922832606671386, "loss": 5.7551, "step": 2516 }, { "epoch": 0.12012742957368365, "grad_norm": 5.6962127685546875, "learning_rate": 0.0001992277059266215, "loss": 5.0557, "step": 2517 }, { "epoch": 0.1201751560057749, "grad_norm": 5.449974060058594, "learning_rate": 0.00019922708553841363, "loss": 6.3609, "step": 2518 }, { "epoch": 0.12022288243786615, "grad_norm": 5.272806167602539, "learning_rate": 0.00019922646490209183, "loss": 5.3785, "step": 2519 }, { "epoch": 0.1202706088699574, "grad_norm": 6.4713921546936035, "learning_rate": 0.00019922584401765763, "loss": 7.2317, "step": 2520 }, { "epoch": 0.12031833530204866, "grad_norm": 5.631217002868652, "learning_rate": 0.0001992252228851126, "loss": 5.7852, "step": 2521 }, { "epoch": 0.12036606173413991, "grad_norm": 6.146111488342285, "learning_rate": 0.0001992246015044583, "loss": 5.7136, "step": 2522 }, { "epoch": 0.12041378816623116, "grad_norm": 7.096044063568115, "learning_rate": 0.00019922397987569626, "loss": 7.2231, "step": 2523 }, { "epoch": 0.12046151459832241, "grad_norm": 7.921234607696533, "learning_rate": 0.00019922335799882806, "loss": 5.5, "step": 2524 }, { "epoch": 0.12050924103041367, "grad_norm": 5.085870265960693, "learning_rate": 0.00019922273587385524, "loss": 4.867, "step": 2525 }, { "epoch": 0.12055696746250492, "grad_norm": 6.939750671386719, "learning_rate": 0.00019922211350077934, "loss": 6.7561, "step": 2526 }, { "epoch": 0.12060469389459617, "grad_norm": 7.504082679748535, "learning_rate": 0.00019922149087960196, "loss": 7.1749, "step": 2527 }, { "epoch": 0.12065242032668742, "grad_norm": 6.798505783081055, "learning_rate": 0.00019922086801032462, "loss": 6.8441, "step": 2528 }, { "epoch": 0.12070014675877869, "grad_norm": 6.547543048858643, "learning_rate": 0.00019922024489294892, "loss": 4.8769, "step": 2529 }, { "epoch": 0.12074787319086994, "grad_norm": 8.873817443847656, "learning_rate": 0.00019921962152747634, "loss": 9.6741, "step": 2530 }, { "epoch": 0.12079559962296119, "grad_norm": 6.475502014160156, "learning_rate": 0.0001992189979139085, "loss": 6.5672, "step": 2531 }, { "epoch": 0.12084332605505244, "grad_norm": 4.202572345733643, "learning_rate": 0.00019921837405224694, "loss": 3.9452, "step": 2532 }, { "epoch": 0.1208910524871437, "grad_norm": 6.775956153869629, "learning_rate": 0.00019921774994249324, "loss": 4.9947, "step": 2533 }, { "epoch": 0.12093877891923495, "grad_norm": 7.0207695960998535, "learning_rate": 0.00019921712558464895, "loss": 7.7427, "step": 2534 }, { "epoch": 0.1209865053513262, "grad_norm": 6.175810813903809, "learning_rate": 0.00019921650097871562, "loss": 6.728, "step": 2535 }, { "epoch": 0.12103423178341745, "grad_norm": 6.027743816375732, "learning_rate": 0.0001992158761246948, "loss": 6.0176, "step": 2536 }, { "epoch": 0.1210819582155087, "grad_norm": 7.800528049468994, "learning_rate": 0.00019921525102258811, "loss": 7.3638, "step": 2537 }, { "epoch": 0.12112968464759996, "grad_norm": 6.8329267501831055, "learning_rate": 0.00019921462567239705, "loss": 7.3964, "step": 2538 }, { "epoch": 0.12117741107969121, "grad_norm": 6.654507637023926, "learning_rate": 0.0001992140000741232, "loss": 7.6877, "step": 2539 }, { "epoch": 0.12122513751178246, "grad_norm": 6.265644073486328, "learning_rate": 0.00019921337422776816, "loss": 6.2082, "step": 2540 }, { "epoch": 0.12127286394387371, "grad_norm": 5.6409196853637695, "learning_rate": 0.00019921274813333346, "loss": 6.288, "step": 2541 }, { "epoch": 0.12132059037596497, "grad_norm": 6.226003170013428, "learning_rate": 0.00019921212179082064, "loss": 6.6538, "step": 2542 }, { "epoch": 0.12136831680805622, "grad_norm": 4.8436384201049805, "learning_rate": 0.00019921149520023135, "loss": 4.5971, "step": 2543 }, { "epoch": 0.12141604324014747, "grad_norm": 6.016689777374268, "learning_rate": 0.00019921086836156707, "loss": 6.8544, "step": 2544 }, { "epoch": 0.12146376967223872, "grad_norm": 4.865900039672852, "learning_rate": 0.0001992102412748294, "loss": 4.7976, "step": 2545 }, { "epoch": 0.12151149610432999, "grad_norm": 4.419651508331299, "learning_rate": 0.0001992096139400199, "loss": 5.0167, "step": 2546 }, { "epoch": 0.12155922253642124, "grad_norm": 6.08146858215332, "learning_rate": 0.0001992089863571402, "loss": 6.2101, "step": 2547 }, { "epoch": 0.12160694896851248, "grad_norm": 6.761473178863525, "learning_rate": 0.00019920835852619176, "loss": 6.9868, "step": 2548 }, { "epoch": 0.12165467540060373, "grad_norm": 4.87887716293335, "learning_rate": 0.00019920773044717626, "loss": 5.0713, "step": 2549 }, { "epoch": 0.121702401832695, "grad_norm": 5.633859157562256, "learning_rate": 0.0001992071021200952, "loss": 6.1883, "step": 2550 }, { "epoch": 0.12175012826478625, "grad_norm": 6.28723669052124, "learning_rate": 0.0001992064735449502, "loss": 5.2651, "step": 2551 }, { "epoch": 0.1217978546968775, "grad_norm": 5.057219982147217, "learning_rate": 0.00019920584472174274, "loss": 4.7898, "step": 2552 }, { "epoch": 0.12184558112896875, "grad_norm": 7.6184821128845215, "learning_rate": 0.0001992052156504745, "loss": 7.9772, "step": 2553 }, { "epoch": 0.12189330756106001, "grad_norm": 7.5784173011779785, "learning_rate": 0.000199204586331147, "loss": 7.3766, "step": 2554 }, { "epoch": 0.12194103399315126, "grad_norm": 6.459564208984375, "learning_rate": 0.00019920395676376181, "loss": 5.3843, "step": 2555 }, { "epoch": 0.12198876042524251, "grad_norm": 8.033954620361328, "learning_rate": 0.00019920332694832048, "loss": 8.4444, "step": 2556 }, { "epoch": 0.12203648685733376, "grad_norm": 7.824962615966797, "learning_rate": 0.00019920269688482466, "loss": 7.2764, "step": 2557 }, { "epoch": 0.12208421328942502, "grad_norm": 7.024128437042236, "learning_rate": 0.00019920206657327588, "loss": 6.7798, "step": 2558 }, { "epoch": 0.12213193972151627, "grad_norm": 7.976163864135742, "learning_rate": 0.00019920143601367575, "loss": 6.9575, "step": 2559 }, { "epoch": 0.12217966615360752, "grad_norm": 6.063900470733643, "learning_rate": 0.0001992008052060258, "loss": 5.2735, "step": 2560 }, { "epoch": 0.12222739258569877, "grad_norm": 10.203490257263184, "learning_rate": 0.0001992001741503276, "loss": 6.9637, "step": 2561 }, { "epoch": 0.12227511901779002, "grad_norm": 6.814461708068848, "learning_rate": 0.0001991995428465828, "loss": 7.4683, "step": 2562 }, { "epoch": 0.12232284544988128, "grad_norm": 5.394496917724609, "learning_rate": 0.00019919891129479292, "loss": 5.9349, "step": 2563 }, { "epoch": 0.12237057188197253, "grad_norm": 7.58441162109375, "learning_rate": 0.00019919827949495952, "loss": 6.5405, "step": 2564 }, { "epoch": 0.12241829831406378, "grad_norm": 5.798213958740234, "learning_rate": 0.00019919764744708422, "loss": 6.6679, "step": 2565 }, { "epoch": 0.12246602474615503, "grad_norm": 5.101596355438232, "learning_rate": 0.0001991970151511686, "loss": 5.3157, "step": 2566 }, { "epoch": 0.1225137511782463, "grad_norm": 4.84085750579834, "learning_rate": 0.00019919638260721423, "loss": 5.0614, "step": 2567 }, { "epoch": 0.12256147761033755, "grad_norm": 6.472250461578369, "learning_rate": 0.00019919574981522268, "loss": 6.5636, "step": 2568 }, { "epoch": 0.1226092040424288, "grad_norm": 5.555691719055176, "learning_rate": 0.00019919511677519557, "loss": 5.9893, "step": 2569 }, { "epoch": 0.12265693047452005, "grad_norm": 6.802445411682129, "learning_rate": 0.00019919448348713445, "loss": 7.2901, "step": 2570 }, { "epoch": 0.12270465690661131, "grad_norm": 5.74739408493042, "learning_rate": 0.00019919384995104093, "loss": 5.9203, "step": 2571 }, { "epoch": 0.12275238333870256, "grad_norm": 6.161860942840576, "learning_rate": 0.00019919321616691655, "loss": 7.1177, "step": 2572 }, { "epoch": 0.12280010977079381, "grad_norm": 5.15494441986084, "learning_rate": 0.00019919258213476292, "loss": 5.502, "step": 2573 }, { "epoch": 0.12284783620288506, "grad_norm": 6.421548366546631, "learning_rate": 0.00019919194785458167, "loss": 5.7436, "step": 2574 }, { "epoch": 0.12289556263497632, "grad_norm": 6.586711883544922, "learning_rate": 0.0001991913133263743, "loss": 6.9608, "step": 2575 }, { "epoch": 0.12294328906706757, "grad_norm": 5.92076301574707, "learning_rate": 0.00019919067855014247, "loss": 6.092, "step": 2576 }, { "epoch": 0.12299101549915882, "grad_norm": 6.7879862785339355, "learning_rate": 0.00019919004352588767, "loss": 6.7476, "step": 2577 }, { "epoch": 0.12303874193125007, "grad_norm": 8.76463508605957, "learning_rate": 0.00019918940825361164, "loss": 8.9568, "step": 2578 }, { "epoch": 0.12308646836334133, "grad_norm": 4.8239665031433105, "learning_rate": 0.00019918877273331583, "loss": 5.2598, "step": 2579 }, { "epoch": 0.12313419479543258, "grad_norm": 6.723465919494629, "learning_rate": 0.00019918813696500189, "loss": 6.6971, "step": 2580 }, { "epoch": 0.12318192122752383, "grad_norm": 6.38710355758667, "learning_rate": 0.00019918750094867144, "loss": 5.7313, "step": 2581 }, { "epoch": 0.12322964765961508, "grad_norm": 6.35195779800415, "learning_rate": 0.000199186864684326, "loss": 7.8379, "step": 2582 }, { "epoch": 0.12327737409170635, "grad_norm": 6.694102764129639, "learning_rate": 0.0001991862281719672, "loss": 6.2678, "step": 2583 }, { "epoch": 0.1233251005237976, "grad_norm": 6.242228984832764, "learning_rate": 0.00019918559141159664, "loss": 5.813, "step": 2584 }, { "epoch": 0.12337282695588885, "grad_norm": 7.03825569152832, "learning_rate": 0.00019918495440321586, "loss": 6.2827, "step": 2585 }, { "epoch": 0.1234205533879801, "grad_norm": 4.9181389808654785, "learning_rate": 0.0001991843171468265, "loss": 4.9544, "step": 2586 }, { "epoch": 0.12346827982007134, "grad_norm": 7.315206050872803, "learning_rate": 0.00019918367964243014, "loss": 8.3353, "step": 2587 }, { "epoch": 0.12351600625216261, "grad_norm": 6.019472122192383, "learning_rate": 0.0001991830418900284, "loss": 6.734, "step": 2588 }, { "epoch": 0.12356373268425386, "grad_norm": 5.389978885650635, "learning_rate": 0.00019918240388962284, "loss": 6.5434, "step": 2589 }, { "epoch": 0.12361145911634511, "grad_norm": 6.509700298309326, "learning_rate": 0.00019918176564121508, "loss": 7.3647, "step": 2590 }, { "epoch": 0.12365918554843636, "grad_norm": 8.090117454528809, "learning_rate": 0.0001991811271448067, "loss": 7.2696, "step": 2591 }, { "epoch": 0.12370691198052762, "grad_norm": 5.416383266448975, "learning_rate": 0.00019918048840039928, "loss": 6.4276, "step": 2592 }, { "epoch": 0.12375463841261887, "grad_norm": 6.610803127288818, "learning_rate": 0.00019917984940799445, "loss": 5.4854, "step": 2593 }, { "epoch": 0.12380236484471012, "grad_norm": 4.918051242828369, "learning_rate": 0.0001991792101675938, "loss": 4.2269, "step": 2594 }, { "epoch": 0.12385009127680137, "grad_norm": 6.454699993133545, "learning_rate": 0.0001991785706791989, "loss": 6.3255, "step": 2595 }, { "epoch": 0.12389781770889263, "grad_norm": 10.754586219787598, "learning_rate": 0.0001991779309428114, "loss": 10.1455, "step": 2596 }, { "epoch": 0.12394554414098388, "grad_norm": 7.033049583435059, "learning_rate": 0.00019917729095843286, "loss": 6.9458, "step": 2597 }, { "epoch": 0.12399327057307513, "grad_norm": 5.376648426055908, "learning_rate": 0.0001991766507260649, "loss": 5.361, "step": 2598 }, { "epoch": 0.12404099700516638, "grad_norm": 7.248912811279297, "learning_rate": 0.0001991760102457091, "loss": 7.2075, "step": 2599 }, { "epoch": 0.12408872343725764, "grad_norm": 5.227941989898682, "learning_rate": 0.0001991753695173671, "loss": 5.2552, "step": 2600 }, { "epoch": 0.12408872343725764, "eval_loss": 1.616467833518982, "eval_runtime": 96.4878, "eval_samples_per_second": 8.737, "eval_steps_per_second": 4.374, "step": 2600 }, { "epoch": 0.1241364498693489, "grad_norm": 6.640268802642822, "learning_rate": 0.00019917472854104044, "loss": 6.6788, "step": 2601 }, { "epoch": 0.12418417630144014, "grad_norm": 6.098337650299072, "learning_rate": 0.0001991740873167308, "loss": 6.4633, "step": 2602 }, { "epoch": 0.1242319027335314, "grad_norm": 7.400532245635986, "learning_rate": 0.0001991734458444397, "loss": 6.3654, "step": 2603 }, { "epoch": 0.12427962916562266, "grad_norm": 6.543416500091553, "learning_rate": 0.00019917280412416882, "loss": 6.3276, "step": 2604 }, { "epoch": 0.1243273555977139, "grad_norm": 9.13476276397705, "learning_rate": 0.00019917216215591972, "loss": 7.5483, "step": 2605 }, { "epoch": 0.12437508202980516, "grad_norm": 6.472226142883301, "learning_rate": 0.000199171519939694, "loss": 7.5186, "step": 2606 }, { "epoch": 0.1244228084618964, "grad_norm": 6.057101249694824, "learning_rate": 0.0001991708774754933, "loss": 5.7719, "step": 2607 }, { "epoch": 0.12447053489398767, "grad_norm": 8.963512420654297, "learning_rate": 0.00019917023476331922, "loss": 6.3314, "step": 2608 }, { "epoch": 0.12451826132607892, "grad_norm": 6.93634557723999, "learning_rate": 0.00019916959180317335, "loss": 8.5299, "step": 2609 }, { "epoch": 0.12456598775817017, "grad_norm": 6.162479400634766, "learning_rate": 0.00019916894859505727, "loss": 5.6562, "step": 2610 }, { "epoch": 0.12461371419026142, "grad_norm": 5.48561429977417, "learning_rate": 0.00019916830513897266, "loss": 6.2665, "step": 2611 }, { "epoch": 0.12466144062235267, "grad_norm": 3.560473680496216, "learning_rate": 0.00019916766143492106, "loss": 3.051, "step": 2612 }, { "epoch": 0.12470916705444393, "grad_norm": 7.0898895263671875, "learning_rate": 0.00019916701748290416, "loss": 6.9835, "step": 2613 }, { "epoch": 0.12475689348653518, "grad_norm": 5.331742286682129, "learning_rate": 0.00019916637328292348, "loss": 6.0956, "step": 2614 }, { "epoch": 0.12480461991862643, "grad_norm": 8.475786209106445, "learning_rate": 0.00019916572883498068, "loss": 7.2774, "step": 2615 }, { "epoch": 0.12485234635071768, "grad_norm": 7.285261154174805, "learning_rate": 0.00019916508413907736, "loss": 9.4067, "step": 2616 }, { "epoch": 0.12490007278280894, "grad_norm": 6.492762565612793, "learning_rate": 0.00019916443919521513, "loss": 6.2198, "step": 2617 }, { "epoch": 0.1249477992149002, "grad_norm": 5.463344573974609, "learning_rate": 0.00019916379400339564, "loss": 6.9778, "step": 2618 }, { "epoch": 0.12499552564699144, "grad_norm": 5.793583393096924, "learning_rate": 0.00019916314856362045, "loss": 6.8516, "step": 2619 }, { "epoch": 0.1250432520790827, "grad_norm": 5.299160003662109, "learning_rate": 0.00019916250287589117, "loss": 5.9155, "step": 2620 }, { "epoch": 0.12509097851117396, "grad_norm": 5.868138790130615, "learning_rate": 0.00019916185694020947, "loss": 7.445, "step": 2621 }, { "epoch": 0.1251387049432652, "grad_norm": 6.175563335418701, "learning_rate": 0.00019916121075657693, "loss": 7.4463, "step": 2622 }, { "epoch": 0.12518643137535646, "grad_norm": 6.524340629577637, "learning_rate": 0.00019916056432499514, "loss": 5.5733, "step": 2623 }, { "epoch": 0.1252341578074477, "grad_norm": 5.758206367492676, "learning_rate": 0.0001991599176454658, "loss": 5.9721, "step": 2624 }, { "epoch": 0.12528188423953895, "grad_norm": 5.740457534790039, "learning_rate": 0.00019915927071799042, "loss": 6.1435, "step": 2625 }, { "epoch": 0.1253296106716302, "grad_norm": 5.045222759246826, "learning_rate": 0.00019915862354257072, "loss": 6.4285, "step": 2626 }, { "epoch": 0.12537733710372148, "grad_norm": 6.698198318481445, "learning_rate": 0.00019915797611920824, "loss": 7.0418, "step": 2627 }, { "epoch": 0.12542506353581273, "grad_norm": 5.3449602127075195, "learning_rate": 0.00019915732844790463, "loss": 5.8405, "step": 2628 }, { "epoch": 0.12547278996790398, "grad_norm": 8.230816841125488, "learning_rate": 0.0001991566805286615, "loss": 8.8511, "step": 2629 }, { "epoch": 0.12552051639999523, "grad_norm": 4.497639179229736, "learning_rate": 0.0001991560323614805, "loss": 5.1889, "step": 2630 }, { "epoch": 0.12556824283208648, "grad_norm": 7.639451503753662, "learning_rate": 0.00019915538394636321, "loss": 6.0825, "step": 2631 }, { "epoch": 0.12561596926417773, "grad_norm": 6.102507591247559, "learning_rate": 0.00019915473528331129, "loss": 5.6234, "step": 2632 }, { "epoch": 0.12566369569626898, "grad_norm": 5.559940338134766, "learning_rate": 0.00019915408637232634, "loss": 6.1196, "step": 2633 }, { "epoch": 0.12571142212836023, "grad_norm": 6.262628078460693, "learning_rate": 0.00019915343721341, "loss": 6.5523, "step": 2634 }, { "epoch": 0.1257591485604515, "grad_norm": 6.31156587600708, "learning_rate": 0.00019915278780656382, "loss": 6.9212, "step": 2635 }, { "epoch": 0.12580687499254276, "grad_norm": 5.397120475769043, "learning_rate": 0.0001991521381517895, "loss": 7.2011, "step": 2636 }, { "epoch": 0.125854601424634, "grad_norm": 5.896693229675293, "learning_rate": 0.00019915148824908868, "loss": 6.2354, "step": 2637 }, { "epoch": 0.12590232785672525, "grad_norm": 5.77620267868042, "learning_rate": 0.00019915083809846293, "loss": 5.7495, "step": 2638 }, { "epoch": 0.1259500542888165, "grad_norm": 4.042270660400391, "learning_rate": 0.00019915018769991387, "loss": 5.0322, "step": 2639 }, { "epoch": 0.12599778072090775, "grad_norm": 7.152609825134277, "learning_rate": 0.0001991495370534432, "loss": 7.4355, "step": 2640 }, { "epoch": 0.126045507152999, "grad_norm": 5.517868518829346, "learning_rate": 0.00019914888615905248, "loss": 5.7128, "step": 2641 }, { "epoch": 0.12609323358509025, "grad_norm": 6.580667495727539, "learning_rate": 0.00019914823501674333, "loss": 7.5049, "step": 2642 }, { "epoch": 0.1261409600171815, "grad_norm": 5.372723579406738, "learning_rate": 0.00019914758362651744, "loss": 6.6719, "step": 2643 }, { "epoch": 0.12618868644927278, "grad_norm": 6.25331974029541, "learning_rate": 0.0001991469319883764, "loss": 8.3484, "step": 2644 }, { "epoch": 0.12623641288136403, "grad_norm": 7.487483024597168, "learning_rate": 0.00019914628010232182, "loss": 6.6997, "step": 2645 }, { "epoch": 0.12628413931345528, "grad_norm": 7.138690948486328, "learning_rate": 0.00019914562796835542, "loss": 6.7894, "step": 2646 }, { "epoch": 0.12633186574554653, "grad_norm": 4.610174179077148, "learning_rate": 0.0001991449755864787, "loss": 4.4906, "step": 2647 }, { "epoch": 0.12637959217763778, "grad_norm": 6.854246139526367, "learning_rate": 0.00019914432295669337, "loss": 7.2979, "step": 2648 }, { "epoch": 0.12642731860972903, "grad_norm": 8.805462837219238, "learning_rate": 0.00019914367007900106, "loss": 7.6631, "step": 2649 }, { "epoch": 0.12647504504182028, "grad_norm": 8.960370063781738, "learning_rate": 0.00019914301695340338, "loss": 6.965, "step": 2650 }, { "epoch": 0.12652277147391153, "grad_norm": 6.149599552154541, "learning_rate": 0.00019914236357990197, "loss": 5.7883, "step": 2651 }, { "epoch": 0.1265704979060028, "grad_norm": 5.623478889465332, "learning_rate": 0.00019914170995849847, "loss": 6.1539, "step": 2652 }, { "epoch": 0.12661822433809405, "grad_norm": 6.2518792152404785, "learning_rate": 0.00019914105608919452, "loss": 6.0241, "step": 2653 }, { "epoch": 0.1266659507701853, "grad_norm": 6.36302375793457, "learning_rate": 0.00019914040197199174, "loss": 6.4629, "step": 2654 }, { "epoch": 0.12671367720227655, "grad_norm": 6.179405212402344, "learning_rate": 0.0001991397476068918, "loss": 6.0301, "step": 2655 }, { "epoch": 0.1267614036343678, "grad_norm": 6.06155252456665, "learning_rate": 0.00019913909299389623, "loss": 6.2193, "step": 2656 }, { "epoch": 0.12680913006645905, "grad_norm": 6.631749629974365, "learning_rate": 0.0001991384381330068, "loss": 6.6485, "step": 2657 }, { "epoch": 0.1268568564985503, "grad_norm": 8.34609603881836, "learning_rate": 0.0001991377830242251, "loss": 6.7854, "step": 2658 }, { "epoch": 0.12690458293064155, "grad_norm": 6.579798221588135, "learning_rate": 0.00019913712766755275, "loss": 7.3782, "step": 2659 }, { "epoch": 0.12695230936273283, "grad_norm": 6.286202907562256, "learning_rate": 0.00019913647206299138, "loss": 6.2319, "step": 2660 }, { "epoch": 0.12700003579482408, "grad_norm": 6.803478240966797, "learning_rate": 0.00019913581621054267, "loss": 6.5881, "step": 2661 }, { "epoch": 0.12704776222691533, "grad_norm": 5.428170680999756, "learning_rate": 0.00019913516011020824, "loss": 6.6578, "step": 2662 }, { "epoch": 0.12709548865900658, "grad_norm": 5.8981032371521, "learning_rate": 0.0001991345037619897, "loss": 6.648, "step": 2663 }, { "epoch": 0.12714321509109783, "grad_norm": 6.136375427246094, "learning_rate": 0.00019913384716588876, "loss": 6.4674, "step": 2664 }, { "epoch": 0.12719094152318908, "grad_norm": 7.333561420440674, "learning_rate": 0.00019913319032190702, "loss": 7.6672, "step": 2665 }, { "epoch": 0.12723866795528033, "grad_norm": 6.941676616668701, "learning_rate": 0.00019913253323004609, "loss": 6.5824, "step": 2666 }, { "epoch": 0.12728639438737158, "grad_norm": 5.5567779541015625, "learning_rate": 0.00019913187589030767, "loss": 6.1721, "step": 2667 }, { "epoch": 0.12733412081946283, "grad_norm": 7.499778747558594, "learning_rate": 0.00019913121830269335, "loss": 7.4446, "step": 2668 }, { "epoch": 0.1273818472515541, "grad_norm": 8.435744285583496, "learning_rate": 0.00019913056046720485, "loss": 7.4201, "step": 2669 }, { "epoch": 0.12742957368364535, "grad_norm": 5.2992095947265625, "learning_rate": 0.0001991299023838437, "loss": 6.645, "step": 2670 }, { "epoch": 0.1274773001157366, "grad_norm": 5.7956223487854, "learning_rate": 0.0001991292440526117, "loss": 5.3825, "step": 2671 }, { "epoch": 0.12752502654782785, "grad_norm": 5.5730509757995605, "learning_rate": 0.00019912858547351038, "loss": 5.0827, "step": 2672 }, { "epoch": 0.1275727529799191, "grad_norm": 6.531050205230713, "learning_rate": 0.00019912792664654143, "loss": 6.4528, "step": 2673 }, { "epoch": 0.12762047941201035, "grad_norm": 5.897129535675049, "learning_rate": 0.00019912726757170644, "loss": 6.4735, "step": 2674 }, { "epoch": 0.1276682058441016, "grad_norm": 5.176238536834717, "learning_rate": 0.00019912660824900717, "loss": 5.9754, "step": 2675 }, { "epoch": 0.12771593227619285, "grad_norm": 5.605959415435791, "learning_rate": 0.00019912594867844516, "loss": 5.5582, "step": 2676 }, { "epoch": 0.12776365870828413, "grad_norm": 6.690275192260742, "learning_rate": 0.0001991252888600221, "loss": 6.5006, "step": 2677 }, { "epoch": 0.12781138514037538, "grad_norm": 8.319185256958008, "learning_rate": 0.00019912462879373966, "loss": 6.3769, "step": 2678 }, { "epoch": 0.12785911157246663, "grad_norm": 5.938503265380859, "learning_rate": 0.00019912396847959947, "loss": 5.9018, "step": 2679 }, { "epoch": 0.12790683800455788, "grad_norm": 6.656704902648926, "learning_rate": 0.0001991233079176032, "loss": 4.3682, "step": 2680 }, { "epoch": 0.12795456443664913, "grad_norm": 6.898440361022949, "learning_rate": 0.00019912264710775243, "loss": 7.5726, "step": 2681 }, { "epoch": 0.12800229086874038, "grad_norm": 5.528750419616699, "learning_rate": 0.0001991219860500489, "loss": 4.7983, "step": 2682 }, { "epoch": 0.12805001730083163, "grad_norm": 6.4117817878723145, "learning_rate": 0.00019912132474449423, "loss": 6.4503, "step": 2683 }, { "epoch": 0.12809774373292288, "grad_norm": 6.662499904632568, "learning_rate": 0.00019912066319109008, "loss": 7.6737, "step": 2684 }, { "epoch": 0.12814547016501415, "grad_norm": 6.276695728302002, "learning_rate": 0.0001991200013898381, "loss": 6.7972, "step": 2685 }, { "epoch": 0.1281931965971054, "grad_norm": 6.167166709899902, "learning_rate": 0.00019911933934073993, "loss": 6.0611, "step": 2686 }, { "epoch": 0.12824092302919665, "grad_norm": 6.943721294403076, "learning_rate": 0.00019911867704379722, "loss": 6.0425, "step": 2687 }, { "epoch": 0.1282886494612879, "grad_norm": 6.147932052612305, "learning_rate": 0.0001991180144990117, "loss": 5.8839, "step": 2688 }, { "epoch": 0.12833637589337915, "grad_norm": 6.889720439910889, "learning_rate": 0.0001991173517063849, "loss": 5.5848, "step": 2689 }, { "epoch": 0.1283841023254704, "grad_norm": 5.700843811035156, "learning_rate": 0.00019911668866591858, "loss": 5.2723, "step": 2690 }, { "epoch": 0.12843182875756165, "grad_norm": 6.69716215133667, "learning_rate": 0.0001991160253776144, "loss": 6.8941, "step": 2691 }, { "epoch": 0.1284795551896529, "grad_norm": 9.107110977172852, "learning_rate": 0.00019911536184147397, "loss": 8.4972, "step": 2692 }, { "epoch": 0.12852728162174415, "grad_norm": 6.4476776123046875, "learning_rate": 0.00019911469805749895, "loss": 7.1879, "step": 2693 }, { "epoch": 0.12857500805383543, "grad_norm": 6.365779399871826, "learning_rate": 0.000199114034025691, "loss": 5.6422, "step": 2694 }, { "epoch": 0.12862273448592668, "grad_norm": 5.086087703704834, "learning_rate": 0.00019911336974605179, "loss": 4.4789, "step": 2695 }, { "epoch": 0.12867046091801793, "grad_norm": 4.119086265563965, "learning_rate": 0.000199112705218583, "loss": 4.5278, "step": 2696 }, { "epoch": 0.12871818735010918, "grad_norm": 5.206000804901123, "learning_rate": 0.0001991120404432863, "loss": 5.2369, "step": 2697 }, { "epoch": 0.12876591378220043, "grad_norm": 4.440513610839844, "learning_rate": 0.00019911137542016333, "loss": 5.9666, "step": 2698 }, { "epoch": 0.12881364021429167, "grad_norm": 8.883622169494629, "learning_rate": 0.0001991107101492157, "loss": 7.3967, "step": 2699 }, { "epoch": 0.12886136664638292, "grad_norm": 6.536487579345703, "learning_rate": 0.00019911004463044516, "loss": 6.1909, "step": 2700 }, { "epoch": 0.12890909307847417, "grad_norm": 4.277031898498535, "learning_rate": 0.00019910937886385334, "loss": 4.1263, "step": 2701 }, { "epoch": 0.12895681951056545, "grad_norm": 5.570382118225098, "learning_rate": 0.00019910871284944192, "loss": 6.8265, "step": 2702 }, { "epoch": 0.1290045459426567, "grad_norm": 4.961894989013672, "learning_rate": 0.0001991080465872125, "loss": 4.9882, "step": 2703 }, { "epoch": 0.12905227237474795, "grad_norm": 5.48018741607666, "learning_rate": 0.00019910738007716683, "loss": 6.3094, "step": 2704 }, { "epoch": 0.1290999988068392, "grad_norm": 5.8967671394348145, "learning_rate": 0.00019910671331930654, "loss": 7.0458, "step": 2705 }, { "epoch": 0.12914772523893045, "grad_norm": 7.660339832305908, "learning_rate": 0.00019910604631363327, "loss": 7.6331, "step": 2706 }, { "epoch": 0.1291954516710217, "grad_norm": 5.007476329803467, "learning_rate": 0.00019910537906014873, "loss": 6.0415, "step": 2707 }, { "epoch": 0.12924317810311295, "grad_norm": 7.231809616088867, "learning_rate": 0.00019910471155885459, "loss": 6.9033, "step": 2708 }, { "epoch": 0.1292909045352042, "grad_norm": 7.8866753578186035, "learning_rate": 0.0001991040438097525, "loss": 8.1581, "step": 2709 }, { "epoch": 0.12933863096729548, "grad_norm": 4.540760517120361, "learning_rate": 0.00019910337581284415, "loss": 3.9487, "step": 2710 }, { "epoch": 0.12938635739938673, "grad_norm": 6.535691261291504, "learning_rate": 0.0001991027075681312, "loss": 6.8769, "step": 2711 }, { "epoch": 0.12943408383147798, "grad_norm": 7.321287631988525, "learning_rate": 0.00019910203907561527, "loss": 7.6307, "step": 2712 }, { "epoch": 0.12948181026356922, "grad_norm": 8.327848434448242, "learning_rate": 0.0001991013703352981, "loss": 6.3343, "step": 2713 }, { "epoch": 0.12952953669566047, "grad_norm": 4.456166744232178, "learning_rate": 0.00019910070134718133, "loss": 4.3715, "step": 2714 }, { "epoch": 0.12957726312775172, "grad_norm": 7.4280781745910645, "learning_rate": 0.00019910003211126668, "loss": 7.9487, "step": 2715 }, { "epoch": 0.12962498955984297, "grad_norm": 6.105531215667725, "learning_rate": 0.00019909936262755572, "loss": 5.3206, "step": 2716 }, { "epoch": 0.12967271599193422, "grad_norm": 6.178251266479492, "learning_rate": 0.00019909869289605023, "loss": 7.429, "step": 2717 }, { "epoch": 0.12972044242402547, "grad_norm": 7.50347375869751, "learning_rate": 0.00019909802291675187, "loss": 6.884, "step": 2718 }, { "epoch": 0.12976816885611675, "grad_norm": 8.008402824401855, "learning_rate": 0.00019909735268966222, "loss": 6.4672, "step": 2719 }, { "epoch": 0.129815895288208, "grad_norm": 6.246631622314453, "learning_rate": 0.0001990966822147831, "loss": 5.7752, "step": 2720 }, { "epoch": 0.12986362172029925, "grad_norm": 5.764650821685791, "learning_rate": 0.00019909601149211605, "loss": 6.088, "step": 2721 }, { "epoch": 0.1299113481523905, "grad_norm": 6.757874488830566, "learning_rate": 0.00019909534052166286, "loss": 6.4215, "step": 2722 }, { "epoch": 0.12995907458448175, "grad_norm": 7.595560073852539, "learning_rate": 0.0001990946693034251, "loss": 7.4822, "step": 2723 }, { "epoch": 0.130006801016573, "grad_norm": 6.582478046417236, "learning_rate": 0.00019909399783740454, "loss": 6.6361, "step": 2724 }, { "epoch": 0.13005452744866425, "grad_norm": 5.431997299194336, "learning_rate": 0.00019909332612360285, "loss": 6.3359, "step": 2725 }, { "epoch": 0.1301022538807555, "grad_norm": 5.795412540435791, "learning_rate": 0.00019909265416202164, "loss": 7.2463, "step": 2726 }, { "epoch": 0.13014998031284677, "grad_norm": 6.594300270080566, "learning_rate": 0.00019909198195266266, "loss": 5.7899, "step": 2727 }, { "epoch": 0.13019770674493802, "grad_norm": 6.408435344696045, "learning_rate": 0.00019909130949552755, "loss": 7.3634, "step": 2728 }, { "epoch": 0.13024543317702927, "grad_norm": 6.5589919090271, "learning_rate": 0.000199090636790618, "loss": 6.6704, "step": 2729 }, { "epoch": 0.13029315960912052, "grad_norm": 6.496981620788574, "learning_rate": 0.0001990899638379357, "loss": 6.1406, "step": 2730 }, { "epoch": 0.13034088604121177, "grad_norm": 4.503755569458008, "learning_rate": 0.00019908929063748235, "loss": 5.0579, "step": 2731 }, { "epoch": 0.13038861247330302, "grad_norm": 7.117122173309326, "learning_rate": 0.0001990886171892596, "loss": 7.4509, "step": 2732 }, { "epoch": 0.13043633890539427, "grad_norm": 6.266332149505615, "learning_rate": 0.00019908794349326913, "loss": 6.4409, "step": 2733 }, { "epoch": 0.13048406533748552, "grad_norm": 6.61918306350708, "learning_rate": 0.00019908726954951267, "loss": 7.2602, "step": 2734 }, { "epoch": 0.1305317917695768, "grad_norm": 6.399169445037842, "learning_rate": 0.00019908659535799187, "loss": 7.1302, "step": 2735 }, { "epoch": 0.13057951820166805, "grad_norm": 7.0500898361206055, "learning_rate": 0.00019908592091870842, "loss": 7.6507, "step": 2736 }, { "epoch": 0.1306272446337593, "grad_norm": 5.893949508666992, "learning_rate": 0.00019908524623166401, "loss": 6.7635, "step": 2737 }, { "epoch": 0.13067497106585055, "grad_norm": 6.369676113128662, "learning_rate": 0.0001990845712968603, "loss": 7.2699, "step": 2738 }, { "epoch": 0.1307226974979418, "grad_norm": 6.691352844238281, "learning_rate": 0.00019908389611429905, "loss": 7.8422, "step": 2739 }, { "epoch": 0.13077042393003305, "grad_norm": 6.010880470275879, "learning_rate": 0.00019908322068398188, "loss": 7.8595, "step": 2740 }, { "epoch": 0.1308181503621243, "grad_norm": 6.41002893447876, "learning_rate": 0.0001990825450059105, "loss": 6.4199, "step": 2741 }, { "epoch": 0.13086587679421555, "grad_norm": 6.322947025299072, "learning_rate": 0.0001990818690800866, "loss": 5.793, "step": 2742 }, { "epoch": 0.1309136032263068, "grad_norm": 6.068185329437256, "learning_rate": 0.0001990811929065119, "loss": 5.8207, "step": 2743 }, { "epoch": 0.13096132965839807, "grad_norm": 4.704338073730469, "learning_rate": 0.00019908051648518803, "loss": 5.3277, "step": 2744 }, { "epoch": 0.13100905609048932, "grad_norm": 5.012578964233398, "learning_rate": 0.00019907983981611672, "loss": 5.0888, "step": 2745 }, { "epoch": 0.13105678252258057, "grad_norm": 6.814549922943115, "learning_rate": 0.00019907916289929966, "loss": 7.5414, "step": 2746 }, { "epoch": 0.13110450895467182, "grad_norm": 5.401822090148926, "learning_rate": 0.00019907848573473853, "loss": 5.8487, "step": 2747 }, { "epoch": 0.13115223538676307, "grad_norm": 6.063152313232422, "learning_rate": 0.00019907780832243506, "loss": 7.1534, "step": 2748 }, { "epoch": 0.13119996181885432, "grad_norm": 4.4824395179748535, "learning_rate": 0.00019907713066239088, "loss": 4.1908, "step": 2749 }, { "epoch": 0.13124768825094557, "grad_norm": 7.166471004486084, "learning_rate": 0.00019907645275460775, "loss": 7.4206, "step": 2750 }, { "epoch": 0.13129541468303682, "grad_norm": 8.25527286529541, "learning_rate": 0.0001990757745990873, "loss": 8.5766, "step": 2751 }, { "epoch": 0.1313431411151281, "grad_norm": 5.488870143890381, "learning_rate": 0.0001990750961958313, "loss": 5.4217, "step": 2752 }, { "epoch": 0.13139086754721935, "grad_norm": 7.472160339355469, "learning_rate": 0.00019907441754484138, "loss": 7.9112, "step": 2753 }, { "epoch": 0.1314385939793106, "grad_norm": 6.098803997039795, "learning_rate": 0.00019907373864611925, "loss": 5.8821, "step": 2754 }, { "epoch": 0.13148632041140185, "grad_norm": 5.743438243865967, "learning_rate": 0.00019907305949966664, "loss": 6.1745, "step": 2755 }, { "epoch": 0.1315340468434931, "grad_norm": 5.455843448638916, "learning_rate": 0.00019907238010548523, "loss": 5.7785, "step": 2756 }, { "epoch": 0.13158177327558435, "grad_norm": 5.50465726852417, "learning_rate": 0.00019907170046357673, "loss": 5.6001, "step": 2757 }, { "epoch": 0.1316294997076756, "grad_norm": 6.299639701843262, "learning_rate": 0.0001990710205739428, "loss": 8.1939, "step": 2758 }, { "epoch": 0.13167722613976685, "grad_norm": 6.706847667694092, "learning_rate": 0.0001990703404365852, "loss": 6.7525, "step": 2759 }, { "epoch": 0.13172495257185812, "grad_norm": 8.025195121765137, "learning_rate": 0.00019906966005150557, "loss": 7.904, "step": 2760 }, { "epoch": 0.13177267900394937, "grad_norm": 9.603436470031738, "learning_rate": 0.00019906897941870564, "loss": 6.8963, "step": 2761 }, { "epoch": 0.13182040543604062, "grad_norm": 6.5206217765808105, "learning_rate": 0.00019906829853818714, "loss": 6.6906, "step": 2762 }, { "epoch": 0.13186813186813187, "grad_norm": 5.41235876083374, "learning_rate": 0.00019906761740995173, "loss": 6.1441, "step": 2763 }, { "epoch": 0.13191585830022312, "grad_norm": 5.746955871582031, "learning_rate": 0.00019906693603400112, "loss": 5.5281, "step": 2764 }, { "epoch": 0.13196358473231437, "grad_norm": 6.709540843963623, "learning_rate": 0.000199066254410337, "loss": 6.3966, "step": 2765 }, { "epoch": 0.13201131116440562, "grad_norm": 5.8970417976379395, "learning_rate": 0.00019906557253896115, "loss": 6.9662, "step": 2766 }, { "epoch": 0.13205903759649687, "grad_norm": 5.076374053955078, "learning_rate": 0.00019906489041987517, "loss": 5.1472, "step": 2767 }, { "epoch": 0.13210676402858812, "grad_norm": 6.5956902503967285, "learning_rate": 0.00019906420805308083, "loss": 7.3836, "step": 2768 }, { "epoch": 0.1321544904606794, "grad_norm": 5.3043928146362305, "learning_rate": 0.00019906352543857983, "loss": 6.8485, "step": 2769 }, { "epoch": 0.13220221689277065, "grad_norm": 7.224258899688721, "learning_rate": 0.00019906284257637386, "loss": 6.2999, "step": 2770 }, { "epoch": 0.1322499433248619, "grad_norm": 4.787478923797607, "learning_rate": 0.00019906215946646465, "loss": 4.6901, "step": 2771 }, { "epoch": 0.13229766975695315, "grad_norm": 5.938333511352539, "learning_rate": 0.0001990614761088539, "loss": 7.4666, "step": 2772 }, { "epoch": 0.1323453961890444, "grad_norm": 6.651096820831299, "learning_rate": 0.0001990607925035433, "loss": 7.231, "step": 2773 }, { "epoch": 0.13239312262113564, "grad_norm": 4.987978935241699, "learning_rate": 0.00019906010865053454, "loss": 5.4362, "step": 2774 }, { "epoch": 0.1324408490532269, "grad_norm": 6.788287162780762, "learning_rate": 0.0001990594245498294, "loss": 7.0538, "step": 2775 }, { "epoch": 0.13248857548531814, "grad_norm": 5.706051826477051, "learning_rate": 0.00019905874020142953, "loss": 6.9948, "step": 2776 }, { "epoch": 0.13253630191740942, "grad_norm": 7.513382434844971, "learning_rate": 0.0001990580556053367, "loss": 6.8821, "step": 2777 }, { "epoch": 0.13258402834950067, "grad_norm": 7.37601375579834, "learning_rate": 0.00019905737076155258, "loss": 8.9933, "step": 2778 }, { "epoch": 0.13263175478159192, "grad_norm": 4.514930248260498, "learning_rate": 0.00019905668567007887, "loss": 4.4424, "step": 2779 }, { "epoch": 0.13267948121368317, "grad_norm": 6.982491970062256, "learning_rate": 0.0001990560003309173, "loss": 7.462, "step": 2780 }, { "epoch": 0.13272720764577442, "grad_norm": 7.289805889129639, "learning_rate": 0.0001990553147440696, "loss": 6.7003, "step": 2781 }, { "epoch": 0.13277493407786567, "grad_norm": 5.799755573272705, "learning_rate": 0.00019905462890953745, "loss": 6.4667, "step": 2782 }, { "epoch": 0.13282266050995692, "grad_norm": 5.055026531219482, "learning_rate": 0.0001990539428273226, "loss": 5.4229, "step": 2783 }, { "epoch": 0.13287038694204817, "grad_norm": 4.6647844314575195, "learning_rate": 0.00019905325649742674, "loss": 4.7695, "step": 2784 }, { "epoch": 0.13291811337413945, "grad_norm": 7.971372604370117, "learning_rate": 0.00019905256991985162, "loss": 6.1439, "step": 2785 }, { "epoch": 0.1329658398062307, "grad_norm": 6.43125581741333, "learning_rate": 0.00019905188309459891, "loss": 6.2026, "step": 2786 }, { "epoch": 0.13301356623832195, "grad_norm": 4.872565269470215, "learning_rate": 0.00019905119602167036, "loss": 4.4752, "step": 2787 }, { "epoch": 0.1330612926704132, "grad_norm": 5.977827548980713, "learning_rate": 0.00019905050870106767, "loss": 5.6266, "step": 2788 }, { "epoch": 0.13310901910250444, "grad_norm": 7.002156734466553, "learning_rate": 0.0001990498211327926, "loss": 7.0112, "step": 2789 }, { "epoch": 0.1331567455345957, "grad_norm": 6.144350528717041, "learning_rate": 0.0001990491333168468, "loss": 6.1978, "step": 2790 }, { "epoch": 0.13320447196668694, "grad_norm": 7.993871212005615, "learning_rate": 0.000199048445253232, "loss": 6.4143, "step": 2791 }, { "epoch": 0.1332521983987782, "grad_norm": 6.021445274353027, "learning_rate": 0.00019904775694195, "loss": 5.4286, "step": 2792 }, { "epoch": 0.13329992483086944, "grad_norm": 5.774000644683838, "learning_rate": 0.00019904706838300243, "loss": 5.5089, "step": 2793 }, { "epoch": 0.13334765126296072, "grad_norm": 6.7085185050964355, "learning_rate": 0.00019904637957639108, "loss": 7.9083, "step": 2794 }, { "epoch": 0.13339537769505197, "grad_norm": 7.979789733886719, "learning_rate": 0.00019904569052211764, "loss": 6.8796, "step": 2795 }, { "epoch": 0.13344310412714322, "grad_norm": 9.78246784210205, "learning_rate": 0.0001990450012201838, "loss": 9.3304, "step": 2796 }, { "epoch": 0.13349083055923447, "grad_norm": 5.552200794219971, "learning_rate": 0.00019904431167059135, "loss": 5.1558, "step": 2797 }, { "epoch": 0.13353855699132572, "grad_norm": 6.368596076965332, "learning_rate": 0.00019904362187334198, "loss": 6.2315, "step": 2798 }, { "epoch": 0.13358628342341697, "grad_norm": 4.584041118621826, "learning_rate": 0.00019904293182843738, "loss": 5.0416, "step": 2799 }, { "epoch": 0.13363400985550822, "grad_norm": 6.690604209899902, "learning_rate": 0.00019904224153587935, "loss": 6.5086, "step": 2800 }, { "epoch": 0.13363400985550822, "eval_loss": 1.6191966533660889, "eval_runtime": 96.5269, "eval_samples_per_second": 8.733, "eval_steps_per_second": 4.372, "step": 2800 } ], "logging_steps": 1, "max_steps": 62856, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.596245419851776e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }