{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.999656605198997, "eval_steps": 1, "global_step": 218400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004578597346702838, "grad_norm": 1.834977626800537, "learning_rate": 4.578754578754579e-07, "loss": 0.7024, "step": 100 }, { "epoch": 0.009157194693405675, "grad_norm": 1.148632287979126, "learning_rate": 9.157509157509158e-07, "loss": 0.6769, "step": 200 }, { "epoch": 0.013735792040108512, "grad_norm": 0.9439795613288879, "learning_rate": 1.3736263736263736e-06, "loss": 0.6151, "step": 300 }, { "epoch": 0.01831438938681135, "grad_norm": 0.7975717186927795, "learning_rate": 1.8315018315018316e-06, "loss": 0.5009, "step": 400 }, { "epoch": 0.022892986733514187, "grad_norm": 0.6828369498252869, "learning_rate": 2.2893772893772894e-06, "loss": 0.4383, "step": 500 }, { "epoch": 0.027471584080217024, "grad_norm": 0.7334387302398682, "learning_rate": 2.747252747252747e-06, "loss": 0.3948, "step": 600 }, { "epoch": 0.032050181426919865, "grad_norm": 0.5922483205795288, "learning_rate": 3.205128205128205e-06, "loss": 0.3682, "step": 700 }, { "epoch": 0.0366287787736227, "grad_norm": 0.42683616280555725, "learning_rate": 3.663003663003663e-06, "loss": 0.3471, "step": 800 }, { "epoch": 0.04120737612032554, "grad_norm": 0.798675537109375, "learning_rate": 4.120879120879121e-06, "loss": 0.3391, "step": 900 }, { "epoch": 0.045785973467028375, "grad_norm": 0.6987248659133911, "learning_rate": 4.578754578754579e-06, "loss": 0.3213, "step": 1000 }, { "epoch": 0.05036457081373121, "grad_norm": 0.697691798210144, "learning_rate": 5.036630036630037e-06, "loss": 0.3021, "step": 1100 }, { "epoch": 0.05494316816043405, "grad_norm": 1.0107570886611938, "learning_rate": 5.494505494505494e-06, "loss": 0.2854, "step": 1200 }, { "epoch": 0.059521765507136885, "grad_norm": 0.9664294719696045, "learning_rate": 5.9523809523809525e-06, "loss": 0.2712, "step": 1300 }, { "epoch": 0.06410036285383973, "grad_norm": 0.8629726767539978, "learning_rate": 6.41025641025641e-06, "loss": 0.2591, "step": 1400 }, { "epoch": 0.06867896020054257, "grad_norm": 1.2182508707046509, "learning_rate": 6.868131868131869e-06, "loss": 0.2523, "step": 1500 }, { "epoch": 0.0732575575472454, "grad_norm": 0.8369842171669006, "learning_rate": 7.326007326007326e-06, "loss": 0.2498, "step": 1600 }, { "epoch": 0.07783615489394824, "grad_norm": 1.195522427558899, "learning_rate": 7.783882783882785e-06, "loss": 0.2345, "step": 1700 }, { "epoch": 0.08241475224065108, "grad_norm": 1.7609663009643555, "learning_rate": 8.241758241758243e-06, "loss": 0.226, "step": 1800 }, { "epoch": 0.08699334958735391, "grad_norm": 1.2075083255767822, "learning_rate": 8.6996336996337e-06, "loss": 0.2187, "step": 1900 }, { "epoch": 0.09157194693405675, "grad_norm": 1.0836577415466309, "learning_rate": 9.157509157509158e-06, "loss": 0.2121, "step": 2000 }, { "epoch": 0.09615054428075959, "grad_norm": 0.962322473526001, "learning_rate": 9.615384615384616e-06, "loss": 0.1984, "step": 2100 }, { "epoch": 0.10072914162746242, "grad_norm": 0.6929520964622498, "learning_rate": 1.0073260073260074e-05, "loss": 0.1949, "step": 2200 }, { "epoch": 0.10530773897416526, "grad_norm": 1.0407646894454956, "learning_rate": 1.0531135531135532e-05, "loss": 0.1968, "step": 2300 }, { "epoch": 0.1098863363208681, "grad_norm": 1.3924702405929565, "learning_rate": 1.0989010989010989e-05, "loss": 0.1932, "step": 2400 }, { "epoch": 0.11446493366757093, "grad_norm": 1.2128703594207764, "learning_rate": 1.1446886446886447e-05, "loss": 0.1855, "step": 2500 }, { "epoch": 0.11904353101427377, "grad_norm": 1.8191227912902832, "learning_rate": 1.1904761904761905e-05, "loss": 0.1866, "step": 2600 }, { "epoch": 0.12362212836097662, "grad_norm": 1.528939127922058, "learning_rate": 1.2362637362637363e-05, "loss": 0.1798, "step": 2700 }, { "epoch": 0.12820072570767946, "grad_norm": 1.4381574392318726, "learning_rate": 1.282051282051282e-05, "loss": 0.1788, "step": 2800 }, { "epoch": 0.13277932305438228, "grad_norm": 1.9535735845565796, "learning_rate": 1.327838827838828e-05, "loss": 0.1751, "step": 2900 }, { "epoch": 0.13735792040108513, "grad_norm": 1.095574140548706, "learning_rate": 1.3736263736263738e-05, "loss": 0.1745, "step": 3000 }, { "epoch": 0.14193651774778795, "grad_norm": 1.4498175382614136, "learning_rate": 1.4194139194139194e-05, "loss": 0.1654, "step": 3100 }, { "epoch": 0.1465151150944908, "grad_norm": 1.274528980255127, "learning_rate": 1.4652014652014653e-05, "loss": 0.1759, "step": 3200 }, { "epoch": 0.15109371244119363, "grad_norm": 2.9102790355682373, "learning_rate": 1.510989010989011e-05, "loss": 0.1663, "step": 3300 }, { "epoch": 0.15567230978789648, "grad_norm": 0.8863092660903931, "learning_rate": 1.556776556776557e-05, "loss": 0.1613, "step": 3400 }, { "epoch": 0.1602509071345993, "grad_norm": 1.825391411781311, "learning_rate": 1.602564102564103e-05, "loss": 0.1655, "step": 3500 }, { "epoch": 0.16482950448130215, "grad_norm": 1.2341893911361694, "learning_rate": 1.6483516483516486e-05, "loss": 0.1532, "step": 3600 }, { "epoch": 0.169408101828005, "grad_norm": 1.6574805974960327, "learning_rate": 1.6941391941391942e-05, "loss": 0.1485, "step": 3700 }, { "epoch": 0.17398669917470783, "grad_norm": 1.424926996231079, "learning_rate": 1.73992673992674e-05, "loss": 0.1514, "step": 3800 }, { "epoch": 0.17856529652141068, "grad_norm": 1.5658457279205322, "learning_rate": 1.785714285714286e-05, "loss": 0.1532, "step": 3900 }, { "epoch": 0.1831438938681135, "grad_norm": 2.2447550296783447, "learning_rate": 1.8315018315018315e-05, "loss": 0.1536, "step": 4000 }, { "epoch": 0.18772249121481635, "grad_norm": 1.8312195539474487, "learning_rate": 1.8772893772893775e-05, "loss": 0.152, "step": 4100 }, { "epoch": 0.19230108856151917, "grad_norm": 4.884443283081055, "learning_rate": 1.923076923076923e-05, "loss": 0.1423, "step": 4200 }, { "epoch": 0.19687968590822202, "grad_norm": 2.421905994415283, "learning_rate": 1.9688644688644688e-05, "loss": 0.1493, "step": 4300 }, { "epoch": 0.20145828325492485, "grad_norm": 1.5298246145248413, "learning_rate": 2.0146520146520148e-05, "loss": 0.1399, "step": 4400 }, { "epoch": 0.2060368806016277, "grad_norm": 1.8627902269363403, "learning_rate": 2.0604395604395604e-05, "loss": 0.1418, "step": 4500 }, { "epoch": 0.21061547794833052, "grad_norm": 1.0591548681259155, "learning_rate": 2.1062271062271064e-05, "loss": 0.146, "step": 4600 }, { "epoch": 0.21519407529503337, "grad_norm": 2.3305251598358154, "learning_rate": 2.152014652014652e-05, "loss": 0.1394, "step": 4700 }, { "epoch": 0.2197726726417362, "grad_norm": 2.5741324424743652, "learning_rate": 2.1978021978021977e-05, "loss": 0.1357, "step": 4800 }, { "epoch": 0.22435126998843904, "grad_norm": 1.3497207164764404, "learning_rate": 2.2435897435897437e-05, "loss": 0.1293, "step": 4900 }, { "epoch": 0.22892986733514187, "grad_norm": 1.5644819736480713, "learning_rate": 2.2893772893772894e-05, "loss": 0.132, "step": 5000 }, { "epoch": 0.23350846468184472, "grad_norm": 1.2510719299316406, "learning_rate": 2.3351648351648354e-05, "loss": 0.1266, "step": 5100 }, { "epoch": 0.23808706202854754, "grad_norm": 2.4705810546875, "learning_rate": 2.380952380952381e-05, "loss": 0.1334, "step": 5200 }, { "epoch": 0.2426656593752504, "grad_norm": 2.0139317512512207, "learning_rate": 2.4267399267399267e-05, "loss": 0.1336, "step": 5300 }, { "epoch": 0.24724425672195324, "grad_norm": 1.2344926595687866, "learning_rate": 2.4725274725274727e-05, "loss": 0.1309, "step": 5400 }, { "epoch": 0.25182285406865607, "grad_norm": 1.8490768671035767, "learning_rate": 2.5183150183150183e-05, "loss": 0.1337, "step": 5500 }, { "epoch": 0.2564014514153589, "grad_norm": 1.2988219261169434, "learning_rate": 2.564102564102564e-05, "loss": 0.1373, "step": 5600 }, { "epoch": 0.26098004876206177, "grad_norm": 1.7260547876358032, "learning_rate": 2.6098901098901103e-05, "loss": 0.1199, "step": 5700 }, { "epoch": 0.26555864610876456, "grad_norm": 2.653820037841797, "learning_rate": 2.655677655677656e-05, "loss": 0.1175, "step": 5800 }, { "epoch": 0.2701372434554674, "grad_norm": 2.190546989440918, "learning_rate": 2.7014652014652016e-05, "loss": 0.1216, "step": 5900 }, { "epoch": 0.27471584080217026, "grad_norm": 1.3163684606552124, "learning_rate": 2.7472527472527476e-05, "loss": 0.127, "step": 6000 }, { "epoch": 0.2792944381488731, "grad_norm": 2.4772284030914307, "learning_rate": 2.7930402930402932e-05, "loss": 0.1218, "step": 6100 }, { "epoch": 0.2838730354955759, "grad_norm": 2.5586929321289062, "learning_rate": 2.838827838827839e-05, "loss": 0.1258, "step": 6200 }, { "epoch": 0.28845163284227876, "grad_norm": 1.8947139978408813, "learning_rate": 2.8846153846153845e-05, "loss": 0.1242, "step": 6300 }, { "epoch": 0.2930302301889816, "grad_norm": 2.729238271713257, "learning_rate": 2.9304029304029305e-05, "loss": 0.1183, "step": 6400 }, { "epoch": 0.29760882753568446, "grad_norm": 1.338982343673706, "learning_rate": 2.9761904761904762e-05, "loss": 0.1203, "step": 6500 }, { "epoch": 0.30218742488238726, "grad_norm": 1.6393356323242188, "learning_rate": 3.021978021978022e-05, "loss": 0.1173, "step": 6600 }, { "epoch": 0.3067660222290901, "grad_norm": 2.4386088848114014, "learning_rate": 3.067765567765568e-05, "loss": 0.1262, "step": 6700 }, { "epoch": 0.31134461957579296, "grad_norm": 1.6236072778701782, "learning_rate": 3.113553113553114e-05, "loss": 0.1163, "step": 6800 }, { "epoch": 0.3159232169224958, "grad_norm": 1.6855531930923462, "learning_rate": 3.1593406593406595e-05, "loss": 0.1116, "step": 6900 }, { "epoch": 0.3205018142691986, "grad_norm": 0.9769238233566284, "learning_rate": 3.205128205128206e-05, "loss": 0.1152, "step": 7000 }, { "epoch": 0.32508041161590145, "grad_norm": 2.372692823410034, "learning_rate": 3.2509157509157515e-05, "loss": 0.1148, "step": 7100 }, { "epoch": 0.3296590089626043, "grad_norm": 1.6294013261795044, "learning_rate": 3.296703296703297e-05, "loss": 0.1149, "step": 7200 }, { "epoch": 0.33423760630930716, "grad_norm": 1.4730180501937866, "learning_rate": 3.342490842490843e-05, "loss": 0.113, "step": 7300 }, { "epoch": 0.33881620365601, "grad_norm": 1.435680866241455, "learning_rate": 3.3882783882783884e-05, "loss": 0.1084, "step": 7400 }, { "epoch": 0.3433948010027128, "grad_norm": 1.2709417343139648, "learning_rate": 3.434065934065934e-05, "loss": 0.1076, "step": 7500 }, { "epoch": 0.34797339834941565, "grad_norm": 2.1665501594543457, "learning_rate": 3.47985347985348e-05, "loss": 0.1074, "step": 7600 }, { "epoch": 0.3525519956961185, "grad_norm": 1.0768879652023315, "learning_rate": 3.525641025641026e-05, "loss": 0.1124, "step": 7700 }, { "epoch": 0.35713059304282135, "grad_norm": 2.1648874282836914, "learning_rate": 3.571428571428572e-05, "loss": 0.1139, "step": 7800 }, { "epoch": 0.36170919038952415, "grad_norm": 1.0463404655456543, "learning_rate": 3.6172161172161173e-05, "loss": 0.1084, "step": 7900 }, { "epoch": 0.366287787736227, "grad_norm": 2.0209906101226807, "learning_rate": 3.663003663003663e-05, "loss": 0.1143, "step": 8000 }, { "epoch": 0.37086638508292985, "grad_norm": 1.6264885663986206, "learning_rate": 3.708791208791209e-05, "loss": 0.1117, "step": 8100 }, { "epoch": 0.3754449824296327, "grad_norm": 1.8169121742248535, "learning_rate": 3.754578754578755e-05, "loss": 0.1055, "step": 8200 }, { "epoch": 0.3800235797763355, "grad_norm": 1.3127943277359009, "learning_rate": 3.8003663003663006e-05, "loss": 0.1135, "step": 8300 }, { "epoch": 0.38460217712303835, "grad_norm": 1.2721083164215088, "learning_rate": 3.846153846153846e-05, "loss": 0.1144, "step": 8400 }, { "epoch": 0.3891807744697412, "grad_norm": 1.393925666809082, "learning_rate": 3.891941391941392e-05, "loss": 0.106, "step": 8500 }, { "epoch": 0.39375937181644405, "grad_norm": 1.0821542739868164, "learning_rate": 3.9377289377289376e-05, "loss": 0.105, "step": 8600 }, { "epoch": 0.39833796916314684, "grad_norm": 1.5736069679260254, "learning_rate": 3.983516483516483e-05, "loss": 0.111, "step": 8700 }, { "epoch": 0.4029165665098497, "grad_norm": 1.8037768602371216, "learning_rate": 4.0293040293040296e-05, "loss": 0.1094, "step": 8800 }, { "epoch": 0.40749516385655254, "grad_norm": 1.1317250728607178, "learning_rate": 4.075091575091575e-05, "loss": 0.1028, "step": 8900 }, { "epoch": 0.4120737612032554, "grad_norm": 1.362167477607727, "learning_rate": 4.120879120879121e-05, "loss": 0.1087, "step": 9000 }, { "epoch": 0.41665235854995825, "grad_norm": 1.9178133010864258, "learning_rate": 4.166666666666667e-05, "loss": 0.1036, "step": 9100 }, { "epoch": 0.42123095589666104, "grad_norm": 1.3326084613800049, "learning_rate": 4.212454212454213e-05, "loss": 0.1034, "step": 9200 }, { "epoch": 0.4258095532433639, "grad_norm": 2.299654245376587, "learning_rate": 4.2582417582417585e-05, "loss": 0.0938, "step": 9300 }, { "epoch": 0.43038815059006674, "grad_norm": 1.5850861072540283, "learning_rate": 4.304029304029304e-05, "loss": 0.0991, "step": 9400 }, { "epoch": 0.4349667479367696, "grad_norm": 1.0600929260253906, "learning_rate": 4.34981684981685e-05, "loss": 0.1001, "step": 9500 }, { "epoch": 0.4395453452834724, "grad_norm": 0.8734288811683655, "learning_rate": 4.3956043956043955e-05, "loss": 0.0991, "step": 9600 }, { "epoch": 0.44412394263017524, "grad_norm": 1.5875756740570068, "learning_rate": 4.441391941391941e-05, "loss": 0.0982, "step": 9700 }, { "epoch": 0.4487025399768781, "grad_norm": 1.2083957195281982, "learning_rate": 4.4871794871794874e-05, "loss": 0.1027, "step": 9800 }, { "epoch": 0.45328113732358094, "grad_norm": 1.5730398893356323, "learning_rate": 4.532967032967033e-05, "loss": 0.1035, "step": 9900 }, { "epoch": 0.45785973467028374, "grad_norm": 1.0928138494491577, "learning_rate": 4.578754578754579e-05, "loss": 0.1017, "step": 10000 }, { "epoch": 0.4624383320169866, "grad_norm": 1.858508586883545, "learning_rate": 4.624542124542125e-05, "loss": 0.1056, "step": 10100 }, { "epoch": 0.46701692936368944, "grad_norm": 0.7009546756744385, "learning_rate": 4.670329670329671e-05, "loss": 0.1014, "step": 10200 }, { "epoch": 0.4715955267103923, "grad_norm": 1.1056081056594849, "learning_rate": 4.7161172161172164e-05, "loss": 0.0997, "step": 10300 }, { "epoch": 0.4761741240570951, "grad_norm": 1.5328575372695923, "learning_rate": 4.761904761904762e-05, "loss": 0.0971, "step": 10400 }, { "epoch": 0.48075272140379793, "grad_norm": 2.9480137825012207, "learning_rate": 4.8076923076923084e-05, "loss": 0.1004, "step": 10500 }, { "epoch": 0.4853313187505008, "grad_norm": 0.9198638796806335, "learning_rate": 4.8534798534798533e-05, "loss": 0.093, "step": 10600 }, { "epoch": 0.48990991609720363, "grad_norm": 1.3510689735412598, "learning_rate": 4.899267399267399e-05, "loss": 0.0966, "step": 10700 }, { "epoch": 0.4944885134439065, "grad_norm": 1.1206891536712646, "learning_rate": 4.945054945054945e-05, "loss": 0.1007, "step": 10800 }, { "epoch": 0.4990671107906093, "grad_norm": 1.6948041915893555, "learning_rate": 4.990842490842491e-05, "loss": 0.0904, "step": 10900 }, { "epoch": 0.5036457081373121, "grad_norm": 0.7195038199424744, "learning_rate": 5.0366300366300366e-05, "loss": 0.0894, "step": 11000 }, { "epoch": 0.508224305484015, "grad_norm": 0.9326936602592468, "learning_rate": 5.082417582417582e-05, "loss": 0.0935, "step": 11100 }, { "epoch": 0.5128029028307178, "grad_norm": 1.224360704421997, "learning_rate": 5.128205128205128e-05, "loss": 0.1011, "step": 11200 }, { "epoch": 0.5173815001774207, "grad_norm": 0.7471579313278198, "learning_rate": 5.173992673992675e-05, "loss": 0.0936, "step": 11300 }, { "epoch": 0.5219600975241235, "grad_norm": 0.8234615921974182, "learning_rate": 5.2197802197802206e-05, "loss": 0.092, "step": 11400 }, { "epoch": 0.5265386948708263, "grad_norm": 1.204841136932373, "learning_rate": 5.265567765567766e-05, "loss": 0.1006, "step": 11500 }, { "epoch": 0.5311172922175291, "grad_norm": 0.980890691280365, "learning_rate": 5.311355311355312e-05, "loss": 0.0909, "step": 11600 }, { "epoch": 0.535695889564232, "grad_norm": 0.8736656308174133, "learning_rate": 5.3571428571428575e-05, "loss": 0.0921, "step": 11700 }, { "epoch": 0.5402744869109348, "grad_norm": 1.8916438817977905, "learning_rate": 5.402930402930403e-05, "loss": 0.092, "step": 11800 }, { "epoch": 0.5448530842576377, "grad_norm": 0.49095866084098816, "learning_rate": 5.448717948717948e-05, "loss": 0.0922, "step": 11900 }, { "epoch": 0.5494316816043405, "grad_norm": 1.418338656425476, "learning_rate": 5.494505494505495e-05, "loss": 0.088, "step": 12000 }, { "epoch": 0.5540102789510434, "grad_norm": 0.6211123466491699, "learning_rate": 5.540293040293041e-05, "loss": 0.0931, "step": 12100 }, { "epoch": 0.5585888762977462, "grad_norm": 1.9046452045440674, "learning_rate": 5.5860805860805865e-05, "loss": 0.0934, "step": 12200 }, { "epoch": 0.563167473644449, "grad_norm": 0.9247643351554871, "learning_rate": 5.631868131868132e-05, "loss": 0.0889, "step": 12300 }, { "epoch": 0.5677460709911518, "grad_norm": 1.4018969535827637, "learning_rate": 5.677655677655678e-05, "loss": 0.0904, "step": 12400 }, { "epoch": 0.5723246683378547, "grad_norm": 0.510405421257019, "learning_rate": 5.7234432234432234e-05, "loss": 0.0856, "step": 12500 }, { "epoch": 0.5769032656845575, "grad_norm": 0.7951760292053223, "learning_rate": 5.769230769230769e-05, "loss": 0.0881, "step": 12600 }, { "epoch": 0.5814818630312604, "grad_norm": 1.340402364730835, "learning_rate": 5.8150183150183154e-05, "loss": 0.0869, "step": 12700 }, { "epoch": 0.5860604603779632, "grad_norm": 1.1029311418533325, "learning_rate": 5.860805860805861e-05, "loss": 0.0922, "step": 12800 }, { "epoch": 0.5906390577246661, "grad_norm": 0.9942110180854797, "learning_rate": 5.906593406593407e-05, "loss": 0.094, "step": 12900 }, { "epoch": 0.5952176550713689, "grad_norm": 0.9533814787864685, "learning_rate": 5.9523809523809524e-05, "loss": 0.0846, "step": 13000 }, { "epoch": 0.5997962524180718, "grad_norm": 0.9805833101272583, "learning_rate": 5.998168498168498e-05, "loss": 0.0889, "step": 13100 }, { "epoch": 0.6043748497647745, "grad_norm": 0.6185852885246277, "learning_rate": 6.043956043956044e-05, "loss": 0.0832, "step": 13200 }, { "epoch": 0.6089534471114774, "grad_norm": 0.508185088634491, "learning_rate": 6.089743589743589e-05, "loss": 0.0828, "step": 13300 }, { "epoch": 0.6135320444581802, "grad_norm": 0.8816813826560974, "learning_rate": 6.135531135531136e-05, "loss": 0.0907, "step": 13400 }, { "epoch": 0.6181106418048831, "grad_norm": 0.7851380109786987, "learning_rate": 6.181318681318681e-05, "loss": 0.0838, "step": 13500 }, { "epoch": 0.6226892391515859, "grad_norm": 1.2309856414794922, "learning_rate": 6.227106227106228e-05, "loss": 0.0892, "step": 13600 }, { "epoch": 0.6272678364982888, "grad_norm": 0.9368516802787781, "learning_rate": 6.272893772893773e-05, "loss": 0.0826, "step": 13700 }, { "epoch": 0.6318464338449916, "grad_norm": 2.122927188873291, "learning_rate": 6.318681318681319e-05, "loss": 0.0891, "step": 13800 }, { "epoch": 0.6364250311916945, "grad_norm": 1.421099305152893, "learning_rate": 6.364468864468864e-05, "loss": 0.0856, "step": 13900 }, { "epoch": 0.6410036285383972, "grad_norm": 1.240886926651001, "learning_rate": 6.410256410256412e-05, "loss": 0.084, "step": 14000 }, { "epoch": 0.6455822258851001, "grad_norm": 1.5990924835205078, "learning_rate": 6.456043956043957e-05, "loss": 0.08, "step": 14100 }, { "epoch": 0.6501608232318029, "grad_norm": 1.1593393087387085, "learning_rate": 6.501831501831503e-05, "loss": 0.0872, "step": 14200 }, { "epoch": 0.6547394205785058, "grad_norm": 1.4352833032608032, "learning_rate": 6.547619047619048e-05, "loss": 0.0855, "step": 14300 }, { "epoch": 0.6593180179252086, "grad_norm": 1.0805554389953613, "learning_rate": 6.593406593406594e-05, "loss": 0.076, "step": 14400 }, { "epoch": 0.6638966152719115, "grad_norm": 1.4789384603500366, "learning_rate": 6.639194139194139e-05, "loss": 0.0787, "step": 14500 }, { "epoch": 0.6684752126186143, "grad_norm": 0.5183298587799072, "learning_rate": 6.684981684981686e-05, "loss": 0.0818, "step": 14600 }, { "epoch": 0.6730538099653172, "grad_norm": 0.540027916431427, "learning_rate": 6.730769230769232e-05, "loss": 0.0821, "step": 14700 }, { "epoch": 0.67763240731202, "grad_norm": 0.4286615550518036, "learning_rate": 6.776556776556777e-05, "loss": 0.0835, "step": 14800 }, { "epoch": 0.6822110046587228, "grad_norm": 0.6089257597923279, "learning_rate": 6.822344322344323e-05, "loss": 0.0774, "step": 14900 }, { "epoch": 0.6867896020054256, "grad_norm": 1.8646626472473145, "learning_rate": 6.868131868131868e-05, "loss": 0.0768, "step": 15000 }, { "epoch": 0.6913681993521285, "grad_norm": 1.5041414499282837, "learning_rate": 6.913919413919414e-05, "loss": 0.0811, "step": 15100 }, { "epoch": 0.6959467966988313, "grad_norm": 1.2774971723556519, "learning_rate": 6.95970695970696e-05, "loss": 0.0824, "step": 15200 }, { "epoch": 0.7005253940455342, "grad_norm": 0.7839298248291016, "learning_rate": 7.005494505494506e-05, "loss": 0.0825, "step": 15300 }, { "epoch": 0.705103991392237, "grad_norm": 0.8007500767707825, "learning_rate": 7.051282051282052e-05, "loss": 0.0822, "step": 15400 }, { "epoch": 0.7096825887389399, "grad_norm": 0.9601584672927856, "learning_rate": 7.097069597069597e-05, "loss": 0.0735, "step": 15500 }, { "epoch": 0.7142611860856427, "grad_norm": 0.6283702254295349, "learning_rate": 7.142857142857143e-05, "loss": 0.0834, "step": 15600 }, { "epoch": 0.7188397834323454, "grad_norm": 0.9371336102485657, "learning_rate": 7.188644688644688e-05, "loss": 0.0805, "step": 15700 }, { "epoch": 0.7234183807790483, "grad_norm": 0.703433096408844, "learning_rate": 7.234432234432235e-05, "loss": 0.0797, "step": 15800 }, { "epoch": 0.7279969781257511, "grad_norm": 1.1103012561798096, "learning_rate": 7.28021978021978e-05, "loss": 0.0702, "step": 15900 }, { "epoch": 0.732575575472454, "grad_norm": 1.1333719491958618, "learning_rate": 7.326007326007326e-05, "loss": 0.0785, "step": 16000 }, { "epoch": 0.7371541728191568, "grad_norm": 1.4945460557937622, "learning_rate": 7.371794871794872e-05, "loss": 0.0759, "step": 16100 }, { "epoch": 0.7417327701658597, "grad_norm": 1.2516579627990723, "learning_rate": 7.417582417582419e-05, "loss": 0.0773, "step": 16200 }, { "epoch": 0.7463113675125626, "grad_norm": 0.6910843253135681, "learning_rate": 7.463369963369964e-05, "loss": 0.0791, "step": 16300 }, { "epoch": 0.7508899648592654, "grad_norm": 1.752681851387024, "learning_rate": 7.50915750915751e-05, "loss": 0.0729, "step": 16400 }, { "epoch": 0.7554685622059683, "grad_norm": 0.46465998888015747, "learning_rate": 7.554945054945055e-05, "loss": 0.0691, "step": 16500 }, { "epoch": 0.760047159552671, "grad_norm": 0.6676632165908813, "learning_rate": 7.600732600732601e-05, "loss": 0.0773, "step": 16600 }, { "epoch": 0.7646257568993738, "grad_norm": 0.5497579574584961, "learning_rate": 7.646520146520146e-05, "loss": 0.0734, "step": 16700 }, { "epoch": 0.7692043542460767, "grad_norm": 1.6269124746322632, "learning_rate": 7.692307692307693e-05, "loss": 0.075, "step": 16800 }, { "epoch": 0.7737829515927795, "grad_norm": 1.7342535257339478, "learning_rate": 7.738095238095239e-05, "loss": 0.0738, "step": 16900 }, { "epoch": 0.7783615489394824, "grad_norm": 0.5286089181900024, "learning_rate": 7.783882783882784e-05, "loss": 0.0791, "step": 17000 }, { "epoch": 0.7829401462861852, "grad_norm": 1.0948727130889893, "learning_rate": 7.82967032967033e-05, "loss": 0.074, "step": 17100 }, { "epoch": 0.7875187436328881, "grad_norm": 0.7580143809318542, "learning_rate": 7.875457875457875e-05, "loss": 0.0776, "step": 17200 }, { "epoch": 0.792097340979591, "grad_norm": 0.6144015789031982, "learning_rate": 7.921245421245422e-05, "loss": 0.0698, "step": 17300 }, { "epoch": 0.7966759383262937, "grad_norm": 1.054747462272644, "learning_rate": 7.967032967032966e-05, "loss": 0.0773, "step": 17400 }, { "epoch": 0.8012545356729965, "grad_norm": 0.7159505486488342, "learning_rate": 8.012820512820514e-05, "loss": 0.0751, "step": 17500 }, { "epoch": 0.8058331330196994, "grad_norm": 0.7566177248954773, "learning_rate": 8.058608058608059e-05, "loss": 0.0734, "step": 17600 }, { "epoch": 0.8104117303664022, "grad_norm": 0.6282426714897156, "learning_rate": 8.104395604395605e-05, "loss": 0.0778, "step": 17700 }, { "epoch": 0.8149903277131051, "grad_norm": 1.3555270433425903, "learning_rate": 8.15018315018315e-05, "loss": 0.0702, "step": 17800 }, { "epoch": 0.8195689250598079, "grad_norm": 0.43876418471336365, "learning_rate": 8.195970695970697e-05, "loss": 0.0736, "step": 17900 }, { "epoch": 0.8241475224065108, "grad_norm": 0.8096747994422913, "learning_rate": 8.241758241758242e-05, "loss": 0.0743, "step": 18000 }, { "epoch": 0.8287261197532136, "grad_norm": 0.5688252449035645, "learning_rate": 8.287545787545788e-05, "loss": 0.0701, "step": 18100 }, { "epoch": 0.8333047170999165, "grad_norm": 0.711829662322998, "learning_rate": 8.333333333333334e-05, "loss": 0.0795, "step": 18200 }, { "epoch": 0.8378833144466192, "grad_norm": 0.9951382875442505, "learning_rate": 8.37912087912088e-05, "loss": 0.0752, "step": 18300 }, { "epoch": 0.8424619117933221, "grad_norm": 1.2362946271896362, "learning_rate": 8.424908424908426e-05, "loss": 0.0726, "step": 18400 }, { "epoch": 0.8470405091400249, "grad_norm": 0.6342608332633972, "learning_rate": 8.470695970695971e-05, "loss": 0.0784, "step": 18500 }, { "epoch": 0.8516191064867278, "grad_norm": 0.4258309006690979, "learning_rate": 8.516483516483517e-05, "loss": 0.0725, "step": 18600 }, { "epoch": 0.8561977038334306, "grad_norm": 0.6683163642883301, "learning_rate": 8.562271062271062e-05, "loss": 0.0711, "step": 18700 }, { "epoch": 0.8607763011801335, "grad_norm": 0.7911510467529297, "learning_rate": 8.608058608058608e-05, "loss": 0.0683, "step": 18800 }, { "epoch": 0.8653548985268363, "grad_norm": 0.5352203845977783, "learning_rate": 8.653846153846155e-05, "loss": 0.0702, "step": 18900 }, { "epoch": 0.8699334958735392, "grad_norm": 0.850853443145752, "learning_rate": 8.6996336996337e-05, "loss": 0.0702, "step": 19000 }, { "epoch": 0.8745120932202419, "grad_norm": 0.5566896796226501, "learning_rate": 8.745421245421246e-05, "loss": 0.0764, "step": 19100 }, { "epoch": 0.8790906905669448, "grad_norm": 0.28583312034606934, "learning_rate": 8.791208791208791e-05, "loss": 0.0701, "step": 19200 }, { "epoch": 0.8836692879136476, "grad_norm": 0.4633546471595764, "learning_rate": 8.836996336996337e-05, "loss": 0.0748, "step": 19300 }, { "epoch": 0.8882478852603505, "grad_norm": 0.6778764724731445, "learning_rate": 8.882783882783882e-05, "loss": 0.0719, "step": 19400 }, { "epoch": 0.8928264826070533, "grad_norm": 0.9359253644943237, "learning_rate": 8.92857142857143e-05, "loss": 0.0729, "step": 19500 }, { "epoch": 0.8974050799537562, "grad_norm": 4.642319679260254, "learning_rate": 8.974358974358975e-05, "loss": 0.0704, "step": 19600 }, { "epoch": 0.901983677300459, "grad_norm": 1.6843513250350952, "learning_rate": 9.020146520146521e-05, "loss": 0.0703, "step": 19700 }, { "epoch": 0.9065622746471619, "grad_norm": 0.6702886819839478, "learning_rate": 9.065934065934066e-05, "loss": 0.072, "step": 19800 }, { "epoch": 0.9111408719938647, "grad_norm": 0.7958008646965027, "learning_rate": 9.111721611721613e-05, "loss": 0.0717, "step": 19900 }, { "epoch": 0.9157194693405675, "grad_norm": 0.41371116042137146, "learning_rate": 9.157509157509158e-05, "loss": 0.0701, "step": 20000 }, { "epoch": 0.9202980666872703, "grad_norm": 0.446638286113739, "learning_rate": 9.203296703296704e-05, "loss": 0.0643, "step": 20100 }, { "epoch": 0.9248766640339732, "grad_norm": 0.5474185347557068, "learning_rate": 9.24908424908425e-05, "loss": 0.0672, "step": 20200 }, { "epoch": 0.929455261380676, "grad_norm": 1.0076775550842285, "learning_rate": 9.294871794871795e-05, "loss": 0.0699, "step": 20300 }, { "epoch": 0.9340338587273789, "grad_norm": 0.45534393191337585, "learning_rate": 9.340659340659341e-05, "loss": 0.0712, "step": 20400 }, { "epoch": 0.9386124560740817, "grad_norm": 0.6854729652404785, "learning_rate": 9.386446886446886e-05, "loss": 0.0695, "step": 20500 }, { "epoch": 0.9431910534207846, "grad_norm": 1.4581429958343506, "learning_rate": 9.432234432234433e-05, "loss": 0.0676, "step": 20600 }, { "epoch": 0.9477696507674874, "grad_norm": 1.6819262504577637, "learning_rate": 9.478021978021978e-05, "loss": 0.0683, "step": 20700 }, { "epoch": 0.9523482481141902, "grad_norm": 0.8808913826942444, "learning_rate": 9.523809523809524e-05, "loss": 0.0668, "step": 20800 }, { "epoch": 0.956926845460893, "grad_norm": 0.3119984269142151, "learning_rate": 9.56959706959707e-05, "loss": 0.074, "step": 20900 }, { "epoch": 0.9615054428075959, "grad_norm": 0.6743124723434448, "learning_rate": 9.615384615384617e-05, "loss": 0.0728, "step": 21000 }, { "epoch": 0.9660840401542987, "grad_norm": 0.6196538209915161, "learning_rate": 9.661172161172162e-05, "loss": 0.0677, "step": 21100 }, { "epoch": 0.9706626375010016, "grad_norm": 0.7010948657989502, "learning_rate": 9.706959706959707e-05, "loss": 0.0716, "step": 21200 }, { "epoch": 0.9752412348477044, "grad_norm": 0.7601842880249023, "learning_rate": 9.752747252747253e-05, "loss": 0.0675, "step": 21300 }, { "epoch": 0.9798198321944073, "grad_norm": 0.5342845320701599, "learning_rate": 9.798534798534798e-05, "loss": 0.0732, "step": 21400 }, { "epoch": 0.9843984295411101, "grad_norm": 0.7880052328109741, "learning_rate": 9.844322344322346e-05, "loss": 0.0671, "step": 21500 }, { "epoch": 0.988977026887813, "grad_norm": 0.690728485584259, "learning_rate": 9.89010989010989e-05, "loss": 0.069, "step": 21600 }, { "epoch": 0.9935556242345157, "grad_norm": 0.6646633148193359, "learning_rate": 9.935897435897437e-05, "loss": 0.0667, "step": 21700 }, { "epoch": 0.9981342215812186, "grad_norm": 1.2037309408187866, "learning_rate": 9.981684981684982e-05, "loss": 0.0683, "step": 21800 }, { "epoch": 0.9999656605198998, "eval_loss": 0.14296908676624298, "eval_runtime": 256.7574, "eval_samples_per_second": 21.421, "eval_steps_per_second": 21.421, "step": 21840 }, { "epoch": 1.0027128189279215, "grad_norm": 0.4327790439128876, "learning_rate": 9.999997700931376e-05, "loss": 0.0692, "step": 21900 }, { "epoch": 1.0072914162746243, "grad_norm": 0.8181611895561218, "learning_rate": 9.999983651075218e-05, "loss": 0.0542, "step": 22000 }, { "epoch": 1.011870013621327, "grad_norm": 0.6566409468650818, "learning_rate": 9.999956828659095e-05, "loss": 0.0571, "step": 22100 }, { "epoch": 1.01644861096803, "grad_norm": 0.7238597273826599, "learning_rate": 9.999917233751526e-05, "loss": 0.0611, "step": 22200 }, { "epoch": 1.0210272083147327, "grad_norm": 0.3051077127456665, "learning_rate": 9.999864866453658e-05, "loss": 0.059, "step": 22300 }, { "epoch": 1.0256058056614357, "grad_norm": 1.5912861824035645, "learning_rate": 9.999799726899262e-05, "loss": 0.0615, "step": 22400 }, { "epoch": 1.0301844030081384, "grad_norm": 0.6656569242477417, "learning_rate": 9.999721815254742e-05, "loss": 0.0618, "step": 22500 }, { "epoch": 1.0347630003548414, "grad_norm": 1.1994621753692627, "learning_rate": 9.999631131719119e-05, "loss": 0.0614, "step": 22600 }, { "epoch": 1.039341597701544, "grad_norm": 0.6420437097549438, "learning_rate": 9.999527676524052e-05, "loss": 0.0565, "step": 22700 }, { "epoch": 1.043920195048247, "grad_norm": 0.6077245473861694, "learning_rate": 9.999411449933816e-05, "loss": 0.0609, "step": 22800 }, { "epoch": 1.0484987923949498, "grad_norm": 0.6168214082717896, "learning_rate": 9.999282452245315e-05, "loss": 0.0579, "step": 22900 }, { "epoch": 1.0530773897416525, "grad_norm": 0.4628690779209137, "learning_rate": 9.999140683788078e-05, "loss": 0.0576, "step": 23000 }, { "epoch": 1.0576559870883555, "grad_norm": 0.43243736028671265, "learning_rate": 9.998986144924251e-05, "loss": 0.0615, "step": 23100 }, { "epoch": 1.0622345844350582, "grad_norm": 0.7162685394287109, "learning_rate": 9.998818836048611e-05, "loss": 0.0598, "step": 23200 }, { "epoch": 1.0668131817817612, "grad_norm": 0.7162106037139893, "learning_rate": 9.99863875758855e-05, "loss": 0.0574, "step": 23300 }, { "epoch": 1.071391779128464, "grad_norm": 0.4392016530036926, "learning_rate": 9.998445910004082e-05, "loss": 0.0576, "step": 23400 }, { "epoch": 1.075970376475167, "grad_norm": 0.8344998955726624, "learning_rate": 9.998240293787841e-05, "loss": 0.0639, "step": 23500 }, { "epoch": 1.0805489738218697, "grad_norm": 0.9016310572624207, "learning_rate": 9.998021909465076e-05, "loss": 0.058, "step": 23600 }, { "epoch": 1.0851275711685724, "grad_norm": 0.1677553951740265, "learning_rate": 9.997790757593657e-05, "loss": 0.0648, "step": 23700 }, { "epoch": 1.0897061685152754, "grad_norm": 0.6796389222145081, "learning_rate": 9.997546838764065e-05, "loss": 0.0589, "step": 23800 }, { "epoch": 1.094284765861978, "grad_norm": 0.667464554309845, "learning_rate": 9.997290153599394e-05, "loss": 0.0557, "step": 23900 }, { "epoch": 1.098863363208681, "grad_norm": 0.9013321995735168, "learning_rate": 9.997020702755353e-05, "loss": 0.0555, "step": 24000 }, { "epoch": 1.1034419605553838, "grad_norm": 0.3552779257297516, "learning_rate": 9.996738486920259e-05, "loss": 0.0568, "step": 24100 }, { "epoch": 1.1080205579020868, "grad_norm": 0.6730219721794128, "learning_rate": 9.996443506815039e-05, "loss": 0.0556, "step": 24200 }, { "epoch": 1.1125991552487895, "grad_norm": 0.29462745785713196, "learning_rate": 9.996135763193225e-05, "loss": 0.055, "step": 24300 }, { "epoch": 1.1171777525954925, "grad_norm": 0.3105739653110504, "learning_rate": 9.995815256840955e-05, "loss": 0.0592, "step": 24400 }, { "epoch": 1.1217563499421952, "grad_norm": 0.5383213758468628, "learning_rate": 9.995481988576968e-05, "loss": 0.0525, "step": 24500 }, { "epoch": 1.126334947288898, "grad_norm": 0.6290645003318787, "learning_rate": 9.995135959252605e-05, "loss": 0.058, "step": 24600 }, { "epoch": 1.130913544635601, "grad_norm": 0.4531712532043457, "learning_rate": 9.994777169751806e-05, "loss": 0.0515, "step": 24700 }, { "epoch": 1.1354921419823036, "grad_norm": 0.5031425952911377, "learning_rate": 9.994405620991102e-05, "loss": 0.0591, "step": 24800 }, { "epoch": 1.1400707393290066, "grad_norm": 0.8398526310920715, "learning_rate": 9.994021313919628e-05, "loss": 0.0608, "step": 24900 }, { "epoch": 1.1446493366757093, "grad_norm": 0.3783178925514221, "learning_rate": 9.9936242495191e-05, "loss": 0.0589, "step": 25000 }, { "epoch": 1.1492279340224123, "grad_norm": 0.3554207384586334, "learning_rate": 9.99321442880383e-05, "loss": 0.0561, "step": 25100 }, { "epoch": 1.153806531369115, "grad_norm": 0.8848966956138611, "learning_rate": 9.992791852820709e-05, "loss": 0.0571, "step": 25200 }, { "epoch": 1.158385128715818, "grad_norm": 0.4907087981700897, "learning_rate": 9.99235652264922e-05, "loss": 0.0593, "step": 25300 }, { "epoch": 1.1629637260625207, "grad_norm": 0.6268092393875122, "learning_rate": 9.991908439401421e-05, "loss": 0.0526, "step": 25400 }, { "epoch": 1.1675423234092235, "grad_norm": 0.5183268785476685, "learning_rate": 9.991447604221951e-05, "loss": 0.0536, "step": 25500 }, { "epoch": 1.1721209207559264, "grad_norm": 0.4522722065448761, "learning_rate": 9.990974018288022e-05, "loss": 0.05, "step": 25600 }, { "epoch": 1.1766995181026292, "grad_norm": 0.8773862719535828, "learning_rate": 9.990487682809418e-05, "loss": 0.0539, "step": 25700 }, { "epoch": 1.1812781154493321, "grad_norm": 0.5325748920440674, "learning_rate": 9.989988599028492e-05, "loss": 0.0604, "step": 25800 }, { "epoch": 1.1858567127960349, "grad_norm": 0.5544828772544861, "learning_rate": 9.989476768220168e-05, "loss": 0.0538, "step": 25900 }, { "epoch": 1.1904353101427378, "grad_norm": 0.8816759586334229, "learning_rate": 9.988952191691925e-05, "loss": 0.0568, "step": 26000 }, { "epoch": 1.1950139074894406, "grad_norm": 0.8002095222473145, "learning_rate": 9.988414870783806e-05, "loss": 0.0573, "step": 26100 }, { "epoch": 1.1995925048361435, "grad_norm": 0.5534511208534241, "learning_rate": 9.987864806868405e-05, "loss": 0.0597, "step": 26200 }, { "epoch": 1.2041711021828463, "grad_norm": 0.4148072898387909, "learning_rate": 9.987302001350875e-05, "loss": 0.049, "step": 26300 }, { "epoch": 1.208749699529549, "grad_norm": 0.30762553215026855, "learning_rate": 9.986726455668913e-05, "loss": 0.0559, "step": 26400 }, { "epoch": 1.213328296876252, "grad_norm": 0.7850671410560608, "learning_rate": 9.986138171292762e-05, "loss": 0.0515, "step": 26500 }, { "epoch": 1.2179068942229547, "grad_norm": 0.45396122336387634, "learning_rate": 9.985537149725205e-05, "loss": 0.0529, "step": 26600 }, { "epoch": 1.2224854915696577, "grad_norm": 0.4627123177051544, "learning_rate": 9.984923392501567e-05, "loss": 0.0555, "step": 26700 }, { "epoch": 1.2270640889163604, "grad_norm": 0.8190097212791443, "learning_rate": 9.984296901189702e-05, "loss": 0.0507, "step": 26800 }, { "epoch": 1.2316426862630634, "grad_norm": 0.6249597668647766, "learning_rate": 9.983657677389992e-05, "loss": 0.0538, "step": 26900 }, { "epoch": 1.2362212836097661, "grad_norm": 0.8909338116645813, "learning_rate": 9.983005722735351e-05, "loss": 0.0458, "step": 27000 }, { "epoch": 1.240799880956469, "grad_norm": 0.4777618944644928, "learning_rate": 9.98234103889121e-05, "loss": 0.0552, "step": 27100 }, { "epoch": 1.2453784783031718, "grad_norm": 0.30679649114608765, "learning_rate": 9.981663627555515e-05, "loss": 0.0547, "step": 27200 }, { "epoch": 1.2499570756498746, "grad_norm": 0.5480089783668518, "learning_rate": 9.980973490458728e-05, "loss": 0.0584, "step": 27300 }, { "epoch": 1.2545356729965775, "grad_norm": 0.7595780491828918, "learning_rate": 9.980270629363819e-05, "loss": 0.056, "step": 27400 }, { "epoch": 1.2591142703432803, "grad_norm": 0.34684839844703674, "learning_rate": 9.979555046066261e-05, "loss": 0.0545, "step": 27500 }, { "epoch": 1.2636928676899832, "grad_norm": 0.4605325758457184, "learning_rate": 9.978826742394027e-05, "loss": 0.0588, "step": 27600 }, { "epoch": 1.268271465036686, "grad_norm": 0.8060219287872314, "learning_rate": 9.97808572020758e-05, "loss": 0.0529, "step": 27700 }, { "epoch": 1.2728500623833887, "grad_norm": 0.4551374614238739, "learning_rate": 9.97733198139988e-05, "loss": 0.0506, "step": 27800 }, { "epoch": 1.2774286597300917, "grad_norm": 0.5313341617584229, "learning_rate": 9.976565527896366e-05, "loss": 0.0524, "step": 27900 }, { "epoch": 1.2820072570767946, "grad_norm": 0.511184811592102, "learning_rate": 9.97578636165496e-05, "loss": 0.0522, "step": 28000 }, { "epoch": 1.2865858544234974, "grad_norm": 0.8772425055503845, "learning_rate": 9.974994484666058e-05, "loss": 0.0546, "step": 28100 }, { "epoch": 1.2911644517702001, "grad_norm": 0.4593620002269745, "learning_rate": 9.974189898952524e-05, "loss": 0.0527, "step": 28200 }, { "epoch": 1.295743049116903, "grad_norm": 0.49878451228141785, "learning_rate": 9.973372606569692e-05, "loss": 0.0536, "step": 28300 }, { "epoch": 1.3003216464636058, "grad_norm": 0.8320513367652893, "learning_rate": 9.97254260960535e-05, "loss": 0.0522, "step": 28400 }, { "epoch": 1.3049002438103088, "grad_norm": 0.4917149245738983, "learning_rate": 9.971699910179742e-05, "loss": 0.0574, "step": 28500 }, { "epoch": 1.3094788411570115, "grad_norm": 0.561815083026886, "learning_rate": 9.97084451044556e-05, "loss": 0.0557, "step": 28600 }, { "epoch": 1.3140574385037143, "grad_norm": 0.43367573618888855, "learning_rate": 9.969976412587944e-05, "loss": 0.0522, "step": 28700 }, { "epoch": 1.3186360358504172, "grad_norm": 1.0113517045974731, "learning_rate": 9.969095618824462e-05, "loss": 0.0491, "step": 28800 }, { "epoch": 1.3232146331971202, "grad_norm": 0.548916220664978, "learning_rate": 9.968202131405124e-05, "loss": 0.0499, "step": 28900 }, { "epoch": 1.327793230543823, "grad_norm": 0.5541431903839111, "learning_rate": 9.967295952612361e-05, "loss": 0.0464, "step": 29000 }, { "epoch": 1.3323718278905257, "grad_norm": 0.47956761717796326, "learning_rate": 9.966377084761023e-05, "loss": 0.0548, "step": 29100 }, { "epoch": 1.3369504252372286, "grad_norm": 0.9489524960517883, "learning_rate": 9.965445530198378e-05, "loss": 0.0576, "step": 29200 }, { "epoch": 1.3415290225839314, "grad_norm": 0.7664705514907837, "learning_rate": 9.964501291304101e-05, "loss": 0.055, "step": 29300 }, { "epoch": 1.3461076199306343, "grad_norm": 0.5601370930671692, "learning_rate": 9.96354437049027e-05, "loss": 0.0525, "step": 29400 }, { "epoch": 1.350686217277337, "grad_norm": 0.3737477958202362, "learning_rate": 9.962574770201358e-05, "loss": 0.049, "step": 29500 }, { "epoch": 1.3552648146240398, "grad_norm": 0.8171801567077637, "learning_rate": 9.96159249291423e-05, "loss": 0.0501, "step": 29600 }, { "epoch": 1.3598434119707428, "grad_norm": 0.8035039305686951, "learning_rate": 9.960597541138131e-05, "loss": 0.0493, "step": 29700 }, { "epoch": 1.3644220093174457, "grad_norm": 0.2262045294046402, "learning_rate": 9.959589917414687e-05, "loss": 0.0503, "step": 29800 }, { "epoch": 1.3690006066641485, "grad_norm": 0.5973814725875854, "learning_rate": 9.958569624317893e-05, "loss": 0.0528, "step": 29900 }, { "epoch": 1.3735792040108512, "grad_norm": 0.66443932056427, "learning_rate": 9.957536664454108e-05, "loss": 0.0509, "step": 30000 }, { "epoch": 1.3781578013575542, "grad_norm": 1.04296875, "learning_rate": 9.956491040462052e-05, "loss": 0.0515, "step": 30100 }, { "epoch": 1.382736398704257, "grad_norm": 1.5576283931732178, "learning_rate": 9.955432755012788e-05, "loss": 0.0533, "step": 30200 }, { "epoch": 1.3873149960509599, "grad_norm": 0.3329857885837555, "learning_rate": 9.954361810809732e-05, "loss": 0.0523, "step": 30300 }, { "epoch": 1.3918935933976626, "grad_norm": 1.7028000354766846, "learning_rate": 9.953278210588628e-05, "loss": 0.0516, "step": 30400 }, { "epoch": 1.3964721907443653, "grad_norm": 0.49420544505119324, "learning_rate": 9.952181957117559e-05, "loss": 0.0505, "step": 30500 }, { "epoch": 1.4010507880910683, "grad_norm": 0.15591812133789062, "learning_rate": 9.951073053196926e-05, "loss": 0.0512, "step": 30600 }, { "epoch": 1.4056293854377713, "grad_norm": 0.4006904661655426, "learning_rate": 9.949951501659445e-05, "loss": 0.0522, "step": 30700 }, { "epoch": 1.410207982784474, "grad_norm": 0.46110183000564575, "learning_rate": 9.948817305370143e-05, "loss": 0.049, "step": 30800 }, { "epoch": 1.4147865801311768, "grad_norm": 0.24079594016075134, "learning_rate": 9.947670467226349e-05, "loss": 0.0521, "step": 30900 }, { "epoch": 1.4193651774778797, "grad_norm": 0.6515139937400818, "learning_rate": 9.946510990157682e-05, "loss": 0.0495, "step": 31000 }, { "epoch": 1.4239437748245825, "grad_norm": 0.5415006279945374, "learning_rate": 9.945338877126052e-05, "loss": 0.0526, "step": 31100 }, { "epoch": 1.4285223721712854, "grad_norm": 0.8711938261985779, "learning_rate": 9.944154131125642e-05, "loss": 0.0484, "step": 31200 }, { "epoch": 1.4331009695179882, "grad_norm": 0.5021001696586609, "learning_rate": 9.942956755182916e-05, "loss": 0.0567, "step": 31300 }, { "epoch": 1.437679566864691, "grad_norm": 1.9676926136016846, "learning_rate": 9.941746752356588e-05, "loss": 0.0496, "step": 31400 }, { "epoch": 1.4422581642113939, "grad_norm": 0.5120891332626343, "learning_rate": 9.94052412573764e-05, "loss": 0.0492, "step": 31500 }, { "epoch": 1.4468367615580966, "grad_norm": 0.9182060956954956, "learning_rate": 9.939288878449294e-05, "loss": 0.0525, "step": 31600 }, { "epoch": 1.4514153589047996, "grad_norm": 0.6737085580825806, "learning_rate": 9.938041013647016e-05, "loss": 0.0462, "step": 31700 }, { "epoch": 1.4559939562515023, "grad_norm": 0.7034218311309814, "learning_rate": 9.936780534518502e-05, "loss": 0.0497, "step": 31800 }, { "epoch": 1.4605725535982053, "grad_norm": 0.9228888750076294, "learning_rate": 9.935507444283669e-05, "loss": 0.0482, "step": 31900 }, { "epoch": 1.465151150944908, "grad_norm": 0.3609278202056885, "learning_rate": 9.934221746194655e-05, "loss": 0.0594, "step": 32000 }, { "epoch": 1.469729748291611, "grad_norm": 0.12724661827087402, "learning_rate": 9.932923443535798e-05, "loss": 0.0476, "step": 32100 }, { "epoch": 1.4743083456383137, "grad_norm": 0.5686663389205933, "learning_rate": 9.931612539623643e-05, "loss": 0.0538, "step": 32200 }, { "epoch": 1.4788869429850164, "grad_norm": 0.6813719868659973, "learning_rate": 9.930289037806919e-05, "loss": 0.0511, "step": 32300 }, { "epoch": 1.4834655403317194, "grad_norm": 0.678242027759552, "learning_rate": 9.928952941466538e-05, "loss": 0.0492, "step": 32400 }, { "epoch": 1.4880441376784221, "grad_norm": 0.7721807360649109, "learning_rate": 9.927604254015585e-05, "loss": 0.0529, "step": 32500 }, { "epoch": 1.492622735025125, "grad_norm": 0.6314060688018799, "learning_rate": 9.926242978899312e-05, "loss": 0.0462, "step": 32600 }, { "epoch": 1.4972013323718278, "grad_norm": 0.5451350212097168, "learning_rate": 9.924869119595119e-05, "loss": 0.0476, "step": 32700 }, { "epoch": 1.5017799297185306, "grad_norm": 0.5342521071434021, "learning_rate": 9.923482679612563e-05, "loss": 0.0505, "step": 32800 }, { "epoch": 1.5063585270652335, "grad_norm": 0.7561967968940735, "learning_rate": 9.922083662493329e-05, "loss": 0.0491, "step": 32900 }, { "epoch": 1.5109371244119365, "grad_norm": 0.2349376529455185, "learning_rate": 9.920672071811237e-05, "loss": 0.0463, "step": 33000 }, { "epoch": 1.5155157217586392, "grad_norm": 0.3987545073032379, "learning_rate": 9.919247911172224e-05, "loss": 0.0528, "step": 33100 }, { "epoch": 1.520094319105342, "grad_norm": 0.4922156035900116, "learning_rate": 9.917811184214337e-05, "loss": 0.0479, "step": 33200 }, { "epoch": 1.524672916452045, "grad_norm": 0.9758409261703491, "learning_rate": 9.916361894607722e-05, "loss": 0.0537, "step": 33300 }, { "epoch": 1.529251513798748, "grad_norm": 0.5304883718490601, "learning_rate": 9.914900046054623e-05, "loss": 0.0504, "step": 33400 }, { "epoch": 1.5338301111454506, "grad_norm": 0.4293117821216583, "learning_rate": 9.913425642289358e-05, "loss": 0.0481, "step": 33500 }, { "epoch": 1.5384087084921534, "grad_norm": 0.354592889547348, "learning_rate": 9.911938687078324e-05, "loss": 0.0496, "step": 33600 }, { "epoch": 1.5429873058388561, "grad_norm": 0.36046740412712097, "learning_rate": 9.910439184219978e-05, "loss": 0.0451, "step": 33700 }, { "epoch": 1.547565903185559, "grad_norm": 0.4680946171283722, "learning_rate": 9.90892713754483e-05, "loss": 0.048, "step": 33800 }, { "epoch": 1.552144500532262, "grad_norm": 0.4586212635040283, "learning_rate": 9.907402550915433e-05, "loss": 0.0462, "step": 33900 }, { "epoch": 1.5567230978789648, "grad_norm": 0.2608386278152466, "learning_rate": 9.905865428226376e-05, "loss": 0.0472, "step": 34000 }, { "epoch": 1.5613016952256675, "grad_norm": 0.5291585922241211, "learning_rate": 9.90431577340427e-05, "loss": 0.044, "step": 34100 }, { "epoch": 1.5658802925723705, "grad_norm": 0.9200330376625061, "learning_rate": 9.90275359040774e-05, "loss": 0.0487, "step": 34200 }, { "epoch": 1.5704588899190735, "grad_norm": 0.550689160823822, "learning_rate": 9.901178883227414e-05, "loss": 0.0515, "step": 34300 }, { "epoch": 1.5750374872657762, "grad_norm": 0.7476568818092346, "learning_rate": 9.899591655885912e-05, "loss": 0.0457, "step": 34400 }, { "epoch": 1.579616084612479, "grad_norm": 0.8736041188240051, "learning_rate": 9.89799191243784e-05, "loss": 0.0473, "step": 34500 }, { "epoch": 1.5841946819591817, "grad_norm": 0.17842432856559753, "learning_rate": 9.896379656969776e-05, "loss": 0.0456, "step": 34600 }, { "epoch": 1.5887732793058846, "grad_norm": 0.5870159864425659, "learning_rate": 9.894754893600258e-05, "loss": 0.052, "step": 34700 }, { "epoch": 1.5933518766525876, "grad_norm": 0.33038216829299927, "learning_rate": 9.893117626479777e-05, "loss": 0.0498, "step": 34800 }, { "epoch": 1.5979304739992903, "grad_norm": 0.7480065226554871, "learning_rate": 9.891467859790767e-05, "loss": 0.0484, "step": 34900 }, { "epoch": 1.602509071345993, "grad_norm": 0.46852391958236694, "learning_rate": 9.889805597747588e-05, "loss": 0.0471, "step": 35000 }, { "epoch": 1.607087668692696, "grad_norm": 0.33162882924079895, "learning_rate": 9.888130844596524e-05, "loss": 0.0477, "step": 35100 }, { "epoch": 1.6116662660393988, "grad_norm": 1.0083402395248413, "learning_rate": 9.886443604615764e-05, "loss": 0.051, "step": 35200 }, { "epoch": 1.6162448633861017, "grad_norm": 0.6158673763275146, "learning_rate": 9.8847438821154e-05, "loss": 0.0459, "step": 35300 }, { "epoch": 1.6208234607328045, "grad_norm": 1.0110929012298584, "learning_rate": 9.883031681437405e-05, "loss": 0.0481, "step": 35400 }, { "epoch": 1.6254020580795072, "grad_norm": 0.35791000723838806, "learning_rate": 9.881307006955634e-05, "loss": 0.0466, "step": 35500 }, { "epoch": 1.6299806554262102, "grad_norm": 0.5888839364051819, "learning_rate": 9.879569863075799e-05, "loss": 0.048, "step": 35600 }, { "epoch": 1.6345592527729131, "grad_norm": 0.7552986741065979, "learning_rate": 9.877820254235471e-05, "loss": 0.0482, "step": 35700 }, { "epoch": 1.6391378501196159, "grad_norm": 0.5620241165161133, "learning_rate": 9.87605818490406e-05, "loss": 0.0461, "step": 35800 }, { "epoch": 1.6437164474663186, "grad_norm": 0.40786847472190857, "learning_rate": 9.87428365958281e-05, "loss": 0.0501, "step": 35900 }, { "epoch": 1.6482950448130216, "grad_norm": 0.3627175986766815, "learning_rate": 9.872496682804781e-05, "loss": 0.0495, "step": 36000 }, { "epoch": 1.6528736421597243, "grad_norm": 0.3631226122379303, "learning_rate": 9.870697259134844e-05, "loss": 0.0415, "step": 36100 }, { "epoch": 1.6574522395064273, "grad_norm": 0.41811424493789673, "learning_rate": 9.86888539316966e-05, "loss": 0.0444, "step": 36200 }, { "epoch": 1.66203083685313, "grad_norm": 0.8351331949234009, "learning_rate": 9.867061089537677e-05, "loss": 0.0499, "step": 36300 }, { "epoch": 1.6666094341998328, "grad_norm": 0.26144087314605713, "learning_rate": 9.865224352899119e-05, "loss": 0.0488, "step": 36400 }, { "epoch": 1.6711880315465357, "grad_norm": 1.9417400360107422, "learning_rate": 9.863375187945967e-05, "loss": 0.0456, "step": 36500 }, { "epoch": 1.6757666288932387, "grad_norm": 0.762496292591095, "learning_rate": 9.861513599401948e-05, "loss": 0.0446, "step": 36600 }, { "epoch": 1.6803452262399414, "grad_norm": 0.6936019659042358, "learning_rate": 9.859639592022528e-05, "loss": 0.046, "step": 36700 }, { "epoch": 1.6849238235866442, "grad_norm": 0.3661505877971649, "learning_rate": 9.857753170594897e-05, "loss": 0.0445, "step": 36800 }, { "epoch": 1.689502420933347, "grad_norm": 0.6424843668937683, "learning_rate": 9.85585433993796e-05, "loss": 0.0473, "step": 36900 }, { "epoch": 1.6940810182800499, "grad_norm": 0.520645022392273, "learning_rate": 9.853943104902315e-05, "loss": 0.0474, "step": 37000 }, { "epoch": 1.6986596156267528, "grad_norm": 0.3194561302661896, "learning_rate": 9.852019470370253e-05, "loss": 0.0482, "step": 37100 }, { "epoch": 1.7032382129734556, "grad_norm": 0.6570625305175781, "learning_rate": 9.850083441255735e-05, "loss": 0.0457, "step": 37200 }, { "epoch": 1.7078168103201583, "grad_norm": 0.4810948371887207, "learning_rate": 9.84813502250439e-05, "loss": 0.0474, "step": 37300 }, { "epoch": 1.7123954076668613, "grad_norm": 0.5983640551567078, "learning_rate": 9.846174219093491e-05, "loss": 0.0451, "step": 37400 }, { "epoch": 1.7169740050135642, "grad_norm": 0.4565774202346802, "learning_rate": 9.844201036031951e-05, "loss": 0.0436, "step": 37500 }, { "epoch": 1.721552602360267, "grad_norm": 0.4429413974285126, "learning_rate": 9.842215478360306e-05, "loss": 0.0415, "step": 37600 }, { "epoch": 1.7261311997069697, "grad_norm": 0.461791068315506, "learning_rate": 9.840217551150706e-05, "loss": 0.0436, "step": 37700 }, { "epoch": 1.7307097970536724, "grad_norm": 0.7613334059715271, "learning_rate": 9.838207259506891e-05, "loss": 0.0433, "step": 37800 }, { "epoch": 1.7352883944003754, "grad_norm": 0.1547241359949112, "learning_rate": 9.836184608564198e-05, "loss": 0.044, "step": 37900 }, { "epoch": 1.7398669917470784, "grad_norm": 0.45752155780792236, "learning_rate": 9.834149603489526e-05, "loss": 0.0436, "step": 38000 }, { "epoch": 1.7444455890937811, "grad_norm": 1.1743345260620117, "learning_rate": 9.832102249481338e-05, "loss": 0.0443, "step": 38100 }, { "epoch": 1.7490241864404839, "grad_norm": 0.9375355243682861, "learning_rate": 9.830042551769641e-05, "loss": 0.0437, "step": 38200 }, { "epoch": 1.7536027837871868, "grad_norm": 0.3472870886325836, "learning_rate": 9.827970515615977e-05, "loss": 0.0445, "step": 38300 }, { "epoch": 1.7581813811338898, "grad_norm": 1.0196037292480469, "learning_rate": 9.825886146313402e-05, "loss": 0.0452, "step": 38400 }, { "epoch": 1.7627599784805925, "grad_norm": 0.774361789226532, "learning_rate": 9.82378944918648e-05, "loss": 0.0465, "step": 38500 }, { "epoch": 1.7673385758272953, "grad_norm": 1.1351568698883057, "learning_rate": 9.821680429591269e-05, "loss": 0.0438, "step": 38600 }, { "epoch": 1.771917173173998, "grad_norm": 0.3935364782810211, "learning_rate": 9.819559092915299e-05, "loss": 0.0477, "step": 38700 }, { "epoch": 1.776495770520701, "grad_norm": 0.48644939064979553, "learning_rate": 9.81742544457757e-05, "loss": 0.0482, "step": 38800 }, { "epoch": 1.781074367867404, "grad_norm": 0.7816250324249268, "learning_rate": 9.815279490028529e-05, "loss": 0.0418, "step": 38900 }, { "epoch": 1.7856529652141067, "grad_norm": 0.9440283179283142, "learning_rate": 9.81312123475006e-05, "loss": 0.0445, "step": 39000 }, { "epoch": 1.7902315625608094, "grad_norm": 0.5908809304237366, "learning_rate": 9.810950684255473e-05, "loss": 0.0467, "step": 39100 }, { "epoch": 1.7948101599075124, "grad_norm": 0.8200555443763733, "learning_rate": 9.80876784408948e-05, "loss": 0.044, "step": 39200 }, { "epoch": 1.7993887572542153, "grad_norm": 0.513478696346283, "learning_rate": 9.806572719828193e-05, "loss": 0.0437, "step": 39300 }, { "epoch": 1.803967354600918, "grad_norm": 0.34043118357658386, "learning_rate": 9.8043653170791e-05, "loss": 0.0412, "step": 39400 }, { "epoch": 1.8085459519476208, "grad_norm": 0.7160608172416687, "learning_rate": 9.802145641481056e-05, "loss": 0.0475, "step": 39500 }, { "epoch": 1.8131245492943235, "grad_norm": 0.7825314998626709, "learning_rate": 9.799913698704269e-05, "loss": 0.0475, "step": 39600 }, { "epoch": 1.8177031466410265, "grad_norm": 0.38378453254699707, "learning_rate": 9.797669494450281e-05, "loss": 0.0416, "step": 39700 }, { "epoch": 1.8222817439877295, "grad_norm": 0.3362584710121155, "learning_rate": 9.795413034451959e-05, "loss": 0.0439, "step": 39800 }, { "epoch": 1.8268603413344322, "grad_norm": 0.42681771516799927, "learning_rate": 9.793144324473473e-05, "loss": 0.0458, "step": 39900 }, { "epoch": 1.831438938681135, "grad_norm": 0.7747517824172974, "learning_rate": 9.790863370310293e-05, "loss": 0.0442, "step": 40000 }, { "epoch": 1.836017536027838, "grad_norm": 0.481751024723053, "learning_rate": 9.788570177789158e-05, "loss": 0.0479, "step": 40100 }, { "epoch": 1.8405961333745409, "grad_norm": 0.4065397381782532, "learning_rate": 9.78626475276808e-05, "loss": 0.0433, "step": 40200 }, { "epoch": 1.8451747307212436, "grad_norm": 0.47623851895332336, "learning_rate": 9.78394710113631e-05, "loss": 0.043, "step": 40300 }, { "epoch": 1.8497533280679463, "grad_norm": 0.5850650668144226, "learning_rate": 9.781617228814339e-05, "loss": 0.0413, "step": 40400 }, { "epoch": 1.854331925414649, "grad_norm": 0.5443374514579773, "learning_rate": 9.77927514175387e-05, "loss": 0.044, "step": 40500 }, { "epoch": 1.858910522761352, "grad_norm": 0.5081647634506226, "learning_rate": 9.776920845937816e-05, "loss": 0.0417, "step": 40600 }, { "epoch": 1.863489120108055, "grad_norm": 0.9533047080039978, "learning_rate": 9.774554347380271e-05, "loss": 0.0438, "step": 40700 }, { "epoch": 1.8680677174547577, "grad_norm": 0.7583338618278503, "learning_rate": 9.772175652126503e-05, "loss": 0.0437, "step": 40800 }, { "epoch": 1.8726463148014605, "grad_norm": 0.6980351209640503, "learning_rate": 9.769784766252941e-05, "loss": 0.0453, "step": 40900 }, { "epoch": 1.8772249121481634, "grad_norm": 0.523002564907074, "learning_rate": 9.767381695867149e-05, "loss": 0.0436, "step": 41000 }, { "epoch": 1.8818035094948664, "grad_norm": 0.20366418361663818, "learning_rate": 9.764966447107819e-05, "loss": 0.0436, "step": 41100 }, { "epoch": 1.8863821068415692, "grad_norm": 0.5247259140014648, "learning_rate": 9.762539026144755e-05, "loss": 0.0436, "step": 41200 }, { "epoch": 1.890960704188272, "grad_norm": 0.7223037481307983, "learning_rate": 9.760099439178852e-05, "loss": 0.0433, "step": 41300 }, { "epoch": 1.8955393015349746, "grad_norm": 0.8085638880729675, "learning_rate": 9.757647692442083e-05, "loss": 0.0438, "step": 41400 }, { "epoch": 1.9001178988816776, "grad_norm": 0.360061913728714, "learning_rate": 9.755183792197486e-05, "loss": 0.0437, "step": 41500 }, { "epoch": 1.9046964962283806, "grad_norm": 0.33270975947380066, "learning_rate": 9.752707744739145e-05, "loss": 0.0417, "step": 41600 }, { "epoch": 1.9092750935750833, "grad_norm": 0.40628722310066223, "learning_rate": 9.750219556392175e-05, "loss": 0.0442, "step": 41700 }, { "epoch": 1.913853690921786, "grad_norm": 0.41940930485725403, "learning_rate": 9.7477192335127e-05, "loss": 0.0401, "step": 41800 }, { "epoch": 1.918432288268489, "grad_norm": 0.41912841796875, "learning_rate": 9.74520678248785e-05, "loss": 0.0444, "step": 41900 }, { "epoch": 1.9230108856151917, "grad_norm": 0.5386761426925659, "learning_rate": 9.742682209735727e-05, "loss": 0.0419, "step": 42000 }, { "epoch": 1.9275894829618947, "grad_norm": 0.4845391809940338, "learning_rate": 9.74014552170541e-05, "loss": 0.0423, "step": 42100 }, { "epoch": 1.9321680803085974, "grad_norm": 0.7683694958686829, "learning_rate": 9.737596724876914e-05, "loss": 0.0423, "step": 42200 }, { "epoch": 1.9367466776553002, "grad_norm": 0.3862452805042267, "learning_rate": 9.735035825761197e-05, "loss": 0.0391, "step": 42300 }, { "epoch": 1.9413252750020031, "grad_norm": 0.34973305463790894, "learning_rate": 9.732462830900124e-05, "loss": 0.0421, "step": 42400 }, { "epoch": 1.945903872348706, "grad_norm": 0.6201797127723694, "learning_rate": 9.729877746866465e-05, "loss": 0.044, "step": 42500 }, { "epoch": 1.9504824696954088, "grad_norm": 1.0637577772140503, "learning_rate": 9.72728058026387e-05, "loss": 0.0445, "step": 42600 }, { "epoch": 1.9550610670421116, "grad_norm": 0.4952399432659149, "learning_rate": 9.724671337726854e-05, "loss": 0.0428, "step": 42700 }, { "epoch": 1.9596396643888145, "grad_norm": 0.8040750026702881, "learning_rate": 9.722050025920778e-05, "loss": 0.0422, "step": 42800 }, { "epoch": 1.9642182617355173, "grad_norm": 0.5028950572013855, "learning_rate": 9.719416651541839e-05, "loss": 0.0435, "step": 42900 }, { "epoch": 1.9687968590822202, "grad_norm": 0.3617078959941864, "learning_rate": 9.716771221317042e-05, "loss": 0.0414, "step": 43000 }, { "epoch": 1.973375456428923, "grad_norm": 0.6627247929573059, "learning_rate": 9.714113742004198e-05, "loss": 0.0442, "step": 43100 }, { "epoch": 1.9779540537756257, "grad_norm": 0.5225775241851807, "learning_rate": 9.711444220391886e-05, "loss": 0.041, "step": 43200 }, { "epoch": 1.9825326511223287, "grad_norm": 0.44373607635498047, "learning_rate": 9.708762663299456e-05, "loss": 0.0498, "step": 43300 }, { "epoch": 1.9871112484690316, "grad_norm": 0.6220275163650513, "learning_rate": 9.706069077577001e-05, "loss": 0.0431, "step": 43400 }, { "epoch": 1.9916898458157344, "grad_norm": 0.14181743562221527, "learning_rate": 9.703363470105338e-05, "loss": 0.0405, "step": 43500 }, { "epoch": 1.9962684431624371, "grad_norm": 0.5560967922210693, "learning_rate": 9.700645847796e-05, "loss": 0.0393, "step": 43600 }, { "epoch": 1.9999771070132666, "eval_loss": 0.16101938486099243, "eval_runtime": 260.5165, "eval_samples_per_second": 21.112, "eval_steps_per_second": 21.112, "step": 43681 }, { "epoch": 2.00084704050914, "grad_norm": 0.33940353989601135, "learning_rate": 9.697916217591206e-05, "loss": 0.0412, "step": 43700 }, { "epoch": 2.005425637855843, "grad_norm": 0.5346599817276001, "learning_rate": 9.695174586463848e-05, "loss": 0.0324, "step": 43800 }, { "epoch": 2.010004235202546, "grad_norm": 0.33371779322624207, "learning_rate": 9.692420961417488e-05, "loss": 0.0289, "step": 43900 }, { "epoch": 2.0145828325492485, "grad_norm": 0.7601104974746704, "learning_rate": 9.689655349486309e-05, "loss": 0.0301, "step": 44000 }, { "epoch": 2.0191614298959513, "grad_norm": 1.0337655544281006, "learning_rate": 9.686877757735127e-05, "loss": 0.0308, "step": 44100 }, { "epoch": 2.023740027242654, "grad_norm": 0.39981570839881897, "learning_rate": 9.684088193259355e-05, "loss": 0.03, "step": 44200 }, { "epoch": 2.028318624589357, "grad_norm": 0.6537315845489502, "learning_rate": 9.681286663184994e-05, "loss": 0.0319, "step": 44300 }, { "epoch": 2.03289722193606, "grad_norm": 0.4220235347747803, "learning_rate": 9.678473174668606e-05, "loss": 0.03, "step": 44400 }, { "epoch": 2.0374758192827627, "grad_norm": 0.20741093158721924, "learning_rate": 9.675647734897309e-05, "loss": 0.0315, "step": 44500 }, { "epoch": 2.0420544166294654, "grad_norm": 0.46414992213249207, "learning_rate": 9.672810351088743e-05, "loss": 0.0304, "step": 44600 }, { "epoch": 2.0466330139761686, "grad_norm": 0.4579373002052307, "learning_rate": 9.669961030491064e-05, "loss": 0.0312, "step": 44700 }, { "epoch": 2.0512116113228713, "grad_norm": 0.7499117851257324, "learning_rate": 9.66709978038292e-05, "loss": 0.0306, "step": 44800 }, { "epoch": 2.055790208669574, "grad_norm": 0.2862129807472229, "learning_rate": 9.664226608073431e-05, "loss": 0.031, "step": 44900 }, { "epoch": 2.060368806016277, "grad_norm": 0.1198749765753746, "learning_rate": 9.661341520902176e-05, "loss": 0.0335, "step": 45000 }, { "epoch": 2.0649474033629795, "grad_norm": 0.32276931405067444, "learning_rate": 9.658444526239168e-05, "loss": 0.0308, "step": 45100 }, { "epoch": 2.0695260007096827, "grad_norm": 0.30404484272003174, "learning_rate": 9.655535631484838e-05, "loss": 0.0293, "step": 45200 }, { "epoch": 2.0741045980563855, "grad_norm": 0.4213615357875824, "learning_rate": 9.652614844070018e-05, "loss": 0.0314, "step": 45300 }, { "epoch": 2.078683195403088, "grad_norm": 0.26582449674606323, "learning_rate": 9.64968217145592e-05, "loss": 0.035, "step": 45400 }, { "epoch": 2.083261792749791, "grad_norm": 0.462671160697937, "learning_rate": 9.646737621134112e-05, "loss": 0.0313, "step": 45500 }, { "epoch": 2.087840390096494, "grad_norm": 0.7931702733039856, "learning_rate": 9.643781200626511e-05, "loss": 0.0341, "step": 45600 }, { "epoch": 2.092418987443197, "grad_norm": 0.7540990710258484, "learning_rate": 9.640812917485353e-05, "loss": 0.03, "step": 45700 }, { "epoch": 2.0969975847898996, "grad_norm": 0.23272991180419922, "learning_rate": 9.637832779293177e-05, "loss": 0.0282, "step": 45800 }, { "epoch": 2.1015761821366024, "grad_norm": 0.4618046283721924, "learning_rate": 9.634840793662807e-05, "loss": 0.0345, "step": 45900 }, { "epoch": 2.106154779483305, "grad_norm": 0.5170308947563171, "learning_rate": 9.63183696823733e-05, "loss": 0.0301, "step": 46000 }, { "epoch": 2.1107333768300083, "grad_norm": 0.4604352116584778, "learning_rate": 9.628821310690082e-05, "loss": 0.0304, "step": 46100 }, { "epoch": 2.115311974176711, "grad_norm": 0.5687543749809265, "learning_rate": 9.625793828724618e-05, "loss": 0.0326, "step": 46200 }, { "epoch": 2.1198905715234138, "grad_norm": 0.6474491357803345, "learning_rate": 9.622754530074705e-05, "loss": 0.0323, "step": 46300 }, { "epoch": 2.1244691688701165, "grad_norm": 0.3418339490890503, "learning_rate": 9.619703422504291e-05, "loss": 0.0311, "step": 46400 }, { "epoch": 2.1290477662168197, "grad_norm": 0.542149543762207, "learning_rate": 9.616640513807493e-05, "loss": 0.0302, "step": 46500 }, { "epoch": 2.1336263635635224, "grad_norm": 0.3595920205116272, "learning_rate": 9.613565811808576e-05, "loss": 0.033, "step": 46600 }, { "epoch": 2.138204960910225, "grad_norm": 0.30227652192115784, "learning_rate": 9.610479324361926e-05, "loss": 0.0333, "step": 46700 }, { "epoch": 2.142783558256928, "grad_norm": 0.3367413580417633, "learning_rate": 9.607381059352038e-05, "loss": 0.0291, "step": 46800 }, { "epoch": 2.1473621556036306, "grad_norm": 0.33046597242355347, "learning_rate": 9.604271024693495e-05, "loss": 0.0324, "step": 46900 }, { "epoch": 2.151940752950334, "grad_norm": 0.3185320794582367, "learning_rate": 9.601149228330944e-05, "loss": 0.03, "step": 47000 }, { "epoch": 2.1565193502970366, "grad_norm": 0.4530138075351715, "learning_rate": 9.598015678239074e-05, "loss": 0.0301, "step": 47100 }, { "epoch": 2.1610979476437393, "grad_norm": 1.8580175638198853, "learning_rate": 9.594870382422604e-05, "loss": 0.0279, "step": 47200 }, { "epoch": 2.165676544990442, "grad_norm": 0.7226896286010742, "learning_rate": 9.591713348916258e-05, "loss": 0.0376, "step": 47300 }, { "epoch": 2.170255142337145, "grad_norm": 0.5682775974273682, "learning_rate": 9.588544585784741e-05, "loss": 0.0319, "step": 47400 }, { "epoch": 2.174833739683848, "grad_norm": 0.35570502281188965, "learning_rate": 9.585364101122723e-05, "loss": 0.0323, "step": 47500 }, { "epoch": 2.1794123370305507, "grad_norm": 0.34631285071372986, "learning_rate": 9.582171903054816e-05, "loss": 0.0349, "step": 47600 }, { "epoch": 2.1839909343772534, "grad_norm": 0.6178816556930542, "learning_rate": 9.578967999735556e-05, "loss": 0.0309, "step": 47700 }, { "epoch": 2.188569531723956, "grad_norm": 0.37293490767478943, "learning_rate": 9.575752399349378e-05, "loss": 0.0288, "step": 47800 }, { "epoch": 2.1931481290706594, "grad_norm": 0.6338769197463989, "learning_rate": 9.572525110110601e-05, "loss": 0.0321, "step": 47900 }, { "epoch": 2.197726726417362, "grad_norm": 0.19380028545856476, "learning_rate": 9.569286140263399e-05, "loss": 0.0306, "step": 48000 }, { "epoch": 2.202305323764065, "grad_norm": 0.3065268099308014, "learning_rate": 9.566035498081784e-05, "loss": 0.03, "step": 48100 }, { "epoch": 2.2068839211107676, "grad_norm": 0.29010531306266785, "learning_rate": 9.562773191869594e-05, "loss": 0.0327, "step": 48200 }, { "epoch": 2.2114625184574708, "grad_norm": 0.40536558628082275, "learning_rate": 9.559499229960451e-05, "loss": 0.028, "step": 48300 }, { "epoch": 2.2160411158041735, "grad_norm": 0.30061614513397217, "learning_rate": 9.55621362071776e-05, "loss": 0.0306, "step": 48400 }, { "epoch": 2.2206197131508763, "grad_norm": 0.5350512266159058, "learning_rate": 9.552916372534674e-05, "loss": 0.0301, "step": 48500 }, { "epoch": 2.225198310497579, "grad_norm": 0.4163435101509094, "learning_rate": 9.549607493834085e-05, "loss": 0.0333, "step": 48600 }, { "epoch": 2.2297769078442817, "grad_norm": 0.6648384928703308, "learning_rate": 9.546286993068588e-05, "loss": 0.0323, "step": 48700 }, { "epoch": 2.234355505190985, "grad_norm": 0.3643403947353363, "learning_rate": 9.54295487872047e-05, "loss": 0.0287, "step": 48800 }, { "epoch": 2.2389341025376877, "grad_norm": 0.8857894539833069, "learning_rate": 9.539611159301684e-05, "loss": 0.0299, "step": 48900 }, { "epoch": 2.2435126998843904, "grad_norm": 0.4569896459579468, "learning_rate": 9.536255843353832e-05, "loss": 0.0317, "step": 49000 }, { "epoch": 2.248091297231093, "grad_norm": 0.46430703997612, "learning_rate": 9.532888939448134e-05, "loss": 0.0342, "step": 49100 }, { "epoch": 2.252669894577796, "grad_norm": 0.4034232795238495, "learning_rate": 9.529510456185417e-05, "loss": 0.0316, "step": 49200 }, { "epoch": 2.257248491924499, "grad_norm": 0.5079818964004517, "learning_rate": 9.526120402196083e-05, "loss": 0.0302, "step": 49300 }, { "epoch": 2.261827089271202, "grad_norm": 0.4281846880912781, "learning_rate": 9.522718786140097e-05, "loss": 0.0328, "step": 49400 }, { "epoch": 2.2664056866179045, "grad_norm": 1.395179033279419, "learning_rate": 9.519305616706953e-05, "loss": 0.0321, "step": 49500 }, { "epoch": 2.2709842839646073, "grad_norm": 0.22532618045806885, "learning_rate": 9.515880902615661e-05, "loss": 0.0323, "step": 49600 }, { "epoch": 2.2755628813113105, "grad_norm": 0.2474541962146759, "learning_rate": 9.512444652614728e-05, "loss": 0.0318, "step": 49700 }, { "epoch": 2.280141478658013, "grad_norm": 0.2567112445831299, "learning_rate": 9.50899687548212e-05, "loss": 0.0334, "step": 49800 }, { "epoch": 2.284720076004716, "grad_norm": 0.446916401386261, "learning_rate": 9.505537580025256e-05, "loss": 0.0314, "step": 49900 }, { "epoch": 2.2892986733514187, "grad_norm": 0.4602959454059601, "learning_rate": 9.502066775080976e-05, "loss": 0.0287, "step": 50000 }, { "epoch": 2.293877270698122, "grad_norm": 0.16146403551101685, "learning_rate": 9.49858446951552e-05, "loss": 0.0304, "step": 50100 }, { "epoch": 2.2984558680448246, "grad_norm": 0.5386151075363159, "learning_rate": 9.495090672224511e-05, "loss": 0.0312, "step": 50200 }, { "epoch": 2.3030344653915273, "grad_norm": 0.4908753037452698, "learning_rate": 9.491585392132924e-05, "loss": 0.0266, "step": 50300 }, { "epoch": 2.30761306273823, "grad_norm": 0.2822779715061188, "learning_rate": 9.48806863819507e-05, "loss": 0.031, "step": 50400 }, { "epoch": 2.312191660084933, "grad_norm": 0.4930579364299774, "learning_rate": 9.484540419394568e-05, "loss": 0.0264, "step": 50500 }, { "epoch": 2.316770257431636, "grad_norm": 0.3992035686969757, "learning_rate": 9.481000744744321e-05, "loss": 0.0331, "step": 50600 }, { "epoch": 2.3213488547783387, "grad_norm": 0.8977420926094055, "learning_rate": 9.477449623286505e-05, "loss": 0.0294, "step": 50700 }, { "epoch": 2.3259274521250415, "grad_norm": 0.26158419251441956, "learning_rate": 9.473887064092531e-05, "loss": 0.0332, "step": 50800 }, { "epoch": 2.3305060494717442, "grad_norm": 0.4764149487018585, "learning_rate": 9.470313076263025e-05, "loss": 0.0335, "step": 50900 }, { "epoch": 2.335084646818447, "grad_norm": 0.21730680763721466, "learning_rate": 9.466727668927816e-05, "loss": 0.0285, "step": 51000 }, { "epoch": 2.33966324416515, "grad_norm": 0.7260767817497253, "learning_rate": 9.463130851245898e-05, "loss": 0.0336, "step": 51100 }, { "epoch": 2.344241841511853, "grad_norm": 0.688016951084137, "learning_rate": 9.459522632405415e-05, "loss": 0.0291, "step": 51200 }, { "epoch": 2.3488204388585556, "grad_norm": 0.172237828373909, "learning_rate": 9.455903021623637e-05, "loss": 0.0287, "step": 51300 }, { "epoch": 2.3533990362052584, "grad_norm": 0.42502665519714355, "learning_rate": 9.452272028146932e-05, "loss": 0.0304, "step": 51400 }, { "epoch": 2.3579776335519615, "grad_norm": 0.6109219193458557, "learning_rate": 9.448629661250745e-05, "loss": 0.0353, "step": 51500 }, { "epoch": 2.3625562308986643, "grad_norm": 0.6103388071060181, "learning_rate": 9.444975930239581e-05, "loss": 0.0293, "step": 51600 }, { "epoch": 2.367134828245367, "grad_norm": 0.6018409132957458, "learning_rate": 9.441310844446965e-05, "loss": 0.0349, "step": 51700 }, { "epoch": 2.3717134255920698, "grad_norm": 0.3021933436393738, "learning_rate": 9.437634413235436e-05, "loss": 0.0294, "step": 51800 }, { "epoch": 2.376292022938773, "grad_norm": 0.42470723390579224, "learning_rate": 9.433946645996514e-05, "loss": 0.0296, "step": 51900 }, { "epoch": 2.3808706202854757, "grad_norm": 0.3379852771759033, "learning_rate": 9.430247552150673e-05, "loss": 0.0294, "step": 52000 }, { "epoch": 2.3854492176321784, "grad_norm": 0.22888457775115967, "learning_rate": 9.426537141147322e-05, "loss": 0.0286, "step": 52100 }, { "epoch": 2.390027814978881, "grad_norm": 0.5915461778640747, "learning_rate": 9.422815422464786e-05, "loss": 0.0353, "step": 52200 }, { "epoch": 2.394606412325584, "grad_norm": 0.4212239980697632, "learning_rate": 9.419082405610267e-05, "loss": 0.0293, "step": 52300 }, { "epoch": 2.399185009672287, "grad_norm": 0.21963568031787872, "learning_rate": 9.415338100119833e-05, "loss": 0.0291, "step": 52400 }, { "epoch": 2.40376360701899, "grad_norm": 0.40482693910598755, "learning_rate": 9.41158251555839e-05, "loss": 0.0299, "step": 52500 }, { "epoch": 2.4083422043656926, "grad_norm": 1.0071722269058228, "learning_rate": 9.407815661519655e-05, "loss": 0.0272, "step": 52600 }, { "epoch": 2.4129208017123953, "grad_norm": 0.9266312718391418, "learning_rate": 9.404037547626134e-05, "loss": 0.0292, "step": 52700 }, { "epoch": 2.417499399059098, "grad_norm": 0.9991750121116638, "learning_rate": 9.400248183529093e-05, "loss": 0.0341, "step": 52800 }, { "epoch": 2.4220779964058012, "grad_norm": 0.4451786279678345, "learning_rate": 9.396447578908543e-05, "loss": 0.0308, "step": 52900 }, { "epoch": 2.426656593752504, "grad_norm": 0.7537618279457092, "learning_rate": 9.392635743473204e-05, "loss": 0.0335, "step": 53000 }, { "epoch": 2.4312351910992067, "grad_norm": 0.3898552358150482, "learning_rate": 9.388812686960486e-05, "loss": 0.0303, "step": 53100 }, { "epoch": 2.4358137884459095, "grad_norm": 0.1613057404756546, "learning_rate": 9.384978419136468e-05, "loss": 0.0319, "step": 53200 }, { "epoch": 2.440392385792612, "grad_norm": 0.34397152066230774, "learning_rate": 9.381132949795861e-05, "loss": 0.0343, "step": 53300 }, { "epoch": 2.4449709831393154, "grad_norm": 0.38366371393203735, "learning_rate": 9.377276288761997e-05, "loss": 0.0296, "step": 53400 }, { "epoch": 2.449549580486018, "grad_norm": 0.15570569038391113, "learning_rate": 9.373408445886798e-05, "loss": 0.0294, "step": 53500 }, { "epoch": 2.454128177832721, "grad_norm": 0.2775089144706726, "learning_rate": 9.369529431050743e-05, "loss": 0.0301, "step": 53600 }, { "epoch": 2.4587067751794236, "grad_norm": 0.20707450807094574, "learning_rate": 9.365639254162854e-05, "loss": 0.0301, "step": 53700 }, { "epoch": 2.463285372526127, "grad_norm": 0.16948607563972473, "learning_rate": 9.36173792516067e-05, "loss": 0.0329, "step": 53800 }, { "epoch": 2.4678639698728295, "grad_norm": 0.4837573766708374, "learning_rate": 9.357825454010213e-05, "loss": 0.0299, "step": 53900 }, { "epoch": 2.4724425672195323, "grad_norm": 0.4705110490322113, "learning_rate": 9.353901850705972e-05, "loss": 0.0312, "step": 54000 }, { "epoch": 2.477021164566235, "grad_norm": 0.6251786947250366, "learning_rate": 9.349967125270871e-05, "loss": 0.0306, "step": 54100 }, { "epoch": 2.481599761912938, "grad_norm": 0.27536630630493164, "learning_rate": 9.346021287756246e-05, "loss": 0.03, "step": 54200 }, { "epoch": 2.486178359259641, "grad_norm": 0.581510066986084, "learning_rate": 9.342064348241818e-05, "loss": 0.0326, "step": 54300 }, { "epoch": 2.4907569566063437, "grad_norm": 0.4884732961654663, "learning_rate": 9.338096316835671e-05, "loss": 0.0281, "step": 54400 }, { "epoch": 2.4953355539530464, "grad_norm": 0.34184959530830383, "learning_rate": 9.334117203674219e-05, "loss": 0.0308, "step": 54500 }, { "epoch": 2.499914151299749, "grad_norm": 1.2529618740081787, "learning_rate": 9.330127018922194e-05, "loss": 0.0286, "step": 54600 }, { "epoch": 2.5044927486464523, "grad_norm": 0.3773830831050873, "learning_rate": 9.326125772772597e-05, "loss": 0.0313, "step": 54700 }, { "epoch": 2.509071345993155, "grad_norm": 0.5453410744667053, "learning_rate": 9.322113475446698e-05, "loss": 0.029, "step": 54800 }, { "epoch": 2.513649943339858, "grad_norm": 0.4246394634246826, "learning_rate": 9.318090137193988e-05, "loss": 0.0333, "step": 54900 }, { "epoch": 2.5182285406865605, "grad_norm": 0.46837061643600464, "learning_rate": 9.314055768292169e-05, "loss": 0.0311, "step": 55000 }, { "epoch": 2.5228071380332633, "grad_norm": 0.31000879406929016, "learning_rate": 9.310010379047119e-05, "loss": 0.0288, "step": 55100 }, { "epoch": 2.5273857353799665, "grad_norm": 0.36738067865371704, "learning_rate": 9.305953979792865e-05, "loss": 0.0318, "step": 55200 }, { "epoch": 2.531964332726669, "grad_norm": 0.29930517077445984, "learning_rate": 9.301886580891562e-05, "loss": 0.0285, "step": 55300 }, { "epoch": 2.536542930073372, "grad_norm": 0.22497807443141937, "learning_rate": 9.297808192733464e-05, "loss": 0.0283, "step": 55400 }, { "epoch": 2.541121527420075, "grad_norm": 0.6719942688941956, "learning_rate": 9.293718825736897e-05, "loss": 0.0283, "step": 55500 }, { "epoch": 2.5457001247667774, "grad_norm": 0.32624194025993347, "learning_rate": 9.289618490348228e-05, "loss": 0.0309, "step": 55600 }, { "epoch": 2.5502787221134806, "grad_norm": 0.5497521162033081, "learning_rate": 9.285507197041853e-05, "loss": 0.0288, "step": 55700 }, { "epoch": 2.5548573194601834, "grad_norm": 0.8471511006355286, "learning_rate": 9.281384956320153e-05, "loss": 0.0302, "step": 55800 }, { "epoch": 2.559435916806886, "grad_norm": 0.40366891026496887, "learning_rate": 9.277251778713474e-05, "loss": 0.0279, "step": 55900 }, { "epoch": 2.5640145141535893, "grad_norm": 0.10733508318662643, "learning_rate": 9.273107674780102e-05, "loss": 0.0285, "step": 56000 }, { "epoch": 2.568593111500292, "grad_norm": 0.2292618602514267, "learning_rate": 9.268952655106236e-05, "loss": 0.0266, "step": 56100 }, { "epoch": 2.5731717088469948, "grad_norm": 0.4450601637363434, "learning_rate": 9.26478673030596e-05, "loss": 0.0297, "step": 56200 }, { "epoch": 2.5777503061936975, "grad_norm": 1.0813257694244385, "learning_rate": 9.260609911021209e-05, "loss": 0.0319, "step": 56300 }, { "epoch": 2.5823289035404002, "grad_norm": 0.3026310205459595, "learning_rate": 9.256422207921757e-05, "loss": 0.0315, "step": 56400 }, { "epoch": 2.5869075008871034, "grad_norm": 0.23144447803497314, "learning_rate": 9.252223631705175e-05, "loss": 0.0294, "step": 56500 }, { "epoch": 2.591486098233806, "grad_norm": 0.38160964846611023, "learning_rate": 9.248014193096811e-05, "loss": 0.031, "step": 56600 }, { "epoch": 2.596064695580509, "grad_norm": 0.2660236060619354, "learning_rate": 9.243793902849763e-05, "loss": 0.0279, "step": 56700 }, { "epoch": 2.6006432929272116, "grad_norm": 0.7620320320129395, "learning_rate": 9.239562771744848e-05, "loss": 0.0318, "step": 56800 }, { "epoch": 2.6052218902739144, "grad_norm": 0.5840933918952942, "learning_rate": 9.235320810590575e-05, "loss": 0.0317, "step": 56900 }, { "epoch": 2.6098004876206176, "grad_norm": 0.3403662443161011, "learning_rate": 9.231068030223122e-05, "loss": 0.0322, "step": 57000 }, { "epoch": 2.6143790849673203, "grad_norm": 0.2513747811317444, "learning_rate": 9.226804441506302e-05, "loss": 0.0295, "step": 57100 }, { "epoch": 2.618957682314023, "grad_norm": 0.433006227016449, "learning_rate": 9.22253005533154e-05, "loss": 0.0308, "step": 57200 }, { "epoch": 2.623536279660726, "grad_norm": 0.3026902675628662, "learning_rate": 9.218244882617842e-05, "loss": 0.0253, "step": 57300 }, { "epoch": 2.6281148770074285, "grad_norm": 0.4516427516937256, "learning_rate": 9.213948934311767e-05, "loss": 0.0295, "step": 57400 }, { "epoch": 2.6326934743541317, "grad_norm": 0.26671695709228516, "learning_rate": 9.209642221387405e-05, "loss": 0.028, "step": 57500 }, { "epoch": 2.6372720717008344, "grad_norm": 0.5790793299674988, "learning_rate": 9.20532475484634e-05, "loss": 0.0301, "step": 57600 }, { "epoch": 2.641850669047537, "grad_norm": 0.37218374013900757, "learning_rate": 9.200996545717629e-05, "loss": 0.0302, "step": 57700 }, { "epoch": 2.6464292663942404, "grad_norm": 0.5596415400505066, "learning_rate": 9.196657605057769e-05, "loss": 0.0332, "step": 57800 }, { "epoch": 2.651007863740943, "grad_norm": 0.6026178002357483, "learning_rate": 9.192307943950675e-05, "loss": 0.0297, "step": 57900 }, { "epoch": 2.655586461087646, "grad_norm": 0.13471604883670807, "learning_rate": 9.187947573507642e-05, "loss": 0.031, "step": 58000 }, { "epoch": 2.6601650584343486, "grad_norm": 0.7578465342521667, "learning_rate": 9.183576504867327e-05, "loss": 0.0255, "step": 58100 }, { "epoch": 2.6647436557810513, "grad_norm": 0.32717058062553406, "learning_rate": 9.179194749195713e-05, "loss": 0.0295, "step": 58200 }, { "epoch": 2.6693222531277545, "grad_norm": 0.4371168911457062, "learning_rate": 9.174802317686084e-05, "loss": 0.0297, "step": 58300 }, { "epoch": 2.6739008504744572, "grad_norm": 0.3458854854106903, "learning_rate": 9.170399221558995e-05, "loss": 0.0252, "step": 58400 }, { "epoch": 2.67847944782116, "grad_norm": 0.8889488577842712, "learning_rate": 9.165985472062246e-05, "loss": 0.0292, "step": 58500 }, { "epoch": 2.6830580451678627, "grad_norm": 0.6179521679878235, "learning_rate": 9.161561080470847e-05, "loss": 0.0304, "step": 58600 }, { "epoch": 2.6876366425145655, "grad_norm": 0.3913422226905823, "learning_rate": 9.157126058087e-05, "loss": 0.0321, "step": 58700 }, { "epoch": 2.6922152398612686, "grad_norm": 0.31714576482772827, "learning_rate": 9.152680416240059e-05, "loss": 0.0306, "step": 58800 }, { "epoch": 2.6967938372079714, "grad_norm": 0.16598474979400635, "learning_rate": 9.148224166286506e-05, "loss": 0.0308, "step": 58900 }, { "epoch": 2.701372434554674, "grad_norm": 0.4751458466053009, "learning_rate": 9.14375731960992e-05, "loss": 0.0328, "step": 59000 }, { "epoch": 2.705951031901377, "grad_norm": 0.8825288414955139, "learning_rate": 9.139279887620955e-05, "loss": 0.0288, "step": 59100 }, { "epoch": 2.7105296292480796, "grad_norm": 0.4172840714454651, "learning_rate": 9.1347918817573e-05, "loss": 0.0301, "step": 59200 }, { "epoch": 2.715108226594783, "grad_norm": 0.3465460538864136, "learning_rate": 9.13029331348366e-05, "loss": 0.0252, "step": 59300 }, { "epoch": 2.7196868239414855, "grad_norm": 1.264923095703125, "learning_rate": 9.125784194291717e-05, "loss": 0.0272, "step": 59400 }, { "epoch": 2.7242654212881883, "grad_norm": 0.2547473907470703, "learning_rate": 9.121264535700107e-05, "loss": 0.0332, "step": 59500 }, { "epoch": 2.7288440186348915, "grad_norm": 0.508148193359375, "learning_rate": 9.116734349254393e-05, "loss": 0.0317, "step": 59600 }, { "epoch": 2.733422615981594, "grad_norm": 0.6783300638198853, "learning_rate": 9.112193646527024e-05, "loss": 0.0331, "step": 59700 }, { "epoch": 2.738001213328297, "grad_norm": 0.16436424851417542, "learning_rate": 9.107642439117321e-05, "loss": 0.0288, "step": 59800 }, { "epoch": 2.7425798106749997, "grad_norm": 0.4682653546333313, "learning_rate": 9.103080738651434e-05, "loss": 0.0287, "step": 59900 }, { "epoch": 2.7471584080217024, "grad_norm": 0.6873565912246704, "learning_rate": 9.09850855678232e-05, "loss": 0.0337, "step": 60000 }, { "epoch": 2.7517370053684056, "grad_norm": 0.6117233037948608, "learning_rate": 9.093925905189713e-05, "loss": 0.0298, "step": 60100 }, { "epoch": 2.7563156027151083, "grad_norm": 0.17423506081104279, "learning_rate": 9.089332795580086e-05, "loss": 0.03, "step": 60200 }, { "epoch": 2.760894200061811, "grad_norm": 0.5828815698623657, "learning_rate": 9.084729239686633e-05, "loss": 0.0289, "step": 60300 }, { "epoch": 2.765472797408514, "grad_norm": 0.2698822021484375, "learning_rate": 9.080115249269232e-05, "loss": 0.0298, "step": 60400 }, { "epoch": 2.7700513947552166, "grad_norm": 0.5367493629455566, "learning_rate": 9.075490836114413e-05, "loss": 0.0322, "step": 60500 }, { "epoch": 2.7746299921019197, "grad_norm": 0.4073825478553772, "learning_rate": 9.070856012035336e-05, "loss": 0.0292, "step": 60600 }, { "epoch": 2.7792085894486225, "grad_norm": 0.22106589376926422, "learning_rate": 9.066210788871751e-05, "loss": 0.0308, "step": 60700 }, { "epoch": 2.783787186795325, "grad_norm": 0.575246274471283, "learning_rate": 9.061555178489978e-05, "loss": 0.0284, "step": 60800 }, { "epoch": 2.788365784142028, "grad_norm": 0.44034871459007263, "learning_rate": 9.056889192782866e-05, "loss": 0.0277, "step": 60900 }, { "epoch": 2.7929443814887307, "grad_norm": 0.2914714217185974, "learning_rate": 9.05221284366977e-05, "loss": 0.0298, "step": 61000 }, { "epoch": 2.797522978835434, "grad_norm": 0.411410391330719, "learning_rate": 9.04752614309652e-05, "loss": 0.0256, "step": 61100 }, { "epoch": 2.8021015761821366, "grad_norm": 0.172648623585701, "learning_rate": 9.04282910303539e-05, "loss": 0.0326, "step": 61200 }, { "epoch": 2.8066801735288394, "grad_norm": 0.279862642288208, "learning_rate": 9.038121735485062e-05, "loss": 0.0275, "step": 61300 }, { "epoch": 2.8112587708755425, "grad_norm": 0.2992120385169983, "learning_rate": 9.033404052470602e-05, "loss": 0.0287, "step": 61400 }, { "epoch": 2.815837368222245, "grad_norm": 0.3917059004306793, "learning_rate": 9.028676066043428e-05, "loss": 0.0316, "step": 61500 }, { "epoch": 2.820415965568948, "grad_norm": 0.5848602056503296, "learning_rate": 9.023937788281278e-05, "loss": 0.0303, "step": 61600 }, { "epoch": 2.8249945629156508, "grad_norm": 0.4045267701148987, "learning_rate": 9.019189231288176e-05, "loss": 0.0282, "step": 61700 }, { "epoch": 2.8295731602623535, "grad_norm": 0.38309866189956665, "learning_rate": 9.014430407194413e-05, "loss": 0.0287, "step": 61800 }, { "epoch": 2.8341517576090567, "grad_norm": 0.7173412442207336, "learning_rate": 9.009661328156498e-05, "loss": 0.0274, "step": 61900 }, { "epoch": 2.8387303549557594, "grad_norm": 0.37477946281433105, "learning_rate": 9.00488200635714e-05, "loss": 0.0303, "step": 62000 }, { "epoch": 2.843308952302462, "grad_norm": 0.26493415236473083, "learning_rate": 9.000092454005216e-05, "loss": 0.0289, "step": 62100 }, { "epoch": 2.847887549649165, "grad_norm": 0.15275776386260986, "learning_rate": 8.995292683335733e-05, "loss": 0.0304, "step": 62200 }, { "epoch": 2.8524661469958676, "grad_norm": 0.2792358994483948, "learning_rate": 8.990482706609805e-05, "loss": 0.0311, "step": 62300 }, { "epoch": 2.857044744342571, "grad_norm": 0.4240334630012512, "learning_rate": 8.985662536114613e-05, "loss": 0.0304, "step": 62400 }, { "epoch": 2.8616233416892736, "grad_norm": 0.137941375374794, "learning_rate": 8.980832184163382e-05, "loss": 0.0309, "step": 62500 }, { "epoch": 2.8662019390359763, "grad_norm": 0.2340019941329956, "learning_rate": 8.975991663095344e-05, "loss": 0.0296, "step": 62600 }, { "epoch": 2.870780536382679, "grad_norm": 0.39523446559906006, "learning_rate": 8.97114098527571e-05, "loss": 0.0284, "step": 62700 }, { "epoch": 2.875359133729382, "grad_norm": 0.5535847544670105, "learning_rate": 8.966280163095633e-05, "loss": 0.0325, "step": 62800 }, { "epoch": 2.879937731076085, "grad_norm": 0.4570659101009369, "learning_rate": 8.961409208972182e-05, "loss": 0.0237, "step": 62900 }, { "epoch": 2.8845163284227877, "grad_norm": 0.5584346055984497, "learning_rate": 8.95652813534831e-05, "loss": 0.0358, "step": 63000 }, { "epoch": 2.8890949257694905, "grad_norm": 0.961768388748169, "learning_rate": 8.951636954692819e-05, "loss": 0.0299, "step": 63100 }, { "epoch": 2.893673523116193, "grad_norm": 0.24575570225715637, "learning_rate": 8.94673567950033e-05, "loss": 0.0282, "step": 63200 }, { "epoch": 2.898252120462896, "grad_norm": 0.32376107573509216, "learning_rate": 8.941824322291246e-05, "loss": 0.0263, "step": 63300 }, { "epoch": 2.902830717809599, "grad_norm": 0.20682887732982635, "learning_rate": 8.936902895611732e-05, "loss": 0.0313, "step": 63400 }, { "epoch": 2.907409315156302, "grad_norm": 0.29019802808761597, "learning_rate": 8.931971412033673e-05, "loss": 0.0327, "step": 63500 }, { "epoch": 2.9119879125030046, "grad_norm": 0.6069703102111816, "learning_rate": 8.927029884154646e-05, "loss": 0.0272, "step": 63600 }, { "epoch": 2.9165665098497078, "grad_norm": 0.5670173168182373, "learning_rate": 8.922078324597879e-05, "loss": 0.0317, "step": 63700 }, { "epoch": 2.9211451071964105, "grad_norm": 0.29881516098976135, "learning_rate": 8.917116746012235e-05, "loss": 0.0283, "step": 63800 }, { "epoch": 2.9257237045431133, "grad_norm": 0.722374439239502, "learning_rate": 8.91214516107217e-05, "loss": 0.0295, "step": 63900 }, { "epoch": 2.930302301889816, "grad_norm": 0.4505271315574646, "learning_rate": 8.907163582477693e-05, "loss": 0.0282, "step": 64000 }, { "epoch": 2.9348808992365187, "grad_norm": 0.9996728301048279, "learning_rate": 8.902172022954353e-05, "loss": 0.0283, "step": 64100 }, { "epoch": 2.939459496583222, "grad_norm": 0.5205316543579102, "learning_rate": 8.897170495253187e-05, "loss": 0.0281, "step": 64200 }, { "epoch": 2.9440380939299247, "grad_norm": 0.6521015763282776, "learning_rate": 8.892159012150701e-05, "loss": 0.0279, "step": 64300 }, { "epoch": 2.9486166912766274, "grad_norm": 0.8637863397598267, "learning_rate": 8.88713758644883e-05, "loss": 0.0277, "step": 64400 }, { "epoch": 2.95319528862333, "grad_norm": 0.9392446875572205, "learning_rate": 8.88210623097491e-05, "loss": 0.0256, "step": 64500 }, { "epoch": 2.957773885970033, "grad_norm": 0.23240399360656738, "learning_rate": 8.877064958581636e-05, "loss": 0.0276, "step": 64600 }, { "epoch": 2.962352483316736, "grad_norm": 0.5640022158622742, "learning_rate": 8.872013782147047e-05, "loss": 0.0294, "step": 64700 }, { "epoch": 2.966931080663439, "grad_norm": 0.254486620426178, "learning_rate": 8.86695271457447e-05, "loss": 0.0267, "step": 64800 }, { "epoch": 2.9715096780101415, "grad_norm": 0.4906103014945984, "learning_rate": 8.86188176879251e-05, "loss": 0.0279, "step": 64900 }, { "epoch": 2.9760882753568443, "grad_norm": 0.3822503387928009, "learning_rate": 8.856800957755e-05, "loss": 0.0299, "step": 65000 }, { "epoch": 2.980666872703547, "grad_norm": 0.4109038710594177, "learning_rate": 8.851710294440973e-05, "loss": 0.0297, "step": 65100 }, { "epoch": 2.98524547005025, "grad_norm": 0.4413500130176544, "learning_rate": 8.846609791854633e-05, "loss": 0.0272, "step": 65200 }, { "epoch": 2.989824067396953, "grad_norm": 0.762428879737854, "learning_rate": 8.84149946302532e-05, "loss": 0.0279, "step": 65300 }, { "epoch": 2.9944026647436557, "grad_norm": 0.9755131602287292, "learning_rate": 8.83637932100747e-05, "loss": 0.0294, "step": 65400 }, { "epoch": 2.998981262090359, "grad_norm": 0.3907323181629181, "learning_rate": 8.831249378880591e-05, "loss": 0.0312, "step": 65500 }, { "epoch": 2.9999885535066335, "eval_loss": 0.14470230042934418, "eval_runtime": 251.7148, "eval_samples_per_second": 21.85, "eval_steps_per_second": 21.85, "step": 65522 }, { "epoch": 3.0035598594370616, "grad_norm": 0.09335774928331375, "learning_rate": 8.826109649749224e-05, "loss": 0.024, "step": 65600 }, { "epoch": 3.0081384567837643, "grad_norm": 0.43074119091033936, "learning_rate": 8.820960146742913e-05, "loss": 0.0205, "step": 65700 }, { "epoch": 3.012717054130467, "grad_norm": 0.5296483635902405, "learning_rate": 8.815800883016168e-05, "loss": 0.0223, "step": 65800 }, { "epoch": 3.01729565147717, "grad_norm": 0.3759153187274933, "learning_rate": 8.810631871748432e-05, "loss": 0.0207, "step": 65900 }, { "epoch": 3.021874248823873, "grad_norm": 0.6265881657600403, "learning_rate": 8.805453126144047e-05, "loss": 0.0218, "step": 66000 }, { "epoch": 3.0264528461705758, "grad_norm": 1.2174720764160156, "learning_rate": 8.800264659432232e-05, "loss": 0.0217, "step": 66100 }, { "epoch": 3.0310314435172785, "grad_norm": 0.9290931224822998, "learning_rate": 8.795066484867023e-05, "loss": 0.0199, "step": 66200 }, { "epoch": 3.0356100408639812, "grad_norm": 0.6158362030982971, "learning_rate": 8.789858615727265e-05, "loss": 0.0182, "step": 66300 }, { "epoch": 3.040188638210684, "grad_norm": 0.35175448656082153, "learning_rate": 8.784641065316567e-05, "loss": 0.0192, "step": 66400 }, { "epoch": 3.044767235557387, "grad_norm": 0.6219223141670227, "learning_rate": 8.779413846963267e-05, "loss": 0.0174, "step": 66500 }, { "epoch": 3.04934583290409, "grad_norm": 0.1079217791557312, "learning_rate": 8.7741769740204e-05, "loss": 0.0213, "step": 66600 }, { "epoch": 3.0539244302507926, "grad_norm": 0.4346974790096283, "learning_rate": 8.768930459865665e-05, "loss": 0.0207, "step": 66700 }, { "epoch": 3.0585030275974954, "grad_norm": 0.26265600323677063, "learning_rate": 8.76367431790139e-05, "loss": 0.0213, "step": 66800 }, { "epoch": 3.0630816249441986, "grad_norm": 0.536638617515564, "learning_rate": 8.758408561554495e-05, "loss": 0.0207, "step": 66900 }, { "epoch": 3.0676602222909013, "grad_norm": 0.4859350025653839, "learning_rate": 8.753133204276462e-05, "loss": 0.0208, "step": 67000 }, { "epoch": 3.072238819637604, "grad_norm": 0.03394511342048645, "learning_rate": 8.7478482595433e-05, "loss": 0.0202, "step": 67100 }, { "epoch": 3.0768174169843068, "grad_norm": 1.2979317903518677, "learning_rate": 8.742553740855506e-05, "loss": 0.02, "step": 67200 }, { "epoch": 3.0813960143310095, "grad_norm": 0.7448957562446594, "learning_rate": 8.737249661738036e-05, "loss": 0.02, "step": 67300 }, { "epoch": 3.0859746116777127, "grad_norm": 0.45380228757858276, "learning_rate": 8.731936035740269e-05, "loss": 0.0214, "step": 67400 }, { "epoch": 3.0905532090244154, "grad_norm": 0.49080690741539, "learning_rate": 8.726612876435972e-05, "loss": 0.0206, "step": 67500 }, { "epoch": 3.095131806371118, "grad_norm": 0.2271386682987213, "learning_rate": 8.721280197423258e-05, "loss": 0.0218, "step": 67600 }, { "epoch": 3.099710403717821, "grad_norm": 0.7691048383712769, "learning_rate": 8.71593801232457e-05, "loss": 0.0184, "step": 67700 }, { "epoch": 3.104289001064524, "grad_norm": 0.37762150168418884, "learning_rate": 8.710586334786627e-05, "loss": 0.0196, "step": 67800 }, { "epoch": 3.108867598411227, "grad_norm": 0.4796387255191803, "learning_rate": 8.705225178480398e-05, "loss": 0.0194, "step": 67900 }, { "epoch": 3.1134461957579296, "grad_norm": 0.1666077822446823, "learning_rate": 8.699854557101063e-05, "loss": 0.0215, "step": 68000 }, { "epoch": 3.1180247931046323, "grad_norm": 0.287124365568161, "learning_rate": 8.69447448436799e-05, "loss": 0.0184, "step": 68100 }, { "epoch": 3.122603390451335, "grad_norm": 0.2599179744720459, "learning_rate": 8.689084974024677e-05, "loss": 0.0185, "step": 68200 }, { "epoch": 3.1271819877980382, "grad_norm": 0.33696624636650085, "learning_rate": 8.683686039838742e-05, "loss": 0.0199, "step": 68300 }, { "epoch": 3.131760585144741, "grad_norm": 0.4512630105018616, "learning_rate": 8.678277695601872e-05, "loss": 0.0189, "step": 68400 }, { "epoch": 3.1363391824914437, "grad_norm": 1.3083339929580688, "learning_rate": 8.67285995512979e-05, "loss": 0.0205, "step": 68500 }, { "epoch": 3.1409177798381465, "grad_norm": 0.5254839658737183, "learning_rate": 8.66743283226223e-05, "loss": 0.021, "step": 68600 }, { "epoch": 3.145496377184849, "grad_norm": 0.37214428186416626, "learning_rate": 8.66199634086288e-05, "loss": 0.0214, "step": 68700 }, { "epoch": 3.1500749745315524, "grad_norm": 0.39814454317092896, "learning_rate": 8.656550494819373e-05, "loss": 0.0215, "step": 68800 }, { "epoch": 3.154653571878255, "grad_norm": 0.7737843990325928, "learning_rate": 8.651095308043232e-05, "loss": 0.0199, "step": 68900 }, { "epoch": 3.159232169224958, "grad_norm": 0.32976606488227844, "learning_rate": 8.645630794469843e-05, "loss": 0.0232, "step": 69000 }, { "epoch": 3.1638107665716606, "grad_norm": 0.23388764262199402, "learning_rate": 8.640156968058417e-05, "loss": 0.0197, "step": 69100 }, { "epoch": 3.168389363918364, "grad_norm": 0.15984760224819183, "learning_rate": 8.634673842791956e-05, "loss": 0.0212, "step": 69200 }, { "epoch": 3.1729679612650665, "grad_norm": 0.20868225395679474, "learning_rate": 8.629181432677213e-05, "loss": 0.02, "step": 69300 }, { "epoch": 3.1775465586117693, "grad_norm": 0.12190031260251999, "learning_rate": 8.623679751744662e-05, "loss": 0.0195, "step": 69400 }, { "epoch": 3.182125155958472, "grad_norm": 0.7357327342033386, "learning_rate": 8.61816881404846e-05, "loss": 0.0212, "step": 69500 }, { "epoch": 3.186703753305175, "grad_norm": 0.231657475233078, "learning_rate": 8.612648633666406e-05, "loss": 0.0181, "step": 69600 }, { "epoch": 3.191282350651878, "grad_norm": 0.9028156995773315, "learning_rate": 8.607119224699919e-05, "loss": 0.0216, "step": 69700 }, { "epoch": 3.1958609479985807, "grad_norm": 0.30773207545280457, "learning_rate": 8.601580601273982e-05, "loss": 0.0189, "step": 69800 }, { "epoch": 3.2004395453452834, "grad_norm": 0.15716642141342163, "learning_rate": 8.596032777537123e-05, "loss": 0.022, "step": 69900 }, { "epoch": 3.205018142691986, "grad_norm": 0.2637390196323395, "learning_rate": 8.59047576766137e-05, "loss": 0.0174, "step": 70000 }, { "epoch": 3.2095967400386893, "grad_norm": 0.29018816351890564, "learning_rate": 8.584909585842218e-05, "loss": 0.0205, "step": 70100 }, { "epoch": 3.214175337385392, "grad_norm": 0.6676698327064514, "learning_rate": 8.579334246298593e-05, "loss": 0.0176, "step": 70200 }, { "epoch": 3.218753934732095, "grad_norm": 0.3571256101131439, "learning_rate": 8.573749763272811e-05, "loss": 0.0229, "step": 70300 }, { "epoch": 3.2233325320787976, "grad_norm": 0.7378453016281128, "learning_rate": 8.568156151030549e-05, "loss": 0.0185, "step": 70400 }, { "epoch": 3.2279111294255003, "grad_norm": 0.533330500125885, "learning_rate": 8.562553423860802e-05, "loss": 0.0207, "step": 70500 }, { "epoch": 3.2324897267722035, "grad_norm": 0.28255611658096313, "learning_rate": 8.556941596075852e-05, "loss": 0.0185, "step": 70600 }, { "epoch": 3.237068324118906, "grad_norm": 0.37244170904159546, "learning_rate": 8.551320682011228e-05, "loss": 0.0217, "step": 70700 }, { "epoch": 3.241646921465609, "grad_norm": 0.16496537625789642, "learning_rate": 8.545690696025666e-05, "loss": 0.0238, "step": 70800 }, { "epoch": 3.2462255188123117, "grad_norm": 1.0030924081802368, "learning_rate": 8.540051652501082e-05, "loss": 0.0213, "step": 70900 }, { "epoch": 3.250804116159015, "grad_norm": 0.7419716715812683, "learning_rate": 8.534403565842528e-05, "loss": 0.0225, "step": 71000 }, { "epoch": 3.2553827135057176, "grad_norm": 0.2792261242866516, "learning_rate": 8.528746450478156e-05, "loss": 0.0187, "step": 71100 }, { "epoch": 3.2599613108524204, "grad_norm": 0.0836094543337822, "learning_rate": 8.523080320859181e-05, "loss": 0.0221, "step": 71200 }, { "epoch": 3.264539908199123, "grad_norm": 0.10340839624404907, "learning_rate": 8.517405191459847e-05, "loss": 0.0213, "step": 71300 }, { "epoch": 3.2691185055458263, "grad_norm": 0.7118562459945679, "learning_rate": 8.511721076777389e-05, "loss": 0.0193, "step": 71400 }, { "epoch": 3.273697102892529, "grad_norm": 0.12246321886777878, "learning_rate": 8.50602799133199e-05, "loss": 0.0223, "step": 71500 }, { "epoch": 3.2782757002392318, "grad_norm": 0.2873895764350891, "learning_rate": 8.500325949666755e-05, "loss": 0.0213, "step": 71600 }, { "epoch": 3.2828542975859345, "grad_norm": 0.5243780016899109, "learning_rate": 8.494614966347668e-05, "loss": 0.0201, "step": 71700 }, { "epoch": 3.2874328949326372, "grad_norm": 0.28602150082588196, "learning_rate": 8.488895055963546e-05, "loss": 0.0209, "step": 71800 }, { "epoch": 3.2920114922793404, "grad_norm": 0.35241249203681946, "learning_rate": 8.483166233126022e-05, "loss": 0.0217, "step": 71900 }, { "epoch": 3.296590089626043, "grad_norm": 0.6958779096603394, "learning_rate": 8.477428512469488e-05, "loss": 0.023, "step": 72000 }, { "epoch": 3.301168686972746, "grad_norm": 0.13842323422431946, "learning_rate": 8.471681908651067e-05, "loss": 0.0202, "step": 72100 }, { "epoch": 3.3057472843194486, "grad_norm": 0.21349883079528809, "learning_rate": 8.46592643635058e-05, "loss": 0.0209, "step": 72200 }, { "epoch": 3.3103258816661514, "grad_norm": 0.3605678975582123, "learning_rate": 8.460162110270494e-05, "loss": 0.0241, "step": 72300 }, { "epoch": 3.3149044790128546, "grad_norm": 0.46661075949668884, "learning_rate": 8.454388945135895e-05, "loss": 0.0193, "step": 72400 }, { "epoch": 3.3194830763595573, "grad_norm": 0.24211075901985168, "learning_rate": 8.448606955694457e-05, "loss": 0.0214, "step": 72500 }, { "epoch": 3.32406167370626, "grad_norm": 0.3622238337993622, "learning_rate": 8.442816156716385e-05, "loss": 0.0213, "step": 72600 }, { "epoch": 3.328640271052963, "grad_norm": 1.0499359369277954, "learning_rate": 8.437016562994397e-05, "loss": 0.0196, "step": 72700 }, { "epoch": 3.3332188683996655, "grad_norm": 0.2845001816749573, "learning_rate": 8.43120818934367e-05, "loss": 0.0202, "step": 72800 }, { "epoch": 3.3377974657463687, "grad_norm": 0.5690521001815796, "learning_rate": 8.42539105060181e-05, "loss": 0.0209, "step": 72900 }, { "epoch": 3.3423760630930714, "grad_norm": 0.09998586773872375, "learning_rate": 8.419565161628823e-05, "loss": 0.018, "step": 73000 }, { "epoch": 3.346954660439774, "grad_norm": 0.9970934391021729, "learning_rate": 8.413730537307056e-05, "loss": 0.0213, "step": 73100 }, { "epoch": 3.351533257786477, "grad_norm": 1.1385819911956787, "learning_rate": 8.407887192541177e-05, "loss": 0.0198, "step": 73200 }, { "epoch": 3.35611185513318, "grad_norm": 0.6288115382194519, "learning_rate": 8.402035142258131e-05, "loss": 0.0211, "step": 73300 }, { "epoch": 3.360690452479883, "grad_norm": 0.35352623462677, "learning_rate": 8.396174401407095e-05, "loss": 0.0189, "step": 73400 }, { "epoch": 3.3652690498265856, "grad_norm": 0.5127176642417908, "learning_rate": 8.390304984959454e-05, "loss": 0.0195, "step": 73500 }, { "epoch": 3.3698476471732883, "grad_norm": 0.9110797643661499, "learning_rate": 8.384426907908754e-05, "loss": 0.0219, "step": 73600 }, { "epoch": 3.3744262445199915, "grad_norm": 0.22417746484279633, "learning_rate": 8.378540185270656e-05, "loss": 0.0194, "step": 73700 }, { "epoch": 3.3790048418666943, "grad_norm": 0.49265140295028687, "learning_rate": 8.372644832082917e-05, "loss": 0.0205, "step": 73800 }, { "epoch": 3.383583439213397, "grad_norm": 0.7536473870277405, "learning_rate": 8.366740863405336e-05, "loss": 0.0222, "step": 73900 }, { "epoch": 3.3881620365600997, "grad_norm": 0.2447548657655716, "learning_rate": 8.360828294319721e-05, "loss": 0.0205, "step": 74000 }, { "epoch": 3.3927406339068025, "grad_norm": 0.3335092067718506, "learning_rate": 8.354907139929851e-05, "loss": 0.0208, "step": 74100 }, { "epoch": 3.3973192312535057, "grad_norm": 0.6961463689804077, "learning_rate": 8.348977415361434e-05, "loss": 0.018, "step": 74200 }, { "epoch": 3.4018978286002084, "grad_norm": 0.4184730648994446, "learning_rate": 8.343039135762071e-05, "loss": 0.0198, "step": 74300 }, { "epoch": 3.406476425946911, "grad_norm": 0.6484507918357849, "learning_rate": 8.337092316301223e-05, "loss": 0.0203, "step": 74400 }, { "epoch": 3.411055023293614, "grad_norm": 0.31808891892433167, "learning_rate": 8.331136972170155e-05, "loss": 0.0202, "step": 74500 }, { "epoch": 3.4156336206403166, "grad_norm": 0.6552246809005737, "learning_rate": 8.325173118581919e-05, "loss": 0.0198, "step": 74600 }, { "epoch": 3.42021221798702, "grad_norm": 0.5105406641960144, "learning_rate": 8.319200770771298e-05, "loss": 0.0197, "step": 74700 }, { "epoch": 3.4247908153337225, "grad_norm": 0.9565762877464294, "learning_rate": 8.313219943994777e-05, "loss": 0.019, "step": 74800 }, { "epoch": 3.4293694126804253, "grad_norm": 0.7772880792617798, "learning_rate": 8.3072306535305e-05, "loss": 0.0207, "step": 74900 }, { "epoch": 3.433948010027128, "grad_norm": 0.6711807250976562, "learning_rate": 8.30123291467823e-05, "loss": 0.0222, "step": 75000 }, { "epoch": 3.438526607373831, "grad_norm": 0.10591955482959747, "learning_rate": 8.295226742759315e-05, "loss": 0.0199, "step": 75100 }, { "epoch": 3.443105204720534, "grad_norm": 0.5128488540649414, "learning_rate": 8.289212153116642e-05, "loss": 0.0219, "step": 75200 }, { "epoch": 3.4476838020672367, "grad_norm": 0.24297969043254852, "learning_rate": 8.283189161114602e-05, "loss": 0.0205, "step": 75300 }, { "epoch": 3.4522623994139394, "grad_norm": 0.9164755344390869, "learning_rate": 8.27715778213905e-05, "loss": 0.0224, "step": 75400 }, { "epoch": 3.4568409967606426, "grad_norm": 0.493466317653656, "learning_rate": 8.271118031597271e-05, "loss": 0.0204, "step": 75500 }, { "epoch": 3.4614195941073453, "grad_norm": 0.27884870767593384, "learning_rate": 8.265069924917925e-05, "loss": 0.0199, "step": 75600 }, { "epoch": 3.465998191454048, "grad_norm": 0.2624457776546478, "learning_rate": 8.259013477551027e-05, "loss": 0.0223, "step": 75700 }, { "epoch": 3.470576788800751, "grad_norm": 0.6593875885009766, "learning_rate": 8.252948704967896e-05, "loss": 0.0186, "step": 75800 }, { "epoch": 3.4751553861474536, "grad_norm": 0.398616760969162, "learning_rate": 8.246875622661113e-05, "loss": 0.0199, "step": 75900 }, { "epoch": 3.4797339834941567, "grad_norm": 0.2612878978252411, "learning_rate": 8.240794246144492e-05, "loss": 0.0207, "step": 76000 }, { "epoch": 3.4843125808408595, "grad_norm": 0.21333344280719757, "learning_rate": 8.234704590953033e-05, "loss": 0.0205, "step": 76100 }, { "epoch": 3.4888911781875622, "grad_norm": 1.0213849544525146, "learning_rate": 8.228606672642884e-05, "loss": 0.0199, "step": 76200 }, { "epoch": 3.493469775534265, "grad_norm": 0.29667162895202637, "learning_rate": 8.222500506791304e-05, "loss": 0.0215, "step": 76300 }, { "epoch": 3.4980483728809677, "grad_norm": 0.20311638712882996, "learning_rate": 8.216386108996614e-05, "loss": 0.0219, "step": 76400 }, { "epoch": 3.502626970227671, "grad_norm": 0.8317406177520752, "learning_rate": 8.21026349487817e-05, "loss": 0.0215, "step": 76500 }, { "epoch": 3.5072055675743736, "grad_norm": 0.4841706156730652, "learning_rate": 8.204132680076312e-05, "loss": 0.0207, "step": 76600 }, { "epoch": 3.5117841649210764, "grad_norm": 0.5647122263908386, "learning_rate": 8.197993680252334e-05, "loss": 0.0217, "step": 76700 }, { "epoch": 3.516362762267779, "grad_norm": 0.9369067549705505, "learning_rate": 8.191846511088435e-05, "loss": 0.0215, "step": 76800 }, { "epoch": 3.520941359614482, "grad_norm": 0.7805814743041992, "learning_rate": 8.185691188287684e-05, "loss": 0.0219, "step": 76900 }, { "epoch": 3.525519956961185, "grad_norm": 1.2135581970214844, "learning_rate": 8.179527727573975e-05, "loss": 0.0193, "step": 77000 }, { "epoch": 3.5300985543078878, "grad_norm": 0.14101019501686096, "learning_rate": 8.173356144691999e-05, "loss": 0.0211, "step": 77100 }, { "epoch": 3.5346771516545905, "grad_norm": 0.7078022956848145, "learning_rate": 8.167176455407187e-05, "loss": 0.0204, "step": 77200 }, { "epoch": 3.5392557490012937, "grad_norm": 1.2366012334823608, "learning_rate": 8.160988675505679e-05, "loss": 0.0183, "step": 77300 }, { "epoch": 3.5438343463479964, "grad_norm": 0.26279062032699585, "learning_rate": 8.15479282079429e-05, "loss": 0.02, "step": 77400 }, { "epoch": 3.548412943694699, "grad_norm": 0.21293646097183228, "learning_rate": 8.148588907100454e-05, "loss": 0.0203, "step": 77500 }, { "epoch": 3.552991541041402, "grad_norm": 0.48216012120246887, "learning_rate": 8.142376950272193e-05, "loss": 0.0192, "step": 77600 }, { "epoch": 3.5575701383881047, "grad_norm": 0.1273164004087448, "learning_rate": 8.136156966178081e-05, "loss": 0.0183, "step": 77700 }, { "epoch": 3.562148735734808, "grad_norm": 0.621910035610199, "learning_rate": 8.12992897070719e-05, "loss": 0.0217, "step": 77800 }, { "epoch": 3.5667273330815106, "grad_norm": 0.3813430964946747, "learning_rate": 8.123692979769064e-05, "loss": 0.0184, "step": 77900 }, { "epoch": 3.5713059304282133, "grad_norm": 0.3676023781299591, "learning_rate": 8.117449009293668e-05, "loss": 0.0175, "step": 78000 }, { "epoch": 3.575884527774916, "grad_norm": 0.41113948822021484, "learning_rate": 8.111197075231351e-05, "loss": 0.0194, "step": 78100 }, { "epoch": 3.580463125121619, "grad_norm": 0.2245587855577469, "learning_rate": 8.104937193552806e-05, "loss": 0.0212, "step": 78200 }, { "epoch": 3.585041722468322, "grad_norm": 0.08874198794364929, "learning_rate": 8.098669380249029e-05, "loss": 0.0192, "step": 78300 }, { "epoch": 3.5896203198150247, "grad_norm": 0.29562532901763916, "learning_rate": 8.092393651331275e-05, "loss": 0.022, "step": 78400 }, { "epoch": 3.5941989171617275, "grad_norm": 0.47509998083114624, "learning_rate": 8.086110022831023e-05, "loss": 0.0202, "step": 78500 }, { "epoch": 3.59877751450843, "grad_norm": 0.41073593497276306, "learning_rate": 8.079818510799928e-05, "loss": 0.0214, "step": 78600 }, { "epoch": 3.603356111855133, "grad_norm": 0.2985229790210724, "learning_rate": 8.073519131309786e-05, "loss": 0.0165, "step": 78700 }, { "epoch": 3.607934709201836, "grad_norm": 0.7368443012237549, "learning_rate": 8.067211900452492e-05, "loss": 0.0177, "step": 78800 }, { "epoch": 3.612513306548539, "grad_norm": 0.46281248331069946, "learning_rate": 8.060896834339993e-05, "loss": 0.0221, "step": 78900 }, { "epoch": 3.6170919038952416, "grad_norm": 0.18318797647953033, "learning_rate": 8.054573949104253e-05, "loss": 0.0191, "step": 79000 }, { "epoch": 3.621670501241945, "grad_norm": 0.19009487330913544, "learning_rate": 8.048243260897217e-05, "loss": 0.0212, "step": 79100 }, { "epoch": 3.6262490985886475, "grad_norm": 0.38268911838531494, "learning_rate": 8.041904785890749e-05, "loss": 0.0197, "step": 79200 }, { "epoch": 3.6308276959353503, "grad_norm": 0.3892700672149658, "learning_rate": 8.035558540276618e-05, "loss": 0.0214, "step": 79300 }, { "epoch": 3.635406293282053, "grad_norm": 0.6497855186462402, "learning_rate": 8.029204540266434e-05, "loss": 0.0192, "step": 79400 }, { "epoch": 3.6399848906287557, "grad_norm": 0.20039434731006622, "learning_rate": 8.022842802091623e-05, "loss": 0.0188, "step": 79500 }, { "epoch": 3.644563487975459, "grad_norm": 0.19965870678424835, "learning_rate": 8.016473342003372e-05, "loss": 0.0204, "step": 79600 }, { "epoch": 3.6491420853221617, "grad_norm": 0.12873798608779907, "learning_rate": 8.010096176272595e-05, "loss": 0.0189, "step": 79700 }, { "epoch": 3.6537206826688644, "grad_norm": 0.26886749267578125, "learning_rate": 8.003711321189895e-05, "loss": 0.0206, "step": 79800 }, { "epoch": 3.658299280015567, "grad_norm": 0.4891631305217743, "learning_rate": 7.997318793065513e-05, "loss": 0.0204, "step": 79900 }, { "epoch": 3.66287787736227, "grad_norm": 0.2781907021999359, "learning_rate": 7.99091860822929e-05, "loss": 0.0192, "step": 80000 }, { "epoch": 3.667456474708973, "grad_norm": 0.3009509742259979, "learning_rate": 7.984510783030632e-05, "loss": 0.0185, "step": 80100 }, { "epoch": 3.672035072055676, "grad_norm": 0.5892056822776794, "learning_rate": 7.978095333838457e-05, "loss": 0.0191, "step": 80200 }, { "epoch": 3.6766136694023785, "grad_norm": 0.3318547308444977, "learning_rate": 7.97167227704116e-05, "loss": 0.0194, "step": 80300 }, { "epoch": 3.6811922667490813, "grad_norm": 0.4608217179775238, "learning_rate": 7.965241629046571e-05, "loss": 0.0215, "step": 80400 }, { "epoch": 3.685770864095784, "grad_norm": 0.39660006761550903, "learning_rate": 7.95880340628191e-05, "loss": 0.0172, "step": 80500 }, { "epoch": 3.690349461442487, "grad_norm": 0.182856485247612, "learning_rate": 7.952357625193749e-05, "loss": 0.0184, "step": 80600 }, { "epoch": 3.69492805878919, "grad_norm": 0.6444191932678223, "learning_rate": 7.945904302247969e-05, "loss": 0.0179, "step": 80700 }, { "epoch": 3.6995066561358927, "grad_norm": 0.4182109534740448, "learning_rate": 7.939443453929712e-05, "loss": 0.0217, "step": 80800 }, { "epoch": 3.704085253482596, "grad_norm": 4.025650501251221, "learning_rate": 7.932975096743346e-05, "loss": 0.0203, "step": 80900 }, { "epoch": 3.708663850829298, "grad_norm": 0.665017306804657, "learning_rate": 7.926499247212422e-05, "loss": 0.0186, "step": 81000 }, { "epoch": 3.7132424481760014, "grad_norm": 0.12548814713954926, "learning_rate": 7.920015921879631e-05, "loss": 0.0182, "step": 81100 }, { "epoch": 3.717821045522704, "grad_norm": 0.33034953474998474, "learning_rate": 7.913525137306756e-05, "loss": 0.0225, "step": 81200 }, { "epoch": 3.722399642869407, "grad_norm": 0.2771977186203003, "learning_rate": 7.907026910074643e-05, "loss": 0.0206, "step": 81300 }, { "epoch": 3.72697824021611, "grad_norm": 0.1603299379348755, "learning_rate": 7.900521256783143e-05, "loss": 0.0191, "step": 81400 }, { "epoch": 3.7315568375628128, "grad_norm": 0.29296520352363586, "learning_rate": 7.894008194051077e-05, "loss": 0.0199, "step": 81500 }, { "epoch": 3.7361354349095155, "grad_norm": 0.3158813416957855, "learning_rate": 7.8874877385162e-05, "loss": 0.0216, "step": 81600 }, { "epoch": 3.7407140322562182, "grad_norm": 0.42911648750305176, "learning_rate": 7.880959906835148e-05, "loss": 0.0174, "step": 81700 }, { "epoch": 3.745292629602921, "grad_norm": 0.3854501247406006, "learning_rate": 7.8744247156834e-05, "loss": 0.0217, "step": 81800 }, { "epoch": 3.749871226949624, "grad_norm": 0.1661909967660904, "learning_rate": 7.86788218175523e-05, "loss": 0.0185, "step": 81900 }, { "epoch": 3.754449824296327, "grad_norm": 0.3275599479675293, "learning_rate": 7.861332321763682e-05, "loss": 0.0172, "step": 82000 }, { "epoch": 3.7590284216430296, "grad_norm": 0.4914777874946594, "learning_rate": 7.854775152440501e-05, "loss": 0.0206, "step": 82100 }, { "epoch": 3.7636070189897324, "grad_norm": 0.6310822367668152, "learning_rate": 7.84821069053611e-05, "loss": 0.0193, "step": 82200 }, { "epoch": 3.768185616336435, "grad_norm": 0.33729735016822815, "learning_rate": 7.841638952819563e-05, "loss": 0.0209, "step": 82300 }, { "epoch": 3.7727642136831383, "grad_norm": 0.6020189523696899, "learning_rate": 7.835059956078494e-05, "loss": 0.0194, "step": 82400 }, { "epoch": 3.777342811029841, "grad_norm": 0.3810158669948578, "learning_rate": 7.828473717119088e-05, "loss": 0.0199, "step": 82500 }, { "epoch": 3.781921408376544, "grad_norm": 0.6647739410400391, "learning_rate": 7.821880252766025e-05, "loss": 0.0211, "step": 82600 }, { "epoch": 3.7865000057232465, "grad_norm": 0.5358772873878479, "learning_rate": 7.815279579862442e-05, "loss": 0.0196, "step": 82700 }, { "epoch": 3.7910786030699493, "grad_norm": 0.26241055130958557, "learning_rate": 7.808671715269896e-05, "loss": 0.0206, "step": 82800 }, { "epoch": 3.7956572004166524, "grad_norm": 0.24061718583106995, "learning_rate": 7.802056675868306e-05, "loss": 0.0186, "step": 82900 }, { "epoch": 3.800235797763355, "grad_norm": 0.16280798614025116, "learning_rate": 7.79543447855593e-05, "loss": 0.0185, "step": 83000 }, { "epoch": 3.804814395110058, "grad_norm": 0.7385302186012268, "learning_rate": 7.788805140249302e-05, "loss": 0.0207, "step": 83100 }, { "epoch": 3.809392992456761, "grad_norm": 0.20743854343891144, "learning_rate": 7.782168677883206e-05, "loss": 0.0177, "step": 83200 }, { "epoch": 3.813971589803464, "grad_norm": 0.3482532501220703, "learning_rate": 7.775525108410615e-05, "loss": 0.0216, "step": 83300 }, { "epoch": 3.8185501871501666, "grad_norm": 0.42130351066589355, "learning_rate": 7.768874448802665e-05, "loss": 0.0207, "step": 83400 }, { "epoch": 3.8231287844968693, "grad_norm": 0.44204580783843994, "learning_rate": 7.762216716048602e-05, "loss": 0.0215, "step": 83500 }, { "epoch": 3.827707381843572, "grad_norm": 0.20962856709957123, "learning_rate": 7.755551927155739e-05, "loss": 0.0183, "step": 83600 }, { "epoch": 3.8322859791902752, "grad_norm": 0.19921015202999115, "learning_rate": 7.748880099149415e-05, "loss": 0.02, "step": 83700 }, { "epoch": 3.836864576536978, "grad_norm": 0.2693636119365692, "learning_rate": 7.742201249072948e-05, "loss": 0.019, "step": 83800 }, { "epoch": 3.8414431738836807, "grad_norm": 0.677135705947876, "learning_rate": 7.735515393987602e-05, "loss": 0.0195, "step": 83900 }, { "epoch": 3.8460217712303835, "grad_norm": 0.34260210394859314, "learning_rate": 7.728822550972523e-05, "loss": 0.0194, "step": 84000 }, { "epoch": 3.850600368577086, "grad_norm": 0.83556067943573, "learning_rate": 7.72212273712472e-05, "loss": 0.0226, "step": 84100 }, { "epoch": 3.8551789659237894, "grad_norm": 0.22360268235206604, "learning_rate": 7.715415969559002e-05, "loss": 0.0177, "step": 84200 }, { "epoch": 3.859757563270492, "grad_norm": 0.32109469175338745, "learning_rate": 7.708702265407941e-05, "loss": 0.0197, "step": 84300 }, { "epoch": 3.864336160617195, "grad_norm": 0.4577140212059021, "learning_rate": 7.701981641821834e-05, "loss": 0.0173, "step": 84400 }, { "epoch": 3.8689147579638976, "grad_norm": 0.30675482749938965, "learning_rate": 7.695254115968648e-05, "loss": 0.0198, "step": 84500 }, { "epoch": 3.8734933553106004, "grad_norm": 0.6526969075202942, "learning_rate": 7.688519705033989e-05, "loss": 0.0222, "step": 84600 }, { "epoch": 3.8780719526573035, "grad_norm": 0.09654036164283752, "learning_rate": 7.681778426221042e-05, "loss": 0.0194, "step": 84700 }, { "epoch": 3.8826505500040063, "grad_norm": 0.2337755411863327, "learning_rate": 7.675030296750542e-05, "loss": 0.019, "step": 84800 }, { "epoch": 3.887229147350709, "grad_norm": 0.05356181785464287, "learning_rate": 7.668275333860724e-05, "loss": 0.0202, "step": 84900 }, { "epoch": 3.891807744697412, "grad_norm": 0.4630540907382965, "learning_rate": 7.66151355480728e-05, "loss": 0.0182, "step": 85000 }, { "epoch": 3.896386342044115, "grad_norm": 0.21360653638839722, "learning_rate": 7.65474497686331e-05, "loss": 0.0198, "step": 85100 }, { "epoch": 3.9009649393908177, "grad_norm": 0.2991812229156494, "learning_rate": 7.647969617319282e-05, "loss": 0.0201, "step": 85200 }, { "epoch": 3.9055435367375204, "grad_norm": 0.214981809258461, "learning_rate": 7.641187493482995e-05, "loss": 0.0164, "step": 85300 }, { "epoch": 3.910122134084223, "grad_norm": 0.48418205976486206, "learning_rate": 7.634398622679517e-05, "loss": 0.0192, "step": 85400 }, { "epoch": 3.9147007314309263, "grad_norm": 0.5781142711639404, "learning_rate": 7.62760302225116e-05, "loss": 0.0199, "step": 85500 }, { "epoch": 3.919279328777629, "grad_norm": 0.7809280157089233, "learning_rate": 7.620800709557421e-05, "loss": 0.0186, "step": 85600 }, { "epoch": 3.923857926124332, "grad_norm": 0.1833581030368805, "learning_rate": 7.61399170197495e-05, "loss": 0.0189, "step": 85700 }, { "epoch": 3.9284365234710346, "grad_norm": 0.3215663433074951, "learning_rate": 7.60717601689749e-05, "loss": 0.0168, "step": 85800 }, { "epoch": 3.9330151208177373, "grad_norm": 0.41018444299697876, "learning_rate": 7.600353671735853e-05, "loss": 0.0208, "step": 85900 }, { "epoch": 3.9375937181644405, "grad_norm": 0.34082677960395813, "learning_rate": 7.593524683917854e-05, "loss": 0.0191, "step": 86000 }, { "epoch": 3.942172315511143, "grad_norm": 0.39426901936531067, "learning_rate": 7.586689070888284e-05, "loss": 0.0199, "step": 86100 }, { "epoch": 3.946750912857846, "grad_norm": 0.4446451663970947, "learning_rate": 7.579846850108855e-05, "loss": 0.0204, "step": 86200 }, { "epoch": 3.9513295102045487, "grad_norm": 0.3159216344356537, "learning_rate": 7.572998039058159e-05, "loss": 0.0183, "step": 86300 }, { "epoch": 3.9559081075512514, "grad_norm": 0.3799346387386322, "learning_rate": 7.566142655231622e-05, "loss": 0.019, "step": 86400 }, { "epoch": 3.9604867048979546, "grad_norm": 0.4832625687122345, "learning_rate": 7.559280716141463e-05, "loss": 0.0179, "step": 86500 }, { "epoch": 3.9650653022446574, "grad_norm": 0.2456403523683548, "learning_rate": 7.552412239316645e-05, "loss": 0.0184, "step": 86600 }, { "epoch": 3.96964389959136, "grad_norm": 0.3314709961414337, "learning_rate": 7.545537242302829e-05, "loss": 0.0177, "step": 86700 }, { "epoch": 3.9742224969380633, "grad_norm": 0.4336375892162323, "learning_rate": 7.53865574266234e-05, "loss": 0.0187, "step": 86800 }, { "epoch": 3.978801094284766, "grad_norm": 0.7629146575927734, "learning_rate": 7.531767757974104e-05, "loss": 0.0199, "step": 86900 }, { "epoch": 3.9833796916314688, "grad_norm": 0.16511370241641998, "learning_rate": 7.52487330583362e-05, "loss": 0.0182, "step": 87000 }, { "epoch": 3.9879582889781715, "grad_norm": 0.29885396361351013, "learning_rate": 7.517972403852905e-05, "loss": 0.0193, "step": 87100 }, { "epoch": 3.9925368863248742, "grad_norm": 0.4066375494003296, "learning_rate": 7.511065069660458e-05, "loss": 0.0191, "step": 87200 }, { "epoch": 3.9971154836715774, "grad_norm": 0.44243311882019043, "learning_rate": 7.504151320901199e-05, "loss": 0.0203, "step": 87300 }, { "epoch": 4.0, "eval_loss": 0.15203019976615906, "eval_runtime": 258.1696, "eval_samples_per_second": 21.304, "eval_steps_per_second": 21.304, "step": 87363 }, { "epoch": 4.00169408101828, "grad_norm": 0.2750494182109833, "learning_rate": 7.497231175236442e-05, "loss": 0.0174, "step": 87400 }, { "epoch": 4.006272678364983, "grad_norm": 0.4887785315513611, "learning_rate": 7.490304650343841e-05, "loss": 0.0131, "step": 87500 }, { "epoch": 4.010851275711686, "grad_norm": 0.21974627673625946, "learning_rate": 7.483371763917345e-05, "loss": 0.0141, "step": 87600 }, { "epoch": 4.015429873058388, "grad_norm": 0.34770917892456055, "learning_rate": 7.476432533667151e-05, "loss": 0.0139, "step": 87700 }, { "epoch": 4.020008470405092, "grad_norm": 0.2878529727458954, "learning_rate": 7.469486977319665e-05, "loss": 0.0118, "step": 87800 }, { "epoch": 4.024587067751794, "grad_norm": 0.6604347229003906, "learning_rate": 7.462535112617452e-05, "loss": 0.0128, "step": 87900 }, { "epoch": 4.029165665098497, "grad_norm": 0.4288138747215271, "learning_rate": 7.455576957319194e-05, "loss": 0.0145, "step": 88000 }, { "epoch": 4.0337442624452, "grad_norm": 0.19010120630264282, "learning_rate": 7.448612529199637e-05, "loss": 0.0114, "step": 88100 }, { "epoch": 4.0383228597919025, "grad_norm": 0.3835040032863617, "learning_rate": 7.441641846049556e-05, "loss": 0.0152, "step": 88200 }, { "epoch": 4.042901457138606, "grad_norm": 2.3910844326019287, "learning_rate": 7.434664925675702e-05, "loss": 0.0153, "step": 88300 }, { "epoch": 4.047480054485308, "grad_norm": 0.40593621134757996, "learning_rate": 7.427681785900761e-05, "loss": 0.0143, "step": 88400 }, { "epoch": 4.052058651832011, "grad_norm": 0.08815860003232956, "learning_rate": 7.420692444563305e-05, "loss": 0.014, "step": 88500 }, { "epoch": 4.056637249178714, "grad_norm": 0.33992356061935425, "learning_rate": 7.413696919517749e-05, "loss": 0.0135, "step": 88600 }, { "epoch": 4.061215846525417, "grad_norm": 0.32726776599884033, "learning_rate": 7.406695228634305e-05, "loss": 0.0131, "step": 88700 }, { "epoch": 4.06579444387212, "grad_norm": 0.3524836301803589, "learning_rate": 7.399687389798933e-05, "loss": 0.0136, "step": 88800 }, { "epoch": 4.070373041218823, "grad_norm": 0.18603968620300293, "learning_rate": 7.3926734209133e-05, "loss": 0.0123, "step": 88900 }, { "epoch": 4.074951638565525, "grad_norm": 0.4780280888080597, "learning_rate": 7.385653339894733e-05, "loss": 0.0142, "step": 89000 }, { "epoch": 4.0795302359122285, "grad_norm": 0.22851374745368958, "learning_rate": 7.378627164676173e-05, "loss": 0.013, "step": 89100 }, { "epoch": 4.084108833258931, "grad_norm": 0.4251825511455536, "learning_rate": 7.371594913206124e-05, "loss": 0.0153, "step": 89200 }, { "epoch": 4.088687430605634, "grad_norm": 0.3959885239601135, "learning_rate": 7.364556603448619e-05, "loss": 0.0166, "step": 89300 }, { "epoch": 4.093266027952337, "grad_norm": 0.8459362387657166, "learning_rate": 7.357512253383162e-05, "loss": 0.0152, "step": 89400 }, { "epoch": 4.0978446252990395, "grad_norm": 0.5725641250610352, "learning_rate": 7.35046188100469e-05, "loss": 0.0135, "step": 89500 }, { "epoch": 4.102423222645743, "grad_norm": 0.2906801402568817, "learning_rate": 7.343405504323519e-05, "loss": 0.013, "step": 89600 }, { "epoch": 4.107001819992445, "grad_norm": 0.10050017386674881, "learning_rate": 7.33634314136531e-05, "loss": 0.0114, "step": 89700 }, { "epoch": 4.111580417339148, "grad_norm": 0.6948938965797424, "learning_rate": 7.329274810171014e-05, "loss": 0.0138, "step": 89800 }, { "epoch": 4.116159014685851, "grad_norm": 0.4069768190383911, "learning_rate": 7.322200528796822e-05, "loss": 0.0124, "step": 89900 }, { "epoch": 4.120737612032554, "grad_norm": 0.09699010848999023, "learning_rate": 7.315120315314134e-05, "loss": 0.0128, "step": 90000 }, { "epoch": 4.125316209379257, "grad_norm": 0.3347591161727905, "learning_rate": 7.308034187809498e-05, "loss": 0.0166, "step": 90100 }, { "epoch": 4.129894806725959, "grad_norm": 0.22168204188346863, "learning_rate": 7.300942164384571e-05, "loss": 0.0151, "step": 90200 }, { "epoch": 4.134473404072662, "grad_norm": 0.5564683675765991, "learning_rate": 7.293844263156072e-05, "loss": 0.0126, "step": 90300 }, { "epoch": 4.1390520014193655, "grad_norm": 0.32226261496543884, "learning_rate": 7.28674050225573e-05, "loss": 0.0131, "step": 90400 }, { "epoch": 4.143630598766068, "grad_norm": 0.36912479996681213, "learning_rate": 7.279630899830252e-05, "loss": 0.0143, "step": 90500 }, { "epoch": 4.148209196112771, "grad_norm": 0.2860753834247589, "learning_rate": 7.272515474041259e-05, "loss": 0.0152, "step": 90600 }, { "epoch": 4.152787793459474, "grad_norm": 0.3625887930393219, "learning_rate": 7.265394243065253e-05, "loss": 0.0143, "step": 90700 }, { "epoch": 4.157366390806176, "grad_norm": 0.24506491422653198, "learning_rate": 7.258267225093563e-05, "loss": 0.015, "step": 90800 }, { "epoch": 4.16194498815288, "grad_norm": 0.03290629759430885, "learning_rate": 7.251134438332299e-05, "loss": 0.0126, "step": 90900 }, { "epoch": 4.166523585499582, "grad_norm": 0.4261631667613983, "learning_rate": 7.243995901002312e-05, "loss": 0.0148, "step": 91000 }, { "epoch": 4.171102182846285, "grad_norm": 0.14463308453559875, "learning_rate": 7.23685163133914e-05, "loss": 0.0113, "step": 91100 }, { "epoch": 4.175680780192988, "grad_norm": 0.53131502866745, "learning_rate": 7.229701647592966e-05, "loss": 0.0136, "step": 91200 }, { "epoch": 4.180259377539691, "grad_norm": 0.30526795983314514, "learning_rate": 7.222545968028569e-05, "loss": 0.0142, "step": 91300 }, { "epoch": 4.184837974886394, "grad_norm": 0.07798325270414352, "learning_rate": 7.215384610925278e-05, "loss": 0.0134, "step": 91400 }, { "epoch": 4.189416572233096, "grad_norm": 0.164367213845253, "learning_rate": 7.208217594576923e-05, "loss": 0.0127, "step": 91500 }, { "epoch": 4.193995169579799, "grad_norm": 0.0945630893111229, "learning_rate": 7.201044937291797e-05, "loss": 0.0118, "step": 91600 }, { "epoch": 4.198573766926502, "grad_norm": 0.38682791590690613, "learning_rate": 7.193866657392597e-05, "loss": 0.0141, "step": 91700 }, { "epoch": 4.203152364273205, "grad_norm": 0.49326708912849426, "learning_rate": 7.186682773216384e-05, "loss": 0.0125, "step": 91800 }, { "epoch": 4.207730961619908, "grad_norm": 0.2276126593351364, "learning_rate": 7.179493303114537e-05, "loss": 0.014, "step": 91900 }, { "epoch": 4.21230955896661, "grad_norm": 0.5109021067619324, "learning_rate": 7.172298265452706e-05, "loss": 0.0138, "step": 92000 }, { "epoch": 4.216888156313313, "grad_norm": 0.23471687734127045, "learning_rate": 7.165097678610759e-05, "loss": 0.014, "step": 92100 }, { "epoch": 4.221466753660017, "grad_norm": 0.4894104301929474, "learning_rate": 7.15789156098274e-05, "loss": 0.0155, "step": 92200 }, { "epoch": 4.226045351006719, "grad_norm": 0.1319025456905365, "learning_rate": 7.150679930976825e-05, "loss": 0.0135, "step": 92300 }, { "epoch": 4.230623948353422, "grad_norm": 0.32496750354766846, "learning_rate": 7.143462807015271e-05, "loss": 0.0136, "step": 92400 }, { "epoch": 4.235202545700124, "grad_norm": 0.380876749753952, "learning_rate": 7.136240207534365e-05, "loss": 0.0148, "step": 92500 }, { "epoch": 4.2397811430468275, "grad_norm": 0.18530067801475525, "learning_rate": 7.129012150984387e-05, "loss": 0.0143, "step": 92600 }, { "epoch": 4.244359740393531, "grad_norm": 0.9411688446998596, "learning_rate": 7.121778655829554e-05, "loss": 0.0115, "step": 92700 }, { "epoch": 4.248938337740233, "grad_norm": 0.22460629045963287, "learning_rate": 7.114539740547974e-05, "loss": 0.0159, "step": 92800 }, { "epoch": 4.253516935086936, "grad_norm": 0.19735155999660492, "learning_rate": 7.107295423631606e-05, "loss": 0.0133, "step": 92900 }, { "epoch": 4.258095532433639, "grad_norm": 0.2656545341014862, "learning_rate": 7.100045723586204e-05, "loss": 0.0125, "step": 93000 }, { "epoch": 4.262674129780342, "grad_norm": 1.059777021408081, "learning_rate": 7.092790658931273e-05, "loss": 0.0148, "step": 93100 }, { "epoch": 4.267252727127045, "grad_norm": 0.3590608835220337, "learning_rate": 7.085530248200027e-05, "loss": 0.0139, "step": 93200 }, { "epoch": 4.271831324473747, "grad_norm": 0.133284792304039, "learning_rate": 7.07826450993933e-05, "loss": 0.0153, "step": 93300 }, { "epoch": 4.27640992182045, "grad_norm": 0.3305582106113434, "learning_rate": 7.070993462709656e-05, "loss": 0.0129, "step": 93400 }, { "epoch": 4.2809885191671535, "grad_norm": 0.4209526777267456, "learning_rate": 7.06371712508505e-05, "loss": 0.0125, "step": 93500 }, { "epoch": 4.285567116513856, "grad_norm": 0.10924796760082245, "learning_rate": 7.056435515653059e-05, "loss": 0.0162, "step": 93600 }, { "epoch": 4.290145713860559, "grad_norm": 0.4727434515953064, "learning_rate": 7.049148653014702e-05, "loss": 0.0126, "step": 93700 }, { "epoch": 4.294724311207261, "grad_norm": 0.5440820455551147, "learning_rate": 7.041856555784421e-05, "loss": 0.0131, "step": 93800 }, { "epoch": 4.2993029085539645, "grad_norm": 0.07101954519748688, "learning_rate": 7.034559242590027e-05, "loss": 0.0163, "step": 93900 }, { "epoch": 4.303881505900668, "grad_norm": 1.4522393941879272, "learning_rate": 7.027256732072651e-05, "loss": 0.014, "step": 94000 }, { "epoch": 4.30846010324737, "grad_norm": 0.1080670952796936, "learning_rate": 7.019949042886708e-05, "loss": 0.013, "step": 94100 }, { "epoch": 4.313038700594073, "grad_norm": 0.4725320339202881, "learning_rate": 7.012636193699837e-05, "loss": 0.0133, "step": 94200 }, { "epoch": 4.317617297940776, "grad_norm": 0.7752532362937927, "learning_rate": 7.005318203192864e-05, "loss": 0.0136, "step": 94300 }, { "epoch": 4.322195895287479, "grad_norm": 0.39167362451553345, "learning_rate": 6.997995090059739e-05, "loss": 0.0132, "step": 94400 }, { "epoch": 4.326774492634182, "grad_norm": 0.16077743470668793, "learning_rate": 6.990666873007505e-05, "loss": 0.0126, "step": 94500 }, { "epoch": 4.331353089980884, "grad_norm": 0.20132170617580414, "learning_rate": 6.983333570756245e-05, "loss": 0.0125, "step": 94600 }, { "epoch": 4.335931687327587, "grad_norm": 0.4036431610584259, "learning_rate": 6.975995202039025e-05, "loss": 0.0149, "step": 94700 }, { "epoch": 4.34051028467429, "grad_norm": 0.8535305261611938, "learning_rate": 6.968651785601859e-05, "loss": 0.0136, "step": 94800 }, { "epoch": 4.345088882020993, "grad_norm": 0.3927995562553406, "learning_rate": 6.961303340203653e-05, "loss": 0.0146, "step": 94900 }, { "epoch": 4.349667479367696, "grad_norm": 0.371528297662735, "learning_rate": 6.953949884616162e-05, "loss": 0.0124, "step": 95000 }, { "epoch": 4.354246076714398, "grad_norm": 0.06207489222288132, "learning_rate": 6.946591437623934e-05, "loss": 0.0129, "step": 95100 }, { "epoch": 4.358824674061101, "grad_norm": 0.05522959679365158, "learning_rate": 6.939228018024275e-05, "loss": 0.0133, "step": 95200 }, { "epoch": 4.363403271407805, "grad_norm": 0.5625087022781372, "learning_rate": 6.931859644627189e-05, "loss": 0.0141, "step": 95300 }, { "epoch": 4.367981868754507, "grad_norm": 0.13779932260513306, "learning_rate": 6.924486336255337e-05, "loss": 0.0135, "step": 95400 }, { "epoch": 4.37256046610121, "grad_norm": 1.0762056112289429, "learning_rate": 6.917108111743984e-05, "loss": 0.0142, "step": 95500 }, { "epoch": 4.377139063447912, "grad_norm": 0.22283124923706055, "learning_rate": 6.909724989940953e-05, "loss": 0.0133, "step": 95600 }, { "epoch": 4.3817176607946156, "grad_norm": 0.5186660289764404, "learning_rate": 6.902336989706581e-05, "loss": 0.0136, "step": 95700 }, { "epoch": 4.386296258141319, "grad_norm": 0.47632691264152527, "learning_rate": 6.894944129913667e-05, "loss": 0.0147, "step": 95800 }, { "epoch": 4.390874855488021, "grad_norm": 1.1676534414291382, "learning_rate": 6.887546429447419e-05, "loss": 0.0128, "step": 95900 }, { "epoch": 4.395453452834724, "grad_norm": 1.0476038455963135, "learning_rate": 6.880143907205411e-05, "loss": 0.0132, "step": 96000 }, { "epoch": 4.4000320501814265, "grad_norm": 0.656058669090271, "learning_rate": 6.872736582097541e-05, "loss": 0.0152, "step": 96100 }, { "epoch": 4.40461064752813, "grad_norm": 0.3963877856731415, "learning_rate": 6.86532447304597e-05, "loss": 0.0122, "step": 96200 }, { "epoch": 4.409189244874833, "grad_norm": 0.23698298633098602, "learning_rate": 6.857907598985081e-05, "loss": 0.0135, "step": 96300 }, { "epoch": 4.413767842221535, "grad_norm": 0.20948071777820587, "learning_rate": 6.850485978861431e-05, "loss": 0.0136, "step": 96400 }, { "epoch": 4.418346439568238, "grad_norm": 0.3551422357559204, "learning_rate": 6.843059631633699e-05, "loss": 0.0143, "step": 96500 }, { "epoch": 4.4229250369149415, "grad_norm": 0.21045321226119995, "learning_rate": 6.835628576272638e-05, "loss": 0.0149, "step": 96600 }, { "epoch": 4.427503634261644, "grad_norm": 1.5752928256988525, "learning_rate": 6.828192831761033e-05, "loss": 0.0151, "step": 96700 }, { "epoch": 4.432082231608347, "grad_norm": 0.4416331350803375, "learning_rate": 6.820752417093644e-05, "loss": 0.0133, "step": 96800 }, { "epoch": 4.436660828955049, "grad_norm": 0.44132721424102783, "learning_rate": 6.81330735127716e-05, "loss": 0.0101, "step": 96900 }, { "epoch": 4.4412394263017525, "grad_norm": 0.2506002187728882, "learning_rate": 6.805857653330156e-05, "loss": 0.0128, "step": 97000 }, { "epoch": 4.445818023648456, "grad_norm": 0.11981073021888733, "learning_rate": 6.798403342283034e-05, "loss": 0.0127, "step": 97100 }, { "epoch": 4.450396620995158, "grad_norm": 0.9063414335250854, "learning_rate": 6.790944437177984e-05, "loss": 0.0136, "step": 97200 }, { "epoch": 4.454975218341861, "grad_norm": 1.0382390022277832, "learning_rate": 6.783480957068934e-05, "loss": 0.0116, "step": 97300 }, { "epoch": 4.4595538156885635, "grad_norm": 0.22426804900169373, "learning_rate": 6.776012921021492e-05, "loss": 0.0149, "step": 97400 }, { "epoch": 4.464132413035267, "grad_norm": 0.4911547899246216, "learning_rate": 6.768540348112907e-05, "loss": 0.0123, "step": 97500 }, { "epoch": 4.46871101038197, "grad_norm": 0.6653274893760681, "learning_rate": 6.761063257432023e-05, "loss": 0.0121, "step": 97600 }, { "epoch": 4.473289607728672, "grad_norm": 0.37786972522735596, "learning_rate": 6.753581668079219e-05, "loss": 0.0133, "step": 97700 }, { "epoch": 4.477868205075375, "grad_norm": 0.15616688132286072, "learning_rate": 6.746095599166362e-05, "loss": 0.013, "step": 97800 }, { "epoch": 4.482446802422078, "grad_norm": 0.11935741454362869, "learning_rate": 6.738605069816775e-05, "loss": 0.0148, "step": 97900 }, { "epoch": 4.487025399768781, "grad_norm": 0.18721537292003632, "learning_rate": 6.731110099165164e-05, "loss": 0.0139, "step": 98000 }, { "epoch": 4.491603997115484, "grad_norm": 0.3637322783470154, "learning_rate": 6.723610706357582e-05, "loss": 0.0148, "step": 98100 }, { "epoch": 4.496182594462186, "grad_norm": 0.1633034497499466, "learning_rate": 6.716106910551385e-05, "loss": 0.0127, "step": 98200 }, { "epoch": 4.5007611918088894, "grad_norm": 0.19283847510814667, "learning_rate": 6.708598730915168e-05, "loss": 0.0132, "step": 98300 }, { "epoch": 4.505339789155592, "grad_norm": 0.17327933013439178, "learning_rate": 6.701086186628732e-05, "loss": 0.0156, "step": 98400 }, { "epoch": 4.509918386502295, "grad_norm": 0.06521926075220108, "learning_rate": 6.693569296883022e-05, "loss": 0.0137, "step": 98500 }, { "epoch": 4.514496983848998, "grad_norm": 0.4145078659057617, "learning_rate": 6.686048080880086e-05, "loss": 0.0144, "step": 98600 }, { "epoch": 4.5190755811957, "grad_norm": 0.5390291810035706, "learning_rate": 6.678522557833024e-05, "loss": 0.0132, "step": 98700 }, { "epoch": 4.523654178542404, "grad_norm": 0.2249838411808014, "learning_rate": 6.670992746965938e-05, "loss": 0.0122, "step": 98800 }, { "epoch": 4.528232775889107, "grad_norm": 0.09684702008962631, "learning_rate": 6.663458667513882e-05, "loss": 0.0122, "step": 98900 }, { "epoch": 4.532811373235809, "grad_norm": 0.5852058529853821, "learning_rate": 6.655920338722816e-05, "loss": 0.014, "step": 99000 }, { "epoch": 4.537389970582512, "grad_norm": 0.4523356258869171, "learning_rate": 6.648377779849554e-05, "loss": 0.0129, "step": 99100 }, { "epoch": 4.5419685679292146, "grad_norm": 0.4520733058452606, "learning_rate": 6.640831010161716e-05, "loss": 0.0123, "step": 99200 }, { "epoch": 4.546547165275918, "grad_norm": 0.42760178446769714, "learning_rate": 6.633280048937678e-05, "loss": 0.0171, "step": 99300 }, { "epoch": 4.551125762622621, "grad_norm": 0.27447327971458435, "learning_rate": 6.625724915466526e-05, "loss": 0.0136, "step": 99400 }, { "epoch": 4.555704359969323, "grad_norm": 0.30612578988075256, "learning_rate": 6.618165629048e-05, "loss": 0.0133, "step": 99500 }, { "epoch": 4.560282957316026, "grad_norm": 0.48825210332870483, "learning_rate": 6.610602208992454e-05, "loss": 0.0123, "step": 99600 }, { "epoch": 4.564861554662729, "grad_norm": 0.43417781591415405, "learning_rate": 6.603034674620794e-05, "loss": 0.0149, "step": 99700 }, { "epoch": 4.569440152009432, "grad_norm": 0.6489459276199341, "learning_rate": 6.595463045264445e-05, "loss": 0.0118, "step": 99800 }, { "epoch": 4.574018749356135, "grad_norm": 0.29751142859458923, "learning_rate": 6.587887340265286e-05, "loss": 0.0122, "step": 99900 }, { "epoch": 4.578597346702837, "grad_norm": 0.1352328062057495, "learning_rate": 6.580307578975608e-05, "loss": 0.0139, "step": 100000 }, { "epoch": 4.5831759440495405, "grad_norm": 0.2985703945159912, "learning_rate": 6.572723780758069e-05, "loss": 0.0121, "step": 100100 }, { "epoch": 4.587754541396244, "grad_norm": 0.1775195151567459, "learning_rate": 6.565135964985634e-05, "loss": 0.0139, "step": 100200 }, { "epoch": 4.592333138742946, "grad_norm": 0.41841718554496765, "learning_rate": 6.557544151041531e-05, "loss": 0.0146, "step": 100300 }, { "epoch": 4.596911736089649, "grad_norm": 0.07853005081415176, "learning_rate": 6.549948358319206e-05, "loss": 0.0138, "step": 100400 }, { "epoch": 4.6014903334363515, "grad_norm": 0.39813074469566345, "learning_rate": 6.542348606222266e-05, "loss": 0.0127, "step": 100500 }, { "epoch": 4.606068930783055, "grad_norm": 0.3754967749118805, "learning_rate": 6.53474491416443e-05, "loss": 0.0156, "step": 100600 }, { "epoch": 4.610647528129757, "grad_norm": 0.6578196287155151, "learning_rate": 6.527137301569486e-05, "loss": 0.0125, "step": 100700 }, { "epoch": 4.61522612547646, "grad_norm": 0.7814628481864929, "learning_rate": 6.519525787871235e-05, "loss": 0.0142, "step": 100800 }, { "epoch": 4.619804722823163, "grad_norm": 0.23694345355033875, "learning_rate": 6.511910392513443e-05, "loss": 0.0115, "step": 100900 }, { "epoch": 4.624383320169866, "grad_norm": 0.18302284181118011, "learning_rate": 6.504291134949792e-05, "loss": 0.0138, "step": 101000 }, { "epoch": 4.628961917516569, "grad_norm": 0.5445951223373413, "learning_rate": 6.496668034643831e-05, "loss": 0.0149, "step": 101100 }, { "epoch": 4.633540514863272, "grad_norm": 0.6721272468566895, "learning_rate": 6.489041111068926e-05, "loss": 0.014, "step": 101200 }, { "epoch": 4.638119112209974, "grad_norm": 0.40816518664360046, "learning_rate": 6.481410383708206e-05, "loss": 0.012, "step": 101300 }, { "epoch": 4.6426977095566775, "grad_norm": 0.28873249888420105, "learning_rate": 6.473775872054521e-05, "loss": 0.0148, "step": 101400 }, { "epoch": 4.64727630690338, "grad_norm": 0.5939431190490723, "learning_rate": 6.466137595610388e-05, "loss": 0.0124, "step": 101500 }, { "epoch": 4.651854904250083, "grad_norm": 0.08564829081296921, "learning_rate": 6.458495573887933e-05, "loss": 0.0128, "step": 101600 }, { "epoch": 4.656433501596786, "grad_norm": 0.8717368245124817, "learning_rate": 6.450849826408865e-05, "loss": 0.0137, "step": 101700 }, { "epoch": 4.6610120989434884, "grad_norm": 0.02314877323806286, "learning_rate": 6.443200372704395e-05, "loss": 0.0151, "step": 101800 }, { "epoch": 4.665590696290192, "grad_norm": 0.2785604000091553, "learning_rate": 6.43554723231521e-05, "loss": 0.0111, "step": 101900 }, { "epoch": 4.670169293636894, "grad_norm": 0.14578752219676971, "learning_rate": 6.427890424791415e-05, "loss": 0.0131, "step": 102000 }, { "epoch": 4.674747890983597, "grad_norm": 0.14544513821601868, "learning_rate": 6.420229969692477e-05, "loss": 0.0136, "step": 102100 }, { "epoch": 4.6793264883303, "grad_norm": 0.36046111583709717, "learning_rate": 6.412565886587185e-05, "loss": 0.0135, "step": 102200 }, { "epoch": 4.683905085677003, "grad_norm": 0.208379328250885, "learning_rate": 6.404898195053597e-05, "loss": 0.0132, "step": 102300 }, { "epoch": 4.688483683023706, "grad_norm": 0.04505769535899162, "learning_rate": 6.397226914678986e-05, "loss": 0.014, "step": 102400 }, { "epoch": 4.693062280370409, "grad_norm": 0.12393535673618317, "learning_rate": 6.389552065059795e-05, "loss": 0.0142, "step": 102500 }, { "epoch": 4.697640877717111, "grad_norm": 0.14113786816596985, "learning_rate": 6.381873665801581e-05, "loss": 0.0146, "step": 102600 }, { "epoch": 4.702219475063814, "grad_norm": 0.14992213249206543, "learning_rate": 6.374191736518974e-05, "loss": 0.01, "step": 102700 }, { "epoch": 4.706798072410517, "grad_norm": 0.24738559126853943, "learning_rate": 6.366506296835616e-05, "loss": 0.0114, "step": 102800 }, { "epoch": 4.71137666975722, "grad_norm": 0.6193427443504333, "learning_rate": 6.358817366384122e-05, "loss": 0.0139, "step": 102900 }, { "epoch": 4.715955267103923, "grad_norm": 0.24367505311965942, "learning_rate": 6.35112496480602e-05, "loss": 0.0113, "step": 103000 }, { "epoch": 4.720533864450625, "grad_norm": 0.11543486267328262, "learning_rate": 6.343429111751704e-05, "loss": 0.015, "step": 103100 }, { "epoch": 4.725112461797329, "grad_norm": 0.23988036811351776, "learning_rate": 6.33572982688039e-05, "loss": 0.0121, "step": 103200 }, { "epoch": 4.729691059144031, "grad_norm": 0.26978039741516113, "learning_rate": 6.328027129860057e-05, "loss": 0.0117, "step": 103300 }, { "epoch": 4.734269656490734, "grad_norm": 0.047924984246492386, "learning_rate": 6.3203210403674e-05, "loss": 0.0141, "step": 103400 }, { "epoch": 4.738848253837437, "grad_norm": 0.23787090182304382, "learning_rate": 6.312611578087784e-05, "loss": 0.0133, "step": 103500 }, { "epoch": 4.7434268511841395, "grad_norm": 0.9701817035675049, "learning_rate": 6.304898762715186e-05, "loss": 0.0121, "step": 103600 }, { "epoch": 4.748005448530843, "grad_norm": 0.5129296183586121, "learning_rate": 6.29718261395215e-05, "loss": 0.0161, "step": 103700 }, { "epoch": 4.752584045877546, "grad_norm": 0.2481413185596466, "learning_rate": 6.289463151509733e-05, "loss": 0.0142, "step": 103800 }, { "epoch": 4.757162643224248, "grad_norm": 0.4262784719467163, "learning_rate": 6.281740395107462e-05, "loss": 0.0152, "step": 103900 }, { "epoch": 4.761741240570951, "grad_norm": 0.42060771584510803, "learning_rate": 6.274014364473274e-05, "loss": 0.0132, "step": 104000 }, { "epoch": 4.766319837917654, "grad_norm": 0.2619081437587738, "learning_rate": 6.26628507934347e-05, "loss": 0.0124, "step": 104100 }, { "epoch": 4.770898435264357, "grad_norm": 0.47017577290534973, "learning_rate": 6.258552559462668e-05, "loss": 0.0132, "step": 104200 }, { "epoch": 4.775477032611059, "grad_norm": 0.5897017121315002, "learning_rate": 6.250816824583747e-05, "loss": 0.0134, "step": 104300 }, { "epoch": 4.780055629957762, "grad_norm": 0.41096287965774536, "learning_rate": 6.243077894467799e-05, "loss": 0.0139, "step": 104400 }, { "epoch": 4.7846342273044655, "grad_norm": 0.9277390241622925, "learning_rate": 6.235335788884079e-05, "loss": 0.0114, "step": 104500 }, { "epoch": 4.789212824651168, "grad_norm": 0.4034029245376587, "learning_rate": 6.227590527609952e-05, "loss": 0.0117, "step": 104600 }, { "epoch": 4.793791421997871, "grad_norm": 0.08527888357639313, "learning_rate": 6.219842130430846e-05, "loss": 0.0139, "step": 104700 }, { "epoch": 4.798370019344574, "grad_norm": 0.43536534905433655, "learning_rate": 6.2120906171402e-05, "loss": 0.0136, "step": 104800 }, { "epoch": 4.8029486166912765, "grad_norm": 0.14146916568279266, "learning_rate": 6.204336007539412e-05, "loss": 0.014, "step": 104900 }, { "epoch": 4.80752721403798, "grad_norm": 0.2524791657924652, "learning_rate": 6.19657832143779e-05, "loss": 0.0149, "step": 105000 }, { "epoch": 4.812105811384682, "grad_norm": 0.14325548708438873, "learning_rate": 6.1888175786525e-05, "loss": 0.0135, "step": 105100 }, { "epoch": 4.816684408731385, "grad_norm": 0.08125073462724686, "learning_rate": 6.181053799008519e-05, "loss": 0.012, "step": 105200 }, { "epoch": 4.821263006078088, "grad_norm": 0.765481173992157, "learning_rate": 6.173287002338577e-05, "loss": 0.0123, "step": 105300 }, { "epoch": 4.825841603424791, "grad_norm": 0.6038789749145508, "learning_rate": 6.165517208483117e-05, "loss": 0.0135, "step": 105400 }, { "epoch": 4.830420200771494, "grad_norm": 0.3267226219177246, "learning_rate": 6.157744437290236e-05, "loss": 0.012, "step": 105500 }, { "epoch": 4.834998798118196, "grad_norm": 0.08704890310764313, "learning_rate": 6.149968708615634e-05, "loss": 0.0136, "step": 105600 }, { "epoch": 4.839577395464899, "grad_norm": 0.22156277298927307, "learning_rate": 6.142190042322569e-05, "loss": 0.013, "step": 105700 }, { "epoch": 4.8441559928116025, "grad_norm": 0.12729842960834503, "learning_rate": 6.134408458281805e-05, "loss": 0.014, "step": 105800 }, { "epoch": 4.848734590158305, "grad_norm": 0.21868254244327545, "learning_rate": 6.12662397637155e-05, "loss": 0.0154, "step": 105900 }, { "epoch": 4.853313187505008, "grad_norm": 0.3205544352531433, "learning_rate": 6.118836616477427e-05, "loss": 0.0132, "step": 106000 }, { "epoch": 4.857891784851711, "grad_norm": 0.26208868622779846, "learning_rate": 6.111046398492404e-05, "loss": 0.0139, "step": 106100 }, { "epoch": 4.862470382198413, "grad_norm": 0.6751037836074829, "learning_rate": 6.103253342316753e-05, "loss": 0.0129, "step": 106200 }, { "epoch": 4.867048979545117, "grad_norm": 0.2062651365995407, "learning_rate": 6.095457467857989e-05, "loss": 0.0145, "step": 106300 }, { "epoch": 4.871627576891819, "grad_norm": 0.18155290186405182, "learning_rate": 6.087658795030837e-05, "loss": 0.0127, "step": 106400 }, { "epoch": 4.876206174238522, "grad_norm": 0.17720471322536469, "learning_rate": 6.079857343757165e-05, "loss": 0.0134, "step": 106500 }, { "epoch": 4.880784771585224, "grad_norm": 0.09973806142807007, "learning_rate": 6.072053133965938e-05, "loss": 0.0116, "step": 106600 }, { "epoch": 4.885363368931928, "grad_norm": 0.25288718938827515, "learning_rate": 6.064246185593167e-05, "loss": 0.0127, "step": 106700 }, { "epoch": 4.889941966278631, "grad_norm": 0.19430892169475555, "learning_rate": 6.056436518581864e-05, "loss": 0.0147, "step": 106800 }, { "epoch": 4.894520563625333, "grad_norm": 0.31932905316352844, "learning_rate": 6.0486241528819795e-05, "loss": 0.0127, "step": 106900 }, { "epoch": 4.899099160972036, "grad_norm": 0.06558812409639359, "learning_rate": 6.040809108450363e-05, "loss": 0.0124, "step": 107000 }, { "epoch": 4.903677758318739, "grad_norm": 0.20380474627017975, "learning_rate": 6.032991405250702e-05, "loss": 0.0147, "step": 107100 }, { "epoch": 4.908256355665442, "grad_norm": 0.08541610836982727, "learning_rate": 6.025171063253479e-05, "loss": 0.014, "step": 107200 }, { "epoch": 4.912834953012145, "grad_norm": 0.3804337978363037, "learning_rate": 6.017348102435918e-05, "loss": 0.0116, "step": 107300 }, { "epoch": 4.917413550358847, "grad_norm": 0.3044677674770355, "learning_rate": 6.00952254278193e-05, "loss": 0.0141, "step": 107400 }, { "epoch": 4.92199214770555, "grad_norm": 0.4350314438343048, "learning_rate": 6.001694404282068e-05, "loss": 0.0129, "step": 107500 }, { "epoch": 4.926570745052254, "grad_norm": 0.19222760200500488, "learning_rate": 5.993863706933468e-05, "loss": 0.0124, "step": 107600 }, { "epoch": 4.931149342398956, "grad_norm": 0.36865904927253723, "learning_rate": 5.986030470739811e-05, "loss": 0.0113, "step": 107700 }, { "epoch": 4.935727939745659, "grad_norm": 0.20282283425331116, "learning_rate": 5.9781947157112536e-05, "loss": 0.013, "step": 107800 }, { "epoch": 4.940306537092361, "grad_norm": 0.11859617382287979, "learning_rate": 5.970356461864391e-05, "loss": 0.0138, "step": 107900 }, { "epoch": 4.9448851344390645, "grad_norm": 0.5312494039535522, "learning_rate": 5.962515729222208e-05, "loss": 0.0128, "step": 108000 }, { "epoch": 4.949463731785768, "grad_norm": 0.40164250135421753, "learning_rate": 5.95467253781401e-05, "loss": 0.0117, "step": 108100 }, { "epoch": 4.95404232913247, "grad_norm": 0.11808757483959198, "learning_rate": 5.9468269076753894e-05, "loss": 0.0121, "step": 108200 }, { "epoch": 4.958620926479173, "grad_norm": 0.20174367725849152, "learning_rate": 5.938978858848171e-05, "loss": 0.0122, "step": 108300 }, { "epoch": 4.963199523825876, "grad_norm": 0.33299440145492554, "learning_rate": 5.9311284113803524e-05, "loss": 0.0115, "step": 108400 }, { "epoch": 4.967778121172579, "grad_norm": 0.6904717683792114, "learning_rate": 5.9232755853260635e-05, "loss": 0.0139, "step": 108500 }, { "epoch": 4.972356718519282, "grad_norm": 0.17567585408687592, "learning_rate": 5.915420400745507e-05, "loss": 0.0118, "step": 108600 }, { "epoch": 4.976935315865984, "grad_norm": 0.16880100965499878, "learning_rate": 5.907562877704912e-05, "loss": 0.015, "step": 108700 }, { "epoch": 4.981513913212687, "grad_norm": 0.2917187213897705, "learning_rate": 5.899703036276482e-05, "loss": 0.0135, "step": 108800 }, { "epoch": 4.9860925105593905, "grad_norm": 0.028255263343453407, "learning_rate": 5.891840896538339e-05, "loss": 0.0112, "step": 108900 }, { "epoch": 4.990671107906093, "grad_norm": 0.2152412086725235, "learning_rate": 5.883976478574482e-05, "loss": 0.014, "step": 109000 }, { "epoch": 4.995249705252796, "grad_norm": 0.3723663091659546, "learning_rate": 5.876109802474725e-05, "loss": 0.0123, "step": 109100 }, { "epoch": 4.999828302599498, "grad_norm": 0.6162732243537903, "learning_rate": 5.868240888334653e-05, "loss": 0.0161, "step": 109200 }, { "epoch": 4.9999656605199, "eval_loss": 0.17184050381183624, "eval_runtime": 244.2658, "eval_samples_per_second": 22.516, "eval_steps_per_second": 22.516, "step": 109203 }, { "epoch": 5.0044068999462015, "grad_norm": 0.5017980337142944, "learning_rate": 5.860369756255566e-05, "loss": 0.0083, "step": 109300 }, { "epoch": 5.008985497292905, "grad_norm": 0.14825376868247986, "learning_rate": 5.8524964263444324e-05, "loss": 0.0097, "step": 109400 }, { "epoch": 5.013564094639607, "grad_norm": 1.7440462112426758, "learning_rate": 5.8446209187138324e-05, "loss": 0.0083, "step": 109500 }, { "epoch": 5.01814269198631, "grad_norm": 0.25318461656570435, "learning_rate": 5.8367432534819124e-05, "loss": 0.0094, "step": 109600 }, { "epoch": 5.022721289333012, "grad_norm": 0.0751919150352478, "learning_rate": 5.8288634507723274e-05, "loss": 0.0089, "step": 109700 }, { "epoch": 5.027299886679716, "grad_norm": 0.3842028081417084, "learning_rate": 5.820981530714191e-05, "loss": 0.0088, "step": 109800 }, { "epoch": 5.031878484026419, "grad_norm": 0.11625286936759949, "learning_rate": 5.813097513442035e-05, "loss": 0.008, "step": 109900 }, { "epoch": 5.036457081373121, "grad_norm": 0.25438615679740906, "learning_rate": 5.805211419095736e-05, "loss": 0.009, "step": 110000 }, { "epoch": 5.041035678719824, "grad_norm": 0.13749825954437256, "learning_rate": 5.797323267820484e-05, "loss": 0.0092, "step": 110100 }, { "epoch": 5.0456142760665275, "grad_norm": 0.06733408570289612, "learning_rate": 5.789433079766723e-05, "loss": 0.0097, "step": 110200 }, { "epoch": 5.05019287341323, "grad_norm": 0.2959531843662262, "learning_rate": 5.7815408750900993e-05, "loss": 0.0071, "step": 110300 }, { "epoch": 5.054771470759933, "grad_norm": 0.10893545299768448, "learning_rate": 5.773646673951406e-05, "loss": 0.0096, "step": 110400 }, { "epoch": 5.059350068106635, "grad_norm": 0.9517889618873596, "learning_rate": 5.765750496516547e-05, "loss": 0.0108, "step": 110500 }, { "epoch": 5.063928665453338, "grad_norm": 0.31945428252220154, "learning_rate": 5.757852362956463e-05, "loss": 0.0107, "step": 110600 }, { "epoch": 5.068507262800042, "grad_norm": 0.2407699078321457, "learning_rate": 5.7499522934470994e-05, "loss": 0.0083, "step": 110700 }, { "epoch": 5.073085860146744, "grad_norm": 0.15435832738876343, "learning_rate": 5.7420503081693446e-05, "loss": 0.0086, "step": 110800 }, { "epoch": 5.077664457493447, "grad_norm": 0.4791698455810547, "learning_rate": 5.734146427308979e-05, "loss": 0.0072, "step": 110900 }, { "epoch": 5.082243054840149, "grad_norm": 0.14484897255897522, "learning_rate": 5.7262406710566296e-05, "loss": 0.0105, "step": 111000 }, { "epoch": 5.086821652186853, "grad_norm": 0.5574690103530884, "learning_rate": 5.71833305960771e-05, "loss": 0.0092, "step": 111100 }, { "epoch": 5.091400249533556, "grad_norm": 0.3678722679615021, "learning_rate": 5.7104236131623736e-05, "loss": 0.0099, "step": 111200 }, { "epoch": 5.095978846880258, "grad_norm": 0.8227113485336304, "learning_rate": 5.702512351925464e-05, "loss": 0.008, "step": 111300 }, { "epoch": 5.100557444226961, "grad_norm": 0.13089661300182343, "learning_rate": 5.6945992961064586e-05, "loss": 0.0081, "step": 111400 }, { "epoch": 5.1051360415736635, "grad_norm": 0.008971684612333775, "learning_rate": 5.6866844659194185e-05, "loss": 0.0084, "step": 111500 }, { "epoch": 5.109714638920367, "grad_norm": 0.0824974775314331, "learning_rate": 5.6787678815829404e-05, "loss": 0.0098, "step": 111600 }, { "epoch": 5.11429323626707, "grad_norm": 0.17469094693660736, "learning_rate": 5.6708495633200964e-05, "loss": 0.0078, "step": 111700 }, { "epoch": 5.118871833613772, "grad_norm": 0.13333024084568024, "learning_rate": 5.6629295313583974e-05, "loss": 0.0082, "step": 111800 }, { "epoch": 5.123450430960475, "grad_norm": 0.43794387578964233, "learning_rate": 5.6550078059297205e-05, "loss": 0.0089, "step": 111900 }, { "epoch": 5.1280290283071785, "grad_norm": 0.37814435362815857, "learning_rate": 5.6470844072702764e-05, "loss": 0.0105, "step": 112000 }, { "epoch": 5.132607625653881, "grad_norm": 0.3779330253601074, "learning_rate": 5.639159355620551e-05, "loss": 0.0084, "step": 112100 }, { "epoch": 5.137186223000584, "grad_norm": 0.30869078636169434, "learning_rate": 5.631232671225247e-05, "loss": 0.0093, "step": 112200 }, { "epoch": 5.141764820347286, "grad_norm": 0.3333792984485626, "learning_rate": 5.623304374333239e-05, "loss": 0.0108, "step": 112300 }, { "epoch": 5.1463434176939895, "grad_norm": 1.2692680358886719, "learning_rate": 5.6153744851975274e-05, "loss": 0.0081, "step": 112400 }, { "epoch": 5.150922015040693, "grad_norm": 0.017233431339263916, "learning_rate": 5.607443024075173e-05, "loss": 0.0075, "step": 112500 }, { "epoch": 5.155500612387395, "grad_norm": 0.46397635340690613, "learning_rate": 5.5995100112272545e-05, "loss": 0.0108, "step": 112600 }, { "epoch": 5.160079209734098, "grad_norm": 0.23527605831623077, "learning_rate": 5.591575466918816e-05, "loss": 0.0094, "step": 112700 }, { "epoch": 5.1646578070808005, "grad_norm": 0.19655343890190125, "learning_rate": 5.583639411418811e-05, "loss": 0.0092, "step": 112800 }, { "epoch": 5.169236404427504, "grad_norm": 0.6157360076904297, "learning_rate": 5.575701865000054e-05, "loss": 0.0085, "step": 112900 }, { "epoch": 5.173815001774207, "grad_norm": 0.4467610716819763, "learning_rate": 5.56776284793917e-05, "loss": 0.0092, "step": 113000 }, { "epoch": 5.178393599120909, "grad_norm": 0.16839289665222168, "learning_rate": 5.559822380516539e-05, "loss": 0.0093, "step": 113100 }, { "epoch": 5.182972196467612, "grad_norm": 0.08081818372011185, "learning_rate": 5.551880483016248e-05, "loss": 0.0088, "step": 113200 }, { "epoch": 5.187550793814315, "grad_norm": 0.7287288308143616, "learning_rate": 5.543937175726035e-05, "loss": 0.0084, "step": 113300 }, { "epoch": 5.192129391161018, "grad_norm": 0.18267770111560822, "learning_rate": 5.5359924789372396e-05, "loss": 0.0083, "step": 113400 }, { "epoch": 5.196707988507721, "grad_norm": 0.3210001587867737, "learning_rate": 5.528046412944752e-05, "loss": 0.0094, "step": 113500 }, { "epoch": 5.201286585854423, "grad_norm": 0.21997089684009552, "learning_rate": 5.520098998046958e-05, "loss": 0.0089, "step": 113600 }, { "epoch": 5.2058651832011265, "grad_norm": 0.24578975141048431, "learning_rate": 5.5121502545456925e-05, "loss": 0.0095, "step": 113700 }, { "epoch": 5.210443780547829, "grad_norm": 1.2959401607513428, "learning_rate": 5.504200202746182e-05, "loss": 0.0085, "step": 113800 }, { "epoch": 5.215022377894532, "grad_norm": 0.12553347647190094, "learning_rate": 5.496248862956994e-05, "loss": 0.0089, "step": 113900 }, { "epoch": 5.219600975241235, "grad_norm": 0.7202230095863342, "learning_rate": 5.488296255489991e-05, "loss": 0.008, "step": 114000 }, { "epoch": 5.224179572587937, "grad_norm": 0.7170085310935974, "learning_rate": 5.480342400660268e-05, "loss": 0.0104, "step": 114100 }, { "epoch": 5.228758169934641, "grad_norm": 0.029888896271586418, "learning_rate": 5.4723873187861085e-05, "loss": 0.0092, "step": 114200 }, { "epoch": 5.233336767281344, "grad_norm": 0.2950020730495453, "learning_rate": 5.4644310301889334e-05, "loss": 0.0089, "step": 114300 }, { "epoch": 5.237915364628046, "grad_norm": 0.12343444675207138, "learning_rate": 5.456473555193242e-05, "loss": 0.008, "step": 114400 }, { "epoch": 5.242493961974749, "grad_norm": 0.5347928404808044, "learning_rate": 5.4485149141265667e-05, "loss": 0.0079, "step": 114500 }, { "epoch": 5.247072559321452, "grad_norm": 0.9914150834083557, "learning_rate": 5.440555127319418e-05, "loss": 0.0111, "step": 114600 }, { "epoch": 5.251651156668155, "grad_norm": 0.24366235733032227, "learning_rate": 5.432594215105234e-05, "loss": 0.0085, "step": 114700 }, { "epoch": 5.256229754014858, "grad_norm": 0.1021379604935646, "learning_rate": 5.424632197820324e-05, "loss": 0.0091, "step": 114800 }, { "epoch": 5.26080835136156, "grad_norm": 0.11071757227182388, "learning_rate": 5.4166690958038265e-05, "loss": 0.0082, "step": 114900 }, { "epoch": 5.265386948708263, "grad_norm": 1.4259638786315918, "learning_rate": 5.408704929397648e-05, "loss": 0.0085, "step": 115000 }, { "epoch": 5.269965546054966, "grad_norm": 0.2681211531162262, "learning_rate": 5.4007397189464105e-05, "loss": 0.0108, "step": 115100 }, { "epoch": 5.274544143401669, "grad_norm": 0.4776928126811981, "learning_rate": 5.3927734847974064e-05, "loss": 0.008, "step": 115200 }, { "epoch": 5.279122740748372, "grad_norm": 0.38615280389785767, "learning_rate": 5.3848062473005464e-05, "loss": 0.0092, "step": 115300 }, { "epoch": 5.283701338095074, "grad_norm": 0.23448576033115387, "learning_rate": 5.376838026808298e-05, "loss": 0.0099, "step": 115400 }, { "epoch": 5.2882799354417775, "grad_norm": 0.11435823887586594, "learning_rate": 5.368868843675642e-05, "loss": 0.0093, "step": 115500 }, { "epoch": 5.29285853278848, "grad_norm": 0.22706013917922974, "learning_rate": 5.360898718260021e-05, "loss": 0.0085, "step": 115600 }, { "epoch": 5.297437130135183, "grad_norm": 0.04221300780773163, "learning_rate": 5.3529276709212816e-05, "loss": 0.0084, "step": 115700 }, { "epoch": 5.302015727481886, "grad_norm": 0.3892548382282257, "learning_rate": 5.344955722021624e-05, "loss": 0.0101, "step": 115800 }, { "epoch": 5.3065943248285885, "grad_norm": 0.13219723105430603, "learning_rate": 5.336982891925559e-05, "loss": 0.0087, "step": 115900 }, { "epoch": 5.311172922175292, "grad_norm": 0.18125391006469727, "learning_rate": 5.32900920099984e-05, "loss": 0.0097, "step": 116000 }, { "epoch": 5.315751519521994, "grad_norm": 0.14028698205947876, "learning_rate": 5.321034669613422e-05, "loss": 0.0088, "step": 116100 }, { "epoch": 5.320330116868697, "grad_norm": 0.1114293709397316, "learning_rate": 5.31305931813741e-05, "loss": 0.0086, "step": 116200 }, { "epoch": 5.3249087142154, "grad_norm": 0.20969901978969574, "learning_rate": 5.3050831669450005e-05, "loss": 0.0082, "step": 116300 }, { "epoch": 5.329487311562103, "grad_norm": 0.07742590457201004, "learning_rate": 5.297106236411432e-05, "loss": 0.0083, "step": 116400 }, { "epoch": 5.334065908908806, "grad_norm": 0.259859174489975, "learning_rate": 5.2891285469139395e-05, "loss": 0.0087, "step": 116500 }, { "epoch": 5.338644506255509, "grad_norm": 0.3085865080356598, "learning_rate": 5.2811501188316915e-05, "loss": 0.0103, "step": 116600 }, { "epoch": 5.343223103602211, "grad_norm": 0.27554938197135925, "learning_rate": 5.2731709725457434e-05, "loss": 0.0084, "step": 116700 }, { "epoch": 5.3478017009489145, "grad_norm": 0.36539149284362793, "learning_rate": 5.2651911284389896e-05, "loss": 0.0085, "step": 116800 }, { "epoch": 5.352380298295617, "grad_norm": 0.47007834911346436, "learning_rate": 5.2572106068961026e-05, "loss": 0.0106, "step": 116900 }, { "epoch": 5.35695889564232, "grad_norm": 0.22008706629276276, "learning_rate": 5.249229428303486e-05, "loss": 0.0086, "step": 117000 }, { "epoch": 5.361537492989023, "grad_norm": 0.02755674161016941, "learning_rate": 5.241247613049225e-05, "loss": 0.0093, "step": 117100 }, { "epoch": 5.3661160903357255, "grad_norm": 0.11869332939386368, "learning_rate": 5.233265181523028e-05, "loss": 0.0086, "step": 117200 }, { "epoch": 5.370694687682429, "grad_norm": 0.6038843393325806, "learning_rate": 5.225282154116179e-05, "loss": 0.0089, "step": 117300 }, { "epoch": 5.375273285029131, "grad_norm": 0.34202539920806885, "learning_rate": 5.217298551221483e-05, "loss": 0.0101, "step": 117400 }, { "epoch": 5.379851882375834, "grad_norm": 0.18048258125782013, "learning_rate": 5.2093143932332176e-05, "loss": 0.0089, "step": 117500 }, { "epoch": 5.384430479722537, "grad_norm": 0.14283466339111328, "learning_rate": 5.201329700547076e-05, "loss": 0.0076, "step": 117600 }, { "epoch": 5.38900907706924, "grad_norm": 0.5224958658218384, "learning_rate": 5.193344493560117e-05, "loss": 0.0091, "step": 117700 }, { "epoch": 5.393587674415943, "grad_norm": 0.0608445443212986, "learning_rate": 5.185358792670718e-05, "loss": 0.0091, "step": 117800 }, { "epoch": 5.398166271762646, "grad_norm": 0.3700086176395416, "learning_rate": 5.177372618278511e-05, "loss": 0.0087, "step": 117900 }, { "epoch": 5.402744869109348, "grad_norm": 0.20753388106822968, "learning_rate": 5.16938599078434e-05, "loss": 0.0099, "step": 118000 }, { "epoch": 5.407323466456051, "grad_norm": 0.13068944215774536, "learning_rate": 5.161398930590212e-05, "loss": 0.0099, "step": 118100 }, { "epoch": 5.411902063802754, "grad_norm": 0.34820255637168884, "learning_rate": 5.153411458099231e-05, "loss": 0.0087, "step": 118200 }, { "epoch": 5.416480661149457, "grad_norm": 0.3474198281764984, "learning_rate": 5.145423593715557e-05, "loss": 0.0104, "step": 118300 }, { "epoch": 5.42105925849616, "grad_norm": 0.11103557795286179, "learning_rate": 5.137435357844357e-05, "loss": 0.0065, "step": 118400 }, { "epoch": 5.425637855842862, "grad_norm": 0.08837764710187912, "learning_rate": 5.129446770891738e-05, "loss": 0.0078, "step": 118500 }, { "epoch": 5.430216453189566, "grad_norm": 0.07470713555812836, "learning_rate": 5.121457853264708e-05, "loss": 0.0074, "step": 118600 }, { "epoch": 5.434795050536268, "grad_norm": 0.08019549399614334, "learning_rate": 5.1134686253711215e-05, "loss": 0.0104, "step": 118700 }, { "epoch": 5.439373647882971, "grad_norm": 0.1745513528585434, "learning_rate": 5.105479107619624e-05, "loss": 0.009, "step": 118800 }, { "epoch": 5.443952245229674, "grad_norm": 0.07470156252384186, "learning_rate": 5.097489320419598e-05, "loss": 0.0083, "step": 118900 }, { "epoch": 5.4485308425763765, "grad_norm": 0.5151394605636597, "learning_rate": 5.089499284181122e-05, "loss": 0.0083, "step": 119000 }, { "epoch": 5.45310943992308, "grad_norm": 0.11218901723623276, "learning_rate": 5.081509019314902e-05, "loss": 0.0097, "step": 119100 }, { "epoch": 5.457688037269782, "grad_norm": 0.25493118166923523, "learning_rate": 5.073518546232234e-05, "loss": 0.0084, "step": 119200 }, { "epoch": 5.462266634616485, "grad_norm": 0.39373013377189636, "learning_rate": 5.065527885344944e-05, "loss": 0.0098, "step": 119300 }, { "epoch": 5.466845231963188, "grad_norm": 0.5648688673973083, "learning_rate": 5.057537057065338e-05, "loss": 0.009, "step": 119400 }, { "epoch": 5.471423829309891, "grad_norm": 0.2762792408466339, "learning_rate": 5.049546081806149e-05, "loss": 0.0077, "step": 119500 }, { "epoch": 5.476002426656594, "grad_norm": 0.10117408633232117, "learning_rate": 5.041554979980486e-05, "loss": 0.0078, "step": 119600 }, { "epoch": 5.480581024003296, "grad_norm": 0.7319039106369019, "learning_rate": 5.0335637720017817e-05, "loss": 0.0085, "step": 119700 }, { "epoch": 5.485159621349999, "grad_norm": 0.4741845428943634, "learning_rate": 5.025572478283738e-05, "loss": 0.0084, "step": 119800 }, { "epoch": 5.4897382186967025, "grad_norm": 0.2592092752456665, "learning_rate": 5.0175811192402767e-05, "loss": 0.0075, "step": 119900 }, { "epoch": 5.494316816043405, "grad_norm": 0.03605992719531059, "learning_rate": 5.009589715285492e-05, "loss": 0.0056, "step": 120000 }, { "epoch": 5.498895413390108, "grad_norm": 0.518429696559906, "learning_rate": 5.0015982868335834e-05, "loss": 0.0104, "step": 120100 }, { "epoch": 5.503474010736811, "grad_norm": 0.42362892627716064, "learning_rate": 4.993606854298817e-05, "loss": 0.0106, "step": 120200 }, { "epoch": 5.5080526080835135, "grad_norm": 0.27914491295814514, "learning_rate": 4.985615438095473e-05, "loss": 0.008, "step": 120300 }, { "epoch": 5.512631205430217, "grad_norm": 0.12702660262584686, "learning_rate": 4.977624058637783e-05, "loss": 0.0094, "step": 120400 }, { "epoch": 5.517209802776919, "grad_norm": 0.06755949556827545, "learning_rate": 4.969632736339893e-05, "loss": 0.0089, "step": 120500 }, { "epoch": 5.521788400123622, "grad_norm": 0.2052990347146988, "learning_rate": 4.961641491615794e-05, "loss": 0.0079, "step": 120600 }, { "epoch": 5.526366997470325, "grad_norm": 0.27255722880363464, "learning_rate": 4.953650344879286e-05, "loss": 0.0076, "step": 120700 }, { "epoch": 5.530945594817028, "grad_norm": 0.10563024878501892, "learning_rate": 4.945659316543916e-05, "loss": 0.0087, "step": 120800 }, { "epoch": 5.535524192163731, "grad_norm": 0.31879550218582153, "learning_rate": 4.9376684270229254e-05, "loss": 0.009, "step": 120900 }, { "epoch": 5.540102789510433, "grad_norm": 0.21383854746818542, "learning_rate": 4.929677696729207e-05, "loss": 0.0085, "step": 121000 }, { "epoch": 5.544681386857136, "grad_norm": 0.2081623524427414, "learning_rate": 4.921687146075244e-05, "loss": 0.0095, "step": 121100 }, { "epoch": 5.5492599842038395, "grad_norm": 0.12125098705291748, "learning_rate": 4.913696795473058e-05, "loss": 0.0084, "step": 121200 }, { "epoch": 5.553838581550542, "grad_norm": 0.17820671200752258, "learning_rate": 4.905706665334165e-05, "loss": 0.0081, "step": 121300 }, { "epoch": 5.558417178897245, "grad_norm": 0.2230408489704132, "learning_rate": 4.897716776069512e-05, "loss": 0.0079, "step": 121400 }, { "epoch": 5.562995776243948, "grad_norm": 0.3595784604549408, "learning_rate": 4.889727148089439e-05, "loss": 0.0104, "step": 121500 }, { "epoch": 5.56757437359065, "grad_norm": 0.08180402964353561, "learning_rate": 4.8817378018036073e-05, "loss": 0.008, "step": 121600 }, { "epoch": 5.572152970937354, "grad_norm": 0.13690640032291412, "learning_rate": 4.873748757620967e-05, "loss": 0.0093, "step": 121700 }, { "epoch": 5.576731568284056, "grad_norm": 0.048987165093421936, "learning_rate": 4.865760035949695e-05, "loss": 0.0088, "step": 121800 }, { "epoch": 5.581310165630759, "grad_norm": 0.7239773869514465, "learning_rate": 4.857771657197142e-05, "loss": 0.0098, "step": 121900 }, { "epoch": 5.585888762977461, "grad_norm": 0.13404466211795807, "learning_rate": 4.849783641769783e-05, "loss": 0.0095, "step": 122000 }, { "epoch": 5.590467360324165, "grad_norm": 0.30230358242988586, "learning_rate": 4.8417960100731706e-05, "loss": 0.0076, "step": 122100 }, { "epoch": 5.595045957670868, "grad_norm": 0.169099822640419, "learning_rate": 4.8338087825118675e-05, "loss": 0.009, "step": 122200 }, { "epoch": 5.59962455501757, "grad_norm": 0.7153336405754089, "learning_rate": 4.8258219794894095e-05, "loss": 0.0088, "step": 122300 }, { "epoch": 5.604203152364273, "grad_norm": 0.167174831032753, "learning_rate": 4.817835621408251e-05, "loss": 0.0076, "step": 122400 }, { "epoch": 5.608781749710976, "grad_norm": 0.16803164780139923, "learning_rate": 4.809849728669702e-05, "loss": 0.0079, "step": 122500 }, { "epoch": 5.613360347057679, "grad_norm": 0.645155131816864, "learning_rate": 4.80186432167389e-05, "loss": 0.008, "step": 122600 }, { "epoch": 5.617938944404382, "grad_norm": 0.1512228399515152, "learning_rate": 4.7938794208197005e-05, "loss": 0.0091, "step": 122700 }, { "epoch": 5.622517541751084, "grad_norm": 0.28644976019859314, "learning_rate": 4.7858950465047224e-05, "loss": 0.0081, "step": 122800 }, { "epoch": 5.627096139097787, "grad_norm": 0.5135303735733032, "learning_rate": 4.7779112191252054e-05, "loss": 0.0092, "step": 122900 }, { "epoch": 5.631674736444491, "grad_norm": 0.38240012526512146, "learning_rate": 4.769927959075999e-05, "loss": 0.0105, "step": 123000 }, { "epoch": 5.636253333791193, "grad_norm": 0.565757155418396, "learning_rate": 4.761945286750499e-05, "loss": 0.0093, "step": 123100 }, { "epoch": 5.640831931137896, "grad_norm": 0.12311606109142303, "learning_rate": 4.7539632225406095e-05, "loss": 0.0076, "step": 123200 }, { "epoch": 5.645410528484598, "grad_norm": 0.2507004737854004, "learning_rate": 4.745981786836672e-05, "loss": 0.0088, "step": 123300 }, { "epoch": 5.6499891258313015, "grad_norm": 0.3408881425857544, "learning_rate": 4.738001000027431e-05, "loss": 0.0088, "step": 123400 }, { "epoch": 5.654567723178005, "grad_norm": 0.6254268884658813, "learning_rate": 4.730020882499964e-05, "loss": 0.0091, "step": 123500 }, { "epoch": 5.659146320524707, "grad_norm": 0.046281538903713226, "learning_rate": 4.722041454639645e-05, "loss": 0.0084, "step": 123600 }, { "epoch": 5.66372491787141, "grad_norm": 0.12148924171924591, "learning_rate": 4.714062736830088e-05, "loss": 0.0078, "step": 123700 }, { "epoch": 5.668303515218113, "grad_norm": 0.06817379593849182, "learning_rate": 4.706084749453085e-05, "loss": 0.0078, "step": 123800 }, { "epoch": 5.672882112564816, "grad_norm": 0.11472304165363312, "learning_rate": 4.6981075128885693e-05, "loss": 0.0092, "step": 123900 }, { "epoch": 5.677460709911519, "grad_norm": 0.7873682975769043, "learning_rate": 4.690131047514556e-05, "loss": 0.0082, "step": 124000 }, { "epoch": 5.682039307258221, "grad_norm": 0.34170079231262207, "learning_rate": 4.6821553737070856e-05, "loss": 0.008, "step": 124100 }, { "epoch": 5.686617904604924, "grad_norm": 0.562393844127655, "learning_rate": 4.674180511840178e-05, "loss": 0.0079, "step": 124200 }, { "epoch": 5.691196501951627, "grad_norm": 0.32295772433280945, "learning_rate": 4.6662064822857844e-05, "loss": 0.0088, "step": 124300 }, { "epoch": 5.69577509929833, "grad_norm": 0.09313233196735382, "learning_rate": 4.658233305413722e-05, "loss": 0.0083, "step": 124400 }, { "epoch": 5.700353696645033, "grad_norm": 0.27240103483200073, "learning_rate": 4.650261001591633e-05, "loss": 0.0076, "step": 124500 }, { "epoch": 5.704932293991735, "grad_norm": 0.5987135767936707, "learning_rate": 4.642289591184934e-05, "loss": 0.0072, "step": 124600 }, { "epoch": 5.7095108913384385, "grad_norm": 0.044540900737047195, "learning_rate": 4.6343190945567504e-05, "loss": 0.0084, "step": 124700 }, { "epoch": 5.714089488685142, "grad_norm": 0.19168873131275177, "learning_rate": 4.626349532067879e-05, "loss": 0.0085, "step": 124800 }, { "epoch": 5.718668086031844, "grad_norm": 0.3095737397670746, "learning_rate": 4.6183809240767314e-05, "loss": 0.0102, "step": 124900 }, { "epoch": 5.723246683378547, "grad_norm": 0.34387272596359253, "learning_rate": 4.6104132909392765e-05, "loss": 0.0084, "step": 125000 }, { "epoch": 5.727825280725249, "grad_norm": 0.18629814684391022, "learning_rate": 4.602446653008997e-05, "loss": 0.0091, "step": 125100 }, { "epoch": 5.732403878071953, "grad_norm": 0.1663748174905777, "learning_rate": 4.594481030636832e-05, "loss": 0.0094, "step": 125200 }, { "epoch": 5.736982475418656, "grad_norm": 0.21490418910980225, "learning_rate": 4.586516444171122e-05, "loss": 0.0083, "step": 125300 }, { "epoch": 5.741561072765358, "grad_norm": 0.17258259654045105, "learning_rate": 4.57855291395757e-05, "loss": 0.0089, "step": 125400 }, { "epoch": 5.746139670112061, "grad_norm": 0.25354665517807007, "learning_rate": 4.5705904603391716e-05, "loss": 0.0077, "step": 125500 }, { "epoch": 5.750718267458764, "grad_norm": 0.28657224774360657, "learning_rate": 4.562629103656183e-05, "loss": 0.0074, "step": 125600 }, { "epoch": 5.755296864805467, "grad_norm": 0.36166995763778687, "learning_rate": 4.5546688642460446e-05, "loss": 0.0091, "step": 125700 }, { "epoch": 5.75987546215217, "grad_norm": 0.19394946098327637, "learning_rate": 4.5467097624433524e-05, "loss": 0.0097, "step": 125800 }, { "epoch": 5.764454059498872, "grad_norm": 0.16516007483005524, "learning_rate": 4.538751818579797e-05, "loss": 0.0085, "step": 125900 }, { "epoch": 5.769032656845575, "grad_norm": 0.2279433161020279, "learning_rate": 4.530795052984104e-05, "loss": 0.0078, "step": 126000 }, { "epoch": 5.773611254192279, "grad_norm": 0.5914369225502014, "learning_rate": 4.522839485981994e-05, "loss": 0.0085, "step": 126100 }, { "epoch": 5.778189851538981, "grad_norm": 0.06345394253730774, "learning_rate": 4.514885137896127e-05, "loss": 0.0096, "step": 126200 }, { "epoch": 5.782768448885684, "grad_norm": 0.2646149694919586, "learning_rate": 4.506932029046044e-05, "loss": 0.0073, "step": 126300 }, { "epoch": 5.787347046232386, "grad_norm": 0.8094835877418518, "learning_rate": 4.498980179748123e-05, "loss": 0.0082, "step": 126400 }, { "epoch": 5.79192564357909, "grad_norm": 0.4164597988128662, "learning_rate": 4.4910296103155296e-05, "loss": 0.0079, "step": 126500 }, { "epoch": 5.796504240925793, "grad_norm": 0.3092726469039917, "learning_rate": 4.48308034105815e-05, "loss": 0.0102, "step": 126600 }, { "epoch": 5.801082838272495, "grad_norm": 0.2584327161312103, "learning_rate": 4.475132392282556e-05, "loss": 0.0084, "step": 126700 }, { "epoch": 5.805661435619198, "grad_norm": 0.07558545470237732, "learning_rate": 4.467185784291946e-05, "loss": 0.008, "step": 126800 }, { "epoch": 5.8102400329659005, "grad_norm": 0.1425691694021225, "learning_rate": 4.459240537386089e-05, "loss": 0.0095, "step": 126900 }, { "epoch": 5.814818630312604, "grad_norm": 0.4250103235244751, "learning_rate": 4.451296671861282e-05, "loss": 0.009, "step": 127000 }, { "epoch": 5.819397227659307, "grad_norm": 0.06756921857595444, "learning_rate": 4.443354208010291e-05, "loss": 0.0073, "step": 127100 }, { "epoch": 5.823975825006009, "grad_norm": 0.2185693234205246, "learning_rate": 4.4354131661222996e-05, "loss": 0.0072, "step": 127200 }, { "epoch": 5.828554422352712, "grad_norm": 0.3645274341106415, "learning_rate": 4.427473566482863e-05, "loss": 0.0106, "step": 127300 }, { "epoch": 5.8331330196994156, "grad_norm": 0.26136744022369385, "learning_rate": 4.4195354293738484e-05, "loss": 0.0085, "step": 127400 }, { "epoch": 5.837711617046118, "grad_norm": 0.1584431380033493, "learning_rate": 4.4115987750733914e-05, "loss": 0.0067, "step": 127500 }, { "epoch": 5.842290214392821, "grad_norm": 0.3366251587867737, "learning_rate": 4.4036636238558335e-05, "loss": 0.0072, "step": 127600 }, { "epoch": 5.846868811739523, "grad_norm": 0.1969982236623764, "learning_rate": 4.39572999599168e-05, "loss": 0.0099, "step": 127700 }, { "epoch": 5.8514474090862265, "grad_norm": 0.178545281291008, "learning_rate": 4.3877979117475486e-05, "loss": 0.0063, "step": 127800 }, { "epoch": 5.856026006432929, "grad_norm": 0.3591267168521881, "learning_rate": 4.379867391386106e-05, "loss": 0.0074, "step": 127900 }, { "epoch": 5.860604603779632, "grad_norm": 0.11651629209518433, "learning_rate": 4.371938455166028e-05, "loss": 0.0079, "step": 128000 }, { "epoch": 5.865183201126335, "grad_norm": 0.19086627662181854, "learning_rate": 4.364011123341947e-05, "loss": 0.0067, "step": 128100 }, { "epoch": 5.8697617984730375, "grad_norm": 0.0712941512465477, "learning_rate": 4.35608541616439e-05, "loss": 0.0099, "step": 128200 }, { "epoch": 5.874340395819741, "grad_norm": 0.26921433210372925, "learning_rate": 4.348161353879737e-05, "loss": 0.0107, "step": 128300 }, { "epoch": 5.878918993166444, "grad_norm": 0.6659551858901978, "learning_rate": 4.340238956730169e-05, "loss": 0.0081, "step": 128400 }, { "epoch": 5.883497590513146, "grad_norm": 1.7324509620666504, "learning_rate": 4.3323182449536095e-05, "loss": 0.0076, "step": 128500 }, { "epoch": 5.888076187859849, "grad_norm": 0.4373182952404022, "learning_rate": 4.3243992387836755e-05, "loss": 0.0063, "step": 128600 }, { "epoch": 5.892654785206552, "grad_norm": 0.45876213908195496, "learning_rate": 4.316481958449634e-05, "loss": 0.008, "step": 128700 }, { "epoch": 5.897233382553255, "grad_norm": 0.18616245687007904, "learning_rate": 4.308566424176336e-05, "loss": 0.0072, "step": 128800 }, { "epoch": 5.901811979899958, "grad_norm": 0.056702371686697006, "learning_rate": 4.3006526561841725e-05, "loss": 0.0086, "step": 128900 }, { "epoch": 5.90639057724666, "grad_norm": 0.38554903864860535, "learning_rate": 4.292740674689031e-05, "loss": 0.0078, "step": 129000 }, { "epoch": 5.9109691745933635, "grad_norm": 0.6524538397789001, "learning_rate": 4.284830499902223e-05, "loss": 0.0093, "step": 129100 }, { "epoch": 5.915547771940066, "grad_norm": 0.3187253475189209, "learning_rate": 4.276922152030454e-05, "loss": 0.0075, "step": 129200 }, { "epoch": 5.920126369286769, "grad_norm": 0.208381786942482, "learning_rate": 4.269015651275761e-05, "loss": 0.0073, "step": 129300 }, { "epoch": 5.924704966633472, "grad_norm": 0.2706379294395447, "learning_rate": 4.261111017835456e-05, "loss": 0.0074, "step": 129400 }, { "epoch": 5.929283563980174, "grad_norm": 0.8774177432060242, "learning_rate": 4.253208271902091e-05, "loss": 0.008, "step": 129500 }, { "epoch": 5.933862161326878, "grad_norm": 0.22220508754253387, "learning_rate": 4.245307433663388e-05, "loss": 0.0078, "step": 129600 }, { "epoch": 5.938440758673581, "grad_norm": 0.37277668714523315, "learning_rate": 4.237408523302203e-05, "loss": 0.0073, "step": 129700 }, { "epoch": 5.943019356020283, "grad_norm": 0.1921541541814804, "learning_rate": 4.229511560996459e-05, "loss": 0.0082, "step": 129800 }, { "epoch": 5.947597953366986, "grad_norm": 0.8308386206626892, "learning_rate": 4.221616566919107e-05, "loss": 0.0085, "step": 129900 }, { "epoch": 5.952176550713689, "grad_norm": 0.11215928941965103, "learning_rate": 4.213723561238074e-05, "loss": 0.0081, "step": 130000 }, { "epoch": 5.956755148060392, "grad_norm": 0.6458770632743835, "learning_rate": 4.205832564116201e-05, "loss": 0.0091, "step": 130100 }, { "epoch": 5.961333745407094, "grad_norm": 0.2930019199848175, "learning_rate": 4.197943595711198e-05, "loss": 0.0059, "step": 130200 }, { "epoch": 5.965912342753797, "grad_norm": 0.08667781949043274, "learning_rate": 4.190056676175602e-05, "loss": 0.0072, "step": 130300 }, { "epoch": 5.9704909401005, "grad_norm": 0.34257155656814575, "learning_rate": 4.1821718256567034e-05, "loss": 0.0076, "step": 130400 }, { "epoch": 5.975069537447203, "grad_norm": 0.2989988327026367, "learning_rate": 4.174289064296514e-05, "loss": 0.0104, "step": 130500 }, { "epoch": 5.979648134793906, "grad_norm": 0.6057233810424805, "learning_rate": 4.1664084122317124e-05, "loss": 0.0065, "step": 130600 }, { "epoch": 5.984226732140609, "grad_norm": 0.16379669308662415, "learning_rate": 4.15852988959358e-05, "loss": 0.0072, "step": 130700 }, { "epoch": 5.988805329487311, "grad_norm": 0.061728738248348236, "learning_rate": 4.150653516507964e-05, "loss": 0.0076, "step": 130800 }, { "epoch": 5.9933839268340146, "grad_norm": 0.19023200869560242, "learning_rate": 4.142779313095223e-05, "loss": 0.0074, "step": 130900 }, { "epoch": 5.997962524180718, "grad_norm": 0.2615407109260559, "learning_rate": 4.134907299470165e-05, "loss": 0.0087, "step": 131000 }, { "epoch": 5.999977107013266, "eval_loss": 0.13592004776000977, "eval_runtime": 244.1354, "eval_samples_per_second": 22.528, "eval_steps_per_second": 22.528, "step": 131044 }, { "epoch": 6.00254112152742, "grad_norm": 0.12518206238746643, "learning_rate": 4.127037495742013e-05, "loss": 0.0077, "step": 131100 }, { "epoch": 6.007119718874123, "grad_norm": 0.1018320843577385, "learning_rate": 4.119169922014339e-05, "loss": 0.0043, "step": 131200 }, { "epoch": 6.0116983162208255, "grad_norm": 0.09295986592769623, "learning_rate": 4.111304598385018e-05, "loss": 0.0061, "step": 131300 }, { "epoch": 6.016276913567529, "grad_norm": 0.05357728898525238, "learning_rate": 4.103441544946184e-05, "loss": 0.0056, "step": 131400 }, { "epoch": 6.020855510914231, "grad_norm": 0.08241847157478333, "learning_rate": 4.095580781784162e-05, "loss": 0.0059, "step": 131500 }, { "epoch": 6.025434108260934, "grad_norm": 0.12265779078006744, "learning_rate": 4.087722328979438e-05, "loss": 0.0033, "step": 131600 }, { "epoch": 6.030012705607637, "grad_norm": 0.11975305527448654, "learning_rate": 4.079866206606582e-05, "loss": 0.0061, "step": 131700 }, { "epoch": 6.03459130295434, "grad_norm": 0.15824288129806519, "learning_rate": 4.072012434734222e-05, "loss": 0.0066, "step": 131800 }, { "epoch": 6.039169900301043, "grad_norm": 0.2796044647693634, "learning_rate": 4.06416103342498e-05, "loss": 0.0055, "step": 131900 }, { "epoch": 6.043748497647746, "grad_norm": 0.1359216570854187, "learning_rate": 4.056312022735417e-05, "loss": 0.006, "step": 132000 }, { "epoch": 6.048327094994448, "grad_norm": 0.24055655300617218, "learning_rate": 4.0484654227159914e-05, "loss": 0.0072, "step": 132100 }, { "epoch": 6.0529056923411515, "grad_norm": 0.4629483222961426, "learning_rate": 4.040621253411004e-05, "loss": 0.0059, "step": 132200 }, { "epoch": 6.057484289687854, "grad_norm": 0.3944862186908722, "learning_rate": 4.032779534858544e-05, "loss": 0.0059, "step": 132300 }, { "epoch": 6.062062887034557, "grad_norm": 0.45347368717193604, "learning_rate": 4.0249402870904396e-05, "loss": 0.0061, "step": 132400 }, { "epoch": 6.06664148438126, "grad_norm": 0.587853729724884, "learning_rate": 4.017103530132212e-05, "loss": 0.0074, "step": 132500 }, { "epoch": 6.0712200817279625, "grad_norm": 0.6638960242271423, "learning_rate": 4.0092692840030134e-05, "loss": 0.0071, "step": 132600 }, { "epoch": 6.075798679074666, "grad_norm": 0.28217336535453796, "learning_rate": 4.0014375687155844e-05, "loss": 0.0055, "step": 132700 }, { "epoch": 6.080377276421368, "grad_norm": 0.19813333451747894, "learning_rate": 3.993608404276205e-05, "loss": 0.0066, "step": 132800 }, { "epoch": 6.084955873768071, "grad_norm": 0.06923089921474457, "learning_rate": 3.985781810684631e-05, "loss": 0.006, "step": 132900 }, { "epoch": 6.089534471114774, "grad_norm": 0.5418972969055176, "learning_rate": 3.9779578079340554e-05, "loss": 0.0051, "step": 133000 }, { "epoch": 6.094113068461477, "grad_norm": 0.1508362740278244, "learning_rate": 3.970136416011056e-05, "loss": 0.0049, "step": 133100 }, { "epoch": 6.09869166580818, "grad_norm": 0.26092636585235596, "learning_rate": 3.962317654895533e-05, "loss": 0.0054, "step": 133200 }, { "epoch": 6.103270263154882, "grad_norm": 0.3573048412799835, "learning_rate": 3.9545015445606736e-05, "loss": 0.007, "step": 133300 }, { "epoch": 6.107848860501585, "grad_norm": 0.033060140907764435, "learning_rate": 3.946688104972891e-05, "loss": 0.0045, "step": 133400 }, { "epoch": 6.1124274578482884, "grad_norm": 0.05039636045694351, "learning_rate": 3.9388773560917724e-05, "loss": 0.0048, "step": 133500 }, { "epoch": 6.117006055194991, "grad_norm": 0.41924425959587097, "learning_rate": 3.931069317870039e-05, "loss": 0.0065, "step": 133600 }, { "epoch": 6.121584652541694, "grad_norm": 0.33294302225112915, "learning_rate": 3.9232640102534786e-05, "loss": 0.0054, "step": 133700 }, { "epoch": 6.126163249888397, "grad_norm": 0.025311682373285294, "learning_rate": 3.915461453180914e-05, "loss": 0.0048, "step": 133800 }, { "epoch": 6.130741847235099, "grad_norm": 0.13680239021778107, "learning_rate": 3.907661666584131e-05, "loss": 0.0055, "step": 133900 }, { "epoch": 6.135320444581803, "grad_norm": 0.2641524076461792, "learning_rate": 3.899864670387844e-05, "loss": 0.0063, "step": 134000 }, { "epoch": 6.139899041928505, "grad_norm": 0.28719770908355713, "learning_rate": 3.892070484509642e-05, "loss": 0.0052, "step": 134100 }, { "epoch": 6.144477639275208, "grad_norm": 0.19892792403697968, "learning_rate": 3.884279128859927e-05, "loss": 0.0045, "step": 134200 }, { "epoch": 6.149056236621911, "grad_norm": 0.16647031903266907, "learning_rate": 3.8764906233418775e-05, "loss": 0.0062, "step": 134300 }, { "epoch": 6.1536348339686135, "grad_norm": 0.18115417659282684, "learning_rate": 3.86870498785139e-05, "loss": 0.0053, "step": 134400 }, { "epoch": 6.158213431315317, "grad_norm": 0.7846788167953491, "learning_rate": 3.860922242277028e-05, "loss": 0.006, "step": 134500 }, { "epoch": 6.162792028662019, "grad_norm": 0.056557830423116684, "learning_rate": 3.853142406499972e-05, "loss": 0.0068, "step": 134600 }, { "epoch": 6.167370626008722, "grad_norm": 0.37449362874031067, "learning_rate": 3.845365500393974e-05, "loss": 0.0055, "step": 134700 }, { "epoch": 6.171949223355425, "grad_norm": 1.3642663955688477, "learning_rate": 3.837591543825296e-05, "loss": 0.0052, "step": 134800 }, { "epoch": 6.176527820702128, "grad_norm": 0.022911841049790382, "learning_rate": 3.8298205566526676e-05, "loss": 0.0042, "step": 134900 }, { "epoch": 6.181106418048831, "grad_norm": 0.028689689934253693, "learning_rate": 3.8220525587272384e-05, "loss": 0.0062, "step": 135000 }, { "epoch": 6.185685015395533, "grad_norm": 0.3197433650493622, "learning_rate": 3.814287569892512e-05, "loss": 0.0059, "step": 135100 }, { "epoch": 6.190263612742236, "grad_norm": 0.06785603612661362, "learning_rate": 3.806525609984312e-05, "loss": 0.0049, "step": 135200 }, { "epoch": 6.1948422100889395, "grad_norm": 0.3414902985095978, "learning_rate": 3.7987666988307244e-05, "loss": 0.0053, "step": 135300 }, { "epoch": 6.199420807435642, "grad_norm": 0.1857975274324417, "learning_rate": 3.791010856252043e-05, "loss": 0.0058, "step": 135400 }, { "epoch": 6.203999404782345, "grad_norm": 0.1203976720571518, "learning_rate": 3.7832581020607284e-05, "loss": 0.0077, "step": 135500 }, { "epoch": 6.208578002129048, "grad_norm": 0.3408762514591217, "learning_rate": 3.7755084560613455e-05, "loss": 0.0065, "step": 135600 }, { "epoch": 6.2131565994757505, "grad_norm": 0.0590222142636776, "learning_rate": 3.767761938050528e-05, "loss": 0.0055, "step": 135700 }, { "epoch": 6.217735196822454, "grad_norm": 0.662112295627594, "learning_rate": 3.760018567816908e-05, "loss": 0.0059, "step": 135800 }, { "epoch": 6.222313794169156, "grad_norm": 0.005584437865763903, "learning_rate": 3.752278365141084e-05, "loss": 0.0067, "step": 135900 }, { "epoch": 6.226892391515859, "grad_norm": 0.15262584388256073, "learning_rate": 3.744541349795564e-05, "loss": 0.0065, "step": 136000 }, { "epoch": 6.231470988862562, "grad_norm": 0.644844114780426, "learning_rate": 3.7368075415447086e-05, "loss": 0.0047, "step": 136100 }, { "epoch": 6.236049586209265, "grad_norm": 0.06777459383010864, "learning_rate": 3.729076960144687e-05, "loss": 0.0052, "step": 136200 }, { "epoch": 6.240628183555968, "grad_norm": 0.7604510188102722, "learning_rate": 3.721349625343431e-05, "loss": 0.0054, "step": 136300 }, { "epoch": 6.24520678090267, "grad_norm": 0.3464463949203491, "learning_rate": 3.71362555688057e-05, "loss": 0.0053, "step": 136400 }, { "epoch": 6.249785378249373, "grad_norm": 0.032986678183078766, "learning_rate": 3.705904774487396e-05, "loss": 0.0053, "step": 136500 }, { "epoch": 6.2543639755960765, "grad_norm": 0.270702987909317, "learning_rate": 3.6981872978868065e-05, "loss": 0.0056, "step": 136600 }, { "epoch": 6.258942572942779, "grad_norm": 0.5898098945617676, "learning_rate": 3.6904731467932493e-05, "loss": 0.0047, "step": 136700 }, { "epoch": 6.263521170289482, "grad_norm": 0.07433097064495087, "learning_rate": 3.682762340912681e-05, "loss": 0.0057, "step": 136800 }, { "epoch": 6.268099767636184, "grad_norm": 0.208632692694664, "learning_rate": 3.675054899942515e-05, "loss": 0.0064, "step": 136900 }, { "epoch": 6.2726783649828874, "grad_norm": 0.48827114701271057, "learning_rate": 3.6673508435715634e-05, "loss": 0.0056, "step": 137000 }, { "epoch": 6.277256962329591, "grad_norm": 0.15773746371269226, "learning_rate": 3.659650191479994e-05, "loss": 0.0059, "step": 137100 }, { "epoch": 6.281835559676293, "grad_norm": 0.46037283539772034, "learning_rate": 3.651952963339282e-05, "loss": 0.0052, "step": 137200 }, { "epoch": 6.286414157022996, "grad_norm": 0.07779065519571304, "learning_rate": 3.6442591788121505e-05, "loss": 0.0051, "step": 137300 }, { "epoch": 6.290992754369698, "grad_norm": 0.5138252377510071, "learning_rate": 3.6365688575525315e-05, "loss": 0.0053, "step": 137400 }, { "epoch": 6.295571351716402, "grad_norm": 0.21173468232154846, "learning_rate": 3.628882019205506e-05, "loss": 0.0058, "step": 137500 }, { "epoch": 6.300149949063105, "grad_norm": 0.4661062955856323, "learning_rate": 3.621198683407258e-05, "loss": 0.0053, "step": 137600 }, { "epoch": 6.304728546409807, "grad_norm": 0.2002924084663391, "learning_rate": 3.613518869785025e-05, "loss": 0.0054, "step": 137700 }, { "epoch": 6.30930714375651, "grad_norm": 0.24317267537117004, "learning_rate": 3.6058425979570485e-05, "loss": 0.0057, "step": 137800 }, { "epoch": 6.313885741103213, "grad_norm": 0.14312195777893066, "learning_rate": 3.598169887532521e-05, "loss": 0.0059, "step": 137900 }, { "epoch": 6.318464338449916, "grad_norm": 0.07625292241573334, "learning_rate": 3.590500758111537e-05, "loss": 0.006, "step": 138000 }, { "epoch": 6.323042935796619, "grad_norm": 0.07330285757780075, "learning_rate": 3.582835229285042e-05, "loss": 0.0044, "step": 138100 }, { "epoch": 6.327621533143321, "grad_norm": 0.025587473064661026, "learning_rate": 3.5751733206347894e-05, "loss": 0.0054, "step": 138200 }, { "epoch": 6.332200130490024, "grad_norm": 0.09335857629776001, "learning_rate": 3.567515051733277e-05, "loss": 0.0062, "step": 138300 }, { "epoch": 6.336778727836728, "grad_norm": 0.031704433262348175, "learning_rate": 3.559860442143709e-05, "loss": 0.0063, "step": 138400 }, { "epoch": 6.34135732518343, "grad_norm": 0.09114887565374374, "learning_rate": 3.552209511419943e-05, "loss": 0.0045, "step": 138500 }, { "epoch": 6.345935922530133, "grad_norm": 0.023929867893457413, "learning_rate": 3.5445622791064356e-05, "loss": 0.0053, "step": 138600 }, { "epoch": 6.350514519876835, "grad_norm": 1.4821025133132935, "learning_rate": 3.5369187647381974e-05, "loss": 0.0056, "step": 138700 }, { "epoch": 6.3550931172235385, "grad_norm": 0.16608977317810059, "learning_rate": 3.529278987840744e-05, "loss": 0.0055, "step": 138800 }, { "epoch": 6.359671714570242, "grad_norm": 0.21598820388317108, "learning_rate": 3.5216429679300376e-05, "loss": 0.0051, "step": 138900 }, { "epoch": 6.364250311916944, "grad_norm": 0.016882436349987984, "learning_rate": 3.5140107245124476e-05, "loss": 0.0052, "step": 139000 }, { "epoch": 6.368828909263647, "grad_norm": 0.05500126630067825, "learning_rate": 3.506382277084696e-05, "loss": 0.0043, "step": 139100 }, { "epoch": 6.37340750661035, "grad_norm": 0.03151680901646614, "learning_rate": 3.4987576451338055e-05, "loss": 0.0056, "step": 139200 }, { "epoch": 6.377986103957053, "grad_norm": 0.07861995697021484, "learning_rate": 3.491136848137053e-05, "loss": 0.0044, "step": 139300 }, { "epoch": 6.382564701303756, "grad_norm": 0.12852996587753296, "learning_rate": 3.483519905561924e-05, "loss": 0.0045, "step": 139400 }, { "epoch": 6.387143298650458, "grad_norm": 0.6488528847694397, "learning_rate": 3.475906836866046e-05, "loss": 0.0043, "step": 139500 }, { "epoch": 6.391721895997161, "grad_norm": 0.3967334032058716, "learning_rate": 3.468297661497164e-05, "loss": 0.0069, "step": 139600 }, { "epoch": 6.396300493343864, "grad_norm": 0.1223519891500473, "learning_rate": 3.460692398893068e-05, "loss": 0.0054, "step": 139700 }, { "epoch": 6.400879090690567, "grad_norm": 0.15460754930973053, "learning_rate": 3.453091068481559e-05, "loss": 0.0056, "step": 139800 }, { "epoch": 6.40545768803727, "grad_norm": 0.040079645812511444, "learning_rate": 3.445493689680388e-05, "loss": 0.0055, "step": 139900 }, { "epoch": 6.410036285383972, "grad_norm": 0.020937960594892502, "learning_rate": 3.4379002818972124e-05, "loss": 0.0054, "step": 140000 }, { "epoch": 6.4146148827306755, "grad_norm": 0.21271613240242004, "learning_rate": 3.43031086452955e-05, "loss": 0.0046, "step": 140100 }, { "epoch": 6.419193480077379, "grad_norm": 0.09579429775476456, "learning_rate": 3.4227254569647205e-05, "loss": 0.0051, "step": 140200 }, { "epoch": 6.423772077424081, "grad_norm": 0.043937426060438156, "learning_rate": 3.4151440785798004e-05, "loss": 0.0053, "step": 140300 }, { "epoch": 6.428350674770784, "grad_norm": 0.4987468123435974, "learning_rate": 3.4075667487415785e-05, "loss": 0.0058, "step": 140400 }, { "epoch": 6.432929272117486, "grad_norm": 0.6420878171920776, "learning_rate": 3.399993486806495e-05, "loss": 0.0067, "step": 140500 }, { "epoch": 6.43750786946419, "grad_norm": 0.39332908391952515, "learning_rate": 3.392424312120601e-05, "loss": 0.0052, "step": 140600 }, { "epoch": 6.442086466810893, "grad_norm": 0.06132081523537636, "learning_rate": 3.384859244019511e-05, "loss": 0.0047, "step": 140700 }, { "epoch": 6.446665064157595, "grad_norm": 0.23858435451984406, "learning_rate": 3.377298301828343e-05, "loss": 0.0047, "step": 140800 }, { "epoch": 6.451243661504298, "grad_norm": 0.32406067848205566, "learning_rate": 3.3697415048616765e-05, "loss": 0.0055, "step": 140900 }, { "epoch": 6.455822258851001, "grad_norm": 1.4054268598556519, "learning_rate": 3.362188872423506e-05, "loss": 0.0051, "step": 141000 }, { "epoch": 6.460400856197704, "grad_norm": 0.05585220828652382, "learning_rate": 3.354640423807183e-05, "loss": 0.0062, "step": 141100 }, { "epoch": 6.464979453544407, "grad_norm": 0.018032953143119812, "learning_rate": 3.347096178295371e-05, "loss": 0.0037, "step": 141200 }, { "epoch": 6.469558050891109, "grad_norm": 0.907580554485321, "learning_rate": 3.339556155160004e-05, "loss": 0.006, "step": 141300 }, { "epoch": 6.474136648237812, "grad_norm": 0.08372417092323303, "learning_rate": 3.3320203736622184e-05, "loss": 0.0057, "step": 141400 }, { "epoch": 6.478715245584516, "grad_norm": 0.39907532930374146, "learning_rate": 3.324488853052326e-05, "loss": 0.0044, "step": 141500 }, { "epoch": 6.483293842931218, "grad_norm": 1.1346983909606934, "learning_rate": 3.3169616125697486e-05, "loss": 0.0048, "step": 141600 }, { "epoch": 6.487872440277921, "grad_norm": 0.8341863751411438, "learning_rate": 3.3094386714429724e-05, "loss": 0.0047, "step": 141700 }, { "epoch": 6.492451037624623, "grad_norm": 0.07596173137426376, "learning_rate": 3.301920048889506e-05, "loss": 0.0041, "step": 141800 }, { "epoch": 6.497029634971327, "grad_norm": 0.7775061130523682, "learning_rate": 3.294405764115823e-05, "loss": 0.0049, "step": 141900 }, { "epoch": 6.50160823231803, "grad_norm": 0.30017733573913574, "learning_rate": 3.286895836317319e-05, "loss": 0.0043, "step": 142000 }, { "epoch": 6.506186829664732, "grad_norm": 0.3541896343231201, "learning_rate": 3.2793902846782534e-05, "loss": 0.0055, "step": 142100 }, { "epoch": 6.510765427011435, "grad_norm": 0.1445729285478592, "learning_rate": 3.271889128371712e-05, "loss": 0.0054, "step": 142200 }, { "epoch": 6.5153440243581375, "grad_norm": 0.20019569993019104, "learning_rate": 3.2643923865595536e-05, "loss": 0.005, "step": 142300 }, { "epoch": 6.519922621704841, "grad_norm": 0.21448808908462524, "learning_rate": 3.2569000783923544e-05, "loss": 0.0051, "step": 142400 }, { "epoch": 6.524501219051544, "grad_norm": 0.13675835728645325, "learning_rate": 3.249412223009368e-05, "loss": 0.0046, "step": 142500 }, { "epoch": 6.529079816398246, "grad_norm": 0.11393424868583679, "learning_rate": 3.2419288395384785e-05, "loss": 0.004, "step": 142600 }, { "epoch": 6.533658413744949, "grad_norm": 0.2209634631872177, "learning_rate": 3.234449947096135e-05, "loss": 0.0042, "step": 142700 }, { "epoch": 6.538237011091653, "grad_norm": 0.025969982147216797, "learning_rate": 3.226975564787322e-05, "loss": 0.0059, "step": 142800 }, { "epoch": 6.542815608438355, "grad_norm": 0.8372477293014526, "learning_rate": 3.2195057117055036e-05, "loss": 0.0042, "step": 142900 }, { "epoch": 6.547394205785058, "grad_norm": 0.8654465675354004, "learning_rate": 3.212040406932569e-05, "loss": 0.0046, "step": 143000 }, { "epoch": 6.55197280313176, "grad_norm": 0.614989161491394, "learning_rate": 3.204579669538792e-05, "loss": 0.0052, "step": 143100 }, { "epoch": 6.5565514004784635, "grad_norm": 0.31863656640052795, "learning_rate": 3.19712351858278e-05, "loss": 0.0053, "step": 143200 }, { "epoch": 6.561129997825166, "grad_norm": 0.5576188564300537, "learning_rate": 3.1896719731114186e-05, "loss": 0.0053, "step": 143300 }, { "epoch": 6.565708595171869, "grad_norm": 0.17591483891010284, "learning_rate": 3.182225052159833e-05, "loss": 0.0049, "step": 143400 }, { "epoch": 6.570287192518572, "grad_norm": 0.29361802339553833, "learning_rate": 3.174782774751338e-05, "loss": 0.0053, "step": 143500 }, { "epoch": 6.5748657898652745, "grad_norm": 0.046735215932130814, "learning_rate": 3.167345159897378e-05, "loss": 0.0047, "step": 143600 }, { "epoch": 6.579444387211978, "grad_norm": 0.3643573224544525, "learning_rate": 3.1599122265974946e-05, "loss": 0.0041, "step": 143700 }, { "epoch": 6.584022984558681, "grad_norm": 0.1328035444021225, "learning_rate": 3.152483993839265e-05, "loss": 0.0045, "step": 143800 }, { "epoch": 6.588601581905383, "grad_norm": 0.11621426790952682, "learning_rate": 3.145060480598263e-05, "loss": 0.0045, "step": 143900 }, { "epoch": 6.593180179252086, "grad_norm": 0.3485720753669739, "learning_rate": 3.137641705838004e-05, "loss": 0.0057, "step": 144000 }, { "epoch": 6.597758776598789, "grad_norm": 0.7195566892623901, "learning_rate": 3.1302276885098955e-05, "loss": 0.0057, "step": 144100 }, { "epoch": 6.602337373945492, "grad_norm": 0.07403887808322906, "learning_rate": 3.122818447553201e-05, "loss": 0.0052, "step": 144200 }, { "epoch": 6.606915971292195, "grad_norm": 0.07861506193876266, "learning_rate": 3.115414001894974e-05, "loss": 0.0054, "step": 144300 }, { "epoch": 6.611494568638897, "grad_norm": 0.20306392014026642, "learning_rate": 3.108014370450021e-05, "loss": 0.0063, "step": 144400 }, { "epoch": 6.6160731659856005, "grad_norm": 0.5941068530082703, "learning_rate": 3.100619572120854e-05, "loss": 0.0052, "step": 144500 }, { "epoch": 6.620651763332303, "grad_norm": 0.4385504126548767, "learning_rate": 3.0932296257976336e-05, "loss": 0.0049, "step": 144600 }, { "epoch": 6.625230360679006, "grad_norm": 0.07183999568223953, "learning_rate": 3.0858445503581266e-05, "loss": 0.0054, "step": 144700 }, { "epoch": 6.629808958025709, "grad_norm": 0.04285150766372681, "learning_rate": 3.0784643646676635e-05, "loss": 0.0046, "step": 144800 }, { "epoch": 6.634387555372411, "grad_norm": 0.04266421124339104, "learning_rate": 3.071089087579074e-05, "loss": 0.0064, "step": 144900 }, { "epoch": 6.638966152719115, "grad_norm": 0.4585297405719757, "learning_rate": 3.063718737932655e-05, "loss": 0.0053, "step": 145000 }, { "epoch": 6.643544750065818, "grad_norm": 0.04796597734093666, "learning_rate": 3.0563533345561155e-05, "loss": 0.0052, "step": 145100 }, { "epoch": 6.64812334741252, "grad_norm": 0.7312212586402893, "learning_rate": 3.0489928962645275e-05, "loss": 0.0047, "step": 145200 }, { "epoch": 6.652701944759223, "grad_norm": 0.4970768988132477, "learning_rate": 3.041637441860279e-05, "loss": 0.005, "step": 145300 }, { "epoch": 6.657280542105926, "grad_norm": 0.28591519594192505, "learning_rate": 3.0342869901330313e-05, "loss": 0.0047, "step": 145400 }, { "epoch": 6.661859139452629, "grad_norm": 0.019086016342043877, "learning_rate": 3.02694155985966e-05, "loss": 0.0052, "step": 145500 }, { "epoch": 6.666437736799331, "grad_norm": 0.06344935297966003, "learning_rate": 3.019601169804216e-05, "loss": 0.0054, "step": 145600 }, { "epoch": 6.671016334146034, "grad_norm": 0.2660221755504608, "learning_rate": 3.012265838717878e-05, "loss": 0.0049, "step": 145700 }, { "epoch": 6.675594931492737, "grad_norm": 0.22985537350177765, "learning_rate": 3.0049355853388955e-05, "loss": 0.0049, "step": 145800 }, { "epoch": 6.68017352883944, "grad_norm": 0.06822630017995834, "learning_rate": 2.9976104283925515e-05, "loss": 0.004, "step": 145900 }, { "epoch": 6.684752126186143, "grad_norm": 0.03378499671816826, "learning_rate": 2.9902903865911068e-05, "loss": 0.0062, "step": 146000 }, { "epoch": 6.689330723532846, "grad_norm": 0.3799358904361725, "learning_rate": 2.9829754786337603e-05, "loss": 0.0056, "step": 146100 }, { "epoch": 6.693909320879548, "grad_norm": 0.2396411895751953, "learning_rate": 2.975665723206591e-05, "loss": 0.0049, "step": 146200 }, { "epoch": 6.698487918226252, "grad_norm": 0.19714663922786713, "learning_rate": 2.9683611389825167e-05, "loss": 0.0057, "step": 146300 }, { "epoch": 6.703066515572954, "grad_norm": 0.07194243371486664, "learning_rate": 2.9610617446212495e-05, "loss": 0.0058, "step": 146400 }, { "epoch": 6.707645112919657, "grad_norm": 0.567692220211029, "learning_rate": 2.9537675587692382e-05, "loss": 0.0045, "step": 146500 }, { "epoch": 6.71222371026636, "grad_norm": 0.4618910551071167, "learning_rate": 2.946478600059629e-05, "loss": 0.0051, "step": 146600 }, { "epoch": 6.7168023076130625, "grad_norm": 0.09115318953990936, "learning_rate": 2.939194887112218e-05, "loss": 0.0046, "step": 146700 }, { "epoch": 6.721380904959766, "grad_norm": 0.1926048994064331, "learning_rate": 2.9319164385333953e-05, "loss": 0.0039, "step": 146800 }, { "epoch": 6.725959502306468, "grad_norm": 0.5767799615859985, "learning_rate": 2.9246432729161055e-05, "loss": 0.0068, "step": 146900 }, { "epoch": 6.730538099653171, "grad_norm": 0.5855737328529358, "learning_rate": 2.917375408839803e-05, "loss": 0.0039, "step": 147000 }, { "epoch": 6.735116696999874, "grad_norm": 0.18008683621883392, "learning_rate": 2.910112864870388e-05, "loss": 0.0053, "step": 147100 }, { "epoch": 6.739695294346577, "grad_norm": 0.013704614713788033, "learning_rate": 2.9028556595601786e-05, "loss": 0.0058, "step": 147200 }, { "epoch": 6.74427389169328, "grad_norm": 0.2103748619556427, "learning_rate": 2.895603811447858e-05, "loss": 0.0053, "step": 147300 }, { "epoch": 6.748852489039983, "grad_norm": 0.3199872374534607, "learning_rate": 2.888357339058413e-05, "loss": 0.0045, "step": 147400 }, { "epoch": 6.753431086386685, "grad_norm": 0.1987699270248413, "learning_rate": 2.8811162609031104e-05, "loss": 0.0038, "step": 147500 }, { "epoch": 6.7580096837333885, "grad_norm": 0.12141498178243637, "learning_rate": 2.8738805954794295e-05, "loss": 0.004, "step": 147600 }, { "epoch": 6.762588281080091, "grad_norm": 0.24456042051315308, "learning_rate": 2.8666503612710226e-05, "loss": 0.0052, "step": 147700 }, { "epoch": 6.767166878426794, "grad_norm": 0.21112100780010223, "learning_rate": 2.8594255767476718e-05, "loss": 0.0057, "step": 147800 }, { "epoch": 6.771745475773497, "grad_norm": 0.395063579082489, "learning_rate": 2.852206260365237e-05, "loss": 0.0051, "step": 147900 }, { "epoch": 6.7763240731201995, "grad_norm": 0.39365246891975403, "learning_rate": 2.8449924305656107e-05, "loss": 0.0043, "step": 148000 }, { "epoch": 6.780902670466903, "grad_norm": 0.19307668507099152, "learning_rate": 2.8377841057766624e-05, "loss": 0.0057, "step": 148100 }, { "epoch": 6.785481267813605, "grad_norm": 0.3313720226287842, "learning_rate": 2.8305813044122097e-05, "loss": 0.0054, "step": 148200 }, { "epoch": 6.790059865160308, "grad_norm": 0.6470041871070862, "learning_rate": 2.8233840448719532e-05, "loss": 0.0048, "step": 148300 }, { "epoch": 6.794638462507011, "grad_norm": 0.9007655382156372, "learning_rate": 2.8161923455414367e-05, "loss": 0.0055, "step": 148400 }, { "epoch": 6.799217059853714, "grad_norm": 0.8383020758628845, "learning_rate": 2.8090062247920045e-05, "loss": 0.005, "step": 148500 }, { "epoch": 6.803795657200417, "grad_norm": 0.2168063223361969, "learning_rate": 2.80182570098075e-05, "loss": 0.0045, "step": 148600 }, { "epoch": 6.80837425454712, "grad_norm": 0.1763121336698532, "learning_rate": 2.794650792450464e-05, "loss": 0.0058, "step": 148700 }, { "epoch": 6.812952851893822, "grad_norm": 0.022952038794755936, "learning_rate": 2.7874815175296e-05, "loss": 0.0043, "step": 148800 }, { "epoch": 6.8175314492405255, "grad_norm": 0.01308775506913662, "learning_rate": 2.7803178945322134e-05, "loss": 0.0047, "step": 148900 }, { "epoch": 6.822110046587228, "grad_norm": 0.0029964440036565065, "learning_rate": 2.7731599417579245e-05, "loss": 0.0052, "step": 149000 }, { "epoch": 6.826688643933931, "grad_norm": 0.2904300093650818, "learning_rate": 2.7660076774918708e-05, "loss": 0.0039, "step": 149100 }, { "epoch": 6.831267241280633, "grad_norm": 0.19335739314556122, "learning_rate": 2.7588611200046592e-05, "loss": 0.004, "step": 149200 }, { "epoch": 6.835845838627336, "grad_norm": 0.04841936379671097, "learning_rate": 2.7517202875523117e-05, "loss": 0.0048, "step": 149300 }, { "epoch": 6.84042443597404, "grad_norm": 0.12522141635417938, "learning_rate": 2.7445851983762344e-05, "loss": 0.004, "step": 149400 }, { "epoch": 6.845003033320742, "grad_norm": 0.17885401844978333, "learning_rate": 2.737455870703155e-05, "loss": 0.0055, "step": 149500 }, { "epoch": 6.849581630667445, "grad_norm": 0.47798067331314087, "learning_rate": 2.7303323227450857e-05, "loss": 0.005, "step": 149600 }, { "epoch": 6.854160228014148, "grad_norm": 0.02613680437207222, "learning_rate": 2.7232145726992752e-05, "loss": 0.0065, "step": 149700 }, { "epoch": 6.8587388253608506, "grad_norm": 0.2435833066701889, "learning_rate": 2.7161026387481636e-05, "loss": 0.0061, "step": 149800 }, { "epoch": 6.863317422707554, "grad_norm": 0.08455272018909454, "learning_rate": 2.7089965390593263e-05, "loss": 0.0059, "step": 149900 }, { "epoch": 6.867896020054256, "grad_norm": 0.08332820981740952, "learning_rate": 2.7018962917854418e-05, "loss": 0.0042, "step": 150000 }, { "epoch": 6.872474617400959, "grad_norm": 0.34127193689346313, "learning_rate": 2.6948019150642383e-05, "loss": 0.0029, "step": 150100 }, { "epoch": 6.877053214747662, "grad_norm": 0.1273692101240158, "learning_rate": 2.6877134270184435e-05, "loss": 0.0051, "step": 150200 }, { "epoch": 6.881631812094365, "grad_norm": 0.14943909645080566, "learning_rate": 2.6806308457557423e-05, "loss": 0.0062, "step": 150300 }, { "epoch": 6.886210409441068, "grad_norm": 0.14334943890571594, "learning_rate": 2.6735541893687343e-05, "loss": 0.0056, "step": 150400 }, { "epoch": 6.89078900678777, "grad_norm": 0.15444263815879822, "learning_rate": 2.666483475934885e-05, "loss": 0.0045, "step": 150500 }, { "epoch": 6.895367604134473, "grad_norm": 0.31661495566368103, "learning_rate": 2.6594187235164713e-05, "loss": 0.0063, "step": 150600 }, { "epoch": 6.8999462014811765, "grad_norm": 0.4060909152030945, "learning_rate": 2.65235995016055e-05, "loss": 0.0052, "step": 150700 }, { "epoch": 6.904524798827879, "grad_norm": 0.20253728330135345, "learning_rate": 2.645307173898901e-05, "loss": 0.0064, "step": 150800 }, { "epoch": 6.909103396174582, "grad_norm": 0.7078954577445984, "learning_rate": 2.6382604127479815e-05, "loss": 0.0044, "step": 150900 }, { "epoch": 6.913681993521285, "grad_norm": 0.14812923967838287, "learning_rate": 2.6312196847088893e-05, "loss": 0.0052, "step": 151000 }, { "epoch": 6.9182605908679875, "grad_norm": 0.10642609000205994, "learning_rate": 2.6241850077673087e-05, "loss": 0.0052, "step": 151100 }, { "epoch": 6.922839188214691, "grad_norm": 0.15536317229270935, "learning_rate": 2.6171563998934605e-05, "loss": 0.0053, "step": 151200 }, { "epoch": 6.927417785561393, "grad_norm": 0.1677425354719162, "learning_rate": 2.6101338790420715e-05, "loss": 0.0048, "step": 151300 }, { "epoch": 6.931996382908096, "grad_norm": 0.1952294558286667, "learning_rate": 2.6031174631523118e-05, "loss": 0.0059, "step": 151400 }, { "epoch": 6.9365749802547985, "grad_norm": 0.171901673078537, "learning_rate": 2.5961071701477567e-05, "loss": 0.0049, "step": 151500 }, { "epoch": 6.941153577601502, "grad_norm": 0.0632760226726532, "learning_rate": 2.589103017936344e-05, "loss": 0.0043, "step": 151600 }, { "epoch": 6.945732174948205, "grad_norm": 0.14387081563472748, "learning_rate": 2.582105024410325e-05, "loss": 0.0046, "step": 151700 }, { "epoch": 6.950310772294907, "grad_norm": 0.1789962351322174, "learning_rate": 2.575113207446213e-05, "loss": 0.0041, "step": 151800 }, { "epoch": 6.95488936964161, "grad_norm": 0.05178796499967575, "learning_rate": 2.5681275849047482e-05, "loss": 0.0052, "step": 151900 }, { "epoch": 6.9594679669883135, "grad_norm": 0.06059027463197708, "learning_rate": 2.5611481746308473e-05, "loss": 0.0049, "step": 152000 }, { "epoch": 6.964046564335016, "grad_norm": 0.02760574221611023, "learning_rate": 2.5541749944535554e-05, "loss": 0.005, "step": 152100 }, { "epoch": 6.968625161681719, "grad_norm": 0.5041255950927734, "learning_rate": 2.547208062185999e-05, "loss": 0.0034, "step": 152200 }, { "epoch": 6.973203759028421, "grad_norm": 0.12533140182495117, "learning_rate": 2.5402473956253515e-05, "loss": 0.0059, "step": 152300 }, { "epoch": 6.9777823563751245, "grad_norm": 0.0706457793712616, "learning_rate": 2.5332930125527787e-05, "loss": 0.006, "step": 152400 }, { "epoch": 6.982360953721828, "grad_norm": 0.37089434266090393, "learning_rate": 2.5263449307333908e-05, "loss": 0.0052, "step": 152500 }, { "epoch": 6.98693955106853, "grad_norm": 0.034625936299562454, "learning_rate": 2.5194031679162067e-05, "loss": 0.0048, "step": 152600 }, { "epoch": 6.991518148415233, "grad_norm": 0.19594725966453552, "learning_rate": 2.512467741834099e-05, "loss": 0.0048, "step": 152700 }, { "epoch": 6.996096745761935, "grad_norm": 0.09410729259252548, "learning_rate": 2.505538670203754e-05, "loss": 0.0043, "step": 152800 }, { "epoch": 6.9999885535066335, "eval_loss": 0.1694260537624359, "eval_runtime": 268.0609, "eval_samples_per_second": 20.518, "eval_steps_per_second": 20.518, "step": 152885 }, { "epoch": 7.000675343108639, "grad_norm": 0.03295362740755081, "learning_rate": 2.4986159707256274e-05, "loss": 0.0039, "step": 152900 }, { "epoch": 7.005253940455342, "grad_norm": 0.06891167163848877, "learning_rate": 2.4916996610838973e-05, "loss": 0.0035, "step": 153000 }, { "epoch": 7.009832537802044, "grad_norm": 0.12001962214708328, "learning_rate": 2.484789758946414e-05, "loss": 0.0034, "step": 153100 }, { "epoch": 7.014411135148747, "grad_norm": 0.2218388170003891, "learning_rate": 2.477886281964667e-05, "loss": 0.0026, "step": 153200 }, { "epoch": 7.01898973249545, "grad_norm": 0.006664707791060209, "learning_rate": 2.4709892477737262e-05, "loss": 0.0028, "step": 153300 }, { "epoch": 7.023568329842153, "grad_norm": 0.0030799272935837507, "learning_rate": 2.464098673992205e-05, "loss": 0.0035, "step": 153400 }, { "epoch": 7.028146927188856, "grad_norm": 0.14536774158477783, "learning_rate": 2.457214578222215e-05, "loss": 0.0028, "step": 153500 }, { "epoch": 7.032725524535558, "grad_norm": 0.14756208658218384, "learning_rate": 2.450336978049322e-05, "loss": 0.0031, "step": 153600 }, { "epoch": 7.037304121882261, "grad_norm": 0.22443515062332153, "learning_rate": 2.44346589104249e-05, "loss": 0.0044, "step": 153700 }, { "epoch": 7.041882719228965, "grad_norm": 0.05271737277507782, "learning_rate": 2.4366013347540545e-05, "loss": 0.0034, "step": 153800 }, { "epoch": 7.046461316575667, "grad_norm": 0.10101396590471268, "learning_rate": 2.4297433267196668e-05, "loss": 0.0043, "step": 153900 }, { "epoch": 7.05103991392237, "grad_norm": 0.042054641991853714, "learning_rate": 2.422891884458241e-05, "loss": 0.0034, "step": 154000 }, { "epoch": 7.055618511269072, "grad_norm": 0.3988908529281616, "learning_rate": 2.4160470254719285e-05, "loss": 0.0033, "step": 154100 }, { "epoch": 7.0601971086157755, "grad_norm": 0.02858237735927105, "learning_rate": 2.4092087672460623e-05, "loss": 0.004, "step": 154200 }, { "epoch": 7.064775705962479, "grad_norm": 0.023146772757172585, "learning_rate": 2.4023771272491125e-05, "loss": 0.0033, "step": 154300 }, { "epoch": 7.069354303309181, "grad_norm": 0.2908150553703308, "learning_rate": 2.39555212293264e-05, "loss": 0.0028, "step": 154400 }, { "epoch": 7.073932900655884, "grad_norm": 0.15492355823516846, "learning_rate": 2.38873377173126e-05, "loss": 0.003, "step": 154500 }, { "epoch": 7.0785114980025865, "grad_norm": 0.15679115056991577, "learning_rate": 2.3819220910625882e-05, "loss": 0.002, "step": 154600 }, { "epoch": 7.08309009534929, "grad_norm": 0.08304117619991302, "learning_rate": 2.3751170983272e-05, "loss": 0.0037, "step": 154700 }, { "epoch": 7.087668692695993, "grad_norm": 0.19912482798099518, "learning_rate": 2.368318810908588e-05, "loss": 0.0038, "step": 154800 }, { "epoch": 7.092247290042695, "grad_norm": 0.163644939661026, "learning_rate": 2.3615272461731186e-05, "loss": 0.0046, "step": 154900 }, { "epoch": 7.096825887389398, "grad_norm": 0.564191460609436, "learning_rate": 2.3547424214699786e-05, "loss": 0.0027, "step": 155000 }, { "epoch": 7.1014044847361015, "grad_norm": 0.21988272666931152, "learning_rate": 2.347964354131144e-05, "loss": 0.0032, "step": 155100 }, { "epoch": 7.105983082082804, "grad_norm": 0.028988847509026527, "learning_rate": 2.3411930614713247e-05, "loss": 0.0038, "step": 155200 }, { "epoch": 7.110561679429507, "grad_norm": 0.23106561601161957, "learning_rate": 2.3344285607879224e-05, "loss": 0.0026, "step": 155300 }, { "epoch": 7.115140276776209, "grad_norm": 0.08767526596784592, "learning_rate": 2.3276708693609943e-05, "loss": 0.0044, "step": 155400 }, { "epoch": 7.1197188741229125, "grad_norm": 0.11939793080091476, "learning_rate": 2.3209200044532027e-05, "loss": 0.0028, "step": 155500 }, { "epoch": 7.124297471469616, "grad_norm": 0.19785170257091522, "learning_rate": 2.3141759833097653e-05, "loss": 0.003, "step": 155600 }, { "epoch": 7.128876068816318, "grad_norm": 0.889499843120575, "learning_rate": 2.307438823158425e-05, "loss": 0.0024, "step": 155700 }, { "epoch": 7.133454666163021, "grad_norm": 0.5035886764526367, "learning_rate": 2.300708541209393e-05, "loss": 0.0039, "step": 155800 }, { "epoch": 7.1380332635097234, "grad_norm": 0.04118403419852257, "learning_rate": 2.2939851546553094e-05, "loss": 0.0038, "step": 155900 }, { "epoch": 7.142611860856427, "grad_norm": 0.3574579358100891, "learning_rate": 2.2872686806712035e-05, "loss": 0.0028, "step": 156000 }, { "epoch": 7.14719045820313, "grad_norm": 0.005380525719374418, "learning_rate": 2.2805591364144447e-05, "loss": 0.0028, "step": 156100 }, { "epoch": 7.151769055549832, "grad_norm": 0.0275371465831995, "learning_rate": 2.273856539024703e-05, "loss": 0.0029, "step": 156200 }, { "epoch": 7.156347652896535, "grad_norm": 0.05387549847364426, "learning_rate": 2.2671609056238952e-05, "loss": 0.0026, "step": 156300 }, { "epoch": 7.160926250243238, "grad_norm": 0.25863537192344666, "learning_rate": 2.2604722533161572e-05, "loss": 0.0022, "step": 156400 }, { "epoch": 7.165504847589941, "grad_norm": 0.03250390663743019, "learning_rate": 2.2537905991877855e-05, "loss": 0.0026, "step": 156500 }, { "epoch": 7.170083444936644, "grad_norm": 0.15917915105819702, "learning_rate": 2.2471159603071995e-05, "loss": 0.0047, "step": 156600 }, { "epoch": 7.174662042283346, "grad_norm": 0.25873464345932007, "learning_rate": 2.2404483537249023e-05, "loss": 0.0041, "step": 156700 }, { "epoch": 7.179240639630049, "grad_norm": 0.03446133807301521, "learning_rate": 2.233787796473432e-05, "loss": 0.0027, "step": 156800 }, { "epoch": 7.183819236976753, "grad_norm": 0.39116761088371277, "learning_rate": 2.2271343055673144e-05, "loss": 0.0027, "step": 156900 }, { "epoch": 7.188397834323455, "grad_norm": 0.005667871795594692, "learning_rate": 2.22048789800303e-05, "loss": 0.0032, "step": 157000 }, { "epoch": 7.192976431670158, "grad_norm": 0.2927045226097107, "learning_rate": 2.2138485907589613e-05, "loss": 0.0033, "step": 157100 }, { "epoch": 7.19755502901686, "grad_norm": 0.15872865915298462, "learning_rate": 2.2072164007953517e-05, "loss": 0.0029, "step": 157200 }, { "epoch": 7.202133626363564, "grad_norm": 0.5440332293510437, "learning_rate": 2.200591345054267e-05, "loss": 0.0037, "step": 157300 }, { "epoch": 7.206712223710267, "grad_norm": 0.2492242008447647, "learning_rate": 2.193973440459549e-05, "loss": 0.0029, "step": 157400 }, { "epoch": 7.211290821056969, "grad_norm": 0.0664735659956932, "learning_rate": 2.187362703916766e-05, "loss": 0.0036, "step": 157500 }, { "epoch": 7.215869418403672, "grad_norm": 0.006082352716475725, "learning_rate": 2.1807591523131827e-05, "loss": 0.0023, "step": 157600 }, { "epoch": 7.2204480157503745, "grad_norm": 0.008978066965937614, "learning_rate": 2.1741628025177036e-05, "loss": 0.0031, "step": 157700 }, { "epoch": 7.225026613097078, "grad_norm": 0.034269288182258606, "learning_rate": 2.167573671380837e-05, "loss": 0.005, "step": 157800 }, { "epoch": 7.229605210443781, "grad_norm": 0.2424388974905014, "learning_rate": 2.1609917757346542e-05, "loss": 0.0031, "step": 157900 }, { "epoch": 7.234183807790483, "grad_norm": 0.07712133228778839, "learning_rate": 2.1544171323927415e-05, "loss": 0.003, "step": 158000 }, { "epoch": 7.238762405137186, "grad_norm": 0.4302210509777069, "learning_rate": 2.1478497581501616e-05, "loss": 0.0034, "step": 158100 }, { "epoch": 7.243341002483889, "grad_norm": 0.09475143998861313, "learning_rate": 2.141289669783401e-05, "loss": 0.0028, "step": 158200 }, { "epoch": 7.247919599830592, "grad_norm": 0.35180267691612244, "learning_rate": 2.134736884050343e-05, "loss": 0.0042, "step": 158300 }, { "epoch": 7.252498197177295, "grad_norm": 0.013500731438398361, "learning_rate": 2.1281914176902108e-05, "loss": 0.0043, "step": 158400 }, { "epoch": 7.257076794523997, "grad_norm": 0.23346847295761108, "learning_rate": 2.1216532874235285e-05, "loss": 0.0031, "step": 158500 }, { "epoch": 7.2616553918707005, "grad_norm": 0.16270193457603455, "learning_rate": 2.115122509952085e-05, "loss": 0.004, "step": 158600 }, { "epoch": 7.266233989217403, "grad_norm": 0.23846475780010223, "learning_rate": 2.1085991019588863e-05, "loss": 0.0027, "step": 158700 }, { "epoch": 7.270812586564106, "grad_norm": 0.027605965733528137, "learning_rate": 2.1020830801081077e-05, "loss": 0.0026, "step": 158800 }, { "epoch": 7.275391183910809, "grad_norm": 0.01757560484111309, "learning_rate": 2.0955744610450618e-05, "loss": 0.0036, "step": 158900 }, { "epoch": 7.2799697812575115, "grad_norm": 0.02324344404041767, "learning_rate": 2.0890732613961478e-05, "loss": 0.0029, "step": 159000 }, { "epoch": 7.284548378604215, "grad_norm": 0.01704220287501812, "learning_rate": 2.0825794977688108e-05, "loss": 0.0037, "step": 159100 }, { "epoch": 7.289126975950918, "grad_norm": 0.08089294284582138, "learning_rate": 2.0760931867515032e-05, "loss": 0.0035, "step": 159200 }, { "epoch": 7.29370557329762, "grad_norm": 0.15005187690258026, "learning_rate": 2.0696143449136402e-05, "loss": 0.0022, "step": 159300 }, { "epoch": 7.298284170644323, "grad_norm": 0.0878557413816452, "learning_rate": 2.063142988805552e-05, "loss": 0.0035, "step": 159400 }, { "epoch": 7.302862767991026, "grad_norm": 0.012229022569954395, "learning_rate": 2.056679134958453e-05, "loss": 0.0026, "step": 159500 }, { "epoch": 7.307441365337729, "grad_norm": 0.053704481571912766, "learning_rate": 2.050222799884387e-05, "loss": 0.0036, "step": 159600 }, { "epoch": 7.312019962684432, "grad_norm": 0.5345095992088318, "learning_rate": 2.0437740000761925e-05, "loss": 0.0038, "step": 159700 }, { "epoch": 7.316598560031134, "grad_norm": 0.09854476153850555, "learning_rate": 2.037332752007461e-05, "loss": 0.0031, "step": 159800 }, { "epoch": 7.3211771573778375, "grad_norm": 0.04005116969347, "learning_rate": 2.0308990721324927e-05, "loss": 0.0027, "step": 159900 }, { "epoch": 7.32575575472454, "grad_norm": 1.264863133430481, "learning_rate": 2.0244729768862518e-05, "loss": 0.0034, "step": 160000 }, { "epoch": 7.330334352071243, "grad_norm": 0.017268653959035873, "learning_rate": 2.01805448268433e-05, "loss": 0.0037, "step": 160100 }, { "epoch": 7.334912949417946, "grad_norm": 0.10752640664577484, "learning_rate": 2.0116436059229038e-05, "loss": 0.0035, "step": 160200 }, { "epoch": 7.339491546764648, "grad_norm": 0.43235811591148376, "learning_rate": 2.0052403629786858e-05, "loss": 0.0027, "step": 160300 }, { "epoch": 7.344070144111352, "grad_norm": 0.014576783403754234, "learning_rate": 1.9988447702088898e-05, "loss": 0.0035, "step": 160400 }, { "epoch": 7.348648741458054, "grad_norm": 0.1350947916507721, "learning_rate": 1.9924568439511876e-05, "loss": 0.0032, "step": 160500 }, { "epoch": 7.353227338804757, "grad_norm": 0.24974310398101807, "learning_rate": 1.98607660052367e-05, "loss": 0.0038, "step": 160600 }, { "epoch": 7.35780593615146, "grad_norm": 0.05233803018927574, "learning_rate": 1.9797040562247948e-05, "loss": 0.0041, "step": 160700 }, { "epoch": 7.362384533498163, "grad_norm": 0.18822649121284485, "learning_rate": 1.9733392273333596e-05, "loss": 0.0037, "step": 160800 }, { "epoch": 7.366963130844866, "grad_norm": 0.19756104052066803, "learning_rate": 1.9669821301084475e-05, "loss": 0.0027, "step": 160900 }, { "epoch": 7.371541728191568, "grad_norm": 0.00448650261387229, "learning_rate": 1.9606327807893902e-05, "loss": 0.0032, "step": 161000 }, { "epoch": 7.376120325538271, "grad_norm": 0.14489981532096863, "learning_rate": 1.954291195595733e-05, "loss": 0.0031, "step": 161100 }, { "epoch": 7.380698922884974, "grad_norm": 0.0051267268136143684, "learning_rate": 1.947957390727185e-05, "loss": 0.003, "step": 161200 }, { "epoch": 7.385277520231677, "grad_norm": 0.38486120104789734, "learning_rate": 1.941631382363576e-05, "loss": 0.0035, "step": 161300 }, { "epoch": 7.38985611757838, "grad_norm": 0.004985155537724495, "learning_rate": 1.9353131866648273e-05, "loss": 0.0024, "step": 161400 }, { "epoch": 7.394434714925083, "grad_norm": 0.002783630508929491, "learning_rate": 1.929002819770896e-05, "loss": 0.0034, "step": 161500 }, { "epoch": 7.399013312271785, "grad_norm": 0.2842748165130615, "learning_rate": 1.922700297801741e-05, "loss": 0.0034, "step": 161600 }, { "epoch": 7.403591909618489, "grad_norm": 0.050929997116327286, "learning_rate": 1.9164056368572846e-05, "loss": 0.003, "step": 161700 }, { "epoch": 7.408170506965191, "grad_norm": 0.06748020648956299, "learning_rate": 1.9101188530173687e-05, "loss": 0.0032, "step": 161800 }, { "epoch": 7.412749104311894, "grad_norm": 0.03134176880121231, "learning_rate": 1.9038399623417063e-05, "loss": 0.0023, "step": 161900 }, { "epoch": 7.417327701658597, "grad_norm": 0.06679194420576096, "learning_rate": 1.897568980869855e-05, "loss": 0.0032, "step": 162000 }, { "epoch": 7.4219062990052995, "grad_norm": 0.22911858558654785, "learning_rate": 1.8913059246211612e-05, "loss": 0.0033, "step": 162100 }, { "epoch": 7.426484896352003, "grad_norm": 0.12825864553451538, "learning_rate": 1.8850508095947332e-05, "loss": 0.0029, "step": 162200 }, { "epoch": 7.431063493698705, "grad_norm": 0.022259972989559174, "learning_rate": 1.8788036517693858e-05, "loss": 0.004, "step": 162300 }, { "epoch": 7.435642091045408, "grad_norm": 0.09766406565904617, "learning_rate": 1.8725644671036126e-05, "loss": 0.0033, "step": 162400 }, { "epoch": 7.440220688392111, "grad_norm": 0.6670352816581726, "learning_rate": 1.8663332715355396e-05, "loss": 0.0032, "step": 162500 }, { "epoch": 7.444799285738814, "grad_norm": 0.009802890941500664, "learning_rate": 1.8601100809828787e-05, "loss": 0.0039, "step": 162600 }, { "epoch": 7.449377883085517, "grad_norm": 0.08977996557950974, "learning_rate": 1.853894911342901e-05, "loss": 0.0029, "step": 162700 }, { "epoch": 7.45395648043222, "grad_norm": 0.713555097579956, "learning_rate": 1.847687778492382e-05, "loss": 0.0027, "step": 162800 }, { "epoch": 7.458535077778922, "grad_norm": 0.3743430972099304, "learning_rate": 1.8414886982875664e-05, "loss": 0.0034, "step": 162900 }, { "epoch": 7.4631136751256255, "grad_norm": 0.0767466276884079, "learning_rate": 1.8352976865641326e-05, "loss": 0.0032, "step": 163000 }, { "epoch": 7.467692272472328, "grad_norm": 0.28391310572624207, "learning_rate": 1.8291147591371482e-05, "loss": 0.0035, "step": 163100 }, { "epoch": 7.472270869819031, "grad_norm": 0.25534164905548096, "learning_rate": 1.822939931801024e-05, "loss": 0.0028, "step": 163200 }, { "epoch": 7.476849467165734, "grad_norm": 0.03635001927614212, "learning_rate": 1.816773220329484e-05, "loss": 0.0035, "step": 163300 }, { "epoch": 7.4814280645124365, "grad_norm": 0.06547212600708008, "learning_rate": 1.810614640475518e-05, "loss": 0.004, "step": 163400 }, { "epoch": 7.48600666185914, "grad_norm": 0.10231446474790573, "learning_rate": 1.8044642079713408e-05, "loss": 0.0026, "step": 163500 }, { "epoch": 7.490585259205842, "grad_norm": 0.08887581527233124, "learning_rate": 1.79832193852836e-05, "loss": 0.002, "step": 163600 }, { "epoch": 7.495163856552545, "grad_norm": 0.01825689524412155, "learning_rate": 1.792187847837129e-05, "loss": 0.0032, "step": 163700 }, { "epoch": 7.499742453899248, "grad_norm": 0.0413985475897789, "learning_rate": 1.7860619515673033e-05, "loss": 0.003, "step": 163800 }, { "epoch": 7.504321051245951, "grad_norm": 0.11123603582382202, "learning_rate": 1.779944265367614e-05, "loss": 0.0031, "step": 163900 }, { "epoch": 7.508899648592654, "grad_norm": 0.11079199612140656, "learning_rate": 1.7738348048658127e-05, "loss": 0.0029, "step": 164000 }, { "epoch": 7.513478245939356, "grad_norm": 0.026996923610568047, "learning_rate": 1.767733585668639e-05, "loss": 0.0028, "step": 164100 }, { "epoch": 7.518056843286059, "grad_norm": 0.2861877381801605, "learning_rate": 1.7616406233617832e-05, "loss": 0.0033, "step": 164200 }, { "epoch": 7.5226354406327625, "grad_norm": 0.013889641501009464, "learning_rate": 1.7555559335098414e-05, "loss": 0.0034, "step": 164300 }, { "epoch": 7.527214037979465, "grad_norm": 0.5749355554580688, "learning_rate": 1.749479531656279e-05, "loss": 0.0034, "step": 164400 }, { "epoch": 7.531792635326168, "grad_norm": 0.03499993681907654, "learning_rate": 1.7434114333233852e-05, "loss": 0.0029, "step": 164500 }, { "epoch": 7.53637123267287, "grad_norm": 0.1424218863248825, "learning_rate": 1.737351654012244e-05, "loss": 0.0025, "step": 164600 }, { "epoch": 7.540949830019573, "grad_norm": 0.009633993729948997, "learning_rate": 1.7313002092026837e-05, "loss": 0.0032, "step": 164700 }, { "epoch": 7.545528427366277, "grad_norm": 0.02650436945259571, "learning_rate": 1.725257114353241e-05, "loss": 0.0044, "step": 164800 }, { "epoch": 7.550107024712979, "grad_norm": 0.0338139683008194, "learning_rate": 1.7192223849011258e-05, "loss": 0.0029, "step": 164900 }, { "epoch": 7.554685622059682, "grad_norm": 1.0118355751037598, "learning_rate": 1.7131960362621796e-05, "loss": 0.0041, "step": 165000 }, { "epoch": 7.559264219406385, "grad_norm": 0.014256274327635765, "learning_rate": 1.7071780838308288e-05, "loss": 0.0027, "step": 165100 }, { "epoch": 7.563842816753088, "grad_norm": 0.05664459615945816, "learning_rate": 1.7011685429800595e-05, "loss": 0.0026, "step": 165200 }, { "epoch": 7.568421414099791, "grad_norm": 0.14832501113414764, "learning_rate": 1.695167429061364e-05, "loss": 0.0027, "step": 165300 }, { "epoch": 7.573000011446493, "grad_norm": 0.19807232916355133, "learning_rate": 1.6891747574047078e-05, "loss": 0.0026, "step": 165400 }, { "epoch": 7.577578608793196, "grad_norm": 0.09145753085613251, "learning_rate": 1.6831905433184946e-05, "loss": 0.0032, "step": 165500 }, { "epoch": 7.582157206139899, "grad_norm": 0.021602990105748177, "learning_rate": 1.6772148020895228e-05, "loss": 0.0022, "step": 165600 }, { "epoch": 7.586735803486602, "grad_norm": 0.2839347720146179, "learning_rate": 1.671247548982941e-05, "loss": 0.0034, "step": 165700 }, { "epoch": 7.591314400833305, "grad_norm": 0.02294602431356907, "learning_rate": 1.6652887992422235e-05, "loss": 0.0023, "step": 165800 }, { "epoch": 7.595892998180007, "grad_norm": 0.027606772258877754, "learning_rate": 1.659338568089114e-05, "loss": 0.0032, "step": 165900 }, { "epoch": 7.60047159552671, "grad_norm": 0.01902574673295021, "learning_rate": 1.653396870723599e-05, "loss": 0.0036, "step": 166000 }, { "epoch": 7.6050501928734136, "grad_norm": 0.06941546499729156, "learning_rate": 1.6474637223238665e-05, "loss": 0.0031, "step": 166100 }, { "epoch": 7.609628790220116, "grad_norm": 0.06622402369976044, "learning_rate": 1.641539138046264e-05, "loss": 0.003, "step": 166200 }, { "epoch": 7.614207387566819, "grad_norm": 0.0019321365980431437, "learning_rate": 1.6356231330252657e-05, "loss": 0.0031, "step": 166300 }, { "epoch": 7.618785984913522, "grad_norm": 0.11348855495452881, "learning_rate": 1.629715722373423e-05, "loss": 0.0039, "step": 166400 }, { "epoch": 7.6233645822602245, "grad_norm": 0.14493609964847565, "learning_rate": 1.6238169211813387e-05, "loss": 0.0019, "step": 166500 }, { "epoch": 7.627943179606928, "grad_norm": 0.11578594148159027, "learning_rate": 1.6179267445176206e-05, "loss": 0.0031, "step": 166600 }, { "epoch": 7.63252177695363, "grad_norm": 0.026161905378103256, "learning_rate": 1.6120452074288416e-05, "loss": 0.0031, "step": 166700 }, { "epoch": 7.637100374300333, "grad_norm": 0.048572130501270294, "learning_rate": 1.6061723249395104e-05, "loss": 0.0027, "step": 166800 }, { "epoch": 7.6416789716470355, "grad_norm": 0.08658236265182495, "learning_rate": 1.600308112052027e-05, "loss": 0.0048, "step": 166900 }, { "epoch": 7.646257568993739, "grad_norm": 0.03995939716696739, "learning_rate": 1.594452583746638e-05, "loss": 0.0029, "step": 167000 }, { "epoch": 7.650836166340442, "grad_norm": 0.5306475758552551, "learning_rate": 1.588605754981413e-05, "loss": 0.0032, "step": 167100 }, { "epoch": 7.655414763687144, "grad_norm": 0.008948258124291897, "learning_rate": 1.582767640692194e-05, "loss": 0.0024, "step": 167200 }, { "epoch": 7.659993361033847, "grad_norm": 0.09350460022687912, "learning_rate": 1.576938255792561e-05, "loss": 0.0032, "step": 167300 }, { "epoch": 7.6645719583805505, "grad_norm": 0.34027963876724243, "learning_rate": 1.5711176151737984e-05, "loss": 0.0029, "step": 167400 }, { "epoch": 7.669150555727253, "grad_norm": 0.012650508433580399, "learning_rate": 1.5653057337048514e-05, "loss": 0.0031, "step": 167500 }, { "epoch": 7.673729153073956, "grad_norm": 0.07974658906459808, "learning_rate": 1.5595026262322875e-05, "loss": 0.0023, "step": 167600 }, { "epoch": 7.678307750420658, "grad_norm": 0.06705432385206223, "learning_rate": 1.553708307580265e-05, "loss": 0.0032, "step": 167700 }, { "epoch": 7.6828863477673615, "grad_norm": 0.027641797438263893, "learning_rate": 1.547922792550488e-05, "loss": 0.0036, "step": 167800 }, { "epoch": 7.687464945114065, "grad_norm": 0.44552162289619446, "learning_rate": 1.5421460959221707e-05, "loss": 0.0036, "step": 167900 }, { "epoch": 7.692043542460767, "grad_norm": 0.02241067960858345, "learning_rate": 1.536378232452003e-05, "loss": 0.0037, "step": 168000 }, { "epoch": 7.69662213980747, "grad_norm": 0.2189732789993286, "learning_rate": 1.5306192168741117e-05, "loss": 0.0026, "step": 168100 }, { "epoch": 7.701200737154172, "grad_norm": 0.046641841530799866, "learning_rate": 1.5248690639000162e-05, "loss": 0.0035, "step": 168200 }, { "epoch": 7.705779334500876, "grad_norm": 0.02562684379518032, "learning_rate": 1.5191277882186023e-05, "loss": 0.003, "step": 168300 }, { "epoch": 7.710357931847579, "grad_norm": 0.22241626679897308, "learning_rate": 1.513395404496072e-05, "loss": 0.0022, "step": 168400 }, { "epoch": 7.714936529194281, "grad_norm": 0.2740160822868347, "learning_rate": 1.5076719273759198e-05, "loss": 0.0033, "step": 168500 }, { "epoch": 7.719515126540984, "grad_norm": 0.02267398126423359, "learning_rate": 1.5019573714788809e-05, "loss": 0.002, "step": 168600 }, { "epoch": 7.7240937238876874, "grad_norm": 0.008224272169172764, "learning_rate": 1.4962517514029067e-05, "loss": 0.0022, "step": 168700 }, { "epoch": 7.72867232123439, "grad_norm": 0.11832094937562943, "learning_rate": 1.4905550817231206e-05, "loss": 0.0029, "step": 168800 }, { "epoch": 7.733250918581093, "grad_norm": 0.3029548227787018, "learning_rate": 1.4848673769917787e-05, "loss": 0.0042, "step": 168900 }, { "epoch": 7.737829515927795, "grad_norm": 0.026391340419650078, "learning_rate": 1.4791886517382413e-05, "loss": 0.0031, "step": 169000 }, { "epoch": 7.742408113274498, "grad_norm": 0.4289281666278839, "learning_rate": 1.473518920468926e-05, "loss": 0.0033, "step": 169100 }, { "epoch": 7.746986710621201, "grad_norm": 0.1801924854516983, "learning_rate": 1.4678581976672751e-05, "loss": 0.0028, "step": 169200 }, { "epoch": 7.751565307967904, "grad_norm": 0.06808359920978546, "learning_rate": 1.4622064977937222e-05, "loss": 0.0037, "step": 169300 }, { "epoch": 7.756143905314607, "grad_norm": 0.5008605122566223, "learning_rate": 1.4565638352856503e-05, "loss": 0.0032, "step": 169400 }, { "epoch": 7.760722502661309, "grad_norm": 0.13920585811138153, "learning_rate": 1.4509302245573536e-05, "loss": 0.0032, "step": 169500 }, { "epoch": 7.7653011000080125, "grad_norm": 0.002380757825449109, "learning_rate": 1.4453056800000076e-05, "loss": 0.0025, "step": 169600 }, { "epoch": 7.769879697354716, "grad_norm": 0.03281938657164574, "learning_rate": 1.4396902159816245e-05, "loss": 0.0028, "step": 169700 }, { "epoch": 7.774458294701418, "grad_norm": 0.2583022117614746, "learning_rate": 1.4340838468470197e-05, "loss": 0.0031, "step": 169800 }, { "epoch": 7.779036892048121, "grad_norm": 0.0035414681769907475, "learning_rate": 1.4284865869177789e-05, "loss": 0.0031, "step": 169900 }, { "epoch": 7.783615489394824, "grad_norm": 0.23097677528858185, "learning_rate": 1.4228984504922178e-05, "loss": 0.0034, "step": 170000 }, { "epoch": 7.788194086741527, "grad_norm": 0.515470027923584, "learning_rate": 1.4173194518453414e-05, "loss": 0.004, "step": 170100 }, { "epoch": 7.79277268408823, "grad_norm": 0.03734416887164116, "learning_rate": 1.4117496052288193e-05, "loss": 0.0025, "step": 170200 }, { "epoch": 7.797351281434932, "grad_norm": 0.270358681678772, "learning_rate": 1.4061889248709343e-05, "loss": 0.0017, "step": 170300 }, { "epoch": 7.801929878781635, "grad_norm": 0.027283625677227974, "learning_rate": 1.4006374249765597e-05, "loss": 0.0028, "step": 170400 }, { "epoch": 7.806508476128338, "grad_norm": 0.06574155390262604, "learning_rate": 1.3950951197271134e-05, "loss": 0.0031, "step": 170500 }, { "epoch": 7.811087073475041, "grad_norm": 0.05151946470141411, "learning_rate": 1.3895620232805279e-05, "loss": 0.0017, "step": 170600 }, { "epoch": 7.815665670821744, "grad_norm": 0.012561053037643433, "learning_rate": 1.3840381497712113e-05, "loss": 0.0025, "step": 170700 }, { "epoch": 7.820244268168446, "grad_norm": 0.005159000866115093, "learning_rate": 1.3785235133100088e-05, "loss": 0.0034, "step": 170800 }, { "epoch": 7.8248228655151495, "grad_norm": 0.04550444707274437, "learning_rate": 1.3730181279841748e-05, "loss": 0.0024, "step": 170900 }, { "epoch": 7.829401462861853, "grad_norm": 0.05944928154349327, "learning_rate": 1.3675220078573253e-05, "loss": 0.0022, "step": 171000 }, { "epoch": 7.833980060208555, "grad_norm": 0.31237590312957764, "learning_rate": 1.3620351669694103e-05, "loss": 0.0023, "step": 171100 }, { "epoch": 7.838558657555258, "grad_norm": 0.0012041196459904313, "learning_rate": 1.356557619336678e-05, "loss": 0.0027, "step": 171200 }, { "epoch": 7.8431372549019605, "grad_norm": 0.1280195415019989, "learning_rate": 1.3510893789516372e-05, "loss": 0.0034, "step": 171300 }, { "epoch": 7.847715852248664, "grad_norm": 0.2050485610961914, "learning_rate": 1.345630459783015e-05, "loss": 0.0028, "step": 171400 }, { "epoch": 7.852294449595367, "grad_norm": 0.15840676426887512, "learning_rate": 1.340180875775735e-05, "loss": 0.002, "step": 171500 }, { "epoch": 7.856873046942069, "grad_norm": 0.7529467344284058, "learning_rate": 1.3347406408508695e-05, "loss": 0.0022, "step": 171600 }, { "epoch": 7.861451644288772, "grad_norm": 0.03594828397035599, "learning_rate": 1.3293097689056078e-05, "loss": 0.0025, "step": 171700 }, { "epoch": 7.866030241635475, "grad_norm": 0.4587234854698181, "learning_rate": 1.323888273813223e-05, "loss": 0.0029, "step": 171800 }, { "epoch": 7.870608838982178, "grad_norm": 0.05882592126727104, "learning_rate": 1.3184761694230375e-05, "loss": 0.0026, "step": 171900 }, { "epoch": 7.875187436328881, "grad_norm": 0.07484336197376251, "learning_rate": 1.3130734695603786e-05, "loss": 0.0028, "step": 172000 }, { "epoch": 7.879766033675583, "grad_norm": 0.008674757555127144, "learning_rate": 1.3076801880265554e-05, "loss": 0.0028, "step": 172100 }, { "epoch": 7.884344631022286, "grad_norm": 0.41222670674324036, "learning_rate": 1.3022963385988151e-05, "loss": 0.0036, "step": 172200 }, { "epoch": 7.88892322836899, "grad_norm": 0.10513575375080109, "learning_rate": 1.296921935030308e-05, "loss": 0.0029, "step": 172300 }, { "epoch": 7.893501825715692, "grad_norm": 0.29091617465019226, "learning_rate": 1.2915569910500591e-05, "loss": 0.004, "step": 172400 }, { "epoch": 7.898080423062395, "grad_norm": 0.09394501894712448, "learning_rate": 1.2862015203629274e-05, "loss": 0.0032, "step": 172500 }, { "epoch": 7.902659020409097, "grad_norm": 0.0589442253112793, "learning_rate": 1.2808555366495728e-05, "loss": 0.0027, "step": 172600 }, { "epoch": 7.907237617755801, "grad_norm": 0.02068307250738144, "learning_rate": 1.2755190535664168e-05, "loss": 0.0024, "step": 172700 }, { "epoch": 7.911816215102503, "grad_norm": 0.08841919153928757, "learning_rate": 1.2701920847456166e-05, "loss": 0.0027, "step": 172800 }, { "epoch": 7.916394812449206, "grad_norm": 0.22736288607120514, "learning_rate": 1.264874643795021e-05, "loss": 0.0034, "step": 172900 }, { "epoch": 7.920973409795909, "grad_norm": 0.16831666231155396, "learning_rate": 1.2595667442981401e-05, "loss": 0.0023, "step": 173000 }, { "epoch": 7.9255520071426115, "grad_norm": 0.04770100489258766, "learning_rate": 1.2542683998141119e-05, "loss": 0.0025, "step": 173100 }, { "epoch": 7.930130604489315, "grad_norm": 0.6141162514686584, "learning_rate": 1.2489796238776675e-05, "loss": 0.004, "step": 173200 }, { "epoch": 7.934709201836018, "grad_norm": 0.7967793345451355, "learning_rate": 1.243700429999089e-05, "loss": 0.0027, "step": 173300 }, { "epoch": 7.93928779918272, "grad_norm": 0.015516542829573154, "learning_rate": 1.2384308316641874e-05, "loss": 0.0017, "step": 173400 }, { "epoch": 7.943866396529423, "grad_norm": 0.0020021158270537853, "learning_rate": 1.233170842334258e-05, "loss": 0.0029, "step": 173500 }, { "epoch": 7.948444993876126, "grad_norm": 0.014905404299497604, "learning_rate": 1.2279204754460493e-05, "loss": 0.0026, "step": 173600 }, { "epoch": 7.953023591222829, "grad_norm": 0.04339270293712616, "learning_rate": 1.222679744411731e-05, "loss": 0.0031, "step": 173700 }, { "epoch": 7.957602188569532, "grad_norm": 0.10109388083219528, "learning_rate": 1.2174486626188586e-05, "loss": 0.0033, "step": 173800 }, { "epoch": 7.962180785916234, "grad_norm": 0.018510516732931137, "learning_rate": 1.2122272434303344e-05, "loss": 0.0026, "step": 173900 }, { "epoch": 7.9667593832629375, "grad_norm": 0.014604040421545506, "learning_rate": 1.2070155001843835e-05, "loss": 0.0024, "step": 174000 }, { "epoch": 7.97133798060964, "grad_norm": 0.20794948935508728, "learning_rate": 1.2018134461945075e-05, "loss": 0.0033, "step": 174100 }, { "epoch": 7.975916577956343, "grad_norm": 0.06476528197526932, "learning_rate": 1.1966210947494583e-05, "loss": 0.0024, "step": 174200 }, { "epoch": 7.980495175303046, "grad_norm": 0.0063975718803703785, "learning_rate": 1.1914384591132044e-05, "loss": 0.0022, "step": 174300 }, { "epoch": 7.9850737726497485, "grad_norm": 0.03397635370492935, "learning_rate": 1.1862655525248945e-05, "loss": 0.0025, "step": 174400 }, { "epoch": 7.989652369996452, "grad_norm": 0.030696725472807884, "learning_rate": 1.1811023881988248e-05, "loss": 0.0021, "step": 174500 }, { "epoch": 7.994230967343155, "grad_norm": 0.08137042820453644, "learning_rate": 1.1759489793244022e-05, "loss": 0.0025, "step": 174600 }, { "epoch": 7.998809564689857, "grad_norm": 0.0656815618276596, "learning_rate": 1.1708053390661128e-05, "loss": 0.0026, "step": 174700 }, { "epoch": 8.0, "eval_loss": 0.17588233947753906, "eval_runtime": 260.0784, "eval_samples_per_second": 21.147, "eval_steps_per_second": 21.147, "step": 174726 }, { "epoch": 8.00338816203656, "grad_norm": 0.10640919208526611, "learning_rate": 1.1656714805634938e-05, "loss": 0.0018, "step": 174800 }, { "epoch": 8.007966759383264, "grad_norm": 0.0020934424828737974, "learning_rate": 1.1605474169310881e-05, "loss": 0.002, "step": 174900 }, { "epoch": 8.012545356729966, "grad_norm": 0.09055866301059723, "learning_rate": 1.1554331612584218e-05, "loss": 0.0017, "step": 175000 }, { "epoch": 8.017123954076668, "grad_norm": 0.49149322509765625, "learning_rate": 1.1503287266099666e-05, "loss": 0.0025, "step": 175100 }, { "epoch": 8.021702551423372, "grad_norm": 0.01625397428870201, "learning_rate": 1.145234126025102e-05, "loss": 0.0021, "step": 175200 }, { "epoch": 8.026281148770074, "grad_norm": 0.8061564564704895, "learning_rate": 1.1401493725180912e-05, "loss": 0.0015, "step": 175300 }, { "epoch": 8.030859746116777, "grad_norm": 0.6298221349716187, "learning_rate": 1.1350744790780388e-05, "loss": 0.0018, "step": 175400 }, { "epoch": 8.035438343463479, "grad_norm": 0.051574669778347015, "learning_rate": 1.130009458668863e-05, "loss": 0.0019, "step": 175500 }, { "epoch": 8.040016940810183, "grad_norm": 0.034144267439842224, "learning_rate": 1.1249543242292627e-05, "loss": 0.0019, "step": 175600 }, { "epoch": 8.044595538156885, "grad_norm": 0.05505882203578949, "learning_rate": 1.119909088672682e-05, "loss": 0.0019, "step": 175700 }, { "epoch": 8.049174135503588, "grad_norm": 0.01235408615320921, "learning_rate": 1.1148737648872759e-05, "loss": 0.0019, "step": 175800 }, { "epoch": 8.053752732850292, "grad_norm": 0.07047531008720398, "learning_rate": 1.1098483657358844e-05, "loss": 0.0017, "step": 175900 }, { "epoch": 8.058331330196994, "grad_norm": 0.048473093658685684, "learning_rate": 1.1048329040559896e-05, "loss": 0.0019, "step": 176000 }, { "epoch": 8.062909927543696, "grad_norm": 0.019425269216299057, "learning_rate": 1.0998273926596897e-05, "loss": 0.0015, "step": 176100 }, { "epoch": 8.0674885248904, "grad_norm": 0.0072584389708936214, "learning_rate": 1.094831844333667e-05, "loss": 0.0024, "step": 176200 }, { "epoch": 8.072067122237103, "grad_norm": 0.0020360236521810293, "learning_rate": 1.0898462718391523e-05, "loss": 0.0014, "step": 176300 }, { "epoch": 8.076645719583805, "grad_norm": 0.5871603488922119, "learning_rate": 1.0848706879118892e-05, "loss": 0.0019, "step": 176400 }, { "epoch": 8.08122431693051, "grad_norm": 0.13031832873821259, "learning_rate": 1.0799051052621106e-05, "loss": 0.0017, "step": 176500 }, { "epoch": 8.085802914277211, "grad_norm": 0.008929682895541191, "learning_rate": 1.074949536574496e-05, "loss": 0.0016, "step": 176600 }, { "epoch": 8.090381511623914, "grad_norm": 0.003812073729932308, "learning_rate": 1.0700039945081498e-05, "loss": 0.0017, "step": 176700 }, { "epoch": 8.094960108970616, "grad_norm": 0.011707616969943047, "learning_rate": 1.0650684916965559e-05, "loss": 0.0016, "step": 176800 }, { "epoch": 8.09953870631732, "grad_norm": 0.037662629038095474, "learning_rate": 1.0601430407475582e-05, "loss": 0.002, "step": 176900 }, { "epoch": 8.104117303664022, "grad_norm": 0.012711996212601662, "learning_rate": 1.0552276542433237e-05, "loss": 0.0015, "step": 177000 }, { "epoch": 8.108695901010725, "grad_norm": 0.19156889617443085, "learning_rate": 1.0503223447403032e-05, "loss": 0.0011, "step": 177100 }, { "epoch": 8.113274498357429, "grad_norm": 0.010708093643188477, "learning_rate": 1.0454271247692137e-05, "loss": 0.0013, "step": 177200 }, { "epoch": 8.117853095704131, "grad_norm": 0.02960583008825779, "learning_rate": 1.040542006834992e-05, "loss": 0.0024, "step": 177300 }, { "epoch": 8.122431693050833, "grad_norm": 0.1249750480055809, "learning_rate": 1.0356670034167698e-05, "loss": 0.0015, "step": 177400 }, { "epoch": 8.127010290397537, "grad_norm": 0.0189303457736969, "learning_rate": 1.0308021269678442e-05, "loss": 0.0021, "step": 177500 }, { "epoch": 8.13158888774424, "grad_norm": 0.004004355985671282, "learning_rate": 1.025947389915643e-05, "loss": 0.0025, "step": 177600 }, { "epoch": 8.136167485090942, "grad_norm": 0.021703239530324936, "learning_rate": 1.0211028046616866e-05, "loss": 0.0012, "step": 177700 }, { "epoch": 8.140746082437646, "grad_norm": 0.41312482953071594, "learning_rate": 1.0162683835815705e-05, "loss": 0.0013, "step": 177800 }, { "epoch": 8.145324679784348, "grad_norm": 0.021725183352828026, "learning_rate": 1.0114441390249202e-05, "loss": 0.0012, "step": 177900 }, { "epoch": 8.14990327713105, "grad_norm": 0.33544811606407166, "learning_rate": 1.0066300833153647e-05, "loss": 0.002, "step": 178000 }, { "epoch": 8.154481874477753, "grad_norm": 0.024289660155773163, "learning_rate": 1.0018262287505086e-05, "loss": 0.0023, "step": 178100 }, { "epoch": 8.159060471824457, "grad_norm": 0.49725693464279175, "learning_rate": 9.970325876018982e-06, "loss": 0.002, "step": 178200 }, { "epoch": 8.16363906917116, "grad_norm": 0.018485499545931816, "learning_rate": 9.922491721149845e-06, "loss": 0.0019, "step": 178300 }, { "epoch": 8.168217666517862, "grad_norm": 0.009344914928078651, "learning_rate": 9.874759945091016e-06, "loss": 0.0016, "step": 178400 }, { "epoch": 8.172796263864566, "grad_norm": 0.019952520728111267, "learning_rate": 9.82713066977427e-06, "loss": 0.0012, "step": 178500 }, { "epoch": 8.177374861211268, "grad_norm": 0.5553386211395264, "learning_rate": 9.77960401686958e-06, "loss": 0.0019, "step": 178600 }, { "epoch": 8.18195345855797, "grad_norm": 0.009466302581131458, "learning_rate": 9.732180107784727e-06, "loss": 0.0022, "step": 178700 }, { "epoch": 8.186532055904674, "grad_norm": 0.5055824518203735, "learning_rate": 9.684859063665059e-06, "loss": 0.0017, "step": 178800 }, { "epoch": 8.191110653251377, "grad_norm": 0.38719162344932556, "learning_rate": 9.637641005393167e-06, "loss": 0.002, "step": 178900 }, { "epoch": 8.195689250598079, "grad_norm": 0.0033107008785009384, "learning_rate": 9.590526053588505e-06, "loss": 0.0013, "step": 179000 }, { "epoch": 8.200267847944781, "grad_norm": 0.015472437255084515, "learning_rate": 9.543514328607212e-06, "loss": 0.0019, "step": 179100 }, { "epoch": 8.204846445291485, "grad_norm": 0.004773670807480812, "learning_rate": 9.496605950541676e-06, "loss": 0.002, "step": 179200 }, { "epoch": 8.209425042638188, "grad_norm": 0.0060659064911305904, "learning_rate": 9.44980103922029e-06, "loss": 0.0018, "step": 179300 }, { "epoch": 8.21400363998489, "grad_norm": 0.004397235810756683, "learning_rate": 9.403099714207175e-06, "loss": 0.0017, "step": 179400 }, { "epoch": 8.218582237331594, "grad_norm": 0.004803112708032131, "learning_rate": 9.356502094801816e-06, "loss": 0.0015, "step": 179500 }, { "epoch": 8.223160834678296, "grad_norm": 0.0035059794317930937, "learning_rate": 9.310008300038758e-06, "loss": 0.0018, "step": 179600 }, { "epoch": 8.227739432024999, "grad_norm": 0.025477442890405655, "learning_rate": 9.263618448687377e-06, "loss": 0.002, "step": 179700 }, { "epoch": 8.232318029371703, "grad_norm": 0.3329303562641144, "learning_rate": 9.217332659251477e-06, "loss": 0.0018, "step": 179800 }, { "epoch": 8.236896626718405, "grad_norm": 0.2675701379776001, "learning_rate": 9.171151049969029e-06, "loss": 0.0012, "step": 179900 }, { "epoch": 8.241475224065107, "grad_norm": 1.2457773685455322, "learning_rate": 9.125073738811918e-06, "loss": 0.0019, "step": 180000 }, { "epoch": 8.246053821411811, "grad_norm": 0.1400783210992813, "learning_rate": 9.079100843485578e-06, "loss": 0.0021, "step": 180100 }, { "epoch": 8.250632418758514, "grad_norm": 0.025368591770529747, "learning_rate": 9.033232481428678e-06, "loss": 0.0018, "step": 180200 }, { "epoch": 8.255211016105216, "grad_norm": 0.0014903460396453738, "learning_rate": 8.987468769812912e-06, "loss": 0.0014, "step": 180300 }, { "epoch": 8.259789613451918, "grad_norm": 0.22623829543590546, "learning_rate": 8.941809825542596e-06, "loss": 0.0025, "step": 180400 }, { "epoch": 8.264368210798622, "grad_norm": 0.017613211646676064, "learning_rate": 8.896255765254424e-06, "loss": 0.0012, "step": 180500 }, { "epoch": 8.268946808145325, "grad_norm": 0.005598566494882107, "learning_rate": 8.850806705317183e-06, "loss": 0.001, "step": 180600 }, { "epoch": 8.273525405492027, "grad_norm": 0.17524650692939758, "learning_rate": 8.805462761831418e-06, "loss": 0.001, "step": 180700 }, { "epoch": 8.278104002838731, "grad_norm": 0.03338591754436493, "learning_rate": 8.760224050629162e-06, "loss": 0.0014, "step": 180800 }, { "epoch": 8.282682600185433, "grad_norm": 0.017168212682008743, "learning_rate": 8.715090687273614e-06, "loss": 0.001, "step": 180900 }, { "epoch": 8.287261197532136, "grad_norm": 0.09427805244922638, "learning_rate": 8.67006278705888e-06, "loss": 0.0013, "step": 181000 }, { "epoch": 8.29183979487884, "grad_norm": 0.0094602657482028, "learning_rate": 8.625140465009635e-06, "loss": 0.0013, "step": 181100 }, { "epoch": 8.296418392225542, "grad_norm": 0.06793930381536484, "learning_rate": 8.58032383588086e-06, "loss": 0.0018, "step": 181200 }, { "epoch": 8.300996989572244, "grad_norm": 0.08039774000644684, "learning_rate": 8.535613014157557e-06, "loss": 0.0019, "step": 181300 }, { "epoch": 8.305575586918948, "grad_norm": 0.03726482763886452, "learning_rate": 8.491008114054439e-06, "loss": 0.0021, "step": 181400 }, { "epoch": 8.31015418426565, "grad_norm": 0.10031867027282715, "learning_rate": 8.446509249515605e-06, "loss": 0.0021, "step": 181500 }, { "epoch": 8.314732781612353, "grad_norm": 0.38206222653388977, "learning_rate": 8.402116534214338e-06, "loss": 0.0021, "step": 181600 }, { "epoch": 8.319311378959055, "grad_norm": 0.05652381107211113, "learning_rate": 8.35783008155272e-06, "loss": 0.0009, "step": 181700 }, { "epoch": 8.32388997630576, "grad_norm": 0.0731114000082016, "learning_rate": 8.313650004661383e-06, "loss": 0.0016, "step": 181800 }, { "epoch": 8.328468573652462, "grad_norm": 0.43218135833740234, "learning_rate": 8.26957641639924e-06, "loss": 0.0024, "step": 181900 }, { "epoch": 8.333047170999164, "grad_norm": 0.08536510914564133, "learning_rate": 8.225609429353187e-06, "loss": 0.0021, "step": 182000 }, { "epoch": 8.337625768345868, "grad_norm": 0.011019705794751644, "learning_rate": 8.181749155837754e-06, "loss": 0.0016, "step": 182100 }, { "epoch": 8.34220436569257, "grad_norm": 0.040587395429611206, "learning_rate": 8.137995707894942e-06, "loss": 0.0018, "step": 182200 }, { "epoch": 8.346782963039272, "grad_norm": 0.0023947455920279026, "learning_rate": 8.094349197293793e-06, "loss": 0.0015, "step": 182300 }, { "epoch": 8.351361560385977, "grad_norm": 0.007556082680821419, "learning_rate": 8.050809735530207e-06, "loss": 0.0016, "step": 182400 }, { "epoch": 8.355940157732679, "grad_norm": 0.11117005348205566, "learning_rate": 8.007377433826634e-06, "loss": 0.0016, "step": 182500 }, { "epoch": 8.360518755079381, "grad_norm": 0.0016330329235643148, "learning_rate": 7.964052403131773e-06, "loss": 0.0013, "step": 182600 }, { "epoch": 8.365097352426083, "grad_norm": 0.4123118221759796, "learning_rate": 7.920834754120304e-06, "loss": 0.0021, "step": 182700 }, { "epoch": 8.369675949772788, "grad_norm": 0.014765871688723564, "learning_rate": 7.877724597192582e-06, "loss": 0.0022, "step": 182800 }, { "epoch": 8.37425454711949, "grad_norm": 0.004433237481862307, "learning_rate": 7.834722042474374e-06, "loss": 0.0012, "step": 182900 }, { "epoch": 8.378833144466192, "grad_norm": 0.0037168385460972786, "learning_rate": 7.791827199816593e-06, "loss": 0.0016, "step": 183000 }, { "epoch": 8.383411741812896, "grad_norm": 0.04149395972490311, "learning_rate": 7.74904017879497e-06, "loss": 0.0029, "step": 183100 }, { "epoch": 8.387990339159598, "grad_norm": 0.011970234103500843, "learning_rate": 7.70636108870983e-06, "loss": 0.0022, "step": 183200 }, { "epoch": 8.3925689365063, "grad_norm": 0.049423061311244965, "learning_rate": 7.663790038585793e-06, "loss": 0.0021, "step": 183300 }, { "epoch": 8.397147533853005, "grad_norm": 0.029166920110583305, "learning_rate": 7.621327137171447e-06, "loss": 0.0015, "step": 183400 }, { "epoch": 8.401726131199707, "grad_norm": 0.029471127316355705, "learning_rate": 7.5789724929391625e-06, "loss": 0.0019, "step": 183500 }, { "epoch": 8.40630472854641, "grad_norm": 0.039268478751182556, "learning_rate": 7.536726214084722e-06, "loss": 0.0019, "step": 183600 }, { "epoch": 8.410883325893113, "grad_norm": 0.4737110137939453, "learning_rate": 7.494588408527103e-06, "loss": 0.0018, "step": 183700 }, { "epoch": 8.415461923239816, "grad_norm": 0.03173527121543884, "learning_rate": 7.4525591839081865e-06, "loss": 0.0019, "step": 183800 }, { "epoch": 8.420040520586518, "grad_norm": 0.013487137854099274, "learning_rate": 7.4106386475925046e-06, "loss": 0.0013, "step": 183900 }, { "epoch": 8.42461911793322, "grad_norm": 0.0010746048064902425, "learning_rate": 7.368826906666887e-06, "loss": 0.0019, "step": 184000 }, { "epoch": 8.429197715279924, "grad_norm": 0.01748150959610939, "learning_rate": 7.327124067940311e-06, "loss": 0.0025, "step": 184100 }, { "epoch": 8.433776312626627, "grad_norm": 0.0159548781812191, "learning_rate": 7.285530237943505e-06, "loss": 0.0022, "step": 184200 }, { "epoch": 8.438354909973329, "grad_norm": 0.0013540086802095175, "learning_rate": 7.24404552292875e-06, "loss": 0.0012, "step": 184300 }, { "epoch": 8.442933507320033, "grad_norm": 0.12859028577804565, "learning_rate": 7.202670028869601e-06, "loss": 0.002, "step": 184400 }, { "epoch": 8.447512104666735, "grad_norm": 0.006918368861079216, "learning_rate": 7.161403861460614e-06, "loss": 0.0014, "step": 184500 }, { "epoch": 8.452090702013438, "grad_norm": 0.014983629807829857, "learning_rate": 7.1202471261170245e-06, "loss": 0.0016, "step": 184600 }, { "epoch": 8.456669299360142, "grad_norm": 0.03064214624464512, "learning_rate": 7.079199927974584e-06, "loss": 0.0021, "step": 184700 }, { "epoch": 8.461247896706844, "grad_norm": 0.030842667445540428, "learning_rate": 7.038262371889159e-06, "loss": 0.0012, "step": 184800 }, { "epoch": 8.465826494053546, "grad_norm": 0.0024680488277226686, "learning_rate": 6.997434562436606e-06, "loss": 0.002, "step": 184900 }, { "epoch": 8.470405091400249, "grad_norm": 0.06068078801035881, "learning_rate": 6.956716603912361e-06, "loss": 0.0021, "step": 185000 }, { "epoch": 8.474983688746953, "grad_norm": 0.5528777241706848, "learning_rate": 6.9161086003312945e-06, "loss": 0.0015, "step": 185100 }, { "epoch": 8.479562286093655, "grad_norm": 0.05493824928998947, "learning_rate": 6.875610655427389e-06, "loss": 0.0017, "step": 185200 }, { "epoch": 8.484140883440357, "grad_norm": 0.07727139443159103, "learning_rate": 6.83522287265344e-06, "loss": 0.0017, "step": 185300 }, { "epoch": 8.488719480787061, "grad_norm": 0.5921161770820618, "learning_rate": 6.794945355180893e-06, "loss": 0.0019, "step": 185400 }, { "epoch": 8.493298078133764, "grad_norm": 0.1638752669095993, "learning_rate": 6.754778205899465e-06, "loss": 0.0011, "step": 185500 }, { "epoch": 8.497876675480466, "grad_norm": 0.0014271615073084831, "learning_rate": 6.714721527416956e-06, "loss": 0.0017, "step": 185600 }, { "epoch": 8.50245527282717, "grad_norm": 0.010398001410067081, "learning_rate": 6.674775422058965e-06, "loss": 0.0024, "step": 185700 }, { "epoch": 8.507033870173872, "grad_norm": 0.019598359242081642, "learning_rate": 6.63493999186865e-06, "loss": 0.0024, "step": 185800 }, { "epoch": 8.511612467520575, "grad_norm": 0.00348674226552248, "learning_rate": 6.595215338606397e-06, "loss": 0.0012, "step": 185900 }, { "epoch": 8.516191064867279, "grad_norm": 0.0019242248963564634, "learning_rate": 6.555601563749675e-06, "loss": 0.0012, "step": 186000 }, { "epoch": 8.520769662213981, "grad_norm": 0.008147502318024635, "learning_rate": 6.516098768492662e-06, "loss": 0.0015, "step": 186100 }, { "epoch": 8.525348259560683, "grad_norm": 0.04510408639907837, "learning_rate": 6.47670705374604e-06, "loss": 0.001, "step": 186200 }, { "epoch": 8.529926856907386, "grad_norm": 0.011768829077482224, "learning_rate": 6.437426520136758e-06, "loss": 0.0019, "step": 186300 }, { "epoch": 8.53450545425409, "grad_norm": 0.049900226294994354, "learning_rate": 6.398257268007746e-06, "loss": 0.001, "step": 186400 }, { "epoch": 8.539084051600792, "grad_norm": 0.030545897781848907, "learning_rate": 6.359199397417637e-06, "loss": 0.0019, "step": 186500 }, { "epoch": 8.543662648947494, "grad_norm": 0.007900966331362724, "learning_rate": 6.320253008140575e-06, "loss": 0.0018, "step": 186600 }, { "epoch": 8.548241246294198, "grad_norm": 0.002196391811594367, "learning_rate": 6.281418199665884e-06, "loss": 0.002, "step": 186700 }, { "epoch": 8.5528198436409, "grad_norm": 0.08688097447156906, "learning_rate": 6.242695071197896e-06, "loss": 0.0014, "step": 186800 }, { "epoch": 8.557398440987603, "grad_norm": 0.06626435369253159, "learning_rate": 6.204083721655607e-06, "loss": 0.0017, "step": 186900 }, { "epoch": 8.561977038334307, "grad_norm": 0.08788962662220001, "learning_rate": 6.165584249672507e-06, "loss": 0.0016, "step": 187000 }, { "epoch": 8.56655563568101, "grad_norm": 0.021715328097343445, "learning_rate": 6.127196753596287e-06, "loss": 0.0017, "step": 187100 }, { "epoch": 8.571134233027712, "grad_norm": 0.0046193236485123634, "learning_rate": 6.088921331488568e-06, "loss": 0.001, "step": 187200 }, { "epoch": 8.575712830374414, "grad_norm": 0.03280609846115112, "learning_rate": 6.050758081124719e-06, "loss": 0.0021, "step": 187300 }, { "epoch": 8.580291427721118, "grad_norm": 0.01722414791584015, "learning_rate": 6.012707099993525e-06, "loss": 0.0015, "step": 187400 }, { "epoch": 8.58487002506782, "grad_norm": 0.003511949675157666, "learning_rate": 5.974768485296977e-06, "loss": 0.0019, "step": 187500 }, { "epoch": 8.589448622414523, "grad_norm": 0.1540764719247818, "learning_rate": 5.936942333950063e-06, "loss": 0.0022, "step": 187600 }, { "epoch": 8.594027219761227, "grad_norm": 0.013510748744010925, "learning_rate": 5.8992287425804485e-06, "loss": 0.0012, "step": 187700 }, { "epoch": 8.598605817107929, "grad_norm": 0.07007778435945511, "learning_rate": 5.861627807528264e-06, "loss": 0.001, "step": 187800 }, { "epoch": 8.603184414454631, "grad_norm": 0.010667093098163605, "learning_rate": 5.82413962484587e-06, "loss": 0.0015, "step": 187900 }, { "epoch": 8.607763011801335, "grad_norm": 0.024145985022187233, "learning_rate": 5.7867642902975975e-06, "loss": 0.0025, "step": 188000 }, { "epoch": 8.612341609148038, "grad_norm": 0.12270953506231308, "learning_rate": 5.749501899359477e-06, "loss": 0.0019, "step": 188100 }, { "epoch": 8.61692020649474, "grad_norm": 0.36917853355407715, "learning_rate": 5.712352547219058e-06, "loss": 0.0018, "step": 188200 }, { "epoch": 8.621498803841444, "grad_norm": 0.8480884432792664, "learning_rate": 5.675316328775126e-06, "loss": 0.0023, "step": 188300 }, { "epoch": 8.626077401188146, "grad_norm": 0.009566806256771088, "learning_rate": 5.638393338637432e-06, "loss": 0.0018, "step": 188400 }, { "epoch": 8.630655998534849, "grad_norm": 0.15766866505146027, "learning_rate": 5.601583671126531e-06, "loss": 0.0015, "step": 188500 }, { "epoch": 8.635234595881553, "grad_norm": 0.003668803023174405, "learning_rate": 5.5648874202734565e-06, "loss": 0.0014, "step": 188600 }, { "epoch": 8.639813193228255, "grad_norm": 0.01124663557857275, "learning_rate": 5.528304679819513e-06, "loss": 0.0012, "step": 188700 }, { "epoch": 8.644391790574957, "grad_norm": 0.06179165840148926, "learning_rate": 5.4918355432160726e-06, "loss": 0.0013, "step": 188800 }, { "epoch": 8.64897038792166, "grad_norm": 0.1397646963596344, "learning_rate": 5.455480103624283e-06, "loss": 0.0018, "step": 188900 }, { "epoch": 8.653548985268364, "grad_norm": 0.0030619765166193247, "learning_rate": 5.41923845391486e-06, "loss": 0.002, "step": 189000 }, { "epoch": 8.658127582615066, "grad_norm": 0.015350698493421078, "learning_rate": 5.383110686667831e-06, "loss": 0.0018, "step": 189100 }, { "epoch": 8.662706179961768, "grad_norm": 0.6472819447517395, "learning_rate": 5.347096894172304e-06, "loss": 0.0014, "step": 189200 }, { "epoch": 8.667284777308472, "grad_norm": 1.0994236469268799, "learning_rate": 5.3111971684262574e-06, "loss": 0.0017, "step": 189300 }, { "epoch": 8.671863374655175, "grad_norm": 0.010606258176267147, "learning_rate": 5.275411601136254e-06, "loss": 0.0016, "step": 189400 }, { "epoch": 8.676441972001877, "grad_norm": 0.0032367429230362177, "learning_rate": 5.239740283717265e-06, "loss": 0.002, "step": 189500 }, { "epoch": 8.68102056934858, "grad_norm": 0.0026265005581080914, "learning_rate": 5.20418330729241e-06, "loss": 0.001, "step": 189600 }, { "epoch": 8.685599166695283, "grad_norm": 0.05965089425444603, "learning_rate": 5.168740762692681e-06, "loss": 0.0016, "step": 189700 }, { "epoch": 8.690177764041985, "grad_norm": 0.040079813450574875, "learning_rate": 5.133412740456806e-06, "loss": 0.0022, "step": 189800 }, { "epoch": 8.694756361388688, "grad_norm": 0.06221432238817215, "learning_rate": 5.098199330830922e-06, "loss": 0.002, "step": 189900 }, { "epoch": 8.699334958735392, "grad_norm": 0.013662228360772133, "learning_rate": 5.063100623768391e-06, "loss": 0.0013, "step": 190000 }, { "epoch": 8.703913556082094, "grad_norm": 0.00042499735718593, "learning_rate": 5.028116708929587e-06, "loss": 0.0017, "step": 190100 }, { "epoch": 8.708492153428796, "grad_norm": 0.6862035989761353, "learning_rate": 4.993247675681639e-06, "loss": 0.0019, "step": 190200 }, { "epoch": 8.7130707507755, "grad_norm": 0.10454216599464417, "learning_rate": 4.958493613098186e-06, "loss": 0.0017, "step": 190300 }, { "epoch": 8.717649348122203, "grad_norm": 0.26306042075157166, "learning_rate": 4.9238546099592e-06, "loss": 0.0013, "step": 190400 }, { "epoch": 8.722227945468905, "grad_norm": 0.026483699679374695, "learning_rate": 4.8893307547507205e-06, "loss": 0.0016, "step": 190500 }, { "epoch": 8.72680654281561, "grad_norm": 0.033151958137750626, "learning_rate": 4.854922135664619e-06, "loss": 0.002, "step": 190600 }, { "epoch": 8.731385140162311, "grad_norm": 0.03364422544836998, "learning_rate": 4.820628840598423e-06, "loss": 0.0018, "step": 190700 }, { "epoch": 8.735963737509014, "grad_norm": 0.004185411147773266, "learning_rate": 4.786450957155064e-06, "loss": 0.0021, "step": 190800 }, { "epoch": 8.740542334855718, "grad_norm": 0.007191179320216179, "learning_rate": 4.7523885726426355e-06, "loss": 0.0017, "step": 190900 }, { "epoch": 8.74512093220242, "grad_norm": 0.0175678301602602, "learning_rate": 4.71844177407419e-06, "loss": 0.002, "step": 191000 }, { "epoch": 8.749699529549122, "grad_norm": 0.20129109919071198, "learning_rate": 4.684610648167503e-06, "loss": 0.0017, "step": 191100 }, { "epoch": 8.754278126895825, "grad_norm": 0.22425204515457153, "learning_rate": 4.6508952813448965e-06, "loss": 0.0015, "step": 191200 }, { "epoch": 8.758856724242529, "grad_norm": 0.011632180772721767, "learning_rate": 4.617295759732937e-06, "loss": 0.0019, "step": 191300 }, { "epoch": 8.763435321589231, "grad_norm": 0.00452096201479435, "learning_rate": 4.5838121691623e-06, "loss": 0.0012, "step": 191400 }, { "epoch": 8.768013918935933, "grad_norm": 0.004986160434782505, "learning_rate": 4.550444595167502e-06, "loss": 0.0014, "step": 191500 }, { "epoch": 8.772592516282637, "grad_norm": 0.0067661721259355545, "learning_rate": 4.517193122986679e-06, "loss": 0.0013, "step": 191600 }, { "epoch": 8.77717111362934, "grad_norm": 0.06658513098955154, "learning_rate": 4.484057837561406e-06, "loss": 0.003, "step": 191700 }, { "epoch": 8.781749710976042, "grad_norm": 0.003857834730297327, "learning_rate": 4.4510388235364405e-06, "loss": 0.0015, "step": 191800 }, { "epoch": 8.786328308322746, "grad_norm": 0.00674202898517251, "learning_rate": 4.418136165259512e-06, "loss": 0.001, "step": 191900 }, { "epoch": 8.790906905669448, "grad_norm": 0.004305652808398008, "learning_rate": 4.385349946781136e-06, "loss": 0.0008, "step": 192000 }, { "epoch": 8.79548550301615, "grad_norm": 0.011842915788292885, "learning_rate": 4.352680251854391e-06, "loss": 0.0015, "step": 192100 }, { "epoch": 8.800064100362853, "grad_norm": 0.030167168006300926, "learning_rate": 4.320127163934657e-06, "loss": 0.0015, "step": 192200 }, { "epoch": 8.804642697709557, "grad_norm": 0.006344472989439964, "learning_rate": 4.2876907661794755e-06, "loss": 0.0016, "step": 192300 }, { "epoch": 8.80922129505626, "grad_norm": 0.8136438131332397, "learning_rate": 4.255371141448272e-06, "loss": 0.0015, "step": 192400 }, { "epoch": 8.813799892402962, "grad_norm": 0.03604700043797493, "learning_rate": 4.223168372302189e-06, "loss": 0.0019, "step": 192500 }, { "epoch": 8.818378489749666, "grad_norm": 0.023059792816638947, "learning_rate": 4.191082541003849e-06, "loss": 0.0009, "step": 192600 }, { "epoch": 8.822957087096368, "grad_norm": 0.04644302278757095, "learning_rate": 4.159113729517184e-06, "loss": 0.0023, "step": 192700 }, { "epoch": 8.82753568444307, "grad_norm": 0.038498032838106155, "learning_rate": 4.127262019507145e-06, "loss": 0.0017, "step": 192800 }, { "epoch": 8.832114281789774, "grad_norm": 0.010661243461072445, "learning_rate": 4.095527492339596e-06, "loss": 0.0017, "step": 192900 }, { "epoch": 8.836692879136477, "grad_norm": 0.03207453712821007, "learning_rate": 4.0639102290810135e-06, "loss": 0.0024, "step": 193000 }, { "epoch": 8.841271476483179, "grad_norm": 0.1272786557674408, "learning_rate": 4.032410310498358e-06, "loss": 0.0015, "step": 193100 }, { "epoch": 8.845850073829883, "grad_norm": 0.004953332711011171, "learning_rate": 4.001027817058789e-06, "loss": 0.0015, "step": 193200 }, { "epoch": 8.850428671176585, "grad_norm": 0.08756324648857117, "learning_rate": 3.969762828929547e-06, "loss": 0.0006, "step": 193300 }, { "epoch": 8.855007268523288, "grad_norm": 0.5247501134872437, "learning_rate": 3.938615425977676e-06, "loss": 0.0018, "step": 193400 }, { "epoch": 8.85958586586999, "grad_norm": 0.0369555726647377, "learning_rate": 3.907585687769838e-06, "loss": 0.0012, "step": 193500 }, { "epoch": 8.864164463216694, "grad_norm": 0.13189056515693665, "learning_rate": 3.876673693572147e-06, "loss": 0.0009, "step": 193600 }, { "epoch": 8.868743060563396, "grad_norm": 0.022357501089572906, "learning_rate": 3.84587952234991e-06, "loss": 0.001, "step": 193700 }, { "epoch": 8.873321657910099, "grad_norm": 0.0008908796007744968, "learning_rate": 3.815203252767463e-06, "loss": 0.001, "step": 193800 }, { "epoch": 8.877900255256803, "grad_norm": 0.003647018224000931, "learning_rate": 3.7846449631879667e-06, "loss": 0.0017, "step": 193900 }, { "epoch": 8.882478852603505, "grad_norm": 0.427442729473114, "learning_rate": 3.754204731673194e-06, "loss": 0.0018, "step": 194000 }, { "epoch": 8.887057449950207, "grad_norm": 0.07006958872079849, "learning_rate": 3.723882635983328e-06, "loss": 0.0018, "step": 194100 }, { "epoch": 8.891636047296911, "grad_norm": 0.18033552169799805, "learning_rate": 3.6936787535767903e-06, "loss": 0.002, "step": 194200 }, { "epoch": 8.896214644643614, "grad_norm": 0.6087344288825989, "learning_rate": 3.6635931616100073e-06, "loss": 0.0016, "step": 194300 }, { "epoch": 8.900793241990316, "grad_norm": 0.14380215108394623, "learning_rate": 3.6336259369372296e-06, "loss": 0.0019, "step": 194400 }, { "epoch": 8.905371839337018, "grad_norm": 0.09099259227514267, "learning_rate": 3.6037771561103496e-06, "loss": 0.0007, "step": 194500 }, { "epoch": 8.909950436683722, "grad_norm": 0.3938591480255127, "learning_rate": 3.5740468953786855e-06, "loss": 0.002, "step": 194600 }, { "epoch": 8.914529034030425, "grad_norm": 0.024904364719986916, "learning_rate": 3.544435230688792e-06, "loss": 0.0007, "step": 194700 }, { "epoch": 8.919107631377127, "grad_norm": 0.0034130678977817297, "learning_rate": 3.514942237684271e-06, "loss": 0.0015, "step": 194800 }, { "epoch": 8.923686228723831, "grad_norm": 0.007133205886930227, "learning_rate": 3.485567991705563e-06, "loss": 0.0012, "step": 194900 }, { "epoch": 8.928264826070533, "grad_norm": 0.0010082671651616693, "learning_rate": 3.4563125677897932e-06, "loss": 0.0016, "step": 195000 }, { "epoch": 8.932843423417236, "grad_norm": 0.011788592673838139, "learning_rate": 3.427176040670521e-06, "loss": 0.0023, "step": 195100 }, { "epoch": 8.93742202076394, "grad_norm": 0.11746617406606674, "learning_rate": 3.3981584847776026e-06, "loss": 0.0014, "step": 195200 }, { "epoch": 8.942000618110642, "grad_norm": 0.06418687850236893, "learning_rate": 3.369259974236988e-06, "loss": 0.0018, "step": 195300 }, { "epoch": 8.946579215457344, "grad_norm": 0.012977411039173603, "learning_rate": 3.340480582870503e-06, "loss": 0.0014, "step": 195400 }, { "epoch": 8.951157812804048, "grad_norm": 0.0007548317080363631, "learning_rate": 3.311820384195674e-06, "loss": 0.0013, "step": 195500 }, { "epoch": 8.95573641015075, "grad_norm": 0.004369072150439024, "learning_rate": 3.2832794514255803e-06, "loss": 0.0011, "step": 195600 }, { "epoch": 8.960315007497453, "grad_norm": 0.14467285573482513, "learning_rate": 3.2548578574686018e-06, "loss": 0.0016, "step": 195700 }, { "epoch": 8.964893604844155, "grad_norm": 0.018095914274454117, "learning_rate": 3.2265556749282834e-06, "loss": 0.0013, "step": 195800 }, { "epoch": 8.96947220219086, "grad_norm": 0.031725652515888214, "learning_rate": 3.198372976103137e-06, "loss": 0.0013, "step": 195900 }, { "epoch": 8.974050799537562, "grad_norm": 0.12283707410097122, "learning_rate": 3.1703098329864233e-06, "loss": 0.0019, "step": 196000 }, { "epoch": 8.978629396884264, "grad_norm": 0.0016571872401982546, "learning_rate": 3.1423663172660267e-06, "loss": 0.002, "step": 196100 }, { "epoch": 8.983207994230968, "grad_norm": 0.005055413115769625, "learning_rate": 3.114542500324219e-06, "loss": 0.001, "step": 196200 }, { "epoch": 8.98778659157767, "grad_norm": 0.008997324854135513, "learning_rate": 3.086838453237506e-06, "loss": 0.0007, "step": 196300 }, { "epoch": 8.992365188924373, "grad_norm": 0.42291346192359924, "learning_rate": 3.059254246776433e-06, "loss": 0.0017, "step": 196400 }, { "epoch": 8.996943786271077, "grad_norm": 0.23169781267642975, "learning_rate": 3.0317899514054336e-06, "loss": 0.0015, "step": 196500 }, { "epoch": 8.9999656605199, "eval_loss": 0.2418144792318344, "eval_runtime": 261.8983, "eval_samples_per_second": 21.001, "eval_steps_per_second": 21.001, "step": 196566 }, { "epoch": 9.001522383617779, "grad_norm": 0.08088653534650803, "learning_rate": 3.0044456372825992e-06, "loss": 0.0019, "step": 196600 }, { "epoch": 9.006100980964481, "grad_norm": 0.13099326193332672, "learning_rate": 2.9772213742595367e-06, "loss": 0.001, "step": 196700 }, { "epoch": 9.010679578311183, "grad_norm": 0.008611609227955341, "learning_rate": 2.950117231881183e-06, "loss": 0.0008, "step": 196800 }, { "epoch": 9.015258175657888, "grad_norm": 0.6876717209815979, "learning_rate": 2.923133279385615e-06, "loss": 0.0006, "step": 196900 }, { "epoch": 9.01983677300459, "grad_norm": 0.2837451100349426, "learning_rate": 2.8962695857038922e-06, "loss": 0.0009, "step": 197000 }, { "epoch": 9.024415370351292, "grad_norm": 0.012697268277406693, "learning_rate": 2.8695262194598615e-06, "loss": 0.0011, "step": 197100 }, { "epoch": 9.028993967697996, "grad_norm": 0.002202101983129978, "learning_rate": 2.8429032489700135e-06, "loss": 0.0007, "step": 197200 }, { "epoch": 9.033572565044699, "grad_norm": 0.00148550805170089, "learning_rate": 2.8164007422432583e-06, "loss": 0.001, "step": 197300 }, { "epoch": 9.0381511623914, "grad_norm": 0.002804514952003956, "learning_rate": 2.790018766980773e-06, "loss": 0.0008, "step": 197400 }, { "epoch": 9.042729759738105, "grad_norm": 0.004013043362647295, "learning_rate": 2.763757390575872e-06, "loss": 0.0013, "step": 197500 }, { "epoch": 9.047308357084807, "grad_norm": 0.3350330591201782, "learning_rate": 2.737616680113758e-06, "loss": 0.0013, "step": 197600 }, { "epoch": 9.05188695443151, "grad_norm": 0.2957717478275299, "learning_rate": 2.7115967023714215e-06, "loss": 0.0009, "step": 197700 }, { "epoch": 9.056465551778214, "grad_norm": 0.13251306116580963, "learning_rate": 2.6856975238174266e-06, "loss": 0.0008, "step": 197800 }, { "epoch": 9.061044149124916, "grad_norm": 0.0039755236357450485, "learning_rate": 2.6599192106117333e-06, "loss": 0.0021, "step": 197900 }, { "epoch": 9.065622746471618, "grad_norm": 0.040325064212083817, "learning_rate": 2.634261828605594e-06, "loss": 0.001, "step": 198000 }, { "epoch": 9.07020134381832, "grad_norm": 0.001606648089364171, "learning_rate": 2.608725443341292e-06, "loss": 0.0014, "step": 198100 }, { "epoch": 9.074779941165025, "grad_norm": 0.0010452588321641088, "learning_rate": 2.583310120052046e-06, "loss": 0.0012, "step": 198200 }, { "epoch": 9.079358538511727, "grad_norm": 0.06777454912662506, "learning_rate": 2.5580159236618162e-06, "loss": 0.0006, "step": 198300 }, { "epoch": 9.083937135858429, "grad_norm": 0.014749701134860516, "learning_rate": 2.5328429187851552e-06, "loss": 0.0012, "step": 198400 }, { "epoch": 9.088515733205133, "grad_norm": 0.0003846607287414372, "learning_rate": 2.507791169727003e-06, "loss": 0.0008, "step": 198500 }, { "epoch": 9.093094330551835, "grad_norm": 0.004392546135932207, "learning_rate": 2.4828607404825677e-06, "loss": 0.0006, "step": 198600 }, { "epoch": 9.097672927898538, "grad_norm": 0.006986264605075121, "learning_rate": 2.4580516947371348e-06, "loss": 0.001, "step": 198700 }, { "epoch": 9.102251525245242, "grad_norm": 0.009725336916744709, "learning_rate": 2.4333640958659143e-06, "loss": 0.0007, "step": 198800 }, { "epoch": 9.106830122591944, "grad_norm": 0.02167440392076969, "learning_rate": 2.408798006933882e-06, "loss": 0.001, "step": 198900 }, { "epoch": 9.111408719938646, "grad_norm": 0.054156869649887085, "learning_rate": 2.3843534906956123e-06, "loss": 0.0013, "step": 199000 }, { "epoch": 9.11598731728535, "grad_norm": 0.011062448844313622, "learning_rate": 2.3600306095951264e-06, "loss": 0.0013, "step": 199100 }, { "epoch": 9.120565914632053, "grad_norm": 0.0029075967613607645, "learning_rate": 2.335829425765712e-06, "loss": 0.0015, "step": 199200 }, { "epoch": 9.125144511978755, "grad_norm": 0.013693880289793015, "learning_rate": 2.311750001029783e-06, "loss": 0.0014, "step": 199300 }, { "epoch": 9.129723109325457, "grad_norm": 0.011990150436758995, "learning_rate": 2.2877923968987247e-06, "loss": 0.0011, "step": 199400 }, { "epoch": 9.134301706672161, "grad_norm": 0.01806030236184597, "learning_rate": 2.2639566745727205e-06, "loss": 0.0007, "step": 199500 }, { "epoch": 9.138880304018864, "grad_norm": 0.005009980872273445, "learning_rate": 2.2402428949406086e-06, "loss": 0.0007, "step": 199600 }, { "epoch": 9.143458901365566, "grad_norm": 0.031974907964468, "learning_rate": 2.216651118579727e-06, "loss": 0.0013, "step": 199700 }, { "epoch": 9.14803749871227, "grad_norm": 0.0008488456369377673, "learning_rate": 2.19318140575574e-06, "loss": 0.0009, "step": 199800 }, { "epoch": 9.152616096058972, "grad_norm": 0.12342657893896103, "learning_rate": 2.169833816422517e-06, "loss": 0.001, "step": 199900 }, { "epoch": 9.157194693405675, "grad_norm": 0.0888073593378067, "learning_rate": 2.1466084102219452e-06, "loss": 0.0011, "step": 200000 }, { "epoch": 9.161773290752379, "grad_norm": 0.0031123904045671225, "learning_rate": 2.123505246483787e-06, "loss": 0.0012, "step": 200100 }, { "epoch": 9.166351888099081, "grad_norm": 0.021208738908171654, "learning_rate": 2.100524384225555e-06, "loss": 0.001, "step": 200200 }, { "epoch": 9.170930485445783, "grad_norm": 0.025913584977388382, "learning_rate": 2.077665882152335e-06, "loss": 0.0012, "step": 200300 }, { "epoch": 9.175509082792486, "grad_norm": 0.16090908646583557, "learning_rate": 2.0549297986566186e-06, "loss": 0.0014, "step": 200400 }, { "epoch": 9.18008768013919, "grad_norm": 0.08480704575777054, "learning_rate": 2.032316191818212e-06, "loss": 0.0018, "step": 200500 }, { "epoch": 9.184666277485892, "grad_norm": 0.0889834314584732, "learning_rate": 2.009825119404024e-06, "loss": 0.0012, "step": 200600 }, { "epoch": 9.189244874832594, "grad_norm": 0.10864217579364777, "learning_rate": 1.9874566388679518e-06, "loss": 0.001, "step": 200700 }, { "epoch": 9.193823472179298, "grad_norm": 0.0008274565334431827, "learning_rate": 1.9652108073507425e-06, "loss": 0.0011, "step": 200800 }, { "epoch": 9.198402069526, "grad_norm": 0.002116286661475897, "learning_rate": 1.943087681679823e-06, "loss": 0.001, "step": 200900 }, { "epoch": 9.202980666872703, "grad_norm": 0.029289819300174713, "learning_rate": 1.9210873183691692e-06, "loss": 0.0007, "step": 201000 }, { "epoch": 9.207559264219407, "grad_norm": 0.006239714100956917, "learning_rate": 1.899209773619154e-06, "loss": 0.0011, "step": 201100 }, { "epoch": 9.21213786156611, "grad_norm": 0.002388751832768321, "learning_rate": 1.8774551033164112e-06, "loss": 0.0009, "step": 201200 }, { "epoch": 9.216716458912812, "grad_norm": 0.15311861038208008, "learning_rate": 1.8558233630336929e-06, "loss": 0.0011, "step": 201300 }, { "epoch": 9.221295056259516, "grad_norm": 0.20378436148166656, "learning_rate": 1.8343146080297135e-06, "loss": 0.0007, "step": 201400 }, { "epoch": 9.225873653606218, "grad_norm": 0.01194208487868309, "learning_rate": 1.8129288932490274e-06, "loss": 0.0008, "step": 201500 }, { "epoch": 9.23045225095292, "grad_norm": 0.002687977161258459, "learning_rate": 1.7916662733218847e-06, "loss": 0.001, "step": 201600 }, { "epoch": 9.235030848299623, "grad_norm": 0.0027624531649053097, "learning_rate": 1.7705268025640709e-06, "loss": 0.0005, "step": 201700 }, { "epoch": 9.239609445646327, "grad_norm": 0.04479651898145676, "learning_rate": 1.7495105349767948e-06, "loss": 0.0012, "step": 201800 }, { "epoch": 9.244188042993029, "grad_norm": 0.21192124485969543, "learning_rate": 1.7286175242465509e-06, "loss": 0.0012, "step": 201900 }, { "epoch": 9.248766640339731, "grad_norm": 0.0029038949869573116, "learning_rate": 1.7078478237449402e-06, "loss": 0.0008, "step": 202000 }, { "epoch": 9.253345237686435, "grad_norm": 0.009675376117229462, "learning_rate": 1.6872014865286057e-06, "loss": 0.0013, "step": 202100 }, { "epoch": 9.257923835033138, "grad_norm": 0.007381136529147625, "learning_rate": 1.6666785653390249e-06, "loss": 0.001, "step": 202200 }, { "epoch": 9.26250243237984, "grad_norm": 0.001989328535273671, "learning_rate": 1.6462791126024169e-06, "loss": 0.0007, "step": 202300 }, { "epoch": 9.267081029726544, "grad_norm": 0.0015792534686625004, "learning_rate": 1.6260031804296084e-06, "loss": 0.0008, "step": 202400 }, { "epoch": 9.271659627073246, "grad_norm": 0.0014649959048256278, "learning_rate": 1.6058508206158728e-06, "loss": 0.0008, "step": 202500 }, { "epoch": 9.276238224419949, "grad_norm": 0.017108794301748276, "learning_rate": 1.58582208464082e-06, "loss": 0.0009, "step": 202600 }, { "epoch": 9.280816821766653, "grad_norm": 0.0022038191091269255, "learning_rate": 1.5659170236682674e-06, "loss": 0.0007, "step": 202700 }, { "epoch": 9.285395419113355, "grad_norm": 0.002022168133407831, "learning_rate": 1.5461356885461075e-06, "loss": 0.0014, "step": 202800 }, { "epoch": 9.289974016460057, "grad_norm": 0.003931309096515179, "learning_rate": 1.5264781298061415e-06, "loss": 0.0013, "step": 202900 }, { "epoch": 9.29455261380676, "grad_norm": 0.13134440779685974, "learning_rate": 1.5069443976640284e-06, "loss": 0.0009, "step": 203000 }, { "epoch": 9.299131211153464, "grad_norm": 0.004310674965381622, "learning_rate": 1.4875345420190645e-06, "loss": 0.0012, "step": 203100 }, { "epoch": 9.303709808500166, "grad_norm": 0.00843301322311163, "learning_rate": 1.4682486124541373e-06, "loss": 0.0011, "step": 203200 }, { "epoch": 9.308288405846868, "grad_norm": 0.01970786415040493, "learning_rate": 1.4490866582355267e-06, "loss": 0.0015, "step": 203300 }, { "epoch": 9.312867003193572, "grad_norm": 0.012120225466787815, "learning_rate": 1.4300487283128495e-06, "loss": 0.0011, "step": 203400 }, { "epoch": 9.317445600540275, "grad_norm": 0.0033403183333575726, "learning_rate": 1.4111348713188866e-06, "loss": 0.0007, "step": 203500 }, { "epoch": 9.322024197886977, "grad_norm": 0.07732047885656357, "learning_rate": 1.3923451355694617e-06, "loss": 0.0008, "step": 203600 }, { "epoch": 9.326602795233681, "grad_norm": 0.004604123532772064, "learning_rate": 1.3736795690633354e-06, "loss": 0.0011, "step": 203700 }, { "epoch": 9.331181392580383, "grad_norm": 0.003377101384103298, "learning_rate": 1.3551382194820884e-06, "loss": 0.0008, "step": 203800 }, { "epoch": 9.335759989927086, "grad_norm": 0.0024128479417413473, "learning_rate": 1.3367211341899667e-06, "loss": 0.0009, "step": 203900 }, { "epoch": 9.340338587273788, "grad_norm": 0.01782609149813652, "learning_rate": 1.3184283602337865e-06, "loss": 0.001, "step": 204000 }, { "epoch": 9.344917184620492, "grad_norm": 0.0036396984942257404, "learning_rate": 1.3002599443428243e-06, "loss": 0.0009, "step": 204100 }, { "epoch": 9.349495781967194, "grad_norm": 0.08581870794296265, "learning_rate": 1.2822159329286598e-06, "loss": 0.0009, "step": 204200 }, { "epoch": 9.354074379313897, "grad_norm": 0.044847775250673294, "learning_rate": 1.264296372085083e-06, "loss": 0.0011, "step": 204300 }, { "epoch": 9.3586529766606, "grad_norm": 0.001584995654411614, "learning_rate": 1.2465013075879883e-06, "loss": 0.0018, "step": 204400 }, { "epoch": 9.363231574007303, "grad_norm": 0.009455603547394276, "learning_rate": 1.2288307848952186e-06, "loss": 0.0007, "step": 204500 }, { "epoch": 9.367810171354005, "grad_norm": 0.08216769993305206, "learning_rate": 1.2112848491464824e-06, "loss": 0.0012, "step": 204600 }, { "epoch": 9.37238876870071, "grad_norm": 0.00180336635094136, "learning_rate": 1.1938635451632429e-06, "loss": 0.0013, "step": 204700 }, { "epoch": 9.376967366047412, "grad_norm": 0.007049913518130779, "learning_rate": 1.1765669174485684e-06, "loss": 0.0008, "step": 204800 }, { "epoch": 9.381545963394114, "grad_norm": 0.07437339425086975, "learning_rate": 1.1593950101870422e-06, "loss": 0.0006, "step": 204900 }, { "epoch": 9.386124560740818, "grad_norm": 0.0391901396214962, "learning_rate": 1.1423478672446586e-06, "loss": 0.001, "step": 205000 }, { "epoch": 9.39070315808752, "grad_norm": 0.16931991279125214, "learning_rate": 1.1254255321686836e-06, "loss": 0.0012, "step": 205100 }, { "epoch": 9.395281755434223, "grad_norm": 0.0040185777470469475, "learning_rate": 1.1086280481875654e-06, "loss": 0.0012, "step": 205200 }, { "epoch": 9.399860352780925, "grad_norm": 0.1352093517780304, "learning_rate": 1.0919554582108249e-06, "loss": 0.0005, "step": 205300 }, { "epoch": 9.404438950127629, "grad_norm": 0.11504676938056946, "learning_rate": 1.0754078048289374e-06, "loss": 0.0006, "step": 205400 }, { "epoch": 9.409017547474331, "grad_norm": 0.015873286873102188, "learning_rate": 1.0589851303132114e-06, "loss": 0.0017, "step": 205500 }, { "epoch": 9.413596144821033, "grad_norm": 0.12407149374485016, "learning_rate": 1.0426874766157003e-06, "loss": 0.001, "step": 205600 }, { "epoch": 9.418174742167738, "grad_norm": 0.002775526139885187, "learning_rate": 1.0265148853691009e-06, "loss": 0.0017, "step": 205700 }, { "epoch": 9.42275333951444, "grad_norm": 0.15169379115104675, "learning_rate": 1.0104673978866164e-06, "loss": 0.0011, "step": 205800 }, { "epoch": 9.427331936861142, "grad_norm": 0.007750590797513723, "learning_rate": 9.945450551618884e-07, "loss": 0.001, "step": 205900 }, { "epoch": 9.431910534207846, "grad_norm": 0.0007150355377234519, "learning_rate": 9.787478978688646e-07, "loss": 0.001, "step": 206000 }, { "epoch": 9.436489131554548, "grad_norm": 0.01111397985368967, "learning_rate": 9.630759663616983e-07, "loss": 0.0011, "step": 206100 }, { "epoch": 9.44106772890125, "grad_norm": 0.027570601552724838, "learning_rate": 9.475293006746711e-07, "loss": 0.0007, "step": 206200 }, { "epoch": 9.445646326247953, "grad_norm": 0.24770981073379517, "learning_rate": 9.321079405220423e-07, "loss": 0.0008, "step": 206300 }, { "epoch": 9.450224923594657, "grad_norm": 0.003402331378310919, "learning_rate": 9.168119252979946e-07, "loss": 0.0005, "step": 206400 }, { "epoch": 9.45480352094136, "grad_norm": 0.0008960131090134382, "learning_rate": 9.016412940765106e-07, "loss": 0.0007, "step": 206500 }, { "epoch": 9.459382118288062, "grad_norm": 0.02515571191906929, "learning_rate": 8.865960856112799e-07, "loss": 0.0015, "step": 206600 }, { "epoch": 9.463960715634766, "grad_norm": 0.1351311206817627, "learning_rate": 8.716763383355864e-07, "loss": 0.0013, "step": 206700 }, { "epoch": 9.468539312981468, "grad_norm": 0.0024123205803334713, "learning_rate": 8.568820903622376e-07, "loss": 0.0012, "step": 206800 }, { "epoch": 9.47311791032817, "grad_norm": 0.010249449871480465, "learning_rate": 8.422133794834363e-07, "loss": 0.001, "step": 206900 }, { "epoch": 9.477696507674874, "grad_norm": 0.006125771440565586, "learning_rate": 8.276702431706973e-07, "loss": 0.0009, "step": 207000 }, { "epoch": 9.482275105021577, "grad_norm": 0.0066106487065553665, "learning_rate": 8.132527185747641e-07, "loss": 0.0008, "step": 207100 }, { "epoch": 9.486853702368279, "grad_norm": 0.010087539441883564, "learning_rate": 7.989608425254924e-07, "loss": 0.001, "step": 207200 }, { "epoch": 9.491432299714983, "grad_norm": 0.06039687991142273, "learning_rate": 7.847946515317839e-07, "loss": 0.0011, "step": 207300 }, { "epoch": 9.496010897061685, "grad_norm": 0.0053437924943864346, "learning_rate": 7.707541817814468e-07, "loss": 0.001, "step": 207400 }, { "epoch": 9.500589494408388, "grad_norm": 0.0029611322097480297, "learning_rate": 7.568394691411462e-07, "loss": 0.0005, "step": 207500 }, { "epoch": 9.50516809175509, "grad_norm": 0.007037085480988026, "learning_rate": 7.4305054915631e-07, "loss": 0.0007, "step": 207600 }, { "epoch": 9.509746689101794, "grad_norm": 0.0024143033660948277, "learning_rate": 7.293874570510062e-07, "loss": 0.001, "step": 207700 }, { "epoch": 9.514325286448496, "grad_norm": 0.0061892117373645306, "learning_rate": 7.158502277278823e-07, "loss": 0.0011, "step": 207800 }, { "epoch": 9.518903883795199, "grad_norm": 0.006517268251627684, "learning_rate": 7.024388957680705e-07, "loss": 0.0013, "step": 207900 }, { "epoch": 9.523482481141903, "grad_norm": 0.02947130799293518, "learning_rate": 6.891534954310885e-07, "loss": 0.0005, "step": 208000 }, { "epoch": 9.528061078488605, "grad_norm": 0.009931573644280434, "learning_rate": 6.75994060654761e-07, "loss": 0.0009, "step": 208100 }, { "epoch": 9.532639675835307, "grad_norm": 0.0006516918656416237, "learning_rate": 6.629606250551368e-07, "loss": 0.0006, "step": 208200 }, { "epoch": 9.537218273182011, "grad_norm": 0.0033905524760484695, "learning_rate": 6.500532219263833e-07, "loss": 0.0008, "step": 208300 }, { "epoch": 9.541796870528714, "grad_norm": 0.035781797021627426, "learning_rate": 6.372718842407255e-07, "loss": 0.001, "step": 208400 }, { "epoch": 9.546375467875416, "grad_norm": 0.006109519395977259, "learning_rate": 6.24616644648357e-07, "loss": 0.0013, "step": 208500 }, { "epoch": 9.550954065222118, "grad_norm": 0.10211105644702911, "learning_rate": 6.120875354773459e-07, "loss": 0.0008, "step": 208600 }, { "epoch": 9.555532662568822, "grad_norm": 0.002378986682742834, "learning_rate": 5.996845887335511e-07, "loss": 0.0011, "step": 208700 }, { "epoch": 9.560111259915525, "grad_norm": 0.007951854728162289, "learning_rate": 5.874078361005564e-07, "loss": 0.0012, "step": 208800 }, { "epoch": 9.564689857262227, "grad_norm": 0.01049245335161686, "learning_rate": 5.75257308939564e-07, "loss": 0.0009, "step": 208900 }, { "epoch": 9.569268454608931, "grad_norm": 0.00224009295925498, "learning_rate": 5.632330382893569e-07, "loss": 0.0005, "step": 209000 }, { "epoch": 9.573847051955633, "grad_norm": 0.004827695898711681, "learning_rate": 5.513350548661811e-07, "loss": 0.0007, "step": 209100 }, { "epoch": 9.578425649302336, "grad_norm": 0.021496234461665154, "learning_rate": 5.395633890636631e-07, "loss": 0.0007, "step": 209200 }, { "epoch": 9.58300424664904, "grad_norm": 0.001609979895874858, "learning_rate": 5.279180709527765e-07, "loss": 0.0009, "step": 209300 }, { "epoch": 9.587582843995742, "grad_norm": 0.006100552622228861, "learning_rate": 5.163991302817139e-07, "loss": 0.0012, "step": 209400 }, { "epoch": 9.592161441342444, "grad_norm": 0.0038456227630376816, "learning_rate": 5.050065964758488e-07, "loss": 0.0009, "step": 209500 }, { "epoch": 9.596740038689148, "grad_norm": 0.0012561274925246835, "learning_rate": 4.937404986376348e-07, "loss": 0.0011, "step": 209600 }, { "epoch": 9.60131863603585, "grad_norm": 0.007517179474234581, "learning_rate": 4.826008655465508e-07, "loss": 0.0011, "step": 209700 }, { "epoch": 9.605897233382553, "grad_norm": 0.16051574051380157, "learning_rate": 4.7158772565902843e-07, "loss": 0.0003, "step": 209800 }, { "epoch": 9.610475830729255, "grad_norm": 0.004418348427861929, "learning_rate": 4.6070110710834116e-07, "loss": 0.0013, "step": 209900 }, { "epoch": 9.61505442807596, "grad_norm": 0.003388006007298827, "learning_rate": 4.4994103770457653e-07, "loss": 0.0007, "step": 210000 }, { "epoch": 9.619633025422662, "grad_norm": 0.1264602690935135, "learning_rate": 4.3930754493456403e-07, "loss": 0.0006, "step": 210100 }, { "epoch": 9.624211622769364, "grad_norm": 0.0033033695071935654, "learning_rate": 4.2880065596176967e-07, "loss": 0.0009, "step": 210200 }, { "epoch": 9.628790220116068, "grad_norm": 0.002471612999215722, "learning_rate": 4.184203976262513e-07, "loss": 0.0011, "step": 210300 }, { "epoch": 9.63336881746277, "grad_norm": 0.02394956909120083, "learning_rate": 4.081667964446034e-07, "loss": 0.0009, "step": 210400 }, { "epoch": 9.637947414809473, "grad_norm": 0.0018725660629570484, "learning_rate": 3.980398786098405e-07, "loss": 0.0013, "step": 210500 }, { "epoch": 9.642526012156177, "grad_norm": 0.0007891812711022794, "learning_rate": 3.8803966999139684e-07, "loss": 0.0006, "step": 210600 }, { "epoch": 9.647104609502879, "grad_norm": 0.01715545915067196, "learning_rate": 3.7816619613499913e-07, "loss": 0.0017, "step": 210700 }, { "epoch": 9.651683206849581, "grad_norm": 0.004238943103700876, "learning_rate": 3.6841948226263854e-07, "loss": 0.0009, "step": 210800 }, { "epoch": 9.656261804196284, "grad_norm": 0.07635319977998734, "learning_rate": 3.587995532724986e-07, "loss": 0.0013, "step": 210900 }, { "epoch": 9.660840401542988, "grad_norm": 0.016986342146992683, "learning_rate": 3.493064337388774e-07, "loss": 0.0004, "step": 211000 }, { "epoch": 9.66541899888969, "grad_norm": 0.004902221262454987, "learning_rate": 3.399401479121489e-07, "loss": 0.0009, "step": 211100 }, { "epoch": 9.669997596236392, "grad_norm": 0.004907084163278341, "learning_rate": 3.30700719718674e-07, "loss": 0.0011, "step": 211200 }, { "epoch": 9.674576193583096, "grad_norm": 0.0083727166056633, "learning_rate": 3.215881727607617e-07, "loss": 0.0015, "step": 211300 }, { "epoch": 9.679154790929799, "grad_norm": 0.005267091561108828, "learning_rate": 3.126025303166025e-07, "loss": 0.0009, "step": 211400 }, { "epoch": 9.6837333882765, "grad_norm": 0.0178202036768198, "learning_rate": 3.0374381534019613e-07, "loss": 0.0009, "step": 211500 }, { "epoch": 9.688311985623205, "grad_norm": 0.05054371431469917, "learning_rate": 2.9501205046131295e-07, "loss": 0.0006, "step": 211600 }, { "epoch": 9.692890582969907, "grad_norm": 0.01484039518982172, "learning_rate": 2.8640725798543266e-07, "loss": 0.0009, "step": 211700 }, { "epoch": 9.69746918031661, "grad_norm": 0.1556658148765564, "learning_rate": 2.7792945989366105e-07, "loss": 0.0013, "step": 211800 }, { "epoch": 9.702047777663314, "grad_norm": 0.0019681896083056927, "learning_rate": 2.6957867784270787e-07, "loss": 0.0008, "step": 211900 }, { "epoch": 9.706626375010016, "grad_norm": 0.014196610078215599, "learning_rate": 2.6135493316482017e-07, "loss": 0.0008, "step": 212000 }, { "epoch": 9.711204972356718, "grad_norm": 0.0016565111000090837, "learning_rate": 2.532582468677214e-07, "loss": 0.0012, "step": 212100 }, { "epoch": 9.715783569703422, "grad_norm": 0.0729876384139061, "learning_rate": 2.452886396345555e-07, "loss": 0.0008, "step": 212200 }, { "epoch": 9.720362167050125, "grad_norm": 0.05016588792204857, "learning_rate": 2.3744613182384856e-07, "loss": 0.0008, "step": 212300 }, { "epoch": 9.724940764396827, "grad_norm": 0.0004378720186650753, "learning_rate": 2.2973074346944734e-07, "loss": 0.001, "step": 212400 }, { "epoch": 9.72951936174353, "grad_norm": 0.005694561637938023, "learning_rate": 2.2214249428046952e-07, "loss": 0.001, "step": 212500 }, { "epoch": 9.734097959090233, "grad_norm": 0.006911196745932102, "learning_rate": 2.1468140364125367e-07, "loss": 0.0015, "step": 212600 }, { "epoch": 9.738676556436936, "grad_norm": 0.002870377618819475, "learning_rate": 2.0734749061130377e-07, "loss": 0.0012, "step": 212700 }, { "epoch": 9.743255153783638, "grad_norm": 0.029163073748350143, "learning_rate": 2.0014077392525031e-07, "loss": 0.0007, "step": 212800 }, { "epoch": 9.747833751130342, "grad_norm": 0.5740547180175781, "learning_rate": 1.930612719927949e-07, "loss": 0.0013, "step": 212900 }, { "epoch": 9.752412348477044, "grad_norm": 0.017990708351135254, "learning_rate": 1.8610900289867673e-07, "loss": 0.0009, "step": 213000 }, { "epoch": 9.756990945823746, "grad_norm": 0.0017326328670606017, "learning_rate": 1.792839844026062e-07, "loss": 0.001, "step": 213100 }, { "epoch": 9.76156954317045, "grad_norm": 0.0013371937675401568, "learning_rate": 1.725862339392259e-07, "loss": 0.0008, "step": 213200 }, { "epoch": 9.766148140517153, "grad_norm": 0.0008829734288156033, "learning_rate": 1.66015768618083e-07, "loss": 0.0017, "step": 213300 }, { "epoch": 9.770726737863855, "grad_norm": 0.09034759551286697, "learning_rate": 1.5957260522356243e-07, "loss": 0.0009, "step": 213400 }, { "epoch": 9.775305335210557, "grad_norm": 0.0011923068668693304, "learning_rate": 1.5325676021484825e-07, "loss": 0.001, "step": 213500 }, { "epoch": 9.779883932557262, "grad_norm": 0.0013882212806493044, "learning_rate": 1.4706824972591238e-07, "loss": 0.001, "step": 213600 }, { "epoch": 9.784462529903964, "grad_norm": 0.004688725806772709, "learning_rate": 1.410070895654203e-07, "loss": 0.0012, "step": 213700 }, { "epoch": 9.789041127250666, "grad_norm": 0.0011394763132557273, "learning_rate": 1.3507329521672552e-07, "loss": 0.0005, "step": 213800 }, { "epoch": 9.79361972459737, "grad_norm": 0.015895133838057518, "learning_rate": 1.2926688183783066e-07, "loss": 0.0006, "step": 213900 }, { "epoch": 9.798198321944072, "grad_norm": 0.032990917563438416, "learning_rate": 1.235878642613375e-07, "loss": 0.0006, "step": 214000 }, { "epoch": 9.802776919290775, "grad_norm": 0.012515276670455933, "learning_rate": 1.1803625699440824e-07, "loss": 0.001, "step": 214100 }, { "epoch": 9.807355516637479, "grad_norm": 0.002379771787673235, "learning_rate": 1.1261207421874309e-07, "loss": 0.0007, "step": 214200 }, { "epoch": 9.811934113984181, "grad_norm": 0.09647377580404282, "learning_rate": 1.0731532979051939e-07, "loss": 0.0007, "step": 214300 }, { "epoch": 9.816512711330883, "grad_norm": 0.0067366501316428185, "learning_rate": 1.021460372403915e-07, "loss": 0.0012, "step": 214400 }, { "epoch": 9.821091308677588, "grad_norm": 0.005677223205566406, "learning_rate": 9.710420977340762e-08, "loss": 0.0015, "step": 214500 }, { "epoch": 9.82566990602429, "grad_norm": 0.004333006218075752, "learning_rate": 9.218986026902632e-08, "loss": 0.0014, "step": 214600 }, { "epoch": 9.830248503370992, "grad_norm": 0.0027217718306928873, "learning_rate": 8.740300128105005e-08, "loss": 0.0003, "step": 214700 }, { "epoch": 9.834827100717694, "grad_norm": 0.30776289105415344, "learning_rate": 8.274364503760845e-08, "loss": 0.0014, "step": 214800 }, { "epoch": 9.839405698064398, "grad_norm": 0.005994019098579884, "learning_rate": 7.8211803441125e-08, "loss": 0.0009, "step": 214900 }, { "epoch": 9.8439842954111, "grad_norm": 0.0010411434341222048, "learning_rate": 7.380748806827819e-08, "loss": 0.0011, "step": 215000 }, { "epoch": 9.848562892757803, "grad_norm": 0.04015972465276718, "learning_rate": 6.953071016998491e-08, "loss": 0.0005, "step": 215100 }, { "epoch": 9.853141490104507, "grad_norm": 0.0010900140041485429, "learning_rate": 6.538148067135596e-08, "loss": 0.001, "step": 215200 }, { "epoch": 9.85772008745121, "grad_norm": 0.5897096991539001, "learning_rate": 6.135981017167947e-08, "loss": 0.0007, "step": 215300 }, { "epoch": 9.862298684797912, "grad_norm": 0.03506353124976158, "learning_rate": 5.7465708944404175e-08, "loss": 0.0006, "step": 215400 }, { "epoch": 9.866877282144616, "grad_norm": 0.042582686990499496, "learning_rate": 5.3699186937089526e-08, "loss": 0.001, "step": 215500 }, { "epoch": 9.871455879491318, "grad_norm": 0.008183780126273632, "learning_rate": 5.006025377138901e-08, "loss": 0.0009, "step": 215600 }, { "epoch": 9.87603447683802, "grad_norm": 0.0004912464646622539, "learning_rate": 4.6548918743033464e-08, "loss": 0.0013, "step": 215700 }, { "epoch": 9.880613074184723, "grad_norm": 0.008000398054718971, "learning_rate": 4.316519082179227e-08, "loss": 0.0008, "step": 215800 }, { "epoch": 9.885191671531427, "grad_norm": 0.0017890778835862875, "learning_rate": 3.9909078651478856e-08, "loss": 0.0008, "step": 215900 }, { "epoch": 9.889770268878129, "grad_norm": 0.002216469496488571, "learning_rate": 3.678059054988969e-08, "loss": 0.0008, "step": 216000 }, { "epoch": 9.894348866224831, "grad_norm": 0.07988656312227249, "learning_rate": 3.377973450881533e-08, "loss": 0.001, "step": 216100 }, { "epoch": 9.898927463571535, "grad_norm": 0.0008704246138222516, "learning_rate": 3.0906518194001586e-08, "loss": 0.0006, "step": 216200 }, { "epoch": 9.903506060918238, "grad_norm": 0.0025461604818701744, "learning_rate": 2.8160948945138434e-08, "loss": 0.0012, "step": 216300 }, { "epoch": 9.90808465826494, "grad_norm": 0.00465493043884635, "learning_rate": 2.554303377584333e-08, "loss": 0.0006, "step": 216400 }, { "epoch": 9.912663255611644, "grad_norm": 0.0038689495995640755, "learning_rate": 2.305277937362238e-08, "loss": 0.0011, "step": 216500 }, { "epoch": 9.917241852958346, "grad_norm": 0.7484769225120544, "learning_rate": 2.0690192099892535e-08, "loss": 0.001, "step": 216600 }, { "epoch": 9.921820450305049, "grad_norm": 0.01186487078666687, "learning_rate": 1.845527798992608e-08, "loss": 0.0008, "step": 216700 }, { "epoch": 9.926399047651753, "grad_norm": 0.030242715030908585, "learning_rate": 1.6348042752856173e-08, "loss": 0.0005, "step": 216800 }, { "epoch": 9.930977644998455, "grad_norm": 0.007519678212702274, "learning_rate": 1.436849177166022e-08, "loss": 0.0005, "step": 216900 }, { "epoch": 9.935556242345157, "grad_norm": 0.0013559481594711542, "learning_rate": 1.2516630103137638e-08, "loss": 0.0007, "step": 217000 }, { "epoch": 9.94013483969186, "grad_norm": 0.0019348141504451632, "learning_rate": 1.0792462477909882e-08, "loss": 0.0008, "step": 217100 }, { "epoch": 9.944713437038564, "grad_norm": 0.004812970757484436, "learning_rate": 9.195993300398221e-09, "loss": 0.0015, "step": 217200 }, { "epoch": 9.949292034385266, "grad_norm": 0.0022001820616424084, "learning_rate": 7.727226648818198e-09, "loss": 0.0014, "step": 217300 }, { "epoch": 9.953870631731968, "grad_norm": 0.07122460752725601, "learning_rate": 6.386166275157424e-09, "loss": 0.0011, "step": 217400 }, { "epoch": 9.958449229078672, "grad_norm": 0.10452170670032501, "learning_rate": 5.172815605186676e-09, "loss": 0.0016, "step": 217500 }, { "epoch": 9.963027826425375, "grad_norm": 0.009487117640674114, "learning_rate": 4.087177738432146e-09, "loss": 0.001, "step": 217600 }, { "epoch": 9.967606423772077, "grad_norm": 0.0035909826401621103, "learning_rate": 3.12925544818099e-09, "loss": 0.0016, "step": 217700 }, { "epoch": 9.972185021118781, "grad_norm": 0.0012206127867102623, "learning_rate": 2.299051181464673e-09, "loss": 0.0007, "step": 217800 }, { "epoch": 9.976763618465483, "grad_norm": 0.016972968354821205, "learning_rate": 1.596567059053422e-09, "loss": 0.001, "step": 217900 }, { "epoch": 9.981342215812186, "grad_norm": 0.0021455176174640656, "learning_rate": 1.0218048754617738e-09, "loss": 0.0006, "step": 218000 }, { "epoch": 9.985920813158888, "grad_norm": 0.0012913336977362633, "learning_rate": 5.747660989263714e-10, "loss": 0.001, "step": 218100 }, { "epoch": 9.990499410505592, "grad_norm": 0.0040626926347613335, "learning_rate": 2.554518714226184e-10, "loss": 0.001, "step": 218200 }, { "epoch": 9.995078007852294, "grad_norm": 0.0009735480998642743, "learning_rate": 6.386300864247297e-11, "loss": 0.0008, "step": 218300 }, { "epoch": 9.999656605198997, "grad_norm": 0.01641685888171196, "learning_rate": 0.0, "loss": 0.0009, "step": 218400 }, { "epoch": 9.999656605198997, "eval_loss": 0.29180198907852173, "eval_runtime": 243.5776, "eval_samples_per_second": 22.58, "eval_steps_per_second": 22.58, "step": 218400 } ], "logging_steps": 100, "max_steps": 218400, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.5885525416859468e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }