{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 37620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01594896331738437, "grad_norm": 793786.1875, "learning_rate": 4.9867091972355135e-05, "loss": 4.3231, "step": 100 }, { "epoch": 0.03189792663476874, "grad_norm": 705751.0625, "learning_rate": 4.973418394471026e-05, "loss": 4.1867, "step": 200 }, { "epoch": 0.04784688995215311, "grad_norm": 761077.1875, "learning_rate": 4.9601275917065395e-05, "loss": 4.225, "step": 300 }, { "epoch": 0.06379585326953748, "grad_norm": 660073.9375, "learning_rate": 4.946836788942053e-05, "loss": 4.2529, "step": 400 }, { "epoch": 0.07974481658692185, "grad_norm": 701404.4375, "learning_rate": 4.9335459861775654e-05, "loss": 4.2985, "step": 500 }, { "epoch": 0.09569377990430622, "grad_norm": 776949.5625, "learning_rate": 4.920255183413078e-05, "loss": 4.2124, "step": 600 }, { "epoch": 0.11164274322169059, "grad_norm": 703487.375, "learning_rate": 4.906964380648591e-05, "loss": 4.224, "step": 700 }, { "epoch": 0.12759170653907495, "grad_norm": 727441.625, "learning_rate": 4.893673577884104e-05, "loss": 4.3259, "step": 800 }, { "epoch": 0.14354066985645933, "grad_norm": 772143.4375, "learning_rate": 4.880382775119617e-05, "loss": 4.1641, "step": 900 }, { "epoch": 0.1594896331738437, "grad_norm": 834641.6875, "learning_rate": 4.8670919723551306e-05, "loss": 4.2651, "step": 1000 }, { "epoch": 0.17543859649122806, "grad_norm": 743223.0, "learning_rate": 4.853801169590643e-05, "loss": 4.1867, "step": 1100 }, { "epoch": 0.19138755980861244, "grad_norm": 643478.125, "learning_rate": 4.8405103668261565e-05, "loss": 4.1816, "step": 1200 }, { "epoch": 0.20733652312599682, "grad_norm": 656405.6875, "learning_rate": 4.82721956406167e-05, "loss": 4.1677, "step": 1300 }, { "epoch": 0.22328548644338117, "grad_norm": 714701.3125, "learning_rate": 4.813928761297183e-05, "loss": 4.1595, "step": 1400 }, { "epoch": 0.23923444976076555, "grad_norm": 739746.875, "learning_rate": 4.800637958532696e-05, "loss": 4.1319, "step": 1500 }, { "epoch": 0.2551834130781499, "grad_norm": 736049.3125, "learning_rate": 4.787347155768209e-05, "loss": 4.1543, "step": 1600 }, { "epoch": 0.2711323763955343, "grad_norm": 668736.5, "learning_rate": 4.7740563530037217e-05, "loss": 4.2052, "step": 1700 }, { "epoch": 0.28708133971291866, "grad_norm": 655993.0625, "learning_rate": 4.760765550239234e-05, "loss": 4.1642, "step": 1800 }, { "epoch": 0.30303030303030304, "grad_norm": 732162.3125, "learning_rate": 4.7474747474747476e-05, "loss": 4.1464, "step": 1900 }, { "epoch": 0.3189792663476874, "grad_norm": 687130.875, "learning_rate": 4.734183944710261e-05, "loss": 4.181, "step": 2000 }, { "epoch": 0.3349282296650718, "grad_norm": 823373.1875, "learning_rate": 4.7208931419457735e-05, "loss": 4.1591, "step": 2100 }, { "epoch": 0.3508771929824561, "grad_norm": 716504.625, "learning_rate": 4.707602339181287e-05, "loss": 4.1391, "step": 2200 }, { "epoch": 0.3668261562998405, "grad_norm": 714200.6875, "learning_rate": 4.6943115364168e-05, "loss": 4.0962, "step": 2300 }, { "epoch": 0.3827751196172249, "grad_norm": 662553.0, "learning_rate": 4.681020733652313e-05, "loss": 4.0964, "step": 2400 }, { "epoch": 0.39872408293460926, "grad_norm": 767714.875, "learning_rate": 4.667729930887826e-05, "loss": 4.1174, "step": 2500 }, { "epoch": 0.41467304625199364, "grad_norm": 763852.75, "learning_rate": 4.6544391281233393e-05, "loss": 4.0442, "step": 2600 }, { "epoch": 0.430622009569378, "grad_norm": 733045.8125, "learning_rate": 4.641148325358852e-05, "loss": 4.112, "step": 2700 }, { "epoch": 0.44657097288676234, "grad_norm": 752065.875, "learning_rate": 4.6278575225943646e-05, "loss": 4.1223, "step": 2800 }, { "epoch": 0.4625199362041467, "grad_norm": 675739.8125, "learning_rate": 4.614566719829878e-05, "loss": 4.1608, "step": 2900 }, { "epoch": 0.4784688995215311, "grad_norm": 666147.375, "learning_rate": 4.6012759170653905e-05, "loss": 4.0803, "step": 3000 }, { "epoch": 0.4944178628389155, "grad_norm": 594287.9375, "learning_rate": 4.587985114300904e-05, "loss": 4.045, "step": 3100 }, { "epoch": 0.5103668261562998, "grad_norm": 778230.25, "learning_rate": 4.574694311536417e-05, "loss": 4.1268, "step": 3200 }, { "epoch": 0.5263157894736842, "grad_norm": 683312.0625, "learning_rate": 4.56140350877193e-05, "loss": 4.0424, "step": 3300 }, { "epoch": 0.5422647527910686, "grad_norm": 640858.25, "learning_rate": 4.548112706007443e-05, "loss": 4.0235, "step": 3400 }, { "epoch": 0.5582137161084529, "grad_norm": 680217.875, "learning_rate": 4.5348219032429564e-05, "loss": 4.1423, "step": 3500 }, { "epoch": 0.5741626794258373, "grad_norm": 708408.4375, "learning_rate": 4.521531100478469e-05, "loss": 4.0291, "step": 3600 }, { "epoch": 0.5901116427432217, "grad_norm": 663333.0, "learning_rate": 4.508240297713982e-05, "loss": 3.9998, "step": 3700 }, { "epoch": 0.6060606060606061, "grad_norm": 652592.0, "learning_rate": 4.494949494949495e-05, "loss": 4.0424, "step": 3800 }, { "epoch": 0.6220095693779905, "grad_norm": 667416.9375, "learning_rate": 4.481658692185008e-05, "loss": 4.053, "step": 3900 }, { "epoch": 0.6379585326953748, "grad_norm": 674108.4375, "learning_rate": 4.468367889420521e-05, "loss": 4.0748, "step": 4000 }, { "epoch": 0.6539074960127592, "grad_norm": 714003.625, "learning_rate": 4.455077086656034e-05, "loss": 3.9807, "step": 4100 }, { "epoch": 0.6698564593301436, "grad_norm": 693166.4375, "learning_rate": 4.4417862838915475e-05, "loss": 3.993, "step": 4200 }, { "epoch": 0.6858054226475279, "grad_norm": 689997.125, "learning_rate": 4.42849548112706e-05, "loss": 4.0593, "step": 4300 }, { "epoch": 0.7017543859649122, "grad_norm": 741118.5625, "learning_rate": 4.4152046783625734e-05, "loss": 4.0409, "step": 4400 }, { "epoch": 0.7177033492822966, "grad_norm": 668626.9375, "learning_rate": 4.401913875598087e-05, "loss": 4.06, "step": 4500 }, { "epoch": 0.733652312599681, "grad_norm": 771064.125, "learning_rate": 4.388623072833599e-05, "loss": 4.0122, "step": 4600 }, { "epoch": 0.7496012759170654, "grad_norm": 756131.375, "learning_rate": 4.3753322700691126e-05, "loss": 4.043, "step": 4700 }, { "epoch": 0.7655502392344498, "grad_norm": 737817.5625, "learning_rate": 4.362041467304626e-05, "loss": 4.0213, "step": 4800 }, { "epoch": 0.7814992025518341, "grad_norm": 758023.5625, "learning_rate": 4.3487506645401385e-05, "loss": 3.9778, "step": 4900 }, { "epoch": 0.7974481658692185, "grad_norm": 672895.1875, "learning_rate": 4.335459861775651e-05, "loss": 4.0434, "step": 5000 }, { "epoch": 0.8133971291866029, "grad_norm": 710937.375, "learning_rate": 4.3221690590111645e-05, "loss": 4.0117, "step": 5100 }, { "epoch": 0.8293460925039873, "grad_norm": 773563.875, "learning_rate": 4.308878256246677e-05, "loss": 4.0516, "step": 5200 }, { "epoch": 0.8452950558213717, "grad_norm": 750733.4375, "learning_rate": 4.2955874534821904e-05, "loss": 3.9798, "step": 5300 }, { "epoch": 0.861244019138756, "grad_norm": 706317.3125, "learning_rate": 4.282296650717704e-05, "loss": 3.9845, "step": 5400 }, { "epoch": 0.8771929824561403, "grad_norm": 710855.75, "learning_rate": 4.269005847953216e-05, "loss": 4.0295, "step": 5500 }, { "epoch": 0.8931419457735247, "grad_norm": 712988.9375, "learning_rate": 4.2557150451887296e-05, "loss": 3.9732, "step": 5600 }, { "epoch": 0.9090909090909091, "grad_norm": 628492.75, "learning_rate": 4.242424242424243e-05, "loss": 4.0012, "step": 5700 }, { "epoch": 0.9250398724082934, "grad_norm": 821738.375, "learning_rate": 4.2291334396597556e-05, "loss": 3.9867, "step": 5800 }, { "epoch": 0.9409888357256778, "grad_norm": 720818.125, "learning_rate": 4.215842636895269e-05, "loss": 3.962, "step": 5900 }, { "epoch": 0.9569377990430622, "grad_norm": 698428.0625, "learning_rate": 4.2025518341307815e-05, "loss": 4.0095, "step": 6000 }, { "epoch": 0.9728867623604466, "grad_norm": 769185.75, "learning_rate": 4.189261031366295e-05, "loss": 3.9469, "step": 6100 }, { "epoch": 0.988835725677831, "grad_norm": 706422.25, "learning_rate": 4.1759702286018074e-05, "loss": 3.9466, "step": 6200 }, { "epoch": 1.0, "eval_loss": 3.8981289863586426, "eval_runtime": 213.136, "eval_samples_per_second": 134.295, "eval_steps_per_second": 4.199, "step": 6270 }, { "epoch": 1.0047846889952152, "grad_norm": 664245.5, "learning_rate": 4.162679425837321e-05, "loss": 3.9397, "step": 6300 }, { "epoch": 1.0207336523125996, "grad_norm": 790251.5625, "learning_rate": 4.149388623072834e-05, "loss": 3.7918, "step": 6400 }, { "epoch": 1.036682615629984, "grad_norm": 813129.1875, "learning_rate": 4.1360978203083467e-05, "loss": 3.8563, "step": 6500 }, { "epoch": 1.0526315789473684, "grad_norm": 695871.5, "learning_rate": 4.12280701754386e-05, "loss": 3.959, "step": 6600 }, { "epoch": 1.0685805422647527, "grad_norm": 640218.3125, "learning_rate": 4.109516214779373e-05, "loss": 3.9882, "step": 6700 }, { "epoch": 1.0845295055821371, "grad_norm": 709886.125, "learning_rate": 4.096225412014886e-05, "loss": 3.9328, "step": 6800 }, { "epoch": 1.1004784688995215, "grad_norm": 663732.1875, "learning_rate": 4.082934609250399e-05, "loss": 3.947, "step": 6900 }, { "epoch": 1.1164274322169059, "grad_norm": 649781.9375, "learning_rate": 4.069643806485912e-05, "loss": 3.8735, "step": 7000 }, { "epoch": 1.1323763955342903, "grad_norm": 788934.4375, "learning_rate": 4.056353003721425e-05, "loss": 3.8548, "step": 7100 }, { "epoch": 1.1483253588516746, "grad_norm": 688279.3125, "learning_rate": 4.043062200956938e-05, "loss": 3.8844, "step": 7200 }, { "epoch": 1.164274322169059, "grad_norm": 698958.0, "learning_rate": 4.029771398192451e-05, "loss": 3.878, "step": 7300 }, { "epoch": 1.1802232854864434, "grad_norm": 777544.8125, "learning_rate": 4.016480595427964e-05, "loss": 3.8299, "step": 7400 }, { "epoch": 1.1961722488038278, "grad_norm": 768888.3125, "learning_rate": 4.003189792663477e-05, "loss": 3.859, "step": 7500 }, { "epoch": 1.2121212121212122, "grad_norm": 703599.8125, "learning_rate": 3.98989898989899e-05, "loss": 3.9008, "step": 7600 }, { "epoch": 1.2280701754385965, "grad_norm": 684616.75, "learning_rate": 3.976608187134503e-05, "loss": 3.8917, "step": 7700 }, { "epoch": 1.244019138755981, "grad_norm": 754650.9375, "learning_rate": 3.963317384370016e-05, "loss": 3.7987, "step": 7800 }, { "epoch": 1.2599681020733653, "grad_norm": 833289.8125, "learning_rate": 3.9500265816055295e-05, "loss": 3.8097, "step": 7900 }, { "epoch": 1.2759170653907497, "grad_norm": 712939.125, "learning_rate": 3.936735778841042e-05, "loss": 3.91, "step": 8000 }, { "epoch": 1.291866028708134, "grad_norm": 691034.4375, "learning_rate": 3.9234449760765554e-05, "loss": 3.8607, "step": 8100 }, { "epoch": 1.3078149920255182, "grad_norm": 777656.1875, "learning_rate": 3.910154173312068e-05, "loss": 3.8294, "step": 8200 }, { "epoch": 1.3237639553429026, "grad_norm": 779268.6875, "learning_rate": 3.896863370547581e-05, "loss": 3.839, "step": 8300 }, { "epoch": 1.339712918660287, "grad_norm": 737519.0, "learning_rate": 3.883572567783094e-05, "loss": 3.8773, "step": 8400 }, { "epoch": 1.3556618819776713, "grad_norm": 722279.3125, "learning_rate": 3.870281765018607e-05, "loss": 3.8351, "step": 8500 }, { "epoch": 1.3716108452950557, "grad_norm": 763525.9375, "learning_rate": 3.8569909622541206e-05, "loss": 3.8517, "step": 8600 }, { "epoch": 1.38755980861244, "grad_norm": 715289.3125, "learning_rate": 3.843700159489633e-05, "loss": 3.8272, "step": 8700 }, { "epoch": 1.4035087719298245, "grad_norm": 656186.625, "learning_rate": 3.8304093567251465e-05, "loss": 3.8894, "step": 8800 }, { "epoch": 1.4194577352472089, "grad_norm": 756528.875, "learning_rate": 3.81711855396066e-05, "loss": 3.8387, "step": 8900 }, { "epoch": 1.4354066985645932, "grad_norm": 716015.0, "learning_rate": 3.8038277511961725e-05, "loss": 3.7842, "step": 9000 }, { "epoch": 1.4513556618819776, "grad_norm": 721006.3125, "learning_rate": 3.790536948431686e-05, "loss": 3.7769, "step": 9100 }, { "epoch": 1.467304625199362, "grad_norm": 800538.1875, "learning_rate": 3.7772461456671984e-05, "loss": 3.8393, "step": 9200 }, { "epoch": 1.4832535885167464, "grad_norm": 699156.0, "learning_rate": 3.763955342902711e-05, "loss": 3.7527, "step": 9300 }, { "epoch": 1.4992025518341308, "grad_norm": 699306.375, "learning_rate": 3.750664540138224e-05, "loss": 3.8306, "step": 9400 }, { "epoch": 1.5151515151515151, "grad_norm": 689892.5, "learning_rate": 3.7373737373737376e-05, "loss": 3.8582, "step": 9500 }, { "epoch": 1.5311004784688995, "grad_norm": 712134.3125, "learning_rate": 3.72408293460925e-05, "loss": 3.9048, "step": 9600 }, { "epoch": 1.547049441786284, "grad_norm": 611158.625, "learning_rate": 3.7107921318447635e-05, "loss": 3.8315, "step": 9700 }, { "epoch": 1.5629984051036683, "grad_norm": 814951.875, "learning_rate": 3.697501329080277e-05, "loss": 3.8433, "step": 9800 }, { "epoch": 1.5789473684210527, "grad_norm": 638232.25, "learning_rate": 3.6842105263157895e-05, "loss": 3.8697, "step": 9900 }, { "epoch": 1.594896331738437, "grad_norm": 717277.8125, "learning_rate": 3.670919723551303e-05, "loss": 3.8447, "step": 10000 }, { "epoch": 1.6108452950558214, "grad_norm": 623000.1875, "learning_rate": 3.657628920786816e-05, "loss": 3.9104, "step": 10100 }, { "epoch": 1.6267942583732058, "grad_norm": 737050.875, "learning_rate": 3.644338118022329e-05, "loss": 3.8208, "step": 10200 }, { "epoch": 1.6427432216905902, "grad_norm": 766050.0, "learning_rate": 3.631047315257842e-05, "loss": 3.8454, "step": 10300 }, { "epoch": 1.6586921850079746, "grad_norm": 681873.25, "learning_rate": 3.6177565124933546e-05, "loss": 3.8284, "step": 10400 }, { "epoch": 1.674641148325359, "grad_norm": 636321.0625, "learning_rate": 3.604465709728867e-05, "loss": 3.8191, "step": 10500 }, { "epoch": 1.6905901116427433, "grad_norm": 751694.875, "learning_rate": 3.5911749069643806e-05, "loss": 3.8152, "step": 10600 }, { "epoch": 1.7065390749601277, "grad_norm": 779390.25, "learning_rate": 3.577884104199894e-05, "loss": 3.8235, "step": 10700 }, { "epoch": 1.722488038277512, "grad_norm": 780193.4375, "learning_rate": 3.5645933014354065e-05, "loss": 3.8109, "step": 10800 }, { "epoch": 1.7384370015948964, "grad_norm": 678532.75, "learning_rate": 3.55130249867092e-05, "loss": 3.8017, "step": 10900 }, { "epoch": 1.7543859649122808, "grad_norm": 733045.375, "learning_rate": 3.538011695906433e-05, "loss": 3.8518, "step": 11000 }, { "epoch": 1.7703349282296652, "grad_norm": 646417.375, "learning_rate": 3.5247208931419464e-05, "loss": 3.7623, "step": 11100 }, { "epoch": 1.7862838915470496, "grad_norm": 641868.25, "learning_rate": 3.511430090377459e-05, "loss": 3.7536, "step": 11200 }, { "epoch": 1.802232854864434, "grad_norm": 743804.375, "learning_rate": 3.498139287612972e-05, "loss": 3.8056, "step": 11300 }, { "epoch": 1.8181818181818183, "grad_norm": 725540.5625, "learning_rate": 3.484848484848485e-05, "loss": 3.8078, "step": 11400 }, { "epoch": 1.8341307814992025, "grad_norm": 735264.125, "learning_rate": 3.4715576820839976e-05, "loss": 3.8247, "step": 11500 }, { "epoch": 1.8500797448165869, "grad_norm": 708785.5625, "learning_rate": 3.458266879319511e-05, "loss": 3.7898, "step": 11600 }, { "epoch": 1.8660287081339713, "grad_norm": 864416.3125, "learning_rate": 3.444976076555024e-05, "loss": 3.8642, "step": 11700 }, { "epoch": 1.8819776714513556, "grad_norm": 693382.5625, "learning_rate": 3.431685273790537e-05, "loss": 3.7433, "step": 11800 }, { "epoch": 1.89792663476874, "grad_norm": 841755.25, "learning_rate": 3.41839447102605e-05, "loss": 3.8562, "step": 11900 }, { "epoch": 1.9138755980861244, "grad_norm": 662765.125, "learning_rate": 3.4051036682615634e-05, "loss": 3.8703, "step": 12000 }, { "epoch": 1.9298245614035088, "grad_norm": 707709.75, "learning_rate": 3.391812865497076e-05, "loss": 3.8402, "step": 12100 }, { "epoch": 1.9457735247208932, "grad_norm": 616722.875, "learning_rate": 3.3785220627325893e-05, "loss": 3.8432, "step": 12200 }, { "epoch": 1.9617224880382775, "grad_norm": 795238.625, "learning_rate": 3.3652312599681026e-05, "loss": 3.7319, "step": 12300 }, { "epoch": 1.977671451355662, "grad_norm": 709130.0, "learning_rate": 3.351940457203615e-05, "loss": 3.7628, "step": 12400 }, { "epoch": 1.9936204146730463, "grad_norm": 789321.375, "learning_rate": 3.3386496544391286e-05, "loss": 3.7821, "step": 12500 }, { "epoch": 2.0, "eval_loss": 3.800079584121704, "eval_runtime": 212.9573, "eval_samples_per_second": 134.407, "eval_steps_per_second": 4.203, "step": 12540 }, { "epoch": 2.0095693779904304, "grad_norm": 862786.0, "learning_rate": 3.325358851674641e-05, "loss": 3.7512, "step": 12600 }, { "epoch": 2.025518341307815, "grad_norm": 772642.5625, "learning_rate": 3.312068048910154e-05, "loss": 3.6611, "step": 12700 }, { "epoch": 2.041467304625199, "grad_norm": 706907.1875, "learning_rate": 3.298777246145667e-05, "loss": 3.7484, "step": 12800 }, { "epoch": 2.0574162679425836, "grad_norm": 940896.375, "learning_rate": 3.2854864433811804e-05, "loss": 3.7545, "step": 12900 }, { "epoch": 2.073365231259968, "grad_norm": 670013.875, "learning_rate": 3.272195640616693e-05, "loss": 3.6504, "step": 13000 }, { "epoch": 2.0893141945773523, "grad_norm": 702168.0, "learning_rate": 3.2589048378522064e-05, "loss": 3.6133, "step": 13100 }, { "epoch": 2.1052631578947367, "grad_norm": 741310.75, "learning_rate": 3.24561403508772e-05, "loss": 3.7202, "step": 13200 }, { "epoch": 2.121212121212121, "grad_norm": 630812.625, "learning_rate": 3.232323232323233e-05, "loss": 3.7131, "step": 13300 }, { "epoch": 2.1371610845295055, "grad_norm": 736768.125, "learning_rate": 3.2190324295587456e-05, "loss": 3.6982, "step": 13400 }, { "epoch": 2.15311004784689, "grad_norm": 743122.8125, "learning_rate": 3.205741626794259e-05, "loss": 3.7196, "step": 13500 }, { "epoch": 2.1690590111642742, "grad_norm": 685713.25, "learning_rate": 3.1924508240297715e-05, "loss": 3.7312, "step": 13600 }, { "epoch": 2.1850079744816586, "grad_norm": 829695.75, "learning_rate": 3.179160021265284e-05, "loss": 3.6914, "step": 13700 }, { "epoch": 2.200956937799043, "grad_norm": 640866.6875, "learning_rate": 3.1658692185007975e-05, "loss": 3.8376, "step": 13800 }, { "epoch": 2.2169059011164274, "grad_norm": 705478.25, "learning_rate": 3.152578415736311e-05, "loss": 3.7294, "step": 13900 }, { "epoch": 2.2328548644338118, "grad_norm": 664668.8125, "learning_rate": 3.1392876129718234e-05, "loss": 3.7046, "step": 14000 }, { "epoch": 2.248803827751196, "grad_norm": 653658.25, "learning_rate": 3.125996810207337e-05, "loss": 3.7142, "step": 14100 }, { "epoch": 2.2647527910685805, "grad_norm": 737460.0625, "learning_rate": 3.11270600744285e-05, "loss": 3.6449, "step": 14200 }, { "epoch": 2.280701754385965, "grad_norm": 657366.8125, "learning_rate": 3.0994152046783626e-05, "loss": 3.7051, "step": 14300 }, { "epoch": 2.2966507177033493, "grad_norm": 734922.0625, "learning_rate": 3.086124401913876e-05, "loss": 3.6793, "step": 14400 }, { "epoch": 2.3125996810207337, "grad_norm": 636888.125, "learning_rate": 3.072833599149389e-05, "loss": 3.764, "step": 14500 }, { "epoch": 2.328548644338118, "grad_norm": 776567.125, "learning_rate": 3.059542796384902e-05, "loss": 3.6952, "step": 14600 }, { "epoch": 2.3444976076555024, "grad_norm": 678031.5, "learning_rate": 3.0462519936204148e-05, "loss": 3.7086, "step": 14700 }, { "epoch": 2.360446570972887, "grad_norm": 702883.9375, "learning_rate": 3.032961190855928e-05, "loss": 3.6974, "step": 14800 }, { "epoch": 2.376395534290271, "grad_norm": 640335.375, "learning_rate": 3.0196703880914407e-05, "loss": 3.6555, "step": 14900 }, { "epoch": 2.3923444976076556, "grad_norm": 686070.0625, "learning_rate": 3.0063795853269537e-05, "loss": 3.7167, "step": 15000 }, { "epoch": 2.40829346092504, "grad_norm": 676044.25, "learning_rate": 2.993088782562467e-05, "loss": 3.6554, "step": 15100 }, { "epoch": 2.4242424242424243, "grad_norm": 767818.6875, "learning_rate": 2.9797979797979796e-05, "loss": 3.693, "step": 15200 }, { "epoch": 2.4401913875598087, "grad_norm": 739911.0625, "learning_rate": 2.966507177033493e-05, "loss": 3.6431, "step": 15300 }, { "epoch": 2.456140350877193, "grad_norm": 771256.3125, "learning_rate": 2.9532163742690062e-05, "loss": 3.6216, "step": 15400 }, { "epoch": 2.4720893141945774, "grad_norm": 773173.375, "learning_rate": 2.939925571504519e-05, "loss": 3.7742, "step": 15500 }, { "epoch": 2.488038277511962, "grad_norm": 762888.3125, "learning_rate": 2.9266347687400318e-05, "loss": 3.7236, "step": 15600 }, { "epoch": 2.503987240829346, "grad_norm": 879857.0625, "learning_rate": 2.913343965975545e-05, "loss": 3.7253, "step": 15700 }, { "epoch": 2.5199362041467306, "grad_norm": 733285.0625, "learning_rate": 2.9000531632110584e-05, "loss": 3.7166, "step": 15800 }, { "epoch": 2.535885167464115, "grad_norm": 833014.3125, "learning_rate": 2.886762360446571e-05, "loss": 3.6705, "step": 15900 }, { "epoch": 2.5518341307814993, "grad_norm": 736943.4375, "learning_rate": 2.8734715576820844e-05, "loss": 3.7253, "step": 16000 }, { "epoch": 2.5677830940988837, "grad_norm": 708644.4375, "learning_rate": 2.8601807549175973e-05, "loss": 3.6672, "step": 16100 }, { "epoch": 2.583732057416268, "grad_norm": 795735.8125, "learning_rate": 2.84688995215311e-05, "loss": 3.728, "step": 16200 }, { "epoch": 2.5996810207336525, "grad_norm": 864533.875, "learning_rate": 2.8335991493886233e-05, "loss": 3.6478, "step": 16300 }, { "epoch": 2.6156299840510364, "grad_norm": 835745.0, "learning_rate": 2.8203083466241366e-05, "loss": 3.6204, "step": 16400 }, { "epoch": 2.6315789473684212, "grad_norm": 726415.3125, "learning_rate": 2.8070175438596492e-05, "loss": 3.6899, "step": 16500 }, { "epoch": 2.647527910685805, "grad_norm": 619188.1875, "learning_rate": 2.793726741095162e-05, "loss": 3.6442, "step": 16600 }, { "epoch": 2.66347687400319, "grad_norm": 704278.1875, "learning_rate": 2.7804359383306755e-05, "loss": 3.7113, "step": 16700 }, { "epoch": 2.679425837320574, "grad_norm": 787753.0, "learning_rate": 2.767145135566188e-05, "loss": 3.7368, "step": 16800 }, { "epoch": 2.6953748006379588, "grad_norm": 738743.5625, "learning_rate": 2.7538543328017014e-05, "loss": 3.7575, "step": 16900 }, { "epoch": 2.7113237639553427, "grad_norm": 797812.75, "learning_rate": 2.7405635300372147e-05, "loss": 3.6586, "step": 17000 }, { "epoch": 2.7272727272727275, "grad_norm": 662883.0, "learning_rate": 2.7272727272727273e-05, "loss": 3.6812, "step": 17100 }, { "epoch": 2.7432216905901115, "grad_norm": 708693.8125, "learning_rate": 2.7139819245082403e-05, "loss": 3.6645, "step": 17200 }, { "epoch": 2.7591706539074963, "grad_norm": 750630.625, "learning_rate": 2.7006911217437536e-05, "loss": 3.659, "step": 17300 }, { "epoch": 2.77511961722488, "grad_norm": 743643.5625, "learning_rate": 2.6874003189792662e-05, "loss": 3.6794, "step": 17400 }, { "epoch": 2.7910685805422646, "grad_norm": 683095.9375, "learning_rate": 2.6741095162147795e-05, "loss": 3.6587, "step": 17500 }, { "epoch": 2.807017543859649, "grad_norm": 659289.8125, "learning_rate": 2.6608187134502928e-05, "loss": 3.7067, "step": 17600 }, { "epoch": 2.8229665071770333, "grad_norm": 722875.9375, "learning_rate": 2.6475279106858054e-05, "loss": 3.6918, "step": 17700 }, { "epoch": 2.8389154704944177, "grad_norm": 643060.4375, "learning_rate": 2.6342371079213184e-05, "loss": 3.694, "step": 17800 }, { "epoch": 2.854864433811802, "grad_norm": 811117.3125, "learning_rate": 2.6209463051568317e-05, "loss": 3.703, "step": 17900 }, { "epoch": 2.8708133971291865, "grad_norm": 835076.6875, "learning_rate": 2.6076555023923443e-05, "loss": 3.6346, "step": 18000 }, { "epoch": 2.886762360446571, "grad_norm": 800111.875, "learning_rate": 2.5943646996278576e-05, "loss": 3.6589, "step": 18100 }, { "epoch": 2.9027113237639552, "grad_norm": 726367.5, "learning_rate": 2.5810738968633706e-05, "loss": 3.6522, "step": 18200 }, { "epoch": 2.9186602870813396, "grad_norm": 717922.25, "learning_rate": 2.567783094098884e-05, "loss": 3.6592, "step": 18300 }, { "epoch": 2.934609250398724, "grad_norm": 841302.0625, "learning_rate": 2.5544922913343965e-05, "loss": 3.688, "step": 18400 }, { "epoch": 2.9505582137161084, "grad_norm": 697304.1875, "learning_rate": 2.5412014885699098e-05, "loss": 3.6017, "step": 18500 }, { "epoch": 2.9665071770334928, "grad_norm": 929050.25, "learning_rate": 2.527910685805423e-05, "loss": 3.6116, "step": 18600 }, { "epoch": 2.982456140350877, "grad_norm": 699048.9375, "learning_rate": 2.5146198830409358e-05, "loss": 3.6474, "step": 18700 }, { "epoch": 2.9984051036682615, "grad_norm": 783686.0625, "learning_rate": 2.5013290802764487e-05, "loss": 3.6783, "step": 18800 }, { "epoch": 3.0, "eval_loss": 3.706315755844116, "eval_runtime": 212.8533, "eval_samples_per_second": 134.473, "eval_steps_per_second": 4.205, "step": 18810 }, { "epoch": 3.014354066985646, "grad_norm": 709059.3125, "learning_rate": 2.4880382775119617e-05, "loss": 3.5423, "step": 18900 }, { "epoch": 3.0303030303030303, "grad_norm": 655684.75, "learning_rate": 2.474747474747475e-05, "loss": 3.5606, "step": 19000 }, { "epoch": 3.0462519936204147, "grad_norm": 759915.1875, "learning_rate": 2.461456671982988e-05, "loss": 3.6979, "step": 19100 }, { "epoch": 3.062200956937799, "grad_norm": 835860.375, "learning_rate": 2.448165869218501e-05, "loss": 3.6324, "step": 19200 }, { "epoch": 3.0781499202551834, "grad_norm": 675779.5625, "learning_rate": 2.434875066454014e-05, "loss": 3.5773, "step": 19300 }, { "epoch": 3.094098883572568, "grad_norm": 700443.875, "learning_rate": 2.421584263689527e-05, "loss": 3.6309, "step": 19400 }, { "epoch": 3.110047846889952, "grad_norm": 768801.6875, "learning_rate": 2.4082934609250398e-05, "loss": 3.639, "step": 19500 }, { "epoch": 3.1259968102073366, "grad_norm": 705109.6875, "learning_rate": 2.395002658160553e-05, "loss": 3.6365, "step": 19600 }, { "epoch": 3.141945773524721, "grad_norm": 661887.5, "learning_rate": 2.381711855396066e-05, "loss": 3.5754, "step": 19700 }, { "epoch": 3.1578947368421053, "grad_norm": 691131.0, "learning_rate": 2.368421052631579e-05, "loss": 3.5888, "step": 19800 }, { "epoch": 3.1738437001594897, "grad_norm": 738841.625, "learning_rate": 2.355130249867092e-05, "loss": 3.6433, "step": 19900 }, { "epoch": 3.189792663476874, "grad_norm": 663372.1875, "learning_rate": 2.341839447102605e-05, "loss": 3.5642, "step": 20000 }, { "epoch": 3.2057416267942584, "grad_norm": 619548.75, "learning_rate": 2.3285486443381183e-05, "loss": 3.5864, "step": 20100 }, { "epoch": 3.221690590111643, "grad_norm": 773919.6875, "learning_rate": 2.3152578415736312e-05, "loss": 3.6072, "step": 20200 }, { "epoch": 3.237639553429027, "grad_norm": 728946.0625, "learning_rate": 2.3019670388091442e-05, "loss": 3.6212, "step": 20300 }, { "epoch": 3.2535885167464116, "grad_norm": 659326.9375, "learning_rate": 2.288676236044657e-05, "loss": 3.6068, "step": 20400 }, { "epoch": 3.269537480063796, "grad_norm": 759802.25, "learning_rate": 2.27538543328017e-05, "loss": 3.6783, "step": 20500 }, { "epoch": 3.2854864433811803, "grad_norm": 805141.9375, "learning_rate": 2.262094630515683e-05, "loss": 3.6441, "step": 20600 }, { "epoch": 3.3014354066985647, "grad_norm": 716521.875, "learning_rate": 2.2488038277511964e-05, "loss": 3.5748, "step": 20700 }, { "epoch": 3.317384370015949, "grad_norm": 920115.4375, "learning_rate": 2.2355130249867094e-05, "loss": 3.5985, "step": 20800 }, { "epoch": 3.3333333333333335, "grad_norm": 838658.0625, "learning_rate": 2.2222222222222223e-05, "loss": 3.564, "step": 20900 }, { "epoch": 3.349282296650718, "grad_norm": 838474.9375, "learning_rate": 2.2089314194577353e-05, "loss": 3.5875, "step": 21000 }, { "epoch": 3.3652312599681022, "grad_norm": 663992.25, "learning_rate": 2.1956406166932483e-05, "loss": 3.5681, "step": 21100 }, { "epoch": 3.3811802232854866, "grad_norm": 679489.9375, "learning_rate": 2.1823498139287616e-05, "loss": 3.5276, "step": 21200 }, { "epoch": 3.397129186602871, "grad_norm": 721495.6875, "learning_rate": 2.1690590111642745e-05, "loss": 3.5599, "step": 21300 }, { "epoch": 3.4130781499202554, "grad_norm": 667400.9375, "learning_rate": 2.1557682083997875e-05, "loss": 3.5554, "step": 21400 }, { "epoch": 3.4290271132376393, "grad_norm": 706122.0, "learning_rate": 2.1424774056353005e-05, "loss": 3.6543, "step": 21500 }, { "epoch": 3.444976076555024, "grad_norm": 709513.125, "learning_rate": 2.1291866028708134e-05, "loss": 3.6568, "step": 21600 }, { "epoch": 3.460925039872408, "grad_norm": 652669.375, "learning_rate": 2.1158958001063264e-05, "loss": 3.6423, "step": 21700 }, { "epoch": 3.476874003189793, "grad_norm": 713371.625, "learning_rate": 2.1026049973418397e-05, "loss": 3.6102, "step": 21800 }, { "epoch": 3.492822966507177, "grad_norm": 694536.875, "learning_rate": 2.0893141945773527e-05, "loss": 3.5488, "step": 21900 }, { "epoch": 3.5087719298245617, "grad_norm": 668059.9375, "learning_rate": 2.0760233918128656e-05, "loss": 3.6078, "step": 22000 }, { "epoch": 3.5247208931419456, "grad_norm": 731925.875, "learning_rate": 2.0627325890483786e-05, "loss": 3.5455, "step": 22100 }, { "epoch": 3.5406698564593304, "grad_norm": 766777.9375, "learning_rate": 2.0494417862838915e-05, "loss": 3.5582, "step": 22200 }, { "epoch": 3.5566188197767143, "grad_norm": 755381.1875, "learning_rate": 2.0361509835194045e-05, "loss": 3.5859, "step": 22300 }, { "epoch": 3.5725677830940987, "grad_norm": 749873.6875, "learning_rate": 2.0228601807549178e-05, "loss": 3.7161, "step": 22400 }, { "epoch": 3.588516746411483, "grad_norm": 677539.125, "learning_rate": 2.0095693779904308e-05, "loss": 3.6225, "step": 22500 }, { "epoch": 3.6044657097288675, "grad_norm": 744990.8125, "learning_rate": 1.9962785752259437e-05, "loss": 3.5935, "step": 22600 }, { "epoch": 3.620414673046252, "grad_norm": 629363.0, "learning_rate": 1.9829877724614567e-05, "loss": 3.5828, "step": 22700 }, { "epoch": 3.6363636363636362, "grad_norm": 685327.4375, "learning_rate": 1.9696969696969697e-05, "loss": 3.5937, "step": 22800 }, { "epoch": 3.6523125996810206, "grad_norm": 760474.25, "learning_rate": 1.956406166932483e-05, "loss": 3.5799, "step": 22900 }, { "epoch": 3.668261562998405, "grad_norm": 725888.9375, "learning_rate": 1.943115364167996e-05, "loss": 3.5465, "step": 23000 }, { "epoch": 3.6842105263157894, "grad_norm": 752312.0625, "learning_rate": 1.929824561403509e-05, "loss": 3.6185, "step": 23100 }, { "epoch": 3.7001594896331738, "grad_norm": 855498.5625, "learning_rate": 1.916533758639022e-05, "loss": 3.5792, "step": 23200 }, { "epoch": 3.716108452950558, "grad_norm": 679264.5, "learning_rate": 1.9032429558745348e-05, "loss": 3.5531, "step": 23300 }, { "epoch": 3.7320574162679425, "grad_norm": 688482.875, "learning_rate": 1.8899521531100478e-05, "loss": 3.5495, "step": 23400 }, { "epoch": 3.748006379585327, "grad_norm": 756693.75, "learning_rate": 1.876661350345561e-05, "loss": 3.567, "step": 23500 }, { "epoch": 3.7639553429027113, "grad_norm": 854033.0625, "learning_rate": 1.863370547581074e-05, "loss": 3.5611, "step": 23600 }, { "epoch": 3.7799043062200957, "grad_norm": 753585.0625, "learning_rate": 1.850079744816587e-05, "loss": 3.5439, "step": 23700 }, { "epoch": 3.79585326953748, "grad_norm": 685358.25, "learning_rate": 1.8367889420521e-05, "loss": 3.535, "step": 23800 }, { "epoch": 3.8118022328548644, "grad_norm": 813028.4375, "learning_rate": 1.823498139287613e-05, "loss": 3.566, "step": 23900 }, { "epoch": 3.827751196172249, "grad_norm": 676295.9375, "learning_rate": 1.8102073365231263e-05, "loss": 3.5208, "step": 24000 }, { "epoch": 3.843700159489633, "grad_norm": 705614.1875, "learning_rate": 1.7969165337586392e-05, "loss": 3.5108, "step": 24100 }, { "epoch": 3.8596491228070176, "grad_norm": 643356.125, "learning_rate": 1.7836257309941522e-05, "loss": 3.5416, "step": 24200 }, { "epoch": 3.875598086124402, "grad_norm": 783635.625, "learning_rate": 1.770334928229665e-05, "loss": 3.6022, "step": 24300 }, { "epoch": 3.8915470494417863, "grad_norm": 829787.5, "learning_rate": 1.757044125465178e-05, "loss": 3.6056, "step": 24400 }, { "epoch": 3.9074960127591707, "grad_norm": 776633.0625, "learning_rate": 1.743753322700691e-05, "loss": 3.5857, "step": 24500 }, { "epoch": 3.923444976076555, "grad_norm": 708855.1875, "learning_rate": 1.7304625199362044e-05, "loss": 3.6086, "step": 24600 }, { "epoch": 3.9393939393939394, "grad_norm": 906587.0, "learning_rate": 1.7171717171717173e-05, "loss": 3.5361, "step": 24700 }, { "epoch": 3.955342902711324, "grad_norm": 835950.4375, "learning_rate": 1.7038809144072303e-05, "loss": 3.5976, "step": 24800 }, { "epoch": 3.971291866028708, "grad_norm": 816570.6875, "learning_rate": 1.6905901116427433e-05, "loss": 3.596, "step": 24900 }, { "epoch": 3.9872408293460926, "grad_norm": 751378.9375, "learning_rate": 1.6772993088782562e-05, "loss": 3.4856, "step": 25000 }, { "epoch": 4.0, "eval_loss": 3.646216869354248, "eval_runtime": 212.927, "eval_samples_per_second": 134.426, "eval_steps_per_second": 4.203, "step": 25080 }, { "epoch": 4.003189792663477, "grad_norm": 735559.375, "learning_rate": 1.6640085061137695e-05, "loss": 3.5286, "step": 25100 }, { "epoch": 4.019138755980861, "grad_norm": 741852.3125, "learning_rate": 1.6507177033492825e-05, "loss": 3.5367, "step": 25200 }, { "epoch": 4.035087719298246, "grad_norm": 650216.625, "learning_rate": 1.6374269005847955e-05, "loss": 3.5056, "step": 25300 }, { "epoch": 4.05103668261563, "grad_norm": 717938.9375, "learning_rate": 1.6241360978203084e-05, "loss": 3.4996, "step": 25400 }, { "epoch": 4.0669856459330145, "grad_norm": 840426.9375, "learning_rate": 1.6108452950558214e-05, "loss": 3.5293, "step": 25500 }, { "epoch": 4.082934609250398, "grad_norm": 699668.5, "learning_rate": 1.5975544922913344e-05, "loss": 3.5122, "step": 25600 }, { "epoch": 4.098883572567783, "grad_norm": 649238.8125, "learning_rate": 1.5842636895268477e-05, "loss": 3.4949, "step": 25700 }, { "epoch": 4.114832535885167, "grad_norm": 669479.3125, "learning_rate": 1.5709728867623606e-05, "loss": 3.5515, "step": 25800 }, { "epoch": 4.130781499202552, "grad_norm": 645182.625, "learning_rate": 1.5576820839978733e-05, "loss": 3.5113, "step": 25900 }, { "epoch": 4.146730462519936, "grad_norm": 913136.6875, "learning_rate": 1.5443912812333866e-05, "loss": 3.5326, "step": 26000 }, { "epoch": 4.162679425837321, "grad_norm": 641233.3125, "learning_rate": 1.5311004784688995e-05, "loss": 3.4768, "step": 26100 }, { "epoch": 4.178628389154705, "grad_norm": 798713.5625, "learning_rate": 1.5178096757044127e-05, "loss": 3.4977, "step": 26200 }, { "epoch": 4.1945773524720895, "grad_norm": 719145.9375, "learning_rate": 1.5045188729399256e-05, "loss": 3.4549, "step": 26300 }, { "epoch": 4.2105263157894735, "grad_norm": 692601.9375, "learning_rate": 1.4912280701754386e-05, "loss": 3.5228, "step": 26400 }, { "epoch": 4.226475279106858, "grad_norm": 631408.4375, "learning_rate": 1.4779372674109517e-05, "loss": 3.4742, "step": 26500 }, { "epoch": 4.242424242424242, "grad_norm": 733137.0, "learning_rate": 1.4646464646464647e-05, "loss": 3.4242, "step": 26600 }, { "epoch": 4.258373205741627, "grad_norm": 741812.9375, "learning_rate": 1.4513556618819777e-05, "loss": 3.4598, "step": 26700 }, { "epoch": 4.274322169059011, "grad_norm": 682720.125, "learning_rate": 1.4380648591174908e-05, "loss": 3.4827, "step": 26800 }, { "epoch": 4.290271132376396, "grad_norm": 784646.125, "learning_rate": 1.4247740563530037e-05, "loss": 3.4712, "step": 26900 }, { "epoch": 4.30622009569378, "grad_norm": 713282.75, "learning_rate": 1.4114832535885167e-05, "loss": 3.5542, "step": 27000 }, { "epoch": 4.3221690590111645, "grad_norm": 699440.4375, "learning_rate": 1.3981924508240298e-05, "loss": 3.4572, "step": 27100 }, { "epoch": 4.3381180223285485, "grad_norm": 733063.875, "learning_rate": 1.3849016480595428e-05, "loss": 3.4106, "step": 27200 }, { "epoch": 4.354066985645933, "grad_norm": 799606.875, "learning_rate": 1.371610845295056e-05, "loss": 3.4189, "step": 27300 }, { "epoch": 4.370015948963317, "grad_norm": 722583.25, "learning_rate": 1.3583200425305689e-05, "loss": 3.448, "step": 27400 }, { "epoch": 4.385964912280702, "grad_norm": 761491.5625, "learning_rate": 1.3450292397660819e-05, "loss": 3.5248, "step": 27500 }, { "epoch": 4.401913875598086, "grad_norm": 633397.6875, "learning_rate": 1.331738437001595e-05, "loss": 3.5675, "step": 27600 }, { "epoch": 4.417862838915471, "grad_norm": 743160.4375, "learning_rate": 1.318447634237108e-05, "loss": 3.4983, "step": 27700 }, { "epoch": 4.433811802232855, "grad_norm": 689363.3125, "learning_rate": 1.305156831472621e-05, "loss": 3.5689, "step": 27800 }, { "epoch": 4.44976076555024, "grad_norm": 634674.0, "learning_rate": 1.291866028708134e-05, "loss": 3.4763, "step": 27900 }, { "epoch": 4.4657097288676235, "grad_norm": 682868.5, "learning_rate": 1.278575225943647e-05, "loss": 3.4725, "step": 28000 }, { "epoch": 4.481658692185008, "grad_norm": 839863.0, "learning_rate": 1.26528442317916e-05, "loss": 3.5255, "step": 28100 }, { "epoch": 4.497607655502392, "grad_norm": 840497.0625, "learning_rate": 1.2519936204146731e-05, "loss": 3.4914, "step": 28200 }, { "epoch": 4.513556618819777, "grad_norm": 798480.8125, "learning_rate": 1.2387028176501861e-05, "loss": 3.4787, "step": 28300 }, { "epoch": 4.529505582137161, "grad_norm": 776783.625, "learning_rate": 1.2254120148856992e-05, "loss": 3.4817, "step": 28400 }, { "epoch": 4.545454545454545, "grad_norm": 860058.625, "learning_rate": 1.2121212121212122e-05, "loss": 3.515, "step": 28500 }, { "epoch": 4.56140350877193, "grad_norm": 797709.0, "learning_rate": 1.1988304093567252e-05, "loss": 3.4715, "step": 28600 }, { "epoch": 4.577352472089315, "grad_norm": 686232.0, "learning_rate": 1.1855396065922381e-05, "loss": 3.493, "step": 28700 }, { "epoch": 4.5933014354066986, "grad_norm": 713671.875, "learning_rate": 1.1722488038277513e-05, "loss": 3.4767, "step": 28800 }, { "epoch": 4.6092503987240825, "grad_norm": 830858.875, "learning_rate": 1.1589580010632644e-05, "loss": 3.5506, "step": 28900 }, { "epoch": 4.625199362041467, "grad_norm": 681684.8125, "learning_rate": 1.1456671982987772e-05, "loss": 3.468, "step": 29000 }, { "epoch": 4.641148325358852, "grad_norm": 693863.25, "learning_rate": 1.1323763955342903e-05, "loss": 3.4847, "step": 29100 }, { "epoch": 4.657097288676236, "grad_norm": 612233.75, "learning_rate": 1.1190855927698035e-05, "loss": 3.3994, "step": 29200 }, { "epoch": 4.67304625199362, "grad_norm": 901251.25, "learning_rate": 1.1057947900053164e-05, "loss": 3.5144, "step": 29300 }, { "epoch": 4.688995215311005, "grad_norm": 742618.4375, "learning_rate": 1.0925039872408294e-05, "loss": 3.5388, "step": 29400 }, { "epoch": 4.70494417862839, "grad_norm": 797654.0, "learning_rate": 1.0792131844763423e-05, "loss": 3.4722, "step": 29500 }, { "epoch": 4.720893141945774, "grad_norm": 763737.9375, "learning_rate": 1.0659223817118555e-05, "loss": 3.5006, "step": 29600 }, { "epoch": 4.7368421052631575, "grad_norm": 702353.875, "learning_rate": 1.0526315789473684e-05, "loss": 3.4772, "step": 29700 }, { "epoch": 4.752791068580542, "grad_norm": 790482.1875, "learning_rate": 1.0393407761828814e-05, "loss": 3.4507, "step": 29800 }, { "epoch": 4.768740031897926, "grad_norm": 706455.9375, "learning_rate": 1.0260499734183945e-05, "loss": 3.4769, "step": 29900 }, { "epoch": 4.784688995215311, "grad_norm": 720554.3125, "learning_rate": 1.0127591706539077e-05, "loss": 3.4551, "step": 30000 }, { "epoch": 4.800637958532695, "grad_norm": 754827.6875, "learning_rate": 9.994683678894205e-06, "loss": 3.4982, "step": 30100 }, { "epoch": 4.81658692185008, "grad_norm": 696089.375, "learning_rate": 9.861775651249336e-06, "loss": 3.4637, "step": 30200 }, { "epoch": 4.832535885167464, "grad_norm": 737095.3125, "learning_rate": 9.728867623604466e-06, "loss": 3.4621, "step": 30300 }, { "epoch": 4.848484848484849, "grad_norm": 768262.125, "learning_rate": 9.595959595959595e-06, "loss": 3.48, "step": 30400 }, { "epoch": 4.8644338118022326, "grad_norm": 767420.5625, "learning_rate": 9.463051568314727e-06, "loss": 3.4463, "step": 30500 }, { "epoch": 4.880382775119617, "grad_norm": 706310.0, "learning_rate": 9.330143540669856e-06, "loss": 3.4668, "step": 30600 }, { "epoch": 4.896331738437001, "grad_norm": 828940.625, "learning_rate": 9.197235513024988e-06, "loss": 3.4377, "step": 30700 }, { "epoch": 4.912280701754386, "grad_norm": 788075.375, "learning_rate": 9.064327485380117e-06, "loss": 3.4144, "step": 30800 }, { "epoch": 4.92822966507177, "grad_norm": 805364.0625, "learning_rate": 8.931419457735247e-06, "loss": 3.4726, "step": 30900 }, { "epoch": 4.944178628389155, "grad_norm": 604472.75, "learning_rate": 8.798511430090378e-06, "loss": 3.4936, "step": 31000 }, { "epoch": 4.960127591706539, "grad_norm": 684835.3125, "learning_rate": 8.66560340244551e-06, "loss": 3.4637, "step": 31100 }, { "epoch": 4.976076555023924, "grad_norm": 684998.5, "learning_rate": 8.532695374800638e-06, "loss": 3.5321, "step": 31200 }, { "epoch": 4.992025518341308, "grad_norm": 711871.5625, "learning_rate": 8.399787347155769e-06, "loss": 3.4809, "step": 31300 }, { "epoch": 5.0, "eval_loss": 3.6019129753112793, "eval_runtime": 212.6906, "eval_samples_per_second": 134.576, "eval_steps_per_second": 4.208, "step": 31350 }, { "epoch": 5.007974481658692, "grad_norm": 1041463.125, "learning_rate": 8.266879319510899e-06, "loss": 3.4055, "step": 31400 }, { "epoch": 5.023923444976076, "grad_norm": 623676.3125, "learning_rate": 8.133971291866028e-06, "loss": 3.5315, "step": 31500 }, { "epoch": 5.039872408293461, "grad_norm": 619523.9375, "learning_rate": 8.00106326422116e-06, "loss": 3.4709, "step": 31600 }, { "epoch": 5.055821371610845, "grad_norm": 779379.875, "learning_rate": 7.86815523657629e-06, "loss": 3.3933, "step": 31700 }, { "epoch": 5.07177033492823, "grad_norm": 617667.5625, "learning_rate": 7.73524720893142e-06, "loss": 3.4738, "step": 31800 }, { "epoch": 5.087719298245614, "grad_norm": 691351.5625, "learning_rate": 7.602339181286549e-06, "loss": 3.4499, "step": 31900 }, { "epoch": 5.103668261562999, "grad_norm": 702913.25, "learning_rate": 7.469431153641681e-06, "loss": 3.4476, "step": 32000 }, { "epoch": 5.119617224880383, "grad_norm": 778967.4375, "learning_rate": 7.336523125996811e-06, "loss": 3.4689, "step": 32100 }, { "epoch": 5.1355661881977674, "grad_norm": 693687.3125, "learning_rate": 7.20361509835194e-06, "loss": 3.381, "step": 32200 }, { "epoch": 5.151515151515151, "grad_norm": 843375.5, "learning_rate": 7.0707070707070704e-06, "loss": 3.438, "step": 32300 }, { "epoch": 5.167464114832536, "grad_norm": 734730.5, "learning_rate": 6.937799043062202e-06, "loss": 3.4564, "step": 32400 }, { "epoch": 5.18341307814992, "grad_norm": 716158.375, "learning_rate": 6.804891015417332e-06, "loss": 3.4529, "step": 32500 }, { "epoch": 5.199362041467305, "grad_norm": 703040.625, "learning_rate": 6.671982987772461e-06, "loss": 3.391, "step": 32600 }, { "epoch": 5.215311004784689, "grad_norm": 636837.3125, "learning_rate": 6.5390749601275915e-06, "loss": 3.4174, "step": 32700 }, { "epoch": 5.231259968102074, "grad_norm": 661893.6875, "learning_rate": 6.406166932482723e-06, "loss": 3.4256, "step": 32800 }, { "epoch": 5.247208931419458, "grad_norm": 812641.0625, "learning_rate": 6.273258904837853e-06, "loss": 3.4981, "step": 32900 }, { "epoch": 5.2631578947368425, "grad_norm": 725753.125, "learning_rate": 6.140350877192982e-06, "loss": 3.5018, "step": 33000 }, { "epoch": 5.279106858054226, "grad_norm": 691768.3125, "learning_rate": 6.007442849548113e-06, "loss": 3.3969, "step": 33100 }, { "epoch": 5.295055821371611, "grad_norm": 776131.9375, "learning_rate": 5.874534821903243e-06, "loss": 3.4489, "step": 33200 }, { "epoch": 5.311004784688995, "grad_norm": 747268.25, "learning_rate": 5.741626794258374e-06, "loss": 3.4075, "step": 33300 }, { "epoch": 5.32695374800638, "grad_norm": 864468.1875, "learning_rate": 5.608718766613503e-06, "loss": 3.4443, "step": 33400 }, { "epoch": 5.342902711323764, "grad_norm": 729350.5, "learning_rate": 5.475810738968634e-06, "loss": 3.4189, "step": 33500 }, { "epoch": 5.358851674641148, "grad_norm": 687491.0625, "learning_rate": 5.342902711323764e-06, "loss": 3.4167, "step": 33600 }, { "epoch": 5.374800637958533, "grad_norm": 682089.125, "learning_rate": 5.209994683678895e-06, "loss": 3.3691, "step": 33700 }, { "epoch": 5.3907496012759175, "grad_norm": 698959.25, "learning_rate": 5.077086656034024e-06, "loss": 3.4449, "step": 33800 }, { "epoch": 5.4066985645933014, "grad_norm": 671919.375, "learning_rate": 4.944178628389155e-06, "loss": 3.4421, "step": 33900 }, { "epoch": 5.422647527910685, "grad_norm": 647332.125, "learning_rate": 4.811270600744285e-06, "loss": 3.4529, "step": 34000 }, { "epoch": 5.43859649122807, "grad_norm": 686242.25, "learning_rate": 4.678362573099415e-06, "loss": 3.3381, "step": 34100 }, { "epoch": 5.454545454545454, "grad_norm": 936387.5, "learning_rate": 4.5454545454545455e-06, "loss": 3.4765, "step": 34200 }, { "epoch": 5.470494417862839, "grad_norm": 662910.3125, "learning_rate": 4.412546517809676e-06, "loss": 3.3729, "step": 34300 }, { "epoch": 5.486443381180223, "grad_norm": 671547.0, "learning_rate": 4.2796384901648065e-06, "loss": 3.4489, "step": 34400 }, { "epoch": 5.502392344497608, "grad_norm": 777179.3125, "learning_rate": 4.146730462519936e-06, "loss": 3.424, "step": 34500 }, { "epoch": 5.518341307814992, "grad_norm": 628742.25, "learning_rate": 4.013822434875067e-06, "loss": 3.4128, "step": 34600 }, { "epoch": 5.5342902711323765, "grad_norm": 794101.75, "learning_rate": 3.880914407230197e-06, "loss": 3.3938, "step": 34700 }, { "epoch": 5.55023923444976, "grad_norm": 720820.4375, "learning_rate": 3.7480063795853268e-06, "loss": 3.3622, "step": 34800 }, { "epoch": 5.566188197767145, "grad_norm": 686912.5, "learning_rate": 3.6150983519404573e-06, "loss": 3.3896, "step": 34900 }, { "epoch": 5.582137161084529, "grad_norm": 758477.0, "learning_rate": 3.4821903242955873e-06, "loss": 3.4494, "step": 35000 }, { "epoch": 5.598086124401914, "grad_norm": 828910.875, "learning_rate": 3.349282296650718e-06, "loss": 3.4204, "step": 35100 }, { "epoch": 5.614035087719298, "grad_norm": 650998.8125, "learning_rate": 3.216374269005848e-06, "loss": 3.4115, "step": 35200 }, { "epoch": 5.629984051036683, "grad_norm": 681491.375, "learning_rate": 3.0834662413609784e-06, "loss": 3.3861, "step": 35300 }, { "epoch": 5.645933014354067, "grad_norm": 749556.875, "learning_rate": 2.9505582137161084e-06, "loss": 3.4175, "step": 35400 }, { "epoch": 5.6618819776714515, "grad_norm": 677371.6875, "learning_rate": 2.817650186071239e-06, "loss": 3.4571, "step": 35500 }, { "epoch": 5.6778309409888355, "grad_norm": 730101.5625, "learning_rate": 2.684742158426369e-06, "loss": 3.3966, "step": 35600 }, { "epoch": 5.69377990430622, "grad_norm": 657353.375, "learning_rate": 2.5518341307814995e-06, "loss": 3.4639, "step": 35700 }, { "epoch": 5.709728867623604, "grad_norm": 734038.0, "learning_rate": 2.4189261031366296e-06, "loss": 3.3982, "step": 35800 }, { "epoch": 5.725677830940989, "grad_norm": 719495.625, "learning_rate": 2.28601807549176e-06, "loss": 3.4221, "step": 35900 }, { "epoch": 5.741626794258373, "grad_norm": 686361.0625, "learning_rate": 2.15311004784689e-06, "loss": 3.4886, "step": 36000 }, { "epoch": 5.757575757575758, "grad_norm": 804130.625, "learning_rate": 2.0202020202020206e-06, "loss": 3.4364, "step": 36100 }, { "epoch": 5.773524720893142, "grad_norm": 713737.5625, "learning_rate": 1.8872939925571505e-06, "loss": 3.391, "step": 36200 }, { "epoch": 5.7894736842105265, "grad_norm": 734503.125, "learning_rate": 1.7543859649122807e-06, "loss": 3.4291, "step": 36300 }, { "epoch": 5.8054226475279105, "grad_norm": 673341.3125, "learning_rate": 1.621477937267411e-06, "loss": 3.4816, "step": 36400 }, { "epoch": 5.821371610845295, "grad_norm": 643799.3125, "learning_rate": 1.4885699096225413e-06, "loss": 3.4221, "step": 36500 }, { "epoch": 5.837320574162679, "grad_norm": 641417.125, "learning_rate": 1.3556618819776716e-06, "loss": 3.4331, "step": 36600 }, { "epoch": 5.853269537480064, "grad_norm": 707790.25, "learning_rate": 1.2227538543328019e-06, "loss": 3.3936, "step": 36700 }, { "epoch": 5.869218500797448, "grad_norm": 734012.3125, "learning_rate": 1.089845826687932e-06, "loss": 3.4258, "step": 36800 }, { "epoch": 5.885167464114833, "grad_norm": 672716.0, "learning_rate": 9.569377990430622e-07, "loss": 3.4688, "step": 36900 }, { "epoch": 5.901116427432217, "grad_norm": 691052.75, "learning_rate": 8.240297713981925e-07, "loss": 3.3553, "step": 37000 }, { "epoch": 5.917065390749602, "grad_norm": 714107.875, "learning_rate": 6.911217437533228e-07, "loss": 3.4735, "step": 37100 }, { "epoch": 5.9330143540669855, "grad_norm": 783142.9375, "learning_rate": 5.582137161084529e-07, "loss": 3.4707, "step": 37200 }, { "epoch": 5.94896331738437, "grad_norm": 793443.25, "learning_rate": 4.2530568846358327e-07, "loss": 3.4279, "step": 37300 }, { "epoch": 5.964912280701754, "grad_norm": 828328.5625, "learning_rate": 2.9239766081871344e-07, "loss": 3.4171, "step": 37400 }, { "epoch": 5.980861244019139, "grad_norm": 826741.5625, "learning_rate": 1.5948963317384372e-07, "loss": 3.5064, "step": 37500 }, { "epoch": 5.996810207336523, "grad_norm": 698366.375, "learning_rate": 2.6581605528973954e-08, "loss": 3.42, "step": 37600 }, { "epoch": 6.0, "eval_loss": 3.615572929382324, "eval_runtime": 213.1542, "eval_samples_per_second": 134.283, "eval_steps_per_second": 4.199, "step": 37620 } ], "logging_steps": 100, "max_steps": 37620, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.65721373251031e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }