ThilinaGunathilaka's picture
Upload folder using huggingface_hub
25103cc verified
raw
history blame
64.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 37620,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01594896331738437,
"grad_norm": 793786.1875,
"learning_rate": 4.9867091972355135e-05,
"loss": 4.3231,
"step": 100
},
{
"epoch": 0.03189792663476874,
"grad_norm": 705751.0625,
"learning_rate": 4.973418394471026e-05,
"loss": 4.1867,
"step": 200
},
{
"epoch": 0.04784688995215311,
"grad_norm": 761077.1875,
"learning_rate": 4.9601275917065395e-05,
"loss": 4.225,
"step": 300
},
{
"epoch": 0.06379585326953748,
"grad_norm": 660073.9375,
"learning_rate": 4.946836788942053e-05,
"loss": 4.2529,
"step": 400
},
{
"epoch": 0.07974481658692185,
"grad_norm": 701404.4375,
"learning_rate": 4.9335459861775654e-05,
"loss": 4.2985,
"step": 500
},
{
"epoch": 0.09569377990430622,
"grad_norm": 776949.5625,
"learning_rate": 4.920255183413078e-05,
"loss": 4.2124,
"step": 600
},
{
"epoch": 0.11164274322169059,
"grad_norm": 703487.375,
"learning_rate": 4.906964380648591e-05,
"loss": 4.224,
"step": 700
},
{
"epoch": 0.12759170653907495,
"grad_norm": 727441.625,
"learning_rate": 4.893673577884104e-05,
"loss": 4.3259,
"step": 800
},
{
"epoch": 0.14354066985645933,
"grad_norm": 772143.4375,
"learning_rate": 4.880382775119617e-05,
"loss": 4.1641,
"step": 900
},
{
"epoch": 0.1594896331738437,
"grad_norm": 834641.6875,
"learning_rate": 4.8670919723551306e-05,
"loss": 4.2651,
"step": 1000
},
{
"epoch": 0.17543859649122806,
"grad_norm": 743223.0,
"learning_rate": 4.853801169590643e-05,
"loss": 4.1867,
"step": 1100
},
{
"epoch": 0.19138755980861244,
"grad_norm": 643478.125,
"learning_rate": 4.8405103668261565e-05,
"loss": 4.1816,
"step": 1200
},
{
"epoch": 0.20733652312599682,
"grad_norm": 656405.6875,
"learning_rate": 4.82721956406167e-05,
"loss": 4.1677,
"step": 1300
},
{
"epoch": 0.22328548644338117,
"grad_norm": 714701.3125,
"learning_rate": 4.813928761297183e-05,
"loss": 4.1595,
"step": 1400
},
{
"epoch": 0.23923444976076555,
"grad_norm": 739746.875,
"learning_rate": 4.800637958532696e-05,
"loss": 4.1319,
"step": 1500
},
{
"epoch": 0.2551834130781499,
"grad_norm": 736049.3125,
"learning_rate": 4.787347155768209e-05,
"loss": 4.1543,
"step": 1600
},
{
"epoch": 0.2711323763955343,
"grad_norm": 668736.5,
"learning_rate": 4.7740563530037217e-05,
"loss": 4.2052,
"step": 1700
},
{
"epoch": 0.28708133971291866,
"grad_norm": 655993.0625,
"learning_rate": 4.760765550239234e-05,
"loss": 4.1642,
"step": 1800
},
{
"epoch": 0.30303030303030304,
"grad_norm": 732162.3125,
"learning_rate": 4.7474747474747476e-05,
"loss": 4.1464,
"step": 1900
},
{
"epoch": 0.3189792663476874,
"grad_norm": 687130.875,
"learning_rate": 4.734183944710261e-05,
"loss": 4.181,
"step": 2000
},
{
"epoch": 0.3349282296650718,
"grad_norm": 823373.1875,
"learning_rate": 4.7208931419457735e-05,
"loss": 4.1591,
"step": 2100
},
{
"epoch": 0.3508771929824561,
"grad_norm": 716504.625,
"learning_rate": 4.707602339181287e-05,
"loss": 4.1391,
"step": 2200
},
{
"epoch": 0.3668261562998405,
"grad_norm": 714200.6875,
"learning_rate": 4.6943115364168e-05,
"loss": 4.0962,
"step": 2300
},
{
"epoch": 0.3827751196172249,
"grad_norm": 662553.0,
"learning_rate": 4.681020733652313e-05,
"loss": 4.0964,
"step": 2400
},
{
"epoch": 0.39872408293460926,
"grad_norm": 767714.875,
"learning_rate": 4.667729930887826e-05,
"loss": 4.1174,
"step": 2500
},
{
"epoch": 0.41467304625199364,
"grad_norm": 763852.75,
"learning_rate": 4.6544391281233393e-05,
"loss": 4.0442,
"step": 2600
},
{
"epoch": 0.430622009569378,
"grad_norm": 733045.8125,
"learning_rate": 4.641148325358852e-05,
"loss": 4.112,
"step": 2700
},
{
"epoch": 0.44657097288676234,
"grad_norm": 752065.875,
"learning_rate": 4.6278575225943646e-05,
"loss": 4.1223,
"step": 2800
},
{
"epoch": 0.4625199362041467,
"grad_norm": 675739.8125,
"learning_rate": 4.614566719829878e-05,
"loss": 4.1608,
"step": 2900
},
{
"epoch": 0.4784688995215311,
"grad_norm": 666147.375,
"learning_rate": 4.6012759170653905e-05,
"loss": 4.0803,
"step": 3000
},
{
"epoch": 0.4944178628389155,
"grad_norm": 594287.9375,
"learning_rate": 4.587985114300904e-05,
"loss": 4.045,
"step": 3100
},
{
"epoch": 0.5103668261562998,
"grad_norm": 778230.25,
"learning_rate": 4.574694311536417e-05,
"loss": 4.1268,
"step": 3200
},
{
"epoch": 0.5263157894736842,
"grad_norm": 683312.0625,
"learning_rate": 4.56140350877193e-05,
"loss": 4.0424,
"step": 3300
},
{
"epoch": 0.5422647527910686,
"grad_norm": 640858.25,
"learning_rate": 4.548112706007443e-05,
"loss": 4.0235,
"step": 3400
},
{
"epoch": 0.5582137161084529,
"grad_norm": 680217.875,
"learning_rate": 4.5348219032429564e-05,
"loss": 4.1423,
"step": 3500
},
{
"epoch": 0.5741626794258373,
"grad_norm": 708408.4375,
"learning_rate": 4.521531100478469e-05,
"loss": 4.0291,
"step": 3600
},
{
"epoch": 0.5901116427432217,
"grad_norm": 663333.0,
"learning_rate": 4.508240297713982e-05,
"loss": 3.9998,
"step": 3700
},
{
"epoch": 0.6060606060606061,
"grad_norm": 652592.0,
"learning_rate": 4.494949494949495e-05,
"loss": 4.0424,
"step": 3800
},
{
"epoch": 0.6220095693779905,
"grad_norm": 667416.9375,
"learning_rate": 4.481658692185008e-05,
"loss": 4.053,
"step": 3900
},
{
"epoch": 0.6379585326953748,
"grad_norm": 674108.4375,
"learning_rate": 4.468367889420521e-05,
"loss": 4.0748,
"step": 4000
},
{
"epoch": 0.6539074960127592,
"grad_norm": 714003.625,
"learning_rate": 4.455077086656034e-05,
"loss": 3.9807,
"step": 4100
},
{
"epoch": 0.6698564593301436,
"grad_norm": 693166.4375,
"learning_rate": 4.4417862838915475e-05,
"loss": 3.993,
"step": 4200
},
{
"epoch": 0.6858054226475279,
"grad_norm": 689997.125,
"learning_rate": 4.42849548112706e-05,
"loss": 4.0593,
"step": 4300
},
{
"epoch": 0.7017543859649122,
"grad_norm": 741118.5625,
"learning_rate": 4.4152046783625734e-05,
"loss": 4.0409,
"step": 4400
},
{
"epoch": 0.7177033492822966,
"grad_norm": 668626.9375,
"learning_rate": 4.401913875598087e-05,
"loss": 4.06,
"step": 4500
},
{
"epoch": 0.733652312599681,
"grad_norm": 771064.125,
"learning_rate": 4.388623072833599e-05,
"loss": 4.0122,
"step": 4600
},
{
"epoch": 0.7496012759170654,
"grad_norm": 756131.375,
"learning_rate": 4.3753322700691126e-05,
"loss": 4.043,
"step": 4700
},
{
"epoch": 0.7655502392344498,
"grad_norm": 737817.5625,
"learning_rate": 4.362041467304626e-05,
"loss": 4.0213,
"step": 4800
},
{
"epoch": 0.7814992025518341,
"grad_norm": 758023.5625,
"learning_rate": 4.3487506645401385e-05,
"loss": 3.9778,
"step": 4900
},
{
"epoch": 0.7974481658692185,
"grad_norm": 672895.1875,
"learning_rate": 4.335459861775651e-05,
"loss": 4.0434,
"step": 5000
},
{
"epoch": 0.8133971291866029,
"grad_norm": 710937.375,
"learning_rate": 4.3221690590111645e-05,
"loss": 4.0117,
"step": 5100
},
{
"epoch": 0.8293460925039873,
"grad_norm": 773563.875,
"learning_rate": 4.308878256246677e-05,
"loss": 4.0516,
"step": 5200
},
{
"epoch": 0.8452950558213717,
"grad_norm": 750733.4375,
"learning_rate": 4.2955874534821904e-05,
"loss": 3.9798,
"step": 5300
},
{
"epoch": 0.861244019138756,
"grad_norm": 706317.3125,
"learning_rate": 4.282296650717704e-05,
"loss": 3.9845,
"step": 5400
},
{
"epoch": 0.8771929824561403,
"grad_norm": 710855.75,
"learning_rate": 4.269005847953216e-05,
"loss": 4.0295,
"step": 5500
},
{
"epoch": 0.8931419457735247,
"grad_norm": 712988.9375,
"learning_rate": 4.2557150451887296e-05,
"loss": 3.9732,
"step": 5600
},
{
"epoch": 0.9090909090909091,
"grad_norm": 628492.75,
"learning_rate": 4.242424242424243e-05,
"loss": 4.0012,
"step": 5700
},
{
"epoch": 0.9250398724082934,
"grad_norm": 821738.375,
"learning_rate": 4.2291334396597556e-05,
"loss": 3.9867,
"step": 5800
},
{
"epoch": 0.9409888357256778,
"grad_norm": 720818.125,
"learning_rate": 4.215842636895269e-05,
"loss": 3.962,
"step": 5900
},
{
"epoch": 0.9569377990430622,
"grad_norm": 698428.0625,
"learning_rate": 4.2025518341307815e-05,
"loss": 4.0095,
"step": 6000
},
{
"epoch": 0.9728867623604466,
"grad_norm": 769185.75,
"learning_rate": 4.189261031366295e-05,
"loss": 3.9469,
"step": 6100
},
{
"epoch": 0.988835725677831,
"grad_norm": 706422.25,
"learning_rate": 4.1759702286018074e-05,
"loss": 3.9466,
"step": 6200
},
{
"epoch": 1.0,
"eval_loss": 3.8981289863586426,
"eval_runtime": 213.136,
"eval_samples_per_second": 134.295,
"eval_steps_per_second": 4.199,
"step": 6270
},
{
"epoch": 1.0047846889952152,
"grad_norm": 664245.5,
"learning_rate": 4.162679425837321e-05,
"loss": 3.9397,
"step": 6300
},
{
"epoch": 1.0207336523125996,
"grad_norm": 790251.5625,
"learning_rate": 4.149388623072834e-05,
"loss": 3.7918,
"step": 6400
},
{
"epoch": 1.036682615629984,
"grad_norm": 813129.1875,
"learning_rate": 4.1360978203083467e-05,
"loss": 3.8563,
"step": 6500
},
{
"epoch": 1.0526315789473684,
"grad_norm": 695871.5,
"learning_rate": 4.12280701754386e-05,
"loss": 3.959,
"step": 6600
},
{
"epoch": 1.0685805422647527,
"grad_norm": 640218.3125,
"learning_rate": 4.109516214779373e-05,
"loss": 3.9882,
"step": 6700
},
{
"epoch": 1.0845295055821371,
"grad_norm": 709886.125,
"learning_rate": 4.096225412014886e-05,
"loss": 3.9328,
"step": 6800
},
{
"epoch": 1.1004784688995215,
"grad_norm": 663732.1875,
"learning_rate": 4.082934609250399e-05,
"loss": 3.947,
"step": 6900
},
{
"epoch": 1.1164274322169059,
"grad_norm": 649781.9375,
"learning_rate": 4.069643806485912e-05,
"loss": 3.8735,
"step": 7000
},
{
"epoch": 1.1323763955342903,
"grad_norm": 788934.4375,
"learning_rate": 4.056353003721425e-05,
"loss": 3.8548,
"step": 7100
},
{
"epoch": 1.1483253588516746,
"grad_norm": 688279.3125,
"learning_rate": 4.043062200956938e-05,
"loss": 3.8844,
"step": 7200
},
{
"epoch": 1.164274322169059,
"grad_norm": 698958.0,
"learning_rate": 4.029771398192451e-05,
"loss": 3.878,
"step": 7300
},
{
"epoch": 1.1802232854864434,
"grad_norm": 777544.8125,
"learning_rate": 4.016480595427964e-05,
"loss": 3.8299,
"step": 7400
},
{
"epoch": 1.1961722488038278,
"grad_norm": 768888.3125,
"learning_rate": 4.003189792663477e-05,
"loss": 3.859,
"step": 7500
},
{
"epoch": 1.2121212121212122,
"grad_norm": 703599.8125,
"learning_rate": 3.98989898989899e-05,
"loss": 3.9008,
"step": 7600
},
{
"epoch": 1.2280701754385965,
"grad_norm": 684616.75,
"learning_rate": 3.976608187134503e-05,
"loss": 3.8917,
"step": 7700
},
{
"epoch": 1.244019138755981,
"grad_norm": 754650.9375,
"learning_rate": 3.963317384370016e-05,
"loss": 3.7987,
"step": 7800
},
{
"epoch": 1.2599681020733653,
"grad_norm": 833289.8125,
"learning_rate": 3.9500265816055295e-05,
"loss": 3.8097,
"step": 7900
},
{
"epoch": 1.2759170653907497,
"grad_norm": 712939.125,
"learning_rate": 3.936735778841042e-05,
"loss": 3.91,
"step": 8000
},
{
"epoch": 1.291866028708134,
"grad_norm": 691034.4375,
"learning_rate": 3.9234449760765554e-05,
"loss": 3.8607,
"step": 8100
},
{
"epoch": 1.3078149920255182,
"grad_norm": 777656.1875,
"learning_rate": 3.910154173312068e-05,
"loss": 3.8294,
"step": 8200
},
{
"epoch": 1.3237639553429026,
"grad_norm": 779268.6875,
"learning_rate": 3.896863370547581e-05,
"loss": 3.839,
"step": 8300
},
{
"epoch": 1.339712918660287,
"grad_norm": 737519.0,
"learning_rate": 3.883572567783094e-05,
"loss": 3.8773,
"step": 8400
},
{
"epoch": 1.3556618819776713,
"grad_norm": 722279.3125,
"learning_rate": 3.870281765018607e-05,
"loss": 3.8351,
"step": 8500
},
{
"epoch": 1.3716108452950557,
"grad_norm": 763525.9375,
"learning_rate": 3.8569909622541206e-05,
"loss": 3.8517,
"step": 8600
},
{
"epoch": 1.38755980861244,
"grad_norm": 715289.3125,
"learning_rate": 3.843700159489633e-05,
"loss": 3.8272,
"step": 8700
},
{
"epoch": 1.4035087719298245,
"grad_norm": 656186.625,
"learning_rate": 3.8304093567251465e-05,
"loss": 3.8894,
"step": 8800
},
{
"epoch": 1.4194577352472089,
"grad_norm": 756528.875,
"learning_rate": 3.81711855396066e-05,
"loss": 3.8387,
"step": 8900
},
{
"epoch": 1.4354066985645932,
"grad_norm": 716015.0,
"learning_rate": 3.8038277511961725e-05,
"loss": 3.7842,
"step": 9000
},
{
"epoch": 1.4513556618819776,
"grad_norm": 721006.3125,
"learning_rate": 3.790536948431686e-05,
"loss": 3.7769,
"step": 9100
},
{
"epoch": 1.467304625199362,
"grad_norm": 800538.1875,
"learning_rate": 3.7772461456671984e-05,
"loss": 3.8393,
"step": 9200
},
{
"epoch": 1.4832535885167464,
"grad_norm": 699156.0,
"learning_rate": 3.763955342902711e-05,
"loss": 3.7527,
"step": 9300
},
{
"epoch": 1.4992025518341308,
"grad_norm": 699306.375,
"learning_rate": 3.750664540138224e-05,
"loss": 3.8306,
"step": 9400
},
{
"epoch": 1.5151515151515151,
"grad_norm": 689892.5,
"learning_rate": 3.7373737373737376e-05,
"loss": 3.8582,
"step": 9500
},
{
"epoch": 1.5311004784688995,
"grad_norm": 712134.3125,
"learning_rate": 3.72408293460925e-05,
"loss": 3.9048,
"step": 9600
},
{
"epoch": 1.547049441786284,
"grad_norm": 611158.625,
"learning_rate": 3.7107921318447635e-05,
"loss": 3.8315,
"step": 9700
},
{
"epoch": 1.5629984051036683,
"grad_norm": 814951.875,
"learning_rate": 3.697501329080277e-05,
"loss": 3.8433,
"step": 9800
},
{
"epoch": 1.5789473684210527,
"grad_norm": 638232.25,
"learning_rate": 3.6842105263157895e-05,
"loss": 3.8697,
"step": 9900
},
{
"epoch": 1.594896331738437,
"grad_norm": 717277.8125,
"learning_rate": 3.670919723551303e-05,
"loss": 3.8447,
"step": 10000
},
{
"epoch": 1.6108452950558214,
"grad_norm": 623000.1875,
"learning_rate": 3.657628920786816e-05,
"loss": 3.9104,
"step": 10100
},
{
"epoch": 1.6267942583732058,
"grad_norm": 737050.875,
"learning_rate": 3.644338118022329e-05,
"loss": 3.8208,
"step": 10200
},
{
"epoch": 1.6427432216905902,
"grad_norm": 766050.0,
"learning_rate": 3.631047315257842e-05,
"loss": 3.8454,
"step": 10300
},
{
"epoch": 1.6586921850079746,
"grad_norm": 681873.25,
"learning_rate": 3.6177565124933546e-05,
"loss": 3.8284,
"step": 10400
},
{
"epoch": 1.674641148325359,
"grad_norm": 636321.0625,
"learning_rate": 3.604465709728867e-05,
"loss": 3.8191,
"step": 10500
},
{
"epoch": 1.6905901116427433,
"grad_norm": 751694.875,
"learning_rate": 3.5911749069643806e-05,
"loss": 3.8152,
"step": 10600
},
{
"epoch": 1.7065390749601277,
"grad_norm": 779390.25,
"learning_rate": 3.577884104199894e-05,
"loss": 3.8235,
"step": 10700
},
{
"epoch": 1.722488038277512,
"grad_norm": 780193.4375,
"learning_rate": 3.5645933014354065e-05,
"loss": 3.8109,
"step": 10800
},
{
"epoch": 1.7384370015948964,
"grad_norm": 678532.75,
"learning_rate": 3.55130249867092e-05,
"loss": 3.8017,
"step": 10900
},
{
"epoch": 1.7543859649122808,
"grad_norm": 733045.375,
"learning_rate": 3.538011695906433e-05,
"loss": 3.8518,
"step": 11000
},
{
"epoch": 1.7703349282296652,
"grad_norm": 646417.375,
"learning_rate": 3.5247208931419464e-05,
"loss": 3.7623,
"step": 11100
},
{
"epoch": 1.7862838915470496,
"grad_norm": 641868.25,
"learning_rate": 3.511430090377459e-05,
"loss": 3.7536,
"step": 11200
},
{
"epoch": 1.802232854864434,
"grad_norm": 743804.375,
"learning_rate": 3.498139287612972e-05,
"loss": 3.8056,
"step": 11300
},
{
"epoch": 1.8181818181818183,
"grad_norm": 725540.5625,
"learning_rate": 3.484848484848485e-05,
"loss": 3.8078,
"step": 11400
},
{
"epoch": 1.8341307814992025,
"grad_norm": 735264.125,
"learning_rate": 3.4715576820839976e-05,
"loss": 3.8247,
"step": 11500
},
{
"epoch": 1.8500797448165869,
"grad_norm": 708785.5625,
"learning_rate": 3.458266879319511e-05,
"loss": 3.7898,
"step": 11600
},
{
"epoch": 1.8660287081339713,
"grad_norm": 864416.3125,
"learning_rate": 3.444976076555024e-05,
"loss": 3.8642,
"step": 11700
},
{
"epoch": 1.8819776714513556,
"grad_norm": 693382.5625,
"learning_rate": 3.431685273790537e-05,
"loss": 3.7433,
"step": 11800
},
{
"epoch": 1.89792663476874,
"grad_norm": 841755.25,
"learning_rate": 3.41839447102605e-05,
"loss": 3.8562,
"step": 11900
},
{
"epoch": 1.9138755980861244,
"grad_norm": 662765.125,
"learning_rate": 3.4051036682615634e-05,
"loss": 3.8703,
"step": 12000
},
{
"epoch": 1.9298245614035088,
"grad_norm": 707709.75,
"learning_rate": 3.391812865497076e-05,
"loss": 3.8402,
"step": 12100
},
{
"epoch": 1.9457735247208932,
"grad_norm": 616722.875,
"learning_rate": 3.3785220627325893e-05,
"loss": 3.8432,
"step": 12200
},
{
"epoch": 1.9617224880382775,
"grad_norm": 795238.625,
"learning_rate": 3.3652312599681026e-05,
"loss": 3.7319,
"step": 12300
},
{
"epoch": 1.977671451355662,
"grad_norm": 709130.0,
"learning_rate": 3.351940457203615e-05,
"loss": 3.7628,
"step": 12400
},
{
"epoch": 1.9936204146730463,
"grad_norm": 789321.375,
"learning_rate": 3.3386496544391286e-05,
"loss": 3.7821,
"step": 12500
},
{
"epoch": 2.0,
"eval_loss": 3.800079584121704,
"eval_runtime": 212.9573,
"eval_samples_per_second": 134.407,
"eval_steps_per_second": 4.203,
"step": 12540
},
{
"epoch": 2.0095693779904304,
"grad_norm": 862786.0,
"learning_rate": 3.325358851674641e-05,
"loss": 3.7512,
"step": 12600
},
{
"epoch": 2.025518341307815,
"grad_norm": 772642.5625,
"learning_rate": 3.312068048910154e-05,
"loss": 3.6611,
"step": 12700
},
{
"epoch": 2.041467304625199,
"grad_norm": 706907.1875,
"learning_rate": 3.298777246145667e-05,
"loss": 3.7484,
"step": 12800
},
{
"epoch": 2.0574162679425836,
"grad_norm": 940896.375,
"learning_rate": 3.2854864433811804e-05,
"loss": 3.7545,
"step": 12900
},
{
"epoch": 2.073365231259968,
"grad_norm": 670013.875,
"learning_rate": 3.272195640616693e-05,
"loss": 3.6504,
"step": 13000
},
{
"epoch": 2.0893141945773523,
"grad_norm": 702168.0,
"learning_rate": 3.2589048378522064e-05,
"loss": 3.6133,
"step": 13100
},
{
"epoch": 2.1052631578947367,
"grad_norm": 741310.75,
"learning_rate": 3.24561403508772e-05,
"loss": 3.7202,
"step": 13200
},
{
"epoch": 2.121212121212121,
"grad_norm": 630812.625,
"learning_rate": 3.232323232323233e-05,
"loss": 3.7131,
"step": 13300
},
{
"epoch": 2.1371610845295055,
"grad_norm": 736768.125,
"learning_rate": 3.2190324295587456e-05,
"loss": 3.6982,
"step": 13400
},
{
"epoch": 2.15311004784689,
"grad_norm": 743122.8125,
"learning_rate": 3.205741626794259e-05,
"loss": 3.7196,
"step": 13500
},
{
"epoch": 2.1690590111642742,
"grad_norm": 685713.25,
"learning_rate": 3.1924508240297715e-05,
"loss": 3.7312,
"step": 13600
},
{
"epoch": 2.1850079744816586,
"grad_norm": 829695.75,
"learning_rate": 3.179160021265284e-05,
"loss": 3.6914,
"step": 13700
},
{
"epoch": 2.200956937799043,
"grad_norm": 640866.6875,
"learning_rate": 3.1658692185007975e-05,
"loss": 3.8376,
"step": 13800
},
{
"epoch": 2.2169059011164274,
"grad_norm": 705478.25,
"learning_rate": 3.152578415736311e-05,
"loss": 3.7294,
"step": 13900
},
{
"epoch": 2.2328548644338118,
"grad_norm": 664668.8125,
"learning_rate": 3.1392876129718234e-05,
"loss": 3.7046,
"step": 14000
},
{
"epoch": 2.248803827751196,
"grad_norm": 653658.25,
"learning_rate": 3.125996810207337e-05,
"loss": 3.7142,
"step": 14100
},
{
"epoch": 2.2647527910685805,
"grad_norm": 737460.0625,
"learning_rate": 3.11270600744285e-05,
"loss": 3.6449,
"step": 14200
},
{
"epoch": 2.280701754385965,
"grad_norm": 657366.8125,
"learning_rate": 3.0994152046783626e-05,
"loss": 3.7051,
"step": 14300
},
{
"epoch": 2.2966507177033493,
"grad_norm": 734922.0625,
"learning_rate": 3.086124401913876e-05,
"loss": 3.6793,
"step": 14400
},
{
"epoch": 2.3125996810207337,
"grad_norm": 636888.125,
"learning_rate": 3.072833599149389e-05,
"loss": 3.764,
"step": 14500
},
{
"epoch": 2.328548644338118,
"grad_norm": 776567.125,
"learning_rate": 3.059542796384902e-05,
"loss": 3.6952,
"step": 14600
},
{
"epoch": 2.3444976076555024,
"grad_norm": 678031.5,
"learning_rate": 3.0462519936204148e-05,
"loss": 3.7086,
"step": 14700
},
{
"epoch": 2.360446570972887,
"grad_norm": 702883.9375,
"learning_rate": 3.032961190855928e-05,
"loss": 3.6974,
"step": 14800
},
{
"epoch": 2.376395534290271,
"grad_norm": 640335.375,
"learning_rate": 3.0196703880914407e-05,
"loss": 3.6555,
"step": 14900
},
{
"epoch": 2.3923444976076556,
"grad_norm": 686070.0625,
"learning_rate": 3.0063795853269537e-05,
"loss": 3.7167,
"step": 15000
},
{
"epoch": 2.40829346092504,
"grad_norm": 676044.25,
"learning_rate": 2.993088782562467e-05,
"loss": 3.6554,
"step": 15100
},
{
"epoch": 2.4242424242424243,
"grad_norm": 767818.6875,
"learning_rate": 2.9797979797979796e-05,
"loss": 3.693,
"step": 15200
},
{
"epoch": 2.4401913875598087,
"grad_norm": 739911.0625,
"learning_rate": 2.966507177033493e-05,
"loss": 3.6431,
"step": 15300
},
{
"epoch": 2.456140350877193,
"grad_norm": 771256.3125,
"learning_rate": 2.9532163742690062e-05,
"loss": 3.6216,
"step": 15400
},
{
"epoch": 2.4720893141945774,
"grad_norm": 773173.375,
"learning_rate": 2.939925571504519e-05,
"loss": 3.7742,
"step": 15500
},
{
"epoch": 2.488038277511962,
"grad_norm": 762888.3125,
"learning_rate": 2.9266347687400318e-05,
"loss": 3.7236,
"step": 15600
},
{
"epoch": 2.503987240829346,
"grad_norm": 879857.0625,
"learning_rate": 2.913343965975545e-05,
"loss": 3.7253,
"step": 15700
},
{
"epoch": 2.5199362041467306,
"grad_norm": 733285.0625,
"learning_rate": 2.9000531632110584e-05,
"loss": 3.7166,
"step": 15800
},
{
"epoch": 2.535885167464115,
"grad_norm": 833014.3125,
"learning_rate": 2.886762360446571e-05,
"loss": 3.6705,
"step": 15900
},
{
"epoch": 2.5518341307814993,
"grad_norm": 736943.4375,
"learning_rate": 2.8734715576820844e-05,
"loss": 3.7253,
"step": 16000
},
{
"epoch": 2.5677830940988837,
"grad_norm": 708644.4375,
"learning_rate": 2.8601807549175973e-05,
"loss": 3.6672,
"step": 16100
},
{
"epoch": 2.583732057416268,
"grad_norm": 795735.8125,
"learning_rate": 2.84688995215311e-05,
"loss": 3.728,
"step": 16200
},
{
"epoch": 2.5996810207336525,
"grad_norm": 864533.875,
"learning_rate": 2.8335991493886233e-05,
"loss": 3.6478,
"step": 16300
},
{
"epoch": 2.6156299840510364,
"grad_norm": 835745.0,
"learning_rate": 2.8203083466241366e-05,
"loss": 3.6204,
"step": 16400
},
{
"epoch": 2.6315789473684212,
"grad_norm": 726415.3125,
"learning_rate": 2.8070175438596492e-05,
"loss": 3.6899,
"step": 16500
},
{
"epoch": 2.647527910685805,
"grad_norm": 619188.1875,
"learning_rate": 2.793726741095162e-05,
"loss": 3.6442,
"step": 16600
},
{
"epoch": 2.66347687400319,
"grad_norm": 704278.1875,
"learning_rate": 2.7804359383306755e-05,
"loss": 3.7113,
"step": 16700
},
{
"epoch": 2.679425837320574,
"grad_norm": 787753.0,
"learning_rate": 2.767145135566188e-05,
"loss": 3.7368,
"step": 16800
},
{
"epoch": 2.6953748006379588,
"grad_norm": 738743.5625,
"learning_rate": 2.7538543328017014e-05,
"loss": 3.7575,
"step": 16900
},
{
"epoch": 2.7113237639553427,
"grad_norm": 797812.75,
"learning_rate": 2.7405635300372147e-05,
"loss": 3.6586,
"step": 17000
},
{
"epoch": 2.7272727272727275,
"grad_norm": 662883.0,
"learning_rate": 2.7272727272727273e-05,
"loss": 3.6812,
"step": 17100
},
{
"epoch": 2.7432216905901115,
"grad_norm": 708693.8125,
"learning_rate": 2.7139819245082403e-05,
"loss": 3.6645,
"step": 17200
},
{
"epoch": 2.7591706539074963,
"grad_norm": 750630.625,
"learning_rate": 2.7006911217437536e-05,
"loss": 3.659,
"step": 17300
},
{
"epoch": 2.77511961722488,
"grad_norm": 743643.5625,
"learning_rate": 2.6874003189792662e-05,
"loss": 3.6794,
"step": 17400
},
{
"epoch": 2.7910685805422646,
"grad_norm": 683095.9375,
"learning_rate": 2.6741095162147795e-05,
"loss": 3.6587,
"step": 17500
},
{
"epoch": 2.807017543859649,
"grad_norm": 659289.8125,
"learning_rate": 2.6608187134502928e-05,
"loss": 3.7067,
"step": 17600
},
{
"epoch": 2.8229665071770333,
"grad_norm": 722875.9375,
"learning_rate": 2.6475279106858054e-05,
"loss": 3.6918,
"step": 17700
},
{
"epoch": 2.8389154704944177,
"grad_norm": 643060.4375,
"learning_rate": 2.6342371079213184e-05,
"loss": 3.694,
"step": 17800
},
{
"epoch": 2.854864433811802,
"grad_norm": 811117.3125,
"learning_rate": 2.6209463051568317e-05,
"loss": 3.703,
"step": 17900
},
{
"epoch": 2.8708133971291865,
"grad_norm": 835076.6875,
"learning_rate": 2.6076555023923443e-05,
"loss": 3.6346,
"step": 18000
},
{
"epoch": 2.886762360446571,
"grad_norm": 800111.875,
"learning_rate": 2.5943646996278576e-05,
"loss": 3.6589,
"step": 18100
},
{
"epoch": 2.9027113237639552,
"grad_norm": 726367.5,
"learning_rate": 2.5810738968633706e-05,
"loss": 3.6522,
"step": 18200
},
{
"epoch": 2.9186602870813396,
"grad_norm": 717922.25,
"learning_rate": 2.567783094098884e-05,
"loss": 3.6592,
"step": 18300
},
{
"epoch": 2.934609250398724,
"grad_norm": 841302.0625,
"learning_rate": 2.5544922913343965e-05,
"loss": 3.688,
"step": 18400
},
{
"epoch": 2.9505582137161084,
"grad_norm": 697304.1875,
"learning_rate": 2.5412014885699098e-05,
"loss": 3.6017,
"step": 18500
},
{
"epoch": 2.9665071770334928,
"grad_norm": 929050.25,
"learning_rate": 2.527910685805423e-05,
"loss": 3.6116,
"step": 18600
},
{
"epoch": 2.982456140350877,
"grad_norm": 699048.9375,
"learning_rate": 2.5146198830409358e-05,
"loss": 3.6474,
"step": 18700
},
{
"epoch": 2.9984051036682615,
"grad_norm": 783686.0625,
"learning_rate": 2.5013290802764487e-05,
"loss": 3.6783,
"step": 18800
},
{
"epoch": 3.0,
"eval_loss": 3.706315755844116,
"eval_runtime": 212.8533,
"eval_samples_per_second": 134.473,
"eval_steps_per_second": 4.205,
"step": 18810
},
{
"epoch": 3.014354066985646,
"grad_norm": 709059.3125,
"learning_rate": 2.4880382775119617e-05,
"loss": 3.5423,
"step": 18900
},
{
"epoch": 3.0303030303030303,
"grad_norm": 655684.75,
"learning_rate": 2.474747474747475e-05,
"loss": 3.5606,
"step": 19000
},
{
"epoch": 3.0462519936204147,
"grad_norm": 759915.1875,
"learning_rate": 2.461456671982988e-05,
"loss": 3.6979,
"step": 19100
},
{
"epoch": 3.062200956937799,
"grad_norm": 835860.375,
"learning_rate": 2.448165869218501e-05,
"loss": 3.6324,
"step": 19200
},
{
"epoch": 3.0781499202551834,
"grad_norm": 675779.5625,
"learning_rate": 2.434875066454014e-05,
"loss": 3.5773,
"step": 19300
},
{
"epoch": 3.094098883572568,
"grad_norm": 700443.875,
"learning_rate": 2.421584263689527e-05,
"loss": 3.6309,
"step": 19400
},
{
"epoch": 3.110047846889952,
"grad_norm": 768801.6875,
"learning_rate": 2.4082934609250398e-05,
"loss": 3.639,
"step": 19500
},
{
"epoch": 3.1259968102073366,
"grad_norm": 705109.6875,
"learning_rate": 2.395002658160553e-05,
"loss": 3.6365,
"step": 19600
},
{
"epoch": 3.141945773524721,
"grad_norm": 661887.5,
"learning_rate": 2.381711855396066e-05,
"loss": 3.5754,
"step": 19700
},
{
"epoch": 3.1578947368421053,
"grad_norm": 691131.0,
"learning_rate": 2.368421052631579e-05,
"loss": 3.5888,
"step": 19800
},
{
"epoch": 3.1738437001594897,
"grad_norm": 738841.625,
"learning_rate": 2.355130249867092e-05,
"loss": 3.6433,
"step": 19900
},
{
"epoch": 3.189792663476874,
"grad_norm": 663372.1875,
"learning_rate": 2.341839447102605e-05,
"loss": 3.5642,
"step": 20000
},
{
"epoch": 3.2057416267942584,
"grad_norm": 619548.75,
"learning_rate": 2.3285486443381183e-05,
"loss": 3.5864,
"step": 20100
},
{
"epoch": 3.221690590111643,
"grad_norm": 773919.6875,
"learning_rate": 2.3152578415736312e-05,
"loss": 3.6072,
"step": 20200
},
{
"epoch": 3.237639553429027,
"grad_norm": 728946.0625,
"learning_rate": 2.3019670388091442e-05,
"loss": 3.6212,
"step": 20300
},
{
"epoch": 3.2535885167464116,
"grad_norm": 659326.9375,
"learning_rate": 2.288676236044657e-05,
"loss": 3.6068,
"step": 20400
},
{
"epoch": 3.269537480063796,
"grad_norm": 759802.25,
"learning_rate": 2.27538543328017e-05,
"loss": 3.6783,
"step": 20500
},
{
"epoch": 3.2854864433811803,
"grad_norm": 805141.9375,
"learning_rate": 2.262094630515683e-05,
"loss": 3.6441,
"step": 20600
},
{
"epoch": 3.3014354066985647,
"grad_norm": 716521.875,
"learning_rate": 2.2488038277511964e-05,
"loss": 3.5748,
"step": 20700
},
{
"epoch": 3.317384370015949,
"grad_norm": 920115.4375,
"learning_rate": 2.2355130249867094e-05,
"loss": 3.5985,
"step": 20800
},
{
"epoch": 3.3333333333333335,
"grad_norm": 838658.0625,
"learning_rate": 2.2222222222222223e-05,
"loss": 3.564,
"step": 20900
},
{
"epoch": 3.349282296650718,
"grad_norm": 838474.9375,
"learning_rate": 2.2089314194577353e-05,
"loss": 3.5875,
"step": 21000
},
{
"epoch": 3.3652312599681022,
"grad_norm": 663992.25,
"learning_rate": 2.1956406166932483e-05,
"loss": 3.5681,
"step": 21100
},
{
"epoch": 3.3811802232854866,
"grad_norm": 679489.9375,
"learning_rate": 2.1823498139287616e-05,
"loss": 3.5276,
"step": 21200
},
{
"epoch": 3.397129186602871,
"grad_norm": 721495.6875,
"learning_rate": 2.1690590111642745e-05,
"loss": 3.5599,
"step": 21300
},
{
"epoch": 3.4130781499202554,
"grad_norm": 667400.9375,
"learning_rate": 2.1557682083997875e-05,
"loss": 3.5554,
"step": 21400
},
{
"epoch": 3.4290271132376393,
"grad_norm": 706122.0,
"learning_rate": 2.1424774056353005e-05,
"loss": 3.6543,
"step": 21500
},
{
"epoch": 3.444976076555024,
"grad_norm": 709513.125,
"learning_rate": 2.1291866028708134e-05,
"loss": 3.6568,
"step": 21600
},
{
"epoch": 3.460925039872408,
"grad_norm": 652669.375,
"learning_rate": 2.1158958001063264e-05,
"loss": 3.6423,
"step": 21700
},
{
"epoch": 3.476874003189793,
"grad_norm": 713371.625,
"learning_rate": 2.1026049973418397e-05,
"loss": 3.6102,
"step": 21800
},
{
"epoch": 3.492822966507177,
"grad_norm": 694536.875,
"learning_rate": 2.0893141945773527e-05,
"loss": 3.5488,
"step": 21900
},
{
"epoch": 3.5087719298245617,
"grad_norm": 668059.9375,
"learning_rate": 2.0760233918128656e-05,
"loss": 3.6078,
"step": 22000
},
{
"epoch": 3.5247208931419456,
"grad_norm": 731925.875,
"learning_rate": 2.0627325890483786e-05,
"loss": 3.5455,
"step": 22100
},
{
"epoch": 3.5406698564593304,
"grad_norm": 766777.9375,
"learning_rate": 2.0494417862838915e-05,
"loss": 3.5582,
"step": 22200
},
{
"epoch": 3.5566188197767143,
"grad_norm": 755381.1875,
"learning_rate": 2.0361509835194045e-05,
"loss": 3.5859,
"step": 22300
},
{
"epoch": 3.5725677830940987,
"grad_norm": 749873.6875,
"learning_rate": 2.0228601807549178e-05,
"loss": 3.7161,
"step": 22400
},
{
"epoch": 3.588516746411483,
"grad_norm": 677539.125,
"learning_rate": 2.0095693779904308e-05,
"loss": 3.6225,
"step": 22500
},
{
"epoch": 3.6044657097288675,
"grad_norm": 744990.8125,
"learning_rate": 1.9962785752259437e-05,
"loss": 3.5935,
"step": 22600
},
{
"epoch": 3.620414673046252,
"grad_norm": 629363.0,
"learning_rate": 1.9829877724614567e-05,
"loss": 3.5828,
"step": 22700
},
{
"epoch": 3.6363636363636362,
"grad_norm": 685327.4375,
"learning_rate": 1.9696969696969697e-05,
"loss": 3.5937,
"step": 22800
},
{
"epoch": 3.6523125996810206,
"grad_norm": 760474.25,
"learning_rate": 1.956406166932483e-05,
"loss": 3.5799,
"step": 22900
},
{
"epoch": 3.668261562998405,
"grad_norm": 725888.9375,
"learning_rate": 1.943115364167996e-05,
"loss": 3.5465,
"step": 23000
},
{
"epoch": 3.6842105263157894,
"grad_norm": 752312.0625,
"learning_rate": 1.929824561403509e-05,
"loss": 3.6185,
"step": 23100
},
{
"epoch": 3.7001594896331738,
"grad_norm": 855498.5625,
"learning_rate": 1.916533758639022e-05,
"loss": 3.5792,
"step": 23200
},
{
"epoch": 3.716108452950558,
"grad_norm": 679264.5,
"learning_rate": 1.9032429558745348e-05,
"loss": 3.5531,
"step": 23300
},
{
"epoch": 3.7320574162679425,
"grad_norm": 688482.875,
"learning_rate": 1.8899521531100478e-05,
"loss": 3.5495,
"step": 23400
},
{
"epoch": 3.748006379585327,
"grad_norm": 756693.75,
"learning_rate": 1.876661350345561e-05,
"loss": 3.567,
"step": 23500
},
{
"epoch": 3.7639553429027113,
"grad_norm": 854033.0625,
"learning_rate": 1.863370547581074e-05,
"loss": 3.5611,
"step": 23600
},
{
"epoch": 3.7799043062200957,
"grad_norm": 753585.0625,
"learning_rate": 1.850079744816587e-05,
"loss": 3.5439,
"step": 23700
},
{
"epoch": 3.79585326953748,
"grad_norm": 685358.25,
"learning_rate": 1.8367889420521e-05,
"loss": 3.535,
"step": 23800
},
{
"epoch": 3.8118022328548644,
"grad_norm": 813028.4375,
"learning_rate": 1.823498139287613e-05,
"loss": 3.566,
"step": 23900
},
{
"epoch": 3.827751196172249,
"grad_norm": 676295.9375,
"learning_rate": 1.8102073365231263e-05,
"loss": 3.5208,
"step": 24000
},
{
"epoch": 3.843700159489633,
"grad_norm": 705614.1875,
"learning_rate": 1.7969165337586392e-05,
"loss": 3.5108,
"step": 24100
},
{
"epoch": 3.8596491228070176,
"grad_norm": 643356.125,
"learning_rate": 1.7836257309941522e-05,
"loss": 3.5416,
"step": 24200
},
{
"epoch": 3.875598086124402,
"grad_norm": 783635.625,
"learning_rate": 1.770334928229665e-05,
"loss": 3.6022,
"step": 24300
},
{
"epoch": 3.8915470494417863,
"grad_norm": 829787.5,
"learning_rate": 1.757044125465178e-05,
"loss": 3.6056,
"step": 24400
},
{
"epoch": 3.9074960127591707,
"grad_norm": 776633.0625,
"learning_rate": 1.743753322700691e-05,
"loss": 3.5857,
"step": 24500
},
{
"epoch": 3.923444976076555,
"grad_norm": 708855.1875,
"learning_rate": 1.7304625199362044e-05,
"loss": 3.6086,
"step": 24600
},
{
"epoch": 3.9393939393939394,
"grad_norm": 906587.0,
"learning_rate": 1.7171717171717173e-05,
"loss": 3.5361,
"step": 24700
},
{
"epoch": 3.955342902711324,
"grad_norm": 835950.4375,
"learning_rate": 1.7038809144072303e-05,
"loss": 3.5976,
"step": 24800
},
{
"epoch": 3.971291866028708,
"grad_norm": 816570.6875,
"learning_rate": 1.6905901116427433e-05,
"loss": 3.596,
"step": 24900
},
{
"epoch": 3.9872408293460926,
"grad_norm": 751378.9375,
"learning_rate": 1.6772993088782562e-05,
"loss": 3.4856,
"step": 25000
},
{
"epoch": 4.0,
"eval_loss": 3.646216869354248,
"eval_runtime": 212.927,
"eval_samples_per_second": 134.426,
"eval_steps_per_second": 4.203,
"step": 25080
},
{
"epoch": 4.003189792663477,
"grad_norm": 735559.375,
"learning_rate": 1.6640085061137695e-05,
"loss": 3.5286,
"step": 25100
},
{
"epoch": 4.019138755980861,
"grad_norm": 741852.3125,
"learning_rate": 1.6507177033492825e-05,
"loss": 3.5367,
"step": 25200
},
{
"epoch": 4.035087719298246,
"grad_norm": 650216.625,
"learning_rate": 1.6374269005847955e-05,
"loss": 3.5056,
"step": 25300
},
{
"epoch": 4.05103668261563,
"grad_norm": 717938.9375,
"learning_rate": 1.6241360978203084e-05,
"loss": 3.4996,
"step": 25400
},
{
"epoch": 4.0669856459330145,
"grad_norm": 840426.9375,
"learning_rate": 1.6108452950558214e-05,
"loss": 3.5293,
"step": 25500
},
{
"epoch": 4.082934609250398,
"grad_norm": 699668.5,
"learning_rate": 1.5975544922913344e-05,
"loss": 3.5122,
"step": 25600
},
{
"epoch": 4.098883572567783,
"grad_norm": 649238.8125,
"learning_rate": 1.5842636895268477e-05,
"loss": 3.4949,
"step": 25700
},
{
"epoch": 4.114832535885167,
"grad_norm": 669479.3125,
"learning_rate": 1.5709728867623606e-05,
"loss": 3.5515,
"step": 25800
},
{
"epoch": 4.130781499202552,
"grad_norm": 645182.625,
"learning_rate": 1.5576820839978733e-05,
"loss": 3.5113,
"step": 25900
},
{
"epoch": 4.146730462519936,
"grad_norm": 913136.6875,
"learning_rate": 1.5443912812333866e-05,
"loss": 3.5326,
"step": 26000
},
{
"epoch": 4.162679425837321,
"grad_norm": 641233.3125,
"learning_rate": 1.5311004784688995e-05,
"loss": 3.4768,
"step": 26100
},
{
"epoch": 4.178628389154705,
"grad_norm": 798713.5625,
"learning_rate": 1.5178096757044127e-05,
"loss": 3.4977,
"step": 26200
},
{
"epoch": 4.1945773524720895,
"grad_norm": 719145.9375,
"learning_rate": 1.5045188729399256e-05,
"loss": 3.4549,
"step": 26300
},
{
"epoch": 4.2105263157894735,
"grad_norm": 692601.9375,
"learning_rate": 1.4912280701754386e-05,
"loss": 3.5228,
"step": 26400
},
{
"epoch": 4.226475279106858,
"grad_norm": 631408.4375,
"learning_rate": 1.4779372674109517e-05,
"loss": 3.4742,
"step": 26500
},
{
"epoch": 4.242424242424242,
"grad_norm": 733137.0,
"learning_rate": 1.4646464646464647e-05,
"loss": 3.4242,
"step": 26600
},
{
"epoch": 4.258373205741627,
"grad_norm": 741812.9375,
"learning_rate": 1.4513556618819777e-05,
"loss": 3.4598,
"step": 26700
},
{
"epoch": 4.274322169059011,
"grad_norm": 682720.125,
"learning_rate": 1.4380648591174908e-05,
"loss": 3.4827,
"step": 26800
},
{
"epoch": 4.290271132376396,
"grad_norm": 784646.125,
"learning_rate": 1.4247740563530037e-05,
"loss": 3.4712,
"step": 26900
},
{
"epoch": 4.30622009569378,
"grad_norm": 713282.75,
"learning_rate": 1.4114832535885167e-05,
"loss": 3.5542,
"step": 27000
},
{
"epoch": 4.3221690590111645,
"grad_norm": 699440.4375,
"learning_rate": 1.3981924508240298e-05,
"loss": 3.4572,
"step": 27100
},
{
"epoch": 4.3381180223285485,
"grad_norm": 733063.875,
"learning_rate": 1.3849016480595428e-05,
"loss": 3.4106,
"step": 27200
},
{
"epoch": 4.354066985645933,
"grad_norm": 799606.875,
"learning_rate": 1.371610845295056e-05,
"loss": 3.4189,
"step": 27300
},
{
"epoch": 4.370015948963317,
"grad_norm": 722583.25,
"learning_rate": 1.3583200425305689e-05,
"loss": 3.448,
"step": 27400
},
{
"epoch": 4.385964912280702,
"grad_norm": 761491.5625,
"learning_rate": 1.3450292397660819e-05,
"loss": 3.5248,
"step": 27500
},
{
"epoch": 4.401913875598086,
"grad_norm": 633397.6875,
"learning_rate": 1.331738437001595e-05,
"loss": 3.5675,
"step": 27600
},
{
"epoch": 4.417862838915471,
"grad_norm": 743160.4375,
"learning_rate": 1.318447634237108e-05,
"loss": 3.4983,
"step": 27700
},
{
"epoch": 4.433811802232855,
"grad_norm": 689363.3125,
"learning_rate": 1.305156831472621e-05,
"loss": 3.5689,
"step": 27800
},
{
"epoch": 4.44976076555024,
"grad_norm": 634674.0,
"learning_rate": 1.291866028708134e-05,
"loss": 3.4763,
"step": 27900
},
{
"epoch": 4.4657097288676235,
"grad_norm": 682868.5,
"learning_rate": 1.278575225943647e-05,
"loss": 3.4725,
"step": 28000
},
{
"epoch": 4.481658692185008,
"grad_norm": 839863.0,
"learning_rate": 1.26528442317916e-05,
"loss": 3.5255,
"step": 28100
},
{
"epoch": 4.497607655502392,
"grad_norm": 840497.0625,
"learning_rate": 1.2519936204146731e-05,
"loss": 3.4914,
"step": 28200
},
{
"epoch": 4.513556618819777,
"grad_norm": 798480.8125,
"learning_rate": 1.2387028176501861e-05,
"loss": 3.4787,
"step": 28300
},
{
"epoch": 4.529505582137161,
"grad_norm": 776783.625,
"learning_rate": 1.2254120148856992e-05,
"loss": 3.4817,
"step": 28400
},
{
"epoch": 4.545454545454545,
"grad_norm": 860058.625,
"learning_rate": 1.2121212121212122e-05,
"loss": 3.515,
"step": 28500
},
{
"epoch": 4.56140350877193,
"grad_norm": 797709.0,
"learning_rate": 1.1988304093567252e-05,
"loss": 3.4715,
"step": 28600
},
{
"epoch": 4.577352472089315,
"grad_norm": 686232.0,
"learning_rate": 1.1855396065922381e-05,
"loss": 3.493,
"step": 28700
},
{
"epoch": 4.5933014354066986,
"grad_norm": 713671.875,
"learning_rate": 1.1722488038277513e-05,
"loss": 3.4767,
"step": 28800
},
{
"epoch": 4.6092503987240825,
"grad_norm": 830858.875,
"learning_rate": 1.1589580010632644e-05,
"loss": 3.5506,
"step": 28900
},
{
"epoch": 4.625199362041467,
"grad_norm": 681684.8125,
"learning_rate": 1.1456671982987772e-05,
"loss": 3.468,
"step": 29000
},
{
"epoch": 4.641148325358852,
"grad_norm": 693863.25,
"learning_rate": 1.1323763955342903e-05,
"loss": 3.4847,
"step": 29100
},
{
"epoch": 4.657097288676236,
"grad_norm": 612233.75,
"learning_rate": 1.1190855927698035e-05,
"loss": 3.3994,
"step": 29200
},
{
"epoch": 4.67304625199362,
"grad_norm": 901251.25,
"learning_rate": 1.1057947900053164e-05,
"loss": 3.5144,
"step": 29300
},
{
"epoch": 4.688995215311005,
"grad_norm": 742618.4375,
"learning_rate": 1.0925039872408294e-05,
"loss": 3.5388,
"step": 29400
},
{
"epoch": 4.70494417862839,
"grad_norm": 797654.0,
"learning_rate": 1.0792131844763423e-05,
"loss": 3.4722,
"step": 29500
},
{
"epoch": 4.720893141945774,
"grad_norm": 763737.9375,
"learning_rate": 1.0659223817118555e-05,
"loss": 3.5006,
"step": 29600
},
{
"epoch": 4.7368421052631575,
"grad_norm": 702353.875,
"learning_rate": 1.0526315789473684e-05,
"loss": 3.4772,
"step": 29700
},
{
"epoch": 4.752791068580542,
"grad_norm": 790482.1875,
"learning_rate": 1.0393407761828814e-05,
"loss": 3.4507,
"step": 29800
},
{
"epoch": 4.768740031897926,
"grad_norm": 706455.9375,
"learning_rate": 1.0260499734183945e-05,
"loss": 3.4769,
"step": 29900
},
{
"epoch": 4.784688995215311,
"grad_norm": 720554.3125,
"learning_rate": 1.0127591706539077e-05,
"loss": 3.4551,
"step": 30000
},
{
"epoch": 4.800637958532695,
"grad_norm": 754827.6875,
"learning_rate": 9.994683678894205e-06,
"loss": 3.4982,
"step": 30100
},
{
"epoch": 4.81658692185008,
"grad_norm": 696089.375,
"learning_rate": 9.861775651249336e-06,
"loss": 3.4637,
"step": 30200
},
{
"epoch": 4.832535885167464,
"grad_norm": 737095.3125,
"learning_rate": 9.728867623604466e-06,
"loss": 3.4621,
"step": 30300
},
{
"epoch": 4.848484848484849,
"grad_norm": 768262.125,
"learning_rate": 9.595959595959595e-06,
"loss": 3.48,
"step": 30400
},
{
"epoch": 4.8644338118022326,
"grad_norm": 767420.5625,
"learning_rate": 9.463051568314727e-06,
"loss": 3.4463,
"step": 30500
},
{
"epoch": 4.880382775119617,
"grad_norm": 706310.0,
"learning_rate": 9.330143540669856e-06,
"loss": 3.4668,
"step": 30600
},
{
"epoch": 4.896331738437001,
"grad_norm": 828940.625,
"learning_rate": 9.197235513024988e-06,
"loss": 3.4377,
"step": 30700
},
{
"epoch": 4.912280701754386,
"grad_norm": 788075.375,
"learning_rate": 9.064327485380117e-06,
"loss": 3.4144,
"step": 30800
},
{
"epoch": 4.92822966507177,
"grad_norm": 805364.0625,
"learning_rate": 8.931419457735247e-06,
"loss": 3.4726,
"step": 30900
},
{
"epoch": 4.944178628389155,
"grad_norm": 604472.75,
"learning_rate": 8.798511430090378e-06,
"loss": 3.4936,
"step": 31000
},
{
"epoch": 4.960127591706539,
"grad_norm": 684835.3125,
"learning_rate": 8.66560340244551e-06,
"loss": 3.4637,
"step": 31100
},
{
"epoch": 4.976076555023924,
"grad_norm": 684998.5,
"learning_rate": 8.532695374800638e-06,
"loss": 3.5321,
"step": 31200
},
{
"epoch": 4.992025518341308,
"grad_norm": 711871.5625,
"learning_rate": 8.399787347155769e-06,
"loss": 3.4809,
"step": 31300
},
{
"epoch": 5.0,
"eval_loss": 3.6019129753112793,
"eval_runtime": 212.6906,
"eval_samples_per_second": 134.576,
"eval_steps_per_second": 4.208,
"step": 31350
},
{
"epoch": 5.007974481658692,
"grad_norm": 1041463.125,
"learning_rate": 8.266879319510899e-06,
"loss": 3.4055,
"step": 31400
},
{
"epoch": 5.023923444976076,
"grad_norm": 623676.3125,
"learning_rate": 8.133971291866028e-06,
"loss": 3.5315,
"step": 31500
},
{
"epoch": 5.039872408293461,
"grad_norm": 619523.9375,
"learning_rate": 8.00106326422116e-06,
"loss": 3.4709,
"step": 31600
},
{
"epoch": 5.055821371610845,
"grad_norm": 779379.875,
"learning_rate": 7.86815523657629e-06,
"loss": 3.3933,
"step": 31700
},
{
"epoch": 5.07177033492823,
"grad_norm": 617667.5625,
"learning_rate": 7.73524720893142e-06,
"loss": 3.4738,
"step": 31800
},
{
"epoch": 5.087719298245614,
"grad_norm": 691351.5625,
"learning_rate": 7.602339181286549e-06,
"loss": 3.4499,
"step": 31900
},
{
"epoch": 5.103668261562999,
"grad_norm": 702913.25,
"learning_rate": 7.469431153641681e-06,
"loss": 3.4476,
"step": 32000
},
{
"epoch": 5.119617224880383,
"grad_norm": 778967.4375,
"learning_rate": 7.336523125996811e-06,
"loss": 3.4689,
"step": 32100
},
{
"epoch": 5.1355661881977674,
"grad_norm": 693687.3125,
"learning_rate": 7.20361509835194e-06,
"loss": 3.381,
"step": 32200
},
{
"epoch": 5.151515151515151,
"grad_norm": 843375.5,
"learning_rate": 7.0707070707070704e-06,
"loss": 3.438,
"step": 32300
},
{
"epoch": 5.167464114832536,
"grad_norm": 734730.5,
"learning_rate": 6.937799043062202e-06,
"loss": 3.4564,
"step": 32400
},
{
"epoch": 5.18341307814992,
"grad_norm": 716158.375,
"learning_rate": 6.804891015417332e-06,
"loss": 3.4529,
"step": 32500
},
{
"epoch": 5.199362041467305,
"grad_norm": 703040.625,
"learning_rate": 6.671982987772461e-06,
"loss": 3.391,
"step": 32600
},
{
"epoch": 5.215311004784689,
"grad_norm": 636837.3125,
"learning_rate": 6.5390749601275915e-06,
"loss": 3.4174,
"step": 32700
},
{
"epoch": 5.231259968102074,
"grad_norm": 661893.6875,
"learning_rate": 6.406166932482723e-06,
"loss": 3.4256,
"step": 32800
},
{
"epoch": 5.247208931419458,
"grad_norm": 812641.0625,
"learning_rate": 6.273258904837853e-06,
"loss": 3.4981,
"step": 32900
},
{
"epoch": 5.2631578947368425,
"grad_norm": 725753.125,
"learning_rate": 6.140350877192982e-06,
"loss": 3.5018,
"step": 33000
},
{
"epoch": 5.279106858054226,
"grad_norm": 691768.3125,
"learning_rate": 6.007442849548113e-06,
"loss": 3.3969,
"step": 33100
},
{
"epoch": 5.295055821371611,
"grad_norm": 776131.9375,
"learning_rate": 5.874534821903243e-06,
"loss": 3.4489,
"step": 33200
},
{
"epoch": 5.311004784688995,
"grad_norm": 747268.25,
"learning_rate": 5.741626794258374e-06,
"loss": 3.4075,
"step": 33300
},
{
"epoch": 5.32695374800638,
"grad_norm": 864468.1875,
"learning_rate": 5.608718766613503e-06,
"loss": 3.4443,
"step": 33400
},
{
"epoch": 5.342902711323764,
"grad_norm": 729350.5,
"learning_rate": 5.475810738968634e-06,
"loss": 3.4189,
"step": 33500
},
{
"epoch": 5.358851674641148,
"grad_norm": 687491.0625,
"learning_rate": 5.342902711323764e-06,
"loss": 3.4167,
"step": 33600
},
{
"epoch": 5.374800637958533,
"grad_norm": 682089.125,
"learning_rate": 5.209994683678895e-06,
"loss": 3.3691,
"step": 33700
},
{
"epoch": 5.3907496012759175,
"grad_norm": 698959.25,
"learning_rate": 5.077086656034024e-06,
"loss": 3.4449,
"step": 33800
},
{
"epoch": 5.4066985645933014,
"grad_norm": 671919.375,
"learning_rate": 4.944178628389155e-06,
"loss": 3.4421,
"step": 33900
},
{
"epoch": 5.422647527910685,
"grad_norm": 647332.125,
"learning_rate": 4.811270600744285e-06,
"loss": 3.4529,
"step": 34000
},
{
"epoch": 5.43859649122807,
"grad_norm": 686242.25,
"learning_rate": 4.678362573099415e-06,
"loss": 3.3381,
"step": 34100
},
{
"epoch": 5.454545454545454,
"grad_norm": 936387.5,
"learning_rate": 4.5454545454545455e-06,
"loss": 3.4765,
"step": 34200
},
{
"epoch": 5.470494417862839,
"grad_norm": 662910.3125,
"learning_rate": 4.412546517809676e-06,
"loss": 3.3729,
"step": 34300
},
{
"epoch": 5.486443381180223,
"grad_norm": 671547.0,
"learning_rate": 4.2796384901648065e-06,
"loss": 3.4489,
"step": 34400
},
{
"epoch": 5.502392344497608,
"grad_norm": 777179.3125,
"learning_rate": 4.146730462519936e-06,
"loss": 3.424,
"step": 34500
},
{
"epoch": 5.518341307814992,
"grad_norm": 628742.25,
"learning_rate": 4.013822434875067e-06,
"loss": 3.4128,
"step": 34600
},
{
"epoch": 5.5342902711323765,
"grad_norm": 794101.75,
"learning_rate": 3.880914407230197e-06,
"loss": 3.3938,
"step": 34700
},
{
"epoch": 5.55023923444976,
"grad_norm": 720820.4375,
"learning_rate": 3.7480063795853268e-06,
"loss": 3.3622,
"step": 34800
},
{
"epoch": 5.566188197767145,
"grad_norm": 686912.5,
"learning_rate": 3.6150983519404573e-06,
"loss": 3.3896,
"step": 34900
},
{
"epoch": 5.582137161084529,
"grad_norm": 758477.0,
"learning_rate": 3.4821903242955873e-06,
"loss": 3.4494,
"step": 35000
},
{
"epoch": 5.598086124401914,
"grad_norm": 828910.875,
"learning_rate": 3.349282296650718e-06,
"loss": 3.4204,
"step": 35100
},
{
"epoch": 5.614035087719298,
"grad_norm": 650998.8125,
"learning_rate": 3.216374269005848e-06,
"loss": 3.4115,
"step": 35200
},
{
"epoch": 5.629984051036683,
"grad_norm": 681491.375,
"learning_rate": 3.0834662413609784e-06,
"loss": 3.3861,
"step": 35300
},
{
"epoch": 5.645933014354067,
"grad_norm": 749556.875,
"learning_rate": 2.9505582137161084e-06,
"loss": 3.4175,
"step": 35400
},
{
"epoch": 5.6618819776714515,
"grad_norm": 677371.6875,
"learning_rate": 2.817650186071239e-06,
"loss": 3.4571,
"step": 35500
},
{
"epoch": 5.6778309409888355,
"grad_norm": 730101.5625,
"learning_rate": 2.684742158426369e-06,
"loss": 3.3966,
"step": 35600
},
{
"epoch": 5.69377990430622,
"grad_norm": 657353.375,
"learning_rate": 2.5518341307814995e-06,
"loss": 3.4639,
"step": 35700
},
{
"epoch": 5.709728867623604,
"grad_norm": 734038.0,
"learning_rate": 2.4189261031366296e-06,
"loss": 3.3982,
"step": 35800
},
{
"epoch": 5.725677830940989,
"grad_norm": 719495.625,
"learning_rate": 2.28601807549176e-06,
"loss": 3.4221,
"step": 35900
},
{
"epoch": 5.741626794258373,
"grad_norm": 686361.0625,
"learning_rate": 2.15311004784689e-06,
"loss": 3.4886,
"step": 36000
},
{
"epoch": 5.757575757575758,
"grad_norm": 804130.625,
"learning_rate": 2.0202020202020206e-06,
"loss": 3.4364,
"step": 36100
},
{
"epoch": 5.773524720893142,
"grad_norm": 713737.5625,
"learning_rate": 1.8872939925571505e-06,
"loss": 3.391,
"step": 36200
},
{
"epoch": 5.7894736842105265,
"grad_norm": 734503.125,
"learning_rate": 1.7543859649122807e-06,
"loss": 3.4291,
"step": 36300
},
{
"epoch": 5.8054226475279105,
"grad_norm": 673341.3125,
"learning_rate": 1.621477937267411e-06,
"loss": 3.4816,
"step": 36400
},
{
"epoch": 5.821371610845295,
"grad_norm": 643799.3125,
"learning_rate": 1.4885699096225413e-06,
"loss": 3.4221,
"step": 36500
},
{
"epoch": 5.837320574162679,
"grad_norm": 641417.125,
"learning_rate": 1.3556618819776716e-06,
"loss": 3.4331,
"step": 36600
},
{
"epoch": 5.853269537480064,
"grad_norm": 707790.25,
"learning_rate": 1.2227538543328019e-06,
"loss": 3.3936,
"step": 36700
},
{
"epoch": 5.869218500797448,
"grad_norm": 734012.3125,
"learning_rate": 1.089845826687932e-06,
"loss": 3.4258,
"step": 36800
},
{
"epoch": 5.885167464114833,
"grad_norm": 672716.0,
"learning_rate": 9.569377990430622e-07,
"loss": 3.4688,
"step": 36900
},
{
"epoch": 5.901116427432217,
"grad_norm": 691052.75,
"learning_rate": 8.240297713981925e-07,
"loss": 3.3553,
"step": 37000
},
{
"epoch": 5.917065390749602,
"grad_norm": 714107.875,
"learning_rate": 6.911217437533228e-07,
"loss": 3.4735,
"step": 37100
},
{
"epoch": 5.9330143540669855,
"grad_norm": 783142.9375,
"learning_rate": 5.582137161084529e-07,
"loss": 3.4707,
"step": 37200
},
{
"epoch": 5.94896331738437,
"grad_norm": 793443.25,
"learning_rate": 4.2530568846358327e-07,
"loss": 3.4279,
"step": 37300
},
{
"epoch": 5.964912280701754,
"grad_norm": 828328.5625,
"learning_rate": 2.9239766081871344e-07,
"loss": 3.4171,
"step": 37400
},
{
"epoch": 5.980861244019139,
"grad_norm": 826741.5625,
"learning_rate": 1.5948963317384372e-07,
"loss": 3.5064,
"step": 37500
},
{
"epoch": 5.996810207336523,
"grad_norm": 698366.375,
"learning_rate": 2.6581605528973954e-08,
"loss": 3.42,
"step": 37600
},
{
"epoch": 6.0,
"eval_loss": 3.615572929382324,
"eval_runtime": 213.1542,
"eval_samples_per_second": 134.283,
"eval_steps_per_second": 4.199,
"step": 37620
}
],
"logging_steps": 100,
"max_steps": 37620,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.65721373251031e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}