{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.997968855788761,
"eval_steps": 50,
"global_step": 1107,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013540961408259987,
"grad_norm": 18.463732975176427,
"learning_rate": 5e-07,
"loss": 1.749,
"step": 5
},
{
"epoch": 0.027081922816519974,
"grad_norm": 12.15126324106773,
"learning_rate": 1e-06,
"loss": 1.5921,
"step": 10
},
{
"epoch": 0.040622884224779957,
"grad_norm": 7.7718176367972,
"learning_rate": 9.9994874230328e-07,
"loss": 1.2948,
"step": 15
},
{
"epoch": 0.05416384563303995,
"grad_norm": 4.024003853982352,
"learning_rate": 9.997949797225268e-07,
"loss": 1.1393,
"step": 20
},
{
"epoch": 0.06770480704129993,
"grad_norm": 3.796718690402949,
"learning_rate": 9.995387437838025e-07,
"loss": 1.0604,
"step": 25
},
{
"epoch": 0.08124576844955991,
"grad_norm": 3.4511274356883295,
"learning_rate": 9.991800870233637e-07,
"loss": 1.0272,
"step": 30
},
{
"epoch": 0.0947867298578199,
"grad_norm": 3.6605756447735724,
"learning_rate": 9.98719082976888e-07,
"loss": 1.003,
"step": 35
},
{
"epoch": 0.1083276912660799,
"grad_norm": 3.5416553128618156,
"learning_rate": 9.981558261643982e-07,
"loss": 0.9719,
"step": 40
},
{
"epoch": 0.12186865267433988,
"grad_norm": 3.5000585696846245,
"learning_rate": 9.97490432070881e-07,
"loss": 0.9584,
"step": 45
},
{
"epoch": 0.13540961408259986,
"grad_norm": 3.805541453776684,
"learning_rate": 9.967230371226118e-07,
"loss": 0.9444,
"step": 50
},
{
"epoch": 0.13540961408259986,
"eval_loss": 0.9377400875091553,
"eval_runtime": 182.856,
"eval_samples_per_second": 57.422,
"eval_steps_per_second": 0.902,
"step": 50
},
{
"epoch": 0.14895057549085985,
"grad_norm": 3.6018765357986844,
"learning_rate": 9.958537986591803e-07,
"loss": 0.9198,
"step": 55
},
{
"epoch": 0.16249153689911983,
"grad_norm": 3.5280703732572545,
"learning_rate": 9.948828949012327e-07,
"loss": 0.9181,
"step": 60
},
{
"epoch": 0.17603249830737983,
"grad_norm": 3.6595010678642925,
"learning_rate": 9.938105249139305e-07,
"loss": 0.9296,
"step": 65
},
{
"epoch": 0.1895734597156398,
"grad_norm": 3.323630668058957,
"learning_rate": 9.92636908566136e-07,
"loss": 0.9241,
"step": 70
},
{
"epoch": 0.2031144211238998,
"grad_norm": 3.425849289666098,
"learning_rate": 9.913622864853324e-07,
"loss": 0.8917,
"step": 75
},
{
"epoch": 0.2166553825321598,
"grad_norm": 3.9967436509747025,
"learning_rate": 9.89986920008288e-07,
"loss": 0.8988,
"step": 80
},
{
"epoch": 0.23019634394041977,
"grad_norm": 3.436149937331634,
"learning_rate": 9.885110911274738e-07,
"loss": 0.8774,
"step": 85
},
{
"epoch": 0.24373730534867977,
"grad_norm": 3.642660003309155,
"learning_rate": 9.869351024332466e-07,
"loss": 0.8787,
"step": 90
},
{
"epoch": 0.25727826675693977,
"grad_norm": 3.3574313517064978,
"learning_rate": 9.852592770518082e-07,
"loss": 0.8897,
"step": 95
},
{
"epoch": 0.2708192281651997,
"grad_norm": 3.534867148811258,
"learning_rate": 9.834839585789557e-07,
"loss": 0.8668,
"step": 100
},
{
"epoch": 0.2708192281651997,
"eval_loss": 0.8815732002258301,
"eval_runtime": 182.989,
"eval_samples_per_second": 57.38,
"eval_steps_per_second": 0.902,
"step": 100
},
{
"epoch": 0.2843601895734597,
"grad_norm": 3.476334412992533,
"learning_rate": 9.816095110096324e-07,
"loss": 0.8806,
"step": 105
},
{
"epoch": 0.2979011509817197,
"grad_norm": 3.5218146742622856,
"learning_rate": 9.796363186632983e-07,
"loss": 0.8895,
"step": 110
},
{
"epoch": 0.3114421123899797,
"grad_norm": 3.6344919656503545,
"learning_rate": 9.775647861051328e-07,
"loss": 0.8675,
"step": 115
},
{
"epoch": 0.32498307379823965,
"grad_norm": 3.8058175800193537,
"learning_rate": 9.753953380630862e-07,
"loss": 0.8739,
"step": 120
},
{
"epoch": 0.33852403520649965,
"grad_norm": 3.5458347600694515,
"learning_rate": 9.731284193407981e-07,
"loss": 0.8536,
"step": 125
},
{
"epoch": 0.35206499661475965,
"grad_norm": 3.4919069279296138,
"learning_rate": 9.707644947263975e-07,
"loss": 0.8598,
"step": 130
},
{
"epoch": 0.36560595802301965,
"grad_norm": 3.2003162498092848,
"learning_rate": 9.683040488972086e-07,
"loss": 0.8628,
"step": 135
},
{
"epoch": 0.3791469194312796,
"grad_norm": 3.337658064243358,
"learning_rate": 9.657475863203756e-07,
"loss": 0.8633,
"step": 140
},
{
"epoch": 0.3926878808395396,
"grad_norm": 3.4844992093115987,
"learning_rate": 9.63095631149432e-07,
"loss": 0.859,
"step": 145
},
{
"epoch": 0.4062288422477996,
"grad_norm": 3.5999127416589296,
"learning_rate": 9.603487271168336e-07,
"loss": 0.8436,
"step": 150
},
{
"epoch": 0.4062288422477996,
"eval_loss": 0.8559273481369019,
"eval_runtime": 182.9208,
"eval_samples_per_second": 57.402,
"eval_steps_per_second": 0.902,
"step": 150
},
{
"epoch": 0.4197698036560596,
"grad_norm": 3.4543800855029807,
"learning_rate": 9.575074374224758e-07,
"loss": 0.8431,
"step": 155
},
{
"epoch": 0.4333107650643196,
"grad_norm": 3.574884162982998,
"learning_rate": 9.545723446182201e-07,
"loss": 0.8466,
"step": 160
},
{
"epoch": 0.44685172647257954,
"grad_norm": 3.230714522717894,
"learning_rate": 9.515440504884539e-07,
"loss": 0.829,
"step": 165
},
{
"epoch": 0.46039268788083954,
"grad_norm": 3.38192753716993,
"learning_rate": 9.484231759267054e-07,
"loss": 0.8462,
"step": 170
},
{
"epoch": 0.47393364928909953,
"grad_norm": 3.562118447176729,
"learning_rate": 9.452103608083417e-07,
"loss": 0.8602,
"step": 175
},
{
"epoch": 0.48747461069735953,
"grad_norm": 3.2476691940428353,
"learning_rate": 9.419062638593748e-07,
"loss": 0.8266,
"step": 180
},
{
"epoch": 0.5010155721056195,
"grad_norm": 3.494748883924626,
"learning_rate": 9.385115625214021e-07,
"loss": 0.8219,
"step": 185
},
{
"epoch": 0.5145565335138795,
"grad_norm": 3.4365818103766452,
"learning_rate": 9.350269528127101e-07,
"loss": 0.8438,
"step": 190
},
{
"epoch": 0.5280974949221394,
"grad_norm": 3.245299565361765,
"learning_rate": 9.31453149185569e-07,
"loss": 0.8337,
"step": 195
},
{
"epoch": 0.5416384563303994,
"grad_norm": 3.603292666759954,
"learning_rate": 9.277908843797492e-07,
"loss": 0.8192,
"step": 200
},
{
"epoch": 0.5416384563303994,
"eval_loss": 0.8383815884590149,
"eval_runtime": 183.0029,
"eval_samples_per_second": 57.376,
"eval_steps_per_second": 0.902,
"step": 200
},
{
"epoch": 0.5551794177386594,
"grad_norm": 3.3561089172228797,
"learning_rate": 9.240409092722852e-07,
"loss": 0.8483,
"step": 205
},
{
"epoch": 0.5687203791469194,
"grad_norm": 3.351512113878825,
"learning_rate": 9.20203992723524e-07,
"loss": 0.8109,
"step": 210
},
{
"epoch": 0.5822613405551794,
"grad_norm": 3.4763361691712293,
"learning_rate": 9.162809214194851e-07,
"loss": 0.8335,
"step": 215
},
{
"epoch": 0.5958023019634394,
"grad_norm": 3.292224110319897,
"learning_rate": 9.122724997105646e-07,
"loss": 0.8465,
"step": 220
},
{
"epoch": 0.6093432633716994,
"grad_norm": 3.566456464228769,
"learning_rate": 9.0817954944662e-07,
"loss": 0.8257,
"step": 225
},
{
"epoch": 0.6228842247799594,
"grad_norm": 3.6677580289310088,
"learning_rate": 9.040029098084643e-07,
"loss": 0.8479,
"step": 230
},
{
"epoch": 0.6364251861882194,
"grad_norm": 3.4606320291357546,
"learning_rate": 8.997434371358092e-07,
"loss": 0.8412,
"step": 235
},
{
"epoch": 0.6499661475964793,
"grad_norm": 3.2432058195015467,
"learning_rate": 8.954020047516884e-07,
"loss": 0.7984,
"step": 240
},
{
"epoch": 0.6635071090047393,
"grad_norm": 3.24817794736711,
"learning_rate": 8.909795027833996e-07,
"loss": 0.834,
"step": 245
},
{
"epoch": 0.6770480704129993,
"grad_norm": 3.653634156312455,
"learning_rate": 8.864768379800016e-07,
"loss": 0.8285,
"step": 250
},
{
"epoch": 0.6770480704129993,
"eval_loss": 0.8261091709136963,
"eval_runtime": 182.9038,
"eval_samples_per_second": 57.407,
"eval_steps_per_second": 0.902,
"step": 250
},
{
"epoch": 0.6905890318212593,
"grad_norm": 3.5255613554356384,
"learning_rate": 8.81894933526402e-07,
"loss": 0.8322,
"step": 255
},
{
"epoch": 0.7041299932295193,
"grad_norm": 3.727303124024427,
"learning_rate": 8.772347288540763e-07,
"loss": 0.8384,
"step": 260
},
{
"epoch": 0.7176709546377793,
"grad_norm": 3.731075409838065,
"learning_rate": 8.724971794484555e-07,
"loss": 0.8263,
"step": 265
},
{
"epoch": 0.7312119160460393,
"grad_norm": 3.6981830618689484,
"learning_rate": 8.676832566530221e-07,
"loss": 0.8165,
"step": 270
},
{
"epoch": 0.7447528774542993,
"grad_norm": 3.498794505216147,
"learning_rate": 8.62793947470155e-07,
"loss": 0.8037,
"step": 275
},
{
"epoch": 0.7582938388625592,
"grad_norm": 3.5635605377852024,
"learning_rate": 8.578302543587629e-07,
"loss": 0.7896,
"step": 280
},
{
"epoch": 0.7718348002708192,
"grad_norm": 3.63798214652303,
"learning_rate": 8.527931950287506e-07,
"loss": 0.8129,
"step": 285
},
{
"epoch": 0.7853757616790792,
"grad_norm": 3.518857236471562,
"learning_rate": 8.47683802232356e-07,
"loss": 0.8182,
"step": 290
},
{
"epoch": 0.7989167230873392,
"grad_norm": 3.5608373625732694,
"learning_rate": 8.425031235524045e-07,
"loss": 0.8256,
"step": 295
},
{
"epoch": 0.8124576844955992,
"grad_norm": 3.6755850332375286,
"learning_rate": 8.372522211875224e-07,
"loss": 0.8132,
"step": 300
},
{
"epoch": 0.8124576844955992,
"eval_loss": 0.8165345191955566,
"eval_runtime": 182.8879,
"eval_samples_per_second": 57.412,
"eval_steps_per_second": 0.902,
"step": 300
},
{
"epoch": 0.8259986459038592,
"grad_norm": 3.596321476724727,
"learning_rate": 8.319321717343535e-07,
"loss": 0.8149,
"step": 305
},
{
"epoch": 0.8395396073121192,
"grad_norm": 3.7225323914645747,
"learning_rate": 8.265440659668234e-07,
"loss": 0.8124,
"step": 310
},
{
"epoch": 0.8530805687203792,
"grad_norm": 3.5663300158516016,
"learning_rate": 8.210890086124977e-07,
"loss": 0.7906,
"step": 315
},
{
"epoch": 0.8666215301286392,
"grad_norm": 3.4256587260000604,
"learning_rate": 8.155681181260776e-07,
"loss": 0.81,
"step": 320
},
{
"epoch": 0.8801624915368991,
"grad_norm": 3.4929314527584077,
"learning_rate": 8.099825264600842e-07,
"loss": 0.818,
"step": 325
},
{
"epoch": 0.8937034529451591,
"grad_norm": 3.5559616159997742,
"learning_rate": 8.04333378832772e-07,
"loss": 0.8067,
"step": 330
},
{
"epoch": 0.9072444143534191,
"grad_norm": 3.3786924383796544,
"learning_rate": 7.98621833493324e-07,
"loss": 0.802,
"step": 335
},
{
"epoch": 0.9207853757616791,
"grad_norm": 3.6281675631205377,
"learning_rate": 7.928490614843757e-07,
"loss": 0.7991,
"step": 340
},
{
"epoch": 0.9343263371699391,
"grad_norm": 3.5783671775239068,
"learning_rate": 7.870162464019143e-07,
"loss": 0.7961,
"step": 345
},
{
"epoch": 0.9478672985781991,
"grad_norm": 3.523112182862875,
"learning_rate": 7.811245841526062e-07,
"loss": 0.7997,
"step": 350
},
{
"epoch": 0.9478672985781991,
"eval_loss": 0.8075853586196899,
"eval_runtime": 182.8371,
"eval_samples_per_second": 57.428,
"eval_steps_per_second": 0.902,
"step": 350
},
{
"epoch": 0.9614082599864591,
"grad_norm": 3.327823061584991,
"learning_rate": 7.75175282708598e-07,
"loss": 0.7955,
"step": 355
},
{
"epoch": 0.9749492213947191,
"grad_norm": 3.353254883691996,
"learning_rate": 7.691695618598466e-07,
"loss": 0.7945,
"step": 360
},
{
"epoch": 0.988490182802979,
"grad_norm": 3.4573365027199547,
"learning_rate": 7.631086529640229e-07,
"loss": 0.8037,
"step": 365
},
{
"epoch": 1.002031144211239,
"grad_norm": 3.704603819142361,
"learning_rate": 7.569937986940475e-07,
"loss": 0.7833,
"step": 370
},
{
"epoch": 1.015572105619499,
"grad_norm": 3.338765249405955,
"learning_rate": 7.508262527833028e-07,
"loss": 0.736,
"step": 375
},
{
"epoch": 1.029113067027759,
"grad_norm": 3.5005014577141154,
"learning_rate": 7.446072797685799e-07,
"loss": 0.7393,
"step": 380
},
{
"epoch": 1.042654028436019,
"grad_norm": 3.5415652623772753,
"learning_rate": 7.383381547308099e-07,
"loss": 0.7461,
"step": 385
},
{
"epoch": 1.0561949898442788,
"grad_norm": 3.430934641227424,
"learning_rate": 7.320201630336318e-07,
"loss": 0.7359,
"step": 390
},
{
"epoch": 1.0697359512525388,
"grad_norm": 3.6468766296195896,
"learning_rate": 7.256546000598551e-07,
"loss": 0.7306,
"step": 395
},
{
"epoch": 1.0832769126607988,
"grad_norm": 4.055674159253643,
"learning_rate": 7.192427709458655e-07,
"loss": 0.7335,
"step": 400
},
{
"epoch": 1.0832769126607988,
"eval_loss": 0.804972231388092,
"eval_runtime": 182.8412,
"eval_samples_per_second": 57.427,
"eval_steps_per_second": 0.902,
"step": 400
},
{
"epoch": 1.0968178740690588,
"grad_norm": 3.7131013971600244,
"learning_rate": 7.127859903140311e-07,
"loss": 0.7346,
"step": 405
},
{
"epoch": 1.1103588354773188,
"grad_norm": 3.496845579088122,
"learning_rate": 7.062855820031659e-07,
"loss": 0.7409,
"step": 410
},
{
"epoch": 1.1238997968855788,
"grad_norm": 3.652963743841258,
"learning_rate": 6.997428787971005e-07,
"loss": 0.7236,
"step": 415
},
{
"epoch": 1.1374407582938388,
"grad_norm": 3.6316016258989916,
"learning_rate": 6.93159222151422e-07,
"loss": 0.734,
"step": 420
},
{
"epoch": 1.1509817197020988,
"grad_norm": 4.106556416672479,
"learning_rate": 6.86535961918433e-07,
"loss": 0.7425,
"step": 425
},
{
"epoch": 1.1645226811103588,
"grad_norm": 3.4406596945092764,
"learning_rate": 6.798744560703904e-07,
"loss": 0.7271,
"step": 430
},
{
"epoch": 1.1780636425186188,
"grad_norm": 3.5644142212223886,
"learning_rate": 6.731760704210802e-07,
"loss": 0.729,
"step": 435
},
{
"epoch": 1.1916046039268788,
"grad_norm": 3.5207779341108316,
"learning_rate": 6.66442178345783e-07,
"loss": 0.7295,
"step": 440
},
{
"epoch": 1.2051455653351388,
"grad_norm": 3.8426763234364643,
"learning_rate": 6.596741604996897e-07,
"loss": 0.7285,
"step": 445
},
{
"epoch": 1.2186865267433988,
"grad_norm": 3.537416567969802,
"learning_rate": 6.528734045348248e-07,
"loss": 0.7466,
"step": 450
},
{
"epoch": 1.2186865267433988,
"eval_loss": 0.7996942400932312,
"eval_runtime": 182.8629,
"eval_samples_per_second": 57.42,
"eval_steps_per_second": 0.902,
"step": 450
},
{
"epoch": 1.2322274881516588,
"grad_norm": 3.5566566741869603,
"learning_rate": 6.460413048155354e-07,
"loss": 0.7291,
"step": 455
},
{
"epoch": 1.2457684495599188,
"grad_norm": 3.6983928953608323,
"learning_rate": 6.391792621326027e-07,
"loss": 0.7502,
"step": 460
},
{
"epoch": 1.2593094109681786,
"grad_norm": 3.679881191981186,
"learning_rate": 6.322886834160377e-07,
"loss": 0.7375,
"step": 465
},
{
"epoch": 1.2728503723764386,
"grad_norm": 3.635647992456833,
"learning_rate": 6.253709814466167e-07,
"loss": 0.7446,
"step": 470
},
{
"epoch": 1.2863913337846986,
"grad_norm": 3.7784368285832675,
"learning_rate": 6.184275745662179e-07,
"loss": 0.7307,
"step": 475
},
{
"epoch": 1.2999322951929586,
"grad_norm": 3.975716487359526,
"learning_rate": 6.114598863870178e-07,
"loss": 0.727,
"step": 480
},
{
"epoch": 1.3134732566012186,
"grad_norm": 3.8898112879763818,
"learning_rate": 6.044693454996059e-07,
"loss": 0.7351,
"step": 485
},
{
"epoch": 1.3270142180094786,
"grad_norm": 3.883701681898452,
"learning_rate": 5.974573851800817e-07,
"loss": 0.7376,
"step": 490
},
{
"epoch": 1.3405551794177386,
"grad_norm": 3.5463439014644695,
"learning_rate": 5.904254430961869e-07,
"loss": 0.7413,
"step": 495
},
{
"epoch": 1.3540961408259986,
"grad_norm": 3.5029550543033374,
"learning_rate": 5.833749610125401e-07,
"loss": 0.7264,
"step": 500
},
{
"epoch": 1.3540961408259986,
"eval_loss": 0.7957150340080261,
"eval_runtime": 182.9392,
"eval_samples_per_second": 57.396,
"eval_steps_per_second": 0.902,
"step": 500
},
{
"epoch": 1.3676371022342586,
"grad_norm": 3.9766897722099124,
"learning_rate": 5.763073844950309e-07,
"loss": 0.7327,
"step": 505
},
{
"epoch": 1.3811780636425186,
"grad_norm": 3.80954804006231,
"learning_rate": 5.69224162614434e-07,
"loss": 0.7443,
"step": 510
},
{
"epoch": 1.3947190250507786,
"grad_norm": 3.524183540063996,
"learning_rate": 5.621267476493052e-07,
"loss": 0.7345,
"step": 515
},
{
"epoch": 1.4082599864590386,
"grad_norm": 3.620522282934874,
"learning_rate": 5.550165947882196e-07,
"loss": 0.7236,
"step": 520
},
{
"epoch": 1.4218009478672986,
"grad_norm": 3.768208934721801,
"learning_rate": 5.478951618314132e-07,
"loss": 0.7165,
"step": 525
},
{
"epoch": 1.4353419092755586,
"grad_norm": 3.5182066154018345,
"learning_rate": 5.407639088918888e-07,
"loss": 0.7297,
"step": 530
},
{
"epoch": 1.4488828706838186,
"grad_norm": 3.824867160497226,
"learning_rate": 5.33624298096048e-07,
"loss": 0.7307,
"step": 535
},
{
"epoch": 1.4624238320920786,
"grad_norm": 3.488297423659722,
"learning_rate": 5.264777932839104e-07,
"loss": 0.7487,
"step": 540
},
{
"epoch": 1.4759647935003386,
"grad_norm": 3.493644177453455,
"learning_rate": 5.193258597089809e-07,
"loss": 0.7291,
"step": 545
},
{
"epoch": 1.4895057549085986,
"grad_norm": 3.6179715470055824,
"learning_rate": 5.121699637378282e-07,
"loss": 0.7286,
"step": 550
},
{
"epoch": 1.4895057549085986,
"eval_loss": 0.7910673022270203,
"eval_runtime": 182.8784,
"eval_samples_per_second": 57.415,
"eval_steps_per_second": 0.902,
"step": 550
},
{
"epoch": 1.5030467163168586,
"grad_norm": 3.5922430809215777,
"learning_rate": 5.050115725494339e-07,
"loss": 0.7179,
"step": 555
},
{
"epoch": 1.5165876777251186,
"grad_norm": 3.4032672519789786,
"learning_rate": 4.978521538343764e-07,
"loss": 0.7366,
"step": 560
},
{
"epoch": 1.5301286391333786,
"grad_norm": 3.6567954327498327,
"learning_rate": 4.906931754939083e-07,
"loss": 0.7391,
"step": 565
},
{
"epoch": 1.5436696005416386,
"grad_norm": 3.5644839916468243,
"learning_rate": 4.835361053389921e-07,
"loss": 0.7288,
"step": 570
},
{
"epoch": 1.5572105619498986,
"grad_norm": 3.650299513434273,
"learning_rate": 4.763824107893532e-07,
"loss": 0.729,
"step": 575
},
{
"epoch": 1.5707515233581584,
"grad_norm": 3.8055072261605116,
"learning_rate": 4.692335585726145e-07,
"loss": 0.7177,
"step": 580
},
{
"epoch": 1.5842924847664184,
"grad_norm": 3.5985047420048697,
"learning_rate": 4.6209101442357116e-07,
"loss": 0.7224,
"step": 585
},
{
"epoch": 1.5978334461746784,
"grad_norm": 3.547274876419971,
"learning_rate": 4.549562427836701e-07,
"loss": 0.7327,
"step": 590
},
{
"epoch": 1.6113744075829384,
"grad_norm": 3.511159114275773,
"learning_rate": 4.4783070650075537e-07,
"loss": 0.7406,
"step": 595
},
{
"epoch": 1.6249153689911984,
"grad_norm": 3.46127753253687,
"learning_rate": 4.407158665291376e-07,
"loss": 0.7251,
"step": 600
},
{
"epoch": 1.6249153689911984,
"eval_loss": 0.7875649333000183,
"eval_runtime": 182.8944,
"eval_samples_per_second": 57.41,
"eval_steps_per_second": 0.902,
"step": 600
},
{
"epoch": 1.6384563303994584,
"grad_norm": 3.409510769647259,
"learning_rate": 4.336131816300548e-07,
"loss": 0.719,
"step": 605
},
{
"epoch": 1.6519972918077184,
"grad_norm": 3.5447361212717925,
"learning_rate": 4.265241080725808e-07,
"loss": 0.7287,
"step": 610
},
{
"epoch": 1.6655382532159784,
"grad_norm": 4.059666587103208,
"learning_rate": 4.194500993350453e-07,
"loss": 0.7399,
"step": 615
},
{
"epoch": 1.6790792146242384,
"grad_norm": 3.774832103683009,
"learning_rate": 4.1239260580702634e-07,
"loss": 0.7386,
"step": 620
},
{
"epoch": 1.6926201760324981,
"grad_norm": 3.5490582421055885,
"learning_rate": 4.053530744919749e-07,
"loss": 0.7246,
"step": 625
},
{
"epoch": 1.7061611374407581,
"grad_norm": 3.747116763144729,
"learning_rate": 3.983329487105363e-07,
"loss": 0.7372,
"step": 630
},
{
"epoch": 1.7197020988490181,
"grad_norm": 3.4822384940575795,
"learning_rate": 3.913336678046232e-07,
"loss": 0.7323,
"step": 635
},
{
"epoch": 1.7332430602572781,
"grad_norm": 3.825022969793332,
"learning_rate": 3.8435666684230726e-07,
"loss": 0.7097,
"step": 640
},
{
"epoch": 1.7467840216655381,
"grad_norm": 3.696972226883697,
"learning_rate": 3.774033763235861e-07,
"loss": 0.7304,
"step": 645
},
{
"epoch": 1.7603249830737981,
"grad_norm": 3.821368374115622,
"learning_rate": 3.7047522188708606e-07,
"loss": 0.727,
"step": 650
},
{
"epoch": 1.7603249830737981,
"eval_loss": 0.7839689254760742,
"eval_runtime": 182.875,
"eval_samples_per_second": 57.416,
"eval_steps_per_second": 0.902,
"step": 650
},
{
"epoch": 1.7738659444820581,
"grad_norm": 3.665010892266409,
"learning_rate": 3.635736240177627e-07,
"loss": 0.7223,
"step": 655
},
{
"epoch": 1.7874069058903181,
"grad_norm": 3.8705676996293916,
"learning_rate": 3.5669999775565816e-07,
"loss": 0.7313,
"step": 660
},
{
"epoch": 1.8009478672985781,
"grad_norm": 3.5842111135660057,
"learning_rate": 3.4985575240577365e-07,
"loss": 0.7321,
"step": 665
},
{
"epoch": 1.8144888287068381,
"grad_norm": 3.4424140077908767,
"learning_rate": 3.4304229124911856e-07,
"loss": 0.7316,
"step": 670
},
{
"epoch": 1.8280297901150981,
"grad_norm": 3.4068267304866646,
"learning_rate": 3.362610112549955e-07,
"loss": 0.704,
"step": 675
},
{
"epoch": 1.8415707515233581,
"grad_norm": 3.685081317112416,
"learning_rate": 3.295133027945778e-07,
"loss": 0.7167,
"step": 680
},
{
"epoch": 1.8551117129316181,
"grad_norm": 3.7176624860947345,
"learning_rate": 3.228005493558402e-07,
"loss": 0.7094,
"step": 685
},
{
"epoch": 1.8686526743398781,
"grad_norm": 3.718001701556429,
"learning_rate": 3.1612412725990305e-07,
"loss": 0.7312,
"step": 690
},
{
"epoch": 1.8821936357481381,
"grad_norm": 3.491455379714816,
"learning_rate": 3.0948540537884185e-07,
"loss": 0.7264,
"step": 695
},
{
"epoch": 1.8957345971563981,
"grad_norm": 3.7382317800607376,
"learning_rate": 3.0288574485502756e-07,
"loss": 0.7277,
"step": 700
},
{
"epoch": 1.8957345971563981,
"eval_loss": 0.7811039090156555,
"eval_runtime": 182.9386,
"eval_samples_per_second": 57.396,
"eval_steps_per_second": 0.902,
"step": 700
},
{
"epoch": 1.9092755585646581,
"grad_norm": 3.4296946924569442,
"learning_rate": 2.9632649882205083e-07,
"loss": 0.7287,
"step": 705
},
{
"epoch": 1.9228165199729181,
"grad_norm": 3.810691598305239,
"learning_rate": 2.8980901212728723e-07,
"loss": 0.7193,
"step": 710
},
{
"epoch": 1.9363574813811781,
"grad_norm": 3.3584278752072496,
"learning_rate": 2.833346210561619e-07,
"loss": 0.7112,
"step": 715
},
{
"epoch": 1.9498984427894381,
"grad_norm": 3.5743899932997185,
"learning_rate": 2.769046530581708e-07,
"loss": 0.7235,
"step": 720
},
{
"epoch": 1.9634394041976981,
"grad_norm": 3.8331759574897375,
"learning_rate": 2.705204264747125e-07,
"loss": 0.724,
"step": 725
},
{
"epoch": 1.9769803656059581,
"grad_norm": 3.6084594988279908,
"learning_rate": 2.6418325026878665e-07,
"loss": 0.7156,
"step": 730
},
{
"epoch": 1.9905213270142181,
"grad_norm": 3.56873955236049,
"learning_rate": 2.578944237566174e-07,
"loss": 0.7163,
"step": 735
},
{
"epoch": 2.004062288422478,
"grad_norm": 4.103162581101771,
"learning_rate": 2.5165523634125337e-07,
"loss": 0.7161,
"step": 740
},
{
"epoch": 2.017603249830738,
"grad_norm": 3.7064092978900844,
"learning_rate": 2.454669672481996e-07,
"loss": 0.6754,
"step": 745
},
{
"epoch": 2.031144211238998,
"grad_norm": 3.6900542156905196,
"learning_rate": 2.393308852631373e-07,
"loss": 0.6724,
"step": 750
},
{
"epoch": 2.031144211238998,
"eval_loss": 0.7857776880264282,
"eval_runtime": 183.0378,
"eval_samples_per_second": 57.365,
"eval_steps_per_second": 0.901,
"step": 750
},
{
"epoch": 2.044685172647258,
"grad_norm": 3.5536516436485255,
"learning_rate": 2.3324824847178494e-07,
"loss": 0.6887,
"step": 755
},
{
"epoch": 2.058226134055518,
"grad_norm": 3.7965121612299564,
"learning_rate": 2.2722030400194975e-07,
"loss": 0.666,
"step": 760
},
{
"epoch": 2.071767095463778,
"grad_norm": 3.7936989153822007,
"learning_rate": 2.2124828776782955e-07,
"loss": 0.6789,
"step": 765
},
{
"epoch": 2.085308056872038,
"grad_norm": 3.5401816973807043,
"learning_rate": 2.1533342421661228e-07,
"loss": 0.6665,
"step": 770
},
{
"epoch": 2.0988490182802977,
"grad_norm": 3.7787164350636555,
"learning_rate": 2.0947692607742618e-07,
"loss": 0.6755,
"step": 775
},
{
"epoch": 2.1123899796885577,
"grad_norm": 3.9382718196335267,
"learning_rate": 2.0367999411269282e-07,
"loss": 0.6821,
"step": 780
},
{
"epoch": 2.1259309410968177,
"grad_norm": 3.8112238429444782,
"learning_rate": 1.9794381687193456e-07,
"loss": 0.6805,
"step": 785
},
{
"epoch": 2.1394719025050777,
"grad_norm": 3.8744335724512204,
"learning_rate": 1.9226957044808494e-07,
"loss": 0.6657,
"step": 790
},
{
"epoch": 2.1530128639133377,
"grad_norm": 3.7804638456283346,
"learning_rate": 1.866584182363528e-07,
"loss": 0.6789,
"step": 795
},
{
"epoch": 2.1665538253215977,
"grad_norm": 3.8021451485147963,
"learning_rate": 1.811115106956918e-07,
"loss": 0.6883,
"step": 800
},
{
"epoch": 2.1665538253215977,
"eval_loss": 0.7850033044815063,
"eval_runtime": 182.9949,
"eval_samples_per_second": 57.379,
"eval_steps_per_second": 0.902,
"step": 800
},
{
"epoch": 2.1800947867298577,
"grad_norm": 3.864215108703362,
"learning_rate": 1.7562998511291943e-07,
"loss": 0.6811,
"step": 805
},
{
"epoch": 2.1936357481381177,
"grad_norm": 3.8300913859664667,
"learning_rate": 1.702149653695395e-07,
"loss": 0.6766,
"step": 810
},
{
"epoch": 2.2071767095463777,
"grad_norm": 3.8635188226813666,
"learning_rate": 1.6486756171131062e-07,
"loss": 0.675,
"step": 815
},
{
"epoch": 2.2207176709546377,
"grad_norm": 3.9283113465457355,
"learning_rate": 1.595888705206128e-07,
"loss": 0.6678,
"step": 820
},
{
"epoch": 2.2342586323628977,
"grad_norm": 3.726732058605602,
"learning_rate": 1.5437997409165476e-07,
"loss": 0.6733,
"step": 825
},
{
"epoch": 2.2477995937711577,
"grad_norm": 3.6073721199402318,
"learning_rate": 1.4924194040856973e-07,
"loss": 0.6794,
"step": 830
},
{
"epoch": 2.2613405551794177,
"grad_norm": 3.920320006141431,
"learning_rate": 1.4417582292644691e-07,
"loss": 0.6871,
"step": 835
},
{
"epoch": 2.2748815165876777,
"grad_norm": 3.781911882917061,
"learning_rate": 1.3918266035534027e-07,
"loss": 0.6774,
"step": 840
},
{
"epoch": 2.2884224779959377,
"grad_norm": 4.013058729107201,
"learning_rate": 1.3426347644730047e-07,
"loss": 0.6816,
"step": 845
},
{
"epoch": 2.3019634394041977,
"grad_norm": 3.776810144116961,
"learning_rate": 1.2941927978647526e-07,
"loss": 0.6709,
"step": 850
},
{
"epoch": 2.3019634394041977,
"eval_loss": 0.7840232253074646,
"eval_runtime": 182.9773,
"eval_samples_per_second": 57.384,
"eval_steps_per_second": 0.902,
"step": 850
},
{
"epoch": 2.3155044008124577,
"grad_norm": 4.0267886200903344,
"learning_rate": 1.2465106358231753e-07,
"loss": 0.6765,
"step": 855
},
{
"epoch": 2.3290453622207177,
"grad_norm": 3.79514301881657,
"learning_rate": 1.1995980546594775e-07,
"loss": 0.6633,
"step": 860
},
{
"epoch": 2.3425863236289777,
"grad_norm": 3.7026026349952086,
"learning_rate": 1.153464672897091e-07,
"loss": 0.678,
"step": 865
},
{
"epoch": 2.3561272850372377,
"grad_norm": 3.977299316585606,
"learning_rate": 1.108119949299578e-07,
"loss": 0.6875,
"step": 870
},
{
"epoch": 2.3696682464454977,
"grad_norm": 3.9505974017459544,
"learning_rate": 1.0635731809312992e-07,
"loss": 0.6955,
"step": 875
},
{
"epoch": 2.3832092078537577,
"grad_norm": 3.9944161998447116,
"learning_rate": 1.0198335012512271e-07,
"loss": 0.6843,
"step": 880
},
{
"epoch": 2.3967501692620177,
"grad_norm": 4.08644897660094,
"learning_rate": 9.769098782403041e-08,
"loss": 0.7081,
"step": 885
},
{
"epoch": 2.4102911306702777,
"grad_norm": 4.033807984306314,
"learning_rate": 9.348111125627278e-08,
"loss": 0.6758,
"step": 890
},
{
"epoch": 2.4238320920785377,
"grad_norm": 3.615156557294799,
"learning_rate": 8.935458357615583e-08,
"loss": 0.6718,
"step": 895
},
{
"epoch": 2.4373730534867977,
"grad_norm": 3.876477554855966,
"learning_rate": 8.531225084889654e-08,
"loss": 0.6598,
"step": 900
},
{
"epoch": 2.4373730534867977,
"eval_loss": 0.7834283113479614,
"eval_runtime": 183.0709,
"eval_samples_per_second": 57.355,
"eval_steps_per_second": 0.901,
"step": 900
},
{
"epoch": 2.4509140148950577,
"grad_norm": 3.6988949380997336,
"learning_rate": 8.135494187715475e-08,
"loss": 0.6603,
"step": 905
},
{
"epoch": 2.4644549763033177,
"grad_norm": 3.931452073089016,
"learning_rate": 7.748346803110295e-08,
"loss": 0.6832,
"step": 910
},
{
"epoch": 2.4779959377115777,
"grad_norm": 3.8160191178139047,
"learning_rate": 7.369862308207025e-08,
"loss": 0.6583,
"step": 915
},
{
"epoch": 2.4915368991198377,
"grad_norm": 3.8469114382677874,
"learning_rate": 7.000118303979463e-08,
"loss": 0.6808,
"step": 920
},
{
"epoch": 2.5050778605280977,
"grad_norm": 3.7497256852290115,
"learning_rate": 6.639190599331746e-08,
"loss": 0.6762,
"step": 925
},
{
"epoch": 2.518618821936357,
"grad_norm": 3.663314489242292,
"learning_rate": 6.287153195555173e-08,
"loss": 0.6663,
"step": 930
},
{
"epoch": 2.5321597833446177,
"grad_norm": 3.8930436232018333,
"learning_rate": 5.944078271155639e-08,
"loss": 0.6648,
"step": 935
},
{
"epoch": 2.545700744752877,
"grad_norm": 3.6616608952378904,
"learning_rate": 5.610036167054838e-08,
"loss": 0.6596,
"step": 940
},
{
"epoch": 2.5592417061611377,
"grad_norm": 3.986331709466641,
"learning_rate": 5.2850953721682635e-08,
"loss": 0.669,
"step": 945
},
{
"epoch": 2.572782667569397,
"grad_norm": 3.960581833122488,
"learning_rate": 4.969322509362761e-08,
"loss": 0.674,
"step": 950
},
{
"epoch": 2.572782667569397,
"eval_loss": 0.7830283641815186,
"eval_runtime": 182.8342,
"eval_samples_per_second": 57.429,
"eval_steps_per_second": 0.902,
"step": 950
},
{
"epoch": 2.5863236289776577,
"grad_norm": 3.8618441431288217,
"learning_rate": 4.662782321796849e-08,
"loss": 0.6713,
"step": 955
},
{
"epoch": 2.599864590385917,
"grad_norm": 3.5409233232724335,
"learning_rate": 4.365537659646418e-08,
"loss": 0.6747,
"step": 960
},
{
"epoch": 2.6134055517941777,
"grad_norm": 3.744738202206873,
"learning_rate": 4.0776494672184356e-08,
"loss": 0.6846,
"step": 965
},
{
"epoch": 2.626946513202437,
"grad_norm": 3.727245201869487,
"learning_rate": 3.799176770455526e-08,
"loss": 0.6616,
"step": 970
},
{
"epoch": 2.640487474610697,
"grad_norm": 3.7258573002382147,
"learning_rate": 3.530176664833834e-08,
"loss": 0.675,
"step": 975
},
{
"epoch": 2.654028436018957,
"grad_norm": 3.869690791825916,
"learning_rate": 3.270704303656696e-08,
"loss": 0.6875,
"step": 980
},
{
"epoch": 2.667569397427217,
"grad_norm": 4.064616477774205,
"learning_rate": 3.020812886746477e-08,
"loss": 0.6808,
"step": 985
},
{
"epoch": 2.681110358835477,
"grad_norm": 3.704371552936023,
"learning_rate": 2.7805536495370373e-08,
"loss": 0.6687,
"step": 990
},
{
"epoch": 2.694651320243737,
"grad_norm": 4.055603563401218,
"learning_rate": 2.5499758525688197e-08,
"loss": 0.6584,
"step": 995
},
{
"epoch": 2.708192281651997,
"grad_norm": 3.749384489878185,
"learning_rate": 2.329126771388995e-08,
"loss": 0.656,
"step": 1000
},
{
"epoch": 2.708192281651997,
"eval_loss": 0.7828182578086853,
"eval_runtime": 182.6838,
"eval_samples_per_second": 57.476,
"eval_steps_per_second": 0.903,
"step": 1000
},
{
"epoch": 2.721733243060257,
"grad_norm": 3.869599930871293,
"learning_rate": 2.1180516868584464e-08,
"loss": 0.6716,
"step": 1005
},
{
"epoch": 2.735274204468517,
"grad_norm": 3.930506514677681,
"learning_rate": 1.916793875867839e-08,
"loss": 0.6822,
"step": 1010
},
{
"epoch": 2.748815165876777,
"grad_norm": 3.8320813018837616,
"learning_rate": 1.7253946024645472e-08,
"loss": 0.6627,
"step": 1015
},
{
"epoch": 2.762356127285037,
"grad_norm": 3.986150848206186,
"learning_rate": 1.5438931093921804e-08,
"loss": 0.6727,
"step": 1020
},
{
"epoch": 2.775897088693297,
"grad_norm": 3.764718626888124,
"learning_rate": 1.372326610044705e-08,
"loss": 0.6618,
"step": 1025
},
{
"epoch": 2.789438050101557,
"grad_norm": 3.7384921853849393,
"learning_rate": 1.2107302808364638e-08,
"loss": 0.6614,
"step": 1030
},
{
"epoch": 2.802979011509817,
"grad_norm": 3.8134941063063867,
"learning_rate": 1.0591372539900056e-08,
"loss": 0.6665,
"step": 1035
},
{
"epoch": 2.816519972918077,
"grad_norm": 3.7735093872780197,
"learning_rate": 9.175786107429085e-09,
"loss": 0.6643,
"step": 1040
},
{
"epoch": 2.830060934326337,
"grad_norm": 3.987550484105897,
"learning_rate": 7.860833749751772e-09,
"loss": 0.6739,
"step": 1045
},
{
"epoch": 2.843601895734597,
"grad_norm": 3.9380769036431893,
"learning_rate": 6.6467850725848705e-09,
"loss": 0.6741,
"step": 1050
},
{
"epoch": 2.843601895734597,
"eval_loss": 0.7824584245681763,
"eval_runtime": 183.0944,
"eval_samples_per_second": 57.347,
"eval_steps_per_second": 0.901,
"step": 1050
},
{
"epoch": 2.857142857142857,
"grad_norm": 4.011659897593238,
"learning_rate": 5.5338889932838306e-09,
"loss": 0.6842,
"step": 1055
},
{
"epoch": 2.870683818551117,
"grad_norm": 3.808051194891409,
"learning_rate": 4.5223736898076235e-09,
"loss": 0.6806,
"step": 1060
},
{
"epoch": 2.884224779959377,
"grad_norm": 3.8839001927108856,
"learning_rate": 3.612446553934723e-09,
"loss": 0.6679,
"step": 1065
},
{
"epoch": 2.897765741367637,
"grad_norm": 3.9713825006231054,
"learning_rate": 2.804294148741948e-09,
"loss": 0.6733,
"step": 1070
},
{
"epoch": 2.911306702775897,
"grad_norm": 3.783696934827739,
"learning_rate": 2.0980821703527886e-09,
"loss": 0.6736,
"step": 1075
},
{
"epoch": 2.924847664184157,
"grad_norm": 3.770344601359413,
"learning_rate": 1.4939554139648536e-09,
"loss": 0.6649,
"step": 1080
},
{
"epoch": 2.938388625592417,
"grad_norm": 3.8357523688679565,
"learning_rate": 9.920377441623994e-10,
"loss": 0.6718,
"step": 1085
},
{
"epoch": 2.951929587000677,
"grad_norm": 4.03615476407359,
"learning_rate": 5.92432069520199e-10,
"loss": 0.6805,
"step": 1090
},
{
"epoch": 2.9654705484089368,
"grad_norm": 3.8321791456875283,
"learning_rate": 2.9522032150419705e-10,
"loss": 0.6629,
"step": 1095
},
{
"epoch": 2.979011509817197,
"grad_norm": 3.9373632743696056,
"learning_rate": 1.0046343767294852e-10,
"loss": 0.6592,
"step": 1100
},
{
"epoch": 2.979011509817197,
"eval_loss": 0.7824262976646423,
"eval_runtime": 182.8708,
"eval_samples_per_second": 57.418,
"eval_steps_per_second": 0.902,
"step": 1100
},
{
"epoch": 2.9925524712254568,
"grad_norm": 3.6875439191522075,
"learning_rate": 8.201349183611927e-12,
"loss": 0.6534,
"step": 1105
},
{
"epoch": 2.997968855788761,
"step": 1107,
"total_flos": 6527139780231168.0,
"train_loss": 0.7629147509572306,
"train_runtime": 18558.2767,
"train_samples_per_second": 15.276,
"train_steps_per_second": 0.06
}
],
"logging_steps": 5,
"max_steps": 1107,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6527139780231168.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}