pylate-49500 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9924215084806929,
"eval_steps": 500,
"global_step": 49500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01002445968162316,
"grad_norm": 0.1908789426088333,
"learning_rate": 2.969926620955131e-05,
"loss": 0.0305,
"step": 500
},
{
"epoch": 0.02004891936324632,
"grad_norm": 0.2333366870880127,
"learning_rate": 2.939853241910261e-05,
"loss": 0.027,
"step": 1000
},
{
"epoch": 0.03007337904486948,
"grad_norm": 0.18207737803459167,
"learning_rate": 2.9097798628653917e-05,
"loss": 0.026,
"step": 1500
},
{
"epoch": 0.04009783872649264,
"grad_norm": 0.13153837621212006,
"learning_rate": 2.879706483820522e-05,
"loss": 0.0253,
"step": 2000
},
{
"epoch": 0.0501222984081158,
"grad_norm": 0.20580655336380005,
"learning_rate": 2.849633104775653e-05,
"loss": 0.0249,
"step": 2500
},
{
"epoch": 0.06014675808973896,
"grad_norm": 0.16466622054576874,
"learning_rate": 2.8196198724888727e-05,
"loss": 0.0239,
"step": 3000
},
{
"epoch": 0.07017121777136212,
"grad_norm": 0.1590748131275177,
"learning_rate": 2.7895464934440035e-05,
"loss": 0.0239,
"step": 3500
},
{
"epoch": 0.08019567745298528,
"grad_norm": 0.14150911569595337,
"learning_rate": 2.759473114399134e-05,
"loss": 0.0236,
"step": 4000
},
{
"epoch": 0.09022013713460844,
"grad_norm": 0.09914161264896393,
"learning_rate": 2.7293997353542647e-05,
"loss": 0.0236,
"step": 4500
},
{
"epoch": 0.1002445968162316,
"grad_norm": 0.16175945103168488,
"learning_rate": 2.6993263563093947e-05,
"loss": 0.0232,
"step": 5000
},
{
"epoch": 0.11026905649785476,
"grad_norm": 0.14557033777236938,
"learning_rate": 2.6692529772645255e-05,
"loss": 0.0229,
"step": 5500
},
{
"epoch": 0.12029351617947792,
"grad_norm": 0.14236459136009216,
"learning_rate": 2.639179598219656e-05,
"loss": 0.0228,
"step": 6000
},
{
"epoch": 0.13031797586110108,
"grad_norm": 0.1859973669052124,
"learning_rate": 2.6091062191747867e-05,
"loss": 0.0226,
"step": 6500
},
{
"epoch": 0.14034243554272424,
"grad_norm": 0.21533679962158203,
"learning_rate": 2.5790929868880065e-05,
"loss": 0.0226,
"step": 7000
},
{
"epoch": 0.1503668952243474,
"grad_norm": 0.13067375123500824,
"learning_rate": 2.5490196078431373e-05,
"loss": 0.0222,
"step": 7500
},
{
"epoch": 0.16039135490597056,
"grad_norm": 0.15966835618019104,
"learning_rate": 2.518946228798268e-05,
"loss": 0.0223,
"step": 8000
},
{
"epoch": 0.17041581458759372,
"grad_norm": 0.16358782351016998,
"learning_rate": 2.4888728497533985e-05,
"loss": 0.0218,
"step": 8500
},
{
"epoch": 0.18044027426921688,
"grad_norm": 0.2502037286758423,
"learning_rate": 2.458799470708529e-05,
"loss": 0.0219,
"step": 9000
},
{
"epoch": 0.19046473395084004,
"grad_norm": 0.13426147401332855,
"learning_rate": 2.4287260916636593e-05,
"loss": 0.0221,
"step": 9500
},
{
"epoch": 0.2004891936324632,
"grad_norm": 0.17509840428829193,
"learning_rate": 2.39865271261879e-05,
"loss": 0.0219,
"step": 10000
},
{
"epoch": 0.21051365331408636,
"grad_norm": 0.12009692937135696,
"learning_rate": 2.3685793335739205e-05,
"loss": 0.0216,
"step": 10500
},
{
"epoch": 0.22053811299570952,
"grad_norm": 0.1322207748889923,
"learning_rate": 2.338505954529051e-05,
"loss": 0.0211,
"step": 11000
},
{
"epoch": 0.23056257267733268,
"grad_norm": 0.14883308112621307,
"learning_rate": 2.308492722242271e-05,
"loss": 0.0214,
"step": 11500
},
{
"epoch": 0.24058703235895584,
"grad_norm": 0.13109040260314941,
"learning_rate": 2.278419343197402e-05,
"loss": 0.0211,
"step": 12000
},
{
"epoch": 0.25061149204057903,
"grad_norm": 0.1747232973575592,
"learning_rate": 2.2483459641525323e-05,
"loss": 0.0214,
"step": 12500
},
{
"epoch": 0.26063595172220216,
"grad_norm": 0.14633044600486755,
"learning_rate": 2.2182725851076627e-05,
"loss": 0.0212,
"step": 13000
},
{
"epoch": 0.27066041140382535,
"grad_norm": 0.16928279399871826,
"learning_rate": 2.188199206062793e-05,
"loss": 0.0209,
"step": 13500
},
{
"epoch": 0.2806848710854485,
"grad_norm": 0.1422567069530487,
"learning_rate": 2.158125827017924e-05,
"loss": 0.0206,
"step": 14000
},
{
"epoch": 0.29070933076707167,
"grad_norm": 0.1617174744606018,
"learning_rate": 2.1280524479730543e-05,
"loss": 0.0208,
"step": 14500
},
{
"epoch": 0.3007337904486948,
"grad_norm": 0.12710513174533844,
"learning_rate": 2.0979790689281848e-05,
"loss": 0.0204,
"step": 15000
},
{
"epoch": 0.310758250130318,
"grad_norm": 0.12561199069023132,
"learning_rate": 2.0679056898833152e-05,
"loss": 0.0207,
"step": 15500
},
{
"epoch": 0.3207827098119411,
"grad_norm": 0.12565076351165771,
"learning_rate": 2.037832310838446e-05,
"loss": 0.0204,
"step": 16000
},
{
"epoch": 0.3308071694935643,
"grad_norm": 0.16111774742603302,
"learning_rate": 2.0077589317935764e-05,
"loss": 0.0204,
"step": 16500
},
{
"epoch": 0.34083162917518744,
"grad_norm": 0.08807434886693954,
"learning_rate": 1.9776855527487068e-05,
"loss": 0.0204,
"step": 17000
},
{
"epoch": 0.35085608885681063,
"grad_norm": 0.12991702556610107,
"learning_rate": 1.9476121737038372e-05,
"loss": 0.0203,
"step": 17500
},
{
"epoch": 0.36088054853843377,
"grad_norm": 0.17382492125034332,
"learning_rate": 1.917538794658968e-05,
"loss": 0.0199,
"step": 18000
},
{
"epoch": 0.37090500822005695,
"grad_norm": 0.16158685088157654,
"learning_rate": 1.8875255623721885e-05,
"loss": 0.02,
"step": 18500
},
{
"epoch": 0.3809294679016801,
"grad_norm": 0.1214708685874939,
"learning_rate": 1.8575123300854082e-05,
"loss": 0.0199,
"step": 19000
},
{
"epoch": 0.3909539275833033,
"grad_norm": 0.11400719732046127,
"learning_rate": 1.827438951040539e-05,
"loss": 0.0197,
"step": 19500
},
{
"epoch": 0.4009783872649264,
"grad_norm": 0.09430436044931412,
"learning_rate": 1.7973655719956694e-05,
"loss": 0.0197,
"step": 20000
},
{
"epoch": 0.4110028469465496,
"grad_norm": 0.13852345943450928,
"learning_rate": 1.7672921929508002e-05,
"loss": 0.0199,
"step": 20500
},
{
"epoch": 0.4210273066281727,
"grad_norm": 0.13223719596862793,
"learning_rate": 1.7372188139059303e-05,
"loss": 0.0198,
"step": 21000
},
{
"epoch": 0.4310517663097959,
"grad_norm": 0.08761810511350632,
"learning_rate": 1.707145434861061e-05,
"loss": 0.0196,
"step": 21500
},
{
"epoch": 0.44107622599141905,
"grad_norm": 0.13589079678058624,
"learning_rate": 1.6770720558161915e-05,
"loss": 0.02,
"step": 22000
},
{
"epoch": 0.45110068567304223,
"grad_norm": 0.08927265554666519,
"learning_rate": 1.6469986767713223e-05,
"loss": 0.0196,
"step": 22500
},
{
"epoch": 0.46112514535466537,
"grad_norm": 0.06504929065704346,
"learning_rate": 1.6169252977264524e-05,
"loss": 0.0196,
"step": 23000
},
{
"epoch": 0.47114960503628855,
"grad_norm": 0.17200474441051483,
"learning_rate": 1.586851918681583e-05,
"loss": 0.0192,
"step": 23500
},
{
"epoch": 0.4811740647179117,
"grad_norm": 0.15591026842594147,
"learning_rate": 1.5567785396367136e-05,
"loss": 0.0195,
"step": 24000
},
{
"epoch": 0.4911985243995349,
"grad_norm": 0.07878988236188889,
"learning_rate": 1.5267051605918443e-05,
"loss": 0.0197,
"step": 24500
},
{
"epoch": 0.5012229840811581,
"grad_norm": 0.14917835593223572,
"learning_rate": 1.4967520750631541e-05,
"loss": 0.0194,
"step": 25000
},
{
"epoch": 0.5112474437627812,
"grad_norm": 0.13516920804977417,
"learning_rate": 1.4666786960182846e-05,
"loss": 0.0191,
"step": 25500
},
{
"epoch": 0.5212719034444043,
"grad_norm": 0.13264451920986176,
"learning_rate": 1.4366053169734152e-05,
"loss": 0.019,
"step": 26000
},
{
"epoch": 0.5312963631260275,
"grad_norm": 0.11801792681217194,
"learning_rate": 1.4065319379285456e-05,
"loss": 0.019,
"step": 26500
},
{
"epoch": 0.5413208228076507,
"grad_norm": 0.10833830386400223,
"learning_rate": 1.3764585588836762e-05,
"loss": 0.0192,
"step": 27000
},
{
"epoch": 0.5513452824892738,
"grad_norm": 0.1249992698431015,
"learning_rate": 1.3463851798388068e-05,
"loss": 0.0193,
"step": 27500
},
{
"epoch": 0.561369742170897,
"grad_norm": 0.1460021287202835,
"learning_rate": 1.3163118007939374e-05,
"loss": 0.0189,
"step": 28000
},
{
"epoch": 0.5713942018525201,
"grad_norm": 0.10502651333808899,
"learning_rate": 1.2862384217490678e-05,
"loss": 0.019,
"step": 28500
},
{
"epoch": 0.5814186615341433,
"grad_norm": 0.1526552140712738,
"learning_rate": 1.2561650427041984e-05,
"loss": 0.0189,
"step": 29000
},
{
"epoch": 0.5914431212157665,
"grad_norm": 0.10992331802845001,
"learning_rate": 1.2261518104174185e-05,
"loss": 0.0189,
"step": 29500
},
{
"epoch": 0.6014675808973896,
"grad_norm": 0.13150319457054138,
"learning_rate": 1.1961385781306388e-05,
"loss": 0.0189,
"step": 30000
},
{
"epoch": 0.6114920405790127,
"grad_norm": 0.13109181821346283,
"learning_rate": 1.1661253458438591e-05,
"loss": 0.0187,
"step": 30500
},
{
"epoch": 0.621516500260636,
"grad_norm": 0.1083877831697464,
"learning_rate": 1.1360519667989895e-05,
"loss": 0.0187,
"step": 31000
},
{
"epoch": 0.6315409599422591,
"grad_norm": 0.1419740468263626,
"learning_rate": 1.1059785877541201e-05,
"loss": 0.0186,
"step": 31500
},
{
"epoch": 0.6415654196238822,
"grad_norm": 0.13743899762630463,
"learning_rate": 1.0759052087092506e-05,
"loss": 0.0187,
"step": 32000
},
{
"epoch": 0.6515898793055054,
"grad_norm": 0.11756709218025208,
"learning_rate": 1.0458318296643812e-05,
"loss": 0.0187,
"step": 32500
},
{
"epoch": 0.6616143389871286,
"grad_norm": 0.28276416659355164,
"learning_rate": 1.0157584506195116e-05,
"loss": 0.0186,
"step": 33000
},
{
"epoch": 0.6716387986687518,
"grad_norm": 0.08689709007740021,
"learning_rate": 9.856850715746422e-06,
"loss": 0.0185,
"step": 33500
},
{
"epoch": 0.6816632583503749,
"grad_norm": 0.16307085752487183,
"learning_rate": 9.556116925297726e-06,
"loss": 0.0188,
"step": 34000
},
{
"epoch": 0.691687718031998,
"grad_norm": 0.09724652022123337,
"learning_rate": 9.25598460242993e-06,
"loss": 0.0186,
"step": 34500
},
{
"epoch": 0.7017121777136213,
"grad_norm": 0.1357831209897995,
"learning_rate": 8.955250811981233e-06,
"loss": 0.0185,
"step": 35000
},
{
"epoch": 0.7117366373952444,
"grad_norm": 0.07240907102823257,
"learning_rate": 8.65451702153254e-06,
"loss": 0.0182,
"step": 35500
},
{
"epoch": 0.7217610970768675,
"grad_norm": 0.12668687105178833,
"learning_rate": 8.353783231083844e-06,
"loss": 0.0184,
"step": 36000
},
{
"epoch": 0.7317855567584907,
"grad_norm": 0.10139577835798264,
"learning_rate": 8.05304944063515e-06,
"loss": 0.0183,
"step": 36500
},
{
"epoch": 0.7418100164401139,
"grad_norm": 0.13369520008563995,
"learning_rate": 7.752315650186454e-06,
"loss": 0.0184,
"step": 37000
},
{
"epoch": 0.751834476121737,
"grad_norm": 0.10762561112642288,
"learning_rate": 7.451581859737761e-06,
"loss": 0.0183,
"step": 37500
},
{
"epoch": 0.7618589358033602,
"grad_norm": 0.11992637068033218,
"learning_rate": 7.150848069289066e-06,
"loss": 0.0182,
"step": 38000
},
{
"epoch": 0.7718833954849834,
"grad_norm": 0.10149220377206802,
"learning_rate": 6.850114278840371e-06,
"loss": 0.0184,
"step": 38500
},
{
"epoch": 0.7819078551666065,
"grad_norm": 0.11299099028110504,
"learning_rate": 6.549981955972573e-06,
"loss": 0.0183,
"step": 39000
},
{
"epoch": 0.7919323148482297,
"grad_norm": 0.1773054599761963,
"learning_rate": 6.249248165523878e-06,
"loss": 0.0184,
"step": 39500
},
{
"epoch": 0.8019567745298528,
"grad_norm": 0.09563998878002167,
"learning_rate": 5.949115842656081e-06,
"loss": 0.0184,
"step": 40000
},
{
"epoch": 0.811981234211476,
"grad_norm": 0.12298491597175598,
"learning_rate": 5.648382052207386e-06,
"loss": 0.0182,
"step": 40500
},
{
"epoch": 0.8220056938930992,
"grad_norm": 0.1283864825963974,
"learning_rate": 5.3476482617586915e-06,
"loss": 0.0182,
"step": 41000
},
{
"epoch": 0.8320301535747223,
"grad_norm": 0.10248496383428574,
"learning_rate": 5.046914471309997e-06,
"loss": 0.0182,
"step": 41500
},
{
"epoch": 0.8420546132563455,
"grad_norm": 0.09150300920009613,
"learning_rate": 4.746180680861302e-06,
"loss": 0.0183,
"step": 42000
},
{
"epoch": 0.8520790729379687,
"grad_norm": 0.1290718913078308,
"learning_rate": 4.445446890412607e-06,
"loss": 0.0183,
"step": 42500
},
{
"epoch": 0.8621035326195918,
"grad_norm": 0.12163352966308594,
"learning_rate": 4.144713099963912e-06,
"loss": 0.0179,
"step": 43000
},
{
"epoch": 0.872127992301215,
"grad_norm": 0.15341030061244965,
"learning_rate": 3.843979309515217e-06,
"loss": 0.0178,
"step": 43500
},
{
"epoch": 0.8821524519828381,
"grad_norm": 0.1429668664932251,
"learning_rate": 3.5438469866474197e-06,
"loss": 0.0178,
"step": 44000
},
{
"epoch": 0.8921769116644613,
"grad_norm": 0.10146286338567734,
"learning_rate": 3.243113196198725e-06,
"loss": 0.018,
"step": 44500
},
{
"epoch": 0.9022013713460845,
"grad_norm": 0.1440785676240921,
"learning_rate": 2.9429808733309277e-06,
"loss": 0.0181,
"step": 45000
},
{
"epoch": 0.9122258310277076,
"grad_norm": 0.10165946930646896,
"learning_rate": 2.642247082882233e-06,
"loss": 0.0178,
"step": 45500
},
{
"epoch": 0.9222502907093307,
"grad_norm": 0.14765135943889618,
"learning_rate": 2.341513292433538e-06,
"loss": 0.018,
"step": 46000
},
{
"epoch": 0.932274750390954,
"grad_norm": 0.07201279699802399,
"learning_rate": 2.040779501984843e-06,
"loss": 0.018,
"step": 46500
},
{
"epoch": 0.9422992100725771,
"grad_norm": 0.13335958123207092,
"learning_rate": 1.7400457115361483e-06,
"loss": 0.0178,
"step": 47000
},
{
"epoch": 0.9523236697542002,
"grad_norm": 0.10147497057914734,
"learning_rate": 1.4393119210874535e-06,
"loss": 0.0178,
"step": 47500
},
{
"epoch": 0.9623481294358234,
"grad_norm": 0.09820819646120071,
"learning_rate": 1.1385781306387586e-06,
"loss": 0.0179,
"step": 48000
},
{
"epoch": 0.9723725891174466,
"grad_norm": 0.1263681799173355,
"learning_rate": 8.378443401900638e-07,
"loss": 0.018,
"step": 48500
},
{
"epoch": 0.9823970487990697,
"grad_norm": 0.08029880374670029,
"learning_rate": 5.377120173222663e-07,
"loss": 0.0181,
"step": 49000
},
{
"epoch": 0.9924215084806929,
"grad_norm": 0.12001931667327881,
"learning_rate": 2.369782268735715e-07,
"loss": 0.018,
"step": 49500
}
],
"logging_steps": 500,
"max_steps": 49878,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}
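
A minimal sketch (not part of the original file), assuming Python 3 with matplotlib available, of one way to read this trainer_state.json and plot the logged training loss against the global step; the local path "trainer_state.json" is a placeholder for wherever the file is saved.

```python
import json

import matplotlib.pyplot as plt

# Hypothetical local path to the file shown above.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each entry in log_history that carries a "loss" key is a logging step
# (every 500 steps here, per "logging_steps": 500).
steps = [entry["step"] for entry in state["log_history"] if "loss" in entry]
losses = [entry["loss"] for entry in state["log_history"] if "loss" in entry]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title(f"Loss over {state['epoch']:.2f} epochs ({state['global_step']} steps)")
plt.show()
```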