{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 6935, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021629416005767843, "grad_norm": 76.04898834228516, "learning_rate": 4.978370583994232e-05, "loss": 1.1957, "step": 30 }, { "epoch": 0.043258832011535686, "grad_norm": 15.82689094543457, "learning_rate": 4.956741167988465e-05, "loss": 0.1664, "step": 60 }, { "epoch": 0.06488824801730353, "grad_norm": 0.9152764678001404, "learning_rate": 4.935111751982697e-05, "loss": 0.0335, "step": 90 }, { "epoch": 0.08651766402307137, "grad_norm": 0.8367058634757996, "learning_rate": 4.913482335976929e-05, "loss": 0.0071, "step": 120 }, { "epoch": 0.10814708002883922, "grad_norm": 0.9554459452629089, "learning_rate": 4.891852919971161e-05, "loss": 0.0019, "step": 150 }, { "epoch": 0.12977649603460706, "grad_norm": 0.09180067479610443, "learning_rate": 4.8702235039653934e-05, "loss": 0.009, "step": 180 }, { "epoch": 0.1514059120403749, "grad_norm": 0.45300760865211487, "learning_rate": 4.8485940879596254e-05, "loss": 0.002, "step": 210 }, { "epoch": 0.17303532804614274, "grad_norm": 0.018595978617668152, "learning_rate": 4.826964671953857e-05, "loss": 0.0038, "step": 240 }, { "epoch": 0.1946647440519106, "grad_norm": 0.014309559017419815, "learning_rate": 4.80533525594809e-05, "loss": 0.0024, "step": 270 }, { "epoch": 0.21629416005767843, "grad_norm": 0.2507375180721283, "learning_rate": 4.783705839942322e-05, "loss": 0.009, "step": 300 }, { "epoch": 0.23792357606344627, "grad_norm": 0.031992267817258835, "learning_rate": 4.762076423936554e-05, "loss": 0.0095, "step": 330 }, { "epoch": 0.2595529920692141, "grad_norm": 0.04604390636086464, "learning_rate": 4.740447007930786e-05, "loss": 0.0024, "step": 360 }, { "epoch": 0.28118240807498196, "grad_norm": 0.03024614229798317, "learning_rate": 4.7188175919250185e-05, "loss": 0.0403, "step": 390 }, { "epoch": 0.3028118240807498, "grad_norm": 0.3150508403778076, "learning_rate": 4.6971881759192505e-05, "loss": 0.0028, "step": 420 }, { "epoch": 0.32444124008651765, "grad_norm": 0.11980323493480682, "learning_rate": 4.6755587599134824e-05, "loss": 0.0023, "step": 450 }, { "epoch": 0.3460706560922855, "grad_norm": 0.5558916926383972, "learning_rate": 4.6539293439077144e-05, "loss": 0.0117, "step": 480 }, { "epoch": 0.36770007209805333, "grad_norm": 0.1800965517759323, "learning_rate": 4.632299927901947e-05, "loss": 0.0029, "step": 510 }, { "epoch": 0.3893294881038212, "grad_norm": 0.05012692138552666, "learning_rate": 4.610670511896179e-05, "loss": 0.0104, "step": 540 }, { "epoch": 0.410958904109589, "grad_norm": 0.38384830951690674, "learning_rate": 4.589041095890411e-05, "loss": 0.0065, "step": 570 }, { "epoch": 0.43258832011535686, "grad_norm": 0.010520127601921558, "learning_rate": 4.5674116798846436e-05, "loss": 0.0014, "step": 600 }, { "epoch": 0.4542177361211247, "grad_norm": 0.5669275522232056, "learning_rate": 4.5457822638788756e-05, "loss": 0.0247, "step": 630 }, { "epoch": 0.47584715212689255, "grad_norm": 0.26488080620765686, "learning_rate": 4.5241528478731076e-05, "loss": 0.0065, "step": 660 }, { "epoch": 0.4974765681326604, "grad_norm": 0.6768619418144226, "learning_rate": 4.5025234318673395e-05, "loss": 0.005, "step": 690 }, { "epoch": 0.5191059841384282, "grad_norm": 0.007197729777544737, "learning_rate": 4.480894015861572e-05, "loss": 0.0013, "step": 720 }, { "epoch": 0.5407354001441961, "grad_norm": 0.10392127186059952, "learning_rate": 4.459264599855804e-05, "loss": 0.0014, "step": 750 }, { "epoch": 0.5623648161499639, "grad_norm": 0.01045119110494852, "learning_rate": 4.437635183850036e-05, "loss": 0.0194, "step": 780 }, { "epoch": 0.5839942321557318, "grad_norm": 0.004247739445418119, "learning_rate": 4.416005767844269e-05, "loss": 0.0001, "step": 810 }, { "epoch": 0.6056236481614996, "grad_norm": 0.0034305218141525984, "learning_rate": 4.394376351838501e-05, "loss": 0.003, "step": 840 }, { "epoch": 0.6272530641672674, "grad_norm": 0.0035491471644490957, "learning_rate": 4.372746935832733e-05, "loss": 0.0004, "step": 870 }, { "epoch": 0.6488824801730353, "grad_norm": 0.32892727851867676, "learning_rate": 4.3511175198269646e-05, "loss": 0.0128, "step": 900 }, { "epoch": 0.6705118961788031, "grad_norm": 0.0013508042320609093, "learning_rate": 4.329488103821197e-05, "loss": 0.0186, "step": 930 }, { "epoch": 0.692141312184571, "grad_norm": 0.7996568083763123, "learning_rate": 4.307858687815429e-05, "loss": 0.0033, "step": 960 }, { "epoch": 0.7137707281903388, "grad_norm": 0.49807533621788025, "learning_rate": 4.286229271809661e-05, "loss": 0.0131, "step": 990 }, { "epoch": 0.7354001441961067, "grad_norm": 0.002483088057488203, "learning_rate": 4.264599855803894e-05, "loss": 0.0006, "step": 1020 }, { "epoch": 0.7570295602018745, "grad_norm": 0.0030093444511294365, "learning_rate": 4.242970439798126e-05, "loss": 0.0013, "step": 1050 }, { "epoch": 0.7786589762076424, "grad_norm": 0.0018839107360690832, "learning_rate": 4.221341023792358e-05, "loss": 0.0123, "step": 1080 }, { "epoch": 0.8002883922134102, "grad_norm": 0.604235827922821, "learning_rate": 4.19971160778659e-05, "loss": 0.0006, "step": 1110 }, { "epoch": 0.821917808219178, "grad_norm": 0.009270986542105675, "learning_rate": 4.1780821917808224e-05, "loss": 0.0051, "step": 1140 }, { "epoch": 0.8435472242249459, "grad_norm": 0.168134868144989, "learning_rate": 4.1564527757750544e-05, "loss": 0.003, "step": 1170 }, { "epoch": 0.8651766402307137, "grad_norm": 0.3333891034126282, "learning_rate": 4.134823359769286e-05, "loss": 0.0247, "step": 1200 }, { "epoch": 0.8868060562364816, "grad_norm": 3.897454023361206, "learning_rate": 4.113193943763518e-05, "loss": 0.0005, "step": 1230 }, { "epoch": 0.9084354722422494, "grad_norm": 0.003549683140590787, "learning_rate": 4.091564527757751e-05, "loss": 0.0047, "step": 1260 }, { "epoch": 0.9300648882480173, "grad_norm": 0.01988086849451065, "learning_rate": 4.069935111751983e-05, "loss": 0.0025, "step": 1290 }, { "epoch": 0.9516943042537851, "grad_norm": 0.011269631795585155, "learning_rate": 4.048305695746215e-05, "loss": 0.0003, "step": 1320 }, { "epoch": 0.9733237202595529, "grad_norm": 0.0007853784481994808, "learning_rate": 4.0266762797404475e-05, "loss": 0.0003, "step": 1350 }, { "epoch": 0.9949531362653208, "grad_norm": 0.6295477747917175, "learning_rate": 4.0050468637346795e-05, "loss": 0.0018, "step": 1380 }, { "epoch": 1.0165825522710887, "grad_norm": 0.0005618122522719204, "learning_rate": 3.9834174477289115e-05, "loss": 0.0002, "step": 1410 }, { "epoch": 1.0382119682768565, "grad_norm": 0.00244883238337934, "learning_rate": 3.9617880317231434e-05, "loss": 0.0002, "step": 1440 }, { "epoch": 1.0598413842826244, "grad_norm": 0.0004200914700049907, "learning_rate": 3.940158615717376e-05, "loss": 0.0058, "step": 1470 }, { "epoch": 1.0814708002883922, "grad_norm": 0.0034315199591219425, "learning_rate": 3.918529199711608e-05, "loss": 0.0001, "step": 1500 }, { "epoch": 1.10310021629416, "grad_norm": 0.00489798141643405, "learning_rate": 3.89689978370584e-05, "loss": 0.0003, "step": 1530 }, { "epoch": 1.1247296322999278, "grad_norm": 1.112964391708374, "learning_rate": 3.8752703677000726e-05, "loss": 0.0001, "step": 1560 }, { "epoch": 1.1463590483056958, "grad_norm": 0.00021760116214863956, "learning_rate": 3.8536409516943046e-05, "loss": 0.0002, "step": 1590 }, { "epoch": 1.1679884643114635, "grad_norm": 0.03224096819758415, "learning_rate": 3.8320115356885366e-05, "loss": 0.0001, "step": 1620 }, { "epoch": 1.1896178803172315, "grad_norm": 0.0035550747998058796, "learning_rate": 3.8103821196827685e-05, "loss": 0.0002, "step": 1650 }, { "epoch": 1.2112472963229992, "grad_norm": 0.002545340685173869, "learning_rate": 3.788752703677001e-05, "loss": 0.0017, "step": 1680 }, { "epoch": 1.2328767123287672, "grad_norm": 0.0011141377035528421, "learning_rate": 3.767123287671233e-05, "loss": 0.0008, "step": 1710 }, { "epoch": 1.254506128334535, "grad_norm": 0.002276007318869233, "learning_rate": 3.745493871665465e-05, "loss": 0.0, "step": 1740 }, { "epoch": 1.2761355443403029, "grad_norm": 0.0016464549116790295, "learning_rate": 3.723864455659698e-05, "loss": 0.0001, "step": 1770 }, { "epoch": 1.2977649603460706, "grad_norm": 0.010479186661541462, "learning_rate": 3.70223503965393e-05, "loss": 0.0004, "step": 1800 }, { "epoch": 1.3193943763518385, "grad_norm": 0.0026020577643066645, "learning_rate": 3.680605623648162e-05, "loss": 0.0158, "step": 1830 }, { "epoch": 1.3410237923576063, "grad_norm": 0.007555871736258268, "learning_rate": 3.6589762076423937e-05, "loss": 0.0003, "step": 1860 }, { "epoch": 1.3626532083633742, "grad_norm": 0.05298294499516487, "learning_rate": 3.637346791636626e-05, "loss": 0.0001, "step": 1890 }, { "epoch": 1.384282624369142, "grad_norm": 0.10733333975076675, "learning_rate": 3.615717375630858e-05, "loss": 0.0001, "step": 1920 }, { "epoch": 1.40591204037491, "grad_norm": 0.0003156494931317866, "learning_rate": 3.59408795962509e-05, "loss": 0.0001, "step": 1950 }, { "epoch": 1.4275414563806776, "grad_norm": 0.0002769752754829824, "learning_rate": 3.572458543619322e-05, "loss": 0.0001, "step": 1980 }, { "epoch": 1.4491708723864456, "grad_norm": 0.0004361444734968245, "learning_rate": 3.550829127613555e-05, "loss": 0.0001, "step": 2010 }, { "epoch": 1.4708002883922133, "grad_norm": 0.012317474000155926, "learning_rate": 3.529199711607787e-05, "loss": 0.0001, "step": 2040 }, { "epoch": 1.4924297043979813, "grad_norm": 7.03033510944806e-05, "learning_rate": 3.507570295602019e-05, "loss": 0.0, "step": 2070 }, { "epoch": 1.5140591204037492, "grad_norm": 0.04190248250961304, "learning_rate": 3.4859408795962514e-05, "loss": 0.0001, "step": 2100 }, { "epoch": 1.535688536409517, "grad_norm": 0.017271051183342934, "learning_rate": 3.4643114635904834e-05, "loss": 0.0001, "step": 2130 }, { "epoch": 1.5573179524152847, "grad_norm": 0.023393860086798668, "learning_rate": 3.4426820475847154e-05, "loss": 0.0267, "step": 2160 }, { "epoch": 1.5789473684210527, "grad_norm": 0.0016879999311640859, "learning_rate": 3.421052631578947e-05, "loss": 0.0001, "step": 2190 }, { "epoch": 1.6005767844268206, "grad_norm": 0.00021976766583975405, "learning_rate": 3.39942321557318e-05, "loss": 0.0001, "step": 2220 }, { "epoch": 1.6222062004325883, "grad_norm": 0.04353722557425499, "learning_rate": 3.377793799567412e-05, "loss": 0.0017, "step": 2250 }, { "epoch": 1.643835616438356, "grad_norm": 0.10071003437042236, "learning_rate": 3.356164383561644e-05, "loss": 0.0002, "step": 2280 }, { "epoch": 1.665465032444124, "grad_norm": 0.0025140675716102123, "learning_rate": 3.3345349675558765e-05, "loss": 0.0001, "step": 2310 }, { "epoch": 1.687094448449892, "grad_norm": 0.0024733401369303465, "learning_rate": 3.3129055515501085e-05, "loss": 0.0001, "step": 2340 }, { "epoch": 1.7087238644556597, "grad_norm": 0.008491634391248226, "learning_rate": 3.2912761355443405e-05, "loss": 0.0002, "step": 2370 }, { "epoch": 1.7303532804614274, "grad_norm": 0.0007006598170846701, "learning_rate": 3.2696467195385724e-05, "loss": 0.0062, "step": 2400 }, { "epoch": 1.7519826964671954, "grad_norm": 0.007970300503075123, "learning_rate": 3.248017303532805e-05, "loss": 0.0003, "step": 2430 }, { "epoch": 1.7736121124729634, "grad_norm": 0.006255310960114002, "learning_rate": 3.226387887527037e-05, "loss": 0.0003, "step": 2460 }, { "epoch": 1.795241528478731, "grad_norm": 0.006729640997946262, "learning_rate": 3.204758471521269e-05, "loss": 0.0001, "step": 2490 }, { "epoch": 1.8168709444844988, "grad_norm": 0.002449074760079384, "learning_rate": 3.1831290555155017e-05, "loss": 0.0004, "step": 2520 }, { "epoch": 1.8385003604902668, "grad_norm": 0.005135863088071346, "learning_rate": 3.1614996395097336e-05, "loss": 0.0002, "step": 2550 }, { "epoch": 1.8601297764960347, "grad_norm": 0.006429166533052921, "learning_rate": 3.1398702235039656e-05, "loss": 0.0001, "step": 2580 }, { "epoch": 1.8817591925018025, "grad_norm": 0.1541679948568344, "learning_rate": 3.1182408074981976e-05, "loss": 0.0026, "step": 2610 }, { "epoch": 1.9033886085075702, "grad_norm": 0.005381646100431681, "learning_rate": 3.09661139149243e-05, "loss": 0.0007, "step": 2640 }, { "epoch": 1.9250180245133381, "grad_norm": 0.003323981538414955, "learning_rate": 3.074981975486662e-05, "loss": 0.0006, "step": 2670 }, { "epoch": 1.946647440519106, "grad_norm": 0.0015521268360316753, "learning_rate": 3.053352559480894e-05, "loss": 0.0086, "step": 2700 }, { "epoch": 1.9682768565248738, "grad_norm": 0.005004936829209328, "learning_rate": 3.0317231434751264e-05, "loss": 0.0001, "step": 2730 }, { "epoch": 1.9899062725306416, "grad_norm": 0.0032932290341705084, "learning_rate": 3.0100937274693587e-05, "loss": 0.0039, "step": 2760 }, { "epoch": 2.0115356885364095, "grad_norm": 0.014805459417402744, "learning_rate": 2.9884643114635907e-05, "loss": 0.0001, "step": 2790 }, { "epoch": 2.0331651045421775, "grad_norm": 0.15489494800567627, "learning_rate": 2.966834895457823e-05, "loss": 0.0006, "step": 2820 }, { "epoch": 2.0547945205479454, "grad_norm": 0.0016624957788735628, "learning_rate": 2.945205479452055e-05, "loss": 0.0051, "step": 2850 }, { "epoch": 2.076423936553713, "grad_norm": 0.0007174916681833565, "learning_rate": 2.9235760634462873e-05, "loss": 0.0, "step": 2880 }, { "epoch": 2.098053352559481, "grad_norm": 0.012850388884544373, "learning_rate": 2.9019466474405192e-05, "loss": 0.0007, "step": 2910 }, { "epoch": 2.119682768565249, "grad_norm": 0.0006323842681013048, "learning_rate": 2.8803172314347516e-05, "loss": 0.0002, "step": 2940 }, { "epoch": 2.1413121845710164, "grad_norm": 0.0014068299205973744, "learning_rate": 2.8586878154289835e-05, "loss": 0.0001, "step": 2970 }, { "epoch": 2.1629416005767843, "grad_norm": 0.01084825862199068, "learning_rate": 2.8370583994232158e-05, "loss": 0.0001, "step": 3000 }, { "epoch": 2.1845710165825523, "grad_norm": 0.0006276288186199963, "learning_rate": 2.815428983417448e-05, "loss": 0.0, "step": 3030 }, { "epoch": 2.20620043258832, "grad_norm": 0.002132813213393092, "learning_rate": 2.79379956741168e-05, "loss": 0.0001, "step": 3060 }, { "epoch": 2.227829848594088, "grad_norm": 6.484696388244629, "learning_rate": 2.7721701514059124e-05, "loss": 0.0003, "step": 3090 }, { "epoch": 2.2494592645998557, "grad_norm": 0.0018794754287227988, "learning_rate": 2.7505407354001444e-05, "loss": 0.0001, "step": 3120 }, { "epoch": 2.2710886806056236, "grad_norm": 0.006788654252886772, "learning_rate": 2.7289113193943767e-05, "loss": 0.0, "step": 3150 }, { "epoch": 2.2927180966113916, "grad_norm": 0.002043538261204958, "learning_rate": 2.7072819033886086e-05, "loss": 0.0, "step": 3180 }, { "epoch": 2.314347512617159, "grad_norm": 0.0010403773048892617, "learning_rate": 2.685652487382841e-05, "loss": 0.0, "step": 3210 }, { "epoch": 2.335976928622927, "grad_norm": 0.009223374538123608, "learning_rate": 2.664023071377073e-05, "loss": 0.0, "step": 3240 }, { "epoch": 2.357606344628695, "grad_norm": 0.030018163844943047, "learning_rate": 2.6423936553713052e-05, "loss": 0.0, "step": 3270 }, { "epoch": 2.379235760634463, "grad_norm": 0.11071939766407013, "learning_rate": 2.6207642393655375e-05, "loss": 0.0, "step": 3300 }, { "epoch": 2.400865176640231, "grad_norm": 0.00016689977201167494, "learning_rate": 2.5991348233597695e-05, "loss": 0.0, "step": 3330 }, { "epoch": 2.4224945926459984, "grad_norm": 0.00032225140603259206, "learning_rate": 2.5775054073540018e-05, "loss": 0.0, "step": 3360 }, { "epoch": 2.4441240086517664, "grad_norm": 0.0011028953595086932, "learning_rate": 2.5558759913482338e-05, "loss": 0.0001, "step": 3390 }, { "epoch": 2.4657534246575343, "grad_norm": 0.00032993065542541444, "learning_rate": 2.534246575342466e-05, "loss": 0.0, "step": 3420 }, { "epoch": 2.487382840663302, "grad_norm": 0.00264033698476851, "learning_rate": 2.512617159336698e-05, "loss": 0.0, "step": 3450 }, { "epoch": 2.50901225666907, "grad_norm": 0.0002550692006479949, "learning_rate": 2.49098774333093e-05, "loss": 0.0001, "step": 3480 }, { "epoch": 2.5306416726748377, "grad_norm": 0.007194190286099911, "learning_rate": 2.4693583273251623e-05, "loss": 0.0, "step": 3510 }, { "epoch": 2.5522710886806057, "grad_norm": 0.001481028157286346, "learning_rate": 2.4477289113193943e-05, "loss": 0.0002, "step": 3540 }, { "epoch": 2.5739005046863737, "grad_norm": 0.014366182498633862, "learning_rate": 2.4260994953136266e-05, "loss": 0.0001, "step": 3570 }, { "epoch": 2.595529920692141, "grad_norm": 6.567173113580793e-05, "learning_rate": 2.4044700793078585e-05, "loss": 0.0001, "step": 3600 }, { "epoch": 2.617159336697909, "grad_norm": 9.722941467771307e-05, "learning_rate": 2.382840663302091e-05, "loss": 0.0, "step": 3630 }, { "epoch": 2.638788752703677, "grad_norm": 1.6977719496935606e-05, "learning_rate": 2.361211247296323e-05, "loss": 0.0, "step": 3660 }, { "epoch": 2.6604181687094446, "grad_norm": 0.01529549341648817, "learning_rate": 2.339581831290555e-05, "loss": 0.0, "step": 3690 }, { "epoch": 2.6820475847152125, "grad_norm": 0.0031721957493573427, "learning_rate": 2.3179524152847874e-05, "loss": 0.0003, "step": 3720 }, { "epoch": 2.7036770007209805, "grad_norm": 0.006362732034176588, "learning_rate": 2.2963229992790194e-05, "loss": 0.0001, "step": 3750 }, { "epoch": 2.7253064167267484, "grad_norm": 0.002783670322969556, "learning_rate": 2.2746935832732517e-05, "loss": 0.0, "step": 3780 }, { "epoch": 2.7469358327325164, "grad_norm": 0.00039837489021010697, "learning_rate": 2.2530641672674837e-05, "loss": 0.0, "step": 3810 }, { "epoch": 2.768565248738284, "grad_norm": 0.0010957849444821477, "learning_rate": 2.231434751261716e-05, "loss": 0.0, "step": 3840 }, { "epoch": 2.790194664744052, "grad_norm": 0.02844543755054474, "learning_rate": 2.209805335255948e-05, "loss": 0.0001, "step": 3870 }, { "epoch": 2.81182408074982, "grad_norm": 6.718010263284668e-05, "learning_rate": 2.1881759192501802e-05, "loss": 0.0, "step": 3900 }, { "epoch": 2.8334534967555873, "grad_norm": 0.006276572123169899, "learning_rate": 2.1665465032444125e-05, "loss": 0.0004, "step": 3930 }, { "epoch": 2.8550829127613553, "grad_norm": 0.0002856945211533457, "learning_rate": 2.1449170872386445e-05, "loss": 0.0002, "step": 3960 }, { "epoch": 2.8767123287671232, "grad_norm": 0.0002423626574454829, "learning_rate": 2.1232876712328768e-05, "loss": 0.0001, "step": 3990 }, { "epoch": 2.898341744772891, "grad_norm": 0.007484122645109892, "learning_rate": 2.1016582552271088e-05, "loss": 0.0, "step": 4020 }, { "epoch": 2.919971160778659, "grad_norm": 0.0007640399853698909, "learning_rate": 2.080028839221341e-05, "loss": 0.0, "step": 4050 }, { "epoch": 2.9416005767844267, "grad_norm": 0.00018265791004523635, "learning_rate": 2.058399423215573e-05, "loss": 0.0, "step": 4080 }, { "epoch": 2.9632299927901946, "grad_norm": 0.0007543243118561804, "learning_rate": 2.0367700072098053e-05, "loss": 0.0, "step": 4110 }, { "epoch": 2.9848594087959626, "grad_norm": 0.00029646529583260417, "learning_rate": 2.0151405912040373e-05, "loss": 0.0, "step": 4140 }, { "epoch": 3.0064888248017305, "grad_norm": 0.00011471308243926615, "learning_rate": 1.9935111751982696e-05, "loss": 0.0, "step": 4170 }, { "epoch": 3.028118240807498, "grad_norm": 0.008354193530976772, "learning_rate": 1.971881759192502e-05, "loss": 0.0, "step": 4200 }, { "epoch": 3.049747656813266, "grad_norm": 0.9368739724159241, "learning_rate": 1.950252343186734e-05, "loss": 0.0001, "step": 4230 }, { "epoch": 3.071377072819034, "grad_norm": 0.006005523260682821, "learning_rate": 1.9286229271809662e-05, "loss": 0.0, "step": 4260 }, { "epoch": 3.093006488824802, "grad_norm": 0.006732122041285038, "learning_rate": 1.906993511175198e-05, "loss": 0.0001, "step": 4290 }, { "epoch": 3.1146359048305694, "grad_norm": 0.00011062090197810903, "learning_rate": 1.8853640951694305e-05, "loss": 0.0001, "step": 4320 }, { "epoch": 3.1362653208363374, "grad_norm": 0.0026187694165855646, "learning_rate": 1.8637346791636624e-05, "loss": 0.0, "step": 4350 }, { "epoch": 3.1578947368421053, "grad_norm": 0.0003347834281157702, "learning_rate": 1.8421052631578947e-05, "loss": 0.0, "step": 4380 }, { "epoch": 3.1795241528478733, "grad_norm": 0.0027828714810311794, "learning_rate": 1.820475847152127e-05, "loss": 0.0, "step": 4410 }, { "epoch": 3.2011535688536408, "grad_norm": 0.01742876134812832, "learning_rate": 1.798846431146359e-05, "loss": 0.0001, "step": 4440 }, { "epoch": 3.2227829848594087, "grad_norm": 0.001497482880949974, "learning_rate": 1.7772170151405913e-05, "loss": 0.0, "step": 4470 }, { "epoch": 3.2444124008651767, "grad_norm": 0.0012416379759088159, "learning_rate": 1.7555875991348233e-05, "loss": 0.0, "step": 4500 }, { "epoch": 3.2660418168709446, "grad_norm": 0.004536696709692478, "learning_rate": 1.7339581831290556e-05, "loss": 0.0001, "step": 4530 }, { "epoch": 3.287671232876712, "grad_norm": 0.000561400200240314, "learning_rate": 1.7123287671232875e-05, "loss": 0.0, "step": 4560 }, { "epoch": 3.30930064888248, "grad_norm": 0.002419061027467251, "learning_rate": 1.69069935111752e-05, "loss": 0.0, "step": 4590 }, { "epoch": 3.330930064888248, "grad_norm": 0.012188425287604332, "learning_rate": 1.6690699351117518e-05, "loss": 0.0, "step": 4620 }, { "epoch": 3.352559480894016, "grad_norm": 0.0004496476030908525, "learning_rate": 1.647440519105984e-05, "loss": 0.0004, "step": 4650 }, { "epoch": 3.3741888968997835, "grad_norm": 0.00017531379126012325, "learning_rate": 1.6258111031002164e-05, "loss": 0.0, "step": 4680 }, { "epoch": 3.3958183129055515, "grad_norm": 0.00026417901972308755, "learning_rate": 1.6041816870944484e-05, "loss": 0.0, "step": 4710 }, { "epoch": 3.4174477289113194, "grad_norm": 0.004818809684365988, "learning_rate": 1.5825522710886807e-05, "loss": 0.0, "step": 4740 }, { "epoch": 3.4390771449170874, "grad_norm": 0.001972823403775692, "learning_rate": 1.5609228550829127e-05, "loss": 0.0, "step": 4770 }, { "epoch": 3.460706560922855, "grad_norm": 0.0018862077267840505, "learning_rate": 1.539293439077145e-05, "loss": 0.0, "step": 4800 }, { "epoch": 3.482335976928623, "grad_norm": 0.013610747642815113, "learning_rate": 1.5176640230713771e-05, "loss": 0.0001, "step": 4830 }, { "epoch": 3.503965392934391, "grad_norm": 0.00016503581719007343, "learning_rate": 1.4960346070656092e-05, "loss": 0.0, "step": 4860 }, { "epoch": 3.5255948089401588, "grad_norm": 0.0001457014586776495, "learning_rate": 1.4744051910598414e-05, "loss": 0.0, "step": 4890 }, { "epoch": 3.5472242249459267, "grad_norm": 0.006511267740279436, "learning_rate": 1.4527757750540735e-05, "loss": 0.0, "step": 4920 }, { "epoch": 3.568853640951694, "grad_norm": 0.0005859262309968472, "learning_rate": 1.4311463590483056e-05, "loss": 0.0002, "step": 4950 }, { "epoch": 3.590483056957462, "grad_norm": 0.002227773889899254, "learning_rate": 1.4095169430425378e-05, "loss": 0.0, "step": 4980 }, { "epoch": 3.61211247296323, "grad_norm": 0.0008713772404007614, "learning_rate": 1.38788752703677e-05, "loss": 0.0, "step": 5010 }, { "epoch": 3.6337418889689976, "grad_norm": 0.0006510260864160955, "learning_rate": 1.3662581110310022e-05, "loss": 0.0, "step": 5040 }, { "epoch": 3.6553713049747656, "grad_norm": 0.004389900714159012, "learning_rate": 1.3446286950252344e-05, "loss": 0.0, "step": 5070 }, { "epoch": 3.6770007209805335, "grad_norm": 0.0011542979627847672, "learning_rate": 1.3229992790194665e-05, "loss": 0.0, "step": 5100 }, { "epoch": 3.6986301369863015, "grad_norm": 0.014204882085323334, "learning_rate": 1.3013698630136986e-05, "loss": 0.0, "step": 5130 }, { "epoch": 3.7202595529920695, "grad_norm": 0.0007670574705116451, "learning_rate": 1.2797404470079308e-05, "loss": 0.0, "step": 5160 }, { "epoch": 3.741888968997837, "grad_norm": 0.0014096363447606564, "learning_rate": 1.2581110310021629e-05, "loss": 0.0001, "step": 5190 }, { "epoch": 3.763518385003605, "grad_norm": 6.187328835949302e-05, "learning_rate": 1.236481614996395e-05, "loss": 0.0, "step": 5220 }, { "epoch": 3.785147801009373, "grad_norm": 0.0003874276007991284, "learning_rate": 1.2148521989906272e-05, "loss": 0.0002, "step": 5250 }, { "epoch": 3.8067772170151404, "grad_norm": 0.00010346725321142003, "learning_rate": 1.1932227829848595e-05, "loss": 0.0, "step": 5280 }, { "epoch": 3.8284066330209083, "grad_norm": 0.020370708778500557, "learning_rate": 1.1715933669790916e-05, "loss": 0.0, "step": 5310 }, { "epoch": 3.8500360490266763, "grad_norm": 0.000221878188312985, "learning_rate": 1.1499639509733238e-05, "loss": 0.0, "step": 5340 }, { "epoch": 3.8716654650324442, "grad_norm": 0.004732194356620312, "learning_rate": 1.1283345349675559e-05, "loss": 0.0, "step": 5370 }, { "epoch": 3.893294881038212, "grad_norm": 0.011568304151296616, "learning_rate": 1.106705118961788e-05, "loss": 0.0023, "step": 5400 }, { "epoch": 3.9149242970439797, "grad_norm": 0.0011147951008751988, "learning_rate": 1.0850757029560202e-05, "loss": 0.0, "step": 5430 }, { "epoch": 3.9365537130497477, "grad_norm": 0.00035450098221190274, "learning_rate": 1.0634462869502523e-05, "loss": 0.0, "step": 5460 }, { "epoch": 3.9581831290555156, "grad_norm": 0.0011028106091544032, "learning_rate": 1.0418168709444844e-05, "loss": 0.0012, "step": 5490 }, { "epoch": 3.979812545061283, "grad_norm": 0.00045547273475676775, "learning_rate": 1.0201874549387166e-05, "loss": 0.0, "step": 5520 }, { "epoch": 4.0014419610670515, "grad_norm": 7.938985800137743e-05, "learning_rate": 9.985580389329489e-06, "loss": 0.0, "step": 5550 }, { "epoch": 4.023071377072819, "grad_norm": 0.003888361854478717, "learning_rate": 9.76928622927181e-06, "loss": 0.0, "step": 5580 }, { "epoch": 4.0447007930785865, "grad_norm": 0.00029194954549893737, "learning_rate": 9.552992069214131e-06, "loss": 0.0, "step": 5610 }, { "epoch": 4.066330209084355, "grad_norm": 0.000853860517963767, "learning_rate": 9.336697909156453e-06, "loss": 0.0, "step": 5640 }, { "epoch": 4.0879596250901225, "grad_norm": 0.0010367140639573336, "learning_rate": 9.120403749098774e-06, "loss": 0.0, "step": 5670 }, { "epoch": 4.109589041095891, "grad_norm": 0.00010384136840002611, "learning_rate": 8.904109589041095e-06, "loss": 0.0, "step": 5700 }, { "epoch": 4.131218457101658, "grad_norm": 0.0006316221551969647, "learning_rate": 8.687815428983417e-06, "loss": 0.0, "step": 5730 }, { "epoch": 4.152847873107426, "grad_norm": 0.0005390524747781456, "learning_rate": 8.471521268925738e-06, "loss": 0.0, "step": 5760 }, { "epoch": 4.174477289113194, "grad_norm": 5.524172306060791, "learning_rate": 8.255227108868061e-06, "loss": 0.0022, "step": 5790 }, { "epoch": 4.196106705118962, "grad_norm": 0.0010216891532763839, "learning_rate": 8.038932948810383e-06, "loss": 0.0, "step": 5820 }, { "epoch": 4.217736121124729, "grad_norm": 0.0003730811004061252, "learning_rate": 7.822638788752704e-06, "loss": 0.0, "step": 5850 }, { "epoch": 4.239365537130498, "grad_norm": 0.001277286559343338, "learning_rate": 7.606344628695025e-06, "loss": 0.0, "step": 5880 }, { "epoch": 4.260994953136265, "grad_norm": 0.0025212133768945932, "learning_rate": 7.390050468637347e-06, "loss": 0.0, "step": 5910 }, { "epoch": 4.282624369142033, "grad_norm": 0.004302350804209709, "learning_rate": 7.173756308579668e-06, "loss": 0.0, "step": 5940 }, { "epoch": 4.304253785147801, "grad_norm": 0.0012293955078348517, "learning_rate": 6.95746214852199e-06, "loss": 0.0, "step": 5970 }, { "epoch": 4.325883201153569, "grad_norm": 0.0006065890775062144, "learning_rate": 6.7411679884643116e-06, "loss": 0.0, "step": 6000 }, { "epoch": 4.347512617159337, "grad_norm": 0.0015407137107104063, "learning_rate": 6.524873828406633e-06, "loss": 0.0, "step": 6030 }, { "epoch": 4.3691420331651045, "grad_norm": 0.0002221543254563585, "learning_rate": 6.308579668348954e-06, "loss": 0.0, "step": 6060 }, { "epoch": 4.390771449170872, "grad_norm": 0.007599177770316601, "learning_rate": 6.0922855082912765e-06, "loss": 0.0, "step": 6090 }, { "epoch": 4.41240086517664, "grad_norm": 0.003168993629515171, "learning_rate": 5.875991348233598e-06, "loss": 0.0, "step": 6120 }, { "epoch": 4.434030281182408, "grad_norm": 0.0012669307179749012, "learning_rate": 5.659697188175919e-06, "loss": 0.0, "step": 6150 }, { "epoch": 4.455659697188176, "grad_norm": 0.0024521639570593834, "learning_rate": 5.4434030281182405e-06, "loss": 0.0, "step": 6180 }, { "epoch": 4.477289113193944, "grad_norm": 0.03333938494324684, "learning_rate": 5.227108868060563e-06, "loss": 0.005, "step": 6210 }, { "epoch": 4.498918529199711, "grad_norm": 0.008298776112496853, "learning_rate": 5.010814708002884e-06, "loss": 0.0, "step": 6240 }, { "epoch": 4.52054794520548, "grad_norm": 0.0008691897382959723, "learning_rate": 4.7945205479452054e-06, "loss": 0.0, "step": 6270 }, { "epoch": 4.542177361211247, "grad_norm": 0.011668123304843903, "learning_rate": 4.578226387887527e-06, "loss": 0.0, "step": 6300 }, { "epoch": 4.563806777217015, "grad_norm": 0.0002202845789724961, "learning_rate": 4.361932227829848e-06, "loss": 0.0033, "step": 6330 }, { "epoch": 4.585436193222783, "grad_norm": 0.00031415498233400285, "learning_rate": 4.14563806777217e-06, "loss": 0.0, "step": 6360 }, { "epoch": 4.607065609228551, "grad_norm": 0.00010377545549999923, "learning_rate": 3.929343907714492e-06, "loss": 0.0, "step": 6390 }, { "epoch": 4.628695025234318, "grad_norm": 0.0029919766820967197, "learning_rate": 3.713049747656813e-06, "loss": 0.0, "step": 6420 }, { "epoch": 4.650324441240087, "grad_norm": 0.000775750435423106, "learning_rate": 3.496755587599135e-06, "loss": 0.0, "step": 6450 }, { "epoch": 4.671953857245854, "grad_norm": 0.0015133414417505264, "learning_rate": 3.280461427541456e-06, "loss": 0.0, "step": 6480 }, { "epoch": 4.6935832732516225, "grad_norm": 0.0023428713902831078, "learning_rate": 3.064167267483778e-06, "loss": 0.0, "step": 6510 }, { "epoch": 4.71521268925739, "grad_norm": 0.002895868383347988, "learning_rate": 2.8478731074260993e-06, "loss": 0.0, "step": 6540 }, { "epoch": 4.7368421052631575, "grad_norm": 0.00880496297031641, "learning_rate": 2.631578947368421e-06, "loss": 0.0, "step": 6570 }, { "epoch": 4.758471521268926, "grad_norm": 0.0009111640974879265, "learning_rate": 2.4152847873107425e-06, "loss": 0.0001, "step": 6600 }, { "epoch": 4.780100937274693, "grad_norm": 0.0007791388779878616, "learning_rate": 2.1989906272530642e-06, "loss": 0.0, "step": 6630 }, { "epoch": 4.801730353280462, "grad_norm": 0.00030644936487078667, "learning_rate": 1.9826964671953856e-06, "loss": 0.0001, "step": 6660 }, { "epoch": 4.823359769286229, "grad_norm": 0.00020950015459675342, "learning_rate": 1.7664023071377072e-06, "loss": 0.0004, "step": 6690 }, { "epoch": 4.844989185291997, "grad_norm": 0.0002440083189867437, "learning_rate": 1.5501081470800287e-06, "loss": 0.0, "step": 6720 }, { "epoch": 4.866618601297765, "grad_norm": 0.0005655589047819376, "learning_rate": 1.3338139870223503e-06, "loss": 0.0, "step": 6750 }, { "epoch": 4.888248017303533, "grad_norm": 0.0002683950588107109, "learning_rate": 1.1175198269646719e-06, "loss": 0.0, "step": 6780 }, { "epoch": 4.9098774333093, "grad_norm": 0.0026675432454794645, "learning_rate": 9.012256669069935e-07, "loss": 0.0, "step": 6810 }, { "epoch": 4.931506849315069, "grad_norm": 0.0039031701162457466, "learning_rate": 6.849315068493151e-07, "loss": 0.0, "step": 6840 }, { "epoch": 4.953136265320836, "grad_norm": 0.056871198117733, "learning_rate": 4.686373467916366e-07, "loss": 0.0, "step": 6870 }, { "epoch": 4.974765681326604, "grad_norm": 0.00018152125994674861, "learning_rate": 2.523431867339582e-07, "loss": 0.0008, "step": 6900 }, { "epoch": 4.996395097332372, "grad_norm": 0.0004676603712141514, "learning_rate": 3.604902667627974e-08, "loss": 0.0006, "step": 6930 } ], "logging_steps": 30, "max_steps": 6935, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }