LLaMA-MoE-v2-3_8B-2_8-sft / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9293516810895164,
"eval_steps": 500,
"global_step": 6800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005674563767910342,
"grad_norm": 1.8945719003677368,
"learning_rate": 2.830188679245283e-06,
"loss": 0.9878,
"step": 20
},
{
"epoch": 0.011349127535820683,
"grad_norm": 0.8699278235435486,
"learning_rate": 5.660377358490566e-06,
"loss": 0.9338,
"step": 40
},
{
"epoch": 0.017023691303731027,
"grad_norm": 0.9612842798233032,
"learning_rate": 8.49056603773585e-06,
"loss": 0.8992,
"step": 60
},
{
"epoch": 0.022698255071641367,
"grad_norm": 1.0209581851959229,
"learning_rate": 1.1320754716981132e-05,
"loss": 0.8802,
"step": 80
},
{
"epoch": 0.02837281883955171,
"grad_norm": 1.1397087574005127,
"learning_rate": 1.4150943396226415e-05,
"loss": 0.8636,
"step": 100
},
{
"epoch": 0.034047382607462054,
"grad_norm": 1.0688011646270752,
"learning_rate": 1.69811320754717e-05,
"loss": 0.8589,
"step": 120
},
{
"epoch": 0.039721946375372394,
"grad_norm": 1.0701323747634888,
"learning_rate": 1.981132075471698e-05,
"loss": 0.8445,
"step": 140
},
{
"epoch": 0.045396510143282734,
"grad_norm": 1.0749995708465576,
"learning_rate": 2.2641509433962265e-05,
"loss": 0.8438,
"step": 160
},
{
"epoch": 0.051071073911193074,
"grad_norm": 1.2973322868347168,
"learning_rate": 2.547169811320755e-05,
"loss": 0.8399,
"step": 180
},
{
"epoch": 0.05674563767910342,
"grad_norm": 0.9941120743751526,
"learning_rate": 2.830188679245283e-05,
"loss": 0.8459,
"step": 200
},
{
"epoch": 0.06242020144701376,
"grad_norm": 1.1092499494552612,
"learning_rate": 2.9999898623711896e-05,
"loss": 0.8396,
"step": 220
},
{
"epoch": 0.06809476521492411,
"grad_norm": 1.10667085647583,
"learning_rate": 2.999875815620755e-05,
"loss": 0.8403,
"step": 240
},
{
"epoch": 0.07376932898283445,
"grad_norm": 1.0986227989196777,
"learning_rate": 2.999635059750628e-05,
"loss": 0.8296,
"step": 260
},
{
"epoch": 0.07944389275074479,
"grad_norm": 0.9648028612136841,
"learning_rate": 2.9992676150998032e-05,
"loss": 0.8187,
"step": 280
},
{
"epoch": 0.08511845651865513,
"grad_norm": 0.8029258251190186,
"learning_rate": 2.998773512709909e-05,
"loss": 0.8224,
"step": 300
},
{
"epoch": 0.09079302028656547,
"grad_norm": 0.888502299785614,
"learning_rate": 2.9981527943225862e-05,
"loss": 0.8178,
"step": 320
},
{
"epoch": 0.09646758405447581,
"grad_norm": 0.7894881963729858,
"learning_rate": 2.997405512375964e-05,
"loss": 0.8153,
"step": 340
},
{
"epoch": 0.10214214782238615,
"grad_norm": 0.8492247462272644,
"learning_rate": 2.996531730000227e-05,
"loss": 0.8105,
"step": 360
},
{
"epoch": 0.1078167115902965,
"grad_norm": 0.8247759938240051,
"learning_rate": 2.9955315210122842e-05,
"loss": 0.8,
"step": 380
},
{
"epoch": 0.11349127535820684,
"grad_norm": 0.8270812034606934,
"learning_rate": 2.99440496990953e-05,
"loss": 0.802,
"step": 400
},
{
"epoch": 0.11916583912611718,
"grad_norm": 0.8336136937141418,
"learning_rate": 2.9931521718627107e-05,
"loss": 0.7932,
"step": 420
},
{
"epoch": 0.12484040289402752,
"grad_norm": 0.7927630543708801,
"learning_rate": 2.991773232707879e-05,
"loss": 0.7903,
"step": 440
},
{
"epoch": 0.13051496666193788,
"grad_norm": 0.8075955510139465,
"learning_rate": 2.9902682689374578e-05,
"loss": 0.7897,
"step": 460
},
{
"epoch": 0.13618953042984822,
"grad_norm": 0.7381598353385925,
"learning_rate": 2.9886374076903945e-05,
"loss": 0.785,
"step": 480
},
{
"epoch": 0.14186409419775856,
"grad_norm": 0.799022912979126,
"learning_rate": 2.986880786741426e-05,
"loss": 0.7862,
"step": 500
},
{
"epoch": 0.1475386579656689,
"grad_norm": 0.7515665292739868,
"learning_rate": 2.9849985544894333e-05,
"loss": 0.7845,
"step": 520
},
{
"epoch": 0.15321322173357924,
"grad_norm": 0.8161646723747253,
"learning_rate": 2.982990869944908e-05,
"loss": 0.7745,
"step": 540
},
{
"epoch": 0.15888778550148958,
"grad_norm": 0.671816885471344,
"learning_rate": 2.9808579027165204e-05,
"loss": 0.7786,
"step": 560
},
{
"epoch": 0.16456234926939992,
"grad_norm": 0.7310769557952881,
"learning_rate": 2.978599832996788e-05,
"loss": 0.7742,
"step": 580
},
{
"epoch": 0.17023691303731026,
"grad_norm": 0.7568747401237488,
"learning_rate": 2.9762168515468548e-05,
"loss": 0.7691,
"step": 600
},
{
"epoch": 0.1759114768052206,
"grad_norm": 0.6345218420028687,
"learning_rate": 2.973709159680375e-05,
"loss": 0.7695,
"step": 620
},
{
"epoch": 0.18158604057313094,
"grad_norm": 0.7218050360679626,
"learning_rate": 2.9710769692465073e-05,
"loss": 0.7681,
"step": 640
},
{
"epoch": 0.18726060434104128,
"grad_norm": 0.7665095925331116,
"learning_rate": 2.9683205026120163e-05,
"loss": 0.7667,
"step": 660
},
{
"epoch": 0.19293516810895162,
"grad_norm": 0.6717973947525024,
"learning_rate": 2.9654399926424884e-05,
"loss": 0.7684,
"step": 680
},
{
"epoch": 0.19860973187686196,
"grad_norm": 0.7454754114151001,
"learning_rate": 2.9624356826826577e-05,
"loss": 0.7622,
"step": 700
},
{
"epoch": 0.2042842956447723,
"grad_norm": 0.6865426898002625,
"learning_rate": 2.9593078265358498e-05,
"loss": 0.761,
"step": 720
},
{
"epoch": 0.20995885941268266,
"grad_norm": 0.7075285315513611,
"learning_rate": 2.956056688442541e-05,
"loss": 0.7578,
"step": 740
},
{
"epoch": 0.215633423180593,
"grad_norm": 0.7438149452209473,
"learning_rate": 2.9526825430580337e-05,
"loss": 0.7571,
"step": 760
},
{
"epoch": 0.22130798694850334,
"grad_norm": 0.6830400228500366,
"learning_rate": 2.949185675429254e-05,
"loss": 0.759,
"step": 780
},
{
"epoch": 0.22698255071641368,
"grad_norm": 0.7147162556648254,
"learning_rate": 2.9455663809706725e-05,
"loss": 0.756,
"step": 800
},
{
"epoch": 0.23265711448432402,
"grad_norm": 0.7116013765335083,
"learning_rate": 2.9418249654393443e-05,
"loss": 0.7538,
"step": 820
},
{
"epoch": 0.23833167825223436,
"grad_norm": 0.64736407995224,
"learning_rate": 2.9379617449090847e-05,
"loss": 0.7513,
"step": 840
},
{
"epoch": 0.2440062420201447,
"grad_norm": 0.6453843116760254,
"learning_rate": 2.93397704574376e-05,
"loss": 0.7538,
"step": 860
},
{
"epoch": 0.24968080578805504,
"grad_norm": 0.6253499388694763,
"learning_rate": 2.929871204569722e-05,
"loss": 0.7463,
"step": 880
},
{
"epoch": 0.2553553695559654,
"grad_norm": 0.6677010655403137,
"learning_rate": 2.9256445682473683e-05,
"loss": 0.7419,
"step": 900
},
{
"epoch": 0.26102993332387575,
"grad_norm": 0.7070403695106506,
"learning_rate": 2.9212974938418385e-05,
"loss": 0.7449,
"step": 920
},
{
"epoch": 0.26670449709178606,
"grad_norm": 0.6784743070602417,
"learning_rate": 2.9168303485928495e-05,
"loss": 0.7453,
"step": 940
},
{
"epoch": 0.27237906085969643,
"grad_norm": 0.6076740026473999,
"learning_rate": 2.912243509883673e-05,
"loss": 0.7457,
"step": 960
},
{
"epoch": 0.27805362462760674,
"grad_norm": 0.6722409129142761,
"learning_rate": 2.9075373652092535e-05,
"loss": 0.7373,
"step": 980
},
{
"epoch": 0.2837281883955171,
"grad_norm": 0.7188818454742432,
"learning_rate": 2.9027123121434714e-05,
"loss": 0.7343,
"step": 1000
},
{
"epoch": 0.2894027521634274,
"grad_norm": 0.657289981842041,
"learning_rate": 2.897768758305558e-05,
"loss": 0.7336,
"step": 1020
},
{
"epoch": 0.2950773159313378,
"grad_norm": 0.6076385378837585,
"learning_rate": 2.892707121325658e-05,
"loss": 0.7331,
"step": 1040
},
{
"epoch": 0.3007518796992481,
"grad_norm": 0.6217896342277527,
"learning_rate": 2.8875278288095507e-05,
"loss": 0.7339,
"step": 1060
},
{
"epoch": 0.30642644346715847,
"grad_norm": 0.6453694701194763,
"learning_rate": 2.882231318302523e-05,
"loss": 0.7334,
"step": 1080
},
{
"epoch": 0.3121010072350688,
"grad_norm": 0.6069263219833374,
"learning_rate": 2.8768180372524093e-05,
"loss": 0.734,
"step": 1100
},
{
"epoch": 0.31777557100297915,
"grad_norm": 0.6342785358428955,
"learning_rate": 2.8712884429717873e-05,
"loss": 0.7254,
"step": 1120
},
{
"epoch": 0.32345013477088946,
"grad_norm": 0.5936433672904968,
"learning_rate": 2.8656430025993464e-05,
"loss": 0.7232,
"step": 1140
},
{
"epoch": 0.32912469853879983,
"grad_norm": 0.5988269448280334,
"learning_rate": 2.8598821930604252e-05,
"loss": 0.726,
"step": 1160
},
{
"epoch": 0.3347992623067102,
"grad_norm": 0.6247944235801697,
"learning_rate": 2.8540065010267183e-05,
"loss": 0.729,
"step": 1180
},
{
"epoch": 0.3404738260746205,
"grad_norm": 0.6017037034034729,
"learning_rate": 2.848016422875164e-05,
"loss": 0.7216,
"step": 1200
},
{
"epoch": 0.3461483898425309,
"grad_norm": 0.7368952631950378,
"learning_rate": 2.84191246464601e-05,
"loss": 0.7331,
"step": 1220
},
{
"epoch": 0.3518229536104412,
"grad_norm": 0.6655734777450562,
"learning_rate": 2.835695142000064e-05,
"loss": 0.7233,
"step": 1240
},
{
"epoch": 0.35749751737835156,
"grad_norm": 0.6325275301933289,
"learning_rate": 2.8293649801751288e-05,
"loss": 0.7208,
"step": 1260
},
{
"epoch": 0.36317208114626187,
"grad_norm": 0.6046157479286194,
"learning_rate": 2.822922513941634e-05,
"loss": 0.7156,
"step": 1280
},
{
"epoch": 0.36884664491417224,
"grad_norm": 0.6081031560897827,
"learning_rate": 2.816368287557454e-05,
"loss": 0.722,
"step": 1300
},
{
"epoch": 0.37452120868208255,
"grad_norm": 0.6153631806373596,
"learning_rate": 2.809702854721934e-05,
"loss": 0.7171,
"step": 1320
},
{
"epoch": 0.3801957724499929,
"grad_norm": 0.6361656188964844,
"learning_rate": 2.8029267785291092e-05,
"loss": 0.7134,
"step": 1340
},
{
"epoch": 0.38587033621790323,
"grad_norm": 0.6033869981765747,
"learning_rate": 2.796040631420139e-05,
"loss": 0.7171,
"step": 1360
},
{
"epoch": 0.3915448999858136,
"grad_norm": 0.6300106644630432,
"learning_rate": 2.789044995134944e-05,
"loss": 0.7139,
"step": 1380
},
{
"epoch": 0.3972194637537239,
"grad_norm": 0.5989068150520325,
"learning_rate": 2.781940460663062e-05,
"loss": 0.7142,
"step": 1400
},
{
"epoch": 0.4028940275216343,
"grad_norm": 0.5790150761604309,
"learning_rate": 2.774727628193721e-05,
"loss": 0.7126,
"step": 1420
},
{
"epoch": 0.4085685912895446,
"grad_norm": 0.5948804616928101,
"learning_rate": 2.7674071070651378e-05,
"loss": 0.7103,
"step": 1440
},
{
"epoch": 0.41424315505745496,
"grad_norm": 0.6838712096214294,
"learning_rate": 2.7599795157130364e-05,
"loss": 0.7169,
"step": 1460
},
{
"epoch": 0.4199177188253653,
"grad_norm": 0.6502018570899963,
"learning_rate": 2.7524454816184076e-05,
"loss": 0.7094,
"step": 1480
},
{
"epoch": 0.42559228259327564,
"grad_norm": 0.6322967410087585,
"learning_rate": 2.7448056412544956e-05,
"loss": 0.7134,
"step": 1500
},
{
"epoch": 0.431266846361186,
"grad_norm": 0.5761287212371826,
"learning_rate": 2.7370606400330334e-05,
"loss": 0.7067,
"step": 1520
},
{
"epoch": 0.4369414101290963,
"grad_norm": 0.6147580742835999,
"learning_rate": 2.729211132249713e-05,
"loss": 0.7078,
"step": 1540
},
{
"epoch": 0.4426159738970067,
"grad_norm": 0.6231666207313538,
"learning_rate": 2.7212577810289157e-05,
"loss": 0.7066,
"step": 1560
},
{
"epoch": 0.448290537664917,
"grad_norm": 0.5739862322807312,
"learning_rate": 2.713201258267689e-05,
"loss": 0.708,
"step": 1580
},
{
"epoch": 0.45396510143282737,
"grad_norm": 0.7059602737426758,
"learning_rate": 2.7050422445789843e-05,
"loss": 0.7043,
"step": 1600
},
{
"epoch": 0.4596396652007377,
"grad_norm": 0.6156895160675049,
"learning_rate": 2.696781429234162e-05,
"loss": 0.7118,
"step": 1620
},
{
"epoch": 0.46531422896864805,
"grad_norm": 0.5444714426994324,
"learning_rate": 2.6884195101047567e-05,
"loss": 0.7031,
"step": 1640
},
{
"epoch": 0.47098879273655836,
"grad_norm": 0.6431369185447693,
"learning_rate": 2.6799571936035284e-05,
"loss": 0.7056,
"step": 1660
},
{
"epoch": 0.4766633565044687,
"grad_norm": 0.6375367641448975,
"learning_rate": 2.671395194624779e-05,
"loss": 0.6991,
"step": 1680
},
{
"epoch": 0.48233792027237904,
"grad_norm": 0.6311667561531067,
"learning_rate": 2.6627342364839604e-05,
"loss": 0.6991,
"step": 1700
},
{
"epoch": 0.4880124840402894,
"grad_norm": 0.580328643321991,
"learning_rate": 2.6539750508565683e-05,
"loss": 0.7027,
"step": 1720
},
{
"epoch": 0.4936870478081997,
"grad_norm": 0.6254743933677673,
"learning_rate": 2.6451183777163316e-05,
"loss": 0.6977,
"step": 1740
},
{
"epoch": 0.4993616115761101,
"grad_norm": 0.8747753500938416,
"learning_rate": 2.636164965272699e-05,
"loss": 0.6974,
"step": 1760
},
{
"epoch": 0.5050361753440205,
"grad_norm": 0.5931680798530579,
"learning_rate": 2.6271155699076305e-05,
"loss": 0.7001,
"step": 1780
},
{
"epoch": 0.5107107391119308,
"grad_norm": 0.5763223767280579,
"learning_rate": 2.6179709561116983e-05,
"loss": 0.7023,
"step": 1800
},
{
"epoch": 0.5163853028798411,
"grad_norm": 0.5211492776870728,
"learning_rate": 2.6087318964195032e-05,
"loss": 0.6957,
"step": 1820
},
{
"epoch": 0.5220598666477515,
"grad_norm": 0.5684000253677368,
"learning_rate": 2.59939917134441e-05,
"loss": 0.6916,
"step": 1840
},
{
"epoch": 0.5277344304156618,
"grad_norm": 0.6029589176177979,
"learning_rate": 2.5899735693126113e-05,
"loss": 0.6942,
"step": 1860
},
{
"epoch": 0.5334089941835721,
"grad_norm": 0.5765926837921143,
"learning_rate": 2.5804558865965206e-05,
"loss": 0.6973,
"step": 1880
},
{
"epoch": 0.5390835579514824,
"grad_norm": 0.5227144956588745,
"learning_rate": 2.5708469272475044e-05,
"loss": 0.6929,
"step": 1900
},
{
"epoch": 0.5447581217193929,
"grad_norm": 0.6175386309623718,
"learning_rate": 2.5611475030279546e-05,
"loss": 0.6908,
"step": 1920
},
{
"epoch": 0.5504326854873032,
"grad_norm": 0.5724866986274719,
"learning_rate": 2.5513584333427125e-05,
"loss": 0.6893,
"step": 1940
},
{
"epoch": 0.5561072492552135,
"grad_norm": 0.5964395403862,
"learning_rate": 2.541480545169846e-05,
"loss": 0.6944,
"step": 1960
},
{
"epoch": 0.5617818130231238,
"grad_norm": 0.6019209027290344,
"learning_rate": 2.5315146729907827e-05,
"loss": 0.6899,
"step": 1980
},
{
"epoch": 0.5674563767910342,
"grad_norm": 0.6371375918388367,
"learning_rate": 2.521461658719819e-05,
"loss": 0.6904,
"step": 2000
},
{
"epoch": 0.5731309405589445,
"grad_norm": 0.5762882232666016,
"learning_rate": 2.5113223516329924e-05,
"loss": 0.6887,
"step": 2020
},
{
"epoch": 0.5788055043268548,
"grad_norm": 0.591663122177124,
"learning_rate": 2.501097608296334e-05,
"loss": 0.6894,
"step": 2040
},
{
"epoch": 0.5844800680947652,
"grad_norm": 0.5833630561828613,
"learning_rate": 2.4907882924935072e-05,
"loss": 0.6866,
"step": 2060
},
{
"epoch": 0.5901546318626756,
"grad_norm": 0.5615355968475342,
"learning_rate": 2.4803952751528363e-05,
"loss": 0.6927,
"step": 2080
},
{
"epoch": 0.5958291956305859,
"grad_norm": 0.5507014989852905,
"learning_rate": 2.4699194342737295e-05,
"loss": 0.6934,
"step": 2100
},
{
"epoch": 0.6015037593984962,
"grad_norm": 0.5132161974906921,
"learning_rate": 2.459361654852505e-05,
"loss": 0.688,
"step": 2120
},
{
"epoch": 0.6071783231664066,
"grad_norm": 0.5238850116729736,
"learning_rate": 2.4487228288076293e-05,
"loss": 0.6804,
"step": 2140
},
{
"epoch": 0.6128528869343169,
"grad_norm": 0.5849164724349976,
"learning_rate": 2.438003854904366e-05,
"loss": 0.6911,
"step": 2160
},
{
"epoch": 0.6185274507022273,
"grad_norm": 0.5290674567222595,
"learning_rate": 2.4272056386788485e-05,
"loss": 0.6838,
"step": 2180
},
{
"epoch": 0.6242020144701376,
"grad_norm": 0.5804121494293213,
"learning_rate": 2.4163290923615814e-05,
"loss": 0.6894,
"step": 2200
},
{
"epoch": 0.629876578238048,
"grad_norm": 0.5559779405593872,
"learning_rate": 2.4053751348003757e-05,
"loss": 0.6859,
"step": 2220
},
{
"epoch": 0.6355511420059583,
"grad_norm": 0.5486791133880615,
"learning_rate": 2.394344691382723e-05,
"loss": 0.6836,
"step": 2240
},
{
"epoch": 0.6412257057738686,
"grad_norm": 0.5544127225875854,
"learning_rate": 2.3832386939576214e-05,
"loss": 0.681,
"step": 2260
},
{
"epoch": 0.6469002695417789,
"grad_norm": 0.5256103277206421,
"learning_rate": 2.3720580807568513e-05,
"loss": 0.6823,
"step": 2280
},
{
"epoch": 0.6525748333096894,
"grad_norm": 0.5488288402557373,
"learning_rate": 2.3608037963157142e-05,
"loss": 0.6818,
"step": 2300
},
{
"epoch": 0.6582493970775997,
"grad_norm": 0.5254908204078674,
"learning_rate": 2.3494767913932393e-05,
"loss": 0.6774,
"step": 2320
},
{
"epoch": 0.66392396084551,
"grad_norm": 0.5880591869354248,
"learning_rate": 2.338078022891864e-05,
"loss": 0.6795,
"step": 2340
},
{
"epoch": 0.6695985246134204,
"grad_norm": 0.5331950783729553,
"learning_rate": 2.3266084537765924e-05,
"loss": 0.6777,
"step": 2360
},
{
"epoch": 0.6752730883813307,
"grad_norm": 0.5736955404281616,
"learning_rate": 2.3150690529936475e-05,
"loss": 0.6792,
"step": 2380
},
{
"epoch": 0.680947652149241,
"grad_norm": 0.5705032348632812,
"learning_rate": 2.303460795388613e-05,
"loss": 0.6736,
"step": 2400
},
{
"epoch": 0.6866222159171513,
"grad_norm": 0.569355845451355,
"learning_rate": 2.2917846616240784e-05,
"loss": 0.6767,
"step": 2420
},
{
"epoch": 0.6922967796850618,
"grad_norm": 1.2819143533706665,
"learning_rate": 2.2800416380967952e-05,
"loss": 0.6772,
"step": 2440
},
{
"epoch": 0.6979713434529721,
"grad_norm": 0.5238373279571533,
"learning_rate": 2.268232716854343e-05,
"loss": 0.674,
"step": 2460
},
{
"epoch": 0.7036459072208824,
"grad_norm": 0.5886688828468323,
"learning_rate": 2.2563588955113246e-05,
"loss": 0.6757,
"step": 2480
},
{
"epoch": 0.7093204709887927,
"grad_norm": 0.5450348854064941,
"learning_rate": 2.244421177165085e-05,
"loss": 0.6691,
"step": 2500
},
{
"epoch": 0.7149950347567031,
"grad_norm": 0.5553733706474304,
"learning_rate": 2.232420570310974e-05,
"loss": 0.6751,
"step": 2520
},
{
"epoch": 0.7206695985246134,
"grad_norm": 0.5076789259910583,
"learning_rate": 2.2203580887571423e-05,
"loss": 0.6739,
"step": 2540
},
{
"epoch": 0.7263441622925237,
"grad_norm": 0.5153952240943909,
"learning_rate": 2.2082347515389027e-05,
"loss": 0.6734,
"step": 2560
},
{
"epoch": 0.732018726060434,
"grad_norm": 0.5176730155944824,
"learning_rate": 2.1960515828326372e-05,
"loss": 0.6706,
"step": 2580
},
{
"epoch": 0.7376932898283445,
"grad_norm": 0.526030421257019,
"learning_rate": 2.1838096118692768e-05,
"loss": 0.6694,
"step": 2600
},
{
"epoch": 0.7433678535962548,
"grad_norm": 0.6030652523040771,
"learning_rate": 2.1715098728473518e-05,
"loss": 0.6707,
"step": 2620
},
{
"epoch": 0.7490424173641651,
"grad_norm": 0.6607082486152649,
"learning_rate": 2.1591534048456225e-05,
"loss": 0.6668,
"step": 2640
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.5300272107124329,
"learning_rate": 2.1467412517352996e-05,
"loss": 0.6696,
"step": 2660
},
{
"epoch": 0.7603915448999858,
"grad_norm": 0.5344169735908508,
"learning_rate": 2.1342744620918568e-05,
"loss": 0.6736,
"step": 2680
},
{
"epoch": 0.7660661086678962,
"grad_norm": 0.5058417916297913,
"learning_rate": 2.121754089106448e-05,
"loss": 0.6681,
"step": 2700
},
{
"epoch": 0.7717406724358065,
"grad_norm": 0.5440433621406555,
"learning_rate": 2.1091811904969344e-05,
"loss": 0.6702,
"step": 2720
},
{
"epoch": 0.7774152362037169,
"grad_norm": 0.5361486077308655,
"learning_rate": 2.096556828418528e-05,
"loss": 0.6686,
"step": 2740
},
{
"epoch": 0.7830897999716272,
"grad_norm": 0.6350403428077698,
"learning_rate": 2.0838820693740603e-05,
"loss": 0.6678,
"step": 2760
},
{
"epoch": 0.7887643637395375,
"grad_norm": 0.5326098203659058,
"learning_rate": 2.0711579841238875e-05,
"loss": 0.6711,
"step": 2780
},
{
"epoch": 0.7944389275074478,
"grad_norm": 0.540676474571228,
"learning_rate": 2.058385647595429e-05,
"loss": 0.6705,
"step": 2800
},
{
"epoch": 0.8001134912753582,
"grad_norm": 0.4930702745914459,
"learning_rate": 2.045566138792361e-05,
"loss": 0.6683,
"step": 2820
},
{
"epoch": 0.8057880550432686,
"grad_norm": 0.5729920268058777,
"learning_rate": 2.032700540703459e-05,
"loss": 0.6646,
"step": 2840
},
{
"epoch": 0.8114626188111789,
"grad_norm": 0.5179927945137024,
"learning_rate": 2.0197899402111127e-05,
"loss": 0.6632,
"step": 2860
},
{
"epoch": 0.8171371825790892,
"grad_norm": 0.5147942900657654,
"learning_rate": 2.0068354279995008e-05,
"loss": 0.6558,
"step": 2880
},
{
"epoch": 0.8228117463469996,
"grad_norm": 0.5044906735420227,
"learning_rate": 1.9938380984624533e-05,
"loss": 0.6634,
"step": 2900
},
{
"epoch": 0.8284863101149099,
"grad_norm": 0.5231923460960388,
"learning_rate": 1.9807990496109965e-05,
"loss": 0.6698,
"step": 2920
},
{
"epoch": 0.8341608738828202,
"grad_norm": 0.5322957634925842,
"learning_rate": 1.967719382980594e-05,
"loss": 0.6568,
"step": 2940
},
{
"epoch": 0.8398354376507307,
"grad_norm": 0.512269139289856,
"learning_rate": 1.9546002035380886e-05,
"loss": 0.6654,
"step": 2960
},
{
"epoch": 0.845510001418641,
"grad_norm": 0.508976399898529,
"learning_rate": 1.9414426195883558e-05,
"loss": 0.6552,
"step": 2980
},
{
"epoch": 0.8511845651865513,
"grad_norm": 0.5061299204826355,
"learning_rate": 1.9282477426806723e-05,
"loss": 0.6599,
"step": 3000
},
{
"epoch": 0.8568591289544616,
"grad_norm": 0.510822057723999,
"learning_rate": 1.9150166875148155e-05,
"loss": 0.6612,
"step": 3020
},
{
"epoch": 0.862533692722372,
"grad_norm": 0.5578708648681641,
"learning_rate": 1.9017505718468934e-05,
"loss": 0.658,
"step": 3040
},
{
"epoch": 0.8682082564902823,
"grad_norm": 0.5130868554115295,
"learning_rate": 1.888450516394914e-05,
"loss": 0.6541,
"step": 3060
},
{
"epoch": 0.8738828202581926,
"grad_norm": 0.5147811770439148,
"learning_rate": 1.8751176447441104e-05,
"loss": 0.6586,
"step": 3080
},
{
"epoch": 0.879557384026103,
"grad_norm": 0.5556140542030334,
"learning_rate": 1.861753083252021e-05,
"loss": 0.6535,
"step": 3100
},
{
"epoch": 0.8852319477940134,
"grad_norm": 0.509611964225769,
"learning_rate": 1.8483579609533318e-05,
"loss": 0.6537,
"step": 3120
},
{
"epoch": 0.8909065115619237,
"grad_norm": 0.5088684558868408,
"learning_rate": 1.834933409464499e-05,
"loss": 0.6562,
"step": 3140
},
{
"epoch": 0.896581075329834,
"grad_norm": 0.48405396938323975,
"learning_rate": 1.821480562888148e-05,
"loss": 0.6583,
"step": 3160
},
{
"epoch": 0.9022556390977443,
"grad_norm": 0.5087782144546509,
"learning_rate": 1.808000557717268e-05,
"loss": 0.6558,
"step": 3180
},
{
"epoch": 0.9079302028656547,
"grad_norm": 0.5303909778594971,
"learning_rate": 1.7944945327391957e-05,
"loss": 0.6517,
"step": 3200
},
{
"epoch": 0.913604766633565,
"grad_norm": 0.5164442658424377,
"learning_rate": 1.7809636289394185e-05,
"loss": 0.6529,
"step": 3220
},
{
"epoch": 0.9192793304014754,
"grad_norm": 0.5162308216094971,
"learning_rate": 1.7674089894051774e-05,
"loss": 0.6542,
"step": 3240
},
{
"epoch": 0.9249538941693858,
"grad_norm": 0.545396625995636,
"learning_rate": 1.753831759228903e-05,
"loss": 0.6527,
"step": 3260
},
{
"epoch": 0.9306284579372961,
"grad_norm": 0.5134595632553101,
"learning_rate": 1.740233085411477e-05,
"loss": 0.6555,
"step": 3280
},
{
"epoch": 0.9363030217052064,
"grad_norm": 0.48815637826919556,
"learning_rate": 1.7266141167653353e-05,
"loss": 0.6554,
"step": 3300
},
{
"epoch": 0.9419775854731167,
"grad_norm": 0.5034410953521729,
"learning_rate": 1.7129760038174146e-05,
"loss": 0.6514,
"step": 3320
},
{
"epoch": 0.9476521492410271,
"grad_norm": 0.5322323441505432,
"learning_rate": 1.6993198987119576e-05,
"loss": 0.6533,
"step": 3340
},
{
"epoch": 0.9533267130089375,
"grad_norm": 0.48363253474235535,
"learning_rate": 1.6856469551131805e-05,
"loss": 0.6468,
"step": 3360
},
{
"epoch": 0.9590012767768478,
"grad_norm": 0.4600164592266083,
"learning_rate": 1.67195832810781e-05,
"loss": 0.6472,
"step": 3380
},
{
"epoch": 0.9646758405447581,
"grad_norm": 0.49600768089294434,
"learning_rate": 1.6582551741075033e-05,
"loss": 0.6467,
"step": 3400
},
{
"epoch": 0.9703504043126685,
"grad_norm": 0.7202423810958862,
"learning_rate": 1.6445386507511546e-05,
"loss": 0.6502,
"step": 3420
},
{
"epoch": 0.9760249680805788,
"grad_norm": 0.502703070640564,
"learning_rate": 1.630809916807098e-05,
"loss": 0.6424,
"step": 3440
},
{
"epoch": 0.9816995318484891,
"grad_norm": 0.49266818165779114,
"learning_rate": 1.617070132075214e-05,
"loss": 0.6485,
"step": 3460
},
{
"epoch": 0.9873740956163994,
"grad_norm": 0.5194821357727051,
"learning_rate": 1.6033204572889516e-05,
"loss": 0.6499,
"step": 3480
},
{
"epoch": 0.9930486593843099,
"grad_norm": 0.49109163880348206,
"learning_rate": 1.5895620540172682e-05,
"loss": 0.6506,
"step": 3500
},
{
"epoch": 0.9987232231522202,
"grad_norm": 0.5099320411682129,
"learning_rate": 1.575796084566503e-05,
"loss": 0.6466,
"step": 3520
},
{
"epoch": 1.0043977869201306,
"grad_norm": 0.5476223230361938,
"learning_rate": 1.562023711882182e-05,
"loss": 0.5924,
"step": 3540
},
{
"epoch": 1.010072350688041,
"grad_norm": 0.4934983551502228,
"learning_rate": 1.548246099450776e-05,
"loss": 0.5683,
"step": 3560
},
{
"epoch": 1.0157469144559512,
"grad_norm": 0.5262681841850281,
"learning_rate": 1.534464411201409e-05,
"loss": 0.5733,
"step": 3580
},
{
"epoch": 1.0214214782238615,
"grad_norm": 0.5271425843238831,
"learning_rate": 1.520679811407526e-05,
"loss": 0.5697,
"step": 3600
},
{
"epoch": 1.0270960419917718,
"grad_norm": 0.5124356150627136,
"learning_rate": 1.506893464588542e-05,
"loss": 0.5653,
"step": 3620
},
{
"epoch": 1.0327706057596822,
"grad_norm": 0.5131009817123413,
"learning_rate": 1.4931065354114584e-05,
"loss": 0.5669,
"step": 3640
},
{
"epoch": 1.0384451695275925,
"grad_norm": 0.5003370046615601,
"learning_rate": 1.4793201885924745e-05,
"loss": 0.565,
"step": 3660
},
{
"epoch": 1.044119733295503,
"grad_norm": 0.5440374612808228,
"learning_rate": 1.465535588798592e-05,
"loss": 0.5708,
"step": 3680
},
{
"epoch": 1.0497942970634133,
"grad_norm": 0.5212259292602539,
"learning_rate": 1.4517539005492237e-05,
"loss": 0.57,
"step": 3700
},
{
"epoch": 1.0554688608313236,
"grad_norm": 0.5004721879959106,
"learning_rate": 1.4379762881178182e-05,
"loss": 0.5692,
"step": 3720
},
{
"epoch": 1.061143424599234,
"grad_norm": 0.5253936648368835,
"learning_rate": 1.4242039154334973e-05,
"loss": 0.5685,
"step": 3740
},
{
"epoch": 1.0668179883671443,
"grad_norm": 0.5163034200668335,
"learning_rate": 1.410437945982732e-05,
"loss": 0.5706,
"step": 3760
},
{
"epoch": 1.0724925521350546,
"grad_norm": 0.49630168080329895,
"learning_rate": 1.3966795427110493e-05,
"loss": 0.5725,
"step": 3780
},
{
"epoch": 1.0781671159029649,
"grad_norm": 0.5117852091789246,
"learning_rate": 1.3829298679247865e-05,
"loss": 0.5646,
"step": 3800
},
{
"epoch": 1.0838416796708752,
"grad_norm": 0.5082918405532837,
"learning_rate": 1.369190083192902e-05,
"loss": 0.5705,
"step": 3820
},
{
"epoch": 1.0895162434387857,
"grad_norm": 0.5319990515708923,
"learning_rate": 1.3554613492488453e-05,
"loss": 0.5684,
"step": 3840
},
{
"epoch": 1.095190807206696,
"grad_norm": 0.5344195365905762,
"learning_rate": 1.3417448258924971e-05,
"loss": 0.5658,
"step": 3860
},
{
"epoch": 1.1008653709746063,
"grad_norm": 0.507433295249939,
"learning_rate": 1.3280416718921902e-05,
"loss": 0.5717,
"step": 3880
},
{
"epoch": 1.1065399347425167,
"grad_norm": 0.5090216398239136,
"learning_rate": 1.3143530448868198e-05,
"loss": 0.5663,
"step": 3900
},
{
"epoch": 1.112214498510427,
"grad_norm": 0.512146532535553,
"learning_rate": 1.3006801012880425e-05,
"loss": 0.5656,
"step": 3920
},
{
"epoch": 1.1178890622783373,
"grad_norm": 0.5273200869560242,
"learning_rate": 1.2870239961825853e-05,
"loss": 0.5621,
"step": 3940
},
{
"epoch": 1.1235636260462476,
"grad_norm": 0.5408139824867249,
"learning_rate": 1.2733858832346648e-05,
"loss": 0.5744,
"step": 3960
},
{
"epoch": 1.1292381898141581,
"grad_norm": 0.4986436069011688,
"learning_rate": 1.2597669145885231e-05,
"loss": 0.5704,
"step": 3980
},
{
"epoch": 1.1349127535820684,
"grad_norm": 0.5186699628829956,
"learning_rate": 1.2461682407710973e-05,
"loss": 0.5588,
"step": 4000
},
{
"epoch": 1.1405873173499788,
"grad_norm": 0.5081115365028381,
"learning_rate": 1.2325910105948229e-05,
"loss": 0.5667,
"step": 4020
},
{
"epoch": 1.146261881117889,
"grad_norm": 0.501616358757019,
"learning_rate": 1.219036371060582e-05,
"loss": 0.5628,
"step": 4040
},
{
"epoch": 1.1519364448857994,
"grad_norm": 0.5288362503051758,
"learning_rate": 1.2055054672608043e-05,
"loss": 0.5642,
"step": 4060
},
{
"epoch": 1.1576110086537097,
"grad_norm": 0.5392152070999146,
"learning_rate": 1.1919994422827326e-05,
"loss": 0.5606,
"step": 4080
},
{
"epoch": 1.16328557242162,
"grad_norm": 0.514348030090332,
"learning_rate": 1.1785194371118521e-05,
"loss": 0.5653,
"step": 4100
},
{
"epoch": 1.1689601361895305,
"grad_norm": 0.4942004978656769,
"learning_rate": 1.1650665905355014e-05,
"loss": 0.5622,
"step": 4120
},
{
"epoch": 1.1746346999574409,
"grad_norm": 0.48802751302719116,
"learning_rate": 1.1516420390466685e-05,
"loss": 0.5613,
"step": 4140
},
{
"epoch": 1.1803092637253512,
"grad_norm": 0.5025625228881836,
"learning_rate": 1.1382469167479795e-05,
"loss": 0.5656,
"step": 4160
},
{
"epoch": 1.1859838274932615,
"grad_norm": 0.5276467204093933,
"learning_rate": 1.1248823552558895e-05,
"loss": 0.5639,
"step": 4180
},
{
"epoch": 1.1916583912611718,
"grad_norm": 0.5035718083381653,
"learning_rate": 1.1115494836050861e-05,
"loss": 0.5612,
"step": 4200
},
{
"epoch": 1.197332955029082,
"grad_norm": 0.5080997347831726,
"learning_rate": 1.0982494281531069e-05,
"loss": 0.5647,
"step": 4220
},
{
"epoch": 1.2030075187969924,
"grad_norm": 0.505695104598999,
"learning_rate": 1.0849833124851846e-05,
"loss": 0.5681,
"step": 4240
},
{
"epoch": 1.2086820825649027,
"grad_norm": 0.48905614018440247,
"learning_rate": 1.0717522573193281e-05,
"loss": 0.561,
"step": 4260
},
{
"epoch": 1.2143566463328133,
"grad_norm": 0.49127668142318726,
"learning_rate": 1.0585573804116448e-05,
"loss": 0.5639,
"step": 4280
},
{
"epoch": 1.2200312101007236,
"grad_norm": 0.5206524729728699,
"learning_rate": 1.0453997964619112e-05,
"loss": 0.5594,
"step": 4300
},
{
"epoch": 1.2257057738686339,
"grad_norm": 0.48683062195777893,
"learning_rate": 1.0322806170194061e-05,
"loss": 0.5622,
"step": 4320
},
{
"epoch": 1.2313803376365442,
"grad_norm": 0.532207190990448,
"learning_rate": 1.0192009503890037e-05,
"loss": 0.5581,
"step": 4340
},
{
"epoch": 1.2370549014044545,
"grad_norm": 0.49200239777565,
"learning_rate": 1.0061619015375473e-05,
"loss": 0.5594,
"step": 4360
},
{
"epoch": 1.2427294651723648,
"grad_norm": 0.504898190498352,
"learning_rate": 9.931645720004995e-06,
"loss": 0.5622,
"step": 4380
},
{
"epoch": 1.2484040289402751,
"grad_norm": 0.5061923861503601,
"learning_rate": 9.802100597888877e-06,
"loss": 0.5572,
"step": 4400
},
{
"epoch": 1.2540785927081854,
"grad_norm": 0.4961055815219879,
"learning_rate": 9.672994592965409e-06,
"loss": 0.5609,
"step": 4420
},
{
"epoch": 1.259753156476096,
"grad_norm": 0.4930592477321625,
"learning_rate": 9.544338612076396e-06,
"loss": 0.5637,
"step": 4440
},
{
"epoch": 1.2654277202440063,
"grad_norm": 0.4978179335594177,
"learning_rate": 9.41614352404571e-06,
"loss": 0.5615,
"step": 4460
},
{
"epoch": 1.2711022840119166,
"grad_norm": 0.5112114548683167,
"learning_rate": 9.288420158761127e-06,
"loss": 0.558,
"step": 4480
},
{
"epoch": 1.276776847779827,
"grad_norm": 0.5114573240280151,
"learning_rate": 9.161179306259401e-06,
"loss": 0.5561,
"step": 4500
},
{
"epoch": 1.2824514115477372,
"grad_norm": 0.5023430585861206,
"learning_rate": 9.034431715814726e-06,
"loss": 0.5558,
"step": 4520
},
{
"epoch": 1.2881259753156475,
"grad_norm": 0.503487765789032,
"learning_rate": 8.908188095030655e-06,
"loss": 0.5607,
"step": 4540
},
{
"epoch": 1.2938005390835579,
"grad_norm": 0.5188455581665039,
"learning_rate": 8.78245910893552e-06,
"loss": 0.5639,
"step": 4560
},
{
"epoch": 1.2994751028514684,
"grad_norm": 0.5216081738471985,
"learning_rate": 8.657255379081438e-06,
"loss": 0.5584,
"step": 4580
},
{
"epoch": 1.3051496666193787,
"grad_norm": 0.5024508833885193,
"learning_rate": 8.532587482647013e-06,
"loss": 0.5604,
"step": 4600
},
{
"epoch": 1.310824230387289,
"grad_norm": 0.5100445747375488,
"learning_rate": 8.408465951543779e-06,
"loss": 0.5596,
"step": 4620
},
{
"epoch": 1.3164987941551993,
"grad_norm": 0.5005710124969482,
"learning_rate": 8.284901271526481e-06,
"loss": 0.5591,
"step": 4640
},
{
"epoch": 1.3221733579231096,
"grad_norm": 0.5151055455207825,
"learning_rate": 8.161903881307231e-06,
"loss": 0.5462,
"step": 4660
},
{
"epoch": 1.32784792169102,
"grad_norm": 0.4919968545436859,
"learning_rate": 8.039484171673628e-06,
"loss": 0.5523,
"step": 4680
},
{
"epoch": 1.3335224854589303,
"grad_norm": 0.5007758140563965,
"learning_rate": 7.917652484610975e-06,
"loss": 0.5545,
"step": 4700
},
{
"epoch": 1.3391970492268408,
"grad_norm": 0.4885912537574768,
"learning_rate": 7.796419112428583e-06,
"loss": 0.5582,
"step": 4720
},
{
"epoch": 1.344871612994751,
"grad_norm": 0.4874049127101898,
"learning_rate": 7.675794296890265e-06,
"loss": 0.5505,
"step": 4740
},
{
"epoch": 1.3505461767626614,
"grad_norm": 0.46998655796051025,
"learning_rate": 7.555788228349143e-06,
"loss": 0.554,
"step": 4760
},
{
"epoch": 1.3562207405305717,
"grad_norm": 0.4996753931045532,
"learning_rate": 7.436411044886753e-06,
"loss": 0.5513,
"step": 4780
},
{
"epoch": 1.361895304298482,
"grad_norm": 0.502571165561676,
"learning_rate": 7.31767283145657e-06,
"loss": 0.5547,
"step": 4800
},
{
"epoch": 1.3675698680663924,
"grad_norm": 0.48792627453804016,
"learning_rate": 7.199583619032052e-06,
"loss": 0.5551,
"step": 4820
},
{
"epoch": 1.3732444318343027,
"grad_norm": 0.48799988627433777,
"learning_rate": 7.082153383759222e-06,
"loss": 0.5524,
"step": 4840
},
{
"epoch": 1.3789189956022132,
"grad_norm": 0.4976406991481781,
"learning_rate": 6.9653920461138755e-06,
"loss": 0.5548,
"step": 4860
},
{
"epoch": 1.3845935593701233,
"grad_norm": 0.5006715655326843,
"learning_rate": 6.849309470063529e-06,
"loss": 0.5544,
"step": 4880
},
{
"epoch": 1.3902681231380338,
"grad_norm": 0.4864628314971924,
"learning_rate": 6.7339154622340754e-06,
"loss": 0.5483,
"step": 4900
},
{
"epoch": 1.3959426869059441,
"grad_norm": 0.48580724000930786,
"learning_rate": 6.619219771081361e-06,
"loss": 0.5544,
"step": 4920
},
{
"epoch": 1.4016172506738545,
"grad_norm": 0.5042415857315063,
"learning_rate": 6.505232086067607e-06,
"loss": 0.5504,
"step": 4940
},
{
"epoch": 1.4072918144417648,
"grad_norm": 0.4970082640647888,
"learning_rate": 6.391962036842863e-06,
"loss": 0.547,
"step": 4960
},
{
"epoch": 1.412966378209675,
"grad_norm": 0.47866857051849365,
"learning_rate": 6.279419192431494e-06,
"loss": 0.5548,
"step": 4980
},
{
"epoch": 1.4186409419775854,
"grad_norm": 0.4664076566696167,
"learning_rate": 6.167613060423789e-06,
"loss": 0.5454,
"step": 5000
},
{
"epoch": 1.4243155057454957,
"grad_norm": 0.49711087346076965,
"learning_rate": 6.0565530861727685e-06,
"loss": 0.5519,
"step": 5020
},
{
"epoch": 1.4299900695134062,
"grad_norm": 0.46965324878692627,
"learning_rate": 5.946248651996244e-06,
"loss": 0.5519,
"step": 5040
},
{
"epoch": 1.4356646332813165,
"grad_norm": 0.505743145942688,
"learning_rate": 5.836709076384188e-06,
"loss": 0.5482,
"step": 5060
},
{
"epoch": 1.4413391970492269,
"grad_norm": 0.5078002214431763,
"learning_rate": 5.727943613211521e-06,
"loss": 0.5575,
"step": 5080
},
{
"epoch": 1.4470137608171372,
"grad_norm": 0.48647207021713257,
"learning_rate": 5.619961450956347e-06,
"loss": 0.5461,
"step": 5100
},
{
"epoch": 1.4526883245850475,
"grad_norm": 0.4711668789386749,
"learning_rate": 5.5127717119237084e-06,
"loss": 0.5472,
"step": 5120
},
{
"epoch": 1.4583628883529578,
"grad_norm": 0.518395721912384,
"learning_rate": 5.406383451474948e-06,
"loss": 0.5483,
"step": 5140
},
{
"epoch": 1.464037452120868,
"grad_norm": 0.4849320948123932,
"learning_rate": 5.300805657262706e-06,
"loss": 0.5459,
"step": 5160
},
{
"epoch": 1.4697120158887786,
"grad_norm": 0.501943826675415,
"learning_rate": 5.1960472484716374e-06,
"loss": 0.5482,
"step": 5180
},
{
"epoch": 1.475386579656689,
"grad_norm": 0.48699691891670227,
"learning_rate": 5.092117075064931e-06,
"loss": 0.5522,
"step": 5200
},
{
"epoch": 1.4810611434245993,
"grad_norm": 0.48894861340522766,
"learning_rate": 4.989023917036667e-06,
"loss": 0.5502,
"step": 5220
},
{
"epoch": 1.4867357071925096,
"grad_norm": 0.49131521582603455,
"learning_rate": 4.886776483670077e-06,
"loss": 0.5466,
"step": 5240
},
{
"epoch": 1.49241027096042,
"grad_norm": 0.47139400243759155,
"learning_rate": 4.78538341280181e-06,
"loss": 0.5473,
"step": 5260
},
{
"epoch": 1.4980848347283302,
"grad_norm": 0.49604731798171997,
"learning_rate": 4.684853270092173e-06,
"loss": 0.5498,
"step": 5280
},
{
"epoch": 1.5037593984962405,
"grad_norm": 0.4864351749420166,
"learning_rate": 4.585194548301545e-06,
"loss": 0.5448,
"step": 5300
},
{
"epoch": 1.509433962264151,
"grad_norm": 0.48130905628204346,
"learning_rate": 4.486415666572874e-06,
"loss": 0.5469,
"step": 5320
},
{
"epoch": 1.5151085260320611,
"grad_norm": 0.4783124625682831,
"learning_rate": 4.388524969720458e-06,
"loss": 0.546,
"step": 5340
},
{
"epoch": 1.5207830897999717,
"grad_norm": 0.4969868063926697,
"learning_rate": 4.2915307275249585e-06,
"loss": 0.5453,
"step": 5360
},
{
"epoch": 1.526457653567882,
"grad_norm": 0.4832542836666107,
"learning_rate": 4.195441134034799e-06,
"loss": 0.5463,
"step": 5380
},
{
"epoch": 1.5321322173357923,
"grad_norm": 0.4712090790271759,
"learning_rate": 4.10026430687389e-06,
"loss": 0.5449,
"step": 5400
},
{
"epoch": 1.5378067811037026,
"grad_norm": 0.4822421967983246,
"learning_rate": 4.0060082865559035e-06,
"loss": 0.5465,
"step": 5420
},
{
"epoch": 1.543481344871613,
"grad_norm": 0.4809670150279999,
"learning_rate": 3.912681035804971e-06,
"loss": 0.5406,
"step": 5440
},
{
"epoch": 1.5491559086395235,
"grad_norm": 0.4631410539150238,
"learning_rate": 3.820290438883018e-06,
"loss": 0.5461,
"step": 5460
},
{
"epoch": 1.5548304724074336,
"grad_norm": 0.46498140692710876,
"learning_rate": 3.728844300923694e-06,
"loss": 0.5419,
"step": 5480
},
{
"epoch": 1.560505036175344,
"grad_norm": 0.4786704480648041,
"learning_rate": 3.6383503472730116e-06,
"loss": 0.5476,
"step": 5500
},
{
"epoch": 1.5661795999432544,
"grad_norm": 0.4655323624610901,
"learning_rate": 3.548816222836688e-06,
"loss": 0.5406,
"step": 5520
},
{
"epoch": 1.5718541637111647,
"grad_norm": 0.46424925327301025,
"learning_rate": 3.460249491434319e-06,
"loss": 0.5415,
"step": 5540
},
{
"epoch": 1.577528727479075,
"grad_norm": 0.45783787965774536,
"learning_rate": 3.3726576351603985e-06,
"loss": 0.5503,
"step": 5560
},
{
"epoch": 1.5832032912469853,
"grad_norm": 0.49086692929267883,
"learning_rate": 3.2860480537522103e-06,
"loss": 0.543,
"step": 5580
},
{
"epoch": 1.5888778550148959,
"grad_norm": 0.48474520444869995,
"learning_rate": 3.2004280639647122e-06,
"loss": 0.539,
"step": 5600
},
{
"epoch": 1.594552418782806,
"grad_norm": 0.5037649869918823,
"learning_rate": 3.115804898952434e-06,
"loss": 0.5415,
"step": 5620
},
{
"epoch": 1.6002269825507165,
"grad_norm": 0.4954313337802887,
"learning_rate": 3.032185707658389e-06,
"loss": 0.5487,
"step": 5640
},
{
"epoch": 1.6059015463186268,
"grad_norm": 0.4597771465778351,
"learning_rate": 2.949577554210157e-06,
"loss": 0.5445,
"step": 5660
},
{
"epoch": 1.6115761100865371,
"grad_norm": 0.4839852750301361,
"learning_rate": 2.8679874173231137e-06,
"loss": 0.5499,
"step": 5680
},
{
"epoch": 1.6172506738544474,
"grad_norm": 0.4653310179710388,
"learning_rate": 2.787422189710844e-06,
"loss": 0.5453,
"step": 5700
},
{
"epoch": 1.6229252376223577,
"grad_norm": 0.485579252243042,
"learning_rate": 2.7078886775028693e-06,
"loss": 0.5383,
"step": 5720
},
{
"epoch": 1.6285998013902683,
"grad_norm": 0.4727838337421417,
"learning_rate": 2.629393599669667e-06,
"loss": 0.5421,
"step": 5740
},
{
"epoch": 1.6342743651581784,
"grad_norm": 0.45239365100860596,
"learning_rate": 2.5519435874550434e-06,
"loss": 0.5357,
"step": 5760
},
{
"epoch": 1.639948928926089,
"grad_norm": 0.4669874310493469,
"learning_rate": 2.475545183815926e-06,
"loss": 0.5385,
"step": 5780
},
{
"epoch": 1.645623492693999,
"grad_norm": 0.4859563410282135,
"learning_rate": 2.400204842869637e-06,
"loss": 0.5446,
"step": 5800
},
{
"epoch": 1.6512980564619095,
"grad_norm": 0.4492729902267456,
"learning_rate": 2.3259289293486246e-06,
"loss": 0.5418,
"step": 5820
},
{
"epoch": 1.6569726202298198,
"grad_norm": 0.46383896470069885,
"learning_rate": 2.252723718062787e-06,
"loss": 0.5401,
"step": 5840
},
{
"epoch": 1.6626471839977301,
"grad_norm": 0.48168492317199707,
"learning_rate": 2.1805953933693835e-06,
"loss": 0.5423,
"step": 5860
},
{
"epoch": 1.6683217477656405,
"grad_norm": 0.46742239594459534,
"learning_rate": 2.109550048650563e-06,
"loss": 0.542,
"step": 5880
},
{
"epoch": 1.6739963115335508,
"grad_norm": 0.46751725673675537,
"learning_rate": 2.0395936857986125e-06,
"loss": 0.5402,
"step": 5900
},
{
"epoch": 1.6796708753014613,
"grad_norm": 0.49627310037612915,
"learning_rate": 1.970732214708908e-06,
"loss": 0.5461,
"step": 5920
},
{
"epoch": 1.6853454390693714,
"grad_norm": 0.46826520562171936,
"learning_rate": 1.9029714527806652e-06,
"loss": 0.5385,
"step": 5940
},
{
"epoch": 1.691020002837282,
"grad_norm": 0.4701858162879944,
"learning_rate": 1.8363171244254606e-06,
"loss": 0.5376,
"step": 5960
},
{
"epoch": 1.6966945666051922,
"grad_norm": 0.4635229706764221,
"learning_rate": 1.7707748605836632e-06,
"loss": 0.5378,
"step": 5980
},
{
"epoch": 1.7023691303731026,
"grad_norm": 0.4729613661766052,
"learning_rate": 1.7063501982487135e-06,
"loss": 0.5437,
"step": 6000
},
{
"epoch": 1.7080436941410129,
"grad_norm": 0.4672451913356781,
"learning_rate": 1.6430485799993673e-06,
"loss": 0.5428,
"step": 6020
},
{
"epoch": 1.7137182579089232,
"grad_norm": 0.46772390604019165,
"learning_rate": 1.5808753535399022e-06,
"loss": 0.5392,
"step": 6040
},
{
"epoch": 1.7193928216768337,
"grad_norm": 0.46337825059890747,
"learning_rate": 1.5198357712483629e-06,
"loss": 0.5413,
"step": 6060
},
{
"epoch": 1.7250673854447438,
"grad_norm": 0.48103076219558716,
"learning_rate": 1.459934989732818e-06,
"loss": 0.5416,
"step": 6080
},
{
"epoch": 1.7307419492126543,
"grad_norm": 0.45769959688186646,
"learning_rate": 1.4011780693957492e-06,
"loss": 0.5436,
"step": 6100
},
{
"epoch": 1.7364165129805647,
"grad_norm": 0.4552821218967438,
"learning_rate": 1.3435699740065377e-06,
"loss": 0.5425,
"step": 6120
},
{
"epoch": 1.742091076748475,
"grad_norm": 0.48623600602149963,
"learning_rate": 1.2871155702821324e-06,
"loss": 0.5427,
"step": 6140
},
{
"epoch": 1.7477656405163853,
"grad_norm": 0.5024483799934387,
"learning_rate": 1.231819627475911e-06,
"loss": 0.5384,
"step": 6160
},
{
"epoch": 1.7534402042842956,
"grad_norm": 0.4556623101234436,
"learning_rate": 1.1776868169747702e-06,
"loss": 0.5393,
"step": 6180
},
{
"epoch": 1.7591147680522061,
"grad_norm": 0.4748471677303314,
"learning_rate": 1.1247217119044951e-06,
"loss": 0.5385,
"step": 6200
},
{
"epoch": 1.7647893318201162,
"grad_norm": 0.4622340500354767,
"learning_rate": 1.07292878674342e-06,
"loss": 0.5377,
"step": 6220
},
{
"epoch": 1.7704638955880267,
"grad_norm": 0.4581329822540283,
"learning_rate": 1.0223124169444236e-06,
"loss": 0.5366,
"step": 6240
},
{
"epoch": 1.776138459355937,
"grad_norm": 0.4667391777038574,
"learning_rate": 9.72876878565287e-07,
"loss": 0.539,
"step": 6260
},
{
"epoch": 1.7818130231238474,
"grad_norm": 0.4563803970813751,
"learning_rate": 9.246263479074663e-07,
"loss": 0.5403,
"step": 6280
},
{
"epoch": 1.7874875868917577,
"grad_norm": 0.44948819279670715,
"learning_rate": 8.775649011632703e-07,
"loss": 0.5392,
"step": 6300
},
{
"epoch": 1.793162150659668,
"grad_norm": 0.4829549193382263,
"learning_rate": 8.316965140715071e-07,
"loss": 0.5373,
"step": 6320
},
{
"epoch": 1.7988367144275785,
"grad_norm": 0.4718981683254242,
"learning_rate": 7.870250615816182e-07,
"loss": 0.5383,
"step": 6340
},
{
"epoch": 1.8045112781954886,
"grad_norm": 0.4641667306423187,
"learning_rate": 7.435543175263166e-07,
"loss": 0.543,
"step": 6360
},
{
"epoch": 1.8101858419633992,
"grad_norm": 0.45884087681770325,
"learning_rate": 7.012879543027801e-07,
"loss": 0.538,
"step": 6380
},
{
"epoch": 1.8158604057313092,
"grad_norm": 0.4888609051704407,
"learning_rate": 6.602295425624033e-07,
"loss": 0.5366,
"step": 6400
},
{
"epoch": 1.8215349694992198,
"grad_norm": 0.46243107318878174,
"learning_rate": 6.20382550909157e-07,
"loss": 0.5365,
"step": 6420
},
{
"epoch": 1.82720953326713,
"grad_norm": 0.46520647406578064,
"learning_rate": 5.817503456065559e-07,
"loss": 0.5339,
"step": 6440
},
{
"epoch": 1.8328840970350404,
"grad_norm": 0.47549664974212646,
"learning_rate": 5.443361902932792e-07,
"loss": 0.5361,
"step": 6460
},
{
"epoch": 1.838558660802951,
"grad_norm": 0.4677965044975281,
"learning_rate": 5.081432457074614e-07,
"loss": 0.5394,
"step": 6480
},
{
"epoch": 1.844233224570861,
"grad_norm": 0.46250638365745544,
"learning_rate": 4.7317456941966597e-07,
"loss": 0.5388,
"step": 6500
},
{
"epoch": 1.8499077883387716,
"grad_norm": 0.4758864641189575,
"learning_rate": 4.3943311557459177e-07,
"loss": 0.534,
"step": 6520
},
{
"epoch": 1.8555823521066817,
"grad_norm": 0.4370381832122803,
"learning_rate": 4.069217346415027e-07,
"loss": 0.5339,
"step": 6540
},
{
"epoch": 1.8612569158745922,
"grad_norm": 0.4617324769496918,
"learning_rate": 3.756431731734272e-07,
"loss": 0.5396,
"step": 6560
},
{
"epoch": 1.8669314796425025,
"grad_norm": 0.4532717168331146,
"learning_rate": 3.4560007357511856e-07,
"loss": 0.5393,
"step": 6580
},
{
"epoch": 1.8726060434104128,
"grad_norm": 0.46486184000968933,
"learning_rate": 3.16794973879837e-07,
"loss": 0.5367,
"step": 6600
},
{
"epoch": 1.8782806071783231,
"grad_norm": 0.44514200091362,
"learning_rate": 2.8923030753492783e-07,
"loss": 0.5384,
"step": 6620
},
{
"epoch": 1.8839551709462334,
"grad_norm": 0.4737865924835205,
"learning_rate": 2.6290840319625255e-07,
"loss": 0.5355,
"step": 6640
},
{
"epoch": 1.889629734714144,
"grad_norm": 0.45271801948547363,
"learning_rate": 2.378314845314561e-07,
"loss": 0.5451,
"step": 6660
},
{
"epoch": 1.895304298482054,
"grad_norm": 0.46050384640693665,
"learning_rate": 2.14001670032124e-07,
"loss": 0.5347,
"step": 6680
},
{
"epoch": 1.9009788622499646,
"grad_norm": 0.4726841151714325,
"learning_rate": 1.9142097283479876e-07,
"loss": 0.5428,
"step": 6700
},
{
"epoch": 1.906653426017875,
"grad_norm": 0.4662003815174103,
"learning_rate": 1.700913005509208e-07,
"loss": 0.5407,
"step": 6720
},
{
"epoch": 1.9123279897857852,
"grad_norm": 0.44422999024391174,
"learning_rate": 1.500144551056709e-07,
"loss": 0.535,
"step": 6740
},
{
"epoch": 1.9180025535536955,
"grad_norm": 0.4599597752094269,
"learning_rate": 1.3119213258574015e-07,
"loss": 0.5376,
"step": 6760
},
{
"epoch": 1.9236771173216058,
"grad_norm": 0.4735456705093384,
"learning_rate": 1.1362592309605291e-07,
"loss": 0.5392,
"step": 6780
},
{
"epoch": 1.9293516810895164,
"grad_norm": 0.4692912995815277,
"learning_rate": 9.731731062542604e-08,
"loss": 0.5398,
"step": 6800
}
],
"logging_steps": 20,
"max_steps": 7048,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5124467391135325e+20,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
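The JSON above is the raw Trainer checkpoint state. As a minimal illustrative sketch (not part of the repository), the snippet below shows one way to read the `log_history` entries and visualize the loss curve and learning-rate schedule; the local filename `trainer_state.json` and the use of matplotlib are assumptions.

```python
# Minimal sketch: plot the logged training loss and learning-rate schedule
# from this trainer_state.json. Assumes the file is in the current directory
# and that matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry holds epoch, grad_norm, learning_rate, loss, and step.
steps = [e["step"] for e in state["log_history"]]
losses = [e["loss"] for e in state["log_history"]]
lrs = [e["learning_rate"] for e in state["log_history"]]

fig, ax_loss = plt.subplots()
ax_loss.plot(steps, losses, label="train loss")
ax_loss.set_xlabel("global step")
ax_loss.set_ylabel("loss")

ax_lr = ax_loss.twinx()  # second y-axis for the learning-rate schedule
ax_lr.plot(steps, lrs, color="tab:orange", label="learning rate")
ax_lr.set_ylabel("learning rate")

fig.suptitle(f"SFT progress up to step {state['global_step']} "
             f"(epoch {state['epoch']:.2f})")
fig.tight_layout()
plt.show()
```

Since `logging_steps` is 20, each plotted point is the average over 20 optimizer steps, so the curves are already lightly smoothed.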