{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997968855788761, "eval_steps": 50, "global_step": 1107, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013540961408259987, "grad_norm": 18.463732975176427, "learning_rate": 5e-07, "loss": 1.749, "step": 5 }, { "epoch": 0.027081922816519974, "grad_norm": 12.15126324106773, "learning_rate": 1e-06, "loss": 1.5921, "step": 10 }, { "epoch": 0.040622884224779957, "grad_norm": 7.7718176367972, "learning_rate": 9.9994874230328e-07, "loss": 1.2948, "step": 15 }, { "epoch": 0.05416384563303995, "grad_norm": 4.024003853982352, "learning_rate": 9.997949797225268e-07, "loss": 1.1393, "step": 20 }, { "epoch": 0.06770480704129993, "grad_norm": 3.796718690402949, "learning_rate": 9.995387437838025e-07, "loss": 1.0604, "step": 25 }, { "epoch": 0.08124576844955991, "grad_norm": 3.4511274356883295, "learning_rate": 9.991800870233637e-07, "loss": 1.0272, "step": 30 }, { "epoch": 0.0947867298578199, "grad_norm": 3.6605756447735724, "learning_rate": 9.98719082976888e-07, "loss": 1.003, "step": 35 }, { "epoch": 0.1083276912660799, "grad_norm": 3.5416553128618156, "learning_rate": 9.981558261643982e-07, "loss": 0.9719, "step": 40 }, { "epoch": 0.12186865267433988, "grad_norm": 3.5000585696846245, "learning_rate": 9.97490432070881e-07, "loss": 0.9584, "step": 45 }, { "epoch": 0.13540961408259986, "grad_norm": 3.805541453776684, "learning_rate": 9.967230371226118e-07, "loss": 0.9444, "step": 50 }, { "epoch": 0.13540961408259986, "eval_loss": 0.9377400875091553, "eval_runtime": 182.856, "eval_samples_per_second": 57.422, "eval_steps_per_second": 0.902, "step": 50 }, { "epoch": 0.14895057549085985, "grad_norm": 3.6018765357986844, "learning_rate": 9.958537986591803e-07, "loss": 0.9198, "step": 55 }, { "epoch": 0.16249153689911983, "grad_norm": 3.5280703732572545, "learning_rate": 9.948828949012327e-07, "loss": 0.9181, "step": 60 }, { "epoch": 0.17603249830737983, "grad_norm": 3.6595010678642925, "learning_rate": 9.938105249139305e-07, "loss": 0.9296, "step": 65 }, { "epoch": 0.1895734597156398, "grad_norm": 3.323630668058957, "learning_rate": 9.92636908566136e-07, "loss": 0.9241, "step": 70 }, { "epoch": 0.2031144211238998, "grad_norm": 3.425849289666098, "learning_rate": 9.913622864853324e-07, "loss": 0.8917, "step": 75 }, { "epoch": 0.2166553825321598, "grad_norm": 3.9967436509747025, "learning_rate": 9.89986920008288e-07, "loss": 0.8988, "step": 80 }, { "epoch": 0.23019634394041977, "grad_norm": 3.436149937331634, "learning_rate": 9.885110911274738e-07, "loss": 0.8774, "step": 85 }, { "epoch": 0.24373730534867977, "grad_norm": 3.642660003309155, "learning_rate": 9.869351024332466e-07, "loss": 0.8787, "step": 90 }, { "epoch": 0.25727826675693977, "grad_norm": 3.3574313517064978, "learning_rate": 9.852592770518082e-07, "loss": 0.8897, "step": 95 }, { "epoch": 0.2708192281651997, "grad_norm": 3.534867148811258, "learning_rate": 9.834839585789557e-07, "loss": 0.8668, "step": 100 }, { "epoch": 0.2708192281651997, "eval_loss": 0.8815732002258301, "eval_runtime": 182.989, "eval_samples_per_second": 57.38, "eval_steps_per_second": 0.902, "step": 100 }, { "epoch": 0.2843601895734597, "grad_norm": 3.476334412992533, "learning_rate": 9.816095110096324e-07, "loss": 0.8806, "step": 105 }, { "epoch": 0.2979011509817197, "grad_norm": 3.5218146742622856, "learning_rate": 9.796363186632983e-07, "loss": 0.8895, "step": 110 }, { "epoch": 0.3114421123899797, "grad_norm": 3.6344919656503545, "learning_rate": 9.775647861051328e-07, "loss": 0.8675, "step": 115 }, { "epoch": 0.32498307379823965, "grad_norm": 3.8058175800193537, "learning_rate": 9.753953380630862e-07, "loss": 0.8739, "step": 120 }, { "epoch": 0.33852403520649965, "grad_norm": 3.5458347600694515, "learning_rate": 9.731284193407981e-07, "loss": 0.8536, "step": 125 }, { "epoch": 0.35206499661475965, "grad_norm": 3.4919069279296138, "learning_rate": 9.707644947263975e-07, "loss": 0.8598, "step": 130 }, { "epoch": 0.36560595802301965, "grad_norm": 3.2003162498092848, "learning_rate": 9.683040488972086e-07, "loss": 0.8628, "step": 135 }, { "epoch": 0.3791469194312796, "grad_norm": 3.337658064243358, "learning_rate": 9.657475863203756e-07, "loss": 0.8633, "step": 140 }, { "epoch": 0.3926878808395396, "grad_norm": 3.4844992093115987, "learning_rate": 9.63095631149432e-07, "loss": 0.859, "step": 145 }, { "epoch": 0.4062288422477996, "grad_norm": 3.5999127416589296, "learning_rate": 9.603487271168336e-07, "loss": 0.8436, "step": 150 }, { "epoch": 0.4062288422477996, "eval_loss": 0.8559273481369019, "eval_runtime": 182.9208, "eval_samples_per_second": 57.402, "eval_steps_per_second": 0.902, "step": 150 }, { "epoch": 0.4197698036560596, "grad_norm": 3.4543800855029807, "learning_rate": 9.575074374224758e-07, "loss": 0.8431, "step": 155 }, { "epoch": 0.4333107650643196, "grad_norm": 3.574884162982998, "learning_rate": 9.545723446182201e-07, "loss": 0.8466, "step": 160 }, { "epoch": 0.44685172647257954, "grad_norm": 3.230714522717894, "learning_rate": 9.515440504884539e-07, "loss": 0.829, "step": 165 }, { "epoch": 0.46039268788083954, "grad_norm": 3.38192753716993, "learning_rate": 9.484231759267054e-07, "loss": 0.8462, "step": 170 }, { "epoch": 0.47393364928909953, "grad_norm": 3.562118447176729, "learning_rate": 9.452103608083417e-07, "loss": 0.8602, "step": 175 }, { "epoch": 0.48747461069735953, "grad_norm": 3.2476691940428353, "learning_rate": 9.419062638593748e-07, "loss": 0.8266, "step": 180 }, { "epoch": 0.5010155721056195, "grad_norm": 3.494748883924626, "learning_rate": 9.385115625214021e-07, "loss": 0.8219, "step": 185 }, { "epoch": 0.5145565335138795, "grad_norm": 3.4365818103766452, "learning_rate": 9.350269528127101e-07, "loss": 0.8438, "step": 190 }, { "epoch": 0.5280974949221394, "grad_norm": 3.245299565361765, "learning_rate": 9.31453149185569e-07, "loss": 0.8337, "step": 195 }, { "epoch": 0.5416384563303994, "grad_norm": 3.603292666759954, "learning_rate": 9.277908843797492e-07, "loss": 0.8192, "step": 200 }, { "epoch": 0.5416384563303994, "eval_loss": 0.8383815884590149, "eval_runtime": 183.0029, "eval_samples_per_second": 57.376, "eval_steps_per_second": 0.902, "step": 200 }, { "epoch": 0.5551794177386594, "grad_norm": 3.3561089172228797, "learning_rate": 9.240409092722852e-07, "loss": 0.8483, "step": 205 }, { "epoch": 0.5687203791469194, "grad_norm": 3.351512113878825, "learning_rate": 9.20203992723524e-07, "loss": 0.8109, "step": 210 }, { "epoch": 0.5822613405551794, "grad_norm": 3.4763361691712293, "learning_rate": 9.162809214194851e-07, "loss": 0.8335, "step": 215 }, { "epoch": 0.5958023019634394, "grad_norm": 3.292224110319897, "learning_rate": 9.122724997105646e-07, "loss": 0.8465, "step": 220 }, { "epoch": 0.6093432633716994, "grad_norm": 3.566456464228769, "learning_rate": 9.0817954944662e-07, "loss": 0.8257, "step": 225 }, { "epoch": 0.6228842247799594, "grad_norm": 3.6677580289310088, "learning_rate": 9.040029098084643e-07, "loss": 0.8479, "step": 230 }, { "epoch": 0.6364251861882194, "grad_norm": 3.4606320291357546, "learning_rate": 8.997434371358092e-07, "loss": 0.8412, "step": 235 }, { "epoch": 0.6499661475964793, "grad_norm": 3.2432058195015467, "learning_rate": 8.954020047516884e-07, "loss": 0.7984, "step": 240 }, { "epoch": 0.6635071090047393, "grad_norm": 3.24817794736711, "learning_rate": 8.909795027833996e-07, "loss": 0.834, "step": 245 }, { "epoch": 0.6770480704129993, "grad_norm": 3.653634156312455, "learning_rate": 8.864768379800016e-07, "loss": 0.8285, "step": 250 }, { "epoch": 0.6770480704129993, "eval_loss": 0.8261091709136963, "eval_runtime": 182.9038, "eval_samples_per_second": 57.407, "eval_steps_per_second": 0.902, "step": 250 }, { "epoch": 0.6905890318212593, "grad_norm": 3.5255613554356384, "learning_rate": 8.81894933526402e-07, "loss": 0.8322, "step": 255 }, { "epoch": 0.7041299932295193, "grad_norm": 3.727303124024427, "learning_rate": 8.772347288540763e-07, "loss": 0.8384, "step": 260 }, { "epoch": 0.7176709546377793, "grad_norm": 3.731075409838065, "learning_rate": 8.724971794484555e-07, "loss": 0.8263, "step": 265 }, { "epoch": 0.7312119160460393, "grad_norm": 3.6981830618689484, "learning_rate": 8.676832566530221e-07, "loss": 0.8165, "step": 270 }, { "epoch": 0.7447528774542993, "grad_norm": 3.498794505216147, "learning_rate": 8.62793947470155e-07, "loss": 0.8037, "step": 275 }, { "epoch": 0.7582938388625592, "grad_norm": 3.5635605377852024, "learning_rate": 8.578302543587629e-07, "loss": 0.7896, "step": 280 }, { "epoch": 0.7718348002708192, "grad_norm": 3.63798214652303, "learning_rate": 8.527931950287506e-07, "loss": 0.8129, "step": 285 }, { "epoch": 0.7853757616790792, "grad_norm": 3.518857236471562, "learning_rate": 8.47683802232356e-07, "loss": 0.8182, "step": 290 }, { "epoch": 0.7989167230873392, "grad_norm": 3.5608373625732694, "learning_rate": 8.425031235524045e-07, "loss": 0.8256, "step": 295 }, { "epoch": 0.8124576844955992, "grad_norm": 3.6755850332375286, "learning_rate": 8.372522211875224e-07, "loss": 0.8132, "step": 300 }, { "epoch": 0.8124576844955992, "eval_loss": 0.8165345191955566, "eval_runtime": 182.8879, "eval_samples_per_second": 57.412, "eval_steps_per_second": 0.902, "step": 300 }, { "epoch": 0.8259986459038592, "grad_norm": 3.596321476724727, "learning_rate": 8.319321717343535e-07, "loss": 0.8149, "step": 305 }, { "epoch": 0.8395396073121192, "grad_norm": 3.7225323914645747, "learning_rate": 8.265440659668234e-07, "loss": 0.8124, "step": 310 }, { "epoch": 0.8530805687203792, "grad_norm": 3.5663300158516016, "learning_rate": 8.210890086124977e-07, "loss": 0.7906, "step": 315 }, { "epoch": 0.8666215301286392, "grad_norm": 3.4256587260000604, "learning_rate": 8.155681181260776e-07, "loss": 0.81, "step": 320 }, { "epoch": 0.8801624915368991, "grad_norm": 3.4929314527584077, "learning_rate": 8.099825264600842e-07, "loss": 0.818, "step": 325 }, { "epoch": 0.8937034529451591, "grad_norm": 3.5559616159997742, "learning_rate": 8.04333378832772e-07, "loss": 0.8067, "step": 330 }, { "epoch": 0.9072444143534191, "grad_norm": 3.3786924383796544, "learning_rate": 7.98621833493324e-07, "loss": 0.802, "step": 335 }, { "epoch": 0.9207853757616791, "grad_norm": 3.6281675631205377, "learning_rate": 7.928490614843757e-07, "loss": 0.7991, "step": 340 }, { "epoch": 0.9343263371699391, "grad_norm": 3.5783671775239068, "learning_rate": 7.870162464019143e-07, "loss": 0.7961, "step": 345 }, { "epoch": 0.9478672985781991, "grad_norm": 3.523112182862875, "learning_rate": 7.811245841526062e-07, "loss": 0.7997, "step": 350 }, { "epoch": 0.9478672985781991, "eval_loss": 0.8075853586196899, "eval_runtime": 182.8371, "eval_samples_per_second": 57.428, "eval_steps_per_second": 0.902, "step": 350 }, { "epoch": 0.9614082599864591, "grad_norm": 3.327823061584991, "learning_rate": 7.75175282708598e-07, "loss": 0.7955, "step": 355 }, { "epoch": 0.9749492213947191, "grad_norm": 3.353254883691996, "learning_rate": 7.691695618598466e-07, "loss": 0.7945, "step": 360 }, { "epoch": 0.988490182802979, "grad_norm": 3.4573365027199547, "learning_rate": 7.631086529640229e-07, "loss": 0.8037, "step": 365 }, { "epoch": 1.002031144211239, "grad_norm": 3.704603819142361, "learning_rate": 7.569937986940475e-07, "loss": 0.7833, "step": 370 }, { "epoch": 1.015572105619499, "grad_norm": 3.338765249405955, "learning_rate": 7.508262527833028e-07, "loss": 0.736, "step": 375 }, { "epoch": 1.029113067027759, "grad_norm": 3.5005014577141154, "learning_rate": 7.446072797685799e-07, "loss": 0.7393, "step": 380 }, { "epoch": 1.042654028436019, "grad_norm": 3.5415652623772753, "learning_rate": 7.383381547308099e-07, "loss": 0.7461, "step": 385 }, { "epoch": 1.0561949898442788, "grad_norm": 3.430934641227424, "learning_rate": 7.320201630336318e-07, "loss": 0.7359, "step": 390 }, { "epoch": 1.0697359512525388, "grad_norm": 3.6468766296195896, "learning_rate": 7.256546000598551e-07, "loss": 0.7306, "step": 395 }, { "epoch": 1.0832769126607988, "grad_norm": 4.055674159253643, "learning_rate": 7.192427709458655e-07, "loss": 0.7335, "step": 400 }, { "epoch": 1.0832769126607988, "eval_loss": 0.804972231388092, "eval_runtime": 182.8412, "eval_samples_per_second": 57.427, "eval_steps_per_second": 0.902, "step": 400 }, { "epoch": 1.0968178740690588, "grad_norm": 3.7131013971600244, "learning_rate": 7.127859903140311e-07, "loss": 0.7346, "step": 405 }, { "epoch": 1.1103588354773188, "grad_norm": 3.496845579088122, "learning_rate": 7.062855820031659e-07, "loss": 0.7409, "step": 410 }, { "epoch": 1.1238997968855788, "grad_norm": 3.652963743841258, "learning_rate": 6.997428787971005e-07, "loss": 0.7236, "step": 415 }, { "epoch": 1.1374407582938388, "grad_norm": 3.6316016258989916, "learning_rate": 6.93159222151422e-07, "loss": 0.734, "step": 420 }, { "epoch": 1.1509817197020988, "grad_norm": 4.106556416672479, "learning_rate": 6.86535961918433e-07, "loss": 0.7425, "step": 425 }, { "epoch": 1.1645226811103588, "grad_norm": 3.4406596945092764, "learning_rate": 6.798744560703904e-07, "loss": 0.7271, "step": 430 }, { "epoch": 1.1780636425186188, "grad_norm": 3.5644142212223886, "learning_rate": 6.731760704210802e-07, "loss": 0.729, "step": 435 }, { "epoch": 1.1916046039268788, "grad_norm": 3.5207779341108316, "learning_rate": 6.66442178345783e-07, "loss": 0.7295, "step": 440 }, { "epoch": 1.2051455653351388, "grad_norm": 3.8426763234364643, "learning_rate": 6.596741604996897e-07, "loss": 0.7285, "step": 445 }, { "epoch": 1.2186865267433988, "grad_norm": 3.537416567969802, "learning_rate": 6.528734045348248e-07, "loss": 0.7466, "step": 450 }, { "epoch": 1.2186865267433988, "eval_loss": 0.7996942400932312, "eval_runtime": 182.8629, "eval_samples_per_second": 57.42, "eval_steps_per_second": 0.902, "step": 450 }, { "epoch": 1.2322274881516588, "grad_norm": 3.5566566741869603, "learning_rate": 6.460413048155354e-07, "loss": 0.7291, "step": 455 }, { "epoch": 1.2457684495599188, "grad_norm": 3.6983928953608323, "learning_rate": 6.391792621326027e-07, "loss": 0.7502, "step": 460 }, { "epoch": 1.2593094109681786, "grad_norm": 3.679881191981186, "learning_rate": 6.322886834160377e-07, "loss": 0.7375, "step": 465 }, { "epoch": 1.2728503723764386, "grad_norm": 3.635647992456833, "learning_rate": 6.253709814466167e-07, "loss": 0.7446, "step": 470 }, { "epoch": 1.2863913337846986, "grad_norm": 3.7784368285832675, "learning_rate": 6.184275745662179e-07, "loss": 0.7307, "step": 475 }, { "epoch": 1.2999322951929586, "grad_norm": 3.975716487359526, "learning_rate": 6.114598863870178e-07, "loss": 0.727, "step": 480 }, { "epoch": 1.3134732566012186, "grad_norm": 3.8898112879763818, "learning_rate": 6.044693454996059e-07, "loss": 0.7351, "step": 485 }, { "epoch": 1.3270142180094786, "grad_norm": 3.883701681898452, "learning_rate": 5.974573851800817e-07, "loss": 0.7376, "step": 490 }, { "epoch": 1.3405551794177386, "grad_norm": 3.5463439014644695, "learning_rate": 5.904254430961869e-07, "loss": 0.7413, "step": 495 }, { "epoch": 1.3540961408259986, "grad_norm": 3.5029550543033374, "learning_rate": 5.833749610125401e-07, "loss": 0.7264, "step": 500 }, { "epoch": 1.3540961408259986, "eval_loss": 0.7957150340080261, "eval_runtime": 182.9392, "eval_samples_per_second": 57.396, "eval_steps_per_second": 0.902, "step": 500 }, { "epoch": 1.3676371022342586, "grad_norm": 3.9766897722099124, "learning_rate": 5.763073844950309e-07, "loss": 0.7327, "step": 505 }, { "epoch": 1.3811780636425186, "grad_norm": 3.80954804006231, "learning_rate": 5.69224162614434e-07, "loss": 0.7443, "step": 510 }, { "epoch": 1.3947190250507786, "grad_norm": 3.524183540063996, "learning_rate": 5.621267476493052e-07, "loss": 0.7345, "step": 515 }, { "epoch": 1.4082599864590386, "grad_norm": 3.620522282934874, "learning_rate": 5.550165947882196e-07, "loss": 0.7236, "step": 520 }, { "epoch": 1.4218009478672986, "grad_norm": 3.768208934721801, "learning_rate": 5.478951618314132e-07, "loss": 0.7165, "step": 525 }, { "epoch": 1.4353419092755586, "grad_norm": 3.5182066154018345, "learning_rate": 5.407639088918888e-07, "loss": 0.7297, "step": 530 }, { "epoch": 1.4488828706838186, "grad_norm": 3.824867160497226, "learning_rate": 5.33624298096048e-07, "loss": 0.7307, "step": 535 }, { "epoch": 1.4624238320920786, "grad_norm": 3.488297423659722, "learning_rate": 5.264777932839104e-07, "loss": 0.7487, "step": 540 }, { "epoch": 1.4759647935003386, "grad_norm": 3.493644177453455, "learning_rate": 5.193258597089809e-07, "loss": 0.7291, "step": 545 }, { "epoch": 1.4895057549085986, "grad_norm": 3.6179715470055824, "learning_rate": 5.121699637378282e-07, "loss": 0.7286, "step": 550 }, { "epoch": 1.4895057549085986, "eval_loss": 0.7910673022270203, "eval_runtime": 182.8784, "eval_samples_per_second": 57.415, "eval_steps_per_second": 0.902, "step": 550 }, { "epoch": 1.5030467163168586, "grad_norm": 3.5922430809215777, "learning_rate": 5.050115725494339e-07, "loss": 0.7179, "step": 555 }, { "epoch": 1.5165876777251186, "grad_norm": 3.4032672519789786, "learning_rate": 4.978521538343764e-07, "loss": 0.7366, "step": 560 }, { "epoch": 1.5301286391333786, "grad_norm": 3.6567954327498327, "learning_rate": 4.906931754939083e-07, "loss": 0.7391, "step": 565 }, { "epoch": 1.5436696005416386, "grad_norm": 3.5644839916468243, "learning_rate": 4.835361053389921e-07, "loss": 0.7288, "step": 570 }, { "epoch": 1.5572105619498986, "grad_norm": 3.650299513434273, "learning_rate": 4.763824107893532e-07, "loss": 0.729, "step": 575 }, { "epoch": 1.5707515233581584, "grad_norm": 3.8055072261605116, "learning_rate": 4.692335585726145e-07, "loss": 0.7177, "step": 580 }, { "epoch": 1.5842924847664184, "grad_norm": 3.5985047420048697, "learning_rate": 4.6209101442357116e-07, "loss": 0.7224, "step": 585 }, { "epoch": 1.5978334461746784, "grad_norm": 3.547274876419971, "learning_rate": 4.549562427836701e-07, "loss": 0.7327, "step": 590 }, { "epoch": 1.6113744075829384, "grad_norm": 3.511159114275773, "learning_rate": 4.4783070650075537e-07, "loss": 0.7406, "step": 595 }, { "epoch": 1.6249153689911984, "grad_norm": 3.46127753253687, "learning_rate": 4.407158665291376e-07, "loss": 0.7251, "step": 600 }, { "epoch": 1.6249153689911984, "eval_loss": 0.7875649333000183, "eval_runtime": 182.8944, "eval_samples_per_second": 57.41, "eval_steps_per_second": 0.902, "step": 600 }, { "epoch": 1.6384563303994584, "grad_norm": 3.409510769647259, "learning_rate": 4.336131816300548e-07, "loss": 0.719, "step": 605 }, { "epoch": 1.6519972918077184, "grad_norm": 3.5447361212717925, "learning_rate": 4.265241080725808e-07, "loss": 0.7287, "step": 610 }, { "epoch": 1.6655382532159784, "grad_norm": 4.059666587103208, "learning_rate": 4.194500993350453e-07, "loss": 0.7399, "step": 615 }, { "epoch": 1.6790792146242384, "grad_norm": 3.774832103683009, "learning_rate": 4.1239260580702634e-07, "loss": 0.7386, "step": 620 }, { "epoch": 1.6926201760324981, "grad_norm": 3.5490582421055885, "learning_rate": 4.053530744919749e-07, "loss": 0.7246, "step": 625 }, { "epoch": 1.7061611374407581, "grad_norm": 3.747116763144729, "learning_rate": 3.983329487105363e-07, "loss": 0.7372, "step": 630 }, { "epoch": 1.7197020988490181, "grad_norm": 3.4822384940575795, "learning_rate": 3.913336678046232e-07, "loss": 0.7323, "step": 635 }, { "epoch": 1.7332430602572781, "grad_norm": 3.825022969793332, "learning_rate": 3.8435666684230726e-07, "loss": 0.7097, "step": 640 }, { "epoch": 1.7467840216655381, "grad_norm": 3.696972226883697, "learning_rate": 3.774033763235861e-07, "loss": 0.7304, "step": 645 }, { "epoch": 1.7603249830737981, "grad_norm": 3.821368374115622, "learning_rate": 3.7047522188708606e-07, "loss": 0.727, "step": 650 }, { "epoch": 1.7603249830737981, "eval_loss": 0.7839689254760742, "eval_runtime": 182.875, "eval_samples_per_second": 57.416, "eval_steps_per_second": 0.902, "step": 650 }, { "epoch": 1.7738659444820581, "grad_norm": 3.665010892266409, "learning_rate": 3.635736240177627e-07, "loss": 0.7223, "step": 655 }, { "epoch": 1.7874069058903181, "grad_norm": 3.8705676996293916, "learning_rate": 3.5669999775565816e-07, "loss": 0.7313, "step": 660 }, { "epoch": 1.8009478672985781, "grad_norm": 3.5842111135660057, "learning_rate": 3.4985575240577365e-07, "loss": 0.7321, "step": 665 }, { "epoch": 1.8144888287068381, "grad_norm": 3.4424140077908767, "learning_rate": 3.4304229124911856e-07, "loss": 0.7316, "step": 670 }, { "epoch": 1.8280297901150981, "grad_norm": 3.4068267304866646, "learning_rate": 3.362610112549955e-07, "loss": 0.704, "step": 675 }, { "epoch": 1.8415707515233581, "grad_norm": 3.685081317112416, "learning_rate": 3.295133027945778e-07, "loss": 0.7167, "step": 680 }, { "epoch": 1.8551117129316181, "grad_norm": 3.7176624860947345, "learning_rate": 3.228005493558402e-07, "loss": 0.7094, "step": 685 }, { "epoch": 1.8686526743398781, "grad_norm": 3.718001701556429, "learning_rate": 3.1612412725990305e-07, "loss": 0.7312, "step": 690 }, { "epoch": 1.8821936357481381, "grad_norm": 3.491455379714816, "learning_rate": 3.0948540537884185e-07, "loss": 0.7264, "step": 695 }, { "epoch": 1.8957345971563981, "grad_norm": 3.7382317800607376, "learning_rate": 3.0288574485502756e-07, "loss": 0.7277, "step": 700 }, { "epoch": 1.8957345971563981, "eval_loss": 0.7811039090156555, "eval_runtime": 182.9386, "eval_samples_per_second": 57.396, "eval_steps_per_second": 0.902, "step": 700 }, { "epoch": 1.9092755585646581, "grad_norm": 3.4296946924569442, "learning_rate": 2.9632649882205083e-07, "loss": 0.7287, "step": 705 }, { "epoch": 1.9228165199729181, "grad_norm": 3.810691598305239, "learning_rate": 2.8980901212728723e-07, "loss": 0.7193, "step": 710 }, { "epoch": 1.9363574813811781, "grad_norm": 3.3584278752072496, "learning_rate": 2.833346210561619e-07, "loss": 0.7112, "step": 715 }, { "epoch": 1.9498984427894381, "grad_norm": 3.5743899932997185, "learning_rate": 2.769046530581708e-07, "loss": 0.7235, "step": 720 }, { "epoch": 1.9634394041976981, "grad_norm": 3.8331759574897375, "learning_rate": 2.705204264747125e-07, "loss": 0.724, "step": 725 }, { "epoch": 1.9769803656059581, "grad_norm": 3.6084594988279908, "learning_rate": 2.6418325026878665e-07, "loss": 0.7156, "step": 730 }, { "epoch": 1.9905213270142181, "grad_norm": 3.56873955236049, "learning_rate": 2.578944237566174e-07, "loss": 0.7163, "step": 735 }, { "epoch": 2.004062288422478, "grad_norm": 4.103162581101771, "learning_rate": 2.5165523634125337e-07, "loss": 0.7161, "step": 740 }, { "epoch": 2.017603249830738, "grad_norm": 3.7064092978900844, "learning_rate": 2.454669672481996e-07, "loss": 0.6754, "step": 745 }, { "epoch": 2.031144211238998, "grad_norm": 3.6900542156905196, "learning_rate": 2.393308852631373e-07, "loss": 0.6724, "step": 750 }, { "epoch": 2.031144211238998, "eval_loss": 0.7857776880264282, "eval_runtime": 183.0378, "eval_samples_per_second": 57.365, "eval_steps_per_second": 0.901, "step": 750 }, { "epoch": 2.044685172647258, "grad_norm": 3.5536516436485255, "learning_rate": 2.3324824847178494e-07, "loss": 0.6887, "step": 755 }, { "epoch": 2.058226134055518, "grad_norm": 3.7965121612299564, "learning_rate": 2.2722030400194975e-07, "loss": 0.666, "step": 760 }, { "epoch": 2.071767095463778, "grad_norm": 3.7936989153822007, "learning_rate": 2.2124828776782955e-07, "loss": 0.6789, "step": 765 }, { "epoch": 2.085308056872038, "grad_norm": 3.5401816973807043, "learning_rate": 2.1533342421661228e-07, "loss": 0.6665, "step": 770 }, { "epoch": 2.0988490182802977, "grad_norm": 3.7787164350636555, "learning_rate": 2.0947692607742618e-07, "loss": 0.6755, "step": 775 }, { "epoch": 2.1123899796885577, "grad_norm": 3.9382718196335267, "learning_rate": 2.0367999411269282e-07, "loss": 0.6821, "step": 780 }, { "epoch": 2.1259309410968177, "grad_norm": 3.8112238429444782, "learning_rate": 1.9794381687193456e-07, "loss": 0.6805, "step": 785 }, { "epoch": 2.1394719025050777, "grad_norm": 3.8744335724512204, "learning_rate": 1.9226957044808494e-07, "loss": 0.6657, "step": 790 }, { "epoch": 2.1530128639133377, "grad_norm": 3.7804638456283346, "learning_rate": 1.866584182363528e-07, "loss": 0.6789, "step": 795 }, { "epoch": 2.1665538253215977, "grad_norm": 3.8021451485147963, "learning_rate": 1.811115106956918e-07, "loss": 0.6883, "step": 800 }, { "epoch": 2.1665538253215977, "eval_loss": 0.7850033044815063, "eval_runtime": 182.9949, "eval_samples_per_second": 57.379, "eval_steps_per_second": 0.902, "step": 800 }, { "epoch": 2.1800947867298577, "grad_norm": 3.864215108703362, "learning_rate": 1.7562998511291943e-07, "loss": 0.6811, "step": 805 }, { "epoch": 2.1936357481381177, "grad_norm": 3.8300913859664667, "learning_rate": 1.702149653695395e-07, "loss": 0.6766, "step": 810 }, { "epoch": 2.2071767095463777, "grad_norm": 3.8635188226813666, "learning_rate": 1.6486756171131062e-07, "loss": 0.675, "step": 815 }, { "epoch": 2.2207176709546377, "grad_norm": 3.9283113465457355, "learning_rate": 1.595888705206128e-07, "loss": 0.6678, "step": 820 }, { "epoch": 2.2342586323628977, "grad_norm": 3.726732058605602, "learning_rate": 1.5437997409165476e-07, "loss": 0.6733, "step": 825 }, { "epoch": 2.2477995937711577, "grad_norm": 3.6073721199402318, "learning_rate": 1.4924194040856973e-07, "loss": 0.6794, "step": 830 }, { "epoch": 2.2613405551794177, "grad_norm": 3.920320006141431, "learning_rate": 1.4417582292644691e-07, "loss": 0.6871, "step": 835 }, { "epoch": 2.2748815165876777, "grad_norm": 3.781911882917061, "learning_rate": 1.3918266035534027e-07, "loss": 0.6774, "step": 840 }, { "epoch": 2.2884224779959377, "grad_norm": 4.013058729107201, "learning_rate": 1.3426347644730047e-07, "loss": 0.6816, "step": 845 }, { "epoch": 2.3019634394041977, "grad_norm": 3.776810144116961, "learning_rate": 1.2941927978647526e-07, "loss": 0.6709, "step": 850 }, { "epoch": 2.3019634394041977, "eval_loss": 0.7840232253074646, "eval_runtime": 182.9773, "eval_samples_per_second": 57.384, "eval_steps_per_second": 0.902, "step": 850 }, { "epoch": 2.3155044008124577, "grad_norm": 4.0267886200903344, "learning_rate": 1.2465106358231753e-07, "loss": 0.6765, "step": 855 }, { "epoch": 2.3290453622207177, "grad_norm": 3.79514301881657, "learning_rate": 1.1995980546594775e-07, "loss": 0.6633, "step": 860 }, { "epoch": 2.3425863236289777, "grad_norm": 3.7026026349952086, "learning_rate": 1.153464672897091e-07, "loss": 0.678, "step": 865 }, { "epoch": 2.3561272850372377, "grad_norm": 3.977299316585606, "learning_rate": 1.108119949299578e-07, "loss": 0.6875, "step": 870 }, { "epoch": 2.3696682464454977, "grad_norm": 3.9505974017459544, "learning_rate": 1.0635731809312992e-07, "loss": 0.6955, "step": 875 }, { "epoch": 2.3832092078537577, "grad_norm": 3.9944161998447116, "learning_rate": 1.0198335012512271e-07, "loss": 0.6843, "step": 880 }, { "epoch": 2.3967501692620177, "grad_norm": 4.08644897660094, "learning_rate": 9.769098782403041e-08, "loss": 0.7081, "step": 885 }, { "epoch": 2.4102911306702777, "grad_norm": 4.033807984306314, "learning_rate": 9.348111125627278e-08, "loss": 0.6758, "step": 890 }, { "epoch": 2.4238320920785377, "grad_norm": 3.615156557294799, "learning_rate": 8.935458357615583e-08, "loss": 0.6718, "step": 895 }, { "epoch": 2.4373730534867977, "grad_norm": 3.876477554855966, "learning_rate": 8.531225084889654e-08, "loss": 0.6598, "step": 900 }, { "epoch": 2.4373730534867977, "eval_loss": 0.7834283113479614, "eval_runtime": 183.0709, "eval_samples_per_second": 57.355, "eval_steps_per_second": 0.901, "step": 900 }, { "epoch": 2.4509140148950577, "grad_norm": 3.6988949380997336, "learning_rate": 8.135494187715475e-08, "loss": 0.6603, "step": 905 }, { "epoch": 2.4644549763033177, "grad_norm": 3.931452073089016, "learning_rate": 7.748346803110295e-08, "loss": 0.6832, "step": 910 }, { "epoch": 2.4779959377115777, "grad_norm": 3.8160191178139047, "learning_rate": 7.369862308207025e-08, "loss": 0.6583, "step": 915 }, { "epoch": 2.4915368991198377, "grad_norm": 3.8469114382677874, "learning_rate": 7.000118303979463e-08, "loss": 0.6808, "step": 920 }, { "epoch": 2.5050778605280977, "grad_norm": 3.7497256852290115, "learning_rate": 6.639190599331746e-08, "loss": 0.6762, "step": 925 }, { "epoch": 2.518618821936357, "grad_norm": 3.663314489242292, "learning_rate": 6.287153195555173e-08, "loss": 0.6663, "step": 930 }, { "epoch": 2.5321597833446177, "grad_norm": 3.8930436232018333, "learning_rate": 5.944078271155639e-08, "loss": 0.6648, "step": 935 }, { "epoch": 2.545700744752877, "grad_norm": 3.6616608952378904, "learning_rate": 5.610036167054838e-08, "loss": 0.6596, "step": 940 }, { "epoch": 2.5592417061611377, "grad_norm": 3.986331709466641, "learning_rate": 5.2850953721682635e-08, "loss": 0.669, "step": 945 }, { "epoch": 2.572782667569397, "grad_norm": 3.960581833122488, "learning_rate": 4.969322509362761e-08, "loss": 0.674, "step": 950 }, { "epoch": 2.572782667569397, "eval_loss": 0.7830283641815186, "eval_runtime": 182.8342, "eval_samples_per_second": 57.429, "eval_steps_per_second": 0.902, "step": 950 }, { "epoch": 2.5863236289776577, "grad_norm": 3.8618441431288217, "learning_rate": 4.662782321796849e-08, "loss": 0.6713, "step": 955 }, { "epoch": 2.599864590385917, "grad_norm": 3.5409233232724335, "learning_rate": 4.365537659646418e-08, "loss": 0.6747, "step": 960 }, { "epoch": 2.6134055517941777, "grad_norm": 3.744738202206873, "learning_rate": 4.0776494672184356e-08, "loss": 0.6846, "step": 965 }, { "epoch": 2.626946513202437, "grad_norm": 3.727245201869487, "learning_rate": 3.799176770455526e-08, "loss": 0.6616, "step": 970 }, { "epoch": 2.640487474610697, "grad_norm": 3.7258573002382147, "learning_rate": 3.530176664833834e-08, "loss": 0.675, "step": 975 }, { "epoch": 2.654028436018957, "grad_norm": 3.869690791825916, "learning_rate": 3.270704303656696e-08, "loss": 0.6875, "step": 980 }, { "epoch": 2.667569397427217, "grad_norm": 4.064616477774205, "learning_rate": 3.020812886746477e-08, "loss": 0.6808, "step": 985 }, { "epoch": 2.681110358835477, "grad_norm": 3.704371552936023, "learning_rate": 2.7805536495370373e-08, "loss": 0.6687, "step": 990 }, { "epoch": 2.694651320243737, "grad_norm": 4.055603563401218, "learning_rate": 2.5499758525688197e-08, "loss": 0.6584, "step": 995 }, { "epoch": 2.708192281651997, "grad_norm": 3.749384489878185, "learning_rate": 2.329126771388995e-08, "loss": 0.656, "step": 1000 }, { "epoch": 2.708192281651997, "eval_loss": 0.7828182578086853, "eval_runtime": 182.6838, "eval_samples_per_second": 57.476, "eval_steps_per_second": 0.903, "step": 1000 }, { "epoch": 2.721733243060257, "grad_norm": 3.869599930871293, "learning_rate": 2.1180516868584464e-08, "loss": 0.6716, "step": 1005 }, { "epoch": 2.735274204468517, "grad_norm": 3.930506514677681, "learning_rate": 1.916793875867839e-08, "loss": 0.6822, "step": 1010 }, { "epoch": 2.748815165876777, "grad_norm": 3.8320813018837616, "learning_rate": 1.7253946024645472e-08, "loss": 0.6627, "step": 1015 }, { "epoch": 2.762356127285037, "grad_norm": 3.986150848206186, "learning_rate": 1.5438931093921804e-08, "loss": 0.6727, "step": 1020 }, { "epoch": 2.775897088693297, "grad_norm": 3.764718626888124, "learning_rate": 1.372326610044705e-08, "loss": 0.6618, "step": 1025 }, { "epoch": 2.789438050101557, "grad_norm": 3.7384921853849393, "learning_rate": 1.2107302808364638e-08, "loss": 0.6614, "step": 1030 }, { "epoch": 2.802979011509817, "grad_norm": 3.8134941063063867, "learning_rate": 1.0591372539900056e-08, "loss": 0.6665, "step": 1035 }, { "epoch": 2.816519972918077, "grad_norm": 3.7735093872780197, "learning_rate": 9.175786107429085e-09, "loss": 0.6643, "step": 1040 }, { "epoch": 2.830060934326337, "grad_norm": 3.987550484105897, "learning_rate": 7.860833749751772e-09, "loss": 0.6739, "step": 1045 }, { "epoch": 2.843601895734597, "grad_norm": 3.9380769036431893, "learning_rate": 6.6467850725848705e-09, "loss": 0.6741, "step": 1050 }, { "epoch": 2.843601895734597, "eval_loss": 0.7824584245681763, "eval_runtime": 183.0944, "eval_samples_per_second": 57.347, "eval_steps_per_second": 0.901, "step": 1050 }, { "epoch": 2.857142857142857, "grad_norm": 4.011659897593238, "learning_rate": 5.5338889932838306e-09, "loss": 0.6842, "step": 1055 }, { "epoch": 2.870683818551117, "grad_norm": 3.808051194891409, "learning_rate": 4.5223736898076235e-09, "loss": 0.6806, "step": 1060 }, { "epoch": 2.884224779959377, "grad_norm": 3.8839001927108856, "learning_rate": 3.612446553934723e-09, "loss": 0.6679, "step": 1065 }, { "epoch": 2.897765741367637, "grad_norm": 3.9713825006231054, "learning_rate": 2.804294148741948e-09, "loss": 0.6733, "step": 1070 }, { "epoch": 2.911306702775897, "grad_norm": 3.783696934827739, "learning_rate": 2.0980821703527886e-09, "loss": 0.6736, "step": 1075 }, { "epoch": 2.924847664184157, "grad_norm": 3.770344601359413, "learning_rate": 1.4939554139648536e-09, "loss": 0.6649, "step": 1080 }, { "epoch": 2.938388625592417, "grad_norm": 3.8357523688679565, "learning_rate": 9.920377441623994e-10, "loss": 0.6718, "step": 1085 }, { "epoch": 2.951929587000677, "grad_norm": 4.03615476407359, "learning_rate": 5.92432069520199e-10, "loss": 0.6805, "step": 1090 }, { "epoch": 2.9654705484089368, "grad_norm": 3.8321791456875283, "learning_rate": 2.9522032150419705e-10, "loss": 0.6629, "step": 1095 }, { "epoch": 2.979011509817197, "grad_norm": 3.9373632743696056, "learning_rate": 1.0046343767294852e-10, "loss": 0.6592, "step": 1100 }, { "epoch": 2.979011509817197, "eval_loss": 0.7824262976646423, "eval_runtime": 182.8708, "eval_samples_per_second": 57.418, "eval_steps_per_second": 0.902, "step": 1100 }, { "epoch": 2.9925524712254568, "grad_norm": 3.6875439191522075, "learning_rate": 8.201349183611927e-12, "loss": 0.6534, "step": 1105 }, { "epoch": 2.997968855788761, "step": 1107, "total_flos": 6527139780231168.0, "train_loss": 0.7629147509572306, "train_runtime": 18558.2767, "train_samples_per_second": 15.276, "train_steps_per_second": 0.06 } ], "logging_steps": 5, "max_steps": 1107, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6527139780231168.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }