|
{ |
|
"best_metric": 0.026174582540988922, |
|
"best_model_checkpoint": "models/BKAI/checkpoint-24000", |
|
"epoch": 6.974716652136007, |
|
"eval_steps": 300, |
|
"global_step": 24000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02906131938390003, |
|
"grad_norm": 6.751401901245117, |
|
"learning_rate": 1.816695431010991e-07, |
|
"loss": 0.202, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05812263876780006, |
|
"grad_norm": 7.665974140167236, |
|
"learning_rate": 3.633390862021982e-07, |
|
"loss": 0.1929, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08718395815170009, |
|
"grad_norm": 5.795717239379883, |
|
"learning_rate": 5.450086293032973e-07, |
|
"loss": 0.1751, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08718395815170009, |
|
"eval_loss": 0.14298047125339508, |
|
"eval_runtime": 27.3707, |
|
"eval_samples_per_second": 446.937, |
|
"eval_steps_per_second": 13.993, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11624527753560011, |
|
"grad_norm": 2.549802303314209, |
|
"learning_rate": 7.266781724043964e-07, |
|
"loss": 0.1245, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14530659691950015, |
|
"grad_norm": 4.094095230102539, |
|
"learning_rate": 9.083477155054955e-07, |
|
"loss": 0.1347, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.17436791630340018, |
|
"grad_norm": 3.1329522132873535, |
|
"learning_rate": 1.0900172586065947e-06, |
|
"loss": 0.105, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.17436791630340018, |
|
"eval_loss": 0.09839651733636856, |
|
"eval_runtime": 26.3248, |
|
"eval_samples_per_second": 464.695, |
|
"eval_steps_per_second": 14.549, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2034292356873002, |
|
"grad_norm": 4.288038730621338, |
|
"learning_rate": 1.2716868017076938e-06, |
|
"loss": 0.0873, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.23249055507120023, |
|
"grad_norm": 5.486178398132324, |
|
"learning_rate": 1.4533563448087928e-06, |
|
"loss": 0.0916, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.26155187445510025, |
|
"grad_norm": 3.6196625232696533, |
|
"learning_rate": 1.6350258879098921e-06, |
|
"loss": 0.0882, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.26155187445510025, |
|
"eval_loss": 0.08222991973161697, |
|
"eval_runtime": 27.0353, |
|
"eval_samples_per_second": 452.483, |
|
"eval_steps_per_second": 14.167, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2906131938390003, |
|
"grad_norm": 5.516987323760986, |
|
"learning_rate": 1.816695431010991e-06, |
|
"loss": 0.079, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3196745132229003, |
|
"grad_norm": 0.9596192836761475, |
|
"learning_rate": 1.9983649741120904e-06, |
|
"loss": 0.071, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.34873583260680036, |
|
"grad_norm": 5.4598069190979, |
|
"learning_rate": 2.1800345172131893e-06, |
|
"loss": 0.0817, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.34873583260680036, |
|
"eval_loss": 0.07388558238744736, |
|
"eval_runtime": 26.4246, |
|
"eval_samples_per_second": 462.94, |
|
"eval_steps_per_second": 14.494, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.37779715199070035, |
|
"grad_norm": 3.363698959350586, |
|
"learning_rate": 2.3617040603142887e-06, |
|
"loss": 0.063, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.4068584713746004, |
|
"grad_norm": 4.268736839294434, |
|
"learning_rate": 2.5433736034153876e-06, |
|
"loss": 0.0726, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.43591979075850046, |
|
"grad_norm": 0.6281489133834839, |
|
"learning_rate": 2.7250431465164866e-06, |
|
"loss": 0.0767, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.43591979075850046, |
|
"eval_loss": 0.06805469840765, |
|
"eval_runtime": 27.0745, |
|
"eval_samples_per_second": 451.827, |
|
"eval_steps_per_second": 14.146, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.46498111014240046, |
|
"grad_norm": 1.1631907224655151, |
|
"learning_rate": 2.9067126896175855e-06, |
|
"loss": 0.0753, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4940424295263005, |
|
"grad_norm": 1.6831640005111694, |
|
"learning_rate": 3.0883822327186853e-06, |
|
"loss": 0.0647, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5231037489102005, |
|
"grad_norm": 0.39547181129455566, |
|
"learning_rate": 3.268235080388773e-06, |
|
"loss": 0.0538, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5231037489102005, |
|
"eval_loss": 0.06306594610214233, |
|
"eval_runtime": 26.3785, |
|
"eval_samples_per_second": 463.748, |
|
"eval_steps_per_second": 14.519, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5521650682941005, |
|
"grad_norm": 2.629936933517456, |
|
"learning_rate": 3.449904623489872e-06, |
|
"loss": 0.057, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5812263876780006, |
|
"grad_norm": 0.44680893421173096, |
|
"learning_rate": 3.6315741665909717e-06, |
|
"loss": 0.0592, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6102877070619006, |
|
"grad_norm": 6.168762683868408, |
|
"learning_rate": 3.8132437096920706e-06, |
|
"loss": 0.06, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6102877070619006, |
|
"eval_loss": 0.058808207511901855, |
|
"eval_runtime": 27.1053, |
|
"eval_samples_per_second": 451.314, |
|
"eval_steps_per_second": 14.13, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6393490264458006, |
|
"grad_norm": 2.6968770027160645, |
|
"learning_rate": 3.994913252793169e-06, |
|
"loss": 0.0626, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6684103458297007, |
|
"grad_norm": 1.397504210472107, |
|
"learning_rate": 4.176582795894268e-06, |
|
"loss": 0.0562, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6974716652136007, |
|
"grad_norm": 3.1529994010925293, |
|
"learning_rate": 4.358252338995368e-06, |
|
"loss": 0.0589, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6974716652136007, |
|
"eval_loss": 0.05554657801985741, |
|
"eval_runtime": 26.2164, |
|
"eval_samples_per_second": 466.617, |
|
"eval_steps_per_second": 14.609, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7265329845975007, |
|
"grad_norm": 3.10916805267334, |
|
"learning_rate": 4.539921882096467e-06, |
|
"loss": 0.0515, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7555943039814007, |
|
"grad_norm": 1.4427434206008911, |
|
"learning_rate": 4.721591425197566e-06, |
|
"loss": 0.0581, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7846556233653008, |
|
"grad_norm": 2.437389850616455, |
|
"learning_rate": 4.9032609682986655e-06, |
|
"loss": 0.0498, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7846556233653008, |
|
"eval_loss": 0.053471144288778305, |
|
"eval_runtime": 26.9156, |
|
"eval_samples_per_second": 454.495, |
|
"eval_steps_per_second": 14.23, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.8137169427492008, |
|
"grad_norm": 1.7395578622817993, |
|
"learning_rate": 5.084930511399764e-06, |
|
"loss": 0.0503, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8427782621331008, |
|
"grad_norm": 0.2161996066570282, |
|
"learning_rate": 5.266600054500863e-06, |
|
"loss": 0.047, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8718395815170009, |
|
"grad_norm": 0.2165631204843521, |
|
"learning_rate": 5.448269597601963e-06, |
|
"loss": 0.0454, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8718395815170009, |
|
"eval_loss": 0.052133820950984955, |
|
"eval_runtime": 26.4629, |
|
"eval_samples_per_second": 462.27, |
|
"eval_steps_per_second": 14.473, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.9009009009009009, |
|
"grad_norm": 0.16847111284732819, |
|
"learning_rate": 5.629939140703061e-06, |
|
"loss": 0.0497, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.9299622202848009, |
|
"grad_norm": 4.040170669555664, |
|
"learning_rate": 5.811608683804161e-06, |
|
"loss": 0.0524, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9590235396687009, |
|
"grad_norm": 3.6626970767974854, |
|
"learning_rate": 5.99327822690526e-06, |
|
"loss": 0.0423, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9590235396687009, |
|
"eval_loss": 0.04921015352010727, |
|
"eval_runtime": 27.2421, |
|
"eval_samples_per_second": 449.048, |
|
"eval_steps_per_second": 14.059, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.988084859052601, |
|
"grad_norm": 0.33825862407684326, |
|
"learning_rate": 6.174947770006359e-06, |
|
"loss": 0.0468, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.017146178436501, |
|
"grad_norm": 2.7132785320281982, |
|
"learning_rate": 6.356617313107458e-06, |
|
"loss": 0.0578, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.046207497820401, |
|
"grad_norm": 1.4403324127197266, |
|
"learning_rate": 6.538286856208557e-06, |
|
"loss": 0.0428, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.046207497820401, |
|
"eval_loss": 0.047407593578100204, |
|
"eval_runtime": 26.5867, |
|
"eval_samples_per_second": 460.117, |
|
"eval_steps_per_second": 14.406, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.075268817204301, |
|
"grad_norm": 3.5505058765411377, |
|
"learning_rate": 6.7199563993096566e-06, |
|
"loss": 0.0458, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.104330136588201, |
|
"grad_norm": 1.7245237827301025, |
|
"learning_rate": 6.9016259424107555e-06, |
|
"loss": 0.0377, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.1333914559721012, |
|
"grad_norm": 1.81324303150177, |
|
"learning_rate": 7.0832954855118544e-06, |
|
"loss": 0.0432, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.1333914559721012, |
|
"eval_loss": 0.04528222978115082, |
|
"eval_runtime": 26.7505, |
|
"eval_samples_per_second": 457.299, |
|
"eval_steps_per_second": 14.317, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.1624527753560012, |
|
"grad_norm": 0.5516623854637146, |
|
"learning_rate": 7.264965028612953e-06, |
|
"loss": 0.0402, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.1915140947399012, |
|
"grad_norm": 3.600156545639038, |
|
"learning_rate": 7.446634571714053e-06, |
|
"loss": 0.0403, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.2205754141238012, |
|
"grad_norm": 0.5710099935531616, |
|
"learning_rate": 7.628304114815151e-06, |
|
"loss": 0.0401, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.2205754141238012, |
|
"eval_loss": 0.04419328644871712, |
|
"eval_runtime": 26.7562, |
|
"eval_samples_per_second": 457.203, |
|
"eval_steps_per_second": 14.314, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.2496367335077012, |
|
"grad_norm": 0.9024702310562134, |
|
"learning_rate": 7.809973657916251e-06, |
|
"loss": 0.0441, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.2786980528916012, |
|
"grad_norm": 0.7580955028533936, |
|
"learning_rate": 7.99164320101735e-06, |
|
"loss": 0.0373, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.3077593722755014, |
|
"grad_norm": 0.7472540140151978, |
|
"learning_rate": 8.173312744118449e-06, |
|
"loss": 0.043, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.3077593722755014, |
|
"eval_loss": 0.0427699089050293, |
|
"eval_runtime": 26.7025, |
|
"eval_samples_per_second": 458.123, |
|
"eval_steps_per_second": 14.343, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.3368206916594012, |
|
"grad_norm": 5.362630367279053, |
|
"learning_rate": 8.354982287219548e-06, |
|
"loss": 0.0432, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.3658820110433014, |
|
"grad_norm": 2.7061116695404053, |
|
"learning_rate": 8.536651830320647e-06, |
|
"loss": 0.0386, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.3949433304272014, |
|
"grad_norm": 1.1733832359313965, |
|
"learning_rate": 8.718321373421746e-06, |
|
"loss": 0.0352, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.3949433304272014, |
|
"eval_loss": 0.041440799832344055, |
|
"eval_runtime": 26.9106, |
|
"eval_samples_per_second": 454.58, |
|
"eval_steps_per_second": 14.232, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.4240046498111014, |
|
"grad_norm": 2.7326438426971436, |
|
"learning_rate": 8.899990916522846e-06, |
|
"loss": 0.0389, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.4530659691950014, |
|
"grad_norm": 0.20390258729457855, |
|
"learning_rate": 9.081660459623944e-06, |
|
"loss": 0.04, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.4821272885789014, |
|
"grad_norm": 4.956657886505127, |
|
"learning_rate": 9.263330002725044e-06, |
|
"loss": 0.0394, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.4821272885789014, |
|
"eval_loss": 0.042791176587343216, |
|
"eval_runtime": 26.6527, |
|
"eval_samples_per_second": 458.977, |
|
"eval_steps_per_second": 14.37, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.5111886079628016, |
|
"grad_norm": 0.23265118896961212, |
|
"learning_rate": 9.444999545826143e-06, |
|
"loss": 0.0342, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.5402499273467014, |
|
"grad_norm": 2.009842872619629, |
|
"learning_rate": 9.626669088927242e-06, |
|
"loss": 0.0462, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.5693112467306016, |
|
"grad_norm": 0.5204980969429016, |
|
"learning_rate": 9.808338632028341e-06, |
|
"loss": 0.0412, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.5693112467306016, |
|
"eval_loss": 0.04058554396033287, |
|
"eval_runtime": 27.0549, |
|
"eval_samples_per_second": 452.154, |
|
"eval_steps_per_second": 14.156, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.5983725661145016, |
|
"grad_norm": 1.264519214630127, |
|
"learning_rate": 9.990008175129442e-06, |
|
"loss": 0.0352, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.6274338854984016, |
|
"grad_norm": 1.4044185876846313, |
|
"learning_rate": 1.0171677718230539e-05, |
|
"loss": 0.0363, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.6564952048823016, |
|
"grad_norm": 0.773714542388916, |
|
"learning_rate": 1.035334726133164e-05, |
|
"loss": 0.0416, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.6564952048823016, |
|
"eval_loss": 0.039202187210321426, |
|
"eval_runtime": 26.6017, |
|
"eval_samples_per_second": 459.858, |
|
"eval_steps_per_second": 14.398, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.6855565242662016, |
|
"grad_norm": 0.1811431348323822, |
|
"learning_rate": 1.0535016804432738e-05, |
|
"loss": 0.0287, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.7146178436501018, |
|
"grad_norm": 3.958439350128174, |
|
"learning_rate": 1.0716686347533837e-05, |
|
"loss": 0.0325, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.7436791630340016, |
|
"grad_norm": 0.5695263147354126, |
|
"learning_rate": 1.0898355890634935e-05, |
|
"loss": 0.0331, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.7436791630340016, |
|
"eval_loss": 0.037496764212846756, |
|
"eval_runtime": 26.9127, |
|
"eval_samples_per_second": 454.543, |
|
"eval_steps_per_second": 14.231, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.7727404824179018, |
|
"grad_norm": 3.3903560638427734, |
|
"learning_rate": 1.1080025433736035e-05, |
|
"loss": 0.0361, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.8018018018018018, |
|
"grad_norm": 2.8720123767852783, |
|
"learning_rate": 1.1261694976837134e-05, |
|
"loss": 0.0468, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.8308631211857018, |
|
"grad_norm": 1.0204403400421143, |
|
"learning_rate": 1.1443364519938233e-05, |
|
"loss": 0.0343, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.8308631211857018, |
|
"eval_loss": 0.037053827196359634, |
|
"eval_runtime": 26.3034, |
|
"eval_samples_per_second": 465.074, |
|
"eval_steps_per_second": 14.561, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.8599244405696018, |
|
"grad_norm": 2.0732407569885254, |
|
"learning_rate": 1.1625034063039334e-05, |
|
"loss": 0.0401, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.8889857599535018, |
|
"grad_norm": 3.727902412414551, |
|
"learning_rate": 1.1806703606140431e-05, |
|
"loss": 0.0425, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.918047079337402, |
|
"grad_norm": 1.8206995725631714, |
|
"learning_rate": 1.198837314924153e-05, |
|
"loss": 0.0342, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.918047079337402, |
|
"eval_loss": 0.038177795708179474, |
|
"eval_runtime": 27.1924, |
|
"eval_samples_per_second": 449.869, |
|
"eval_steps_per_second": 14.085, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.9471083987213018, |
|
"grad_norm": 0.9479349255561829, |
|
"learning_rate": 1.2170042692342629e-05, |
|
"loss": 0.0411, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.976169718105202, |
|
"grad_norm": 1.6102315187454224, |
|
"learning_rate": 1.235171223544373e-05, |
|
"loss": 0.0393, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 2.005231037489102, |
|
"grad_norm": 0.39504683017730713, |
|
"learning_rate": 1.2533381778544828e-05, |
|
"loss": 0.0379, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.005231037489102, |
|
"eval_loss": 0.037571169435977936, |
|
"eval_runtime": 26.9177, |
|
"eval_samples_per_second": 454.46, |
|
"eval_steps_per_second": 14.229, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.034292356873002, |
|
"grad_norm": 0.09198792278766632, |
|
"learning_rate": 1.2715051321645927e-05, |
|
"loss": 0.0226, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.0633536762569022, |
|
"grad_norm": 0.5564383864402771, |
|
"learning_rate": 1.2896720864747025e-05, |
|
"loss": 0.0251, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.092414995640802, |
|
"grad_norm": 1.9805023670196533, |
|
"learning_rate": 1.3078390407848125e-05, |
|
"loss": 0.0232, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.092414995640802, |
|
"eval_loss": 0.03626781702041626, |
|
"eval_runtime": 26.9532, |
|
"eval_samples_per_second": 453.86, |
|
"eval_steps_per_second": 14.21, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.1214763150247022, |
|
"grad_norm": 2.2151503562927246, |
|
"learning_rate": 1.3260059950949224e-05, |
|
"loss": 0.0311, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.150537634408602, |
|
"grad_norm": 3.625196695327759, |
|
"learning_rate": 1.3441729494050325e-05, |
|
"loss": 0.025, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.1795989537925022, |
|
"grad_norm": 3.1274352073669434, |
|
"learning_rate": 1.3623399037151424e-05, |
|
"loss": 0.0245, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.1795989537925022, |
|
"eval_loss": 0.036362066864967346, |
|
"eval_runtime": 26.6185, |
|
"eval_samples_per_second": 459.567, |
|
"eval_steps_per_second": 14.388, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.208660273176402, |
|
"grad_norm": 0.11472488194704056, |
|
"learning_rate": 1.3805068580252521e-05, |
|
"loss": 0.0291, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.2377215925603022, |
|
"grad_norm": 1.0436164140701294, |
|
"learning_rate": 1.398673812335362e-05, |
|
"loss": 0.03, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.2667829119442024, |
|
"grad_norm": 1.9227488040924072, |
|
"learning_rate": 1.416840766645472e-05, |
|
"loss": 0.0277, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.2667829119442024, |
|
"eval_loss": 0.03540974110364914, |
|
"eval_runtime": 26.7128, |
|
"eval_samples_per_second": 457.944, |
|
"eval_steps_per_second": 14.338, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.2958442313281022, |
|
"grad_norm": 4.112268447875977, |
|
"learning_rate": 1.4348260514124808e-05, |
|
"loss": 0.0242, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.3249055507120024, |
|
"grad_norm": 1.2703102827072144, |
|
"learning_rate": 1.4529930057225907e-05, |
|
"loss": 0.03, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.353966870095902, |
|
"grad_norm": 2.46053409576416, |
|
"learning_rate": 1.4711599600327007e-05, |
|
"loss": 0.0301, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.353966870095902, |
|
"eval_loss": 0.035567089915275574, |
|
"eval_runtime": 26.7341, |
|
"eval_samples_per_second": 457.58, |
|
"eval_steps_per_second": 14.326, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.3830281894798024, |
|
"grad_norm": 1.049849271774292, |
|
"learning_rate": 1.4893269143428106e-05, |
|
"loss": 0.0288, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.412089508863702, |
|
"grad_norm": 2.9456753730773926, |
|
"learning_rate": 1.5074938686529204e-05, |
|
"loss": 0.0197, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.4411508282476024, |
|
"grad_norm": 0.44164979457855225, |
|
"learning_rate": 1.5254791534199294e-05, |
|
"loss": 0.0351, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.4411508282476024, |
|
"eval_loss": 0.03602970018982887, |
|
"eval_runtime": 26.2631, |
|
"eval_samples_per_second": 465.787, |
|
"eval_steps_per_second": 14.583, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.4702121476315027, |
|
"grad_norm": 0.2530701160430908, |
|
"learning_rate": 1.543646107730039e-05, |
|
"loss": 0.0215, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.4992734670154024, |
|
"grad_norm": 0.1154685765504837, |
|
"learning_rate": 1.561813062040149e-05, |
|
"loss": 0.0359, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.5283347863993026, |
|
"grad_norm": 0.16534653306007385, |
|
"learning_rate": 1.579980016350259e-05, |
|
"loss": 0.0257, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.5283347863993026, |
|
"eval_loss": 0.03711829334497452, |
|
"eval_runtime": 26.9334, |
|
"eval_samples_per_second": 454.194, |
|
"eval_steps_per_second": 14.22, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.5573961057832024, |
|
"grad_norm": 4.247994899749756, |
|
"learning_rate": 1.5981469706603688e-05, |
|
"loss": 0.025, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.5864574251671026, |
|
"grad_norm": 3.4336764812469482, |
|
"learning_rate": 1.616313924970479e-05, |
|
"loss": 0.0337, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.615518744551003, |
|
"grad_norm": 3.021991014480591, |
|
"learning_rate": 1.634480879280589e-05, |
|
"loss": 0.0236, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.615518744551003, |
|
"eval_loss": 0.03501100093126297, |
|
"eval_runtime": 26.2977, |
|
"eval_samples_per_second": 465.175, |
|
"eval_steps_per_second": 14.564, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.6445800639349026, |
|
"grad_norm": 4.529074192047119, |
|
"learning_rate": 1.6526478335906987e-05, |
|
"loss": 0.0245, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.6736413833188024, |
|
"grad_norm": 2.828688859939575, |
|
"learning_rate": 1.6708147879008084e-05, |
|
"loss": 0.0293, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"grad_norm": 0.3687061071395874, |
|
"learning_rate": 1.6889817422109185e-05, |
|
"loss": 0.0291, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"eval_loss": 0.03628409281373024, |
|
"eval_runtime": 26.8142, |
|
"eval_samples_per_second": 456.214, |
|
"eval_steps_per_second": 14.283, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.731764022086603, |
|
"grad_norm": 0.35936692357063293, |
|
"learning_rate": 1.7071486965210285e-05, |
|
"loss": 0.0294, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 2.7608253414705026, |
|
"grad_norm": 0.48872271180152893, |
|
"learning_rate": 1.7253156508311382e-05, |
|
"loss": 0.0273, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.789886660854403, |
|
"grad_norm": 0.6060481071472168, |
|
"learning_rate": 1.743482605141248e-05, |
|
"loss": 0.0358, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.789886660854403, |
|
"eval_loss": 0.037452585995197296, |
|
"eval_runtime": 26.8066, |
|
"eval_samples_per_second": 456.344, |
|
"eval_steps_per_second": 14.288, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.8189479802383026, |
|
"grad_norm": 0.19470466673374176, |
|
"learning_rate": 1.761649559451358e-05, |
|
"loss": 0.0251, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 2.848009299622203, |
|
"grad_norm": 0.7011229991912842, |
|
"learning_rate": 1.779816513761468e-05, |
|
"loss": 0.0352, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 2.877070619006103, |
|
"grad_norm": 1.1960341930389404, |
|
"learning_rate": 1.7979834680715778e-05, |
|
"loss": 0.0289, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 2.877070619006103, |
|
"eval_loss": 0.036693498492240906, |
|
"eval_runtime": 28.4942, |
|
"eval_samples_per_second": 429.315, |
|
"eval_steps_per_second": 13.441, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 2.906131938390003, |
|
"grad_norm": 0.120386503636837, |
|
"learning_rate": 1.816150422381688e-05, |
|
"loss": 0.0306, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.935193257773903, |
|
"grad_norm": 0.15321113169193268, |
|
"learning_rate": 1.834317376691798e-05, |
|
"loss": 0.0249, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 2.964254577157803, |
|
"grad_norm": 0.7859700918197632, |
|
"learning_rate": 1.8524843310019077e-05, |
|
"loss": 0.0257, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.964254577157803, |
|
"eval_loss": 0.036162618547677994, |
|
"eval_runtime": 30.9741, |
|
"eval_samples_per_second": 394.944, |
|
"eval_steps_per_second": 12.365, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.993315896541703, |
|
"grad_norm": 0.7350950837135315, |
|
"learning_rate": 1.8706512853120174e-05, |
|
"loss": 0.0332, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 3.022377215925603, |
|
"grad_norm": 1.5788044929504395, |
|
"learning_rate": 1.8888182396221275e-05, |
|
"loss": 0.0208, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 3.051438535309503, |
|
"grad_norm": 1.8751883506774902, |
|
"learning_rate": 1.9069851939322375e-05, |
|
"loss": 0.0231, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.051438535309503, |
|
"eval_loss": 0.03594466298818588, |
|
"eval_runtime": 29.5347, |
|
"eval_samples_per_second": 414.191, |
|
"eval_steps_per_second": 12.968, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.0804998546934033, |
|
"grad_norm": 2.0853710174560547, |
|
"learning_rate": 1.9251521482423476e-05, |
|
"loss": 0.0216, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 3.109561174077303, |
|
"grad_norm": 0.48473218083381653, |
|
"learning_rate": 1.9433191025524573e-05, |
|
"loss": 0.0193, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 3.1386224934612033, |
|
"grad_norm": 0.1810705065727234, |
|
"learning_rate": 1.961486056862567e-05, |
|
"loss": 0.0175, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 3.1386224934612033, |
|
"eval_loss": 0.03672377020120621, |
|
"eval_runtime": 32.0584, |
|
"eval_samples_per_second": 381.584, |
|
"eval_steps_per_second": 11.947, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 3.167683812845103, |
|
"grad_norm": 0.8471315503120422, |
|
"learning_rate": 1.979653011172677e-05, |
|
"loss": 0.0219, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 3.1967451322290033, |
|
"grad_norm": 0.23426327109336853, |
|
"learning_rate": 1.997819965482787e-05, |
|
"loss": 0.0188, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.225806451612903, |
|
"grad_norm": 0.2814389765262604, |
|
"learning_rate": 1.9865350776528193e-05, |
|
"loss": 0.0188, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 3.225806451612903, |
|
"eval_loss": 0.034333910793066025, |
|
"eval_runtime": 27.346, |
|
"eval_samples_per_second": 447.341, |
|
"eval_steps_per_second": 14.006, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 3.2548677709968032, |
|
"grad_norm": 2.401367425918579, |
|
"learning_rate": 1.971234029531023e-05, |
|
"loss": 0.0265, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 3.2839290903807035, |
|
"grad_norm": 0.25739622116088867, |
|
"learning_rate": 1.9559329814092266e-05, |
|
"loss": 0.0218, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 3.3129904097646032, |
|
"grad_norm": 0.7228596210479736, |
|
"learning_rate": 1.9406319332874305e-05, |
|
"loss": 0.0208, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 3.3129904097646032, |
|
"eval_loss": 0.03501081466674805, |
|
"eval_runtime": 31.5795, |
|
"eval_samples_per_second": 387.372, |
|
"eval_steps_per_second": 12.128, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 3.3420517291485035, |
|
"grad_norm": 0.4601193964481354, |
|
"learning_rate": 1.925330885165634e-05, |
|
"loss": 0.0184, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.3711130485324032, |
|
"grad_norm": 0.033944468945264816, |
|
"learning_rate": 1.9100298370438376e-05, |
|
"loss": 0.0232, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 3.4001743679163035, |
|
"grad_norm": 5.643810749053955, |
|
"learning_rate": 1.8947287889220414e-05, |
|
"loss": 0.0193, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 3.4001743679163035, |
|
"eval_loss": 0.03553704172372818, |
|
"eval_runtime": 27.8198, |
|
"eval_samples_per_second": 439.723, |
|
"eval_steps_per_second": 13.767, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 3.4292356873002037, |
|
"grad_norm": 0.33075031638145447, |
|
"learning_rate": 1.879427740800245e-05, |
|
"loss": 0.0147, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 3.4582970066841034, |
|
"grad_norm": 0.36583471298217773, |
|
"learning_rate": 1.8641266926784488e-05, |
|
"loss": 0.0209, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 3.4873583260680037, |
|
"grad_norm": 0.2965396046638489, |
|
"learning_rate": 1.8488256445566524e-05, |
|
"loss": 0.028, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.4873583260680037, |
|
"eval_loss": 0.03441368043422699, |
|
"eval_runtime": 31.5965, |
|
"eval_samples_per_second": 387.163, |
|
"eval_steps_per_second": 12.122, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.5164196454519034, |
|
"grad_norm": 0.10767892003059387, |
|
"learning_rate": 1.833524596434856e-05, |
|
"loss": 0.0203, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 3.5454809648358037, |
|
"grad_norm": 0.4897698163986206, |
|
"learning_rate": 1.8182235483130597e-05, |
|
"loss": 0.0186, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 3.5745422842197034, |
|
"grad_norm": 0.7638312578201294, |
|
"learning_rate": 1.8029225001912633e-05, |
|
"loss": 0.0233, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 3.5745422842197034, |
|
"eval_loss": 0.0343441404402256, |
|
"eval_runtime": 27.0272, |
|
"eval_samples_per_second": 452.618, |
|
"eval_steps_per_second": 14.171, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 3.6036036036036037, |
|
"grad_norm": 2.574690341949463, |
|
"learning_rate": 1.7876214520694668e-05, |
|
"loss": 0.0231, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 3.6326649229875034, |
|
"grad_norm": 0.528499960899353, |
|
"learning_rate": 1.7723204039476703e-05, |
|
"loss": 0.022, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.6617262423714037, |
|
"grad_norm": 0.08761309087276459, |
|
"learning_rate": 1.7570193558258742e-05, |
|
"loss": 0.0232, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 3.6617262423714037, |
|
"eval_loss": 0.03451988101005554, |
|
"eval_runtime": 32.6592, |
|
"eval_samples_per_second": 374.565, |
|
"eval_steps_per_second": 11.727, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 3.690787561755304, |
|
"grad_norm": 0.10693353414535522, |
|
"learning_rate": 1.7417183077040777e-05, |
|
"loss": 0.0249, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 3.7198488811392036, |
|
"grad_norm": 0.23877334594726562, |
|
"learning_rate": 1.7264172595822816e-05, |
|
"loss": 0.0241, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 3.748910200523104, |
|
"grad_norm": 0.376647025346756, |
|
"learning_rate": 1.711116211460485e-05, |
|
"loss": 0.025, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 3.748910200523104, |
|
"eval_loss": 0.033673714846372604, |
|
"eval_runtime": 27.1988, |
|
"eval_samples_per_second": 449.763, |
|
"eval_steps_per_second": 14.082, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 3.7779715199070036, |
|
"grad_norm": 0.7081992030143738, |
|
"learning_rate": 1.6958151633386886e-05, |
|
"loss": 0.0182, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 3.807032839290904, |
|
"grad_norm": 0.9505396485328674, |
|
"learning_rate": 1.6805141152168925e-05, |
|
"loss": 0.0197, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 3.836094158674804, |
|
"grad_norm": 0.3097359836101532, |
|
"learning_rate": 1.665213067095096e-05, |
|
"loss": 0.0187, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 3.836094158674804, |
|
"eval_loss": 0.03151945769786835, |
|
"eval_runtime": 31.0646, |
|
"eval_samples_per_second": 393.793, |
|
"eval_steps_per_second": 12.329, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 3.865155478058704, |
|
"grad_norm": 2.482163667678833, |
|
"learning_rate": 1.6499120189733e-05, |
|
"loss": 0.0168, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 3.8942167974426036, |
|
"grad_norm": 0.2182529866695404, |
|
"learning_rate": 1.6346109708515034e-05, |
|
"loss": 0.0244, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 3.923278116826504, |
|
"grad_norm": 0.4611717462539673, |
|
"learning_rate": 1.619309922729707e-05, |
|
"loss": 0.0179, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.923278116826504, |
|
"eval_loss": 0.03170377016067505, |
|
"eval_runtime": 28.0835, |
|
"eval_samples_per_second": 435.594, |
|
"eval_steps_per_second": 13.638, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.952339436210404, |
|
"grad_norm": 2.3013105392456055, |
|
"learning_rate": 1.6040088746079108e-05, |
|
"loss": 0.019, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 3.981400755594304, |
|
"grad_norm": 1.6527272462844849, |
|
"learning_rate": 1.5887078264861143e-05, |
|
"loss": 0.0196, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 4.010462074978204, |
|
"grad_norm": 0.042673151940107346, |
|
"learning_rate": 1.5734067783643182e-05, |
|
"loss": 0.0162, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 4.010462074978204, |
|
"eval_loss": 0.03321225941181183, |
|
"eval_runtime": 30.968, |
|
"eval_samples_per_second": 395.021, |
|
"eval_steps_per_second": 12.368, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 4.039523394362104, |
|
"grad_norm": 1.3607178926467896, |
|
"learning_rate": 1.5581057302425217e-05, |
|
"loss": 0.0141, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 4.068584713746004, |
|
"grad_norm": 0.053625594824552536, |
|
"learning_rate": 1.5428046821207253e-05, |
|
"loss": 0.0172, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 4.097646033129904, |
|
"grad_norm": 1.0355161428451538, |
|
"learning_rate": 1.527503633998929e-05, |
|
"loss": 0.0173, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 4.097646033129904, |
|
"eval_loss": 0.032125264406204224, |
|
"eval_runtime": 29.8018, |
|
"eval_samples_per_second": 410.478, |
|
"eval_steps_per_second": 12.852, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 4.1267073525138045, |
|
"grad_norm": 0.1364974081516266, |
|
"learning_rate": 1.5122025858771328e-05, |
|
"loss": 0.0126, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 4.155768671897704, |
|
"grad_norm": 0.20887862145900726, |
|
"learning_rate": 1.4969015377553364e-05, |
|
"loss": 0.0113, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 4.184829991281604, |
|
"grad_norm": 0.16281986236572266, |
|
"learning_rate": 1.48160048963354e-05, |
|
"loss": 0.017, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 4.184829991281604, |
|
"eval_loss": 0.03163045644760132, |
|
"eval_runtime": 30.8438, |
|
"eval_samples_per_second": 396.611, |
|
"eval_steps_per_second": 12.417, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 4.213891310665504, |
|
"grad_norm": 0.06442072987556458, |
|
"learning_rate": 1.4662994415117438e-05, |
|
"loss": 0.0132, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 4.2429526300494045, |
|
"grad_norm": 0.49993807077407837, |
|
"learning_rate": 1.4509983933899474e-05, |
|
"loss": 0.0137, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 4.272013949433305, |
|
"grad_norm": 0.045188985764980316, |
|
"learning_rate": 1.4356973452681511e-05, |
|
"loss": 0.0153, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 4.272013949433305, |
|
"eval_loss": 0.031595949083566666, |
|
"eval_runtime": 30.8789, |
|
"eval_samples_per_second": 396.16, |
|
"eval_steps_per_second": 12.403, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 4.301075268817204, |
|
"grad_norm": 0.030895119532942772, |
|
"learning_rate": 1.4205493076275728e-05, |
|
"loss": 0.0141, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 4.330136588201104, |
|
"grad_norm": 1.5610733032226562, |
|
"learning_rate": 1.4052482595057764e-05, |
|
"loss": 0.0117, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 4.3591979075850045, |
|
"grad_norm": 1.8453948497772217, |
|
"learning_rate": 1.3899472113839798e-05, |
|
"loss": 0.0169, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 4.3591979075850045, |
|
"eval_loss": 0.03333577141165733, |
|
"eval_runtime": 30.1339, |
|
"eval_samples_per_second": 405.955, |
|
"eval_steps_per_second": 12.71, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 4.388259226968905, |
|
"grad_norm": 1.0826033353805542, |
|
"learning_rate": 1.3746461632621835e-05, |
|
"loss": 0.0169, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 4.417320546352804, |
|
"grad_norm": 0.9390700459480286, |
|
"learning_rate": 1.3593451151403872e-05, |
|
"loss": 0.0134, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 4.446381865736704, |
|
"grad_norm": 0.7679401636123657, |
|
"learning_rate": 1.3440440670185907e-05, |
|
"loss": 0.0178, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 4.446381865736704, |
|
"eval_loss": 0.030997373163700104, |
|
"eval_runtime": 31.2406, |
|
"eval_samples_per_second": 391.574, |
|
"eval_steps_per_second": 12.26, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 4.4754431851206045, |
|
"grad_norm": 1.1060469150543213, |
|
"learning_rate": 1.3287430188967944e-05, |
|
"loss": 0.0155, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 4.504504504504505, |
|
"grad_norm": 2.564753293991089, |
|
"learning_rate": 1.313594981256216e-05, |
|
"loss": 0.0178, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 4.533565823888405, |
|
"grad_norm": 0.15116174519062042, |
|
"learning_rate": 1.2982939331344197e-05, |
|
"loss": 0.0098, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 4.533565823888405, |
|
"eval_loss": 0.030752593651413918, |
|
"eval_runtime": 29.6271, |
|
"eval_samples_per_second": 412.899, |
|
"eval_steps_per_second": 12.927, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 4.562627143272304, |
|
"grad_norm": 0.29445090889930725, |
|
"learning_rate": 1.2829928850126234e-05, |
|
"loss": 0.0118, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 4.5916884626562045, |
|
"grad_norm": 0.19161444902420044, |
|
"learning_rate": 1.2676918368908271e-05, |
|
"loss": 0.0122, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 4.620749782040105, |
|
"grad_norm": 0.15567593276500702, |
|
"learning_rate": 1.2523907887690308e-05, |
|
"loss": 0.0138, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 4.620749782040105, |
|
"eval_loss": 0.030682172626256943, |
|
"eval_runtime": 32.042, |
|
"eval_samples_per_second": 381.78, |
|
"eval_steps_per_second": 11.953, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 4.649811101424005, |
|
"grad_norm": 0.9265925884246826, |
|
"learning_rate": 1.2370897406472343e-05, |
|
"loss": 0.0125, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 4.678872420807904, |
|
"grad_norm": 0.4911397695541382, |
|
"learning_rate": 1.221788692525438e-05, |
|
"loss": 0.0135, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 4.707933740191804, |
|
"grad_norm": 4.223952293395996, |
|
"learning_rate": 1.2064876444036417e-05, |
|
"loss": 0.0155, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 4.707933740191804, |
|
"eval_loss": 0.029881037771701813, |
|
"eval_runtime": 29.3642, |
|
"eval_samples_per_second": 416.595, |
|
"eval_steps_per_second": 13.043, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 4.736995059575705, |
|
"grad_norm": 0.14090295135974884, |
|
"learning_rate": 1.1911865962818454e-05, |
|
"loss": 0.0129, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 4.766056378959605, |
|
"grad_norm": 4.120369911193848, |
|
"learning_rate": 1.1758855481600491e-05, |
|
"loss": 0.0131, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 4.795117698343505, |
|
"grad_norm": 0.8263002634048462, |
|
"learning_rate": 1.1605845000382527e-05, |
|
"loss": 0.0114, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 4.795117698343505, |
|
"eval_loss": 0.029694661498069763, |
|
"eval_runtime": 31.2583, |
|
"eval_samples_per_second": 391.352, |
|
"eval_steps_per_second": 12.253, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 4.824179017727404, |
|
"grad_norm": 0.40641260147094727, |
|
"learning_rate": 1.1452834519164563e-05, |
|
"loss": 0.0131, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 4.853240337111305, |
|
"grad_norm": 0.818534791469574, |
|
"learning_rate": 1.12998240379466e-05, |
|
"loss": 0.0131, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 4.882301656495205, |
|
"grad_norm": 0.12895511090755463, |
|
"learning_rate": 1.1146813556728637e-05, |
|
"loss": 0.0144, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 4.882301656495205, |
|
"eval_loss": 0.02863692305982113, |
|
"eval_runtime": 28.8059, |
|
"eval_samples_per_second": 424.671, |
|
"eval_steps_per_second": 13.296, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 4.911362975879105, |
|
"grad_norm": 0.09267265349626541, |
|
"learning_rate": 1.0993803075510674e-05, |
|
"loss": 0.0116, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 4.940424295263005, |
|
"grad_norm": 0.10310707986354828, |
|
"learning_rate": 1.0840792594292711e-05, |
|
"loss": 0.0141, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 4.969485614646905, |
|
"grad_norm": 0.27060624957084656, |
|
"learning_rate": 1.0687782113074747e-05, |
|
"loss": 0.019, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 4.969485614646905, |
|
"eval_loss": 0.02940661646425724, |
|
"eval_runtime": 31.7266, |
|
"eval_samples_per_second": 385.575, |
|
"eval_steps_per_second": 12.072, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 4.998546934030805, |
|
"grad_norm": 0.17917053401470184, |
|
"learning_rate": 1.0534771631856784e-05, |
|
"loss": 0.0141, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 5.027608253414705, |
|
"grad_norm": 1.1561379432678223, |
|
"learning_rate": 1.038176115063882e-05, |
|
"loss": 0.0136, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 5.056669572798605, |
|
"grad_norm": 0.08390885591506958, |
|
"learning_rate": 1.0228750669420858e-05, |
|
"loss": 0.0123, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 5.056669572798605, |
|
"eval_loss": 0.027470601722598076, |
|
"eval_runtime": 29.8277, |
|
"eval_samples_per_second": 410.122, |
|
"eval_steps_per_second": 12.84, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 5.0857308921825055, |
|
"grad_norm": 1.385046362876892, |
|
"learning_rate": 1.0075740188202894e-05, |
|
"loss": 0.0124, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 5.114792211566405, |
|
"grad_norm": 0.07293292135000229, |
|
"learning_rate": 9.92272970698493e-06, |
|
"loss": 0.0088, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 5.143853530950305, |
|
"grad_norm": 0.14433690905570984, |
|
"learning_rate": 9.769719225766967e-06, |
|
"loss": 0.0079, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 5.143853530950305, |
|
"eval_loss": 0.028321975842118263, |
|
"eval_runtime": 31.9063, |
|
"eval_samples_per_second": 383.404, |
|
"eval_steps_per_second": 12.004, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 5.172914850334205, |
|
"grad_norm": 0.06696674972772598, |
|
"learning_rate": 9.616708744549002e-06, |
|
"loss": 0.0097, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 5.2019761697181055, |
|
"grad_norm": 0.5811833739280701, |
|
"learning_rate": 9.463698263331039e-06, |
|
"loss": 0.0085, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 5.231037489102005, |
|
"grad_norm": 0.05103592202067375, |
|
"learning_rate": 9.310687782113076e-06, |
|
"loss": 0.0159, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 5.231037489102005, |
|
"eval_loss": 0.029669631272554398, |
|
"eval_runtime": 28.3217, |
|
"eval_samples_per_second": 431.931, |
|
"eval_steps_per_second": 13.523, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 5.260098808485905, |
|
"grad_norm": 0.31854337453842163, |
|
"learning_rate": 9.157677300895113e-06, |
|
"loss": 0.0092, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 5.289160127869805, |
|
"grad_norm": 0.05867089703679085, |
|
"learning_rate": 9.00466681967715e-06, |
|
"loss": 0.0168, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 5.3182214472537055, |
|
"grad_norm": 0.01062140055000782, |
|
"learning_rate": 8.851656338459185e-06, |
|
"loss": 0.01, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 5.3182214472537055, |
|
"eval_loss": 0.02898811176419258, |
|
"eval_runtime": 30.964, |
|
"eval_samples_per_second": 395.071, |
|
"eval_steps_per_second": 12.369, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 5.347282766637606, |
|
"grad_norm": 0.23528796434402466, |
|
"learning_rate": 8.698645857241222e-06, |
|
"loss": 0.0086, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 5.376344086021505, |
|
"grad_norm": 0.7098844647407532, |
|
"learning_rate": 8.545635376023257e-06, |
|
"loss": 0.0083, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 5.405405405405405, |
|
"grad_norm": 0.0675228089094162, |
|
"learning_rate": 8.392624894805294e-06, |
|
"loss": 0.0127, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 5.405405405405405, |
|
"eval_loss": 0.028332242742180824, |
|
"eval_runtime": 28.1702, |
|
"eval_samples_per_second": 434.253, |
|
"eval_steps_per_second": 13.596, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 5.4344667247893055, |
|
"grad_norm": 0.8665882349014282, |
|
"learning_rate": 8.239614413587331e-06, |
|
"loss": 0.0107, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 5.463528044173206, |
|
"grad_norm": 0.13911496102809906, |
|
"learning_rate": 8.086603932369368e-06, |
|
"loss": 0.0121, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 5.492589363557105, |
|
"grad_norm": 0.08510652929544449, |
|
"learning_rate": 7.935123555963584e-06, |
|
"loss": 0.0098, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 5.492589363557105, |
|
"eval_loss": 0.02792692370712757, |
|
"eval_runtime": 30.8767, |
|
"eval_samples_per_second": 396.189, |
|
"eval_steps_per_second": 12.404, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 5.521650682941005, |
|
"grad_norm": 0.048594117164611816, |
|
"learning_rate": 7.782113074745621e-06, |
|
"loss": 0.014, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 5.5507120023249055, |
|
"grad_norm": 0.09377363324165344, |
|
"learning_rate": 7.629102593527657e-06, |
|
"loss": 0.0114, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 5.579773321708806, |
|
"grad_norm": 0.033514928072690964, |
|
"learning_rate": 7.476092112309694e-06, |
|
"loss": 0.012, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 5.579773321708806, |
|
"eval_loss": 0.027103085070848465, |
|
"eval_runtime": 27.6915, |
|
"eval_samples_per_second": 441.76, |
|
"eval_steps_per_second": 13.831, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 5.608834641092706, |
|
"grad_norm": 0.8264727592468262, |
|
"learning_rate": 7.3230816310917305e-06, |
|
"loss": 0.0105, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 5.637895960476605, |
|
"grad_norm": 2.6640799045562744, |
|
"learning_rate": 7.1700711498737675e-06, |
|
"loss": 0.0142, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 5.6669572798605055, |
|
"grad_norm": 0.03503794968128204, |
|
"learning_rate": 7.017060668655804e-06, |
|
"loss": 0.0096, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 5.6669572798605055, |
|
"eval_loss": 0.02664261683821678, |
|
"eval_runtime": 29.6973, |
|
"eval_samples_per_second": 411.923, |
|
"eval_steps_per_second": 12.897, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 5.696018599244406, |
|
"grad_norm": 0.022000476717948914, |
|
"learning_rate": 6.86405018743784e-06, |
|
"loss": 0.0113, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 5.725079918628306, |
|
"grad_norm": 0.03974687680602074, |
|
"learning_rate": 6.711039706219876e-06, |
|
"loss": 0.0119, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 5.754141238012206, |
|
"grad_norm": 0.0372069887816906, |
|
"learning_rate": 6.558029225001913e-06, |
|
"loss": 0.0142, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 5.754141238012206, |
|
"eval_loss": 0.027487969025969505, |
|
"eval_runtime": 27.3827, |
|
"eval_samples_per_second": 446.742, |
|
"eval_steps_per_second": 13.987, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 5.7832025573961054, |
|
"grad_norm": 0.24150213599205017, |
|
"learning_rate": 6.40501874378395e-06, |
|
"loss": 0.0097, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 5.812263876780006, |
|
"grad_norm": 1.2151767015457153, |
|
"learning_rate": 6.252008262565986e-06, |
|
"loss": 0.008, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 5.841325196163906, |
|
"grad_norm": 0.12997518479824066, |
|
"learning_rate": 6.098997781348023e-06, |
|
"loss": 0.0103, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 5.841325196163906, |
|
"eval_loss": 0.027195250615477562, |
|
"eval_runtime": 30.0975, |
|
"eval_samples_per_second": 406.446, |
|
"eval_steps_per_second": 12.725, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 5.870386515547806, |
|
"grad_norm": 1.2384765148162842, |
|
"learning_rate": 5.94598730013006e-06, |
|
"loss": 0.0115, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 5.899447834931706, |
|
"grad_norm": 0.46222779154777527, |
|
"learning_rate": 5.792976818912096e-06, |
|
"loss": 0.0107, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 5.928509154315606, |
|
"grad_norm": 0.08042553812265396, |
|
"learning_rate": 5.639966337694133e-06, |
|
"loss": 0.0089, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 5.928509154315606, |
|
"eval_loss": 0.02774212509393692, |
|
"eval_runtime": 27.6569, |
|
"eval_samples_per_second": 442.313, |
|
"eval_steps_per_second": 13.848, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 5.957570473699506, |
|
"grad_norm": 0.01730126515030861, |
|
"learning_rate": 5.486955856476169e-06, |
|
"loss": 0.0114, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 5.986631793083406, |
|
"grad_norm": 0.11168185621500015, |
|
"learning_rate": 5.333945375258206e-06, |
|
"loss": 0.0097, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 6.015693112467306, |
|
"grad_norm": 0.294203519821167, |
|
"learning_rate": 5.180934894040243e-06, |
|
"loss": 0.0098, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 6.015693112467306, |
|
"eval_loss": 0.027102118358016014, |
|
"eval_runtime": 30.4406, |
|
"eval_samples_per_second": 401.864, |
|
"eval_steps_per_second": 12.582, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 6.044754431851206, |
|
"grad_norm": 0.7714498043060303, |
|
"learning_rate": 5.027924412822279e-06, |
|
"loss": 0.012, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 6.073815751235106, |
|
"grad_norm": 0.008903966285288334, |
|
"learning_rate": 4.874913931604315e-06, |
|
"loss": 0.0096, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 6.102877070619006, |
|
"grad_norm": 0.07710820436477661, |
|
"learning_rate": 4.721903450386352e-06, |
|
"loss": 0.0081, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 6.102877070619006, |
|
"eval_loss": 0.027398647740483284, |
|
"eval_runtime": 27.3332, |
|
"eval_samples_per_second": 447.551, |
|
"eval_steps_per_second": 14.012, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 6.131938390002906, |
|
"grad_norm": 0.07280821353197098, |
|
"learning_rate": 4.568892969168388e-06, |
|
"loss": 0.007, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 6.1609997093868065, |
|
"grad_norm": 0.15252958238124847, |
|
"learning_rate": 4.415882487950424e-06, |
|
"loss": 0.0086, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 6.190061028770706, |
|
"grad_norm": 0.058942727744579315, |
|
"learning_rate": 4.262872006732461e-06, |
|
"loss": 0.0077, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 6.190061028770706, |
|
"eval_loss": 0.027552621439099312, |
|
"eval_runtime": 31.1781, |
|
"eval_samples_per_second": 392.358, |
|
"eval_steps_per_second": 12.284, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 6.219122348154606, |
|
"grad_norm": 0.10249049216508865, |
|
"learning_rate": 4.109861525514498e-06, |
|
"loss": 0.0096, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 6.248183667538506, |
|
"grad_norm": 0.0850997343659401, |
|
"learning_rate": 3.956851044296534e-06, |
|
"loss": 0.0071, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 6.2772449869224065, |
|
"grad_norm": 0.39275994896888733, |
|
"learning_rate": 3.8038405630785714e-06, |
|
"loss": 0.0084, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 6.2772449869224065, |
|
"eval_loss": 0.027445398271083832, |
|
"eval_runtime": 27.3905, |
|
"eval_samples_per_second": 446.615, |
|
"eval_steps_per_second": 13.983, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 6.306306306306306, |
|
"grad_norm": 0.09213391691446304, |
|
"learning_rate": 3.650830081860608e-06, |
|
"loss": 0.0105, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 6.335367625690206, |
|
"grad_norm": 0.012103458866477013, |
|
"learning_rate": 3.4993497054548235e-06, |
|
"loss": 0.0089, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 6.364428945074106, |
|
"grad_norm": 1.7496978044509888, |
|
"learning_rate": 3.3463392242368605e-06, |
|
"loss": 0.0112, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 6.364428945074106, |
|
"eval_loss": 0.027640603482723236, |
|
"eval_runtime": 31.8396, |
|
"eval_samples_per_second": 384.207, |
|
"eval_steps_per_second": 12.029, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 6.3934902644580065, |
|
"grad_norm": 0.04138137400150299, |
|
"learning_rate": 3.193328743018897e-06, |
|
"loss": 0.0095, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 6.422551583841907, |
|
"grad_norm": 0.08476943522691727, |
|
"learning_rate": 3.0403182618009336e-06, |
|
"loss": 0.0081, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 6.451612903225806, |
|
"grad_norm": 2.349595785140991, |
|
"learning_rate": 2.88730778058297e-06, |
|
"loss": 0.0085, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 6.451612903225806, |
|
"eval_loss": 0.027077894657850266, |
|
"eval_runtime": 28.0475, |
|
"eval_samples_per_second": 436.153, |
|
"eval_steps_per_second": 13.655, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 6.480674222609706, |
|
"grad_norm": 1.728858232498169, |
|
"learning_rate": 2.7342972993650067e-06, |
|
"loss": 0.0113, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 6.5097355419936065, |
|
"grad_norm": 0.10394562035799026, |
|
"learning_rate": 2.5812868181470436e-06, |
|
"loss": 0.0088, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 6.538796861377507, |
|
"grad_norm": 0.07329772412776947, |
|
"learning_rate": 2.4282763369290798e-06, |
|
"loss": 0.0094, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 6.538796861377507, |
|
"eval_loss": 0.02670624852180481, |
|
"eval_runtime": 31.4268, |
|
"eval_samples_per_second": 389.254, |
|
"eval_steps_per_second": 12.187, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 6.567858180761407, |
|
"grad_norm": 0.09306680411100388, |
|
"learning_rate": 2.2752658557111167e-06, |
|
"loss": 0.0073, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 6.596919500145306, |
|
"grad_norm": 0.061711717396974564, |
|
"learning_rate": 2.122255374493153e-06, |
|
"loss": 0.0075, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 6.6259808195292065, |
|
"grad_norm": 1.4829280376434326, |
|
"learning_rate": 1.9692448932751894e-06, |
|
"loss": 0.0078, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 6.6259808195292065, |
|
"eval_loss": 0.02658041939139366, |
|
"eval_runtime": 27.5526, |
|
"eval_samples_per_second": 443.986, |
|
"eval_steps_per_second": 13.901, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 6.655042138913107, |
|
"grad_norm": 0.017336523160338402, |
|
"learning_rate": 1.8162344120572261e-06, |
|
"loss": 0.0108, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 6.684103458297007, |
|
"grad_norm": 0.16300371289253235, |
|
"learning_rate": 1.6632239308392625e-06, |
|
"loss": 0.0125, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 6.713164777680907, |
|
"grad_norm": 0.4278203845024109, |
|
"learning_rate": 1.5102134496212992e-06, |
|
"loss": 0.0099, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 6.713164777680907, |
|
"eval_loss": 0.02630489505827427, |
|
"eval_runtime": 31.2321, |
|
"eval_samples_per_second": 391.681, |
|
"eval_steps_per_second": 12.263, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 6.7422260970648065, |
|
"grad_norm": 0.5904819965362549, |
|
"learning_rate": 1.3572029684033358e-06, |
|
"loss": 0.0087, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 6.771287416448707, |
|
"grad_norm": 1.0529608726501465, |
|
"learning_rate": 1.2041924871853723e-06, |
|
"loss": 0.0078, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 6.800348735832607, |
|
"grad_norm": 0.13827526569366455, |
|
"learning_rate": 1.0511820059674089e-06, |
|
"loss": 0.0113, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 6.800348735832607, |
|
"eval_loss": 0.026266321539878845, |
|
"eval_runtime": 27.2864, |
|
"eval_samples_per_second": 448.319, |
|
"eval_steps_per_second": 14.036, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 6.829410055216507, |
|
"grad_norm": 0.8667532801628113, |
|
"learning_rate": 8.981715247494455e-07, |
|
"loss": 0.0097, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 6.858471374600407, |
|
"grad_norm": 0.045186400413513184, |
|
"learning_rate": 7.45161043531482e-07, |
|
"loss": 0.0066, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 6.887532693984307, |
|
"grad_norm": 0.05325442925095558, |
|
"learning_rate": 5.921505623135185e-07, |
|
"loss": 0.0053, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 6.887532693984307, |
|
"eval_loss": 0.02622653916478157, |
|
"eval_runtime": 30.4773, |
|
"eval_samples_per_second": 401.381, |
|
"eval_steps_per_second": 12.567, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 6.916594013368207, |
|
"grad_norm": 0.22071638703346252, |
|
"learning_rate": 4.3914008109555514e-07, |
|
"loss": 0.0095, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 6.945655332752107, |
|
"grad_norm": 0.026897892355918884, |
|
"learning_rate": 2.8612959987759163e-07, |
|
"loss": 0.0067, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 6.974716652136007, |
|
"grad_norm": 1.3487067222595215, |
|
"learning_rate": 1.331191186596282e-07, |
|
"loss": 0.01, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 6.974716652136007, |
|
"eval_loss": 0.026174582540988922, |
|
"eval_runtime": 27.261, |
|
"eval_samples_per_second": 448.736, |
|
"eval_steps_per_second": 14.049, |
|
"step": 24000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 24087, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|