|
{ |
|
"best_metric": 1.4004004001617432, |
|
"best_model_checkpoint": "finetuned_student_model/checkpoint-900", |
|
"epoch": 1.9969604863221884, |
|
"eval_steps": 100, |
|
"global_step": 904, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.022105554020447636, |
|
"grad_norm": 9.875, |
|
"learning_rate": 0.0002973392461197339, |
|
"loss": 3.2514, |
|
"mean_token_accuracy": 0.4204087435267866, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04421110804089527, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 0.0002940133037694013, |
|
"loss": 2.3718, |
|
"mean_token_accuracy": 0.5071762848645449, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06631666206134292, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 0.0002906873614190687, |
|
"loss": 2.1087, |
|
"mean_token_accuracy": 0.5472177878022194, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08842221608179054, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 0.0002873614190687361, |
|
"loss": 2.0888, |
|
"mean_token_accuracy": 0.5551092017441988, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11052777010223819, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 0.0002840354767184035, |
|
"loss": 2.1327, |
|
"mean_token_accuracy": 0.5446394145488739, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13263332412268583, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 0.0002807095343680709, |
|
"loss": 2.0511, |
|
"mean_token_accuracy": 0.5573025777935982, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15473887814313347, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 0.0002773835920177383, |
|
"loss": 2.0515, |
|
"mean_token_accuracy": 0.5619197305291891, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1768444321635811, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 0.0002740576496674058, |
|
"loss": 1.9854, |
|
"mean_token_accuracy": 0.5682128138840199, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.19894998618402873, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 0.00027073170731707315, |
|
"loss": 1.9689, |
|
"mean_token_accuracy": 0.5708833433687687, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.22105554020447638, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 0.0002674057649667406, |
|
"loss": 1.9959, |
|
"mean_token_accuracy": 0.5728900354355574, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.22105554020447638, |
|
"eval_loss": 1.9453095197677612, |
|
"eval_mean_token_accuracy": 0.5785076600132566, |
|
"eval_runtime": 23.0365, |
|
"eval_samples_per_second": 34.293, |
|
"eval_steps_per_second": 4.298, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.24316109422492402, |
|
"grad_norm": 2.5, |
|
"learning_rate": 0.00026407982261640795, |
|
"loss": 1.967, |
|
"mean_token_accuracy": 0.5708974912762642, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.26526664824537166, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 0.0002607538802660754, |
|
"loss": 1.9176, |
|
"mean_token_accuracy": 0.5783881578594446, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2873722022658193, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 0.00025742793791574275, |
|
"loss": 1.8481, |
|
"mean_token_accuracy": 0.5897452015429735, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.30947775628626695, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 0.0002541019955654102, |
|
"loss": 1.8706, |
|
"mean_token_accuracy": 0.5862758502364158, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.33158331030671456, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 0.0002507760532150776, |
|
"loss": 1.8667, |
|
"mean_token_accuracy": 0.5883878566324711, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3536888643271622, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 0.000247450110864745, |
|
"loss": 1.8101, |
|
"mean_token_accuracy": 0.5959903877228498, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.37579441834760985, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 0.00024412416851441238, |
|
"loss": 1.8353, |
|
"mean_token_accuracy": 0.5912164930254221, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.39789997236805746, |
|
"grad_norm": 2.25, |
|
"learning_rate": 0.00024079822616407978, |
|
"loss": 1.7821, |
|
"mean_token_accuracy": 0.6001696892082691, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.42000552638850513, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 0.0002374722838137472, |
|
"loss": 1.8351, |
|
"mean_token_accuracy": 0.5922294020652771, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.44211108040895275, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 0.0002341463414634146, |
|
"loss": 1.7342, |
|
"mean_token_accuracy": 0.6008941765874625, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.44211108040895275, |
|
"eval_loss": 1.7812447547912598, |
|
"eval_mean_token_accuracy": 0.6040364732645979, |
|
"eval_runtime": 21.5006, |
|
"eval_samples_per_second": 36.743, |
|
"eval_steps_per_second": 4.605, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.46421663442940037, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.000230820399113082, |
|
"loss": 1.7778, |
|
"mean_token_accuracy": 0.6000470589846373, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.48632218844984804, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 0.0002274944567627494, |
|
"loss": 1.7515, |
|
"mean_token_accuracy": 0.6070282235741615, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5084277424702957, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 0.0002241685144124168, |
|
"loss": 1.7348, |
|
"mean_token_accuracy": 0.6033936321735383, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5305332964907433, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 0.0002208425720620842, |
|
"loss": 1.7198, |
|
"mean_token_accuracy": 0.6069234035909176, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5526388505111909, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 0.00021751662971175166, |
|
"loss": 1.6866, |
|
"mean_token_accuracy": 0.6128835912793875, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5747444045316386, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 0.00021419068736141907, |
|
"loss": 1.6751, |
|
"mean_token_accuracy": 0.6139085631817579, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5968499585520862, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 0.00021086474501108647, |
|
"loss": 1.7237, |
|
"mean_token_accuracy": 0.6062099590897561, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6189555125725339, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 0.00020753880266075387, |
|
"loss": 1.6894, |
|
"mean_token_accuracy": 0.6122250266373157, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6410610665929815, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.00020421286031042127, |
|
"loss": 1.6325, |
|
"mean_token_accuracy": 0.6204121351242066, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6631666206134291, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 0.0002008869179600887, |
|
"loss": 1.6668, |
|
"mean_token_accuracy": 0.615133136883378, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6631666206134291, |
|
"eval_loss": 1.6475698947906494, |
|
"eval_mean_token_accuracy": 0.6219914049813242, |
|
"eval_runtime": 21.4767, |
|
"eval_samples_per_second": 36.784, |
|
"eval_steps_per_second": 4.61, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6852721746338768, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.0001975609756097561, |
|
"loss": 1.614, |
|
"mean_token_accuracy": 0.6179928559809923, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7073777286543244, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 0.0001942350332594235, |
|
"loss": 1.7166, |
|
"mean_token_accuracy": 0.6136065050959587, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.729483282674772, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 0.0001909090909090909, |
|
"loss": 1.657, |
|
"mean_token_accuracy": 0.6190799050033092, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.7515888366952197, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 0.0001875831485587583, |
|
"loss": 1.6038, |
|
"mean_token_accuracy": 0.6287192285060883, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7736943907156673, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 0.0001842572062084257, |
|
"loss": 1.5914, |
|
"mean_token_accuracy": 0.6284744247794152, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7957999447361149, |
|
"grad_norm": 2.0, |
|
"learning_rate": 0.00018093126385809312, |
|
"loss": 1.6075, |
|
"mean_token_accuracy": 0.6283752031624317, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8179054987565626, |
|
"grad_norm": 2.0, |
|
"learning_rate": 0.00017760532150776052, |
|
"loss": 1.6082, |
|
"mean_token_accuracy": 0.6261149801313877, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.8400110527770103, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 0.00017427937915742792, |
|
"loss": 1.5879, |
|
"mean_token_accuracy": 0.6261836618185044, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8621166067974578, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 0.00017095343680709532, |
|
"loss": 1.5785, |
|
"mean_token_accuracy": 0.631008780002594, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8842221608179055, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.00016762749445676272, |
|
"loss": 1.4931, |
|
"mean_token_accuracy": 0.6441986732184887, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8842221608179055, |
|
"eval_loss": 1.5408236980438232, |
|
"eval_mean_token_accuracy": 0.6406004489070237, |
|
"eval_runtime": 21.4648, |
|
"eval_samples_per_second": 36.804, |
|
"eval_steps_per_second": 4.612, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9063277148383532, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 0.00016430155210643015, |
|
"loss": 1.5441, |
|
"mean_token_accuracy": 0.6400343291461468, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9284332688588007, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 0.00016097560975609755, |
|
"loss": 1.5299, |
|
"mean_token_accuracy": 0.6389802560210228, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.9505388228792484, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 0.00015764966740576495, |
|
"loss": 1.524, |
|
"mean_token_accuracy": 0.6369420018047094, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.9726443768996961, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.00015432372505543235, |
|
"loss": 1.5423, |
|
"mean_token_accuracy": 0.6327314972877502, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9947499309201436, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.00015099778270509975, |
|
"loss": 1.5033, |
|
"mean_token_accuracy": 0.6431983485817909, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.0154738878143132, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 0.00014767184035476718, |
|
"loss": 1.2191, |
|
"mean_token_accuracy": 0.6965674503644308, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.037579441834761, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 0.00014434589800443458, |
|
"loss": 1.0348, |
|
"mean_token_accuracy": 0.7245359934866429, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.0596849958552086, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.00014101995565410198, |
|
"loss": 1.0563, |
|
"mean_token_accuracy": 0.7208410792052746, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.0817905498756564, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.00013769401330376938, |
|
"loss": 1.0467, |
|
"mean_token_accuracy": 0.7243687815964222, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.103896103896104, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.00013436807095343678, |
|
"loss": 1.048, |
|
"mean_token_accuracy": 0.718181136995554, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.103896103896104, |
|
"eval_loss": 1.5222158432006836, |
|
"eval_mean_token_accuracy": 0.6474175338793282, |
|
"eval_runtime": 21.4789, |
|
"eval_samples_per_second": 36.78, |
|
"eval_steps_per_second": 4.609, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.1260016579165515, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.0001310421286031042, |
|
"loss": 1.0292, |
|
"mean_token_accuracy": 0.724801865965128, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.148107211936999, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.0001277161862527716, |
|
"loss": 1.0431, |
|
"mean_token_accuracy": 0.7216434337198734, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.1702127659574468, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 0.000124390243902439, |
|
"loss": 1.0261, |
|
"mean_token_accuracy": 0.7233444675803185, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.1923183199778944, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.00012106430155210642, |
|
"loss": 1.0001, |
|
"mean_token_accuracy": 0.7310704313218593, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.2144238739983422, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 0.00011773835920177382, |
|
"loss": 0.9996, |
|
"mean_token_accuracy": 0.7315604917705059, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.2365294280187897, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 0.00011441241685144122, |
|
"loss": 0.9732, |
|
"mean_token_accuracy": 0.7363451808691025, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.2586349820392373, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 0.00011108647450110864, |
|
"loss": 1.0185, |
|
"mean_token_accuracy": 0.7294765569269657, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.280740536059685, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 0.00010776053215077604, |
|
"loss": 0.9948, |
|
"mean_token_accuracy": 0.7329611636698246, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.3028460900801326, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 0.00010443458980044345, |
|
"loss": 0.9808, |
|
"mean_token_accuracy": 0.7348393484950065, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.3249516441005802, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 0.00010110864745011085, |
|
"loss": 0.9809, |
|
"mean_token_accuracy": 0.7370587438344955, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3249516441005802, |
|
"eval_loss": 1.4869917631149292, |
|
"eval_mean_token_accuracy": 0.6575891068487456, |
|
"eval_runtime": 21.4699, |
|
"eval_samples_per_second": 36.796, |
|
"eval_steps_per_second": 4.611, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.347057198121028, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 9.778270509977825e-05, |
|
"loss": 0.9891, |
|
"mean_token_accuracy": 0.7354318417608738, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.3691627521414755, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 9.445676274944568e-05, |
|
"loss": 0.9653, |
|
"mean_token_accuracy": 0.7393273778259755, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.391268306161923, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 9.113082039911308e-05, |
|
"loss": 0.9552, |
|
"mean_token_accuracy": 0.7432896822690964, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.4133738601823709, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 8.780487804878047e-05, |
|
"loss": 0.9713, |
|
"mean_token_accuracy": 0.7419402815401555, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.4354794142028184, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 8.44789356984479e-05, |
|
"loss": 0.9548, |
|
"mean_token_accuracy": 0.7421382986009121, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.4575849682232662, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 8.11529933481153e-05, |
|
"loss": 0.9824, |
|
"mean_token_accuracy": 0.737388264387846, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.4796905222437138, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 7.78270509977827e-05, |
|
"loss": 0.937, |
|
"mean_token_accuracy": 0.7477126508951187, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.5017960762641613, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 7.450110864745011e-05, |
|
"loss": 0.9433, |
|
"mean_token_accuracy": 0.744731155782938, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.523901630284609, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 7.117516629711751e-05, |
|
"loss": 0.9259, |
|
"mean_token_accuracy": 0.745546705648303, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.5460071843050567, |
|
"grad_norm": 1.625, |
|
"learning_rate": 6.784922394678491e-05, |
|
"loss": 0.9625, |
|
"mean_token_accuracy": 0.7406142510473728, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5460071843050567, |
|
"eval_loss": 1.429708480834961, |
|
"eval_mean_token_accuracy": 0.6674101424939705, |
|
"eval_runtime": 21.4679, |
|
"eval_samples_per_second": 36.799, |
|
"eval_steps_per_second": 4.612, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5681127383255042, |
|
"grad_norm": 1.75, |
|
"learning_rate": 6.452328159645232e-05, |
|
"loss": 0.9285, |
|
"mean_token_accuracy": 0.7476192332804203, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.590218292345952, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 6.119733924611973e-05, |
|
"loss": 0.9173, |
|
"mean_token_accuracy": 0.75327173396945, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.6123238463663996, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 5.787139689578713e-05, |
|
"loss": 0.9392, |
|
"mean_token_accuracy": 0.7470508739352226, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.6344294003868471, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 5.454545454545454e-05, |
|
"loss": 0.9148, |
|
"mean_token_accuracy": 0.7503846064209938, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.6565349544072947, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 5.121951219512195e-05, |
|
"loss": 0.9286, |
|
"mean_token_accuracy": 0.749035281687975, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.6786405084277425, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.7893569844789354e-05, |
|
"loss": 0.9312, |
|
"mean_token_accuracy": 0.748369749635458, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.7007460624481903, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.456762749445676e-05, |
|
"loss": 0.9585, |
|
"mean_token_accuracy": 0.742963894456625, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.7228516164686378, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.124168514412417e-05, |
|
"loss": 1.0002, |
|
"mean_token_accuracy": 0.7539634991437196, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.7449571704890854, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.791574279379157e-05, |
|
"loss": 0.93, |
|
"mean_token_accuracy": 0.7481286890804768, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.767062724509533, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.4589800443458975e-05, |
|
"loss": 0.9119, |
|
"mean_token_accuracy": 0.7515073113143445, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.767062724509533, |
|
"eval_loss": 1.4062376022338867, |
|
"eval_mean_token_accuracy": 0.6735276628022242, |
|
"eval_runtime": 21.4799, |
|
"eval_samples_per_second": 36.779, |
|
"eval_steps_per_second": 4.609, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7891682785299805, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.126385809312638e-05, |
|
"loss": 0.9018, |
|
"mean_token_accuracy": 0.7531212449073792, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.8112738325504283, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.793791574279379e-05, |
|
"loss": 0.9048, |
|
"mean_token_accuracy": 0.7560021050274373, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.833379386570876, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.4611973392461197e-05, |
|
"loss": 0.9028, |
|
"mean_token_accuracy": 0.7558302395045757, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.8554849405913236, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 2.12860310421286e-05, |
|
"loss": 0.9158, |
|
"mean_token_accuracy": 0.7508154392242432, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.8775904946117712, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.7960088691796008e-05, |
|
"loss": 0.8864, |
|
"mean_token_accuracy": 0.7576524101197719, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.8996960486322187, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.4634146341463413e-05, |
|
"loss": 0.9228, |
|
"mean_token_accuracy": 0.7508851245045662, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.9218016026526665, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 1.130820399113082e-05, |
|
"loss": 0.8975, |
|
"mean_token_accuracy": 0.7541297495365142, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.943907156673114, |
|
"grad_norm": 1.75, |
|
"learning_rate": 7.982261640798226e-06, |
|
"loss": 0.8971, |
|
"mean_token_accuracy": 0.7586074694991112, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.9660127106935619, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 4.656319290465632e-06, |
|
"loss": 0.9251, |
|
"mean_token_accuracy": 0.7490280956029892, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.9881182647140094, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.3303769401330375e-06, |
|
"loss": 0.9005, |
|
"mean_token_accuracy": 0.7555492661893368, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.9881182647140094, |
|
"eval_loss": 1.4004004001617432, |
|
"eval_mean_token_accuracy": 0.6744482210188201, |
|
"eval_runtime": 21.4879, |
|
"eval_samples_per_second": 36.765, |
|
"eval_steps_per_second": 4.607, |
|
"step": 900 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 904, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7777625251882496e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|