|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9223300970873787,
  "eval_steps": 500,
  "global_step": 153,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019417475728155338,
      "grad_norm": 6.841878890991211,
      "learning_rate": 2e-05,
      "loss": 4.5864,
      "step": 1
    },
    {
      "epoch": 0.038834951456310676,
      "grad_norm": 6.640402793884277,
      "learning_rate": 4e-05,
      "loss": 4.6138,
      "step": 2
    },
    {
      "epoch": 0.05825242718446602,
      "grad_norm": 6.41209077835083,
      "learning_rate": 6e-05,
      "loss": 4.6064,
      "step": 3
    },
    {
      "epoch": 0.07766990291262135,
      "grad_norm": 6.082383632659912,
      "learning_rate": 8e-05,
      "loss": 4.5534,
      "step": 4
    },
    {
      "epoch": 0.0970873786407767,
      "grad_norm": 4.00201416015625,
      "learning_rate": 0.0001,
      "loss": 4.3245,
      "step": 5
    },
    {
      "epoch": 0.11650485436893204,
      "grad_norm": 3.1707470417022705,
      "learning_rate": 0.00012,
      "loss": 4.0971,
      "step": 6
    },
    {
      "epoch": 0.13592233009708737,
      "grad_norm": 3.160670518875122,
      "learning_rate": 0.00014,
      "loss": 3.789,
      "step": 7
    },
    {
      "epoch": 0.1553398058252427,
      "grad_norm": 2.994746208190918,
      "learning_rate": 0.00016,
      "loss": 3.3244,
      "step": 8
    },
    {
      "epoch": 0.17475728155339806,
      "grad_norm": 2.6664786338806152,
      "learning_rate": 0.00018,
      "loss": 2.8561,
      "step": 9
    },
    {
      "epoch": 0.1941747572815534,
      "grad_norm": 2.029910087585449,
      "learning_rate": 0.0002,
      "loss": 2.3926,
      "step": 10
    },
    {
      "epoch": 0.21359223300970873,
      "grad_norm": 1.3144137859344482,
      "learning_rate": 0.00019998688836656323,
      "loss": 2.0283,
      "step": 11
    },
    {
      "epoch": 0.23300970873786409,
      "grad_norm": 0.8263471722602844,
      "learning_rate": 0.00019994755690455152,
      "loss": 1.8683,
      "step": 12
    },
    {
      "epoch": 0.2524271844660194,
      "grad_norm": 0.7193414568901062,
      "learning_rate": 0.0001998820159279591,
      "loss": 1.732,
      "step": 13
    },
    {
      "epoch": 0.27184466019417475,
      "grad_norm": 0.8350910544395447,
      "learning_rate": 0.00019979028262377118,
      "loss": 1.6208,
      "step": 14
    },
    {
      "epoch": 0.2912621359223301,
      "grad_norm": 0.6700139045715332,
      "learning_rate": 0.00019967238104745696,
      "loss": 1.493,
      "step": 15
    },
    {
      "epoch": 0.3106796116504854,
      "grad_norm": 1.1213518381118774,
      "learning_rate": 0.0001995283421166614,
      "loss": 1.4857,
      "step": 16
    },
    {
      "epoch": 0.3300970873786408,
      "grad_norm": 0.7505896687507629,
      "learning_rate": 0.00019935820360309777,
      "loss": 1.4643,
      "step": 17
    },
    {
      "epoch": 0.34951456310679613,
      "grad_norm": 0.6984190940856934,
      "learning_rate": 0.00019916201012264254,
      "loss": 1.4227,
      "step": 18
    },
    {
      "epoch": 0.36893203883495146,
      "grad_norm": 0.46819525957107544,
      "learning_rate": 0.00019893981312363562,
      "loss": 1.3889,
      "step": 19
    },
    {
      "epoch": 0.3883495145631068,
      "grad_norm": 0.5214670300483704,
      "learning_rate": 0.00019869167087338907,
      "loss": 1.3764,
      "step": 20
    },
    {
      "epoch": 0.4077669902912621,
      "grad_norm": 0.43329402804374695,
      "learning_rate": 0.00019841764844290744,
      "loss": 1.351,
      "step": 21
    },
    {
      "epoch": 0.42718446601941745,
      "grad_norm": 0.6535053253173828,
      "learning_rate": 0.0001981178176898239,
      "loss": 1.3576,
      "step": 22
    },
    {
      "epoch": 0.44660194174757284,
      "grad_norm": 0.6363189816474915,
      "learning_rate": 0.00019779225723955707,
      "loss": 1.3159,
      "step": 23
    },
    {
      "epoch": 0.46601941747572817,
      "grad_norm": 0.42140620946884155,
      "learning_rate": 0.00019744105246469263,
      "loss": 1.3072,
      "step": 24
    },
    {
      "epoch": 0.4854368932038835,
      "grad_norm": 0.6757440567016602,
      "learning_rate": 0.00019706429546259593,
      "loss": 1.3261,
      "step": 25
    },
    {
      "epoch": 0.5048543689320388,
      "grad_norm": 0.4116632044315338,
      "learning_rate": 0.00019666208503126112,
      "loss": 1.317,
      "step": 26
    },
    {
      "epoch": 0.5242718446601942,
      "grad_norm": 0.5359898209571838,
      "learning_rate": 0.00019623452664340306,
      "loss": 1.2932,
      "step": 27
    },
    {
      "epoch": 0.5436893203883495,
      "grad_norm": 0.4596949517726898,
      "learning_rate": 0.00019578173241879872,
      "loss": 1.2601,
      "step": 28
    },
    {
      "epoch": 0.5631067961165048,
      "grad_norm": 0.6857442855834961,
      "learning_rate": 0.0001953038210948861,
      "loss": 1.2734,
      "step": 29
    },
    {
      "epoch": 0.5825242718446602,
      "grad_norm": 0.4453957676887512,
      "learning_rate": 0.00019480091799562704,
      "loss": 1.2521,
      "step": 30
    },
    {
      "epoch": 0.6019417475728155,
      "grad_norm": 0.6118847131729126,
      "learning_rate": 0.00019427315499864344,
      "loss": 1.2252,
      "step": 31
    },
    {
      "epoch": 0.6213592233009708,
      "grad_norm": 0.4894339442253113,
      "learning_rate": 0.00019372067050063438,
      "loss": 1.2369,
      "step": 32
    },
    {
      "epoch": 0.6407766990291263,
      "grad_norm": 0.5901785492897034,
      "learning_rate": 0.00019314360938108425,
      "loss": 1.2507,
      "step": 33
    },
    {
      "epoch": 0.6601941747572816,
      "grad_norm": 0.4147897958755493,
      "learning_rate": 0.00019254212296427044,
      "loss": 1.2454,
      "step": 34
    },
    {
      "epoch": 0.6796116504854369,
      "grad_norm": 0.2836126685142517,
      "learning_rate": 0.00019191636897958122,
      "loss": 1.2515,
      "step": 35
    },
    {
      "epoch": 0.6990291262135923,
      "grad_norm": 0.4523944556713104,
      "learning_rate": 0.00019126651152015403,
      "loss": 1.2217,
      "step": 36
    },
    {
      "epoch": 0.7184466019417476,
      "grad_norm": 0.4187013804912567,
      "learning_rate": 0.0001905927209998447,
      "loss": 1.2382,
      "step": 37
    },
    {
      "epoch": 0.7378640776699029,
      "grad_norm": 0.516381561756134,
      "learning_rate": 0.00018989517410853955,
      "loss": 1.2214,
      "step": 38
    },
    {
      "epoch": 0.7572815533980582,
      "grad_norm": 0.5928078889846802,
      "learning_rate": 0.00018917405376582145,
      "loss": 1.2407,
      "step": 39
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 0.4062119722366333,
      "learning_rate": 0.00018842954907300236,
      "loss": 1.1958,
      "step": 40
    },
    {
      "epoch": 0.7961165048543689,
      "grad_norm": 0.5966278910636902,
      "learning_rate": 0.0001876618552635348,
      "loss": 1.2043,
      "step": 41
    },
    {
      "epoch": 0.8155339805825242,
      "grad_norm": 0.47196483612060547,
      "learning_rate": 0.00018687117365181512,
      "loss": 1.2321,
      "step": 42
    },
    {
      "epoch": 0.8349514563106796,
      "grad_norm": 0.6465126872062683,
      "learning_rate": 0.00018605771158039253,
      "loss": 1.1996,
      "step": 43
    },
    {
      "epoch": 0.8543689320388349,
      "grad_norm": 0.4234643578529358,
      "learning_rate": 0.00018522168236559695,
      "loss": 1.198,
      "step": 44
    },
    {
      "epoch": 0.8737864077669902,
      "grad_norm": 0.3910824656486511,
      "learning_rate": 0.00018436330524160047,
      "loss": 1.1895,
      "step": 45
    },
    {
      "epoch": 0.8932038834951457,
      "grad_norm": 0.5380635261535645,
      "learning_rate": 0.00018348280530292713,
      "loss": 1.1693,
      "step": 46
    },
    {
      "epoch": 0.912621359223301,
      "grad_norm": 0.31823453307151794,
      "learning_rate": 0.00018258041344542566,
      "loss": 1.159,
      "step": 47
    },
    {
      "epoch": 0.9320388349514563,
      "grad_norm": 0.5679419636726379,
      "learning_rate": 0.0001816563663057211,
      "loss": 1.1931,
      "step": 48
    },
    {
      "epoch": 0.9514563106796117,
      "grad_norm": 0.3980861008167267,
      "learning_rate": 0.00018071090619916093,
      "loss": 1.1826,
      "step": 49
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 0.3688671588897705,
      "learning_rate": 0.00017974428105627208,
      "loss": 1.176,
      "step": 50
    },
    {
      "epoch": 0.9902912621359223,
      "grad_norm": 0.4137047827243805,
      "learning_rate": 0.00017875674435774547,
      "loss": 1.142,
      "step": 51
    },
    {
      "epoch": 1.0097087378640777,
      "grad_norm": 0.3141655921936035,
      "learning_rate": 0.00017774855506796496,
      "loss": 1.1614,
      "step": 52
    },
    {
      "epoch": 1.0048543689320388,
      "grad_norm": 0.3113202154636383,
      "learning_rate": 0.00017671997756709863,
      "loss": 1.1668,
      "step": 53
    },
    {
      "epoch": 1.0242718446601942,
      "grad_norm": 0.5369274616241455,
      "learning_rate": 0.00017567128158176953,
      "loss": 1.1157,
      "step": 54
    },
    {
      "epoch": 1.0436893203883495,
      "grad_norm": 0.550740122795105,
      "learning_rate": 0.0001746027421143246,
      "loss": 1.1674,
      "step": 55
    },
    {
      "epoch": 1.0631067961165048,
      "grad_norm": 0.4507598578929901,
      "learning_rate": 0.00017351463937072004,
      "loss": 1.1484,
      "step": 56
    },
    {
      "epoch": 1.0825242718446602,
      "grad_norm": 0.5626426935195923,
      "learning_rate": 0.00017240725868704218,
      "loss": 1.1614,
      "step": 57
    },
    {
      "epoch": 1.1019417475728155,
      "grad_norm": 0.36562928557395935,
      "learning_rate": 0.00017128089045468294,
      "loss": 1.1246,
      "step": 58
    },
    {
      "epoch": 1.1213592233009708,
      "grad_norm": 0.6106365323066711,
      "learning_rate": 0.00017013583004418993,
      "loss": 1.141,
      "step": 59
    },
    {
      "epoch": 1.1407766990291262,
      "grad_norm": 0.3751409947872162,
      "learning_rate": 0.00016897237772781044,
      "loss": 1.1565,
      "step": 60
    },
    {
      "epoch": 1.1601941747572815,
      "grad_norm": 0.5318562984466553,
      "learning_rate": 0.00016779083860075033,
      "loss": 1.1501,
      "step": 61
    },
    {
      "epoch": 1.1796116504854368,
      "grad_norm": 0.3675023317337036,
      "learning_rate": 0.00016659152250116812,
      "loss": 1.1162,
      "step": 62
    },
    {
      "epoch": 1.1990291262135921,
      "grad_norm": 0.38894137740135193,
      "learning_rate": 0.00016537474392892528,
      "loss": 1.146,
      "step": 63
    },
    {
      "epoch": 1.2184466019417475,
      "grad_norm": 0.3107282817363739,
      "learning_rate": 0.000164140821963114,
      "loss": 1.135,
      "step": 64
    },
    {
      "epoch": 1.237864077669903,
      "grad_norm": 0.2496870756149292,
      "learning_rate": 0.00016289008017838445,
      "loss": 1.133,
      "step": 65
    },
    {
      "epoch": 1.2572815533980584,
      "grad_norm": 0.35233214497566223,
      "learning_rate": 0.00016162284656009274,
      "loss": 1.1446,
      "step": 66
    },
    {
      "epoch": 1.2766990291262137,
      "grad_norm": 0.33078905940055847,
      "learning_rate": 0.00016033945341829248,
      "loss": 1.1077,
      "step": 67
    },
    {
      "epoch": 1.296116504854369,
      "grad_norm": 0.24236944317817688,
      "learning_rate": 0.00015904023730059228,
      "loss": 1.1367,
      "step": 68
    },
    {
      "epoch": 1.3155339805825244,
      "grad_norm": 0.3248869776725769,
      "learning_rate": 0.00015772553890390197,
      "loss": 1.1405,
      "step": 69
    },
    {
      "epoch": 1.3349514563106797,
      "grad_norm": 0.3145497739315033,
      "learning_rate": 0.00015639570298509064,
      "loss": 1.1134,
      "step": 70
    },
    {
      "epoch": 1.354368932038835,
      "grad_norm": 0.3858337700366974,
      "learning_rate": 0.00015505107827058036,
      "loss": 1.1293,
      "step": 71
    },
    {
      "epoch": 1.3737864077669903,
      "grad_norm": 0.3683490753173828,
      "learning_rate": 0.0001536920173648984,
      "loss": 1.1069,
      "step": 72
    },
    {
      "epoch": 1.3932038834951457,
      "grad_norm": 0.5532976388931274,
      "learning_rate": 0.000152318876658213,
      "loss": 1.1432,
      "step": 73
    },
    {
      "epoch": 1.412621359223301,
      "grad_norm": 0.3198872208595276,
      "learning_rate": 0.00015093201623287631,
      "loss": 1.1423,
      "step": 74
    },
    {
      "epoch": 1.4320388349514563,
      "grad_norm": 0.7470155954360962,
      "learning_rate": 0.00014953179976899878,
      "loss": 1.1475,
      "step": 75
    },
    {
      "epoch": 1.4514563106796117,
      "grad_norm": 0.4092984199523926,
      "learning_rate": 0.00014811859444908052,
      "loss": 1.0955,
      "step": 76
    },
    {
      "epoch": 1.470873786407767,
      "grad_norm": 0.6536680459976196,
      "learning_rate": 0.00014669277086172406,
      "loss": 1.1359,
      "step": 77
    },
    {
      "epoch": 1.4902912621359223,
      "grad_norm": 0.35692015290260315,
      "learning_rate": 0.00014525470290445392,
      "loss": 1.1131,
      "step": 78
    },
    {
      "epoch": 1.5097087378640777,
      "grad_norm": 0.46786415576934814,
      "learning_rate": 0.00014380476768566824,
      "loss": 1.1133,
      "step": 79
    },
    {
      "epoch": 1.529126213592233,
      "grad_norm": 0.6847173571586609,
      "learning_rate": 0.00014234334542574906,
      "loss": 1.125,
      "step": 80
    },
    {
      "epoch": 1.5485436893203883,
      "grad_norm": 0.5090566277503967,
      "learning_rate": 0.00014087081935735564,
      "loss": 1.1562,
      "step": 81
    },
    {
      "epoch": 1.5679611650485437,
      "grad_norm": 0.7293880581855774,
      "learning_rate": 0.00013938757562492873,
      "loss": 1.1348,
      "step": 82
    },
    {
      "epoch": 1.587378640776699,
      "grad_norm": 0.3536413908004761,
      "learning_rate": 0.00013789400318343068,
      "loss": 1.1134,
      "step": 83
    },
    {
      "epoch": 1.6067961165048543,
      "grad_norm": 0.802762508392334,
      "learning_rate": 0.00013639049369634876,
      "loss": 1.1256,
      "step": 84
    },
    {
      "epoch": 1.6262135922330097,
      "grad_norm": 0.27904102206230164,
      "learning_rate": 0.00013487744143298822,
      "loss": 1.1367,
      "step": 85
    },
    {
      "epoch": 1.645631067961165,
      "grad_norm": 0.6024283170700073,
      "learning_rate": 0.00013335524316508208,
      "loss": 1.1174,
      "step": 86
    },
    {
      "epoch": 1.6650485436893203,
      "grad_norm": 0.5118915438652039,
      "learning_rate": 0.0001318242980627444,
      "loss": 1.0848,
      "step": 87
    },
    {
      "epoch": 1.6844660194174756,
      "grad_norm": 0.3924892544746399,
      "learning_rate": 0.00013028500758979506,
      "loss": 1.1114,
      "step": 88
    },
    {
      "epoch": 1.703883495145631,
      "grad_norm": 0.4876178801059723,
      "learning_rate": 0.00012873777539848283,
      "loss": 1.1303,
      "step": 89
    },
    {
      "epoch": 1.7233009708737863,
      "grad_norm": 0.4133356809616089,
      "learning_rate": 0.0001271830072236343,
      "loss": 1.0894,
      "step": 90
    },
    {
      "epoch": 1.7427184466019416,
      "grad_norm": 0.42908573150634766,
      "learning_rate": 0.00012562111077625722,
      "loss": 1.0806,
      "step": 91
    },
    {
      "epoch": 1.762135922330097,
      "grad_norm": 0.5923053026199341,
      "learning_rate": 0.00012405249563662537,
      "loss": 1.1254,
      "step": 92
    },
    {
      "epoch": 1.7815533980582523,
      "grad_norm": 0.34368380904197693,
      "learning_rate": 0.00012247757314687297,
      "loss": 1.1251,
      "step": 93
    },
    {
      "epoch": 1.8009708737864076,
      "grad_norm": 0.6336709856987,
      "learning_rate": 0.00012089675630312754,
      "loss": 1.1042,
      "step": 94
    },
    {
      "epoch": 1.820388349514563,
      "grad_norm": 0.32966285943984985,
      "learning_rate": 0.00011931045964720881,
      "loss": 1.09,
      "step": 95
    },
    {
      "epoch": 1.8398058252427183,
      "grad_norm": 0.42973244190216064,
      "learning_rate": 0.0001177190991579223,
      "loss": 1.0896,
      "step": 96
    },
    {
      "epoch": 1.8592233009708736,
      "grad_norm": 0.5365500450134277,
      "learning_rate": 0.00011612309214197599,
      "loss": 1.1281,
      "step": 97
    },
    {
      "epoch": 1.8786407766990292,
      "grad_norm": 0.4330235421657562,
      "learning_rate": 0.00011452285712454904,
      "loss": 1.087,
      "step": 98
    },
    {
      "epoch": 1.8980582524271845,
      "grad_norm": 0.47839391231536865,
      "learning_rate": 0.00011291881373954065,
      "loss": 1.0967,
      "step": 99
    },
    {
      "epoch": 1.9174757281553398,
      "grad_norm": 0.27464163303375244,
      "learning_rate": 0.00011131138261952845,
      "loss": 1.0981,
      "step": 100
    },
    {
      "epoch": 1.9368932038834952,
      "grad_norm": 0.41849982738494873,
      "learning_rate": 0.00010970098528546481,
      "loss": 1.1203,
      "step": 101
    },
    {
      "epoch": 1.9563106796116505,
      "grad_norm": 0.4630599915981293,
      "learning_rate": 0.00010808804403614043,
      "loss": 1.1224,
      "step": 102
    },
    {
      "epoch": 1.9757281553398058,
      "grad_norm": 0.34116509556770325,
      "learning_rate": 0.00010647298183744359,
      "loss": 1.0871,
      "step": 103
    },
    {
      "epoch": 1.9951456310679612,
      "grad_norm": 0.4391232430934906,
      "learning_rate": 0.00010485622221144484,
      "loss": 1.1331,
      "step": 104
    },
    {
      "epoch": 2.0145631067961167,
      "grad_norm": 0.4344836175441742,
      "learning_rate": 0.00010323818912533561,
      "loss": 1.0874,
      "step": 105
    },
    {
      "epoch": 2.0097087378640777,
      "grad_norm": 0.42520374059677124,
      "learning_rate": 0.00010161930688025017,
      "loss": 1.1068,
      "step": 106
    },
    {
      "epoch": 2.029126213592233,
      "grad_norm": 0.3572416305541992,
      "learning_rate": 0.0001,
      "loss": 1.1001,
      "step": 107
    },
    {
      "epoch": 2.0485436893203883,
      "grad_norm": 0.38590529561042786,
      "learning_rate": 9.838069311974986e-05,
      "loss": 1.0833,
      "step": 108
    },
    {
      "epoch": 2.0679611650485437,
      "grad_norm": 0.2969411313533783,
      "learning_rate": 9.676181087466444e-05,
      "loss": 1.0939,
      "step": 109
    },
    {
      "epoch": 2.087378640776699,
      "grad_norm": 0.25997498631477356,
      "learning_rate": 9.514377778855521e-05,
      "loss": 1.1063,
      "step": 110
    },
    {
      "epoch": 2.1067961165048543,
      "grad_norm": 0.3671320676803589,
      "learning_rate": 9.352701816255643e-05,
      "loss": 1.103,
      "step": 111
    },
    {
      "epoch": 2.1262135922330097,
      "grad_norm": 0.552992582321167,
      "learning_rate": 9.19119559638596e-05,
      "loss": 1.0874,
      "step": 112
    },
    {
      "epoch": 2.145631067961165,
      "grad_norm": 0.25193411111831665,
      "learning_rate": 9.02990147145352e-05,
      "loss": 1.0946,
      "step": 113
    },
    {
      "epoch": 2.1650485436893203,
      "grad_norm": 0.33370667695999146,
      "learning_rate": 8.868861738047158e-05,
      "loss": 1.1108,
      "step": 114
    },
    {
      "epoch": 2.1844660194174756,
      "grad_norm": 0.42085111141204834,
      "learning_rate": 8.70811862604594e-05,
      "loss": 1.0883,
      "step": 115
    },
    {
      "epoch": 2.203883495145631,
      "grad_norm": 0.40206724405288696,
      "learning_rate": 8.5477142875451e-05,
      "loss": 1.1006,
      "step": 116
    },
    {
      "epoch": 2.2233009708737863,
      "grad_norm": 0.3052491247653961,
      "learning_rate": 8.387690785802402e-05,
      "loss": 1.1155,
      "step": 117
    },
    {
      "epoch": 2.2427184466019416,
      "grad_norm": 0.42322033643722534,
      "learning_rate": 8.228090084207774e-05,
      "loss": 1.0929,
      "step": 118
    },
    {
      "epoch": 2.262135922330097,
      "grad_norm": 0.5083087682723999,
      "learning_rate": 8.068954035279121e-05,
      "loss": 1.122,
      "step": 119
    },
    {
      "epoch": 2.2815533980582523,
      "grad_norm": 0.2693060636520386,
      "learning_rate": 7.91032436968725e-05,
      "loss": 1.0837,
      "step": 120
    },
    {
      "epoch": 2.3009708737864076,
      "grad_norm": 0.34834596514701843,
      "learning_rate": 7.75224268531271e-05,
      "loss": 1.0768,
      "step": 121
    },
    {
      "epoch": 2.320388349514563,
      "grad_norm": 0.3885416090488434,
      "learning_rate": 7.594750436337467e-05,
      "loss": 1.0616,
      "step": 122
    },
    {
      "epoch": 2.3398058252427183,
      "grad_norm": 0.3093293011188507,
      "learning_rate": 7.437888922374276e-05,
      "loss": 1.0799,
      "step": 123
    },
    {
      "epoch": 2.3592233009708736,
      "grad_norm": 0.4098259210586548,
      "learning_rate": 7.281699277636572e-05,
      "loss": 1.1184,
      "step": 124
    },
    {
      "epoch": 2.378640776699029,
      "grad_norm": 0.29187697172164917,
      "learning_rate": 7.126222460151719e-05,
      "loss": 1.0883,
      "step": 125
    },
    {
      "epoch": 2.3980582524271843,
      "grad_norm": 0.3151191174983978,
      "learning_rate": 6.971499241020495e-05,
      "loss": 1.0692,
      "step": 126
    },
    {
      "epoch": 2.4174757281553396,
      "grad_norm": 0.3117734491825104,
      "learning_rate": 6.817570193725564e-05,
      "loss": 1.0926,
      "step": 127
    },
    {
      "epoch": 2.436893203883495,
      "grad_norm": 0.25921401381492615,
      "learning_rate": 6.664475683491796e-05,
      "loss": 1.0707,
      "step": 128
    },
    {
      "epoch": 2.4563106796116507,
      "grad_norm": 0.31505605578422546,
      "learning_rate": 6.512255856701177e-05,
      "loss": 1.0988,
      "step": 129
    },
    {
      "epoch": 2.475728155339806,
      "grad_norm": 0.25132912397384644,
      "learning_rate": 6.360950630365126e-05,
      "loss": 1.0726,
      "step": 130
    },
    {
      "epoch": 2.4951456310679614,
      "grad_norm": 0.2596098780632019,
      "learning_rate": 6.210599681656933e-05,
      "loss": 1.1138,
      "step": 131
    },
    {
      "epoch": 2.5145631067961167,
      "grad_norm": 0.24262911081314087,
      "learning_rate": 6.061242437507131e-05,
      "loss": 1.0759,
      "step": 132
    },
    {
      "epoch": 2.533980582524272,
      "grad_norm": 0.29739508032798767,
      "learning_rate": 5.9129180642644414e-05,
      "loss": 1.082,
      "step": 133
    },
    {
      "epoch": 2.5533980582524274,
      "grad_norm": 0.32182446122169495,
      "learning_rate": 5.765665457425102e-05,
      "loss": 1.0698,
      "step": 134
    },
    {
      "epoch": 2.5728155339805827,
      "grad_norm": 0.3175111711025238,
      "learning_rate": 5.6195232314331766e-05,
      "loss": 1.099,
      "step": 135
    },
    {
      "epoch": 2.592233009708738,
      "grad_norm": 0.34597843885421753,
      "learning_rate": 5.474529709554612e-05,
      "loss": 1.0971,
      "step": 136
    },
    {
      "epoch": 2.6116504854368934,
      "grad_norm": 0.2773433327674866,
      "learning_rate": 5.3307229138275936e-05,
      "loss": 1.0747,
      "step": 137
    },
    {
      "epoch": 2.6310679611650487,
      "grad_norm": 0.3997284770011902,
      "learning_rate": 5.1881405550919493e-05,
      "loss": 1.0579,
      "step": 138
    },
    {
      "epoch": 2.650485436893204,
      "grad_norm": 0.3829750716686249,
      "learning_rate": 5.0468200231001286e-05,
      "loss": 1.0671,
      "step": 139
    },
    {
      "epoch": 2.6699029126213594,
      "grad_norm": 0.2364608198404312,
      "learning_rate": 4.9067983767123736e-05,
      "loss": 1.0739,
      "step": 140
    },
    {
      "epoch": 2.6893203883495147,
      "grad_norm": 0.29015663266181946,
      "learning_rate": 4.768112334178699e-05,
      "loss": 1.0626,
      "step": 141
    },
    {
      "epoch": 2.70873786407767,
      "grad_norm": 0.39254897832870483,
      "learning_rate": 4.630798263510162e-05,
      "loss": 1.0648,
      "step": 142
    },
    {
      "epoch": 2.7281553398058254,
      "grad_norm": 0.42674142122268677,
      "learning_rate": 4.494892172941965e-05,
      "loss": 1.1137,
      "step": 143
    },
    {
      "epoch": 2.7475728155339807,
      "grad_norm": 0.4450013041496277,
      "learning_rate": 4.360429701490934e-05,
      "loss": 1.1096,
      "step": 144
    },
    {
      "epoch": 2.766990291262136,
      "grad_norm": 0.333667516708374,
      "learning_rate": 4.227446109609809e-05,
      "loss": 1.0758,
      "step": 145
    },
    {
      "epoch": 2.7864077669902914,
      "grad_norm": 0.4920090436935425,
      "learning_rate": 4.0959762699407766e-05,
      "loss": 1.0698,
      "step": 146
    },
    {
      "epoch": 2.8058252427184467,
      "grad_norm": 0.31687042117118835,
      "learning_rate": 3.966054658170754e-05,
      "loss": 1.0901,
      "step": 147
    },
    {
      "epoch": 2.825242718446602,
      "grad_norm": 0.3884165585041046,
      "learning_rate": 3.8377153439907266e-05,
      "loss": 1.0753,
      "step": 148
    },
    {
      "epoch": 2.8446601941747574,
      "grad_norm": 0.41056135296821594,
      "learning_rate": 3.710991982161555e-05,
      "loss": 1.1145,
      "step": 149
    },
    {
      "epoch": 2.8640776699029127,
      "grad_norm": 0.32460349798202515,
      "learning_rate": 3.585917803688603e-05,
      "loss": 1.0716,
      "step": 150
    },
    {
      "epoch": 2.883495145631068,
      "grad_norm": 0.3436354994773865,
      "learning_rate": 3.4625256071074773e-05,
      "loss": 1.0995,
      "step": 151
    },
    {
      "epoch": 2.9029126213592233,
      "grad_norm": 0.29039478302001953,
      "learning_rate": 3.340847749883191e-05,
      "loss": 1.058,
      "step": 152
    },
    {
      "epoch": 2.9223300970873787,
      "grad_norm": 0.2641606330871582,
      "learning_rate": 3.2209161399249674e-05,
      "loss": 1.0705,
      "step": 153
    }
  ],
  "logging_steps": 1,
  "max_steps": 204,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 51,
  "total_flos": 3.182608847260877e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}