|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 200, |
|
"global_step": 268, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007462686567164179, |
|
"grad_norm": 0.02919401967076395, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 0.1054, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.014925373134328358, |
|
"grad_norm": 0.020838068877362836, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 0.0753, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.022388059701492536, |
|
"grad_norm": 0.01937615418470836, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 0.0831, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.029850746268656716, |
|
"grad_norm": 0.028469822431763606, |
|
"learning_rate": 1.4814814814814815e-05, |
|
"loss": 0.0989, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03731343283582089, |
|
"grad_norm": 0.03395939106878981, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 0.103, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04477611940298507, |
|
"grad_norm": 0.041383149287986205, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.1343, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.05223880597014925, |
|
"grad_norm": 0.0413395483383645, |
|
"learning_rate": 2.5925925925925925e-05, |
|
"loss": 0.1273, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05970149253731343, |
|
"grad_norm": 0.024480164341057986, |
|
"learning_rate": 2.962962962962963e-05, |
|
"loss": 0.0813, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.06716417910447761, |
|
"grad_norm": 0.033946084199447085, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.0887, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.07462686567164178, |
|
"grad_norm": 0.03513746660341717, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 0.082, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08208955223880597, |
|
"grad_norm": 0.025282860695458678, |
|
"learning_rate": 4.074074074074074e-05, |
|
"loss": 0.0686, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.08955223880597014, |
|
"grad_norm": 0.03350566986793766, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.072, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.09701492537313433, |
|
"grad_norm": 0.05550069293455211, |
|
"learning_rate": 4.814814814814815e-05, |
|
"loss": 0.1, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.1044776119402985, |
|
"grad_norm": 0.03956999409254062, |
|
"learning_rate": 5.185185185185185e-05, |
|
"loss": 0.0842, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.11194029850746269, |
|
"grad_norm": 0.05135762708917783, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 0.0866, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.11940298507462686, |
|
"grad_norm": 0.05114207718083794, |
|
"learning_rate": 5.925925925925926e-05, |
|
"loss": 0.0755, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.12686567164179105, |
|
"grad_norm": 0.033864855495790755, |
|
"learning_rate": 6.296296296296296e-05, |
|
"loss": 0.0642, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.13432835820895522, |
|
"grad_norm": 0.031422023139512284, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.0547, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1417910447761194, |
|
"grad_norm": 0.03313447893394791, |
|
"learning_rate": 7.037037037037038e-05, |
|
"loss": 0.0695, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.14925373134328357, |
|
"grad_norm": 0.02448036621370803, |
|
"learning_rate": 7.407407407407407e-05, |
|
"loss": 0.0582, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15671641791044777, |
|
"grad_norm": 0.025763892833299634, |
|
"learning_rate": 7.777777777777778e-05, |
|
"loss": 0.0543, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.16417910447761194, |
|
"grad_norm": 0.027745981321721364, |
|
"learning_rate": 8.148148148148148e-05, |
|
"loss": 0.0711, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.17164179104477612, |
|
"grad_norm": 0.042796285583299835, |
|
"learning_rate": 8.518518518518518e-05, |
|
"loss": 0.069, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1791044776119403, |
|
"grad_norm": 0.03557712335612332, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.0579, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.1865671641791045, |
|
"grad_norm": 0.044133668448392234, |
|
"learning_rate": 9.25925925925926e-05, |
|
"loss": 0.0573, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.19402985074626866, |
|
"grad_norm": 0.04917021976502218, |
|
"learning_rate": 9.62962962962963e-05, |
|
"loss": 0.0606, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.20149253731343283, |
|
"grad_norm": 0.039476881840879816, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0591, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.208955223880597, |
|
"grad_norm": 0.046858172620967635, |
|
"learning_rate": 9.999575185316994e-05, |
|
"loss": 0.059, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.21641791044776118, |
|
"grad_norm": 0.05376514163226526, |
|
"learning_rate": 9.998300813454982e-05, |
|
"loss": 0.0676, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.22388059701492538, |
|
"grad_norm": 0.047620783901977486, |
|
"learning_rate": 9.996177100962714e-05, |
|
"loss": 0.0739, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23134328358208955, |
|
"grad_norm": 0.06404757715770105, |
|
"learning_rate": 9.99320440871389e-05, |
|
"loss": 0.081, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.23880597014925373, |
|
"grad_norm": 0.032370232949459114, |
|
"learning_rate": 9.989383241845838e-05, |
|
"loss": 0.0631, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2462686567164179, |
|
"grad_norm": 0.02198221870558641, |
|
"learning_rate": 9.984714249673675e-05, |
|
"loss": 0.0468, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2537313432835821, |
|
"grad_norm": 0.029148788282928297, |
|
"learning_rate": 9.979198225579968e-05, |
|
"loss": 0.0634, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.26119402985074625, |
|
"grad_norm": 0.047984517615355146, |
|
"learning_rate": 9.972836106879935e-05, |
|
"loss": 0.0822, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.26865671641791045, |
|
"grad_norm": 0.025256795499786325, |
|
"learning_rate": 9.965628974662144e-05, |
|
"loss": 0.0497, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.27611940298507465, |
|
"grad_norm": 0.041039829048038795, |
|
"learning_rate": 9.957578053604837e-05, |
|
"loss": 0.0537, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2835820895522388, |
|
"grad_norm": 0.022712174218454447, |
|
"learning_rate": 9.9486847117678e-05, |
|
"loss": 0.0529, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.291044776119403, |
|
"grad_norm": 0.027820439299072, |
|
"learning_rate": 9.938950460359913e-05, |
|
"loss": 0.0512, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.29850746268656714, |
|
"grad_norm": 0.029625704751798147, |
|
"learning_rate": 9.928376953482343e-05, |
|
"loss": 0.0526, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.30597014925373134, |
|
"grad_norm": 0.035649943358832585, |
|
"learning_rate": 9.916965987847485e-05, |
|
"loss": 0.0612, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.31343283582089554, |
|
"grad_norm": 0.03718438599924447, |
|
"learning_rate": 9.904719502473634e-05, |
|
"loss": 0.0699, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.3208955223880597, |
|
"grad_norm": 0.030176691889828987, |
|
"learning_rate": 9.891639578355511e-05, |
|
"loss": 0.0641, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.3283582089552239, |
|
"grad_norm": 0.031071278949314794, |
|
"learning_rate": 9.877728438110645e-05, |
|
"loss": 0.0562, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.3358208955223881, |
|
"grad_norm": 0.04495281452755722, |
|
"learning_rate": 9.862988445601688e-05, |
|
"loss": 0.0625, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.34328358208955223, |
|
"grad_norm": 0.03063389414608373, |
|
"learning_rate": 9.847422105534739e-05, |
|
"loss": 0.0725, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.35074626865671643, |
|
"grad_norm": 0.036502084694794205, |
|
"learning_rate": 9.831032063033726e-05, |
|
"loss": 0.0682, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3582089552238806, |
|
"grad_norm": 0.035498747609547306, |
|
"learning_rate": 9.813821103190932e-05, |
|
"loss": 0.0528, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3656716417910448, |
|
"grad_norm": 0.04409053680205051, |
|
"learning_rate": 9.795792150593739e-05, |
|
"loss": 0.0624, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.373134328358209, |
|
"grad_norm": 0.030682616113562263, |
|
"learning_rate": 9.776948268827659e-05, |
|
"loss": 0.0575, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3805970149253731, |
|
"grad_norm": 0.030470506614129134, |
|
"learning_rate": 9.757292659955755e-05, |
|
"loss": 0.0591, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3880597014925373, |
|
"grad_norm": 0.02671947240839377, |
|
"learning_rate": 9.736828663974527e-05, |
|
"loss": 0.0583, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.39552238805970147, |
|
"grad_norm": 0.031037639144256054, |
|
"learning_rate": 9.715559758246363e-05, |
|
"loss": 0.0518, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.40298507462686567, |
|
"grad_norm": 0.02841501008727202, |
|
"learning_rate": 9.693489556908641e-05, |
|
"loss": 0.0584, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.41044776119402987, |
|
"grad_norm": 0.04837015226261885, |
|
"learning_rate": 9.670621810259595e-05, |
|
"loss": 0.0727, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.417910447761194, |
|
"grad_norm": 0.03250895479571609, |
|
"learning_rate": 9.646960404121042e-05, |
|
"loss": 0.0518, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.4253731343283582, |
|
"grad_norm": 0.031411258255519776, |
|
"learning_rate": 9.62250935917808e-05, |
|
"loss": 0.0614, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.43283582089552236, |
|
"grad_norm": 0.059967490499311084, |
|
"learning_rate": 9.597272830295876e-05, |
|
"loss": 0.0726, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.44029850746268656, |
|
"grad_norm": 0.04086918510114368, |
|
"learning_rate": 9.571255105813632e-05, |
|
"loss": 0.0608, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.44776119402985076, |
|
"grad_norm": 0.039609700799485066, |
|
"learning_rate": 9.5444606068159e-05, |
|
"loss": 0.07, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4552238805970149, |
|
"grad_norm": 0.03186573999267819, |
|
"learning_rate": 9.516893886381323e-05, |
|
"loss": 0.072, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4626865671641791, |
|
"grad_norm": 0.03326197860069246, |
|
"learning_rate": 9.488559628808939e-05, |
|
"loss": 0.0647, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4701492537313433, |
|
"grad_norm": 0.03167948302339881, |
|
"learning_rate": 9.459462648822208e-05, |
|
"loss": 0.0616, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.47761194029850745, |
|
"grad_norm": 0.02603734725183324, |
|
"learning_rate": 9.429607890750863e-05, |
|
"loss": 0.0541, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.48507462686567165, |
|
"grad_norm": 0.047753395728305725, |
|
"learning_rate": 9.399000427690735e-05, |
|
"loss": 0.056, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4925373134328358, |
|
"grad_norm": 0.0319902663818862, |
|
"learning_rate": 9.367645460641716e-05, |
|
"loss": 0.0562, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.10592144833550245, |
|
"learning_rate": 9.335548317623957e-05, |
|
"loss": 0.0703, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.5074626865671642, |
|
"grad_norm": 0.04415289013333453, |
|
"learning_rate": 9.302714452772516e-05, |
|
"loss": 0.0578, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.5149253731343284, |
|
"grad_norm": 0.05922882437474166, |
|
"learning_rate": 9.269149445410545e-05, |
|
"loss": 0.0602, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.5223880597014925, |
|
"grad_norm": 0.04087608521226697, |
|
"learning_rate": 9.234858999101231e-05, |
|
"loss": 0.0493, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5298507462686567, |
|
"grad_norm": 0.06407150308129361, |
|
"learning_rate": 9.199848940678606e-05, |
|
"loss": 0.0602, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5373134328358209, |
|
"grad_norm": 0.03286157656507991, |
|
"learning_rate": 9.164125219257418e-05, |
|
"loss": 0.0528, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5447761194029851, |
|
"grad_norm": 0.045437734403213616, |
|
"learning_rate": 9.127693905222224e-05, |
|
"loss": 0.0608, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5522388059701493, |
|
"grad_norm": 0.03573058005192053, |
|
"learning_rate": 9.09056118919587e-05, |
|
"loss": 0.061, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5597014925373134, |
|
"grad_norm": 0.04555803646336276, |
|
"learning_rate": 9.052733380987554e-05, |
|
"loss": 0.0616, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5671641791044776, |
|
"grad_norm": 0.04474047330665906, |
|
"learning_rate": 9.014216908520618e-05, |
|
"loss": 0.0627, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5746268656716418, |
|
"grad_norm": 0.040211174281131454, |
|
"learning_rate": 8.975018316740278e-05, |
|
"loss": 0.0459, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.582089552238806, |
|
"grad_norm": 0.03261428866046916, |
|
"learning_rate": 8.935144266501469e-05, |
|
"loss": 0.0624, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5895522388059702, |
|
"grad_norm": 0.029388880489185733, |
|
"learning_rate": 8.894601533436999e-05, |
|
"loss": 0.0572, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 0.028886106980896584, |
|
"learning_rate": 8.853397006806182e-05, |
|
"loss": 0.0651, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6044776119402985, |
|
"grad_norm": 0.04454335383255878, |
|
"learning_rate": 8.811537688324188e-05, |
|
"loss": 0.0592, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6119402985074627, |
|
"grad_norm": 0.0471135611588739, |
|
"learning_rate": 8.769030690972262e-05, |
|
"loss": 0.0697, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.6194029850746269, |
|
"grad_norm": 0.0296623365867784, |
|
"learning_rate": 8.725883237789045e-05, |
|
"loss": 0.0469, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.6268656716417911, |
|
"grad_norm": 0.03167282878695001, |
|
"learning_rate": 8.682102660643197e-05, |
|
"loss": 0.0547, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6343283582089553, |
|
"grad_norm": 0.04865018192575572, |
|
"learning_rate": 8.637696398987516e-05, |
|
"loss": 0.0716, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6417910447761194, |
|
"grad_norm": 0.028735120219778615, |
|
"learning_rate": 8.592671998594794e-05, |
|
"loss": 0.0572, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.6492537313432836, |
|
"grad_norm": 0.026050366630294017, |
|
"learning_rate": 8.547037110275579e-05, |
|
"loss": 0.0481, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.6567164179104478, |
|
"grad_norm": 0.04073494985026619, |
|
"learning_rate": 8.50079948857812e-05, |
|
"loss": 0.0661, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.664179104477612, |
|
"grad_norm": 0.033495112935702676, |
|
"learning_rate": 8.453966990470656e-05, |
|
"loss": 0.0623, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.6716417910447762, |
|
"grad_norm": 0.030497361040509492, |
|
"learning_rate": 8.406547574006325e-05, |
|
"loss": 0.0612, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6791044776119403, |
|
"grad_norm": 0.04298436163572994, |
|
"learning_rate": 8.358549296970876e-05, |
|
"loss": 0.0569, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.6865671641791045, |
|
"grad_norm": 0.028361321641782122, |
|
"learning_rate": 8.309980315513444e-05, |
|
"loss": 0.0585, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6940298507462687, |
|
"grad_norm": 0.03803075573155342, |
|
"learning_rate": 8.260848882760615e-05, |
|
"loss": 0.0571, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.7014925373134329, |
|
"grad_norm": 0.05072647730034115, |
|
"learning_rate": 8.211163347414003e-05, |
|
"loss": 0.0635, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.7089552238805971, |
|
"grad_norm": 0.028524701829953716, |
|
"learning_rate": 8.160932152331586e-05, |
|
"loss": 0.0507, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7164179104477612, |
|
"grad_norm": 0.03163367256659563, |
|
"learning_rate": 8.11016383309305e-05, |
|
"loss": 0.0641, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7238805970149254, |
|
"grad_norm": 0.03317124309154403, |
|
"learning_rate": 8.058867016549372e-05, |
|
"loss": 0.0488, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.7313432835820896, |
|
"grad_norm": 0.03740803806211238, |
|
"learning_rate": 8.007050419356899e-05, |
|
"loss": 0.0611, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.7388059701492538, |
|
"grad_norm": 0.02706261492437245, |
|
"learning_rate": 7.95472284649615e-05, |
|
"loss": 0.0443, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.746268656716418, |
|
"grad_norm": 0.027321548744590845, |
|
"learning_rate": 7.90189318977564e-05, |
|
"loss": 0.0499, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.753731343283582, |
|
"grad_norm": 0.03331256964229156, |
|
"learning_rate": 7.848570426320917e-05, |
|
"loss": 0.0574, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.7611940298507462, |
|
"grad_norm": 0.04488781019829851, |
|
"learning_rate": 7.794763617049124e-05, |
|
"loss": 0.0675, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.7686567164179104, |
|
"grad_norm": 0.03589965410994236, |
|
"learning_rate": 7.740481905129306e-05, |
|
"loss": 0.0704, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.7761194029850746, |
|
"grad_norm": 0.038503509262491756, |
|
"learning_rate": 7.685734514428766e-05, |
|
"loss": 0.0648, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.7835820895522388, |
|
"grad_norm": 0.02427146713864325, |
|
"learning_rate": 7.630530747945673e-05, |
|
"loss": 0.0451, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7910447761194029, |
|
"grad_norm": 0.03683952347349413, |
|
"learning_rate": 7.574879986228245e-05, |
|
"loss": 0.0595, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.7985074626865671, |
|
"grad_norm": 0.04856315142914648, |
|
"learning_rate": 7.518791685780768e-05, |
|
"loss": 0.0554, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.8059701492537313, |
|
"grad_norm": 0.0366517612862182, |
|
"learning_rate": 7.46227537745667e-05, |
|
"loss": 0.0649, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.8134328358208955, |
|
"grad_norm": 0.06587252219576946, |
|
"learning_rate": 7.405340664838993e-05, |
|
"loss": 0.0724, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.8208955223880597, |
|
"grad_norm": 0.031328118637405186, |
|
"learning_rate": 7.347997222608492e-05, |
|
"loss": 0.0532, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8283582089552238, |
|
"grad_norm": 0.027882967444547536, |
|
"learning_rate": 7.290254794899664e-05, |
|
"loss": 0.0478, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.835820895522388, |
|
"grad_norm": 0.039566219868688, |
|
"learning_rate": 7.232123193644957e-05, |
|
"loss": 0.0584, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.8432835820895522, |
|
"grad_norm": 0.0315149053030866, |
|
"learning_rate": 7.173612296907472e-05, |
|
"loss": 0.0506, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.8507462686567164, |
|
"grad_norm": 0.03184981504463341, |
|
"learning_rate": 7.114732047202433e-05, |
|
"loss": 0.0432, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.8582089552238806, |
|
"grad_norm": 0.03272549516697388, |
|
"learning_rate": 7.055492449807684e-05, |
|
"loss": 0.0552, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8656716417910447, |
|
"grad_norm": 0.03189649842931783, |
|
"learning_rate": 6.99590357106354e-05, |
|
"loss": 0.0518, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.8731343283582089, |
|
"grad_norm": 0.030147363978434292, |
|
"learning_rate": 6.935975536662253e-05, |
|
"loss": 0.0525, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.8805970149253731, |
|
"grad_norm": 0.040517602492255646, |
|
"learning_rate": 6.875718529927405e-05, |
|
"loss": 0.0669, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.8880597014925373, |
|
"grad_norm": 0.05316845870182898, |
|
"learning_rate": 6.815142790083472e-05, |
|
"loss": 0.0628, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.8955223880597015, |
|
"grad_norm": 0.034819746793725306, |
|
"learning_rate": 6.75425861051595e-05, |
|
"loss": 0.0555, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9029850746268657, |
|
"grad_norm": 0.030100164091976505, |
|
"learning_rate": 6.693076337022211e-05, |
|
"loss": 0.05, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.9104477611940298, |
|
"grad_norm": 0.0314797643475597, |
|
"learning_rate": 6.631606366053506e-05, |
|
"loss": 0.0578, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.917910447761194, |
|
"grad_norm": 0.029223545243950332, |
|
"learning_rate": 6.569859142948328e-05, |
|
"loss": 0.0476, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.9253731343283582, |
|
"grad_norm": 0.0331348198460615, |
|
"learning_rate": 6.507845160157477e-05, |
|
"loss": 0.0632, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.9328358208955224, |
|
"grad_norm": 0.031079975615423625, |
|
"learning_rate": 6.445574955461134e-05, |
|
"loss": 0.0498, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.9402985074626866, |
|
"grad_norm": 0.035932362771267295, |
|
"learning_rate": 6.383059110178204e-05, |
|
"loss": 0.0649, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.9477611940298507, |
|
"grad_norm": 0.026660107576390996, |
|
"learning_rate": 6.320308247368286e-05, |
|
"loss": 0.0429, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.9552238805970149, |
|
"grad_norm": 0.042220621034671266, |
|
"learning_rate": 6.257333030026538e-05, |
|
"loss": 0.0756, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.9626865671641791, |
|
"grad_norm": 0.031042783082721563, |
|
"learning_rate": 6.194144159271756e-05, |
|
"loss": 0.0644, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.9701492537313433, |
|
"grad_norm": 0.028923034995146566, |
|
"learning_rate": 6.130752372527982e-05, |
|
"loss": 0.0535, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9776119402985075, |
|
"grad_norm": 0.027560166142966646, |
|
"learning_rate": 6.0671684416999273e-05, |
|
"loss": 0.0515, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.9850746268656716, |
|
"grad_norm": 0.026132856653996697, |
|
"learning_rate": 6.003403171342563e-05, |
|
"loss": 0.054, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.9925373134328358, |
|
"grad_norm": 0.06392037144081057, |
|
"learning_rate": 5.939467396825137e-05, |
|
"loss": 0.06, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.042679994559313705, |
|
"learning_rate": 5.875371982489959e-05, |
|
"loss": 0.0619, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.007462686567164, |
|
"grad_norm": 0.026673005745909233, |
|
"learning_rate": 5.811127819806277e-05, |
|
"loss": 0.0459, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.0149253731343284, |
|
"grad_norm": 0.029023621087789386, |
|
"learning_rate": 5.7467458255195384e-05, |
|
"loss": 0.0446, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.0223880597014925, |
|
"grad_norm": 0.029071701785167842, |
|
"learning_rate": 5.682236939796337e-05, |
|
"loss": 0.0462, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.0298507462686568, |
|
"grad_norm": 0.031121046523713853, |
|
"learning_rate": 5.61761212436541e-05, |
|
"loss": 0.0537, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.037313432835821, |
|
"grad_norm": 0.03221700721378129, |
|
"learning_rate": 5.55288236065495e-05, |
|
"loss": 0.0548, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.044776119402985, |
|
"grad_norm": 0.03151438885910822, |
|
"learning_rate": 5.488058647926577e-05, |
|
"loss": 0.0534, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0522388059701493, |
|
"grad_norm": 0.029920771304069553, |
|
"learning_rate": 5.423152001406282e-05, |
|
"loss": 0.0465, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.0597014925373134, |
|
"grad_norm": 0.02988182757789321, |
|
"learning_rate": 5.3581734504126494e-05, |
|
"loss": 0.0427, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.0671641791044777, |
|
"grad_norm": 0.04146879785852393, |
|
"learning_rate": 5.293134036482698e-05, |
|
"loss": 0.0663, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.0746268656716418, |
|
"grad_norm": 0.039152659337203376, |
|
"learning_rate": 5.2280448114956316e-05, |
|
"loss": 0.0426, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.0820895522388059, |
|
"grad_norm": 0.036826130066773974, |
|
"learning_rate": 5.1629168357948435e-05, |
|
"loss": 0.0554, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.0895522388059702, |
|
"grad_norm": 0.03349602680235328, |
|
"learning_rate": 5.097761176308471e-05, |
|
"loss": 0.0463, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.0970149253731343, |
|
"grad_norm": 0.03863625091563246, |
|
"learning_rate": 5.032588904668851e-05, |
|
"loss": 0.05, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.1044776119402986, |
|
"grad_norm": 0.03414480378687678, |
|
"learning_rate": 4.967411095331149e-05, |
|
"loss": 0.0532, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.1119402985074627, |
|
"grad_norm": 0.03458114819717078, |
|
"learning_rate": 4.90223882369153e-05, |
|
"loss": 0.0509, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.1194029850746268, |
|
"grad_norm": 0.03408662760824781, |
|
"learning_rate": 4.837083164205159e-05, |
|
"loss": 0.0483, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.126865671641791, |
|
"grad_norm": 0.0420831349151817, |
|
"learning_rate": 4.771955188504371e-05, |
|
"loss": 0.06, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.1343283582089552, |
|
"grad_norm": 0.03479713775750584, |
|
"learning_rate": 4.7068659635173026e-05, |
|
"loss": 0.0544, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.1417910447761195, |
|
"grad_norm": 0.04008915214172921, |
|
"learning_rate": 4.641826549587352e-05, |
|
"loss": 0.0515, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.1492537313432836, |
|
"grad_norm": 0.02486863707158603, |
|
"learning_rate": 4.57684799859372e-05, |
|
"loss": 0.0357, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.1567164179104479, |
|
"grad_norm": 0.047042070292214286, |
|
"learning_rate": 4.511941352073424e-05, |
|
"loss": 0.0572, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.164179104477612, |
|
"grad_norm": 0.03758648119450133, |
|
"learning_rate": 4.447117639345052e-05, |
|
"loss": 0.051, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.171641791044776, |
|
"grad_norm": 0.03672648821870239, |
|
"learning_rate": 4.382387875634591e-05, |
|
"loss": 0.056, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.1791044776119404, |
|
"grad_norm": 0.031421123448233466, |
|
"learning_rate": 4.317763060203664e-05, |
|
"loss": 0.0406, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.1865671641791045, |
|
"grad_norm": 0.03920340761152944, |
|
"learning_rate": 4.253254174480462e-05, |
|
"loss": 0.0457, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.1940298507462686, |
|
"grad_norm": 0.03183854980282451, |
|
"learning_rate": 4.188872180193723e-05, |
|
"loss": 0.0416, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.2014925373134329, |
|
"grad_norm": 0.04378327235602219, |
|
"learning_rate": 4.124628017510043e-05, |
|
"loss": 0.0571, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.208955223880597, |
|
"grad_norm": 0.039625445906029455, |
|
"learning_rate": 4.0605326031748645e-05, |
|
"loss": 0.0441, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.2164179104477613, |
|
"grad_norm": 0.03812779985968061, |
|
"learning_rate": 3.9965968286574376e-05, |
|
"loss": 0.0456, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.2238805970149254, |
|
"grad_norm": 0.03587300189954543, |
|
"learning_rate": 3.932831558300074e-05, |
|
"loss": 0.0573, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.2313432835820897, |
|
"grad_norm": 0.032411868193150284, |
|
"learning_rate": 3.869247627472021e-05, |
|
"loss": 0.0378, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.2388059701492538, |
|
"grad_norm": 0.03454002672639094, |
|
"learning_rate": 3.8058558407282464e-05, |
|
"loss": 0.0472, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.2462686567164178, |
|
"grad_norm": 0.04911967144007234, |
|
"learning_rate": 3.742666969973463e-05, |
|
"loss": 0.0487, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.2537313432835822, |
|
"grad_norm": 0.044265339009080897, |
|
"learning_rate": 3.6796917526317156e-05, |
|
"loss": 0.0588, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.2611940298507462, |
|
"grad_norm": 0.03747737389183713, |
|
"learning_rate": 3.616940889821797e-05, |
|
"loss": 0.0574, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.2686567164179103, |
|
"grad_norm": 0.03511814609607568, |
|
"learning_rate": 3.5544250445388675e-05, |
|
"loss": 0.0393, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.2761194029850746, |
|
"grad_norm": 0.04698205222474845, |
|
"learning_rate": 3.492154839842525e-05, |
|
"loss": 0.0442, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.2835820895522387, |
|
"grad_norm": 0.03454787270419316, |
|
"learning_rate": 3.430140857051675e-05, |
|
"loss": 0.0454, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.291044776119403, |
|
"grad_norm": 0.05089940274324606, |
|
"learning_rate": 3.368393633946496e-05, |
|
"loss": 0.0538, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.2985074626865671, |
|
"grad_norm": 0.037051061355777494, |
|
"learning_rate": 3.306923662977789e-05, |
|
"loss": 0.0486, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.3059701492537314, |
|
"grad_norm": 0.04009215418213975, |
|
"learning_rate": 3.245741389484052e-05, |
|
"loss": 0.0491, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.3134328358208955, |
|
"grad_norm": 0.03345150363311112, |
|
"learning_rate": 3.184857209916528e-05, |
|
"loss": 0.0389, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.3208955223880596, |
|
"grad_norm": 0.048558141019666455, |
|
"learning_rate": 3.124281470072597e-05, |
|
"loss": 0.0618, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.328358208955224, |
|
"grad_norm": 0.04663730712313748, |
|
"learning_rate": 3.064024463337747e-05, |
|
"loss": 0.0489, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.335820895522388, |
|
"grad_norm": 0.03522439433466753, |
|
"learning_rate": 3.0040964289364616e-05, |
|
"loss": 0.0382, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.3432835820895521, |
|
"grad_norm": 0.04483417143212535, |
|
"learning_rate": 2.944507550192318e-05, |
|
"loss": 0.0661, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.3507462686567164, |
|
"grad_norm": 0.037625578707262686, |
|
"learning_rate": 2.885267952797569e-05, |
|
"loss": 0.0392, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.3582089552238805, |
|
"grad_norm": 0.04068557239382607, |
|
"learning_rate": 2.8263877030925277e-05, |
|
"loss": 0.0542, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.3656716417910448, |
|
"grad_norm": 0.0389079661184576, |
|
"learning_rate": 2.7678768063550452e-05, |
|
"loss": 0.0562, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.373134328358209, |
|
"grad_norm": 0.04888440207831169, |
|
"learning_rate": 2.7097452051003375e-05, |
|
"loss": 0.0546, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.3805970149253732, |
|
"grad_norm": 0.05062117973516137, |
|
"learning_rate": 2.6520027773915075e-05, |
|
"loss": 0.0346, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.3880597014925373, |
|
"grad_norm": 0.03839649084491081, |
|
"learning_rate": 2.5946593351610082e-05, |
|
"loss": 0.043, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.3955223880597014, |
|
"grad_norm": 0.03894797485622364, |
|
"learning_rate": 2.5377246225433303e-05, |
|
"loss": 0.0499, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.4029850746268657, |
|
"grad_norm": 0.03574246701240581, |
|
"learning_rate": 2.4812083142192328e-05, |
|
"loss": 0.0427, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.4104477611940298, |
|
"grad_norm": 0.035928493587213936, |
|
"learning_rate": 2.4251200137717544e-05, |
|
"loss": 0.0495, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.417910447761194, |
|
"grad_norm": 0.0561432642757027, |
|
"learning_rate": 2.3694692520543295e-05, |
|
"loss": 0.0437, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.4253731343283582, |
|
"grad_norm": 0.029967493597351468, |
|
"learning_rate": 2.3142654855712354e-05, |
|
"loss": 0.0359, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.4328358208955223, |
|
"grad_norm": 0.030336483710158894, |
|
"learning_rate": 2.259518094870693e-05, |
|
"loss": 0.0333, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.4402985074626866, |
|
"grad_norm": 0.04356827813942889, |
|
"learning_rate": 2.2052363829508775e-05, |
|
"loss": 0.0508, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.4477611940298507, |
|
"grad_norm": 0.03897503968612337, |
|
"learning_rate": 2.151429573679084e-05, |
|
"loss": 0.0484, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.455223880597015, |
|
"grad_norm": 0.04667243646797709, |
|
"learning_rate": 2.0981068102243616e-05, |
|
"loss": 0.0471, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.462686567164179, |
|
"grad_norm": 0.039721557688575104, |
|
"learning_rate": 2.0452771535038518e-05, |
|
"loss": 0.0546, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.4701492537313432, |
|
"grad_norm": 0.03703957811531601, |
|
"learning_rate": 1.9929495806431025e-05, |
|
"loss": 0.0417, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.4776119402985075, |
|
"grad_norm": 0.05652478416127825, |
|
"learning_rate": 1.9411329834506286e-05, |
|
"loss": 0.0494, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.4850746268656716, |
|
"grad_norm": 0.03617159966848812, |
|
"learning_rate": 1.8898361669069497e-05, |
|
"loss": 0.046, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"grad_norm": 0.037324198259541815, |
|
"learning_rate": 1.8390678476684142e-05, |
|
"loss": 0.0436, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"eval_loss": 0.0473957434296608, |
|
"eval_runtime": 6.4787, |
|
"eval_samples_per_second": 0.926, |
|
"eval_steps_per_second": 0.309, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.042848477030181235, |
|
"learning_rate": 1.7888366525859968e-05, |
|
"loss": 0.0513, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.5074626865671643, |
|
"grad_norm": 0.04133178844515671, |
|
"learning_rate": 1.739151117239385e-05, |
|
"loss": 0.0434, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.5149253731343284, |
|
"grad_norm": 0.0470387983073566, |
|
"learning_rate": 1.6900196844865573e-05, |
|
"loss": 0.049, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.5223880597014925, |
|
"grad_norm": 0.04143101813950498, |
|
"learning_rate": 1.641450703029125e-05, |
|
"loss": 0.0505, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.5298507462686568, |
|
"grad_norm": 0.03907054079716859, |
|
"learning_rate": 1.5934524259936756e-05, |
|
"loss": 0.0448, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.537313432835821, |
|
"grad_norm": 0.0360128545432656, |
|
"learning_rate": 1.5460330095293447e-05, |
|
"loss": 0.0448, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.544776119402985, |
|
"grad_norm": 0.04127920465748009, |
|
"learning_rate": 1.4992005114218805e-05, |
|
"loss": 0.0472, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.5522388059701493, |
|
"grad_norm": 0.037629999831237014, |
|
"learning_rate": 1.4529628897244212e-05, |
|
"loss": 0.045, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.5597014925373134, |
|
"grad_norm": 0.04441048097377383, |
|
"learning_rate": 1.4073280014052077e-05, |
|
"loss": 0.0479, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.5671641791044775, |
|
"grad_norm": 0.03871964970576081, |
|
"learning_rate": 1.3623036010124846e-05, |
|
"loss": 0.0397, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5746268656716418, |
|
"grad_norm": 0.04375110246238586, |
|
"learning_rate": 1.3178973393568057e-05, |
|
"loss": 0.051, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.582089552238806, |
|
"grad_norm": 0.04933919962951339, |
|
"learning_rate": 1.2741167622109556e-05, |
|
"loss": 0.0426, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.5895522388059702, |
|
"grad_norm": 0.03861540416331047, |
|
"learning_rate": 1.230969309027739e-05, |
|
"loss": 0.0461, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.5970149253731343, |
|
"grad_norm": 0.04244105016684205, |
|
"learning_rate": 1.1884623116758121e-05, |
|
"loss": 0.0521, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.6044776119402986, |
|
"grad_norm": 0.046525033488129953, |
|
"learning_rate": 1.1466029931938182e-05, |
|
"loss": 0.0547, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.6119402985074627, |
|
"grad_norm": 0.05356919684133014, |
|
"learning_rate": 1.1053984665630024e-05, |
|
"loss": 0.0423, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.6194029850746268, |
|
"grad_norm": 0.04764484850078174, |
|
"learning_rate": 1.0648557334985309e-05, |
|
"loss": 0.0578, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.626865671641791, |
|
"grad_norm": 0.04353330012930423, |
|
"learning_rate": 1.024981683259723e-05, |
|
"loss": 0.0508, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.6343283582089554, |
|
"grad_norm": 0.0439439630643127, |
|
"learning_rate": 9.857830914793826e-06, |
|
"loss": 0.0503, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.6417910447761193, |
|
"grad_norm": 0.0431244279753579, |
|
"learning_rate": 9.472666190124457e-06, |
|
"loss": 0.0438, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6492537313432836, |
|
"grad_norm": 0.06281467675130384, |
|
"learning_rate": 9.094388108041302e-06, |
|
"loss": 0.0525, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.6567164179104479, |
|
"grad_norm": 0.03911370572303081, |
|
"learning_rate": 8.723060947777777e-06, |
|
"loss": 0.0468, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.664179104477612, |
|
"grad_norm": 0.04237425485741587, |
|
"learning_rate": 8.358747807425826e-06, |
|
"loss": 0.0488, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.671641791044776, |
|
"grad_norm": 0.04205074962767055, |
|
"learning_rate": 8.001510593213946e-06, |
|
"loss": 0.0499, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.6791044776119404, |
|
"grad_norm": 0.038935088692434276, |
|
"learning_rate": 7.651410008987697e-06, |
|
"loss": 0.0505, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.6865671641791045, |
|
"grad_norm": 0.036013941908150396, |
|
"learning_rate": 7.308505545894567e-06, |
|
"loss": 0.0423, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.6940298507462686, |
|
"grad_norm": 0.04324030148308444, |
|
"learning_rate": 6.972855472274853e-06, |
|
"loss": 0.0501, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.7014925373134329, |
|
"grad_norm": 0.03960682986150278, |
|
"learning_rate": 6.6445168237604385e-06, |
|
"loss": 0.0548, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.7089552238805972, |
|
"grad_norm": 0.040566743727832796, |
|
"learning_rate": 6.323545393582847e-06, |
|
"loss": 0.0485, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.716417910447761, |
|
"grad_norm": 0.04146343378333961, |
|
"learning_rate": 6.009995723092654e-06, |
|
"loss": 0.0495, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.7238805970149254, |
|
"grad_norm": 0.05743102370183059, |
|
"learning_rate": 5.703921092491393e-06, |
|
"loss": 0.0482, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.7313432835820897, |
|
"grad_norm": 0.04399341838703738, |
|
"learning_rate": 5.405373511777939e-06, |
|
"loss": 0.0579, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.7388059701492538, |
|
"grad_norm": 0.042551807139630436, |
|
"learning_rate": 5.114403711910632e-06, |
|
"loss": 0.0524, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.7462686567164178, |
|
"grad_norm": 0.05244930515335262, |
|
"learning_rate": 4.8310611361867875e-06, |
|
"loss": 0.0536, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.7537313432835822, |
|
"grad_norm": 0.037991763265651006, |
|
"learning_rate": 4.555393931841001e-06, |
|
"loss": 0.0438, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.7611940298507462, |
|
"grad_norm": 0.03923496935939972, |
|
"learning_rate": 4.287448941863692e-06, |
|
"loss": 0.0441, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.7686567164179103, |
|
"grad_norm": 0.04213613231764651, |
|
"learning_rate": 4.027271697041252e-06, |
|
"loss": 0.0451, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.7761194029850746, |
|
"grad_norm": 0.04391085681016651, |
|
"learning_rate": 3.7749064082191977e-06, |
|
"loss": 0.0391, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.783582089552239, |
|
"grad_norm": 0.040751284452250174, |
|
"learning_rate": 3.5303959587895898e-06, |
|
"loss": 0.0473, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.7910447761194028, |
|
"grad_norm": 0.044746440629601196, |
|
"learning_rate": 3.2937818974040635e-06, |
|
"loss": 0.0596, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.7985074626865671, |
|
"grad_norm": 0.04365755886772311, |
|
"learning_rate": 3.065104430913601e-06, |
|
"loss": 0.0535, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.8059701492537314, |
|
"grad_norm": 0.04282275262011111, |
|
"learning_rate": 2.844402417536374e-06, |
|
"loss": 0.0504, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.8134328358208955, |
|
"grad_norm": 0.040175140129488814, |
|
"learning_rate": 2.631713360254734e-06, |
|
"loss": 0.0512, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.8208955223880596, |
|
"grad_norm": 0.038332926215459244, |
|
"learning_rate": 2.4270734004424643e-06, |
|
"loss": 0.0395, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.828358208955224, |
|
"grad_norm": 0.05597837964991172, |
|
"learning_rate": 2.2305173117234236e-06, |
|
"loss": 0.0537, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.835820895522388, |
|
"grad_norm": 0.04065238850547589, |
|
"learning_rate": 2.0420784940626157e-06, |
|
"loss": 0.0452, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.8432835820895521, |
|
"grad_norm": 0.0476763458104042, |
|
"learning_rate": 1.861788968090683e-06, |
|
"loss": 0.0451, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.8507462686567164, |
|
"grad_norm": 0.04498306599006193, |
|
"learning_rate": 1.68967936966275e-06, |
|
"loss": 0.0429, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.8582089552238807, |
|
"grad_norm": 0.05003148848106867, |
|
"learning_rate": 1.5257789446526172e-06, |
|
"loss": 0.0441, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.8656716417910446, |
|
"grad_norm": 0.03810179831443809, |
|
"learning_rate": 1.3701155439831249e-06, |
|
"loss": 0.045, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.873134328358209, |
|
"grad_norm": 0.046823905381387004, |
|
"learning_rate": 1.222715618893555e-06, |
|
"loss": 0.046, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.8805970149253732, |
|
"grad_norm": 0.03508296706383799, |
|
"learning_rate": 1.0836042164448945e-06, |
|
"loss": 0.0414, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.8880597014925373, |
|
"grad_norm": 0.03914973587838145, |
|
"learning_rate": 9.528049752636714e-07, |
|
"loss": 0.0382, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.8955223880597014, |
|
"grad_norm": 0.04522068545246384, |
|
"learning_rate": 8.303401215251583e-07, |
|
"loss": 0.0622, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.9029850746268657, |
|
"grad_norm": 0.0351448987447696, |
|
"learning_rate": 7.16230465176565e-07, |
|
"loss": 0.0365, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.9104477611940298, |
|
"grad_norm": 0.04164400823222792, |
|
"learning_rate": 6.104953964008897e-07, |
|
"loss": 0.0417, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.917910447761194, |
|
"grad_norm": 0.043785343997244305, |
|
"learning_rate": 5.131528823220099e-07, |
|
"loss": 0.0479, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.9253731343283582, |
|
"grad_norm": 0.04778705545009903, |
|
"learning_rate": 4.242194639516417e-07, |
|
"loss": 0.0515, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.9328358208955225, |
|
"grad_norm": 0.04402744832754031, |
|
"learning_rate": 3.4371025337855413e-07, |
|
"loss": 0.0459, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.9402985074626866, |
|
"grad_norm": 0.031589252550306986, |
|
"learning_rate": 2.7163893120066285e-07, |
|
"loss": 0.0345, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.9477611940298507, |
|
"grad_norm": 0.04157687462847643, |
|
"learning_rate": 2.0801774420031173e-07, |
|
"loss": 0.0491, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.955223880597015, |
|
"grad_norm": 0.03811048776366228, |
|
"learning_rate": 1.5285750326325954e-07, |
|
"loss": 0.0399, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.962686567164179, |
|
"grad_norm": 0.0459881175457512, |
|
"learning_rate": 1.0616758154161632e-07, |
|
"loss": 0.0536, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.9701492537313432, |
|
"grad_norm": 0.04236843642259093, |
|
"learning_rate": 6.795591286109515e-08, |
|
"loss": 0.0576, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.9776119402985075, |
|
"grad_norm": 0.04498028444612301, |
|
"learning_rate": 3.822899037286276e-08, |
|
"loss": 0.0437, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.9850746268656716, |
|
"grad_norm": 0.04723224011434202, |
|
"learning_rate": 1.6991865450188827e-08, |
|
"loss": 0.0511, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.9925373134328357, |
|
"grad_norm": 0.04408571036477793, |
|
"learning_rate": 4.248146830060362e-09, |
|
"loss": 0.0644, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.036686659173178024, |
|
"learning_rate": 0.0, |
|
"loss": 0.0407, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 268, |
|
"total_flos": 678679592042496.0, |
|
"train_loss": 0.055738416959100694, |
|
"train_runtime": 2038.148, |
|
"train_samples_per_second": 0.523, |
|
"train_steps_per_second": 0.131 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 268, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 678679592042496.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|