|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.99492385786802, |
|
"eval_steps": 50, |
|
"global_step": 885, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01692047377326565, |
|
"grad_norm": 18.89720056709149, |
|
"learning_rate": 5e-07, |
|
"loss": 1.7331, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0338409475465313, |
|
"grad_norm": 12.380230951490994, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5909, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.050761421319796954, |
|
"grad_norm": 7.703103534110165, |
|
"learning_rate": 9.99919433964529e-07, |
|
"loss": 1.3028, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0676818950930626, |
|
"grad_norm": 4.2630631588076096, |
|
"learning_rate": 9.996777618216605e-07, |
|
"loss": 1.1478, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08460236886632826, |
|
"grad_norm": 3.9048597217577643, |
|
"learning_rate": 9.992750614536604e-07, |
|
"loss": 1.0675, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10152284263959391, |
|
"grad_norm": 4.006647373245602, |
|
"learning_rate": 9.98711462636417e-07, |
|
"loss": 1.0324, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11844331641285956, |
|
"grad_norm": 3.5760575012528384, |
|
"learning_rate": 9.979871469976195e-07, |
|
"loss": 0.9783, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1353637901861252, |
|
"grad_norm": 3.7598475655002166, |
|
"learning_rate": 9.971023479582256e-07, |
|
"loss": 0.9717, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15228426395939088, |
|
"grad_norm": 3.841087015616369, |
|
"learning_rate": 9.960573506572389e-07, |
|
"loss": 0.9541, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1692047377326565, |
|
"grad_norm": 3.3432807772469215, |
|
"learning_rate": 9.948524918598173e-07, |
|
"loss": 0.9459, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1692047377326565, |
|
"eval_loss": 0.9274849891662598, |
|
"eval_runtime": 148.0955, |
|
"eval_samples_per_second": 56.72, |
|
"eval_steps_per_second": 0.891, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18612521150592218, |
|
"grad_norm": 3.3571009660316427, |
|
"learning_rate": 9.934881598487478e-07, |
|
"loss": 0.9205, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.20304568527918782, |
|
"grad_norm": 3.658058336105581, |
|
"learning_rate": 9.919647942993147e-07, |
|
"loss": 0.9129, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.21996615905245348, |
|
"grad_norm": 3.5170771780637082, |
|
"learning_rate": 9.9028288613761e-07, |
|
"loss": 0.893, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.23688663282571912, |
|
"grad_norm": 3.4488412657588325, |
|
"learning_rate": 9.884429773823236e-07, |
|
"loss": 0.9107, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.25380710659898476, |
|
"grad_norm": 3.5097568943852737, |
|
"learning_rate": 9.864456609700723e-07, |
|
"loss": 0.8996, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2707275803722504, |
|
"grad_norm": 3.3607900424314705, |
|
"learning_rate": 9.842915805643156e-07, |
|
"loss": 0.897, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2876480541455161, |
|
"grad_norm": 3.5364404109551226, |
|
"learning_rate": 9.819814303479267e-07, |
|
"loss": 0.8995, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.30456852791878175, |
|
"grad_norm": 3.6326997998059873, |
|
"learning_rate": 9.795159547994828e-07, |
|
"loss": 0.8845, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.32148900169204736, |
|
"grad_norm": 3.342400707382201, |
|
"learning_rate": 9.76895948453346e-07, |
|
"loss": 0.8761, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.338409475465313, |
|
"grad_norm": 3.5106421873680307, |
|
"learning_rate": 9.74122255643613e-07, |
|
"loss": 0.8992, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.338409475465313, |
|
"eval_loss": 0.8709495663642883, |
|
"eval_runtime": 146.2171, |
|
"eval_samples_per_second": 57.449, |
|
"eval_steps_per_second": 0.903, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3553299492385787, |
|
"grad_norm": 3.412793355215337, |
|
"learning_rate": 9.711957702320174e-07, |
|
"loss": 0.8915, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.37225042301184436, |
|
"grad_norm": 3.4690823915126363, |
|
"learning_rate": 9.681174353198686e-07, |
|
"loss": 0.8814, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.38917089678510997, |
|
"grad_norm": 3.370071251369415, |
|
"learning_rate": 9.648882429441256e-07, |
|
"loss": 0.8661, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.40609137055837563, |
|
"grad_norm": 3.687286677170038, |
|
"learning_rate": 9.615092337576987e-07, |
|
"loss": 0.8737, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4230118443316413, |
|
"grad_norm": 3.4414847568273346, |
|
"learning_rate": 9.579814966940833e-07, |
|
"loss": 0.8585, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.43993231810490696, |
|
"grad_norm": 3.555637933684099, |
|
"learning_rate": 9.543061686164372e-07, |
|
"loss": 0.8617, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.45685279187817257, |
|
"grad_norm": 3.364839125431491, |
|
"learning_rate": 9.504844339512094e-07, |
|
"loss": 0.843, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.47377326565143824, |
|
"grad_norm": 3.5040756898524745, |
|
"learning_rate": 9.465175243064428e-07, |
|
"loss": 0.864, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4906937394247039, |
|
"grad_norm": 3.4002571057955366, |
|
"learning_rate": 9.424067180748691e-07, |
|
"loss": 0.853, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5076142131979695, |
|
"grad_norm": 3.5879355202073193, |
|
"learning_rate": 9.381533400219317e-07, |
|
"loss": 0.8515, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5076142131979695, |
|
"eval_loss": 0.844601571559906, |
|
"eval_runtime": 146.2138, |
|
"eval_samples_per_second": 57.45, |
|
"eval_steps_per_second": 0.903, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5245346869712352, |
|
"grad_norm": 3.3747757696246925, |
|
"learning_rate": 9.337587608588588e-07, |
|
"loss": 0.8511, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5414551607445008, |
|
"grad_norm": 3.5565216959326644, |
|
"learning_rate": 9.29224396800933e-07, |
|
"loss": 0.8405, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5583756345177665, |
|
"grad_norm": 4.512291872844238, |
|
"learning_rate": 9.245517091110968e-07, |
|
"loss": 0.8318, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5752961082910322, |
|
"grad_norm": 3.332018798468554, |
|
"learning_rate": 9.197422036290386e-07, |
|
"loss": 0.8507, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5922165820642978, |
|
"grad_norm": 3.5149539879760603, |
|
"learning_rate": 9.147974302859156e-07, |
|
"loss": 0.8679, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6091370558375635, |
|
"grad_norm": 3.5792424943759924, |
|
"learning_rate": 9.097189826048659e-07, |
|
"loss": 0.8422, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.626057529610829, |
|
"grad_norm": 3.6495743743155806, |
|
"learning_rate": 9.045084971874737e-07, |
|
"loss": 0.8392, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6429780033840947, |
|
"grad_norm": 3.4613931720229836, |
|
"learning_rate": 8.991676531863507e-07, |
|
"loss": 0.8423, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6598984771573604, |
|
"grad_norm": 3.552450649531331, |
|
"learning_rate": 8.93698171764006e-07, |
|
"loss": 0.838, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.676818950930626, |
|
"grad_norm": 3.4396681594412057, |
|
"learning_rate": 8.881018155381765e-07, |
|
"loss": 0.8498, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.676818950930626, |
|
"eval_loss": 0.8278397917747498, |
|
"eval_runtime": 146.1617, |
|
"eval_samples_per_second": 57.471, |
|
"eval_steps_per_second": 0.903, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6937394247038917, |
|
"grad_norm": 3.395442461461206, |
|
"learning_rate": 8.823803880137992e-07, |
|
"loss": 0.8466, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.7106598984771574, |
|
"grad_norm": 3.3738365361465825, |
|
"learning_rate": 8.765357330018055e-07, |
|
"loss": 0.8115, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.727580372250423, |
|
"grad_norm": 3.492914457997946, |
|
"learning_rate": 8.705697340249274e-07, |
|
"loss": 0.8325, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7445008460236887, |
|
"grad_norm": 3.8131605424361084, |
|
"learning_rate": 8.644843137107057e-07, |
|
"loss": 0.8204, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7614213197969543, |
|
"grad_norm": 3.243553096025284, |
|
"learning_rate": 8.58281433171896e-07, |
|
"loss": 0.8196, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7783417935702199, |
|
"grad_norm": 3.6561787618198625, |
|
"learning_rate": 8.519630913744724e-07, |
|
"loss": 0.8173, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7952622673434856, |
|
"grad_norm": 3.411165657715004, |
|
"learning_rate": 8.455313244934324e-07, |
|
"loss": 0.8248, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.8121827411167513, |
|
"grad_norm": 3.302331738504403, |
|
"learning_rate": 8.389882052566105e-07, |
|
"loss": 0.8138, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8291032148900169, |
|
"grad_norm": 3.8604845681587605, |
|
"learning_rate": 8.323358422767128e-07, |
|
"loss": 0.8301, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8460236886632826, |
|
"grad_norm": 3.3819331633437617, |
|
"learning_rate": 8.255763793717867e-07, |
|
"loss": 0.8105, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8460236886632826, |
|
"eval_loss": 0.815843939781189, |
|
"eval_runtime": 146.2558, |
|
"eval_samples_per_second": 57.434, |
|
"eval_steps_per_second": 0.903, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8629441624365483, |
|
"grad_norm": 3.693173177209318, |
|
"learning_rate": 8.187119948743449e-07, |
|
"loss": 0.8241, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8798646362098139, |
|
"grad_norm": 3.557032123737138, |
|
"learning_rate": 8.117449009293668e-07, |
|
"loss": 0.8147, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8967851099830795, |
|
"grad_norm": 3.681990028371156, |
|
"learning_rate": 8.046773427814041e-07, |
|
"loss": 0.8281, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.9137055837563451, |
|
"grad_norm": 3.472801541082773, |
|
"learning_rate": 7.975115980510185e-07, |
|
"loss": 0.8243, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9306260575296108, |
|
"grad_norm": 3.421718448250381, |
|
"learning_rate": 7.902499760007867e-07, |
|
"loss": 0.8296, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9475465313028765, |
|
"grad_norm": 3.3839679140101864, |
|
"learning_rate": 7.828948167911073e-07, |
|
"loss": 0.8113, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9644670050761421, |
|
"grad_norm": 3.368606848173354, |
|
"learning_rate": 7.754484907260512e-07, |
|
"loss": 0.8051, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9813874788494078, |
|
"grad_norm": 3.6569030875938027, |
|
"learning_rate": 7.679133974894982e-07, |
|
"loss": 0.8208, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9983079526226735, |
|
"grad_norm": 3.425142212232347, |
|
"learning_rate": 7.602919653718043e-07, |
|
"loss": 0.8058, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.015228426395939, |
|
"grad_norm": 3.456597451379273, |
|
"learning_rate": 7.525866504872506e-07, |
|
"loss": 0.7739, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.015228426395939, |
|
"eval_loss": 0.8076632022857666, |
|
"eval_runtime": 146.1422, |
|
"eval_samples_per_second": 57.478, |
|
"eval_steps_per_second": 0.903, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0321489001692048, |
|
"grad_norm": 3.4266068907750933, |
|
"learning_rate": 7.447999359825262e-07, |
|
"loss": 0.7588, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.0490693739424704, |
|
"grad_norm": 3.4868988018143354, |
|
"learning_rate": 7.369343312364993e-07, |
|
"loss": 0.7289, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0659898477157361, |
|
"grad_norm": 3.4449977296428105, |
|
"learning_rate": 7.289923710515338e-07, |
|
"loss": 0.7482, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.0829103214890017, |
|
"grad_norm": 3.658957259722615, |
|
"learning_rate": 7.209766148366134e-07, |
|
"loss": 0.7309, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0998307952622675, |
|
"grad_norm": 3.359992422940204, |
|
"learning_rate": 7.128896457825363e-07, |
|
"loss": 0.7371, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.116751269035533, |
|
"grad_norm": 3.7951723668245902, |
|
"learning_rate": 7.047340700294453e-07, |
|
"loss": 0.75, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.1336717428087986, |
|
"grad_norm": 3.5347910519960837, |
|
"learning_rate": 6.965125158269618e-07, |
|
"loss": 0.7582, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.1505922165820643, |
|
"grad_norm": 3.5966614567208928, |
|
"learning_rate": 6.882276326871959e-07, |
|
"loss": 0.7455, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.16751269035533, |
|
"grad_norm": 3.488331150730137, |
|
"learning_rate": 6.798820905309035e-07, |
|
"loss": 0.7327, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.1844331641285957, |
|
"grad_norm": 3.5101724849934444, |
|
"learning_rate": 6.714785788270657e-07, |
|
"loss": 0.7286, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1844331641285957, |
|
"eval_loss": 0.8026402592658997, |
|
"eval_runtime": 146.1899, |
|
"eval_samples_per_second": 57.459, |
|
"eval_steps_per_second": 0.903, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.2013536379018612, |
|
"grad_norm": 3.6348180590276526, |
|
"learning_rate": 6.630198057261709e-07, |
|
"loss": 0.751, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.218274111675127, |
|
"grad_norm": 3.7503514910524793, |
|
"learning_rate": 6.545084971874736e-07, |
|
"loss": 0.7379, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.2351945854483926, |
|
"grad_norm": 3.5989366875197244, |
|
"learning_rate": 6.459473961005168e-07, |
|
"loss": 0.7451, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.252115059221658, |
|
"grad_norm": 3.522773718959504, |
|
"learning_rate": 6.373392614011951e-07, |
|
"loss": 0.745, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2690355329949239, |
|
"grad_norm": 3.44987123406693, |
|
"learning_rate": 6.286868671826511e-07, |
|
"loss": 0.7508, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.2859560067681894, |
|
"grad_norm": 4.122111211031723, |
|
"learning_rate": 6.199930018012829e-07, |
|
"loss": 0.7226, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.3028764805414552, |
|
"grad_norm": 3.73599818619215, |
|
"learning_rate": 6.112604669781572e-07, |
|
"loss": 0.7295, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.3197969543147208, |
|
"grad_norm": 3.529190177869209, |
|
"learning_rate": 6.024920768961152e-07, |
|
"loss": 0.7356, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.3367174280879865, |
|
"grad_norm": 3.461330446213272, |
|
"learning_rate": 5.936906572928624e-07, |
|
"loss": 0.7556, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.353637901861252, |
|
"grad_norm": 3.571724685541136, |
|
"learning_rate": 5.848590445503344e-07, |
|
"loss": 0.7582, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.353637901861252, |
|
"eval_loss": 0.7977383732795715, |
|
"eval_runtime": 146.2021, |
|
"eval_samples_per_second": 57.455, |
|
"eval_steps_per_second": 0.903, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3705583756345177, |
|
"grad_norm": 3.6572559945618304, |
|
"learning_rate": 5.760000847806337e-07, |
|
"loss": 0.7725, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.3874788494077834, |
|
"grad_norm": 3.697093126496799, |
|
"learning_rate": 5.671166329088277e-07, |
|
"loss": 0.7514, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.404399323181049, |
|
"grad_norm": 3.836867403357349, |
|
"learning_rate": 5.582115517529114e-07, |
|
"loss": 0.7483, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.4213197969543148, |
|
"grad_norm": 3.4989314616970764, |
|
"learning_rate": 5.492877111012218e-07, |
|
"loss": 0.7557, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.4382402707275803, |
|
"grad_norm": 3.597183035056808, |
|
"learning_rate": 5.403479867876087e-07, |
|
"loss": 0.7436, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.455160744500846, |
|
"grad_norm": 3.7802523149502734, |
|
"learning_rate": 5.313952597646567e-07, |
|
"loss": 0.7424, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.4720812182741116, |
|
"grad_norm": 3.5635818578964344, |
|
"learning_rate": 5.224324151752575e-07, |
|
"loss": 0.7271, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.4890016920473772, |
|
"grad_norm": 3.577963950120742, |
|
"learning_rate": 5.134623414228315e-07, |
|
"loss": 0.7609, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.505922165820643, |
|
"grad_norm": 3.54002403050294, |
|
"learning_rate": 5.044879292404989e-07, |
|
"loss": 0.7734, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.5228426395939088, |
|
"grad_norm": 3.6965077572035665, |
|
"learning_rate": 4.95512070759501e-07, |
|
"loss": 0.7386, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.5228426395939088, |
|
"eval_loss": 0.7924867272377014, |
|
"eval_runtime": 146.1132, |
|
"eval_samples_per_second": 57.49, |
|
"eval_steps_per_second": 0.903, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.5397631133671743, |
|
"grad_norm": 3.731715501345578, |
|
"learning_rate": 4.865376585771687e-07, |
|
"loss": 0.7393, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.5566835871404399, |
|
"grad_norm": 3.5303543042860452, |
|
"learning_rate": 4.775675848247427e-07, |
|
"loss": 0.7362, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.5736040609137056, |
|
"grad_norm": 3.6723372038452693, |
|
"learning_rate": 4.686047402353433e-07, |
|
"loss": 0.7356, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.5905245346869712, |
|
"grad_norm": 3.3963951427952694, |
|
"learning_rate": 4.596520132123914e-07, |
|
"loss": 0.7284, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.6074450084602367, |
|
"grad_norm": 3.654129078679521, |
|
"learning_rate": 4.507122888987782e-07, |
|
"loss": 0.7504, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.6243654822335025, |
|
"grad_norm": 3.5729221837567438, |
|
"learning_rate": 4.417884482470886e-07, |
|
"loss": 0.7505, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.6412859560067683, |
|
"grad_norm": 3.7212291449669457, |
|
"learning_rate": 4.328833670911724e-07, |
|
"loss": 0.7426, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.6582064297800339, |
|
"grad_norm": 3.6827531820047894, |
|
"learning_rate": 4.239999152193664e-07, |
|
"loss": 0.7299, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.6751269035532994, |
|
"grad_norm": 3.5324806419119805, |
|
"learning_rate": 4.1514095544966557e-07, |
|
"loss": 0.7259, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.6920473773265652, |
|
"grad_norm": 3.3737883396364463, |
|
"learning_rate": 4.0630934270713755e-07, |
|
"loss": 0.7479, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.6920473773265652, |
|
"eval_loss": 0.7887324690818787, |
|
"eval_runtime": 146.106, |
|
"eval_samples_per_second": 57.493, |
|
"eval_steps_per_second": 0.903, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.708967851099831, |
|
"grad_norm": 3.645393673633237, |
|
"learning_rate": 3.9750792310388483e-07, |
|
"loss": 0.7429, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.7258883248730963, |
|
"grad_norm": 3.826381146162921, |
|
"learning_rate": 3.8873953302184283e-07, |
|
"loss": 0.7207, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.742808798646362, |
|
"grad_norm": 3.6852464302004786, |
|
"learning_rate": 3.80006998198717e-07, |
|
"loss": 0.7523, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.7597292724196278, |
|
"grad_norm": 3.53598971920285, |
|
"learning_rate": 3.713131328173489e-07, |
|
"loss": 0.7447, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.7766497461928934, |
|
"grad_norm": 3.468275223623456, |
|
"learning_rate": 3.62660738598805e-07, |
|
"loss": 0.7406, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.793570219966159, |
|
"grad_norm": 3.750572464422938, |
|
"learning_rate": 3.5405260389948333e-07, |
|
"loss": 0.7472, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.8104906937394247, |
|
"grad_norm": 3.804608232839517, |
|
"learning_rate": 3.454915028125263e-07, |
|
"loss": 0.7269, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.8274111675126905, |
|
"grad_norm": 3.6798567744235626, |
|
"learning_rate": 3.369801942738291e-07, |
|
"loss": 0.7194, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.844331641285956, |
|
"grad_norm": 3.5376430558792604, |
|
"learning_rate": 3.285214211729343e-07, |
|
"loss": 0.7161, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.8612521150592216, |
|
"grad_norm": 3.508281360828586, |
|
"learning_rate": 3.2011790946909666e-07, |
|
"loss": 0.7374, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.8612521150592216, |
|
"eval_loss": 0.7846313118934631, |
|
"eval_runtime": 146.1979, |
|
"eval_samples_per_second": 57.456, |
|
"eval_steps_per_second": 0.903, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.8781725888324874, |
|
"grad_norm": 3.6370839999982794, |
|
"learning_rate": 3.11772367312804e-07, |
|
"loss": 0.7363, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.895093062605753, |
|
"grad_norm": 3.6667447920505767, |
|
"learning_rate": 3.034874841730382e-07, |
|
"loss": 0.7205, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.9120135363790185, |
|
"grad_norm": 3.6329553320121333, |
|
"learning_rate": 2.9526592997055483e-07, |
|
"loss": 0.7405, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.9289340101522843, |
|
"grad_norm": 3.625112558207132, |
|
"learning_rate": 2.8711035421746363e-07, |
|
"loss": 0.7262, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.94585448392555, |
|
"grad_norm": 3.593092005119921, |
|
"learning_rate": 2.7902338516338674e-07, |
|
"loss": 0.7321, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.9627749576988156, |
|
"grad_norm": 3.8305447044984087, |
|
"learning_rate": 2.7100762894846627e-07, |
|
"loss": 0.7318, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.9796954314720812, |
|
"grad_norm": 3.7348260486234075, |
|
"learning_rate": 2.6306566876350067e-07, |
|
"loss": 0.7254, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.996615905245347, |
|
"grad_norm": 3.575785573364731, |
|
"learning_rate": 2.5520006401747395e-07, |
|
"loss": 0.7506, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.0135363790186127, |
|
"grad_norm": 3.91770848066435, |
|
"learning_rate": 2.474133495127494e-07, |
|
"loss": 0.7056, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.030456852791878, |
|
"grad_norm": 3.5555997135824184, |
|
"learning_rate": 2.3970803462819583e-07, |
|
"loss": 0.7075, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.030456852791878, |
|
"eval_loss": 0.7871306538581848, |
|
"eval_runtime": 146.0491, |
|
"eval_samples_per_second": 57.515, |
|
"eval_steps_per_second": 0.904, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.047377326565144, |
|
"grad_norm": 3.7633202447201377, |
|
"learning_rate": 2.3208660251050156e-07, |
|
"loss": 0.6935, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.0642978003384096, |
|
"grad_norm": 3.671976347169534, |
|
"learning_rate": 2.2455150927394878e-07, |
|
"loss": 0.6892, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.081218274111675, |
|
"grad_norm": 3.7546761949911165, |
|
"learning_rate": 2.1710518320889276e-07, |
|
"loss": 0.6762, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.0981387478849407, |
|
"grad_norm": 3.6517301083486684, |
|
"learning_rate": 2.097500239992132e-07, |
|
"loss": 0.6789, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.1150592216582065, |
|
"grad_norm": 3.8614591719892286, |
|
"learning_rate": 2.0248840194898155e-07, |
|
"loss": 0.6782, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.1319796954314723, |
|
"grad_norm": 3.7437837099098137, |
|
"learning_rate": 1.9532265721859597e-07, |
|
"loss": 0.6694, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.1489001692047376, |
|
"grad_norm": 3.7205556217713482, |
|
"learning_rate": 1.8825509907063326e-07, |
|
"loss": 0.6848, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.1658206429780034, |
|
"grad_norm": 3.957502578909951, |
|
"learning_rate": 1.812880051256551e-07, |
|
"loss": 0.6742, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.182741116751269, |
|
"grad_norm": 3.764749715334981, |
|
"learning_rate": 1.744236206282132e-07, |
|
"loss": 0.6949, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.199661590524535, |
|
"grad_norm": 3.6886209727019326, |
|
"learning_rate": 1.6766415772328728e-07, |
|
"loss": 0.6818, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.199661590524535, |
|
"eval_loss": 0.7874469757080078, |
|
"eval_runtime": 146.2957, |
|
"eval_samples_per_second": 57.418, |
|
"eval_steps_per_second": 0.902, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.2165820642978002, |
|
"grad_norm": 3.8767093338924434, |
|
"learning_rate": 1.6101179474338966e-07, |
|
"loss": 0.698, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.233502538071066, |
|
"grad_norm": 3.9008502666439955, |
|
"learning_rate": 1.5446867550656767e-07, |
|
"loss": 0.6951, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.250423011844332, |
|
"grad_norm": 3.600044317004702, |
|
"learning_rate": 1.4803690862552753e-07, |
|
"loss": 0.6736, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.267343485617597, |
|
"grad_norm": 3.8954074320863623, |
|
"learning_rate": 1.4171856682810384e-07, |
|
"loss": 0.6796, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.284263959390863, |
|
"grad_norm": 4.011977846573959, |
|
"learning_rate": 1.3551568628929432e-07, |
|
"loss": 0.673, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.3011844331641287, |
|
"grad_norm": 3.823282280716888, |
|
"learning_rate": 1.2943026597507267e-07, |
|
"loss": 0.6729, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.3181049069373945, |
|
"grad_norm": 3.992535275167548, |
|
"learning_rate": 1.2346426699819456e-07, |
|
"loss": 0.6919, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.33502538071066, |
|
"grad_norm": 3.7501814671126255, |
|
"learning_rate": 1.176196119862008e-07, |
|
"loss": 0.6808, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.3519458544839256, |
|
"grad_norm": 3.886306791969186, |
|
"learning_rate": 1.1189818446182358e-07, |
|
"loss": 0.6817, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.3688663282571913, |
|
"grad_norm": 3.795337541416798, |
|
"learning_rate": 1.0630182823599399e-07, |
|
"loss": 0.6747, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.3688663282571913, |
|
"eval_loss": 0.7868276834487915, |
|
"eval_runtime": 146.1359, |
|
"eval_samples_per_second": 57.481, |
|
"eval_steps_per_second": 0.903, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.3857868020304567, |
|
"grad_norm": 3.9750084710216482, |
|
"learning_rate": 1.0083234681364932e-07, |
|
"loss": 0.6816, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.4027072758037225, |
|
"grad_norm": 3.8888332091113544, |
|
"learning_rate": 9.549150281252632e-08, |
|
"loss": 0.6798, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.4196277495769882, |
|
"grad_norm": 3.8260776584323786, |
|
"learning_rate": 9.028101739513405e-08, |
|
"loss": 0.6767, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.436548223350254, |
|
"grad_norm": 3.9871840761205415, |
|
"learning_rate": 8.520256971408452e-08, |
|
"loss": 0.6864, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.4534686971235193, |
|
"grad_norm": 3.689159095170659, |
|
"learning_rate": 8.025779637096137e-08, |
|
"loss": 0.6879, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.470389170896785, |
|
"grad_norm": 4.029588500065508, |
|
"learning_rate": 7.544829088890325e-08, |
|
"loss": 0.6955, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.487309644670051, |
|
"grad_norm": 3.911239676026825, |
|
"learning_rate": 7.077560319906694e-08, |
|
"loss": 0.68, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.504230118443316, |
|
"grad_norm": 3.9088480475319742, |
|
"learning_rate": 6.624123914114122e-08, |
|
"loss": 0.6914, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.521150592216582, |
|
"grad_norm": 3.8604080998201677, |
|
"learning_rate": 6.184665997806831e-08, |
|
"loss": 0.6944, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.5380710659898478, |
|
"grad_norm": 3.7419532051364546, |
|
"learning_rate": 5.759328192513074e-08, |
|
"loss": 0.6849, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.5380710659898478, |
|
"eval_loss": 0.7860944271087646, |
|
"eval_runtime": 146.2056, |
|
"eval_samples_per_second": 57.453, |
|
"eval_steps_per_second": 0.903, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.5549915397631136, |
|
"grad_norm": 3.8511488857625675, |
|
"learning_rate": 5.348247569355735e-08, |
|
"loss": 0.6915, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.571912013536379, |
|
"grad_norm": 3.820407730040828, |
|
"learning_rate": 4.951556604879048e-08, |
|
"loss": 0.6753, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.5888324873096447, |
|
"grad_norm": 4.10704692193588, |
|
"learning_rate": 4.569383138356275e-08, |
|
"loss": 0.6839, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.6057529610829104, |
|
"grad_norm": 3.8660705841934315, |
|
"learning_rate": 4.201850330591677e-08, |
|
"loss": 0.6748, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.6226734348561758, |
|
"grad_norm": 3.7908753446624086, |
|
"learning_rate": 3.8490766242301353e-08, |
|
"loss": 0.6832, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.6395939086294415, |
|
"grad_norm": 3.8661392064625137, |
|
"learning_rate": 3.5111757055874326e-08, |
|
"loss": 0.6902, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.6565143824027073, |
|
"grad_norm": 3.7157772134571623, |
|
"learning_rate": 3.188256468013139e-08, |
|
"loss": 0.6742, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.673434856175973, |
|
"grad_norm": 3.6900269313746743, |
|
"learning_rate": 2.8804229767982636e-08, |
|
"loss": 0.6769, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.6903553299492384, |
|
"grad_norm": 3.8096781294038538, |
|
"learning_rate": 2.587774435638679e-08, |
|
"loss": 0.6962, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.707275803722504, |
|
"grad_norm": 3.790546594792546, |
|
"learning_rate": 2.3104051546654013e-08, |
|
"loss": 0.6922, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.707275803722504, |
|
"eval_loss": 0.7856406569480896, |
|
"eval_runtime": 146.2171, |
|
"eval_samples_per_second": 57.449, |
|
"eval_steps_per_second": 0.903, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.72419627749577, |
|
"grad_norm": 3.8429877767556664, |
|
"learning_rate": 2.048404520051722e-08, |
|
"loss": 0.6888, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 2.7411167512690353, |
|
"grad_norm": 3.6312764917444165, |
|
"learning_rate": 1.8018569652073378e-08, |
|
"loss": 0.6814, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.758037225042301, |
|
"grad_norm": 3.787975571643184, |
|
"learning_rate": 1.570841943568446e-08, |
|
"loss": 0.6807, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 2.774957698815567, |
|
"grad_norm": 3.9473751919547055, |
|
"learning_rate": 1.3554339029927531e-08, |
|
"loss": 0.6719, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.7918781725888326, |
|
"grad_norm": 4.009002167477452, |
|
"learning_rate": 1.1557022617676216e-08, |
|
"loss": 0.6869, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.808798646362098, |
|
"grad_norm": 3.8014391891380934, |
|
"learning_rate": 9.717113862389992e-09, |
|
"loss": 0.6805, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.8257191201353637, |
|
"grad_norm": 3.6206775186302282, |
|
"learning_rate": 8.035205700685165e-09, |
|
"loss": 0.6712, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 2.8426395939086295, |
|
"grad_norm": 3.7790956293706195, |
|
"learning_rate": 6.511840151252168e-09, |
|
"loss": 0.6857, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.859560067681895, |
|
"grad_norm": 3.946872462701907, |
|
"learning_rate": 5.147508140182555e-09, |
|
"loss": 0.6891, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.8764805414551606, |
|
"grad_norm": 4.188561175850185, |
|
"learning_rate": 3.9426493427611175e-09, |
|
"loss": 0.6948, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.8764805414551606, |
|
"eval_loss": 0.7854181528091431, |
|
"eval_runtime": 146.4648, |
|
"eval_samples_per_second": 57.352, |
|
"eval_steps_per_second": 0.901, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.8934010152284264, |
|
"grad_norm": 3.9493971658293505, |
|
"learning_rate": 2.897652041774279e-09, |
|
"loss": 0.6828, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.910321489001692, |
|
"grad_norm": 3.921042083811142, |
|
"learning_rate": 2.0128530023804656e-09, |
|
"loss": 0.6912, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.927241962774958, |
|
"grad_norm": 3.765705430002199, |
|
"learning_rate": 1.2885373635829754e-09, |
|
"loss": 0.6955, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 2.9441624365482233, |
|
"grad_norm": 3.7974861126821366, |
|
"learning_rate": 7.249385463395374e-10, |
|
"loss": 0.691, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.961082910321489, |
|
"grad_norm": 3.7353656428622743, |
|
"learning_rate": 3.22238178339318e-10, |
|
"loss": 0.6855, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.9780033840947544, |
|
"grad_norm": 3.7079032515430557, |
|
"learning_rate": 8.056603547090812e-11, |
|
"loss": 0.692, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.99492385786802, |
|
"grad_norm": 3.6976604610814037, |
|
"learning_rate": 0.0, |
|
"loss": 0.6867, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 2.99492385786802, |
|
"step": 885, |
|
"total_flos": 5218127163949056.0, |
|
"train_loss": 0.7771235016106213, |
|
"train_runtime": 14098.017, |
|
"train_samples_per_second": 16.087, |
|
"train_steps_per_second": 0.063 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 885, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5218127163949056.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|