|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.984771573604061, |
|
"eval_steps": 50, |
|
"global_step": 441, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0338409475465313, |
|
"grad_norm": 19.362921643157705, |
|
"learning_rate": 5e-07, |
|
"loss": 1.7543, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0676818950930626, |
|
"grad_norm": 12.440471039796169, |
|
"learning_rate": 1e-06, |
|
"loss": 1.5921, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10152284263959391, |
|
"grad_norm": 7.151840796848598, |
|
"learning_rate": 9.996679701338661e-07, |
|
"loss": 1.2895, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1353637901861252, |
|
"grad_norm": 4.254605069904108, |
|
"learning_rate": 9.986723215107924e-07, |
|
"loss": 1.135, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1692047377326565, |
|
"grad_norm": 3.6917219941972412, |
|
"learning_rate": 9.97014376471095e-07, |
|
"loss": 1.0651, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.20304568527918782, |
|
"grad_norm": 3.7987347848198634, |
|
"learning_rate": 9.946963369638524e-07, |
|
"loss": 1.0038, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23688663282571912, |
|
"grad_norm": 3.637845138860867, |
|
"learning_rate": 9.917212816224536e-07, |
|
"loss": 0.9911, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2707275803722504, |
|
"grad_norm": 3.3460567231716873, |
|
"learning_rate": 9.880931616758056e-07, |
|
"loss": 0.9617, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.30456852791878175, |
|
"grad_norm": 3.377056006196627, |
|
"learning_rate": 9.838167957006293e-07, |
|
"loss": 0.955, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.338409475465313, |
|
"grad_norm": 3.3982403942088624, |
|
"learning_rate": 9.788978632218138e-07, |
|
"loss": 0.9458, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.338409475465313, |
|
"eval_loss": 0.9426366686820984, |
|
"eval_runtime": 73.0561, |
|
"eval_samples_per_second": 57.49, |
|
"eval_steps_per_second": 0.903, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.37225042301184436, |
|
"grad_norm": 3.689104664245426, |
|
"learning_rate": 9.73342897169329e-07, |
|
"loss": 0.943, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.40609137055837563, |
|
"grad_norm": 3.395123607137787, |
|
"learning_rate": 9.671592752017137e-07, |
|
"loss": 0.9093, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.43993231810490696, |
|
"grad_norm": 3.357446602126202, |
|
"learning_rate": 9.603552099076648e-07, |
|
"loss": 0.9195, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.47377326565143824, |
|
"grad_norm": 3.691462819097496, |
|
"learning_rate": 9.52939737898737e-07, |
|
"loss": 0.886, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5076142131979695, |
|
"grad_norm": 3.5969341059847415, |
|
"learning_rate": 9.449227078076443e-07, |
|
"loss": 0.9185, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5414551607445008, |
|
"grad_norm": 3.558555135432629, |
|
"learning_rate": 9.363147672080985e-07, |
|
"loss": 0.8983, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5752961082910322, |
|
"grad_norm": 3.6252630464826523, |
|
"learning_rate": 9.271273484735592e-07, |
|
"loss": 0.8843, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6091370558375635, |
|
"grad_norm": 3.3121290866804913, |
|
"learning_rate": 9.173726535936766e-07, |
|
"loss": 0.8844, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6429780033840947, |
|
"grad_norm": 3.580257416417439, |
|
"learning_rate": 9.070636379685915e-07, |
|
"loss": 0.8994, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.676818950930626, |
|
"grad_norm": 3.629629801061202, |
|
"learning_rate": 8.962139932026156e-07, |
|
"loss": 0.8779, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.676818950930626, |
|
"eval_loss": 0.887722373008728, |
|
"eval_runtime": 72.6917, |
|
"eval_samples_per_second": 57.778, |
|
"eval_steps_per_second": 0.908, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7106598984771574, |
|
"grad_norm": 3.3980001711357466, |
|
"learning_rate": 8.848381289201459e-07, |
|
"loss": 0.864, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7445008460236887, |
|
"grad_norm": 3.37541287934786, |
|
"learning_rate": 8.72951153627962e-07, |
|
"loss": 0.8795, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7783417935702199, |
|
"grad_norm": 5.903555404027213, |
|
"learning_rate": 8.605688546493238e-07, |
|
"loss": 0.8657, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8121827411167513, |
|
"grad_norm": 3.3534687236406615, |
|
"learning_rate": 8.477076771565202e-07, |
|
"loss": 0.8627, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8460236886632826, |
|
"grad_norm": 3.472402321033115, |
|
"learning_rate": 8.343847023297169e-07, |
|
"loss": 0.8695, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8798646362098139, |
|
"grad_norm": 3.5485673752671865, |
|
"learning_rate": 8.206176246711065e-07, |
|
"loss": 0.8429, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9137055837563451, |
|
"grad_norm": 3.4858020678381054, |
|
"learning_rate": 8.064247285044972e-07, |
|
"loss": 0.8603, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.9475465313028765, |
|
"grad_norm": 3.7500528392849324, |
|
"learning_rate": 7.918248636915459e-07, |
|
"loss": 0.8622, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9813874788494078, |
|
"grad_norm": 3.409665848861188, |
|
"learning_rate": 7.768374205968906e-07, |
|
"loss": 0.8473, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.015228426395939, |
|
"grad_norm": 3.6326130834418944, |
|
"learning_rate": 7.614823043354285e-07, |
|
"loss": 0.82, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.015228426395939, |
|
"eval_loss": 0.8625058531761169, |
|
"eval_runtime": 72.7377, |
|
"eval_samples_per_second": 57.742, |
|
"eval_steps_per_second": 0.907, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0490693739424704, |
|
"grad_norm": 3.6224664863477494, |
|
"learning_rate": 7.457799083359471e-07, |
|
"loss": 0.823, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.0829103214890017, |
|
"grad_norm": 3.392243863011647, |
|
"learning_rate": 7.297510872562131e-07, |
|
"loss": 0.7903, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.116751269035533, |
|
"grad_norm": 3.6101258998571875, |
|
"learning_rate": 7.134171292854955e-07, |
|
"loss": 0.7844, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.1505922165820643, |
|
"grad_norm": 3.4039338742997685, |
|
"learning_rate": 6.967997278713089e-07, |
|
"loss": 0.7888, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.1844331641285957, |
|
"grad_norm": 3.3872407965058384, |
|
"learning_rate": 6.79920952907921e-07, |
|
"loss": 0.7968, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.218274111675127, |
|
"grad_norm": 3.5554493985038684, |
|
"learning_rate": 6.628032214248982e-07, |
|
"loss": 0.7823, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.252115059221658, |
|
"grad_norm": 3.655696778331916, |
|
"learning_rate": 6.454692678146119e-07, |
|
"loss": 0.7848, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.2859560067681894, |
|
"grad_norm": 3.459528935552729, |
|
"learning_rate": 6.279421136382494e-07, |
|
"loss": 0.7683, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.3197969543147208, |
|
"grad_norm": 3.506826792292506, |
|
"learning_rate": 6.102450370504299e-07, |
|
"loss": 0.7776, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.353637901861252, |
|
"grad_norm": 3.53257152623125, |
|
"learning_rate": 5.924015418830354e-07, |
|
"loss": 0.7763, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.353637901861252, |
|
"eval_loss": 0.8517683148384094, |
|
"eval_runtime": 72.7613, |
|
"eval_samples_per_second": 57.723, |
|
"eval_steps_per_second": 0.907, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.3874788494077834, |
|
"grad_norm": 3.4808019080846138, |
|
"learning_rate": 5.74435326429313e-07, |
|
"loss": 0.7586, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.4213197969543148, |
|
"grad_norm": 3.4920276219604696, |
|
"learning_rate": 5.563702519697108e-07, |
|
"loss": 0.7693, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.455160744500846, |
|
"grad_norm": 3.563387899492764, |
|
"learning_rate": 5.382303110812466e-07, |
|
"loss": 0.7784, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.4890016920473772, |
|
"grad_norm": 3.3319886045415914, |
|
"learning_rate": 5.200395957725005e-07, |
|
"loss": 0.7789, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.5228426395939088, |
|
"grad_norm": 3.824253256636034, |
|
"learning_rate": 5.018222654865471e-07, |
|
"loss": 0.7692, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.5566835871404399, |
|
"grad_norm": 3.49580497047461, |
|
"learning_rate": 4.836025150143318e-07, |
|
"loss": 0.7954, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.5905245346869712, |
|
"grad_norm": 3.726391320444098, |
|
"learning_rate": 4.654045423610952e-07, |
|
"loss": 0.7882, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.6243654822335025, |
|
"grad_norm": 3.940367355899551, |
|
"learning_rate": 4.4725251660853357e-07, |
|
"loss": 0.7782, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.6582064297800339, |
|
"grad_norm": 3.603115659587366, |
|
"learning_rate": 4.2917054581536926e-07, |
|
"loss": 0.7709, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.6920473773265652, |
|
"grad_norm": 3.5695746889936726, |
|
"learning_rate": 4.1118264499897003e-07, |
|
"loss": 0.7719, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6920473773265652, |
|
"eval_loss": 0.8421301245689392, |
|
"eval_runtime": 72.7217, |
|
"eval_samples_per_second": 57.754, |
|
"eval_steps_per_second": 0.908, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.7258883248730963, |
|
"grad_norm": 3.654464869264395, |
|
"learning_rate": 3.9331270424053616e-07, |
|
"loss": 0.7599, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.7597292724196278, |
|
"grad_norm": 3.669706546573186, |
|
"learning_rate": 3.755844569562191e-07, |
|
"loss": 0.7727, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.793570219966159, |
|
"grad_norm": 3.514197357489897, |
|
"learning_rate": 3.580214483763093e-07, |
|
"loss": 0.7709, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.8274111675126905, |
|
"grad_norm": 3.8443336447359244, |
|
"learning_rate": 3.406470042743574e-07, |
|
"loss": 0.782, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.8612521150592216, |
|
"grad_norm": 3.625482123749312, |
|
"learning_rate": 3.23484199987761e-07, |
|
"loss": 0.7793, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.895093062605753, |
|
"grad_norm": 3.5197175718189317, |
|
"learning_rate": 3.065558297709588e-07, |
|
"loss": 0.7623, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.9289340101522843, |
|
"grad_norm": 3.4330767306351935, |
|
"learning_rate": 2.898843765219388e-07, |
|
"loss": 0.7628, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.9627749576988156, |
|
"grad_norm": 3.822061440419418, |
|
"learning_rate": 2.7349198192226295e-07, |
|
"loss": 0.7584, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.996615905245347, |
|
"grad_norm": 3.553974834774039, |
|
"learning_rate": 2.574004170302696e-07, |
|
"loss": 0.7645, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.030456852791878, |
|
"grad_norm": 3.9479223535228574, |
|
"learning_rate": 2.4163105336650643e-07, |
|
"loss": 0.7347, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.030456852791878, |
|
"eval_loss": 0.8380723595619202, |
|
"eval_runtime": 72.7338, |
|
"eval_samples_per_second": 57.745, |
|
"eval_steps_per_second": 0.907, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.0642978003384096, |
|
"grad_norm": 3.7132773157030425, |
|
"learning_rate": 2.2620483452979887e-07, |
|
"loss": 0.7348, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.0981387478849407, |
|
"grad_norm": 3.6057007836329147, |
|
"learning_rate": 2.1114224838164806e-07, |
|
"loss": 0.7193, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.1319796954314723, |
|
"grad_norm": 3.6945593121525286, |
|
"learning_rate": 1.964632998359036e-07, |
|
"loss": 0.7286, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.1658206429780034, |
|
"grad_norm": 3.9031426698674663, |
|
"learning_rate": 1.8218748428984782e-07, |
|
"loss": 0.7024, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.199661590524535, |
|
"grad_norm": 3.4579222711663786, |
|
"learning_rate": 1.6833376173198005e-07, |
|
"loss": 0.7084, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.233502538071066, |
|
"grad_norm": 3.719715731546776, |
|
"learning_rate": 1.5492053156088498e-07, |
|
"loss": 0.7088, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.267343485617597, |
|
"grad_norm": 3.8269194060342446, |
|
"learning_rate": 1.4196560814863567e-07, |
|
"loss": 0.7244, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.3011844331641287, |
|
"grad_norm": 3.839398221167279, |
|
"learning_rate": 1.294861971811773e-07, |
|
"loss": 0.7261, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.33502538071066, |
|
"grad_norm": 3.6636890535123885, |
|
"learning_rate": 1.1749887280712161e-07, |
|
"loss": 0.7193, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.3688663282571913, |
|
"grad_norm": 3.9676568766055165, |
|
"learning_rate": 1.0601955562529774e-07, |
|
"loss": 0.7232, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.3688663282571913, |
|
"eval_loss": 0.84018474817276, |
|
"eval_runtime": 72.7538, |
|
"eval_samples_per_second": 57.729, |
|
"eval_steps_per_second": 0.907, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.4027072758037225, |
|
"grad_norm": 3.871757730003177, |
|
"learning_rate": 9.506349154029425e-08, |
|
"loss": 0.7072, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.436548223350254, |
|
"grad_norm": 3.709431977886577, |
|
"learning_rate": 8.46452315140772e-08, |
|
"loss": 0.724, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.470389170896785, |
|
"grad_norm": 3.778002504886439, |
|
"learning_rate": 7.477861224057403e-08, |
|
"loss": 0.726, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.504230118443316, |
|
"grad_norm": 4.034569033748593, |
|
"learning_rate": 6.547673776889095e-08, |
|
"loss": 0.7152, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.5380710659898478, |
|
"grad_norm": 3.6052144662756582, |
|
"learning_rate": 5.6751962099570396e-08, |
|
"loss": 0.7222, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.571912013536379, |
|
"grad_norm": 3.6412688810760887, |
|
"learning_rate": 4.861587277700274e-08, |
|
"loss": 0.7141, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.6057529610829104, |
|
"grad_norm": 3.788492541312578, |
|
"learning_rate": 4.107927549978235e-08, |
|
"loss": 0.7075, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.6395939086294415, |
|
"grad_norm": 3.7740858241104385, |
|
"learning_rate": 3.4152179769449396e-08, |
|
"loss": 0.7171, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.673434856175973, |
|
"grad_norm": 3.693130131986868, |
|
"learning_rate": 2.784378559667622e-08, |
|
"loss": 0.7198, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.707275803722504, |
|
"grad_norm": 3.627471625901726, |
|
"learning_rate": 2.2162471282553553e-08, |
|
"loss": 0.721, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.707275803722504, |
|
"eval_loss": 0.8391405940055847, |
|
"eval_runtime": 72.6846, |
|
"eval_samples_per_second": 57.784, |
|
"eval_steps_per_second": 0.908, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.7411167512690353, |
|
"grad_norm": 3.7944346677217715, |
|
"learning_rate": 1.7115782291206082e-08, |
|
"loss": 0.7091, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.774957698815567, |
|
"grad_norm": 3.684276269607114, |
|
"learning_rate": 1.2710421228514733e-08, |
|
"loss": 0.7247, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.808798646362098, |
|
"grad_norm": 3.551925911903825, |
|
"learning_rate": 8.952238940255153e-09, |
|
"loss": 0.7245, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.8426395939086295, |
|
"grad_norm": 3.549163935230389, |
|
"learning_rate": 5.846226741475557e-09, |
|
"loss": 0.7254, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.8764805414551606, |
|
"grad_norm": 3.809089531185781, |
|
"learning_rate": 3.3965097874343872e-09, |
|
"loss": 0.7037, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.910321489001692, |
|
"grad_norm": 3.5360521655033934, |
|
"learning_rate": 1.6063415949008618e-09, |
|
"loss": 0.7319, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.9441624365482233, |
|
"grad_norm": 3.8076879389100875, |
|
"learning_rate": 4.780997210962478e-10, |
|
"loss": 0.7199, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.9780033840947544, |
|
"grad_norm": 3.6857618291895236, |
|
"learning_rate": 1.328260601385356e-11, |
|
"loss": 0.7036, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.984771573604061, |
|
"step": 441, |
|
"total_flos": 2600101931384832.0, |
|
"train_loss": 0.8256486911081673, |
|
"train_runtime": 6391.2706, |
|
"train_samples_per_second": 17.743, |
|
"train_steps_per_second": 0.069 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 441, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2600101931384832.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|