{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 19.973544973544975, "eval_steps": 500, "global_step": 460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.21164021164021163, "grad_norm": 0.01096427347511053, "learning_rate": 5.4347826086956525e-06, "loss": 0.3143, "step": 5 }, { "epoch": 0.42328042328042326, "grad_norm": 0.013951408676803112, "learning_rate": 1.0869565217391305e-05, "loss": 0.3576, "step": 10 }, { "epoch": 0.6349206349206349, "grad_norm": 0.01673784852027893, "learning_rate": 1.630434782608696e-05, "loss": 0.3639, "step": 15 }, { "epoch": 0.8465608465608465, "grad_norm": 0.019934486597776413, "learning_rate": 2.173913043478261e-05, "loss": 0.3458, "step": 20 }, { "epoch": 1.0846560846560847, "grad_norm": 0.02395496889948845, "learning_rate": 2.7173913043478262e-05, "loss": 0.4603, "step": 25 }, { "epoch": 1.2962962962962963, "grad_norm": 0.017271244898438454, "learning_rate": 3.260869565217392e-05, "loss": 0.3034, "step": 30 }, { "epoch": 1.507936507936508, "grad_norm": 0.023819392547011375, "learning_rate": 3.804347826086957e-05, "loss": 0.3317, "step": 35 }, { "epoch": 1.7195767195767195, "grad_norm": 0.021737879142165184, "learning_rate": 4.347826086956522e-05, "loss": 0.3382, "step": 40 }, { "epoch": 1.9312169312169312, "grad_norm": 0.023687848821282387, "learning_rate": 4.891304347826087e-05, "loss": 0.3288, "step": 45 }, { "epoch": 2.1693121693121693, "grad_norm": 0.024081828072667122, "learning_rate": 4.9988484157560136e-05, "loss": 0.3521, "step": 50 }, { "epoch": 2.380952380952381, "grad_norm": 0.03236062824726105, "learning_rate": 4.994171922976348e-05, "loss": 0.3485, "step": 55 }, { "epoch": 2.5925925925925926, "grad_norm": 0.03175266832113266, "learning_rate": 4.9859052738933966e-05, "loss": 0.2979, "step": 60 }, { "epoch": 2.804232804232804, "grad_norm": 0.030012985691428185, "learning_rate": 4.974060367671783e-05, "loss": 0.3055, "step": 65 }, { "epoch": 3.0423280423280423, "grad_norm": 0.10118062049150467, "learning_rate": 4.958654254084355e-05, "loss": 0.3521, "step": 70 }, { "epoch": 3.253968253968254, "grad_norm": 0.02641722746193409, "learning_rate": 4.9397091089704364e-05, "loss": 0.3041, "step": 75 }, { "epoch": 3.4656084656084656, "grad_norm": 0.03561725839972496, "learning_rate": 4.9172522023155154e-05, "loss": 0.2778, "step": 80 }, { "epoch": 3.677248677248677, "grad_norm": 0.04121287539601326, "learning_rate": 4.8913158589983374e-05, "loss": 0.2698, "step": 85 }, { "epoch": 3.888888888888889, "grad_norm": 0.04533043131232262, "learning_rate": 4.8619374122618854e-05, "loss": 0.2704, "step": 90 }, { "epoch": 4.1269841269841265, "grad_norm": 0.04685904085636139, "learning_rate": 4.8291591499752365e-05, "loss": 0.3338, "step": 95 }, { "epoch": 4.338624338624339, "grad_norm": 0.06093249469995499, "learning_rate": 4.793028253763633e-05, "loss": 0.2794, "step": 100 }, { "epoch": 4.550264550264551, "grad_norm": 0.060147590935230255, "learning_rate": 4.7535967310943955e-05, "loss": 0.2724, "step": 105 }, { "epoch": 4.761904761904762, "grad_norm": 0.05850926414132118, "learning_rate": 4.710921340416431e-05, "loss": 0.2547, "step": 110 }, { "epoch": 4.973544973544973, "grad_norm": 0.0650046169757843, "learning_rate": 4.665063509461097e-05, "loss": 0.241, "step": 115 }, { "epoch": 5.211640211640212, "grad_norm": 0.07884380221366882, "learning_rate": 4.616089246822003e-05, "loss": 0.291, "step": 120 }, { "epoch": 5.423280423280423, "grad_norm": 
0.07263286411762238, "learning_rate": 4.564069046941049e-05, "loss": 0.256, "step": 125 }, { "epoch": 5.634920634920634, "grad_norm": 0.08540436625480652, "learning_rate": 4.509077788637446e-05, "loss": 0.2362, "step": 130 }, { "epoch": 5.8465608465608465, "grad_norm": 0.07003481686115265, "learning_rate": 4.4511946273257846e-05, "loss": 0.1973, "step": 135 }, { "epoch": 6.084656084656085, "grad_norm": 0.08097032457590103, "learning_rate": 4.390502881078296e-05, "loss": 0.3011, "step": 140 }, { "epoch": 6.296296296296296, "grad_norm": 0.07847806811332703, "learning_rate": 4.3270899106953105e-05, "loss": 0.2066, "step": 145 }, { "epoch": 6.507936507936508, "grad_norm": 0.10350590199232101, "learning_rate": 4.261046993956531e-05, "loss": 0.2236, "step": 150 }, { "epoch": 6.71957671957672, "grad_norm": 0.11323926597833633, "learning_rate": 4.192469194234148e-05, "loss": 0.2215, "step": 155 }, { "epoch": 6.931216931216931, "grad_norm": 0.09281644225120544, "learning_rate": 4.12145522365689e-05, "loss": 0.2251, "step": 160 }, { "epoch": 7.169312169312169, "grad_norm": 0.09772001951932907, "learning_rate": 4.048107301022005e-05, "loss": 0.2554, "step": 165 }, { "epoch": 7.380952380952381, "grad_norm": 0.10654748976230621, "learning_rate": 3.9725310046596595e-05, "loss": 0.2058, "step": 170 }, { "epoch": 7.592592592592593, "grad_norm": 0.08744902163743973, "learning_rate": 3.894835120461584e-05, "loss": 0.1731, "step": 175 }, { "epoch": 7.804232804232804, "grad_norm": 0.11190956830978394, "learning_rate": 3.815131485292678e-05, "loss": 0.1955, "step": 180 }, { "epoch": 8.042328042328043, "grad_norm": 0.3187606632709503, "learning_rate": 3.733534826011008e-05, "loss": 0.1913, "step": 185 }, { "epoch": 8.253968253968253, "grad_norm": 0.12188898772001266, "learning_rate": 3.6501625943278805e-05, "loss": 0.1777, "step": 190 }, { "epoch": 8.465608465608465, "grad_norm": 0.09240734577178955, "learning_rate": 3.5651347977457214e-05, "loss": 0.1776, "step": 195 }, { "epoch": 8.677248677248677, "grad_norm": 0.12097247689962387, "learning_rate": 3.478573826817099e-05, "loss": 0.1812, "step": 200 }, { "epoch": 8.88888888888889, "grad_norm": 0.14317023754119873, "learning_rate": 3.390604278973543e-05, "loss": 0.1636, "step": 205 }, { "epoch": 9.126984126984127, "grad_norm": 0.1255073994398117, "learning_rate": 3.301352779177743e-05, "loss": 0.2084, "step": 210 }, { "epoch": 9.338624338624339, "grad_norm": 0.15191367268562317, "learning_rate": 3.21094779765728e-05, "loss": 0.159, "step": 215 }, { "epoch": 9.55026455026455, "grad_norm": 0.14779330790042877, "learning_rate": 3.11951946498225e-05, "loss": 0.1521, "step": 220 }, { "epoch": 9.761904761904763, "grad_norm": 0.14298541843891144, "learning_rate": 3.027199384752962e-05, "loss": 0.1485, "step": 225 }, { "epoch": 9.973544973544973, "grad_norm": 0.14848679304122925, "learning_rate": 2.9341204441673266e-05, "loss": 0.1488, "step": 230 }, { "epoch": 10.211640211640212, "grad_norm": 0.15250274538993835, "learning_rate": 2.840416622740617e-05, "loss": 0.1604, "step": 235 }, { "epoch": 10.423280423280424, "grad_norm": 0.2765931487083435, "learning_rate": 2.7462227994529217e-05, "loss": 0.1455, "step": 240 }, { "epoch": 10.634920634920634, "grad_norm": 0.16372708976268768, "learning_rate": 2.6516745586018965e-05, "loss": 0.1236, "step": 245 }, { "epoch": 10.846560846560847, "grad_norm": 0.3377860486507416, "learning_rate": 2.556907994640264e-05, "loss": 0.1393, "step": 250 }, { "epoch": 11.084656084656086, "grad_norm": 0.1836676001548767, "learning_rate": 
2.4620595162789936e-05, "loss": 0.1473, "step": 255 }, { "epoch": 11.296296296296296, "grad_norm": 0.15243783593177795, "learning_rate": 2.3672656501381272e-05, "loss": 0.1366, "step": 260 }, { "epoch": 11.507936507936508, "grad_norm": 0.1481999307870865, "learning_rate": 2.2726628442278826e-05, "loss": 0.1039, "step": 265 }, { "epoch": 11.71957671957672, "grad_norm": 0.17082872986793518, "learning_rate": 2.1783872715429228e-05, "loss": 0.1196, "step": 270 }, { "epoch": 11.93121693121693, "grad_norm": 0.16833926737308502, "learning_rate": 2.084574634052465e-05, "loss": 0.1245, "step": 275 }, { "epoch": 12.16931216931217, "grad_norm": 0.18557614088058472, "learning_rate": 1.991359967368416e-05, "loss": 0.1174, "step": 280 }, { "epoch": 12.380952380952381, "grad_norm": 0.13334429264068604, "learning_rate": 1.8988774463726543e-05, "loss": 0.0977, "step": 285 }, { "epoch": 12.592592592592592, "grad_norm": 0.18030914664268494, "learning_rate": 1.8072601920832786e-05, "loss": 0.1147, "step": 290 }, { "epoch": 12.804232804232804, "grad_norm": 0.19089557230472565, "learning_rate": 1.7166400800377948e-05, "loss": 0.1132, "step": 295 }, { "epoch": 13.042328042328043, "grad_norm": 0.3375261127948761, "learning_rate": 1.6271475504690792e-05, "loss": 0.1448, "step": 300 }, { "epoch": 13.253968253968253, "grad_norm": 0.1948479562997818, "learning_rate": 1.5389114205473377e-05, "loss": 0.1119, "step": 305 }, { "epoch": 13.465608465608465, "grad_norm": 0.1664939522743225, "learning_rate": 1.4520586989583406e-05, "loss": 0.0932, "step": 310 }, { "epoch": 13.677248677248677, "grad_norm": 0.16317923367023468, "learning_rate": 1.3667144030848073e-05, "loss": 0.1007, "step": 315 }, { "epoch": 13.88888888888889, "grad_norm": 0.18476559221744537, "learning_rate": 1.2830013790541279e-05, "loss": 0.105, "step": 320 }, { "epoch": 14.126984126984127, "grad_norm": 0.13438890874385834, "learning_rate": 1.2010401249114167e-05, "loss": 0.1177, "step": 325 }, { "epoch": 14.338624338624339, "grad_norm": 0.18302133679389954, "learning_rate": 1.120948617172432e-05, "loss": 0.0864, "step": 330 }, { "epoch": 14.55026455026455, "grad_norm": 0.18818943202495575, "learning_rate": 1.0428421410060541e-05, "loss": 0.0878, "step": 335 }, { "epoch": 14.761904761904763, "grad_norm": 0.17307747900485992, "learning_rate": 9.668331242907089e-06, "loss": 0.0895, "step": 340 }, { "epoch": 14.973544973544973, "grad_norm": 0.2092498242855072, "learning_rate": 8.930309757836517e-06, "loss": 0.0918, "step": 345 }, { "epoch": 15.211640211640212, "grad_norm": 0.17772510647773743, "learning_rate": 8.215419276360084e-06, "loss": 0.1119, "step": 350 }, { "epoch": 15.423280423280424, "grad_norm": 0.2180255800485611, "learning_rate": 7.524688824802953e-06, "loss": 0.0934, "step": 355 }, { "epoch": 15.634920634920634, "grad_norm": 0.17044204473495483, "learning_rate": 6.859112653105024e-06, "loss": 0.087, "step": 360 }, { "epoch": 15.846560846560847, "grad_norm": 0.16892142593860626, "learning_rate": 6.219648803679559e-06, "loss": 0.0802, "step": 365 }, { "epoch": 16.084656084656086, "grad_norm": 0.1476411372423172, "learning_rate": 5.607217732389503e-06, "loss": 0.0958, "step": 370 }, { "epoch": 16.296296296296298, "grad_norm": 0.16109658777713776, "learning_rate": 5.022700983626691e-06, "loss": 0.0922, "step": 375 }, { "epoch": 16.507936507936506, "grad_norm": 0.17337115108966827, "learning_rate": 4.4669399214007785e-06, "loss": 0.0957, "step": 380 }, { "epoch": 16.719576719576718, "grad_norm": 0.15418995916843414, "learning_rate": 
3.940734518264713e-06, "loss": 0.0814, "step": 385 }, { "epoch": 16.93121693121693, "grad_norm": 0.1691560000181198, "learning_rate": 3.444842203819662e-06, "loss": 0.0743, "step": 390 }, { "epoch": 17.16931216931217, "grad_norm": 0.18097734451293945, "learning_rate": 2.9799767744571632e-06, "loss": 0.0885, "step": 395 }, { "epoch": 17.38095238095238, "grad_norm": 0.1728585809469223, "learning_rate": 2.5468073659076e-06, "loss": 0.0893, "step": 400 }, { "epoch": 17.59259259259259, "grad_norm": 0.13586170971393585, "learning_rate": 2.14595749007413e-06, "loss": 0.0698, "step": 405 }, { "epoch": 17.804232804232804, "grad_norm": 0.16407901048660278, "learning_rate": 1.778004137538325e-06, "loss": 0.0843, "step": 410 }, { "epoch": 18.04232804232804, "grad_norm": 0.4242843985557556, "learning_rate": 1.44347694702949e-06, "loss": 0.109, "step": 415 }, { "epoch": 18.253968253968253, "grad_norm": 0.14408718049526215, "learning_rate": 1.1428574430530276e-06, "loss": 0.0772, "step": 420 }, { "epoch": 18.465608465608465, "grad_norm": 0.14139463007450104, "learning_rate": 8.765783427753721e-07, "loss": 0.0828, "step": 425 }, { "epoch": 18.677248677248677, "grad_norm": 0.18470263481140137, "learning_rate": 6.450229331630253e-07, "loss": 0.0733, "step": 430 }, { "epoch": 18.88888888888889, "grad_norm": 0.15952105820178986, "learning_rate": 4.4852451927235304e-07, "loss": 0.0775, "step": 435 }, { "epoch": 19.126984126984127, "grad_norm": 0.14280001819133759, "learning_rate": 2.8736594448424415e-07, "loss": 0.1054, "step": 440 }, { "epoch": 19.33862433862434, "grad_norm": 0.1590239554643631, "learning_rate": 1.6177918337422216e-07, "loss": 0.0832, "step": 445 }, { "epoch": 19.55026455026455, "grad_norm": 0.14798839390277863, "learning_rate": 7.194500780401958e-08, "loss": 0.0858, "step": 450 }, { "epoch": 19.761904761904763, "grad_norm": 0.15491671860218048, "learning_rate": 1.7992726715299058e-08, "loss": 0.0814, "step": 455 }, { "epoch": 19.973544973544975, "grad_norm": 0.15275807678699493, "learning_rate": 0.0, "loss": 0.0712, "step": 460 }, { "epoch": 19.973544973544975, "step": 460, "total_flos": 2.793725194059907e+18, "train_loss": 0.18111413237841234, "train_runtime": 12414.497, "train_samples_per_second": 1.218, "train_steps_per_second": 0.037 } ], "logging_steps": 5, "max_steps": 460, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 23, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.793725194059907e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }
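A minimal sketch, assuming the JSON above is saved as trainer_state.json, for reading its log_history and pulling out the per-step training loss; the file name and the plain print-out are assumptions for illustration, not part of the trainer state itself:

import json

# Load the trainer state written out by the Hugging Face Trainer
# (the path "trainer_state.json" is an assumption).
with open("trainer_state.json") as f:
    state = json.load(f)

# Each periodic entry in log_history carries a "loss" key; the final summary
# entry (train_loss, train_runtime, ...) does not, so it is filtered out here.
loss_curve = [(entry["step"], entry["loss"])
              for entry in state["log_history"] if "loss" in entry]

print(f"logged points: {len(loss_curve)}")   # 92 points at logging_steps=5, max_steps=460
print(f"first: step {loss_curve[0][0]}, loss {loss_curve[0][1]}")
print(f"last:  step {loss_curve[-1][0]}, loss {loss_curve[-1][1]}")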