{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 13.973544973544973,
  "eval_steps": 500,
  "global_step": 322,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.21164021164021163,
      "grad_norm": 0.01096427347511053,
      "learning_rate": 5.4347826086956525e-06,
      "loss": 0.3143,
      "step": 5
    },
    {
      "epoch": 0.42328042328042326,
      "grad_norm": 0.013951408676803112,
      "learning_rate": 1.0869565217391305e-05,
      "loss": 0.3576,
      "step": 10
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 0.01673784852027893,
      "learning_rate": 1.630434782608696e-05,
      "loss": 0.3639,
      "step": 15
    },
    {
      "epoch": 0.8465608465608465,
      "grad_norm": 0.019934486597776413,
      "learning_rate": 2.173913043478261e-05,
      "loss": 0.3458,
      "step": 20
    },
    {
      "epoch": 1.0846560846560847,
      "grad_norm": 0.02395496889948845,
      "learning_rate": 2.7173913043478262e-05,
      "loss": 0.4603,
      "step": 25
    },
    {
      "epoch": 1.2962962962962963,
      "grad_norm": 0.017271244898438454,
      "learning_rate": 3.260869565217392e-05,
      "loss": 0.3034,
      "step": 30
    },
    {
      "epoch": 1.507936507936508,
      "grad_norm": 0.023819392547011375,
      "learning_rate": 3.804347826086957e-05,
      "loss": 0.3317,
      "step": 35
    },
    {
      "epoch": 1.7195767195767195,
      "grad_norm": 0.021737879142165184,
      "learning_rate": 4.347826086956522e-05,
      "loss": 0.3382,
      "step": 40
    },
    {
      "epoch": 1.9312169312169312,
      "grad_norm": 0.023687848821282387,
      "learning_rate": 4.891304347826087e-05,
      "loss": 0.3288,
      "step": 45
    },
    {
      "epoch": 2.1693121693121693,
      "grad_norm": 0.024081828072667122,
      "learning_rate": 4.9988484157560136e-05,
      "loss": 0.3521,
      "step": 50
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.03236062824726105,
      "learning_rate": 4.994171922976348e-05,
      "loss": 0.3485,
      "step": 55
    },
    {
      "epoch": 2.5925925925925926,
      "grad_norm": 0.03175266832113266,
      "learning_rate": 4.9859052738933966e-05,
      "loss": 0.2979,
      "step": 60
    },
    {
      "epoch": 2.804232804232804,
      "grad_norm": 0.030012985691428185,
      "learning_rate": 4.974060367671783e-05,
      "loss": 0.3055,
      "step": 65
    },
    {
      "epoch": 3.0423280423280423,
      "grad_norm": 0.10118062049150467,
      "learning_rate": 4.958654254084355e-05,
      "loss": 0.3521,
      "step": 70
    },
    {
      "epoch": 3.253968253968254,
      "grad_norm": 0.02641722746193409,
      "learning_rate": 4.9397091089704364e-05,
      "loss": 0.3041,
      "step": 75
    },
    {
      "epoch": 3.4656084656084656,
      "grad_norm": 0.03561725839972496,
      "learning_rate": 4.9172522023155154e-05,
      "loss": 0.2778,
      "step": 80
    },
    {
      "epoch": 3.677248677248677,
      "grad_norm": 0.04121287539601326,
      "learning_rate": 4.8913158589983374e-05,
      "loss": 0.2698,
      "step": 85
    },
    {
      "epoch": 3.888888888888889,
      "grad_norm": 0.04533043131232262,
      "learning_rate": 4.8619374122618854e-05,
      "loss": 0.2704,
      "step": 90
    },
    {
      "epoch": 4.1269841269841265,
      "grad_norm": 0.04685904085636139,
      "learning_rate": 4.8291591499752365e-05,
      "loss": 0.3338,
      "step": 95
    },
    {
      "epoch": 4.338624338624339,
      "grad_norm": 0.06093249469995499,
      "learning_rate": 4.793028253763633e-05,
      "loss": 0.2794,
      "step": 100
    },
    {
      "epoch": 4.550264550264551,
      "grad_norm": 0.060147590935230255,
      "learning_rate": 4.7535967310943955e-05,
      "loss": 0.2724,
      "step": 105
    },
    {
      "epoch": 4.761904761904762,
      "grad_norm": 0.05850926414132118,
      "learning_rate": 4.710921340416431e-05,
      "loss": 0.2547,
      "step": 110
    },
    {
      "epoch": 4.973544973544973,
      "grad_norm": 0.0650046169757843,
      "learning_rate": 4.665063509461097e-05,
      "loss": 0.241,
      "step": 115
    },
    {
      "epoch": 5.211640211640212,
      "grad_norm": 0.07884380221366882,
      "learning_rate": 4.616089246822003e-05,
      "loss": 0.291,
      "step": 120
    },
    {
      "epoch": 5.423280423280423,
      "grad_norm": 0.07263286411762238,
      "learning_rate": 4.564069046941049e-05,
      "loss": 0.256,
      "step": 125
    },
    {
      "epoch": 5.634920634920634,
      "grad_norm": 0.08540436625480652,
      "learning_rate": 4.509077788637446e-05,
      "loss": 0.2362,
      "step": 130
    },
    {
      "epoch": 5.8465608465608465,
      "grad_norm": 0.07003481686115265,
      "learning_rate": 4.4511946273257846e-05,
      "loss": 0.1973,
      "step": 135
    },
    {
      "epoch": 6.084656084656085,
      "grad_norm": 0.08097032457590103,
      "learning_rate": 4.390502881078296e-05,
      "loss": 0.3011,
      "step": 140
    },
    {
      "epoch": 6.296296296296296,
      "grad_norm": 0.07847806811332703,
      "learning_rate": 4.3270899106953105e-05,
      "loss": 0.2066,
      "step": 145
    },
    {
      "epoch": 6.507936507936508,
      "grad_norm": 0.10350590199232101,
      "learning_rate": 4.261046993956531e-05,
      "loss": 0.2236,
      "step": 150
    },
    {
      "epoch": 6.71957671957672,
      "grad_norm": 0.11323926597833633,
      "learning_rate": 4.192469194234148e-05,
      "loss": 0.2215,
      "step": 155
    },
    {
      "epoch": 6.931216931216931,
      "grad_norm": 0.09281644225120544,
      "learning_rate": 4.12145522365689e-05,
      "loss": 0.2251,
      "step": 160
    },
    {
      "epoch": 7.169312169312169,
      "grad_norm": 0.09772001951932907,
      "learning_rate": 4.048107301022005e-05,
      "loss": 0.2554,
      "step": 165
    },
    {
      "epoch": 7.380952380952381,
      "grad_norm": 0.10654748976230621,
      "learning_rate": 3.9725310046596595e-05,
      "loss": 0.2058,
      "step": 170
    },
    {
      "epoch": 7.592592592592593,
      "grad_norm": 0.08744902163743973,
      "learning_rate": 3.894835120461584e-05,
      "loss": 0.1731,
      "step": 175
    },
    {
      "epoch": 7.804232804232804,
      "grad_norm": 0.11190956830978394,
      "learning_rate": 3.815131485292678e-05,
      "loss": 0.1955,
      "step": 180
    },
    {
      "epoch": 8.042328042328043,
      "grad_norm": 0.3187606632709503,
      "learning_rate": 3.733534826011008e-05,
      "loss": 0.1913,
      "step": 185
    },
    {
      "epoch": 8.253968253968253,
      "grad_norm": 0.12188898772001266,
      "learning_rate": 3.6501625943278805e-05,
      "loss": 0.1777,
      "step": 190
    },
    {
      "epoch": 8.465608465608465,
      "grad_norm": 0.09240734577178955,
      "learning_rate": 3.5651347977457214e-05,
      "loss": 0.1776,
      "step": 195
    },
    {
      "epoch": 8.677248677248677,
      "grad_norm": 0.12097247689962387,
      "learning_rate": 3.478573826817099e-05,
      "loss": 0.1812,
      "step": 200
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 0.14317023754119873,
      "learning_rate": 3.390604278973543e-05,
      "loss": 0.1636,
      "step": 205
    },
    {
      "epoch": 9.126984126984127,
      "grad_norm": 0.1255073994398117,
      "learning_rate": 3.301352779177743e-05,
      "loss": 0.2084,
      "step": 210
    },
    {
      "epoch": 9.338624338624339,
      "grad_norm": 0.15191367268562317,
      "learning_rate": 3.21094779765728e-05,
      "loss": 0.159,
      "step": 215
    },
    {
      "epoch": 9.55026455026455,
      "grad_norm": 0.14779330790042877,
      "learning_rate": 3.11951946498225e-05,
      "loss": 0.1521,
      "step": 220
    },
    {
      "epoch": 9.761904761904763,
      "grad_norm": 0.14298541843891144,
      "learning_rate": 3.027199384752962e-05,
      "loss": 0.1485,
      "step": 225
    },
    {
      "epoch": 9.973544973544973,
      "grad_norm": 0.14848679304122925,
      "learning_rate": 2.9341204441673266e-05,
      "loss": 0.1488,
      "step": 230
    },
    {
      "epoch": 10.211640211640212,
      "grad_norm": 0.15250274538993835,
      "learning_rate": 2.840416622740617e-05,
      "loss": 0.1604,
      "step": 235
    },
    {
      "epoch": 10.423280423280424,
      "grad_norm": 0.2765931487083435,
      "learning_rate": 2.7462227994529217e-05,
      "loss": 0.1455,
      "step": 240
    },
    {
      "epoch": 10.634920634920634,
      "grad_norm": 0.16372708976268768,
      "learning_rate": 2.6516745586018965e-05,
      "loss": 0.1236,
      "step": 245
    },
    {
      "epoch": 10.846560846560847,
      "grad_norm": 0.3377860486507416,
      "learning_rate": 2.556907994640264e-05,
      "loss": 0.1393,
      "step": 250
    },
    {
      "epoch": 11.084656084656086,
      "grad_norm": 0.1836676001548767,
      "learning_rate": 2.4620595162789936e-05,
      "loss": 0.1473,
      "step": 255
    },
    {
      "epoch": 11.296296296296296,
      "grad_norm": 0.15243783593177795,
      "learning_rate": 2.3672656501381272e-05,
      "loss": 0.1366,
      "step": 260
    },
    {
      "epoch": 11.507936507936508,
      "grad_norm": 0.1481999307870865,
      "learning_rate": 2.2726628442278826e-05,
      "loss": 0.1039,
      "step": 265
    },
    {
      "epoch": 11.71957671957672,
      "grad_norm": 0.17082872986793518,
      "learning_rate": 2.1783872715429228e-05,
      "loss": 0.1196,
      "step": 270
    },
    {
      "epoch": 11.93121693121693,
      "grad_norm": 0.16833926737308502,
      "learning_rate": 2.084574634052465e-05,
      "loss": 0.1245,
      "step": 275
    },
    {
      "epoch": 12.16931216931217,
      "grad_norm": 0.18557614088058472,
      "learning_rate": 1.991359967368416e-05,
      "loss": 0.1174,
      "step": 280
    },
    {
      "epoch": 12.380952380952381,
      "grad_norm": 0.13334429264068604,
      "learning_rate": 1.8988774463726543e-05,
      "loss": 0.0977,
      "step": 285
    },
    {
      "epoch": 12.592592592592592,
      "grad_norm": 0.18030914664268494,
      "learning_rate": 1.8072601920832786e-05,
      "loss": 0.1147,
      "step": 290
    },
    {
      "epoch": 12.804232804232804,
      "grad_norm": 0.19089557230472565,
      "learning_rate": 1.7166400800377948e-05,
      "loss": 0.1132,
      "step": 295
    },
    {
      "epoch": 13.042328042328043,
      "grad_norm": 0.3375261127948761,
      "learning_rate": 1.6271475504690792e-05,
      "loss": 0.1448,
      "step": 300
    },
    {
      "epoch": 13.253968253968253,
      "grad_norm": 0.1948479562997818,
      "learning_rate": 1.5389114205473377e-05,
      "loss": 0.1119,
      "step": 305
    },
    {
      "epoch": 13.465608465608465,
      "grad_norm": 0.1664939522743225,
      "learning_rate": 1.4520586989583406e-05,
      "loss": 0.0932,
      "step": 310
    },
    {
      "epoch": 13.677248677248677,
      "grad_norm": 0.16317923367023468,
      "learning_rate": 1.3667144030848073e-05,
      "loss": 0.1007,
      "step": 315
    },
    {
      "epoch": 13.88888888888889,
      "grad_norm": 0.18476559221744537,
      "learning_rate": 1.2830013790541279e-05,
      "loss": 0.105,
      "step": 320
    }
  ],
  "logging_steps": 5,
  "max_steps": 460,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 23,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.956024584985641e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}