|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9916434540389973, |
|
"eval_steps": 200, |
|
"global_step": 807, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03714020427112349, |
|
"grad_norm": 0.9345887331215147, |
|
"learning_rate": 1.2345679012345678e-05, |
|
"loss": 0.7627, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07428040854224698, |
|
"grad_norm": 1.188973319567107, |
|
"learning_rate": 2.4691358024691357e-05, |
|
"loss": 0.7982, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.11142061281337047, |
|
"grad_norm": 0.6516884136437275, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 0.5529, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.14856081708449395, |
|
"grad_norm": 0.6588490543859995, |
|
"learning_rate": 4.938271604938271e-05, |
|
"loss": 0.4245, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.18570102135561745, |
|
"grad_norm": 0.4226844096352398, |
|
"learning_rate": 6.17283950617284e-05, |
|
"loss": 0.4319, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.22284122562674094, |
|
"grad_norm": 0.7485838342795699, |
|
"learning_rate": 7.407407407407407e-05, |
|
"loss": 0.3556, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.25998142989786444, |
|
"grad_norm": 0.5245389532842025, |
|
"learning_rate": 8.641975308641975e-05, |
|
"loss": 0.3381, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2971216341689879, |
|
"grad_norm": 0.5491860599786594, |
|
"learning_rate": 9.876543209876543e-05, |
|
"loss": 0.3287, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3342618384401114, |
|
"grad_norm": 0.46666477130003114, |
|
"learning_rate": 9.996208625907141e-05, |
|
"loss": 0.2942, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3714020427112349, |
|
"grad_norm": 0.5298545126148352, |
|
"learning_rate": 9.983110023102147e-05, |
|
"loss": 0.3021, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4085422469823584, |
|
"grad_norm": 0.5132219959623447, |
|
"learning_rate": 9.960681902224691e-05, |
|
"loss": 0.3133, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4456824512534819, |
|
"grad_norm": 0.7322228366411259, |
|
"learning_rate": 9.92896625383049e-05, |
|
"loss": 0.3027, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4828226555246054, |
|
"grad_norm": 2.89785379394605, |
|
"learning_rate": 9.888022456844251e-05, |
|
"loss": 0.2983, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5199628597957289, |
|
"grad_norm": 0.9602775250127539, |
|
"learning_rate": 9.837927167388792e-05, |
|
"loss": 0.2937, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5571030640668524, |
|
"grad_norm": 0.587882249681093, |
|
"learning_rate": 9.778774175267295e-05, |
|
"loss": 0.2733, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5942432683379758, |
|
"grad_norm": 0.5952699909669362, |
|
"learning_rate": 9.710674228367423e-05, |
|
"loss": 0.3127, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6313834726090993, |
|
"grad_norm": 0.8601590917310498, |
|
"learning_rate": 9.633754825316015e-05, |
|
"loss": 0.2846, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6685236768802229, |
|
"grad_norm": 0.5276550802440668, |
|
"learning_rate": 9.548159976772592e-05, |
|
"loss": 0.289, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7056638811513464, |
|
"grad_norm": 0.6256007560816419, |
|
"learning_rate": 9.454049935808568e-05, |
|
"loss": 0.2722, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7428040854224698, |
|
"grad_norm": 0.7083045077849746, |
|
"learning_rate": 9.351600897876964e-05, |
|
"loss": 0.2724, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7428040854224698, |
|
"eval_loss": 0.32194846868515015, |
|
"eval_runtime": 69.2853, |
|
"eval_samples_per_second": 5.08, |
|
"eval_steps_per_second": 0.635, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7799442896935933, |
|
"grad_norm": 0.5489982408476731, |
|
"learning_rate": 9.241004670934348e-05, |
|
"loss": 0.2706, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8170844939647168, |
|
"grad_norm": 0.4976435465000861, |
|
"learning_rate": 9.122468316332611e-05, |
|
"loss": 0.2878, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.8542246982358404, |
|
"grad_norm": 0.527946791379517, |
|
"learning_rate": 8.99621376115291e-05, |
|
"loss": 0.2653, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.8913649025069638, |
|
"grad_norm": 0.4489299048179587, |
|
"learning_rate": 8.862477382707568e-05, |
|
"loss": 0.2653, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9285051067780873, |
|
"grad_norm": 0.35733720135057156, |
|
"learning_rate": 8.721509565987859e-05, |
|
"loss": 0.2613, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9656453110492108, |
|
"grad_norm": 0.940140704929043, |
|
"learning_rate": 8.573574234886217e-05, |
|
"loss": 0.2719, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.41579714400957546, |
|
"learning_rate": 8.418948358070535e-05, |
|
"loss": 0.268, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.0371402042711235, |
|
"grad_norm": 0.44513719030149457, |
|
"learning_rate": 8.257921430435678e-05, |
|
"loss": 0.2292, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.074280408542247, |
|
"grad_norm": 0.36739314586440486, |
|
"learning_rate": 8.090794931103026e-05, |
|
"loss": 0.2161, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.1114206128133706, |
|
"grad_norm": 0.49411650365866194, |
|
"learning_rate": 7.917881758982837e-05, |
|
"loss": 0.2242, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.1485608170844939, |
|
"grad_norm": 0.4197677070916283, |
|
"learning_rate": 7.739505646956135e-05, |
|
"loss": 0.2497, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.1857010213556174, |
|
"grad_norm": 0.6822804225967691, |
|
"learning_rate": 7.556000555772967e-05, |
|
"loss": 0.2278, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.222841225626741, |
|
"grad_norm": 0.49319142186511206, |
|
"learning_rate": 7.367710048801715e-05, |
|
"loss": 0.2403, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.2599814298978644, |
|
"grad_norm": 0.5235786948264898, |
|
"learning_rate": 7.174986648800161e-05, |
|
"loss": 0.231, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.297121634168988, |
|
"grad_norm": 0.6055462763909709, |
|
"learning_rate": 6.978191177912498e-05, |
|
"loss": 0.2403, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.3342618384401115, |
|
"grad_norm": 0.5300067921269699, |
|
"learning_rate": 6.777692082128024e-05, |
|
"loss": 0.218, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.3714020427112348, |
|
"grad_norm": 0.42875365947814814, |
|
"learning_rate": 6.573864741466235e-05, |
|
"loss": 0.2291, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.4085422469823583, |
|
"grad_norm": 0.43773461983892203, |
|
"learning_rate": 6.367090767179855e-05, |
|
"loss": 0.2318, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.4456824512534818, |
|
"grad_norm": 0.44564163439616256, |
|
"learning_rate": 6.157757287291557e-05, |
|
"loss": 0.2265, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.4828226555246053, |
|
"grad_norm": 0.5057499808132443, |
|
"learning_rate": 5.946256221802051e-05, |
|
"loss": 0.2418, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.4828226555246053, |
|
"eval_loss": 0.3125605285167694, |
|
"eval_runtime": 69.1118, |
|
"eval_samples_per_second": 5.093, |
|
"eval_steps_per_second": 0.637, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.5199628597957289, |
|
"grad_norm": 1.7611100076709196, |
|
"learning_rate": 5.732983548926485e-05, |
|
"loss": 0.2205, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.5571030640668524, |
|
"grad_norm": 0.43196833026875653, |
|
"learning_rate": 5.5183385637329446e-05, |
|
"loss": 0.2239, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.594243268337976, |
|
"grad_norm": 1.0867050785435954, |
|
"learning_rate": 5.30272313057105e-05, |
|
"loss": 0.2167, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.6313834726090994, |
|
"grad_norm": 0.4896739604884348, |
|
"learning_rate": 5.086540930690276e-05, |
|
"loss": 0.227, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.668523676880223, |
|
"grad_norm": 0.471584728228977, |
|
"learning_rate": 4.8701967064566095e-05, |
|
"loss": 0.2199, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.7056638811513465, |
|
"grad_norm": 0.6316398824394676, |
|
"learning_rate": 4.6540955035825676e-05, |
|
"loss": 0.2155, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.7428040854224698, |
|
"grad_norm": 0.5306507117386736, |
|
"learning_rate": 4.438641912789277e-05, |
|
"loss": 0.2216, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.7799442896935933, |
|
"grad_norm": 0.5074632121656725, |
|
"learning_rate": 4.2242393123203986e-05, |
|
"loss": 0.2157, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.8170844939647168, |
|
"grad_norm": 0.6786392166404536, |
|
"learning_rate": 4.011289112726085e-05, |
|
"loss": 0.2121, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.8542246982358404, |
|
"grad_norm": 0.37737812515921004, |
|
"learning_rate": 3.8001900053309184e-05, |
|
"loss": 0.2053, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.8913649025069637, |
|
"grad_norm": 0.48751413968023255, |
|
"learning_rate": 3.591337215792852e-05, |
|
"loss": 0.239, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.9285051067780872, |
|
"grad_norm": 0.6383741405331227, |
|
"learning_rate": 3.3851217641506656e-05, |
|
"loss": 0.2329, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.9656453110492107, |
|
"grad_norm": 0.510851907275428, |
|
"learning_rate": 3.1819297327453045e-05, |
|
"loss": 0.2129, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.4517371979118407, |
|
"learning_rate": 2.9821415433857174e-05, |
|
"loss": 0.2212, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.0371402042711235, |
|
"grad_norm": 0.6477795533764817, |
|
"learning_rate": 2.786131245112495e-05, |
|
"loss": 0.1835, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.074280408542247, |
|
"grad_norm": 0.679636260658929, |
|
"learning_rate": 2.5942658138927867e-05, |
|
"loss": 0.187, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.1114206128133706, |
|
"grad_norm": 0.4922817129749062, |
|
"learning_rate": 2.406904465557614e-05, |
|
"loss": 0.1727, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.148560817084494, |
|
"grad_norm": 0.568983694160024, |
|
"learning_rate": 2.224397983267951e-05, |
|
"loss": 0.1718, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.1857010213556176, |
|
"grad_norm": 0.47125172130466897, |
|
"learning_rate": 2.0470880607686603e-05, |
|
"loss": 0.1611, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.222841225626741, |
|
"grad_norm": 0.5574999246238416, |
|
"learning_rate": 1.8753066626599086e-05, |
|
"loss": 0.1724, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.222841225626741, |
|
"eval_loss": 0.31889092922210693, |
|
"eval_runtime": 69.2279, |
|
"eval_samples_per_second": 5.085, |
|
"eval_steps_per_second": 0.636, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.2599814298978647, |
|
"grad_norm": 0.43455165935100337, |
|
"learning_rate": 1.7093754028837345e-05, |
|
"loss": 0.1786, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.2971216341689877, |
|
"grad_norm": 0.6604753838181086, |
|
"learning_rate": 1.549604942589441e-05, |
|
"loss": 0.1577, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.3342618384401113, |
|
"grad_norm": 0.5420133852475517, |
|
"learning_rate": 1.3962944085050832e-05, |
|
"loss": 0.1631, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.371402042711235, |
|
"grad_norm": 0.4970087479761992, |
|
"learning_rate": 1.2497308329040475e-05, |
|
"loss": 0.1714, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.4085422469823583, |
|
"grad_norm": 0.5146757449079139, |
|
"learning_rate": 1.1101886162151764e-05, |
|
"loss": 0.1637, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.445682451253482, |
|
"grad_norm": 0.5176923026163144, |
|
"learning_rate": 9.779290132826224e-06, |
|
"loss": 0.1728, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.4828226555246053, |
|
"grad_norm": 0.6169350122017051, |
|
"learning_rate": 8.531996442372048e-06, |
|
"loss": 0.1655, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.519962859795729, |
|
"grad_norm": 0.7057265499202516, |
|
"learning_rate": 7.3623403089507825e-06, |
|
"loss": 0.1538, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.5571030640668524, |
|
"grad_norm": 0.5942398255873979, |
|
"learning_rate": 6.272511595516401e-06, |
|
"loss": 0.1601, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.594243268337976, |
|
"grad_norm": 0.5027857142402828, |
|
"learning_rate": 5.264550709892685e-06, |
|
"loss": 0.1706, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.6313834726090994, |
|
"grad_norm": 0.44395576699553374, |
|
"learning_rate": 4.340344784664535e-06, |
|
"loss": 0.1644, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.668523676880223, |
|
"grad_norm": 0.583338230930087, |
|
"learning_rate": 3.501624144035559e-06, |
|
"loss": 0.1712, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.7056638811513465, |
|
"grad_norm": 0.645584661097912, |
|
"learning_rate": 2.7499590642665774e-06, |
|
"loss": 0.1685, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.7428040854224696, |
|
"grad_norm": 0.6016994024894197, |
|
"learning_rate": 2.0867568337605616e-06, |
|
"loss": 0.1488, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.779944289693593, |
|
"grad_norm": 0.4944654437056398, |
|
"learning_rate": 1.5132591182978106e-06, |
|
"loss": 0.17, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.8170844939647166, |
|
"grad_norm": 0.5030466881427367, |
|
"learning_rate": 1.0305396363545717e-06, |
|
"loss": 0.156, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.85422469823584, |
|
"grad_norm": 0.6234438565731387, |
|
"learning_rate": 6.395021488572128e-07, |
|
"loss": 0.1625, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.8913649025069637, |
|
"grad_norm": 0.4826847791868803, |
|
"learning_rate": 3.408787671357494e-07, |
|
"loss": 0.1714, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.928505106778087, |
|
"grad_norm": 0.6381886732044015, |
|
"learning_rate": 1.352285822445065e-07, |
|
"loss": 0.1688, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.9656453110492107, |
|
"grad_norm": 0.5324016878778625, |
|
"learning_rate": 2.2936618216201632e-08, |
|
"loss": 0.1525, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.9656453110492107, |
|
"eval_loss": 0.31994694471359253, |
|
"eval_runtime": 69.2114, |
|
"eval_samples_per_second": 5.086, |
|
"eval_steps_per_second": 0.636, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.9916434540389973, |
|
"step": 807, |
|
"total_flos": 4.48907289034752e+16, |
|
"train_loss": 0.24671251490213614, |
|
"train_runtime": 34121.0002, |
|
"train_samples_per_second": 1.515, |
|
"train_steps_per_second": 0.024 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 807, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.48907289034752e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|