{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9916434540389973, "eval_steps": 200, "global_step": 807, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03714020427112349, "grad_norm": 0.9345887331215147, "learning_rate": 1.2345679012345678e-05, "loss": 0.7627, "step": 10 }, { "epoch": 0.07428040854224698, "grad_norm": 1.188973319567107, "learning_rate": 2.4691358024691357e-05, "loss": 0.7982, "step": 20 }, { "epoch": 0.11142061281337047, "grad_norm": 0.6516884136437275, "learning_rate": 3.7037037037037037e-05, "loss": 0.5529, "step": 30 }, { "epoch": 0.14856081708449395, "grad_norm": 0.6588490543859995, "learning_rate": 4.938271604938271e-05, "loss": 0.4245, "step": 40 }, { "epoch": 0.18570102135561745, "grad_norm": 0.4226844096352398, "learning_rate": 6.17283950617284e-05, "loss": 0.4319, "step": 50 }, { "epoch": 0.22284122562674094, "grad_norm": 0.7485838342795699, "learning_rate": 7.407407407407407e-05, "loss": 0.3556, "step": 60 }, { "epoch": 0.25998142989786444, "grad_norm": 0.5245389532842025, "learning_rate": 8.641975308641975e-05, "loss": 0.3381, "step": 70 }, { "epoch": 0.2971216341689879, "grad_norm": 0.5491860599786594, "learning_rate": 9.876543209876543e-05, "loss": 0.3287, "step": 80 }, { "epoch": 0.3342618384401114, "grad_norm": 0.46666477130003114, "learning_rate": 9.996208625907141e-05, "loss": 0.2942, "step": 90 }, { "epoch": 0.3714020427112349, "grad_norm": 0.5298545126148352, "learning_rate": 9.983110023102147e-05, "loss": 0.3021, "step": 100 }, { "epoch": 0.4085422469823584, "grad_norm": 0.5132219959623447, "learning_rate": 9.960681902224691e-05, "loss": 0.3133, "step": 110 }, { "epoch": 0.4456824512534819, "grad_norm": 0.7322228366411259, "learning_rate": 9.92896625383049e-05, "loss": 0.3027, "step": 120 }, { "epoch": 0.4828226555246054, "grad_norm": 2.89785379394605, "learning_rate": 9.888022456844251e-05, "loss": 0.2983, "step": 130 }, { "epoch": 0.5199628597957289, "grad_norm": 0.9602775250127539, "learning_rate": 9.837927167388792e-05, "loss": 0.2937, "step": 140 }, { "epoch": 0.5571030640668524, "grad_norm": 0.587882249681093, "learning_rate": 9.778774175267295e-05, "loss": 0.2733, "step": 150 }, { "epoch": 0.5942432683379758, "grad_norm": 0.5952699909669362, "learning_rate": 9.710674228367423e-05, "loss": 0.3127, "step": 160 }, { "epoch": 0.6313834726090993, "grad_norm": 0.8601590917310498, "learning_rate": 9.633754825316015e-05, "loss": 0.2846, "step": 170 }, { "epoch": 0.6685236768802229, "grad_norm": 0.5276550802440668, "learning_rate": 9.548159976772592e-05, "loss": 0.289, "step": 180 }, { "epoch": 0.7056638811513464, "grad_norm": 0.6256007560816419, "learning_rate": 9.454049935808568e-05, "loss": 0.2722, "step": 190 }, { "epoch": 0.7428040854224698, "grad_norm": 0.7083045077849746, "learning_rate": 9.351600897876964e-05, "loss": 0.2724, "step": 200 }, { "epoch": 0.7428040854224698, "eval_loss": 0.32194846868515015, "eval_runtime": 69.2853, "eval_samples_per_second": 5.08, "eval_steps_per_second": 0.635, "step": 200 }, { "epoch": 0.7799442896935933, "grad_norm": 0.5489982408476731, "learning_rate": 9.241004670934348e-05, "loss": 0.2706, "step": 210 }, { "epoch": 0.8170844939647168, "grad_norm": 0.4976435465000861, "learning_rate": 9.122468316332611e-05, "loss": 0.2878, "step": 220 }, { "epoch": 0.8542246982358404, "grad_norm": 0.527946791379517, "learning_rate": 8.99621376115291e-05, "loss": 0.2653, "step": 230 }, { "epoch": 0.8913649025069638, "grad_norm": 0.4489299048179587, "learning_rate": 8.862477382707568e-05, "loss": 0.2653, "step": 240 }, { "epoch": 0.9285051067780873, "grad_norm": 0.35733720135057156, "learning_rate": 8.721509565987859e-05, "loss": 0.2613, "step": 250 }, { "epoch": 0.9656453110492108, "grad_norm": 0.940140704929043, "learning_rate": 8.573574234886217e-05, "loss": 0.2719, "step": 260 }, { "epoch": 1.0, "grad_norm": 0.41579714400957546, "learning_rate": 8.418948358070535e-05, "loss": 0.268, "step": 270 }, { "epoch": 1.0371402042711235, "grad_norm": 0.44513719030149457, "learning_rate": 8.257921430435678e-05, "loss": 0.2292, "step": 280 }, { "epoch": 1.074280408542247, "grad_norm": 0.36739314586440486, "learning_rate": 8.090794931103026e-05, "loss": 0.2161, "step": 290 }, { "epoch": 1.1114206128133706, "grad_norm": 0.49411650365866194, "learning_rate": 7.917881758982837e-05, "loss": 0.2242, "step": 300 }, { "epoch": 1.1485608170844939, "grad_norm": 0.4197677070916283, "learning_rate": 7.739505646956135e-05, "loss": 0.2497, "step": 310 }, { "epoch": 1.1857010213556174, "grad_norm": 0.6822804225967691, "learning_rate": 7.556000555772967e-05, "loss": 0.2278, "step": 320 }, { "epoch": 1.222841225626741, "grad_norm": 0.49319142186511206, "learning_rate": 7.367710048801715e-05, "loss": 0.2403, "step": 330 }, { "epoch": 1.2599814298978644, "grad_norm": 0.5235786948264898, "learning_rate": 7.174986648800161e-05, "loss": 0.231, "step": 340 }, { "epoch": 1.297121634168988, "grad_norm": 0.6055462763909709, "learning_rate": 6.978191177912498e-05, "loss": 0.2403, "step": 350 }, { "epoch": 1.3342618384401115, "grad_norm": 0.5300067921269699, "learning_rate": 6.777692082128024e-05, "loss": 0.218, "step": 360 }, { "epoch": 1.3714020427112348, "grad_norm": 0.42875365947814814, "learning_rate": 6.573864741466235e-05, "loss": 0.2291, "step": 370 }, { "epoch": 1.4085422469823583, "grad_norm": 0.43773461983892203, "learning_rate": 6.367090767179855e-05, "loss": 0.2318, "step": 380 }, { "epoch": 1.4456824512534818, "grad_norm": 0.44564163439616256, "learning_rate": 6.157757287291557e-05, "loss": 0.2265, "step": 390 }, { "epoch": 1.4828226555246053, "grad_norm": 0.5057499808132443, "learning_rate": 5.946256221802051e-05, "loss": 0.2418, "step": 400 }, { "epoch": 1.4828226555246053, "eval_loss": 0.3125605285167694, "eval_runtime": 69.1118, "eval_samples_per_second": 5.093, "eval_steps_per_second": 0.637, "step": 400 }, { "epoch": 1.5199628597957289, "grad_norm": 1.7611100076709196, "learning_rate": 5.732983548926485e-05, "loss": 0.2205, "step": 410 }, { "epoch": 1.5571030640668524, "grad_norm": 0.43196833026875653, "learning_rate": 5.5183385637329446e-05, "loss": 0.2239, "step": 420 }, { "epoch": 1.594243268337976, "grad_norm": 1.0867050785435954, "learning_rate": 5.30272313057105e-05, "loss": 0.2167, "step": 430 }, { "epoch": 1.6313834726090994, "grad_norm": 0.4896739604884348, "learning_rate": 5.086540930690276e-05, "loss": 0.227, "step": 440 }, { "epoch": 1.668523676880223, "grad_norm": 0.471584728228977, "learning_rate": 4.8701967064566095e-05, "loss": 0.2199, "step": 450 }, { "epoch": 1.7056638811513465, "grad_norm": 0.6316398824394676, "learning_rate": 4.6540955035825676e-05, "loss": 0.2155, "step": 460 }, { "epoch": 1.7428040854224698, "grad_norm": 0.5306507117386736, "learning_rate": 4.438641912789277e-05, "loss": 0.2216, "step": 470 }, { "epoch": 1.7799442896935933, "grad_norm": 0.5074632121656725, "learning_rate": 4.2242393123203986e-05, "loss": 0.2157, "step": 480 }, { "epoch": 1.8170844939647168, "grad_norm": 0.6786392166404536, "learning_rate": 4.011289112726085e-05, "loss": 0.2121, "step": 490 }, { "epoch": 1.8542246982358404, "grad_norm": 0.37737812515921004, "learning_rate": 3.8001900053309184e-05, "loss": 0.2053, "step": 500 }, { "epoch": 1.8913649025069637, "grad_norm": 0.48751413968023255, "learning_rate": 3.591337215792852e-05, "loss": 0.239, "step": 510 }, { "epoch": 1.9285051067780872, "grad_norm": 0.6383741405331227, "learning_rate": 3.3851217641506656e-05, "loss": 0.2329, "step": 520 }, { "epoch": 1.9656453110492107, "grad_norm": 0.510851907275428, "learning_rate": 3.1819297327453045e-05, "loss": 0.2129, "step": 530 }, { "epoch": 2.0, "grad_norm": 0.4517371979118407, "learning_rate": 2.9821415433857174e-05, "loss": 0.2212, "step": 540 }, { "epoch": 2.0371402042711235, "grad_norm": 0.6477795533764817, "learning_rate": 2.786131245112495e-05, "loss": 0.1835, "step": 550 }, { "epoch": 2.074280408542247, "grad_norm": 0.679636260658929, "learning_rate": 2.5942658138927867e-05, "loss": 0.187, "step": 560 }, { "epoch": 2.1114206128133706, "grad_norm": 0.4922817129749062, "learning_rate": 2.406904465557614e-05, "loss": 0.1727, "step": 570 }, { "epoch": 2.148560817084494, "grad_norm": 0.568983694160024, "learning_rate": 2.224397983267951e-05, "loss": 0.1718, "step": 580 }, { "epoch": 2.1857010213556176, "grad_norm": 0.47125172130466897, "learning_rate": 2.0470880607686603e-05, "loss": 0.1611, "step": 590 }, { "epoch": 2.222841225626741, "grad_norm": 0.5574999246238416, "learning_rate": 1.8753066626599086e-05, "loss": 0.1724, "step": 600 }, { "epoch": 2.222841225626741, "eval_loss": 0.31889092922210693, "eval_runtime": 69.2279, "eval_samples_per_second": 5.085, "eval_steps_per_second": 0.636, "step": 600 }, { "epoch": 2.2599814298978647, "grad_norm": 0.43455165935100337, "learning_rate": 1.7093754028837345e-05, "loss": 0.1786, "step": 610 }, { "epoch": 2.2971216341689877, "grad_norm": 0.6604753838181086, "learning_rate": 1.549604942589441e-05, "loss": 0.1577, "step": 620 }, { "epoch": 2.3342618384401113, "grad_norm": 0.5420133852475517, "learning_rate": 1.3962944085050832e-05, "loss": 0.1631, "step": 630 }, { "epoch": 2.371402042711235, "grad_norm": 0.4970087479761992, "learning_rate": 1.2497308329040475e-05, "loss": 0.1714, "step": 640 }, { "epoch": 2.4085422469823583, "grad_norm": 0.5146757449079139, "learning_rate": 1.1101886162151764e-05, "loss": 0.1637, "step": 650 }, { "epoch": 2.445682451253482, "grad_norm": 0.5176923026163144, "learning_rate": 9.779290132826224e-06, "loss": 0.1728, "step": 660 }, { "epoch": 2.4828226555246053, "grad_norm": 0.6169350122017051, "learning_rate": 8.531996442372048e-06, "loss": 0.1655, "step": 670 }, { "epoch": 2.519962859795729, "grad_norm": 0.7057265499202516, "learning_rate": 7.3623403089507825e-06, "loss": 0.1538, "step": 680 }, { "epoch": 2.5571030640668524, "grad_norm": 0.5942398255873979, "learning_rate": 6.272511595516401e-06, "loss": 0.1601, "step": 690 }, { "epoch": 2.594243268337976, "grad_norm": 0.5027857142402828, "learning_rate": 5.264550709892685e-06, "loss": 0.1706, "step": 700 }, { "epoch": 2.6313834726090994, "grad_norm": 0.44395576699553374, "learning_rate": 4.340344784664535e-06, "loss": 0.1644, "step": 710 }, { "epoch": 2.668523676880223, "grad_norm": 0.583338230930087, "learning_rate": 3.501624144035559e-06, "loss": 0.1712, "step": 720 }, { "epoch": 2.7056638811513465, "grad_norm": 0.645584661097912, "learning_rate": 2.7499590642665774e-06, "loss": 0.1685, "step": 730 }, { "epoch": 2.7428040854224696, "grad_norm": 0.6016994024894197, "learning_rate": 2.0867568337605616e-06, "loss": 0.1488, "step": 740 }, { "epoch": 2.779944289693593, "grad_norm": 0.4944654437056398, "learning_rate": 1.5132591182978106e-06, "loss": 0.17, "step": 750 }, { "epoch": 2.8170844939647166, "grad_norm": 0.5030466881427367, "learning_rate": 1.0305396363545717e-06, "loss": 0.156, "step": 760 }, { "epoch": 2.85422469823584, "grad_norm": 0.6234438565731387, "learning_rate": 6.395021488572128e-07, "loss": 0.1625, "step": 770 }, { "epoch": 2.8913649025069637, "grad_norm": 0.4826847791868803, "learning_rate": 3.408787671357494e-07, "loss": 0.1714, "step": 780 }, { "epoch": 2.928505106778087, "grad_norm": 0.6381886732044015, "learning_rate": 1.352285822445065e-07, "loss": 0.1688, "step": 790 }, { "epoch": 2.9656453110492107, "grad_norm": 0.5324016878778625, "learning_rate": 2.2936618216201632e-08, "loss": 0.1525, "step": 800 }, { "epoch": 2.9656453110492107, "eval_loss": 0.31994694471359253, "eval_runtime": 69.2114, "eval_samples_per_second": 5.086, "eval_steps_per_second": 0.636, "step": 800 }, { "epoch": 2.9916434540389973, "step": 807, "total_flos": 4.48907289034752e+16, "train_loss": 0.24671251490213614, "train_runtime": 34121.0002, "train_samples_per_second": 1.515, "train_steps_per_second": 0.024 } ], "logging_steps": 10, "max_steps": 807, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.48907289034752e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }