{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 50, "global_step": 222, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06756756756756757, "grad_norm": 18.160281746562354, "learning_rate": 5e-07, "loss": 1.725, "step": 5 }, { "epoch": 0.13513513513513514, "grad_norm": 12.466524806615743, "learning_rate": 1e-06, "loss": 1.5906, "step": 10 }, { "epoch": 0.20270270270270271, "grad_norm": 8.158179790073767, "learning_rate": 9.98628141419305e-07, "loss": 1.2918, "step": 15 }, { "epoch": 0.2702702702702703, "grad_norm": 4.085200528685059, "learning_rate": 9.94520093661082e-07, "loss": 1.1519, "step": 20 }, { "epoch": 0.33783783783783783, "grad_norm": 3.850134297695916, "learning_rate": 9.876983993675989e-07, "loss": 1.0558, "step": 25 }, { "epoch": 0.40540540540540543, "grad_norm": 3.6261076115421855, "learning_rate": 9.78200492138261e-07, "loss": 0.998, "step": 30 }, { "epoch": 0.47297297297297297, "grad_norm": 3.5652322410577693, "learning_rate": 9.66078491115194e-07, "loss": 0.9869, "step": 35 }, { "epoch": 0.5405405405405406, "grad_norm": 3.429856478581444, "learning_rate": 9.513989149828717e-07, "loss": 0.9609, "step": 40 }, { "epoch": 0.6081081081081081, "grad_norm": 3.503725889554788, "learning_rate": 9.342423169512071e-07, "loss": 0.9515, "step": 45 }, { "epoch": 0.6756756756756757, "grad_norm": 3.9352337469583434, "learning_rate": 9.147028427251009e-07, "loss": 0.9353, "step": 50 }, { "epoch": 0.6756756756756757, "eval_loss": 0.9305333495140076, "eval_runtime": 36.7139, "eval_samples_per_second": 57.199, "eval_steps_per_second": 0.899, "step": 50 }, { "epoch": 0.7432432432432432, "grad_norm": 3.3810767942983877, "learning_rate": 8.928877138860706e-07, "loss": 0.9362, "step": 55 }, { "epoch": 0.8108108108108109, "grad_norm": 3.480739656688697, "learning_rate": 8.689166395208636e-07, "loss": 0.9287, "step": 60 }, { "epoch": 0.8783783783783784, "grad_norm": 3.5875143505506037, "learning_rate": 8.429211593257052e-07, "loss": 0.94, "step": 65 }, { "epoch": 0.9459459459459459, "grad_norm": 3.3991304127197157, "learning_rate": 8.150439217908556e-07, "loss": 0.8969, "step": 70 }, { "epoch": 1.0135135135135136, "grad_norm": 3.739421526863416, "learning_rate": 7.854379014263876e-07, "loss": 0.8994, "step": 75 }, { "epoch": 1.0810810810810811, "grad_norm": 3.315727217565176, "learning_rate": 7.542655593246103e-07, "loss": 0.8338, "step": 80 }, { "epoch": 1.1486486486486487, "grad_norm": 3.481817716947705, "learning_rate": 7.216979516654943e-07, "loss": 0.8294, "step": 85 }, { "epoch": 1.2162162162162162, "grad_norm": 3.4570894761347306, "learning_rate": 6.87913791057119e-07, "loss": 0.8202, "step": 90 }, { "epoch": 1.2837837837837838, "grad_norm": 3.4227093804265443, "learning_rate": 6.530984658619733e-07, "loss": 0.8417, "step": 95 }, { "epoch": 1.3513513513513513, "grad_norm": 3.3473320785124807, "learning_rate": 6.174430228904919e-07, "loss": 0.8186, "step": 100 }, { "epoch": 1.3513513513513513, "eval_loss": 0.8826867341995239, "eval_runtime": 36.3957, "eval_samples_per_second": 57.699, "eval_steps_per_second": 0.907, "step": 100 }, { "epoch": 1.4189189189189189, "grad_norm": 3.708290081383714, "learning_rate": 5.8114311904423e-07, "loss": 0.8265, "step": 105 }, { "epoch": 1.4864864864864864, "grad_norm": 3.561238644912943, "learning_rate": 5.443979476614674e-07, "loss": 0.8196, "step": 110 }, { "epoch": 1.554054054054054, "grad_norm": 3.752106899664045, "learning_rate": 5.074091454568463e-07, "loss": 0.8243, "step": 115 }, { "epoch": 1.6216216216216215, "grad_norm": 3.524245248482102, "learning_rate": 4.703796860531429e-07, "loss": 0.8, "step": 120 }, { "epoch": 1.689189189189189, "grad_norm": 3.626113759504586, "learning_rate": 4.3351276617684285e-07, "loss": 0.8235, "step": 125 }, { "epoch": 1.7567567567567568, "grad_norm": 3.555163306181175, "learning_rate": 3.970106906294509e-07, "loss": 0.8176, "step": 130 }, { "epoch": 1.8243243243243243, "grad_norm": 3.5936443942910485, "learning_rate": 3.610737621531781e-07, "loss": 0.7933, "step": 135 }, { "epoch": 1.8918918918918919, "grad_norm": 3.670717085034342, "learning_rate": 3.2589918228280066e-07, "loss": 0.8117, "step": 140 }, { "epoch": 1.9594594594594594, "grad_norm": 3.4707422128547942, "learning_rate": 2.916799692151884e-07, "loss": 0.8011, "step": 145 }, { "epoch": 2.027027027027027, "grad_norm": 3.6322557161041504, "learning_rate": 2.5860389863462763e-07, "loss": 0.8014, "step": 150 }, { "epoch": 2.027027027027027, "eval_loss": 0.8658075332641602, "eval_runtime": 36.3437, "eval_samples_per_second": 57.782, "eval_steps_per_second": 0.908, "step": 150 }, { "epoch": 2.0945945945945947, "grad_norm": 3.8365084383169954, "learning_rate": 2.2685247330608414e-07, "loss": 0.7716, "step": 155 }, { "epoch": 2.1621621621621623, "grad_norm": 3.6658531003049237, "learning_rate": 1.9659992709070344e-07, "loss": 0.7524, "step": 160 }, { "epoch": 2.22972972972973, "grad_norm": 3.687250815077673, "learning_rate": 1.6801226884893893e-07, "loss": 0.7735, "step": 165 }, { "epoch": 2.2972972972972974, "grad_norm": 3.5931982468186354, "learning_rate": 1.412463714778343e-07, "loss": 0.7535, "step": 170 }, { "epoch": 2.364864864864865, "grad_norm": 3.726827517069166, "learning_rate": 1.1644911108130434e-07, "loss": 0.7456, "step": 175 }, { "epoch": 2.4324324324324325, "grad_norm": 3.6188523000860955, "learning_rate": 9.375656099715934e-08, "loss": 0.7546, "step": 180 }, { "epoch": 2.5, "grad_norm": 3.5682831997451228, "learning_rate": 7.329324510360269e-08, "loss": 0.745, "step": 185 }, { "epoch": 2.5675675675675675, "grad_norm": 3.6369152466203434, "learning_rate": 5.517145450262639e-08, "loss": 0.7699, "step": 190 }, { "epoch": 2.635135135135135, "grad_norm": 3.604157345726647, "learning_rate": 3.9490631329964554e-08, "loss": 0.7461, "step": 195 }, { "epoch": 2.7027027027027026, "grad_norm": 3.6723873051705356, "learning_rate": 2.63368230729043e-08, "loss": 0.7437, "step": 200 }, { "epoch": 2.7027027027027026, "eval_loss": 0.8665739297866821, "eval_runtime": 36.3505, "eval_samples_per_second": 57.771, "eval_steps_per_second": 0.908, "step": 200 }, { "epoch": 2.77027027027027, "grad_norm": 3.421061261868487, "learning_rate": 1.5782210390350713e-08, "loss": 0.7692, "step": 205 }, { "epoch": 2.8378378378378377, "grad_norm": 3.6288030361158103, "learning_rate": 7.884711026201584e-09, "loss": 0.7482, "step": 210 }, { "epoch": 2.9054054054054053, "grad_norm": 3.4807112881688616, "learning_rate": 2.687661989531964e-09, "loss": 0.7521, "step": 215 }, { "epoch": 2.972972972972973, "grad_norm": 3.773156943868378, "learning_rate": 2.1958174560282594e-10, "loss": 0.7664, "step": 220 }, { "epoch": 3.0, "step": 222, "total_flos": 1308778674782208.0, "train_loss": 0.8880831481100203, "train_runtime": 3084.8066, "train_samples_per_second": 18.38, "train_steps_per_second": 0.072 } ], "logging_steps": 5, "max_steps": 222, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1308778674782208.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }