{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9973661106233538,
  "eval_steps": 500,
  "global_step": 284,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03511852502194908,
      "grad_norm": 194.98139599033956,
      "learning_rate": 1.724137931034483e-06,
      "loss": 2.1612,
      "step": 10
    },
    {
      "epoch": 0.07023705004389816,
      "grad_norm": 11.574681452175845,
      "learning_rate": 3.448275862068966e-06,
      "loss": 1.5263,
      "step": 20
    },
    {
      "epoch": 0.10535557506584724,
      "grad_norm": 4.620556133093483,
      "learning_rate": 4.999810275287077e-06,
      "loss": 1.1514,
      "step": 30
    },
    {
      "epoch": 0.14047410008779632,
      "grad_norm": 4.8618300415248425,
      "learning_rate": 4.977078132728901e-06,
      "loss": 1.0009,
      "step": 40
    },
    {
      "epoch": 0.17559262510974538,
      "grad_norm": 5.039664200269418,
      "learning_rate": 4.916796010672969e-06,
      "loss": 0.9159,
      "step": 50
    },
    {
      "epoch": 0.21071115013169447,
      "grad_norm": 4.677695184320373,
      "learning_rate": 4.819877724641437e-06,
      "loss": 0.8819,
      "step": 60
    },
    {
      "epoch": 0.24582967515364354,
      "grad_norm": 4.476208091052348,
      "learning_rate": 4.687792457057482e-06,
      "loss": 0.8205,
      "step": 70
    },
    {
      "epoch": 0.28094820017559263,
      "grad_norm": 4.076277674081994,
      "learning_rate": 4.522542485937369e-06,
      "loss": 0.8138,
      "step": 80
    },
    {
      "epoch": 0.3160667251975417,
      "grad_norm": 3.706466470362564,
      "learning_rate": 4.326632832396733e-06,
      "loss": 0.7546,
      "step": 90
    },
    {
      "epoch": 0.35118525021949076,
      "grad_norm": 3.807926746840706,
      "learning_rate": 4.1030332870839466e-06,
      "loss": 0.7552,
      "step": 100
    },
    {
      "epoch": 0.3863037752414399,
      "grad_norm": 4.448525410750735,
      "learning_rate": 3.855133391181124e-06,
      "loss": 0.742,
      "step": 110
    },
    {
      "epoch": 0.42142230026338895,
      "grad_norm": 3.9020262955384557,
      "learning_rate": 3.586691054414913e-06,
      "loss": 0.7188,
      "step": 120
    },
    {
      "epoch": 0.456540825285338,
      "grad_norm": 4.436756217145944,
      "learning_rate": 3.3017755889756382e-06,
      "loss": 0.7112,
      "step": 130
    },
    {
      "epoch": 0.4916593503072871,
      "grad_norm": 3.3937208218898154,
      "learning_rate": 3.0047060228925256e-06,
      "loss": 0.6893,
      "step": 140
    },
    {
      "epoch": 0.5267778753292361,
      "grad_norm": 5.294699703871525,
      "learning_rate": 2.699985627971354e-06,
      "loss": 0.6668,
      "step": 150
    },
    {
      "epoch": 0.5618964003511853,
      "grad_norm": 4.373298006073623,
      "learning_rate": 2.392233654784262e-06,
      "loss": 0.6659,
      "step": 160
    },
    {
      "epoch": 0.5970149253731343,
      "grad_norm": 4.608979891075067,
      "learning_rate": 2.086115309539675e-06,
      "loss": 0.643,
      "step": 170
    },
    {
      "epoch": 0.6321334503950834,
      "grad_norm": 5.305060414443979,
      "learning_rate": 1.7862710343116451e-06,
      "loss": 0.6655,
      "step": 180
    },
    {
      "epoch": 0.6672519754170325,
      "grad_norm": 3.420875543474142,
      "learning_rate": 1.4972461626682033e-06,
      "loss": 0.6251,
      "step": 190
    },
    {
      "epoch": 0.7023705004389815,
      "grad_norm": 4.603437178273803,
      "learning_rate": 1.2234220170477332e-06,
      "loss": 0.6449,
      "step": 200
    },
    {
      "epoch": 0.7374890254609306,
      "grad_norm": 5.3351263316177215,
      "learning_rate": 9.689494923768756e-07,
      "loss": 0.6284,
      "step": 210
    },
    {
      "epoch": 0.7726075504828798,
      "grad_norm": 3.607022821385861,
      "learning_rate": 7.376861327346325e-07,
      "loss": 0.6072,
      "step": 220
    },
    {
      "epoch": 0.8077260755048288,
      "grad_norm": 4.795667171198547,
      "learning_rate": 5.33137654916292e-07,
      "loss": 0.6187,
      "step": 230
    },
    {
      "epoch": 0.8428446005267779,
      "grad_norm": 3.9213724314825216,
      "learning_rate": 3.5840480534034355e-07,
      "loss": 0.6145,
      "step": 240
    },
    {
      "epoch": 0.8779631255487269,
      "grad_norm": 5.451274818264296,
      "learning_rate": 2.1613635589349756e-07,
      "loss": 0.6012,
      "step": 250
    },
    {
      "epoch": 0.913081650570676,
      "grad_norm": 4.312276909206807,
      "learning_rate": 1.0848895124889819e-07,
      "loss": 0.6184,
      "step": 260
    },
    {
      "epoch": 0.9482001755926251,
      "grad_norm": 4.611522839692978,
      "learning_rate": 3.709441633123367e-08,
      "loss": 0.5952,
      "step": 270
    },
    {
      "epoch": 0.9833187006145742,
      "grad_norm": 6.192176760901921,
      "learning_rate": 3.035019514275317e-09,
      "loss": 0.5973,
      "step": 280
    },
    {
      "epoch": 0.9973661106233538,
      "step": 284,
      "total_flos": 1.1831516853383987e+17,
      "train_loss": 0.7974992327287164,
      "train_runtime": 7644.4468,
      "train_samples_per_second": 4.767,
      "train_steps_per_second": 0.037
    }
  ],
  "logging_steps": 10,
  "max_steps": 284,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1831516853383987e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}