|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 1000, |
|
"global_step": 19173, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02607833933135138, |
|
"grad_norm": 1.7889361381530762, |
|
"learning_rate": 4.8696083033432434e-05, |
|
"loss": 4.3597, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05215667866270276, |
|
"grad_norm": 4.862887859344482, |
|
"learning_rate": 4.7392166066864866e-05, |
|
"loss": 2.8882, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.05215667866270276, |
|
"eval_accuracy": 0.4476888906424723, |
|
"eval_loss": 2.4481201171875, |
|
"eval_runtime": 53.2088, |
|
"eval_samples_per_second": 115.939, |
|
"eval_steps_per_second": 3.627, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.07823501799405413, |
|
"grad_norm": 2.319875478744507, |
|
"learning_rate": 4.608824910029729e-05, |
|
"loss": 2.2851, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.10431335732540552, |
|
"grad_norm": 1.8578275442123413, |
|
"learning_rate": 4.478433213372973e-05, |
|
"loss": 1.9734, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.10431335732540552, |
|
"eval_accuracy": 0.5687458831064476, |
|
"eval_loss": 1.797487497329712, |
|
"eval_runtime": 53.0283, |
|
"eval_samples_per_second": 116.334, |
|
"eval_steps_per_second": 3.64, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1303916966567569, |
|
"grad_norm": 1.4474238157272339, |
|
"learning_rate": 4.348041516716216e-05, |
|
"loss": 1.8203, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.15647003598810827, |
|
"grad_norm": 1.3132545948028564, |
|
"learning_rate": 4.2176498200594586e-05, |
|
"loss": 1.7272, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.15647003598810827, |
|
"eval_accuracy": 0.6016759609227673, |
|
"eval_loss": 1.6134130954742432, |
|
"eval_runtime": 52.5134, |
|
"eval_samples_per_second": 117.475, |
|
"eval_steps_per_second": 3.675, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.18254837531945967, |
|
"grad_norm": 1.4560322761535645, |
|
"learning_rate": 4.087258123402702e-05, |
|
"loss": 1.6575, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.20862671465081103, |
|
"grad_norm": 1.2237803936004639, |
|
"learning_rate": 3.956866426745945e-05, |
|
"loss": 1.6087, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.20862671465081103, |
|
"eval_accuracy": 0.6195010305207493, |
|
"eval_loss": 1.5135347843170166, |
|
"eval_runtime": 52.9452, |
|
"eval_samples_per_second": 116.517, |
|
"eval_steps_per_second": 3.645, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2347050539821624, |
|
"grad_norm": 1.2754594087600708, |
|
"learning_rate": 3.826474730089188e-05, |
|
"loss": 1.568, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.2607833933135138, |
|
"grad_norm": 1.2609021663665771, |
|
"learning_rate": 3.696083033432431e-05, |
|
"loss": 1.5337, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.2607833933135138, |
|
"eval_accuracy": 0.6312996889343764, |
|
"eval_loss": 1.4511637687683105, |
|
"eval_runtime": 52.2902, |
|
"eval_samples_per_second": 117.976, |
|
"eval_steps_per_second": 3.691, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.2868617326448652, |
|
"grad_norm": 1.185180425643921, |
|
"learning_rate": 3.5656913367756745e-05, |
|
"loss": 1.5055, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.31294007197621654, |
|
"grad_norm": 1.199750542640686, |
|
"learning_rate": 3.435299640118918e-05, |
|
"loss": 1.4808, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.31294007197621654, |
|
"eval_accuracy": 0.6399303299203424, |
|
"eval_loss": 1.4058290719985962, |
|
"eval_runtime": 52.2985, |
|
"eval_samples_per_second": 117.958, |
|
"eval_steps_per_second": 3.69, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.33901841130756794, |
|
"grad_norm": 1.1488664150238037, |
|
"learning_rate": 3.30490794346216e-05, |
|
"loss": 1.4643, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.36509675063891933, |
|
"grad_norm": 1.1860216856002808, |
|
"learning_rate": 3.1745162468054033e-05, |
|
"loss": 1.444, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.36509675063891933, |
|
"eval_accuracy": 0.6465880311277955, |
|
"eval_loss": 1.3705039024353027, |
|
"eval_runtime": 52.4523, |
|
"eval_samples_per_second": 117.612, |
|
"eval_steps_per_second": 3.68, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.3911750899702707, |
|
"grad_norm": 1.1179466247558594, |
|
"learning_rate": 3.0441245501486465e-05, |
|
"loss": 1.4237, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.41725342930162207, |
|
"grad_norm": 1.187573790550232, |
|
"learning_rate": 2.9137328534918894e-05, |
|
"loss": 1.4094, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.41725342930162207, |
|
"eval_accuracy": 0.6524089244507151, |
|
"eval_loss": 1.3407615423202515, |
|
"eval_runtime": 52.059, |
|
"eval_samples_per_second": 118.5, |
|
"eval_steps_per_second": 3.707, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.44333176863297347, |
|
"grad_norm": 1.1021448373794556, |
|
"learning_rate": 2.7833411568351332e-05, |
|
"loss": 1.3971, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.4694101079643248, |
|
"grad_norm": 1.1032530069351196, |
|
"learning_rate": 2.652949460178376e-05, |
|
"loss": 1.385, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.4694101079643248, |
|
"eval_accuracy": 0.6566206937313249, |
|
"eval_loss": 1.3190594911575317, |
|
"eval_runtime": 52.2008, |
|
"eval_samples_per_second": 118.178, |
|
"eval_steps_per_second": 3.697, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.4954884472956762, |
|
"grad_norm": 1.1370844841003418, |
|
"learning_rate": 2.5225577635216192e-05, |
|
"loss": 1.3754, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.5215667866270276, |
|
"grad_norm": 1.1347073316574097, |
|
"learning_rate": 2.392166066864862e-05, |
|
"loss": 1.364, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5215667866270276, |
|
"eval_accuracy": 0.6607722496061171, |
|
"eval_loss": 1.298751950263977, |
|
"eval_runtime": 52.8669, |
|
"eval_samples_per_second": 116.689, |
|
"eval_steps_per_second": 3.651, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.547645125958379, |
|
"grad_norm": 1.0973560810089111, |
|
"learning_rate": 2.2617743702081052e-05, |
|
"loss": 1.3556, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.5737234652897304, |
|
"grad_norm": 1.130311131477356, |
|
"learning_rate": 2.1313826735513484e-05, |
|
"loss": 1.3413, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.5737234652897304, |
|
"eval_accuracy": 0.6643421122894452, |
|
"eval_loss": 1.281341552734375, |
|
"eval_runtime": 52.871, |
|
"eval_samples_per_second": 116.68, |
|
"eval_steps_per_second": 3.65, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.5998018046210817, |
|
"grad_norm": 1.146698236465454, |
|
"learning_rate": 2.0009909768945913e-05, |
|
"loss": 1.3378, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.6258801439524331, |
|
"grad_norm": 1.1305065155029297, |
|
"learning_rate": 1.8705992802378344e-05, |
|
"loss": 1.3267, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6258801439524331, |
|
"eval_accuracy": 0.6669354086042105, |
|
"eval_loss": 1.2677369117736816, |
|
"eval_runtime": 52.3527, |
|
"eval_samples_per_second": 117.835, |
|
"eval_steps_per_second": 3.687, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6519584832837845, |
|
"grad_norm": 1.0919705629348755, |
|
"learning_rate": 1.7402075835810776e-05, |
|
"loss": 1.3242, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.6780368226151359, |
|
"grad_norm": 1.0956826210021973, |
|
"learning_rate": 1.6098158869243208e-05, |
|
"loss": 1.3161, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.6780368226151359, |
|
"eval_accuracy": 0.6696912811146832, |
|
"eval_loss": 1.253438949584961, |
|
"eval_runtime": 52.2006, |
|
"eval_samples_per_second": 118.179, |
|
"eval_steps_per_second": 3.697, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.7041151619464873, |
|
"grad_norm": 1.1230990886688232, |
|
"learning_rate": 1.4794241902675638e-05, |
|
"loss": 1.3122, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.7301935012778387, |
|
"grad_norm": 1.101837396621704, |
|
"learning_rate": 1.349032493610807e-05, |
|
"loss": 1.3083, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7301935012778387, |
|
"eval_accuracy": 0.6716822849149414, |
|
"eval_loss": 1.2439320087432861, |
|
"eval_runtime": 52.4583, |
|
"eval_samples_per_second": 117.598, |
|
"eval_steps_per_second": 3.679, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7562718406091901, |
|
"grad_norm": 1.1633756160736084, |
|
"learning_rate": 1.21864079695405e-05, |
|
"loss": 1.2988, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.7823501799405413, |
|
"grad_norm": 1.153860330581665, |
|
"learning_rate": 1.0882491002972931e-05, |
|
"loss": 1.2955, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.7823501799405413, |
|
"eval_accuracy": 0.6731286743052126, |
|
"eval_loss": 1.2366282939910889, |
|
"eval_runtime": 52.0235, |
|
"eval_samples_per_second": 118.581, |
|
"eval_steps_per_second": 3.71, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.8084285192718927, |
|
"grad_norm": 1.1294723749160767, |
|
"learning_rate": 9.578574036405362e-06, |
|
"loss": 1.2883, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.8345068586032441, |
|
"grad_norm": 1.1133977174758911, |
|
"learning_rate": 8.274657069837793e-06, |
|
"loss": 1.285, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.8345068586032441, |
|
"eval_accuracy": 0.6754039487634622, |
|
"eval_loss": 1.2262341976165771, |
|
"eval_runtime": 53.3, |
|
"eval_samples_per_second": 115.741, |
|
"eval_steps_per_second": 3.621, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.8605851979345955, |
|
"grad_norm": 1.113081693649292, |
|
"learning_rate": 6.970740103270223e-06, |
|
"loss": 1.2835, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.8866635372659469, |
|
"grad_norm": 1.0940190553665161, |
|
"learning_rate": 5.666823136702655e-06, |
|
"loss": 1.2796, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.8866635372659469, |
|
"eval_accuracy": 0.6767322881870647, |
|
"eval_loss": 1.2194445133209229, |
|
"eval_runtime": 53.2048, |
|
"eval_samples_per_second": 115.948, |
|
"eval_steps_per_second": 3.627, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.9127418765972983, |
|
"grad_norm": 1.1177361011505127, |
|
"learning_rate": 4.362906170135086e-06, |
|
"loss": 1.2731, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.9388202159286496, |
|
"grad_norm": 1.143923044204712, |
|
"learning_rate": 3.058989203567517e-06, |
|
"loss": 1.271, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.9388202159286496, |
|
"eval_accuracy": 0.6780031079624782, |
|
"eval_loss": 1.2132734060287476, |
|
"eval_runtime": 53.1787, |
|
"eval_samples_per_second": 116.005, |
|
"eval_steps_per_second": 3.629, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.964898555260001, |
|
"grad_norm": 1.1030622720718384, |
|
"learning_rate": 1.7550722369999478e-06, |
|
"loss": 1.2708, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.9909768945913524, |
|
"grad_norm": 1.1247570514678955, |
|
"learning_rate": 4.511552704323789e-07, |
|
"loss": 1.2678, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.9909768945913524, |
|
"eval_accuracy": 0.6787497541946164, |
|
"eval_loss": 1.210123062133789, |
|
"eval_runtime": 52.2652, |
|
"eval_samples_per_second": 118.033, |
|
"eval_steps_per_second": 3.693, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 19173, |
|
"total_flos": 3.20619433033728e+17, |
|
"train_loss": 1.5528178578381098, |
|
"train_runtime": 7442.3449, |
|
"train_samples_per_second": 82.437, |
|
"train_steps_per_second": 2.576 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 19173, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.20619433033728e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|