{ "best_metric": 1.826446294784546, "best_model_checkpoint": "miner_id_24/checkpoint-1350", "epoch": 1.1958688168146403, "eval_steps": 150, "global_step": 1650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007247689798876608, "eval_loss": 3.070380210876465, "eval_runtime": 85.9153, "eval_samples_per_second": 13.525, "eval_steps_per_second": 3.387, "step": 1 }, { "epoch": 0.03623844899438304, "grad_norm": 3.273714542388916, "learning_rate": 0.0002, "loss": 2.1084, "step": 50 }, { "epoch": 0.07247689798876608, "grad_norm": 1.639181137084961, "learning_rate": 0.0001998582695676762, "loss": 1.989, "step": 100 }, { "epoch": 0.10871534698314912, "grad_norm": 3.0539445877075195, "learning_rate": 0.00019943348002101371, "loss": 1.9929, "step": 150 }, { "epoch": 0.10871534698314912, "eval_loss": 2.0453155040740967, "eval_runtime": 86.7107, "eval_samples_per_second": 13.401, "eval_steps_per_second": 3.356, "step": 150 }, { "epoch": 0.14495379597753216, "grad_norm": 1.7175414562225342, "learning_rate": 0.00019872683547213446, "loss": 1.9457, "step": 200 }, { "epoch": 0.1811922449719152, "grad_norm": 3.7000324726104736, "learning_rate": 0.00019774033898178667, "loss": 1.9367, "step": 250 }, { "epoch": 0.21743069396629824, "grad_norm": 2.244403123855591, "learning_rate": 0.0001964767868814516, "loss": 1.9372, "step": 300 }, { "epoch": 0.21743069396629824, "eval_loss": 1.9559648036956787, "eval_runtime": 86.7219, "eval_samples_per_second": 13.399, "eval_steps_per_second": 3.356, "step": 300 }, { "epoch": 0.25366914296068127, "grad_norm": 2.310765027999878, "learning_rate": 0.00019493976084683813, "loss": 1.9331, "step": 350 }, { "epoch": 0.2899075919550643, "grad_norm": 2.4135353565216064, "learning_rate": 0.00019313361774523385, "loss": 1.9337, "step": 400 }, { "epoch": 0.3261460409494474, "grad_norm": 3.474848747253418, "learning_rate": 0.00019106347728549135, "loss": 1.9164, "step": 450 }, { "epoch": 0.3261460409494474, "eval_loss": 1.952536702156067, "eval_runtime": 86.9956, "eval_samples_per_second": 13.357, "eval_steps_per_second": 3.345, "step": 450 }, { "epoch": 0.3623844899438304, "grad_norm": 2.140085458755493, "learning_rate": 0.00018873520750565718, "loss": 1.877, "step": 500 }, { "epoch": 0.39862293893821343, "grad_norm": 2.0083487033843994, "learning_rate": 0.0001861554081393806, "loss": 1.8591, "step": 550 }, { "epoch": 0.4348613879325965, "grad_norm": 3.061316967010498, "learning_rate": 0.0001833313919082515, "loss": 1.9117, "step": 600 }, { "epoch": 0.4348613879325965, "eval_loss": 1.942734956741333, "eval_runtime": 86.6654, "eval_samples_per_second": 13.408, "eval_steps_per_second": 3.358, "step": 600 }, { "epoch": 0.47109983692697954, "grad_norm": 2.36161470413208, "learning_rate": 0.00018027116379309638, "loss": 1.9137, "step": 650 }, { "epoch": 0.5073382859213625, "grad_norm": 2.791883707046509, "learning_rate": 0.00017698339834299061, "loss": 1.9284, "step": 700 }, { "epoch": 0.5435767349157457, "grad_norm": 2.8679757118225098, "learning_rate": 0.00017347741508630672, "loss": 1.8691, "step": 750 }, { "epoch": 0.5435767349157457, "eval_loss": 1.9142440557479858, "eval_runtime": 86.7231, "eval_samples_per_second": 13.399, "eval_steps_per_second": 3.356, "step": 750 }, { "epoch": 0.5798151839101287, "grad_norm": 2.4429304599761963, "learning_rate": 0.0001697631521134985, "loss": 1.9101, "step": 800 }, { "epoch": 0.6160536329045116, "grad_norm": 2.8223161697387695, "learning_rate": 0.00016585113790650388, "loss": 1.8409, "step": 850 }, { "epoch": 0.6522920818988948, "grad_norm": 1.8749058246612549, "learning_rate": 0.0001617524614946192, "loss": 1.8731, "step": 900 }, { "epoch": 0.6522920818988948, "eval_loss": 1.8797276020050049, "eval_runtime": 86.9372, "eval_samples_per_second": 13.366, "eval_steps_per_second": 3.347, "step": 900 }, { "epoch": 0.6885305308932778, "grad_norm": 3.5651133060455322, "learning_rate": 0.0001574787410214407, "loss": 1.8334, "step": 950 }, { "epoch": 0.7247689798876608, "grad_norm": 2.771768569946289, "learning_rate": 0.00015304209081197425, "loss": 1.8534, "step": 1000 }, { "epoch": 0.7610074288820439, "grad_norm": 2.3072197437286377, "learning_rate": 0.00014845508703326504, "loss": 1.8311, "step": 1050 }, { "epoch": 0.7610074288820439, "eval_loss": 1.865134835243225, "eval_runtime": 86.9546, "eval_samples_per_second": 13.363, "eval_steps_per_second": 3.347, "step": 1050 }, { "epoch": 0.7972458778764269, "grad_norm": 2.7600791454315186, "learning_rate": 0.00014373073204588556, "loss": 1.8482, "step": 1100 }, { "epoch": 0.83348432687081, "grad_norm": 3.6101410388946533, "learning_rate": 0.00013888241754733208, "loss": 1.8151, "step": 1150 }, { "epoch": 0.869722775865193, "grad_norm": 2.4437174797058105, "learning_rate": 0.00013392388661180303, "loss": 1.836, "step": 1200 }, { "epoch": 0.869722775865193, "eval_loss": 1.8474547863006592, "eval_runtime": 86.4314, "eval_samples_per_second": 13.444, "eval_steps_per_second": 3.367, "step": 1200 }, { "epoch": 0.905961224859576, "grad_norm": 2.752454996109009, "learning_rate": 0.0001288691947339621, "loss": 1.8837, "step": 1250 }, { "epoch": 0.9421996738539591, "grad_norm": 2.2887587547302246, "learning_rate": 0.0001237326699871115, "loss": 1.8392, "step": 1300 }, { "epoch": 0.9784381228483421, "grad_norm": 4.001684665679932, "learning_rate": 0.00011852887240871145, "loss": 1.8195, "step": 1350 }, { "epoch": 0.9784381228483421, "eval_loss": 1.826446294784546, "eval_runtime": 86.7246, "eval_samples_per_second": 13.399, "eval_steps_per_second": 3.355, "step": 1350 }, { "epoch": 1.014676571842725, "grad_norm": 0.9796865582466125, "learning_rate": 0.00011327255272837221, "loss": 1.7798, "step": 1400 }, { "epoch": 1.050915020837108, "grad_norm": 0.8189029693603516, "learning_rate": 0.00010797861055530831, "loss": 1.4192, "step": 1450 }, { "epoch": 1.0871534698314913, "grad_norm": 0.9342411756515503, "learning_rate": 0.00010266205214377748, "loss": 1.4029, "step": 1500 }, { "epoch": 1.0871534698314913, "eval_loss": 1.8509882688522339, "eval_runtime": 86.7099, "eval_samples_per_second": 13.401, "eval_steps_per_second": 3.356, "step": 1500 }, { "epoch": 1.1233919188258743, "grad_norm": 0.8776970505714417, "learning_rate": 9.733794785622253e-05, "loss": 1.3963, "step": 1550 }, { "epoch": 1.1596303678202573, "grad_norm": 0.9756889939308167, "learning_rate": 9.202138944469168e-05, "loss": 1.393, "step": 1600 }, { "epoch": 1.1958688168146403, "grad_norm": 0.8379771113395691, "learning_rate": 8.672744727162781e-05, "loss": 1.4002, "step": 1650 }, { "epoch": 1.1958688168146403, "eval_loss": 1.837064504623413, "eval_runtime": 86.6691, "eval_samples_per_second": 13.407, "eval_steps_per_second": 3.358, "step": 1650 } ], "logging_steps": 50, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1904824490055434e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }