|
{ |
|
"best_metric": 0.4194607138633728, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.37091988130563797, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000741839762611276, |
|
"eval_loss": 0.6393800377845764, |
|
"eval_runtime": 63.0989, |
|
"eval_samples_per_second": 9.002, |
|
"eval_steps_per_second": 2.25, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00741839762611276, |
|
"grad_norm": 0.4622786045074463, |
|
"learning_rate": 4.24e-05, |
|
"loss": 0.641, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01483679525222552, |
|
"grad_norm": 0.6135218739509583, |
|
"learning_rate": 8.48e-05, |
|
"loss": 0.5886, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02225519287833828, |
|
"grad_norm": 0.36783531308174133, |
|
"learning_rate": 0.0001272, |
|
"loss": 0.5411, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02967359050445104, |
|
"grad_norm": 0.40542206168174744, |
|
"learning_rate": 0.0001696, |
|
"loss": 0.4827, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.037091988130563795, |
|
"grad_norm": 0.4334207773208618, |
|
"learning_rate": 0.000212, |
|
"loss": 0.463, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.037091988130563795, |
|
"eval_loss": 0.4923464357852936, |
|
"eval_runtime": 63.1469, |
|
"eval_samples_per_second": 8.995, |
|
"eval_steps_per_second": 2.249, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04451038575667656, |
|
"grad_norm": 0.4054912030696869, |
|
"learning_rate": 0.00021174178932754136, |
|
"loss": 0.5025, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05192878338278932, |
|
"grad_norm": 0.39622294902801514, |
|
"learning_rate": 0.00021096841528660647, |
|
"loss": 0.504, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05934718100890208, |
|
"grad_norm": 0.4143044054508209, |
|
"learning_rate": 0.0002096836456777834, |
|
"loss": 0.4841, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06676557863501484, |
|
"grad_norm": 0.39025792479515076, |
|
"learning_rate": 0.00020789373976946182, |
|
"loss": 0.4727, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07418397626112759, |
|
"grad_norm": 0.3781468868255615, |
|
"learning_rate": 0.0002056074178033063, |
|
"loss": 0.4287, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07418397626112759, |
|
"eval_loss": 0.47025901079177856, |
|
"eval_runtime": 63.1627, |
|
"eval_samples_per_second": 8.993, |
|
"eval_steps_per_second": 2.248, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08160237388724036, |
|
"grad_norm": 0.43171194195747375, |
|
"learning_rate": 0.00020283581851011567, |
|
"loss": 0.4821, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08902077151335312, |
|
"grad_norm": 0.3957735002040863, |
|
"learning_rate": 0.00019959244484304625, |
|
"loss": 0.5191, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09643916913946587, |
|
"grad_norm": 0.40845993161201477, |
|
"learning_rate": 0.00019589309819258114, |
|
"loss": 0.4756, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.10385756676557864, |
|
"grad_norm": 0.42555275559425354, |
|
"learning_rate": 0.00019175580140374444, |
|
"loss": 0.4571, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.11127596439169139, |
|
"grad_norm": 0.39264148473739624, |
|
"learning_rate": 0.00018720071097061167, |
|
"loss": 0.4268, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11127596439169139, |
|
"eval_loss": 0.45653077960014343, |
|
"eval_runtime": 63.1089, |
|
"eval_samples_per_second": 9.0, |
|
"eval_steps_per_second": 2.25, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11869436201780416, |
|
"grad_norm": 0.4264899790287018, |
|
"learning_rate": 0.00018225001883589702, |
|
"loss": 0.4657, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1261127596439169, |
|
"grad_norm": 0.39543238282203674, |
|
"learning_rate": 0.00017692784427403898, |
|
"loss": 0.5229, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.13353115727002968, |
|
"grad_norm": 0.38356471061706543, |
|
"learning_rate": 0.00017126011638451976, |
|
"loss": 0.4691, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.14094955489614244, |
|
"grad_norm": 0.381939560174942, |
|
"learning_rate": 0.00016527444776789915, |
|
"loss": 0.4569, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.14836795252225518, |
|
"grad_norm": 0.37929609417915344, |
|
"learning_rate": 0.00015900000000000002, |
|
"loss": 0.4476, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14836795252225518, |
|
"eval_loss": 0.44979003071784973, |
|
"eval_runtime": 63.1627, |
|
"eval_samples_per_second": 8.993, |
|
"eval_steps_per_second": 2.248, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15578635014836795, |
|
"grad_norm": 0.3976655602455139, |
|
"learning_rate": 0.0001524673415596422, |
|
"loss": 0.4856, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1632047477744807, |
|
"grad_norm": 0.38872918486595154, |
|
"learning_rate": 0.00014570829890208668, |
|
"loss": 0.4764, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.17062314540059348, |
|
"grad_norm": 0.43565240502357483, |
|
"learning_rate": 0.00013875580140374443, |
|
"loss": 0.4344, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.17804154302670624, |
|
"grad_norm": 0.3594364821910858, |
|
"learning_rate": 0.00013164372093356477, |
|
"loss": 0.4404, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.18545994065281898, |
|
"grad_norm": 0.40855279564857483, |
|
"learning_rate": 0.00012440670683269464, |
|
"loss": 0.4161, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.18545994065281898, |
|
"eval_loss": 0.4417240619659424, |
|
"eval_runtime": 63.0987, |
|
"eval_samples_per_second": 9.002, |
|
"eval_steps_per_second": 2.25, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.19287833827893175, |
|
"grad_norm": 0.4053170084953308, |
|
"learning_rate": 0.00011708001710637128, |
|
"loss": 0.4592, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.20029673590504452, |
|
"grad_norm": 0.40735432505607605, |
|
"learning_rate": 0.00010969934665046512, |
|
"loss": 0.4535, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.20771513353115728, |
|
"grad_norm": 0.3716582953929901, |
|
"learning_rate": 0.00010230065334953492, |
|
"loss": 0.4357, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.21513353115727002, |
|
"grad_norm": 0.3450051546096802, |
|
"learning_rate": 9.491998289362875e-05, |
|
"loss": 0.3879, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.22255192878338279, |
|
"grad_norm": 0.34627842903137207, |
|
"learning_rate": 8.759329316730539e-05, |
|
"loss": 0.4372, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.22255192878338279, |
|
"eval_loss": 0.43453946709632874, |
|
"eval_runtime": 63.1414, |
|
"eval_samples_per_second": 8.996, |
|
"eval_steps_per_second": 2.249, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.22997032640949555, |
|
"grad_norm": 0.4011070132255554, |
|
"learning_rate": 8.035627906643523e-05, |
|
"loss": 0.444, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.23738872403560832, |
|
"grad_norm": 0.4104556739330292, |
|
"learning_rate": 7.324419859625559e-05, |
|
"loss": 0.4814, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.24480712166172106, |
|
"grad_norm": 0.39093947410583496, |
|
"learning_rate": 6.629170109791332e-05, |
|
"loss": 0.437, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2522255192878338, |
|
"grad_norm": 0.34994885325431824, |
|
"learning_rate": 5.9532658440357784e-05, |
|
"loss": 0.3983, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2596439169139466, |
|
"grad_norm": 0.3946869671344757, |
|
"learning_rate": 5.300000000000002e-05, |
|
"loss": 0.4235, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2596439169139466, |
|
"eval_loss": 0.4264315962791443, |
|
"eval_runtime": 63.1459, |
|
"eval_samples_per_second": 8.995, |
|
"eval_steps_per_second": 2.249, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.26706231454005935, |
|
"grad_norm": 0.40910935401916504, |
|
"learning_rate": 4.672555223210085e-05, |
|
"loss": 0.4254, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2744807121661721, |
|
"grad_norm": 0.4382137358188629, |
|
"learning_rate": 4.073988361548022e-05, |
|
"loss": 0.4489, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2818991097922849, |
|
"grad_norm": 0.3514750599861145, |
|
"learning_rate": 3.507215572596106e-05, |
|
"loss": 0.4396, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2893175074183976, |
|
"grad_norm": 0.3766017258167267, |
|
"learning_rate": 2.9749981164102997e-05, |
|
"loss": 0.3942, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.29673590504451036, |
|
"grad_norm": 0.3943521976470947, |
|
"learning_rate": 2.479928902938834e-05, |
|
"loss": 0.3899, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.29673590504451036, |
|
"eval_loss": 0.42192989587783813, |
|
"eval_runtime": 63.1588, |
|
"eval_samples_per_second": 8.993, |
|
"eval_steps_per_second": 2.248, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.30415430267062316, |
|
"grad_norm": 0.39846813678741455, |
|
"learning_rate": 2.024419859625558e-05, |
|
"loss": 0.4341, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3115727002967359, |
|
"grad_norm": 0.351679265499115, |
|
"learning_rate": 1.610690180741885e-05, |
|
"loss": 0.4537, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3189910979228487, |
|
"grad_norm": 0.3849548399448395, |
|
"learning_rate": 1.240755515695374e-05, |
|
"loss": 0.3926, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3264094955489614, |
|
"grad_norm": 0.34369608759880066, |
|
"learning_rate": 9.164181489884296e-06, |
|
"loss": 0.3969, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.33382789317507416, |
|
"grad_norm": 0.3698439300060272, |
|
"learning_rate": 6.392582196693718e-06, |
|
"loss": 0.3801, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.33382789317507416, |
|
"eval_loss": 0.4198478162288666, |
|
"eval_runtime": 63.1389, |
|
"eval_samples_per_second": 8.996, |
|
"eval_steps_per_second": 2.249, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.34124629080118696, |
|
"grad_norm": 0.37923330068588257, |
|
"learning_rate": 4.106260230538197e-06, |
|
"loss": 0.4221, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.3486646884272997, |
|
"grad_norm": 0.3553723990917206, |
|
"learning_rate": 2.316354322216597e-06, |
|
"loss": 0.4364, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3560830860534125, |
|
"grad_norm": 0.38167324662208557, |
|
"learning_rate": 1.0315847133935416e-06, |
|
"loss": 0.4317, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.36350148367952523, |
|
"grad_norm": 0.36284124851226807, |
|
"learning_rate": 2.582106724586351e-07, |
|
"loss": 0.4302, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.37091988130563797, |
|
"grad_norm": 0.3405497074127197, |
|
"learning_rate": 0.0, |
|
"loss": 0.3725, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.37091988130563797, |
|
"eval_loss": 0.4194607138633728, |
|
"eval_runtime": 63.1755, |
|
"eval_samples_per_second": 8.991, |
|
"eval_steps_per_second": 2.248, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.85471820890112e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|