lesso12's picture
Training in progress, step 500, checkpoint
250a48f verified
{
"best_metric": 0.4194607138633728,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.37091988130563797,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000741839762611276,
"eval_loss": 0.6393800377845764,
"eval_runtime": 63.0989,
"eval_samples_per_second": 9.002,
"eval_steps_per_second": 2.25,
"step": 1
},
{
"epoch": 0.00741839762611276,
"grad_norm": 0.4622786045074463,
"learning_rate": 4.24e-05,
"loss": 0.641,
"step": 10
},
{
"epoch": 0.01483679525222552,
"grad_norm": 0.6135218739509583,
"learning_rate": 8.48e-05,
"loss": 0.5886,
"step": 20
},
{
"epoch": 0.02225519287833828,
"grad_norm": 0.36783531308174133,
"learning_rate": 0.0001272,
"loss": 0.5411,
"step": 30
},
{
"epoch": 0.02967359050445104,
"grad_norm": 0.40542206168174744,
"learning_rate": 0.0001696,
"loss": 0.4827,
"step": 40
},
{
"epoch": 0.037091988130563795,
"grad_norm": 0.4334207773208618,
"learning_rate": 0.000212,
"loss": 0.463,
"step": 50
},
{
"epoch": 0.037091988130563795,
"eval_loss": 0.4923464357852936,
"eval_runtime": 63.1469,
"eval_samples_per_second": 8.995,
"eval_steps_per_second": 2.249,
"step": 50
},
{
"epoch": 0.04451038575667656,
"grad_norm": 0.4054912030696869,
"learning_rate": 0.00021174178932754136,
"loss": 0.5025,
"step": 60
},
{
"epoch": 0.05192878338278932,
"grad_norm": 0.39622294902801514,
"learning_rate": 0.00021096841528660647,
"loss": 0.504,
"step": 70
},
{
"epoch": 0.05934718100890208,
"grad_norm": 0.4143044054508209,
"learning_rate": 0.0002096836456777834,
"loss": 0.4841,
"step": 80
},
{
"epoch": 0.06676557863501484,
"grad_norm": 0.39025792479515076,
"learning_rate": 0.00020789373976946182,
"loss": 0.4727,
"step": 90
},
{
"epoch": 0.07418397626112759,
"grad_norm": 0.3781468868255615,
"learning_rate": 0.0002056074178033063,
"loss": 0.4287,
"step": 100
},
{
"epoch": 0.07418397626112759,
"eval_loss": 0.47025901079177856,
"eval_runtime": 63.1627,
"eval_samples_per_second": 8.993,
"eval_steps_per_second": 2.248,
"step": 100
},
{
"epoch": 0.08160237388724036,
"grad_norm": 0.43171194195747375,
"learning_rate": 0.00020283581851011567,
"loss": 0.4821,
"step": 110
},
{
"epoch": 0.08902077151335312,
"grad_norm": 0.3957735002040863,
"learning_rate": 0.00019959244484304625,
"loss": 0.5191,
"step": 120
},
{
"epoch": 0.09643916913946587,
"grad_norm": 0.40845993161201477,
"learning_rate": 0.00019589309819258114,
"loss": 0.4756,
"step": 130
},
{
"epoch": 0.10385756676557864,
"grad_norm": 0.42555275559425354,
"learning_rate": 0.00019175580140374444,
"loss": 0.4571,
"step": 140
},
{
"epoch": 0.11127596439169139,
"grad_norm": 0.39264148473739624,
"learning_rate": 0.00018720071097061167,
"loss": 0.4268,
"step": 150
},
{
"epoch": 0.11127596439169139,
"eval_loss": 0.45653077960014343,
"eval_runtime": 63.1089,
"eval_samples_per_second": 9.0,
"eval_steps_per_second": 2.25,
"step": 150
},
{
"epoch": 0.11869436201780416,
"grad_norm": 0.4264899790287018,
"learning_rate": 0.00018225001883589702,
"loss": 0.4657,
"step": 160
},
{
"epoch": 0.1261127596439169,
"grad_norm": 0.39543238282203674,
"learning_rate": 0.00017692784427403898,
"loss": 0.5229,
"step": 170
},
{
"epoch": 0.13353115727002968,
"grad_norm": 0.38356471061706543,
"learning_rate": 0.00017126011638451976,
"loss": 0.4691,
"step": 180
},
{
"epoch": 0.14094955489614244,
"grad_norm": 0.381939560174942,
"learning_rate": 0.00016527444776789915,
"loss": 0.4569,
"step": 190
},
{
"epoch": 0.14836795252225518,
"grad_norm": 0.37929609417915344,
"learning_rate": 0.00015900000000000002,
"loss": 0.4476,
"step": 200
},
{
"epoch": 0.14836795252225518,
"eval_loss": 0.44979003071784973,
"eval_runtime": 63.1627,
"eval_samples_per_second": 8.993,
"eval_steps_per_second": 2.248,
"step": 200
},
{
"epoch": 0.15578635014836795,
"grad_norm": 0.3976655602455139,
"learning_rate": 0.0001524673415596422,
"loss": 0.4856,
"step": 210
},
{
"epoch": 0.1632047477744807,
"grad_norm": 0.38872918486595154,
"learning_rate": 0.00014570829890208668,
"loss": 0.4764,
"step": 220
},
{
"epoch": 0.17062314540059348,
"grad_norm": 0.43565240502357483,
"learning_rate": 0.00013875580140374443,
"loss": 0.4344,
"step": 230
},
{
"epoch": 0.17804154302670624,
"grad_norm": 0.3594364821910858,
"learning_rate": 0.00013164372093356477,
"loss": 0.4404,
"step": 240
},
{
"epoch": 0.18545994065281898,
"grad_norm": 0.40855279564857483,
"learning_rate": 0.00012440670683269464,
"loss": 0.4161,
"step": 250
},
{
"epoch": 0.18545994065281898,
"eval_loss": 0.4417240619659424,
"eval_runtime": 63.0987,
"eval_samples_per_second": 9.002,
"eval_steps_per_second": 2.25,
"step": 250
},
{
"epoch": 0.19287833827893175,
"grad_norm": 0.4053170084953308,
"learning_rate": 0.00011708001710637128,
"loss": 0.4592,
"step": 260
},
{
"epoch": 0.20029673590504452,
"grad_norm": 0.40735432505607605,
"learning_rate": 0.00010969934665046512,
"loss": 0.4535,
"step": 270
},
{
"epoch": 0.20771513353115728,
"grad_norm": 0.3716582953929901,
"learning_rate": 0.00010230065334953492,
"loss": 0.4357,
"step": 280
},
{
"epoch": 0.21513353115727002,
"grad_norm": 0.3450051546096802,
"learning_rate": 9.491998289362875e-05,
"loss": 0.3879,
"step": 290
},
{
"epoch": 0.22255192878338279,
"grad_norm": 0.34627842903137207,
"learning_rate": 8.759329316730539e-05,
"loss": 0.4372,
"step": 300
},
{
"epoch": 0.22255192878338279,
"eval_loss": 0.43453946709632874,
"eval_runtime": 63.1414,
"eval_samples_per_second": 8.996,
"eval_steps_per_second": 2.249,
"step": 300
},
{
"epoch": 0.22997032640949555,
"grad_norm": 0.4011070132255554,
"learning_rate": 8.035627906643523e-05,
"loss": 0.444,
"step": 310
},
{
"epoch": 0.23738872403560832,
"grad_norm": 0.4104556739330292,
"learning_rate": 7.324419859625559e-05,
"loss": 0.4814,
"step": 320
},
{
"epoch": 0.24480712166172106,
"grad_norm": 0.39093947410583496,
"learning_rate": 6.629170109791332e-05,
"loss": 0.437,
"step": 330
},
{
"epoch": 0.2522255192878338,
"grad_norm": 0.34994885325431824,
"learning_rate": 5.9532658440357784e-05,
"loss": 0.3983,
"step": 340
},
{
"epoch": 0.2596439169139466,
"grad_norm": 0.3946869671344757,
"learning_rate": 5.300000000000002e-05,
"loss": 0.4235,
"step": 350
},
{
"epoch": 0.2596439169139466,
"eval_loss": 0.4264315962791443,
"eval_runtime": 63.1459,
"eval_samples_per_second": 8.995,
"eval_steps_per_second": 2.249,
"step": 350
},
{
"epoch": 0.26706231454005935,
"grad_norm": 0.40910935401916504,
"learning_rate": 4.672555223210085e-05,
"loss": 0.4254,
"step": 360
},
{
"epoch": 0.2744807121661721,
"grad_norm": 0.4382137358188629,
"learning_rate": 4.073988361548022e-05,
"loss": 0.4489,
"step": 370
},
{
"epoch": 0.2818991097922849,
"grad_norm": 0.3514750599861145,
"learning_rate": 3.507215572596106e-05,
"loss": 0.4396,
"step": 380
},
{
"epoch": 0.2893175074183976,
"grad_norm": 0.3766017258167267,
"learning_rate": 2.9749981164102997e-05,
"loss": 0.3942,
"step": 390
},
{
"epoch": 0.29673590504451036,
"grad_norm": 0.3943521976470947,
"learning_rate": 2.479928902938834e-05,
"loss": 0.3899,
"step": 400
},
{
"epoch": 0.29673590504451036,
"eval_loss": 0.42192989587783813,
"eval_runtime": 63.1588,
"eval_samples_per_second": 8.993,
"eval_steps_per_second": 2.248,
"step": 400
},
{
"epoch": 0.30415430267062316,
"grad_norm": 0.39846813678741455,
"learning_rate": 2.024419859625558e-05,
"loss": 0.4341,
"step": 410
},
{
"epoch": 0.3115727002967359,
"grad_norm": 0.351679265499115,
"learning_rate": 1.610690180741885e-05,
"loss": 0.4537,
"step": 420
},
{
"epoch": 0.3189910979228487,
"grad_norm": 0.3849548399448395,
"learning_rate": 1.240755515695374e-05,
"loss": 0.3926,
"step": 430
},
{
"epoch": 0.3264094955489614,
"grad_norm": 0.34369608759880066,
"learning_rate": 9.164181489884296e-06,
"loss": 0.3969,
"step": 440
},
{
"epoch": 0.33382789317507416,
"grad_norm": 0.3698439300060272,
"learning_rate": 6.392582196693718e-06,
"loss": 0.3801,
"step": 450
},
{
"epoch": 0.33382789317507416,
"eval_loss": 0.4198478162288666,
"eval_runtime": 63.1389,
"eval_samples_per_second": 8.996,
"eval_steps_per_second": 2.249,
"step": 450
},
{
"epoch": 0.34124629080118696,
"grad_norm": 0.37923330068588257,
"learning_rate": 4.106260230538197e-06,
"loss": 0.4221,
"step": 460
},
{
"epoch": 0.3486646884272997,
"grad_norm": 0.3553723990917206,
"learning_rate": 2.316354322216597e-06,
"loss": 0.4364,
"step": 470
},
{
"epoch": 0.3560830860534125,
"grad_norm": 0.38167324662208557,
"learning_rate": 1.0315847133935416e-06,
"loss": 0.4317,
"step": 480
},
{
"epoch": 0.36350148367952523,
"grad_norm": 0.36284124851226807,
"learning_rate": 2.582106724586351e-07,
"loss": 0.4302,
"step": 490
},
{
"epoch": 0.37091988130563797,
"grad_norm": 0.3405497074127197,
"learning_rate": 0.0,
"loss": 0.3725,
"step": 500
},
{
"epoch": 0.37091988130563797,
"eval_loss": 0.4194607138633728,
"eval_runtime": 63.1755,
"eval_samples_per_second": 8.991,
"eval_steps_per_second": 2.248,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.85471820890112e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}