{ "best_metric": 0.3964671194553375, "best_model_checkpoint": "limb_classification_person_crop_seq/t2_4heads_1layers_5e-4lr/checkpoint-2520", "epoch": 15.0, "eval_steps": 500, "global_step": 2700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1388888888888889, "grad_norm": 275926.53125, "learning_rate": 5e-05, "loss": 1.2686, "step": 25 }, { "epoch": 0.2777777777777778, "grad_norm": 328248.0, "learning_rate": 0.0001, "loss": 0.9051, "step": 50 }, { "epoch": 0.4166666666666667, "grad_norm": 368871.40625, "learning_rate": 0.00015, "loss": 0.7478, "step": 75 }, { "epoch": 0.5555555555555556, "grad_norm": 98371.5390625, "learning_rate": 0.0002, "loss": 0.5482, "step": 100 }, { "epoch": 0.6944444444444444, "grad_norm": 116783.9921875, "learning_rate": 0.00025, "loss": 0.5521, "step": 125 }, { "epoch": 0.8333333333333334, "grad_norm": 390853.9375, "learning_rate": 0.0003, "loss": 0.4814, "step": 150 }, { "epoch": 0.9722222222222222, "grad_norm": 156001.8125, "learning_rate": 0.00035, "loss": 0.5406, "step": 175 }, { "epoch": 1.0, "eval_accuracy": 0.8883399209486166, "eval_loss": 0.48044389486312866, "eval_runtime": 37.7276, "eval_samples_per_second": 26.824, "eval_steps_per_second": 0.848, "step": 180 }, { "epoch": 1.1111111111111112, "grad_norm": 191408.703125, "learning_rate": 0.0004, "loss": 0.5133, "step": 200 }, { "epoch": 1.25, "grad_norm": 313042.46875, "learning_rate": 0.00045000000000000004, "loss": 0.4485, "step": 225 }, { "epoch": 1.3888888888888888, "grad_norm": 164934.65625, "learning_rate": 0.0005, "loss": 0.5473, "step": 250 }, { "epoch": 1.5277777777777777, "grad_norm": 266101.15625, "learning_rate": 0.0004948979591836735, "loss": 0.4893, "step": 275 }, { "epoch": 1.6666666666666665, "grad_norm": 216736.703125, "learning_rate": 0.0004897959183673469, "loss": 0.4952, "step": 300 }, { "epoch": 1.8055555555555556, "grad_norm": 130781.34375, "learning_rate": 0.0004846938775510204, "loss": 0.5304, "step": 325 }, { "epoch": 1.9444444444444444, "grad_norm": 83755.078125, "learning_rate": 0.00047959183673469387, "loss": 0.4852, "step": 350 }, { "epoch": 2.0, "eval_accuracy": 0.8794466403162056, "eval_loss": 0.5456948280334473, "eval_runtime": 36.26, "eval_samples_per_second": 27.91, "eval_steps_per_second": 0.883, "step": 360 }, { "epoch": 2.0833333333333335, "grad_norm": 140542.96875, "learning_rate": 0.0004744897959183674, "loss": 0.5491, "step": 375 }, { "epoch": 2.2222222222222223, "grad_norm": 275681.375, "learning_rate": 0.00046938775510204083, "loss": 0.4691, "step": 400 }, { "epoch": 2.361111111111111, "grad_norm": 119873.5625, "learning_rate": 0.00046428571428571433, "loss": 0.4607, "step": 425 }, { "epoch": 2.5, "grad_norm": 147333.703125, "learning_rate": 0.0004591836734693878, "loss": 0.5189, "step": 450 }, { "epoch": 2.638888888888889, "grad_norm": 152648.078125, "learning_rate": 0.00045408163265306124, "loss": 0.4508, "step": 475 }, { "epoch": 2.7777777777777777, "grad_norm": 129466.5078125, "learning_rate": 0.0004489795918367347, "loss": 0.4649, "step": 500 }, { "epoch": 2.9166666666666665, "grad_norm": 39402.28515625, "learning_rate": 0.00044387755102040814, "loss": 0.4664, "step": 525 }, { "epoch": 3.0, "eval_accuracy": 0.9051383399209486, "eval_loss": 0.42033523321151733, "eval_runtime": 36.9286, "eval_samples_per_second": 27.404, "eval_steps_per_second": 0.867, "step": 540 }, { "epoch": 3.0555555555555554, "grad_norm": 167756.265625, "learning_rate": 0.00043877551020408165, "loss": 0.4957, "step": 550 }, { "epoch": 3.1944444444444446, "grad_norm": 178619.546875, "learning_rate": 0.0004336734693877551, "loss": 0.4743, "step": 575 }, { "epoch": 3.3333333333333335, "grad_norm": 109380.5, "learning_rate": 0.00042857142857142855, "loss": 0.5037, "step": 600 }, { "epoch": 3.4722222222222223, "grad_norm": 161733.875, "learning_rate": 0.00042346938775510206, "loss": 0.4998, "step": 625 }, { "epoch": 3.611111111111111, "grad_norm": 139706.953125, "learning_rate": 0.00041836734693877556, "loss": 0.4484, "step": 650 }, { "epoch": 3.75, "grad_norm": 150799.125, "learning_rate": 0.000413265306122449, "loss": 0.5218, "step": 675 }, { "epoch": 3.888888888888889, "grad_norm": 233782.0625, "learning_rate": 0.00040816326530612246, "loss": 0.4929, "step": 700 }, { "epoch": 4.0, "eval_accuracy": 0.900197628458498, "eval_loss": 0.43486830592155457, "eval_runtime": 35.8273, "eval_samples_per_second": 28.247, "eval_steps_per_second": 0.893, "step": 720 }, { "epoch": 4.027777777777778, "grad_norm": 124010.9140625, "learning_rate": 0.0004030612244897959, "loss": 0.4506, "step": 725 }, { "epoch": 4.166666666666667, "grad_norm": 142697.78125, "learning_rate": 0.00039795918367346937, "loss": 0.4781, "step": 750 }, { "epoch": 4.305555555555555, "grad_norm": 167024.359375, "learning_rate": 0.0003928571428571429, "loss": 0.4571, "step": 775 }, { "epoch": 4.444444444444445, "grad_norm": 91725.46875, "learning_rate": 0.0003877551020408163, "loss": 0.45, "step": 800 }, { "epoch": 4.583333333333333, "grad_norm": 99782.9921875, "learning_rate": 0.0003826530612244898, "loss": 0.4521, "step": 825 }, { "epoch": 4.722222222222222, "grad_norm": 178208.5625, "learning_rate": 0.00037755102040816323, "loss": 0.4598, "step": 850 }, { "epoch": 4.861111111111111, "grad_norm": 189875.09375, "learning_rate": 0.0003724489795918368, "loss": 0.4492, "step": 875 }, { "epoch": 5.0, "grad_norm": 31828.625, "learning_rate": 0.00036734693877551024, "loss": 0.4334, "step": 900 }, { "epoch": 5.0, "eval_accuracy": 0.8764822134387352, "eval_loss": 0.4815811514854431, "eval_runtime": 36.4103, "eval_samples_per_second": 27.794, "eval_steps_per_second": 0.879, "step": 900 }, { "epoch": 5.138888888888889, "grad_norm": 131415.0, "learning_rate": 0.0003622448979591837, "loss": 0.4699, "step": 925 }, { "epoch": 5.277777777777778, "grad_norm": 42832.09765625, "learning_rate": 0.00035714285714285714, "loss": 0.4448, "step": 950 }, { "epoch": 5.416666666666667, "grad_norm": 202659.328125, "learning_rate": 0.00035204081632653065, "loss": 0.4595, "step": 975 }, { "epoch": 5.555555555555555, "grad_norm": 95209.203125, "learning_rate": 0.0003469387755102041, "loss": 0.4968, "step": 1000 }, { "epoch": 5.694444444444445, "grad_norm": 121417.4140625, "learning_rate": 0.00034183673469387755, "loss": 0.4662, "step": 1025 }, { "epoch": 5.833333333333333, "grad_norm": 103243.7734375, "learning_rate": 0.000336734693877551, "loss": 0.4726, "step": 1050 }, { "epoch": 5.972222222222222, "grad_norm": 79883.8671875, "learning_rate": 0.00033163265306122445, "loss": 0.4709, "step": 1075 }, { "epoch": 6.0, "eval_accuracy": 0.8932806324110671, "eval_loss": 0.45736971497535706, "eval_runtime": 36.8482, "eval_samples_per_second": 27.464, "eval_steps_per_second": 0.868, "step": 1080 }, { "epoch": 6.111111111111111, "grad_norm": 70588.4609375, "learning_rate": 0.00032653061224489796, "loss": 0.4736, "step": 1100 }, { "epoch": 6.25, "grad_norm": 235716.375, "learning_rate": 0.00032142857142857147, "loss": 0.4877, "step": 1125 }, { "epoch": 6.388888888888889, "grad_norm": 118860.515625, "learning_rate": 0.0003163265306122449, "loss": 0.4801, "step": 1150 }, { "epoch": 6.527777777777778, "grad_norm": 96496.296875, "learning_rate": 0.00031122448979591837, "loss": 0.4387, "step": 1175 }, { "epoch": 6.666666666666667, "grad_norm": 122115.0859375, "learning_rate": 0.0003061224489795919, "loss": 0.3712, "step": 1200 }, { "epoch": 6.805555555555555, "grad_norm": 87567.2265625, "learning_rate": 0.0003010204081632653, "loss": 0.443, "step": 1225 }, { "epoch": 6.944444444444445, "grad_norm": 102648.7265625, "learning_rate": 0.0002959183673469388, "loss": 0.4525, "step": 1250 }, { "epoch": 7.0, "eval_accuracy": 0.8883399209486166, "eval_loss": 0.465226411819458, "eval_runtime": 36.2286, "eval_samples_per_second": 27.934, "eval_steps_per_second": 0.883, "step": 1260 }, { "epoch": 7.083333333333333, "grad_norm": 54489.8828125, "learning_rate": 0.00029081632653061223, "loss": 0.4306, "step": 1275 }, { "epoch": 7.222222222222222, "grad_norm": 81611.7890625, "learning_rate": 0.0002857142857142857, "loss": 0.4381, "step": 1300 }, { "epoch": 7.361111111111111, "grad_norm": 139663.171875, "learning_rate": 0.0002806122448979592, "loss": 0.457, "step": 1325 }, { "epoch": 7.5, "grad_norm": 73111.8828125, "learning_rate": 0.00027551020408163264, "loss": 0.4403, "step": 1350 }, { "epoch": 7.638888888888889, "grad_norm": 56448.75, "learning_rate": 0.00027040816326530614, "loss": 0.4308, "step": 1375 }, { "epoch": 7.777777777777778, "grad_norm": 59582.625, "learning_rate": 0.0002653061224489796, "loss": 0.4565, "step": 1400 }, { "epoch": 7.916666666666667, "grad_norm": 167981.71875, "learning_rate": 0.0002602040816326531, "loss": 0.4601, "step": 1425 }, { "epoch": 8.0, "eval_accuracy": 0.900197628458498, "eval_loss": 0.4387129545211792, "eval_runtime": 36.2307, "eval_samples_per_second": 27.932, "eval_steps_per_second": 0.883, "step": 1440 }, { "epoch": 8.055555555555555, "grad_norm": 146485.015625, "learning_rate": 0.00025510204081632655, "loss": 0.4442, "step": 1450 }, { "epoch": 8.194444444444445, "grad_norm": 77854.203125, "learning_rate": 0.00025, "loss": 0.4185, "step": 1475 }, { "epoch": 8.333333333333334, "grad_norm": 123192.34375, "learning_rate": 0.00024489795918367346, "loss": 0.4692, "step": 1500 }, { "epoch": 8.472222222222221, "grad_norm": 74983.546875, "learning_rate": 0.00023979591836734694, "loss": 0.4189, "step": 1525 }, { "epoch": 8.61111111111111, "grad_norm": 136541.65625, "learning_rate": 0.00023469387755102041, "loss": 0.4135, "step": 1550 }, { "epoch": 8.75, "grad_norm": 90828.046875, "learning_rate": 0.0002295918367346939, "loss": 0.419, "step": 1575 }, { "epoch": 8.88888888888889, "grad_norm": 92607.2109375, "learning_rate": 0.00022448979591836734, "loss": 0.4361, "step": 1600 }, { "epoch": 9.0, "eval_accuracy": 0.9071146245059288, "eval_loss": 0.4137505292892456, "eval_runtime": 36.7818, "eval_samples_per_second": 27.514, "eval_steps_per_second": 0.87, "step": 1620 }, { "epoch": 9.027777777777779, "grad_norm": 206206.59375, "learning_rate": 0.00021938775510204082, "loss": 0.4229, "step": 1625 }, { "epoch": 9.166666666666666, "grad_norm": 61973.49609375, "learning_rate": 0.00021428571428571427, "loss": 0.4488, "step": 1650 }, { "epoch": 9.305555555555555, "grad_norm": 134002.28125, "learning_rate": 0.00020918367346938778, "loss": 0.4099, "step": 1675 }, { "epoch": 9.444444444444445, "grad_norm": 162290.21875, "learning_rate": 0.00020408163265306123, "loss": 0.4352, "step": 1700 }, { "epoch": 9.583333333333334, "grad_norm": 108405.265625, "learning_rate": 0.00019897959183673468, "loss": 0.4241, "step": 1725 }, { "epoch": 9.722222222222221, "grad_norm": 66163.1875, "learning_rate": 0.00019387755102040816, "loss": 0.4053, "step": 1750 }, { "epoch": 9.86111111111111, "grad_norm": 35489.41796875, "learning_rate": 0.00018877551020408161, "loss": 0.4023, "step": 1775 }, { "epoch": 10.0, "grad_norm": 148451.015625, "learning_rate": 0.00018367346938775512, "loss": 0.4297, "step": 1800 }, { "epoch": 10.0, "eval_accuracy": 0.9140316205533597, "eval_loss": 0.4089234173297882, "eval_runtime": 36.486, "eval_samples_per_second": 27.737, "eval_steps_per_second": 0.877, "step": 1800 }, { "epoch": 10.13888888888889, "grad_norm": 132313.015625, "learning_rate": 0.00017857142857142857, "loss": 0.4157, "step": 1825 }, { "epoch": 10.277777777777779, "grad_norm": 79190.890625, "learning_rate": 0.00017346938775510205, "loss": 0.4155, "step": 1850 }, { "epoch": 10.416666666666666, "grad_norm": 59022.73828125, "learning_rate": 0.0001683673469387755, "loss": 0.3802, "step": 1875 }, { "epoch": 10.555555555555555, "grad_norm": 113138.1640625, "learning_rate": 0.00016326530612244898, "loss": 0.4633, "step": 1900 }, { "epoch": 10.694444444444445, "grad_norm": 80191.265625, "learning_rate": 0.00015816326530612246, "loss": 0.4034, "step": 1925 }, { "epoch": 10.833333333333334, "grad_norm": 80256.8984375, "learning_rate": 0.00015306122448979594, "loss": 0.4033, "step": 1950 }, { "epoch": 10.972222222222221, "grad_norm": 92320.3359375, "learning_rate": 0.0001479591836734694, "loss": 0.4, "step": 1975 }, { "epoch": 11.0, "eval_accuracy": 0.9199604743083004, "eval_loss": 0.3999524712562561, "eval_runtime": 36.413, "eval_samples_per_second": 27.792, "eval_steps_per_second": 0.879, "step": 1980 }, { "epoch": 11.11111111111111, "grad_norm": 83646.3828125, "learning_rate": 0.00014285714285714284, "loss": 0.4017, "step": 2000 }, { "epoch": 11.25, "grad_norm": 47865.28515625, "learning_rate": 0.00013775510204081632, "loss": 0.4172, "step": 2025 }, { "epoch": 11.38888888888889, "grad_norm": 100859.5859375, "learning_rate": 0.0001326530612244898, "loss": 0.3601, "step": 2050 }, { "epoch": 11.527777777777779, "grad_norm": 89678.1796875, "learning_rate": 0.00012755102040816328, "loss": 0.459, "step": 2075 }, { "epoch": 11.666666666666666, "grad_norm": 97468.703125, "learning_rate": 0.00012244897959183673, "loss": 0.3659, "step": 2100 }, { "epoch": 11.805555555555555, "grad_norm": 116296.359375, "learning_rate": 0.00011734693877551021, "loss": 0.4134, "step": 2125 }, { "epoch": 11.944444444444445, "grad_norm": 66697.9296875, "learning_rate": 0.00011224489795918367, "loss": 0.4035, "step": 2150 }, { "epoch": 12.0, "eval_accuracy": 0.9071146245059288, "eval_loss": 0.42599722743034363, "eval_runtime": 36.752, "eval_samples_per_second": 27.536, "eval_steps_per_second": 0.871, "step": 2160 }, { "epoch": 12.083333333333334, "grad_norm": 139186.21875, "learning_rate": 0.00010714285714285714, "loss": 0.3609, "step": 2175 }, { "epoch": 12.222222222222221, "grad_norm": 69709.2109375, "learning_rate": 0.00010204081632653062, "loss": 0.4146, "step": 2200 }, { "epoch": 12.36111111111111, "grad_norm": 84500.0859375, "learning_rate": 9.693877551020408e-05, "loss": 0.4013, "step": 2225 }, { "epoch": 12.5, "grad_norm": 45239.5703125, "learning_rate": 9.183673469387756e-05, "loss": 0.3918, "step": 2250 }, { "epoch": 12.63888888888889, "grad_norm": 49387.7421875, "learning_rate": 8.673469387755102e-05, "loss": 0.388, "step": 2275 }, { "epoch": 12.777777777777779, "grad_norm": 98527.546875, "learning_rate": 8.163265306122449e-05, "loss": 0.3941, "step": 2300 }, { "epoch": 12.916666666666666, "grad_norm": 75106.1640625, "learning_rate": 7.653061224489797e-05, "loss": 0.3875, "step": 2325 }, { "epoch": 13.0, "eval_accuracy": 0.9100790513833992, "eval_loss": 0.40881994366645813, "eval_runtime": 36.7096, "eval_samples_per_second": 27.568, "eval_steps_per_second": 0.872, "step": 2340 }, { "epoch": 13.055555555555555, "grad_norm": 94333.46875, "learning_rate": 7.142857142857142e-05, "loss": 0.395, "step": 2350 }, { "epoch": 13.194444444444445, "grad_norm": 150090.71875, "learning_rate": 6.63265306122449e-05, "loss": 0.3972, "step": 2375 }, { "epoch": 13.333333333333334, "grad_norm": 86562.6015625, "learning_rate": 6.122448979591836e-05, "loss": 0.3347, "step": 2400 }, { "epoch": 13.472222222222221, "grad_norm": 205886.484375, "learning_rate": 5.6122448979591836e-05, "loss": 0.4316, "step": 2425 }, { "epoch": 13.61111111111111, "grad_norm": 90394.1640625, "learning_rate": 5.102040816326531e-05, "loss": 0.3521, "step": 2450 }, { "epoch": 13.75, "grad_norm": 118663.3359375, "learning_rate": 4.591836734693878e-05, "loss": 0.4071, "step": 2475 }, { "epoch": 13.88888888888889, "grad_norm": 91543.4765625, "learning_rate": 4.0816326530612245e-05, "loss": 0.4117, "step": 2500 }, { "epoch": 14.0, "eval_accuracy": 0.9179841897233202, "eval_loss": 0.3964671194553375, "eval_runtime": 36.2893, "eval_samples_per_second": 27.887, "eval_steps_per_second": 0.882, "step": 2520 }, { "epoch": 14.027777777777779, "grad_norm": 48705.08203125, "learning_rate": 3.571428571428571e-05, "loss": 0.3965, "step": 2525 }, { "epoch": 14.166666666666666, "grad_norm": 71733.8046875, "learning_rate": 3.061224489795918e-05, "loss": 0.4043, "step": 2550 }, { "epoch": 14.305555555555555, "grad_norm": 113618.6484375, "learning_rate": 2.5510204081632654e-05, "loss": 0.3868, "step": 2575 }, { "epoch": 14.444444444444445, "grad_norm": 90760.4609375, "learning_rate": 2.0408163265306123e-05, "loss": 0.3633, "step": 2600 }, { "epoch": 14.583333333333334, "grad_norm": 58063.44921875, "learning_rate": 1.530612244897959e-05, "loss": 0.3651, "step": 2625 }, { "epoch": 14.722222222222221, "grad_norm": 66486.3984375, "learning_rate": 1.0204081632653061e-05, "loss": 0.3904, "step": 2650 }, { "epoch": 14.86111111111111, "grad_norm": 68429.1484375, "learning_rate": 5.102040816326531e-06, "loss": 0.4017, "step": 2675 }, { "epoch": 15.0, "grad_norm": 489841.0625, "learning_rate": 0.0, "loss": 0.3518, "step": 2700 }, { "epoch": 15.0, "eval_accuracy": 0.91600790513834, "eval_loss": 0.3987027406692505, "eval_runtime": 36.8038, "eval_samples_per_second": 27.497, "eval_steps_per_second": 0.869, "step": 2700 }, { "epoch": 15.0, "step": 2700, "total_flos": 0.0, "train_loss": 0.45526096591243037, "train_runtime": 5324.9693, "train_samples_per_second": 16.141, "train_steps_per_second": 0.507 } ], "logging_steps": 25, "max_steps": 2700, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }