{ "best_metric": 0.484894722700119, "best_model_checkpoint": "./kd_results/microsoft/beit-base-finetuned-ade-640-640_alpha0.5_temp5.0_t2/checkpoint-3600", "epoch": 20.0, "eval_steps": 500, "global_step": 3600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1388888888888889, "grad_norm": 7.286985397338867, "learning_rate": 3.4722222222222224e-06, "loss": 1.2381, "step": 25 }, { "epoch": 0.2777777777777778, "grad_norm": 8.051214218139648, "learning_rate": 6.944444444444445e-06, "loss": 1.2328, "step": 50 }, { "epoch": 0.4166666666666667, "grad_norm": 6.174274921417236, "learning_rate": 1.0416666666666668e-05, "loss": 1.2463, "step": 75 }, { "epoch": 0.5555555555555556, "grad_norm": 6.397758483886719, "learning_rate": 1.388888888888889e-05, "loss": 1.2479, "step": 100 }, { "epoch": 0.6944444444444444, "grad_norm": 5.561864852905273, "learning_rate": 1.736111111111111e-05, "loss": 1.2072, "step": 125 }, { "epoch": 0.8333333333333334, "grad_norm": 5.702479839324951, "learning_rate": 2.0833333333333336e-05, "loss": 1.1851, "step": 150 }, { "epoch": 0.9722222222222222, "grad_norm": 7.537128925323486, "learning_rate": 2.4305555555555558e-05, "loss": 1.1929, "step": 175 }, { "epoch": 1.0, "eval_accuracy": 0.39723320158102765, "eval_loss": 1.3065259456634521, "eval_runtime": 67.5962, "eval_samples_per_second": 14.971, "eval_steps_per_second": 0.473, "step": 180 }, { "epoch": 1.1111111111111112, "grad_norm": 11.624950408935547, "learning_rate": 2.777777777777778e-05, "loss": 1.0737, "step": 200 }, { "epoch": 1.25, "grad_norm": 6.659048080444336, "learning_rate": 3.125e-05, "loss": 1.0401, "step": 225 }, { "epoch": 1.3888888888888888, "grad_norm": 6.487913608551025, "learning_rate": 3.472222222222222e-05, "loss": 0.9825, "step": 250 }, { "epoch": 1.5277777777777777, "grad_norm": 6.304473400115967, "learning_rate": 3.8194444444444444e-05, "loss": 0.9422, "step": 275 }, { "epoch": 1.6666666666666665, "grad_norm": 5.928019046783447, "learning_rate": 4.166666666666667e-05, "loss": 0.8461, "step": 300 }, { "epoch": 1.8055555555555556, "grad_norm": 7.59010124206543, "learning_rate": 4.5138888888888894e-05, "loss": 0.7379, "step": 325 }, { "epoch": 1.9444444444444444, "grad_norm": 8.608816146850586, "learning_rate": 4.8611111111111115e-05, "loss": 0.7186, "step": 350 }, { "epoch": 2.0, "eval_accuracy": 0.691699604743083, "eval_loss": 0.8221395611763, "eval_runtime": 70.6549, "eval_samples_per_second": 14.323, "eval_steps_per_second": 0.453, "step": 360 }, { "epoch": 2.0833333333333335, "grad_norm": 6.835705757141113, "learning_rate": 4.976851851851852e-05, "loss": 0.6226, "step": 375 }, { "epoch": 2.2222222222222223, "grad_norm": 6.270722389221191, "learning_rate": 4.938271604938271e-05, "loss": 0.5672, "step": 400 }, { "epoch": 2.361111111111111, "grad_norm": 10.751533508300781, "learning_rate": 4.899691358024692e-05, "loss": 0.5852, "step": 425 }, { "epoch": 2.5, "grad_norm": 11.694998741149902, "learning_rate": 4.8611111111111115e-05, "loss": 0.5879, "step": 450 }, { "epoch": 2.638888888888889, "grad_norm": 6.815727233886719, "learning_rate": 4.8225308641975306e-05, "loss": 0.6177, "step": 475 }, { "epoch": 2.7777777777777777, "grad_norm": 5.678001880645752, "learning_rate": 4.783950617283951e-05, "loss": 0.5625, "step": 500 }, { "epoch": 2.9166666666666665, "grad_norm": 6.710384368896484, "learning_rate": 4.745370370370371e-05, "loss": 0.5468, "step": 525 }, { "epoch": 3.0, "eval_accuracy": 0.7223320158102767, "eval_loss": 0.7486008405685425, "eval_runtime": 70.0543, "eval_samples_per_second": 14.446, "eval_steps_per_second": 0.457, "step": 540 }, { "epoch": 3.0555555555555554, "grad_norm": 3.762361764907837, "learning_rate": 4.70679012345679e-05, "loss": 0.4885, "step": 550 }, { "epoch": 3.1944444444444446, "grad_norm": 6.667619228363037, "learning_rate": 4.66820987654321e-05, "loss": 0.4782, "step": 575 }, { "epoch": 3.3333333333333335, "grad_norm": 6.843059539794922, "learning_rate": 4.62962962962963e-05, "loss": 0.4645, "step": 600 }, { "epoch": 3.4722222222222223, "grad_norm": 8.307331085205078, "learning_rate": 4.591049382716049e-05, "loss": 0.4655, "step": 625 }, { "epoch": 3.611111111111111, "grad_norm": 9.52717399597168, "learning_rate": 4.5524691358024696e-05, "loss": 0.4054, "step": 650 }, { "epoch": 3.75, "grad_norm": 5.222884654998779, "learning_rate": 4.5138888888888894e-05, "loss": 0.4206, "step": 675 }, { "epoch": 3.888888888888889, "grad_norm": 14.481228828430176, "learning_rate": 4.4753086419753084e-05, "loss": 0.4245, "step": 700 }, { "epoch": 4.0, "eval_accuracy": 0.7687747035573123, "eval_loss": 0.6551746726036072, "eval_runtime": 73.706, "eval_samples_per_second": 13.73, "eval_steps_per_second": 0.434, "step": 720 }, { "epoch": 4.027777777777778, "grad_norm": 4.888184547424316, "learning_rate": 4.436728395061729e-05, "loss": 0.4409, "step": 725 }, { "epoch": 4.166666666666667, "grad_norm": 6.129085540771484, "learning_rate": 4.3981481481481486e-05, "loss": 0.3622, "step": 750 }, { "epoch": 4.305555555555555, "grad_norm": 5.894360542297363, "learning_rate": 4.359567901234568e-05, "loss": 0.3552, "step": 775 }, { "epoch": 4.444444444444445, "grad_norm": 5.902454853057861, "learning_rate": 4.3209876543209875e-05, "loss": 0.3844, "step": 800 }, { "epoch": 4.583333333333333, "grad_norm": 4.399020195007324, "learning_rate": 4.282407407407408e-05, "loss": 0.3632, "step": 825 }, { "epoch": 4.722222222222222, "grad_norm": 4.9550461769104, "learning_rate": 4.243827160493827e-05, "loss": 0.3203, "step": 850 }, { "epoch": 4.861111111111111, "grad_norm": 4.3397064208984375, "learning_rate": 4.205246913580247e-05, "loss": 0.3408, "step": 875 }, { "epoch": 5.0, "grad_norm": 9.68095588684082, "learning_rate": 4.166666666666667e-05, "loss": 0.3503, "step": 900 }, { "epoch": 5.0, "eval_accuracy": 0.7944664031620553, "eval_loss": 0.5797867178916931, "eval_runtime": 72.0053, "eval_samples_per_second": 14.055, "eval_steps_per_second": 0.444, "step": 900 }, { "epoch": 5.138888888888889, "grad_norm": 9.086938858032227, "learning_rate": 4.128086419753087e-05, "loss": 0.2831, "step": 925 }, { "epoch": 5.277777777777778, "grad_norm": 4.564643859863281, "learning_rate": 4.089506172839506e-05, "loss": 0.3182, "step": 950 }, { "epoch": 5.416666666666667, "grad_norm": 3.565946102142334, "learning_rate": 4.0509259259259265e-05, "loss": 0.3034, "step": 975 }, { "epoch": 5.555555555555555, "grad_norm": 6.323774337768555, "learning_rate": 4.012345679012346e-05, "loss": 0.3164, "step": 1000 }, { "epoch": 5.694444444444445, "grad_norm": 6.860601902008057, "learning_rate": 3.973765432098765e-05, "loss": 0.3177, "step": 1025 }, { "epoch": 5.833333333333333, "grad_norm": 2.719980478286743, "learning_rate": 3.935185185185186e-05, "loss": 0.2967, "step": 1050 }, { "epoch": 5.972222222222222, "grad_norm": 3.2052273750305176, "learning_rate": 3.8966049382716055e-05, "loss": 0.2533, "step": 1075 }, { "epoch": 6.0, "eval_accuracy": 0.7875494071146245, "eval_loss": 0.6043540835380554, "eval_runtime": 71.2915, "eval_samples_per_second": 14.195, "eval_steps_per_second": 0.449, "step": 1080 }, { "epoch": 6.111111111111111, "grad_norm": 2.046257734298706, "learning_rate": 3.8580246913580246e-05, "loss": 0.2785, "step": 1100 }, { "epoch": 6.25, "grad_norm": 3.3659110069274902, "learning_rate": 3.8194444444444444e-05, "loss": 0.2568, "step": 1125 }, { "epoch": 6.388888888888889, "grad_norm": 5.183215141296387, "learning_rate": 3.780864197530865e-05, "loss": 0.2524, "step": 1150 }, { "epoch": 6.527777777777778, "grad_norm": 5.04970121383667, "learning_rate": 3.742283950617284e-05, "loss": 0.2375, "step": 1175 }, { "epoch": 6.666666666666667, "grad_norm": 4.041324615478516, "learning_rate": 3.7037037037037037e-05, "loss": 0.268, "step": 1200 }, { "epoch": 6.805555555555555, "grad_norm": 6.357531547546387, "learning_rate": 3.665123456790124e-05, "loss": 0.2462, "step": 1225 }, { "epoch": 6.944444444444445, "grad_norm": 8.738434791564941, "learning_rate": 3.626543209876543e-05, "loss": 0.2419, "step": 1250 }, { "epoch": 7.0, "eval_accuracy": 0.8013833992094862, "eval_loss": 0.5753681659698486, "eval_runtime": 72.8052, "eval_samples_per_second": 13.9, "eval_steps_per_second": 0.44, "step": 1260 }, { "epoch": 7.083333333333333, "grad_norm": 3.8972971439361572, "learning_rate": 3.587962962962963e-05, "loss": 0.2316, "step": 1275 }, { "epoch": 7.222222222222222, "grad_norm": 2.9535961151123047, "learning_rate": 3.5493827160493834e-05, "loss": 0.2311, "step": 1300 }, { "epoch": 7.361111111111111, "grad_norm": 2.292342185974121, "learning_rate": 3.5108024691358025e-05, "loss": 0.2344, "step": 1325 }, { "epoch": 7.5, "grad_norm": 2.524745464324951, "learning_rate": 3.472222222222222e-05, "loss": 0.2455, "step": 1350 }, { "epoch": 7.638888888888889, "grad_norm": 5.171721935272217, "learning_rate": 3.4336419753086427e-05, "loss": 0.2408, "step": 1375 }, { "epoch": 7.777777777777778, "grad_norm": 5.087492942810059, "learning_rate": 3.395061728395062e-05, "loss": 0.2414, "step": 1400 }, { "epoch": 7.916666666666667, "grad_norm": 2.728886365890503, "learning_rate": 3.3564814814814815e-05, "loss": 0.238, "step": 1425 }, { "epoch": 8.0, "eval_accuracy": 0.7727272727272727, "eval_loss": 0.6907724142074585, "eval_runtime": 70.0358, "eval_samples_per_second": 14.45, "eval_steps_per_second": 0.457, "step": 1440 }, { "epoch": 8.055555555555555, "grad_norm": 5.770048141479492, "learning_rate": 3.317901234567901e-05, "loss": 0.2412, "step": 1450 }, { "epoch": 8.194444444444445, "grad_norm": 8.897077560424805, "learning_rate": 3.279320987654321e-05, "loss": 0.2082, "step": 1475 }, { "epoch": 8.333333333333334, "grad_norm": 6.751387119293213, "learning_rate": 3.240740740740741e-05, "loss": 0.2028, "step": 1500 }, { "epoch": 8.472222222222221, "grad_norm": 4.849079132080078, "learning_rate": 3.2021604938271605e-05, "loss": 0.21, "step": 1525 }, { "epoch": 8.61111111111111, "grad_norm": 3.3051037788391113, "learning_rate": 3.16358024691358e-05, "loss": 0.212, "step": 1550 }, { "epoch": 8.75, "grad_norm": 1.1029444932937622, "learning_rate": 3.125e-05, "loss": 0.2162, "step": 1575 }, { "epoch": 8.88888888888889, "grad_norm": 1.2979087829589844, "learning_rate": 3.08641975308642e-05, "loss": 0.2325, "step": 1600 }, { "epoch": 9.0, "eval_accuracy": 0.8063241106719368, "eval_loss": 0.5578542351722717, "eval_runtime": 71.339, "eval_samples_per_second": 14.186, "eval_steps_per_second": 0.449, "step": 1620 }, { "epoch": 9.027777777777779, "grad_norm": 2.1377780437469482, "learning_rate": 3.04783950617284e-05, "loss": 0.2107, "step": 1625 }, { "epoch": 9.166666666666666, "grad_norm": 2.695441961288452, "learning_rate": 3.0092592592592593e-05, "loss": 0.2016, "step": 1650 }, { "epoch": 9.305555555555555, "grad_norm": 6.805841445922852, "learning_rate": 2.970679012345679e-05, "loss": 0.195, "step": 1675 }, { "epoch": 9.444444444444445, "grad_norm": 3.97465181350708, "learning_rate": 2.9320987654320992e-05, "loss": 0.2171, "step": 1700 }, { "epoch": 9.583333333333334, "grad_norm": 1.7595816850662231, "learning_rate": 2.8935185185185186e-05, "loss": 0.1984, "step": 1725 }, { "epoch": 9.722222222222221, "grad_norm": 1.7702546119689941, "learning_rate": 2.8549382716049384e-05, "loss": 0.1949, "step": 1750 }, { "epoch": 9.86111111111111, "grad_norm": 1.119817852973938, "learning_rate": 2.8163580246913578e-05, "loss": 0.1898, "step": 1775 }, { "epoch": 10.0, "grad_norm": 12.552109718322754, "learning_rate": 2.777777777777778e-05, "loss": 0.2118, "step": 1800 }, { "epoch": 10.0, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.5279130935668945, "eval_runtime": 72.7883, "eval_samples_per_second": 13.903, "eval_steps_per_second": 0.44, "step": 1800 }, { "epoch": 10.13888888888889, "grad_norm": 4.522942066192627, "learning_rate": 2.7391975308641977e-05, "loss": 0.1788, "step": 1825 }, { "epoch": 10.277777777777779, "grad_norm": 3.472414016723633, "learning_rate": 2.700617283950617e-05, "loss": 0.183, "step": 1850 }, { "epoch": 10.416666666666666, "grad_norm": 2.087977647781372, "learning_rate": 2.6620370370370372e-05, "loss": 0.1882, "step": 1875 }, { "epoch": 10.555555555555555, "grad_norm": 1.0252704620361328, "learning_rate": 2.623456790123457e-05, "loss": 0.1805, "step": 1900 }, { "epoch": 10.694444444444445, "grad_norm": 3.4632368087768555, "learning_rate": 2.5848765432098764e-05, "loss": 0.1921, "step": 1925 }, { "epoch": 10.833333333333334, "grad_norm": 2.140453815460205, "learning_rate": 2.5462962962962965e-05, "loss": 0.1821, "step": 1950 }, { "epoch": 10.972222222222221, "grad_norm": 2.493025302886963, "learning_rate": 2.5077160493827162e-05, "loss": 0.1827, "step": 1975 }, { "epoch": 11.0, "eval_accuracy": 0.8181818181818182, "eval_loss": 0.5353850722312927, "eval_runtime": 72.4433, "eval_samples_per_second": 13.97, "eval_steps_per_second": 0.442, "step": 1980 }, { "epoch": 11.11111111111111, "grad_norm": 1.8048105239868164, "learning_rate": 2.4691358024691357e-05, "loss": 0.1759, "step": 2000 }, { "epoch": 11.25, "grad_norm": 1.5512341260910034, "learning_rate": 2.4305555555555558e-05, "loss": 0.1758, "step": 2025 }, { "epoch": 11.38888888888889, "grad_norm": 1.2630689144134521, "learning_rate": 2.3919753086419755e-05, "loss": 0.1684, "step": 2050 }, { "epoch": 11.527777777777779, "grad_norm": 1.7419530153274536, "learning_rate": 2.353395061728395e-05, "loss": 0.1826, "step": 2075 }, { "epoch": 11.666666666666666, "grad_norm": 1.711014986038208, "learning_rate": 2.314814814814815e-05, "loss": 0.1814, "step": 2100 }, { "epoch": 11.805555555555555, "grad_norm": 1.6719098091125488, "learning_rate": 2.2762345679012348e-05, "loss": 0.1876, "step": 2125 }, { "epoch": 11.944444444444445, "grad_norm": 1.7764290571212769, "learning_rate": 2.2376543209876542e-05, "loss": 0.1797, "step": 2150 }, { "epoch": 12.0, "eval_accuracy": 0.8310276679841897, "eval_loss": 0.5395620465278625, "eval_runtime": 72.7565, "eval_samples_per_second": 13.909, "eval_steps_per_second": 0.44, "step": 2160 }, { "epoch": 12.083333333333334, "grad_norm": 1.550905466079712, "learning_rate": 2.1990740740740743e-05, "loss": 0.161, "step": 2175 }, { "epoch": 12.222222222222221, "grad_norm": 4.01070499420166, "learning_rate": 2.1604938271604937e-05, "loss": 0.1608, "step": 2200 }, { "epoch": 12.36111111111111, "grad_norm": 1.9554202556610107, "learning_rate": 2.1219135802469135e-05, "loss": 0.1654, "step": 2225 }, { "epoch": 12.5, "grad_norm": 1.9122201204299927, "learning_rate": 2.0833333333333336e-05, "loss": 0.181, "step": 2250 }, { "epoch": 12.63888888888889, "grad_norm": 1.9604136943817139, "learning_rate": 2.044753086419753e-05, "loss": 0.1748, "step": 2275 }, { "epoch": 12.777777777777779, "grad_norm": 1.233370304107666, "learning_rate": 2.006172839506173e-05, "loss": 0.166, "step": 2300 }, { "epoch": 12.916666666666666, "grad_norm": 1.906353235244751, "learning_rate": 1.967592592592593e-05, "loss": 0.1628, "step": 2325 }, { "epoch": 13.0, "eval_accuracy": 0.8349802371541502, "eval_loss": 0.5208285450935364, "eval_runtime": 73.301, "eval_samples_per_second": 13.806, "eval_steps_per_second": 0.437, "step": 2340 }, { "epoch": 13.055555555555555, "grad_norm": 1.1886399984359741, "learning_rate": 1.9290123456790123e-05, "loss": 0.1708, "step": 2350 }, { "epoch": 13.194444444444445, "grad_norm": 1.163377285003662, "learning_rate": 1.8904320987654324e-05, "loss": 0.1724, "step": 2375 }, { "epoch": 13.333333333333334, "grad_norm": 1.7299343347549438, "learning_rate": 1.8518518518518518e-05, "loss": 0.1693, "step": 2400 }, { "epoch": 13.472222222222221, "grad_norm": 1.1578242778778076, "learning_rate": 1.8132716049382716e-05, "loss": 0.1593, "step": 2425 }, { "epoch": 13.61111111111111, "grad_norm": 2.2730441093444824, "learning_rate": 1.7746913580246917e-05, "loss": 0.1644, "step": 2450 }, { "epoch": 13.75, "grad_norm": 1.5351886749267578, "learning_rate": 1.736111111111111e-05, "loss": 0.1632, "step": 2475 }, { "epoch": 13.88888888888889, "grad_norm": 3.478947162628174, "learning_rate": 1.697530864197531e-05, "loss": 0.1628, "step": 2500 }, { "epoch": 14.0, "eval_accuracy": 0.8181818181818182, "eval_loss": 0.5147923231124878, "eval_runtime": 73.8901, "eval_samples_per_second": 13.696, "eval_steps_per_second": 0.433, "step": 2520 }, { "epoch": 14.027777777777779, "grad_norm": 1.3570986986160278, "learning_rate": 1.6589506172839506e-05, "loss": 0.179, "step": 2525 }, { "epoch": 14.166666666666666, "grad_norm": 1.3316888809204102, "learning_rate": 1.6203703703703704e-05, "loss": 0.1676, "step": 2550 }, { "epoch": 14.305555555555555, "grad_norm": 2.0854692459106445, "learning_rate": 1.58179012345679e-05, "loss": 0.1658, "step": 2575 }, { "epoch": 14.444444444444445, "grad_norm": 4.994213104248047, "learning_rate": 1.54320987654321e-05, "loss": 0.1638, "step": 2600 }, { "epoch": 14.583333333333334, "grad_norm": 2.141885280609131, "learning_rate": 1.5046296296296297e-05, "loss": 0.1592, "step": 2625 }, { "epoch": 14.722222222222221, "grad_norm": 0.925370454788208, "learning_rate": 1.4660493827160496e-05, "loss": 0.1567, "step": 2650 }, { "epoch": 14.86111111111111, "grad_norm": 2.1615099906921387, "learning_rate": 1.4274691358024692e-05, "loss": 0.1512, "step": 2675 }, { "epoch": 15.0, "grad_norm": 4.803379058837891, "learning_rate": 1.388888888888889e-05, "loss": 0.1613, "step": 2700 }, { "epoch": 15.0, "eval_accuracy": 0.8162055335968379, "eval_loss": 0.5172973275184631, "eval_runtime": 71.0103, "eval_samples_per_second": 14.251, "eval_steps_per_second": 0.451, "step": 2700 }, { "epoch": 15.13888888888889, "grad_norm": 1.0440549850463867, "learning_rate": 1.3503086419753085e-05, "loss": 0.1569, "step": 2725 }, { "epoch": 15.277777777777779, "grad_norm": 1.1229736804962158, "learning_rate": 1.3117283950617285e-05, "loss": 0.1532, "step": 2750 }, { "epoch": 15.416666666666666, "grad_norm": 1.383908748626709, "learning_rate": 1.2731481481481482e-05, "loss": 0.153, "step": 2775 }, { "epoch": 15.555555555555555, "grad_norm": 1.8207199573516846, "learning_rate": 1.2345679012345678e-05, "loss": 0.1555, "step": 2800 }, { "epoch": 15.694444444444445, "grad_norm": 1.1413156986236572, "learning_rate": 1.1959876543209878e-05, "loss": 0.1585, "step": 2825 }, { "epoch": 15.833333333333334, "grad_norm": 1.0615586042404175, "learning_rate": 1.1574074074074075e-05, "loss": 0.155, "step": 2850 }, { "epoch": 15.972222222222221, "grad_norm": 0.9079870581626892, "learning_rate": 1.1188271604938271e-05, "loss": 0.151, "step": 2875 }, { "epoch": 16.0, "eval_accuracy": 0.8320158102766798, "eval_loss": 0.4948570728302002, "eval_runtime": 71.0407, "eval_samples_per_second": 14.245, "eval_steps_per_second": 0.45, "step": 2880 }, { "epoch": 16.11111111111111, "grad_norm": 0.9218528866767883, "learning_rate": 1.0802469135802469e-05, "loss": 0.1472, "step": 2900 }, { "epoch": 16.25, "grad_norm": 1.099493145942688, "learning_rate": 1.0416666666666668e-05, "loss": 0.156, "step": 2925 }, { "epoch": 16.38888888888889, "grad_norm": 1.494028091430664, "learning_rate": 1.0030864197530866e-05, "loss": 0.1518, "step": 2950 }, { "epoch": 16.52777777777778, "grad_norm": 1.20855712890625, "learning_rate": 9.645061728395062e-06, "loss": 0.1572, "step": 2975 }, { "epoch": 16.666666666666668, "grad_norm": 0.8262044787406921, "learning_rate": 9.259259259259259e-06, "loss": 0.1491, "step": 3000 }, { "epoch": 16.805555555555557, "grad_norm": 1.0807077884674072, "learning_rate": 8.873456790123458e-06, "loss": 0.1438, "step": 3025 }, { "epoch": 16.944444444444443, "grad_norm": 0.8184900879859924, "learning_rate": 8.487654320987654e-06, "loss": 0.1492, "step": 3050 }, { "epoch": 17.0, "eval_accuracy": 0.8280632411067194, "eval_loss": 0.50197434425354, "eval_runtime": 75.2374, "eval_samples_per_second": 13.451, "eval_steps_per_second": 0.425, "step": 3060 }, { "epoch": 17.083333333333332, "grad_norm": 0.7769345045089722, "learning_rate": 8.101851851851852e-06, "loss": 0.1478, "step": 3075 }, { "epoch": 17.22222222222222, "grad_norm": 0.6774124503135681, "learning_rate": 7.71604938271605e-06, "loss": 0.1494, "step": 3100 }, { "epoch": 17.36111111111111, "grad_norm": 0.8633320927619934, "learning_rate": 7.330246913580248e-06, "loss": 0.1404, "step": 3125 }, { "epoch": 17.5, "grad_norm": 0.7519739866256714, "learning_rate": 6.944444444444445e-06, "loss": 0.1506, "step": 3150 }, { "epoch": 17.63888888888889, "grad_norm": 1.2969201803207397, "learning_rate": 6.558641975308642e-06, "loss": 0.1439, "step": 3175 }, { "epoch": 17.77777777777778, "grad_norm": 0.6045782566070557, "learning_rate": 6.172839506172839e-06, "loss": 0.1509, "step": 3200 }, { "epoch": 17.916666666666668, "grad_norm": 4.1767754554748535, "learning_rate": 5.787037037037038e-06, "loss": 0.1538, "step": 3225 }, { "epoch": 18.0, "eval_accuracy": 0.8280632411067194, "eval_loss": 0.4888656735420227, "eval_runtime": 73.2995, "eval_samples_per_second": 13.806, "eval_steps_per_second": 0.437, "step": 3240 }, { "epoch": 18.055555555555557, "grad_norm": 0.5043900012969971, "learning_rate": 5.401234567901234e-06, "loss": 0.152, "step": 3250 }, { "epoch": 18.194444444444443, "grad_norm": 0.6517844200134277, "learning_rate": 5.015432098765433e-06, "loss": 0.1389, "step": 3275 }, { "epoch": 18.333333333333332, "grad_norm": 1.0455210208892822, "learning_rate": 4.6296296296296296e-06, "loss": 0.1486, "step": 3300 }, { "epoch": 18.47222222222222, "grad_norm": 0.9511592388153076, "learning_rate": 4.243827160493827e-06, "loss": 0.1441, "step": 3325 }, { "epoch": 18.61111111111111, "grad_norm": 0.7536895871162415, "learning_rate": 3.858024691358025e-06, "loss": 0.1463, "step": 3350 }, { "epoch": 18.75, "grad_norm": 0.7672238945960999, "learning_rate": 3.4722222222222224e-06, "loss": 0.1529, "step": 3375 }, { "epoch": 18.88888888888889, "grad_norm": 0.8008944392204285, "learning_rate": 3.0864197530864196e-06, "loss": 0.1434, "step": 3400 }, { "epoch": 19.0, "eval_accuracy": 0.8349802371541502, "eval_loss": 0.4909766912460327, "eval_runtime": 72.6929, "eval_samples_per_second": 13.922, "eval_steps_per_second": 0.44, "step": 3420 }, { "epoch": 19.02777777777778, "grad_norm": 0.5323883295059204, "learning_rate": 2.700617283950617e-06, "loss": 0.152, "step": 3425 }, { "epoch": 19.166666666666668, "grad_norm": 3.3585596084594727, "learning_rate": 2.3148148148148148e-06, "loss": 0.1517, "step": 3450 }, { "epoch": 19.305555555555557, "grad_norm": 0.42451515793800354, "learning_rate": 1.9290123456790124e-06, "loss": 0.1413, "step": 3475 }, { "epoch": 19.444444444444443, "grad_norm": 0.4548187553882599, "learning_rate": 1.5432098765432098e-06, "loss": 0.144, "step": 3500 }, { "epoch": 19.583333333333332, "grad_norm": 0.8863416314125061, "learning_rate": 1.1574074074074074e-06, "loss": 0.1436, "step": 3525 }, { "epoch": 19.72222222222222, "grad_norm": 0.5046945214271545, "learning_rate": 7.716049382716049e-07, "loss": 0.1448, "step": 3550 }, { "epoch": 19.86111111111111, "grad_norm": 0.6154670119285583, "learning_rate": 3.8580246913580245e-07, "loss": 0.1403, "step": 3575 }, { "epoch": 20.0, "grad_norm": 0.7444607019424438, "learning_rate": 0.0, "loss": 0.1391, "step": 3600 }, { "epoch": 20.0, "eval_accuracy": 0.8389328063241107, "eval_loss": 0.484894722700119, "eval_runtime": 73.4232, "eval_samples_per_second": 13.783, "eval_steps_per_second": 0.436, "step": 3600 }, { "epoch": 20.0, "step": 3600, "total_flos": 0.0, "train_loss": 0.31427801145447626, "train_runtime": 18365.9356, "train_samples_per_second": 6.24, "train_steps_per_second": 0.196 } ], "logging_steps": 25, "max_steps": 3600, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }