{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 150.0, "eval_steps": 500, "global_step": 12900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_dummy": 1.0, "eval_loss": 34.110015869140625, "eval_runtime": 21.0886, "eval_samples_per_second": 2.276, "eval_steps_per_second": 0.379, "step": 86 }, { "epoch": 1.1627906976744187, "grad_norm": 79.07225036621094, "learning_rate": 4.9613178294573645e-05, "loss": 43.7572, "step": 100 }, { "epoch": 2.0, "eval_dummy": 1.0, "eval_loss": 30.164270401000977, "eval_runtime": 20.9519, "eval_samples_per_second": 2.291, "eval_steps_per_second": 0.382, "step": 172 }, { "epoch": 2.3255813953488373, "grad_norm": 105.73622131347656, "learning_rate": 4.922635658914729e-05, "loss": 32.0025, "step": 200 }, { "epoch": 3.0, "eval_dummy": 1.0, "eval_loss": 28.181777954101562, "eval_runtime": 21.9128, "eval_samples_per_second": 2.191, "eval_steps_per_second": 0.365, "step": 258 }, { "epoch": 3.488372093023256, "grad_norm": 67.89353942871094, "learning_rate": 4.883953488372093e-05, "loss": 26.8817, "step": 300 }, { "epoch": 4.0, "eval_dummy": 1.0, "eval_loss": 27.178003311157227, "eval_runtime": 22.5719, "eval_samples_per_second": 2.127, "eval_steps_per_second": 0.354, "step": 344 }, { "epoch": 4.651162790697675, "grad_norm": 112.37932586669922, "learning_rate": 4.8452713178294574e-05, "loss": 24.1857, "step": 400 }, { "epoch": 5.0, "eval_dummy": 1.0, "eval_loss": 26.582746505737305, "eval_runtime": 22.006, "eval_samples_per_second": 2.181, "eval_steps_per_second": 0.364, "step": 430 }, { "epoch": 5.813953488372093, "grad_norm": 114.278076171875, "learning_rate": 4.8065891472868224e-05, "loss": 22.9335, "step": 500 }, { "epoch": 6.0, "eval_dummy": 1.0, "eval_loss": 25.731016159057617, "eval_runtime": 22.1359, "eval_samples_per_second": 2.168, "eval_steps_per_second": 0.361, "step": 516 }, { "epoch": 6.976744186046512, "grad_norm": 95.67694091796875, "learning_rate": 4.7679069767441866e-05, "loss": 21.3521, "step": 600 }, { "epoch": 7.0, "eval_dummy": 1.0, "eval_loss": 25.244226455688477, "eval_runtime": 22.0409, "eval_samples_per_second": 2.178, "eval_steps_per_second": 0.363, "step": 602 }, { "epoch": 8.0, "eval_dummy": 1.0, "eval_loss": 25.051050186157227, "eval_runtime": 22.1279, "eval_samples_per_second": 2.169, "eval_steps_per_second": 0.362, "step": 688 }, { "epoch": 8.13953488372093, "grad_norm": 61.61929702758789, "learning_rate": 4.729224806201551e-05, "loss": 20.4144, "step": 700 }, { "epoch": 9.0, "eval_dummy": 1.0, "eval_loss": 25.383771896362305, "eval_runtime": 22.3943, "eval_samples_per_second": 2.143, "eval_steps_per_second": 0.357, "step": 774 }, { "epoch": 9.30232558139535, "grad_norm": 19.949560165405273, "learning_rate": 4.690542635658915e-05, "loss": 18.8722, "step": 800 }, { "epoch": 10.0, "eval_dummy": 1.0, "eval_loss": 25.72605323791504, "eval_runtime": 20.0048, "eval_samples_per_second": 2.399, "eval_steps_per_second": 0.4, "step": 860 }, { "epoch": 10.465116279069768, "grad_norm": 82.46937561035156, "learning_rate": 4.6518604651162795e-05, "loss": 18.576, "step": 900 }, { "epoch": 11.0, "eval_dummy": 1.0, "eval_loss": 25.0047550201416, "eval_runtime": 22.4267, "eval_samples_per_second": 2.14, "eval_steps_per_second": 0.357, "step": 946 }, { "epoch": 11.627906976744185, "grad_norm": 46.158851623535156, "learning_rate": 4.613178294573644e-05, "loss": 18.1119, "step": 1000 }, { "epoch": 12.0, "eval_dummy": 1.0, "eval_loss": 25.362991333007812, "eval_runtime": 21.9321, "eval_samples_per_second": 2.189, "eval_steps_per_second": 0.365, "step": 1032 }, { "epoch": 12.790697674418604, "grad_norm": 52.46383285522461, "learning_rate": 4.574496124031008e-05, "loss": 17.8769, "step": 1100 }, { "epoch": 13.0, "eval_dummy": 1.0, "eval_loss": 25.256628036499023, "eval_runtime": 23.004, "eval_samples_per_second": 2.087, "eval_steps_per_second": 0.348, "step": 1118 }, { "epoch": 13.953488372093023, "grad_norm": 59.59417724609375, "learning_rate": 4.5358139534883724e-05, "loss": 17.0204, "step": 1200 }, { "epoch": 14.0, "eval_dummy": 1.0, "eval_loss": 25.60231590270996, "eval_runtime": 22.5301, "eval_samples_per_second": 2.13, "eval_steps_per_second": 0.355, "step": 1204 }, { "epoch": 15.0, "eval_dummy": 1.0, "eval_loss": 26.32845115661621, "eval_runtime": 21.6881, "eval_samples_per_second": 2.213, "eval_steps_per_second": 0.369, "step": 1290 }, { "epoch": 15.116279069767442, "grad_norm": 18.75763511657715, "learning_rate": 4.497131782945737e-05, "loss": 16.3528, "step": 1300 }, { "epoch": 16.0, "eval_dummy": 1.0, "eval_loss": 26.325368881225586, "eval_runtime": 22.2169, "eval_samples_per_second": 2.161, "eval_steps_per_second": 0.36, "step": 1376 }, { "epoch": 16.27906976744186, "grad_norm": 79.44314575195312, "learning_rate": 4.458449612403101e-05, "loss": 16.5548, "step": 1400 }, { "epoch": 17.0, "eval_dummy": 1.0, "eval_loss": 26.924423217773438, "eval_runtime": 22.8317, "eval_samples_per_second": 2.102, "eval_steps_per_second": 0.35, "step": 1462 }, { "epoch": 17.441860465116278, "grad_norm": 36.895751953125, "learning_rate": 4.419767441860465e-05, "loss": 16.6848, "step": 1500 }, { "epoch": 18.0, "eval_dummy": 1.0, "eval_loss": 27.62941551208496, "eval_runtime": 22.0813, "eval_samples_per_second": 2.174, "eval_steps_per_second": 0.362, "step": 1548 }, { "epoch": 18.6046511627907, "grad_norm": 32.350345611572266, "learning_rate": 4.3810852713178295e-05, "loss": 15.4544, "step": 1600 }, { "epoch": 19.0, "eval_dummy": 1.0, "eval_loss": 25.75701141357422, "eval_runtime": 22.0108, "eval_samples_per_second": 2.181, "eval_steps_per_second": 0.363, "step": 1634 }, { "epoch": 19.767441860465116, "grad_norm": 37.6955680847168, "learning_rate": 4.342403100775194e-05, "loss": 15.7209, "step": 1700 }, { "epoch": 20.0, "eval_dummy": 1.0, "eval_loss": 25.709718704223633, "eval_runtime": 22.3822, "eval_samples_per_second": 2.145, "eval_steps_per_second": 0.357, "step": 1720 }, { "epoch": 20.930232558139537, "grad_norm": 38.947425842285156, "learning_rate": 4.303720930232558e-05, "loss": 15.3127, "step": 1800 }, { "epoch": 21.0, "eval_dummy": 1.0, "eval_loss": 27.260438919067383, "eval_runtime": 22.4951, "eval_samples_per_second": 2.134, "eval_steps_per_second": 0.356, "step": 1806 }, { "epoch": 22.0, "eval_dummy": 1.0, "eval_loss": 26.42862892150879, "eval_runtime": 23.2626, "eval_samples_per_second": 2.063, "eval_steps_per_second": 0.344, "step": 1892 }, { "epoch": 22.093023255813954, "grad_norm": 39.80724334716797, "learning_rate": 4.2650387596899224e-05, "loss": 14.9528, "step": 1900 }, { "epoch": 23.0, "eval_dummy": 1.0, "eval_loss": 27.576751708984375, "eval_runtime": 23.5885, "eval_samples_per_second": 2.035, "eval_steps_per_second": 0.339, "step": 1978 }, { "epoch": 23.25581395348837, "grad_norm": 27.231279373168945, "learning_rate": 4.226356589147287e-05, "loss": 15.1795, "step": 2000 }, { "epoch": 24.0, "eval_dummy": 1.0, "eval_loss": 26.471437454223633, "eval_runtime": 21.9954, "eval_samples_per_second": 2.182, "eval_steps_per_second": 0.364, "step": 2064 }, { "epoch": 24.41860465116279, "grad_norm": 32.53752517700195, "learning_rate": 4.187674418604651e-05, "loss": 14.707, "step": 2100 }, { "epoch": 25.0, "eval_dummy": 1.0, "eval_loss": 28.097671508789062, "eval_runtime": 23.8305, "eval_samples_per_second": 2.014, "eval_steps_per_second": 0.336, "step": 2150 }, { "epoch": 25.58139534883721, "grad_norm": 22.621362686157227, "learning_rate": 4.148992248062016e-05, "loss": 14.3456, "step": 2200 }, { "epoch": 26.0, "eval_dummy": 1.0, "eval_loss": 26.79138946533203, "eval_runtime": 22.0665, "eval_samples_per_second": 2.175, "eval_steps_per_second": 0.363, "step": 2236 }, { "epoch": 26.74418604651163, "grad_norm": 27.181503295898438, "learning_rate": 4.11031007751938e-05, "loss": 14.4534, "step": 2300 }, { "epoch": 27.0, "eval_dummy": 1.0, "eval_loss": 27.407852172851562, "eval_runtime": 22.3011, "eval_samples_per_second": 2.152, "eval_steps_per_second": 0.359, "step": 2322 }, { "epoch": 27.906976744186046, "grad_norm": 61.37733459472656, "learning_rate": 4.0716279069767445e-05, "loss": 14.4448, "step": 2400 }, { "epoch": 28.0, "eval_dummy": 1.0, "eval_loss": 26.829126358032227, "eval_runtime": 31.0502, "eval_samples_per_second": 1.546, "eval_steps_per_second": 0.258, "step": 2408 }, { "epoch": 29.0, "eval_dummy": 1.0, "eval_loss": 27.15058135986328, "eval_runtime": 22.5789, "eval_samples_per_second": 2.126, "eval_steps_per_second": 0.354, "step": 2494 }, { "epoch": 29.069767441860463, "grad_norm": 28.471744537353516, "learning_rate": 4.032945736434109e-05, "loss": 14.0327, "step": 2500 }, { "epoch": 30.0, "eval_dummy": 1.0, "eval_loss": 27.197296142578125, "eval_runtime": 23.3263, "eval_samples_per_second": 2.058, "eval_steps_per_second": 0.343, "step": 2580 }, { "epoch": 30.232558139534884, "grad_norm": 24.784605026245117, "learning_rate": 3.994263565891473e-05, "loss": 13.8785, "step": 2600 }, { "epoch": 31.0, "eval_dummy": 1.0, "eval_loss": 27.506200790405273, "eval_runtime": 19.8913, "eval_samples_per_second": 2.413, "eval_steps_per_second": 0.402, "step": 2666 }, { "epoch": 31.3953488372093, "grad_norm": 21.817359924316406, "learning_rate": 3.9555813953488374e-05, "loss": 14.3373, "step": 2700 }, { "epoch": 32.0, "eval_dummy": 1.0, "eval_loss": 27.950990676879883, "eval_runtime": 20.3531, "eval_samples_per_second": 2.358, "eval_steps_per_second": 0.393, "step": 2752 }, { "epoch": 32.55813953488372, "grad_norm": 37.52482986450195, "learning_rate": 3.9168992248062024e-05, "loss": 13.3176, "step": 2800 }, { "epoch": 33.0, "eval_dummy": 1.0, "eval_loss": 27.187788009643555, "eval_runtime": 22.4791, "eval_samples_per_second": 2.135, "eval_steps_per_second": 0.356, "step": 2838 }, { "epoch": 33.72093023255814, "grad_norm": 20.109516143798828, "learning_rate": 3.8782170542635666e-05, "loss": 13.8154, "step": 2900 }, { "epoch": 34.0, "eval_dummy": 1.0, "eval_loss": 25.575902938842773, "eval_runtime": 22.7865, "eval_samples_per_second": 2.107, "eval_steps_per_second": 0.351, "step": 2924 }, { "epoch": 34.883720930232556, "grad_norm": 28.61270523071289, "learning_rate": 3.839534883720931e-05, "loss": 13.8962, "step": 3000 }, { "epoch": 35.0, "eval_dummy": 1.0, "eval_loss": 27.762657165527344, "eval_runtime": 22.9543, "eval_samples_per_second": 2.091, "eval_steps_per_second": 0.349, "step": 3010 }, { "epoch": 36.0, "eval_dummy": 1.0, "eval_loss": 28.80609893798828, "eval_runtime": 21.8049, "eval_samples_per_second": 2.201, "eval_steps_per_second": 0.367, "step": 3096 }, { "epoch": 36.04651162790697, "grad_norm": 16.99755859375, "learning_rate": 3.800852713178295e-05, "loss": 13.3858, "step": 3100 }, { "epoch": 37.0, "eval_dummy": 1.0, "eval_loss": 28.43279457092285, "eval_runtime": 22.6206, "eval_samples_per_second": 2.122, "eval_steps_per_second": 0.354, "step": 3182 }, { "epoch": 37.2093023255814, "grad_norm": 28.740880966186523, "learning_rate": 3.7621705426356595e-05, "loss": 12.9659, "step": 3200 }, { "epoch": 38.0, "eval_dummy": 1.0, "eval_loss": 27.551528930664062, "eval_runtime": 22.7619, "eval_samples_per_second": 2.109, "eval_steps_per_second": 0.351, "step": 3268 }, { "epoch": 38.372093023255815, "grad_norm": 27.66327667236328, "learning_rate": 3.723488372093023e-05, "loss": 13.6813, "step": 3300 }, { "epoch": 39.0, "eval_dummy": 1.0, "eval_loss": 27.82057762145996, "eval_runtime": 22.3455, "eval_samples_per_second": 2.148, "eval_steps_per_second": 0.358, "step": 3354 }, { "epoch": 39.53488372093023, "grad_norm": 23.328880310058594, "learning_rate": 3.6848062015503874e-05, "loss": 13.3049, "step": 3400 }, { "epoch": 40.0, "eval_dummy": 1.0, "eval_loss": 28.60624885559082, "eval_runtime": 21.4157, "eval_samples_per_second": 2.241, "eval_steps_per_second": 0.374, "step": 3440 }, { "epoch": 40.69767441860465, "grad_norm": 15.833329200744629, "learning_rate": 3.646124031007752e-05, "loss": 13.1584, "step": 3500 }, { "epoch": 41.0, "eval_dummy": 1.0, "eval_loss": 28.53643798828125, "eval_runtime": 22.5851, "eval_samples_per_second": 2.125, "eval_steps_per_second": 0.354, "step": 3526 }, { "epoch": 41.86046511627907, "grad_norm": 20.939760208129883, "learning_rate": 3.607441860465116e-05, "loss": 12.9234, "step": 3600 }, { "epoch": 42.0, "eval_dummy": 1.0, "eval_loss": 29.316545486450195, "eval_runtime": 23.0269, "eval_samples_per_second": 2.085, "eval_steps_per_second": 0.347, "step": 3612 }, { "epoch": 43.0, "eval_dummy": 1.0, "eval_loss": 28.51560401916504, "eval_runtime": 23.1047, "eval_samples_per_second": 2.077, "eval_steps_per_second": 0.346, "step": 3698 }, { "epoch": 43.02325581395349, "grad_norm": 36.79478454589844, "learning_rate": 3.56875968992248e-05, "loss": 13.1375, "step": 3700 }, { "epoch": 44.0, "eval_dummy": 1.0, "eval_loss": 28.247644424438477, "eval_runtime": 22.1995, "eval_samples_per_second": 2.162, "eval_steps_per_second": 0.36, "step": 3784 }, { "epoch": 44.18604651162791, "grad_norm": 23.890663146972656, "learning_rate": 3.5300775193798446e-05, "loss": 12.7875, "step": 3800 }, { "epoch": 45.0, "eval_dummy": 1.0, "eval_loss": 29.995925903320312, "eval_runtime": 21.9104, "eval_samples_per_second": 2.191, "eval_steps_per_second": 0.365, "step": 3870 }, { "epoch": 45.348837209302324, "grad_norm": 24.69568634033203, "learning_rate": 3.4913953488372095e-05, "loss": 12.6507, "step": 3900 }, { "epoch": 46.0, "eval_dummy": 1.0, "eval_loss": 28.547975540161133, "eval_runtime": 21.9012, "eval_samples_per_second": 2.192, "eval_steps_per_second": 0.365, "step": 3956 }, { "epoch": 46.51162790697674, "grad_norm": 20.390417098999023, "learning_rate": 3.452713178294574e-05, "loss": 13.0131, "step": 4000 }, { "epoch": 47.0, "eval_dummy": 1.0, "eval_loss": 29.111730575561523, "eval_runtime": 21.9155, "eval_samples_per_second": 2.19, "eval_steps_per_second": 0.365, "step": 4042 }, { "epoch": 47.674418604651166, "grad_norm": 28.50699234008789, "learning_rate": 3.414031007751938e-05, "loss": 12.3806, "step": 4100 }, { "epoch": 48.0, "eval_dummy": 1.0, "eval_loss": 31.215286254882812, "eval_runtime": 22.3392, "eval_samples_per_second": 2.149, "eval_steps_per_second": 0.358, "step": 4128 }, { "epoch": 48.83720930232558, "grad_norm": 40.268310546875, "learning_rate": 3.3753488372093024e-05, "loss": 12.9016, "step": 4200 }, { "epoch": 49.0, "eval_dummy": 1.0, "eval_loss": 28.940536499023438, "eval_runtime": 22.1862, "eval_samples_per_second": 2.164, "eval_steps_per_second": 0.361, "step": 4214 }, { "epoch": 50.0, "grad_norm": 20.862178802490234, "learning_rate": 3.3366666666666674e-05, "loss": 12.274, "step": 4300 }, { "epoch": 50.0, "eval_dummy": 1.0, "eval_loss": 28.7396297454834, "eval_runtime": 23.512, "eval_samples_per_second": 2.042, "eval_steps_per_second": 0.34, "step": 4300 }, { "epoch": 51.0, "eval_dummy": 1.0, "eval_loss": 30.3947696685791, "eval_runtime": 22.1427, "eval_samples_per_second": 2.168, "eval_steps_per_second": 0.361, "step": 4386 }, { "epoch": 51.16279069767442, "grad_norm": 24.852718353271484, "learning_rate": 3.2979844961240316e-05, "loss": 12.5767, "step": 4400 }, { "epoch": 52.0, "eval_dummy": 1.0, "eval_loss": 29.386320114135742, "eval_runtime": 22.2511, "eval_samples_per_second": 2.157, "eval_steps_per_second": 0.36, "step": 4472 }, { "epoch": 52.325581395348834, "grad_norm": 17.813112258911133, "learning_rate": 3.259302325581396e-05, "loss": 12.5965, "step": 4500 }, { "epoch": 53.0, "eval_dummy": 1.0, "eval_loss": 29.451583862304688, "eval_runtime": 22.6655, "eval_samples_per_second": 2.118, "eval_steps_per_second": 0.353, "step": 4558 }, { "epoch": 53.48837209302326, "grad_norm": 30.23479461669922, "learning_rate": 3.22062015503876e-05, "loss": 11.9685, "step": 4600 }, { "epoch": 54.0, "eval_dummy": 1.0, "eval_loss": 27.297447204589844, "eval_runtime": 21.7839, "eval_samples_per_second": 2.203, "eval_steps_per_second": 0.367, "step": 4644 }, { "epoch": 54.651162790697676, "grad_norm": 22.38864517211914, "learning_rate": 3.1819379844961245e-05, "loss": 12.3025, "step": 4700 }, { "epoch": 55.0, "eval_dummy": 1.0, "eval_loss": 27.001306533813477, "eval_runtime": 20.0559, "eval_samples_per_second": 2.393, "eval_steps_per_second": 0.399, "step": 4730 }, { "epoch": 55.81395348837209, "grad_norm": 45.97740936279297, "learning_rate": 3.143255813953489e-05, "loss": 12.4256, "step": 4800 }, { "epoch": 56.0, "eval_dummy": 1.0, "eval_loss": 27.27130126953125, "eval_runtime": 19.8678, "eval_samples_per_second": 2.416, "eval_steps_per_second": 0.403, "step": 4816 }, { "epoch": 56.97674418604651, "grad_norm": 29.370532989501953, "learning_rate": 3.104573643410853e-05, "loss": 12.2008, "step": 4900 }, { "epoch": 57.0, "eval_dummy": 1.0, "eval_loss": 27.405370712280273, "eval_runtime": 22.0029, "eval_samples_per_second": 2.182, "eval_steps_per_second": 0.364, "step": 4902 }, { "epoch": 58.0, "eval_dummy": 1.0, "eval_loss": 27.954648971557617, "eval_runtime": 21.8467, "eval_samples_per_second": 2.197, "eval_steps_per_second": 0.366, "step": 4988 }, { "epoch": 58.13953488372093, "grad_norm": 51.44390869140625, "learning_rate": 3.0658914728682174e-05, "loss": 12.1018, "step": 5000 }, { "epoch": 59.0, "eval_dummy": 1.0, "eval_loss": 28.945287704467773, "eval_runtime": 21.6575, "eval_samples_per_second": 2.216, "eval_steps_per_second": 0.369, "step": 5074 }, { "epoch": 59.30232558139535, "grad_norm": 13.186450958251953, "learning_rate": 3.027209302325581e-05, "loss": 12.2156, "step": 5100 }, { "epoch": 60.0, "eval_dummy": 1.0, "eval_loss": 29.3121337890625, "eval_runtime": 21.8058, "eval_samples_per_second": 2.201, "eval_steps_per_second": 0.367, "step": 5160 }, { "epoch": 60.46511627906977, "grad_norm": 38.734771728515625, "learning_rate": 2.9885271317829456e-05, "loss": 11.9526, "step": 5200 }, { "epoch": 61.0, "eval_dummy": 1.0, "eval_loss": 30.190271377563477, "eval_runtime": 23.0086, "eval_samples_per_second": 2.086, "eval_steps_per_second": 0.348, "step": 5246 }, { "epoch": 61.627906976744185, "grad_norm": 61.52443313598633, "learning_rate": 2.9498449612403096e-05, "loss": 12.1103, "step": 5300 }, { "epoch": 62.0, "eval_dummy": 1.0, "eval_loss": 28.82759666442871, "eval_runtime": 20.1803, "eval_samples_per_second": 2.379, "eval_steps_per_second": 0.396, "step": 5332 }, { "epoch": 62.7906976744186, "grad_norm": 14.933253288269043, "learning_rate": 2.9111627906976742e-05, "loss": 11.8017, "step": 5400 }, { "epoch": 63.0, "eval_dummy": 1.0, "eval_loss": 28.789840698242188, "eval_runtime": 22.9342, "eval_samples_per_second": 2.093, "eval_steps_per_second": 0.349, "step": 5418 }, { "epoch": 63.95348837209303, "grad_norm": 16.43902015686035, "learning_rate": 2.8724806201550385e-05, "loss": 11.9907, "step": 5500 }, { "epoch": 64.0, "eval_dummy": 1.0, "eval_loss": 28.616714477539062, "eval_runtime": 21.968, "eval_samples_per_second": 2.185, "eval_steps_per_second": 0.364, "step": 5504 }, { "epoch": 65.0, "eval_dummy": 1.0, "eval_loss": 29.28215217590332, "eval_runtime": 23.1029, "eval_samples_per_second": 2.078, "eval_steps_per_second": 0.346, "step": 5590 }, { "epoch": 65.11627906976744, "grad_norm": 14.621051788330078, "learning_rate": 2.833798449612403e-05, "loss": 11.6683, "step": 5600 }, { "epoch": 66.0, "eval_dummy": 1.0, "eval_loss": 31.469453811645508, "eval_runtime": 21.6472, "eval_samples_per_second": 2.217, "eval_steps_per_second": 0.37, "step": 5676 }, { "epoch": 66.27906976744185, "grad_norm": 22.4487361907959, "learning_rate": 2.795116279069767e-05, "loss": 12.1544, "step": 5700 }, { "epoch": 67.0, "eval_dummy": 1.0, "eval_loss": 27.77729034423828, "eval_runtime": 22.155, "eval_samples_per_second": 2.167, "eval_steps_per_second": 0.361, "step": 5762 }, { "epoch": 67.44186046511628, "grad_norm": 26.262325286865234, "learning_rate": 2.7564341085271317e-05, "loss": 11.7442, "step": 5800 }, { "epoch": 68.0, "eval_dummy": 1.0, "eval_loss": 29.537628173828125, "eval_runtime": 22.0834, "eval_samples_per_second": 2.174, "eval_steps_per_second": 0.362, "step": 5848 }, { "epoch": 68.6046511627907, "grad_norm": 15.915388107299805, "learning_rate": 2.7177519379844963e-05, "loss": 11.1493, "step": 5900 }, { "epoch": 69.0, "eval_dummy": 1.0, "eval_loss": 27.891584396362305, "eval_runtime": 20.4067, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.392, "step": 5934 }, { "epoch": 69.76744186046511, "grad_norm": 14.056746482849121, "learning_rate": 2.679069767441861e-05, "loss": 12.0781, "step": 6000 }, { "epoch": 70.0, "eval_dummy": 1.0, "eval_loss": 28.40962791442871, "eval_runtime": 20.2928, "eval_samples_per_second": 2.365, "eval_steps_per_second": 0.394, "step": 6020 }, { "epoch": 70.93023255813954, "grad_norm": 12.681285858154297, "learning_rate": 2.640387596899225e-05, "loss": 11.8055, "step": 6100 }, { "epoch": 71.0, "eval_dummy": 1.0, "eval_loss": 29.22722816467285, "eval_runtime": 21.9669, "eval_samples_per_second": 2.185, "eval_steps_per_second": 0.364, "step": 6106 }, { "epoch": 72.0, "eval_dummy": 1.0, "eval_loss": 29.27687644958496, "eval_runtime": 22.1231, "eval_samples_per_second": 2.17, "eval_steps_per_second": 0.362, "step": 6192 }, { "epoch": 72.09302325581395, "grad_norm": 32.67131805419922, "learning_rate": 2.6017054263565892e-05, "loss": 11.4811, "step": 6200 }, { "epoch": 73.0, "eval_dummy": 1.0, "eval_loss": 29.255170822143555, "eval_runtime": 22.0309, "eval_samples_per_second": 2.179, "eval_steps_per_second": 0.363, "step": 6278 }, { "epoch": 73.25581395348837, "grad_norm": 10.397403717041016, "learning_rate": 2.5630232558139535e-05, "loss": 11.5947, "step": 6300 }, { "epoch": 74.0, "eval_dummy": 1.0, "eval_loss": 29.26108741760254, "eval_runtime": 22.8254, "eval_samples_per_second": 2.103, "eval_steps_per_second": 0.35, "step": 6364 }, { "epoch": 74.4186046511628, "grad_norm": 14.20971965789795, "learning_rate": 2.5243410852713178e-05, "loss": 11.7263, "step": 6400 }, { "epoch": 75.0, "eval_dummy": 1.0, "eval_loss": 30.79526710510254, "eval_runtime": 21.7371, "eval_samples_per_second": 2.208, "eval_steps_per_second": 0.368, "step": 6450 }, { "epoch": 75.5813953488372, "grad_norm": 22.18491554260254, "learning_rate": 2.485658914728682e-05, "loss": 11.7399, "step": 6500 }, { "epoch": 76.0, "eval_dummy": 1.0, "eval_loss": 30.069183349609375, "eval_runtime": 20.4489, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.391, "step": 6536 }, { "epoch": 76.74418604651163, "grad_norm": 40.54490280151367, "learning_rate": 2.4469767441860463e-05, "loss": 11.0851, "step": 6600 }, { "epoch": 77.0, "eval_dummy": 1.0, "eval_loss": 29.680282592773438, "eval_runtime": 22.1352, "eval_samples_per_second": 2.168, "eval_steps_per_second": 0.361, "step": 6622 }, { "epoch": 77.90697674418605, "grad_norm": 23.71202278137207, "learning_rate": 2.4082945736434106e-05, "loss": 11.5118, "step": 6700 }, { "epoch": 78.0, "eval_dummy": 1.0, "eval_loss": 30.73450469970703, "eval_runtime": 21.8262, "eval_samples_per_second": 2.199, "eval_steps_per_second": 0.367, "step": 6708 }, { "epoch": 79.0, "eval_dummy": 1.0, "eval_loss": 31.597990036010742, "eval_runtime": 20.799, "eval_samples_per_second": 2.308, "eval_steps_per_second": 0.385, "step": 6794 }, { "epoch": 79.06976744186046, "grad_norm": 31.14312744140625, "learning_rate": 2.369612403100775e-05, "loss": 11.516, "step": 6800 }, { "epoch": 80.0, "eval_dummy": 1.0, "eval_loss": 30.527856826782227, "eval_runtime": 22.2074, "eval_samples_per_second": 2.161, "eval_steps_per_second": 0.36, "step": 6880 }, { "epoch": 80.23255813953489, "grad_norm": 24.23548126220703, "learning_rate": 2.3309302325581395e-05, "loss": 11.3797, "step": 6900 }, { "epoch": 81.0, "eval_dummy": 1.0, "eval_loss": 30.22646141052246, "eval_runtime": 21.9444, "eval_samples_per_second": 2.187, "eval_steps_per_second": 0.365, "step": 6966 }, { "epoch": 81.3953488372093, "grad_norm": 19.255537033081055, "learning_rate": 2.2922480620155038e-05, "loss": 11.3335, "step": 7000 }, { "epoch": 82.0, "eval_dummy": 1.0, "eval_loss": 30.381624221801758, "eval_runtime": 20.7523, "eval_samples_per_second": 2.313, "eval_steps_per_second": 0.385, "step": 7052 }, { "epoch": 82.55813953488372, "grad_norm": 29.231863021850586, "learning_rate": 2.253565891472868e-05, "loss": 11.2303, "step": 7100 }, { "epoch": 83.0, "eval_dummy": 1.0, "eval_loss": 29.32378578186035, "eval_runtime": 21.1077, "eval_samples_per_second": 2.274, "eval_steps_per_second": 0.379, "step": 7138 }, { "epoch": 83.72093023255815, "grad_norm": 26.13906478881836, "learning_rate": 2.2148837209302324e-05, "loss": 11.1964, "step": 7200 }, { "epoch": 84.0, "eval_dummy": 1.0, "eval_loss": 30.398666381835938, "eval_runtime": 21.5159, "eval_samples_per_second": 2.231, "eval_steps_per_second": 0.372, "step": 7224 }, { "epoch": 84.88372093023256, "grad_norm": 9.493932723999023, "learning_rate": 2.1762015503875967e-05, "loss": 11.321, "step": 7300 }, { "epoch": 85.0, "eval_dummy": 1.0, "eval_loss": 30.193452835083008, "eval_runtime": 19.183, "eval_samples_per_second": 2.502, "eval_steps_per_second": 0.417, "step": 7310 }, { "epoch": 86.0, "eval_dummy": 1.0, "eval_loss": 29.14207649230957, "eval_runtime": 20.4862, "eval_samples_per_second": 2.343, "eval_steps_per_second": 0.391, "step": 7396 }, { "epoch": 86.04651162790698, "grad_norm": 22.432432174682617, "learning_rate": 2.137519379844961e-05, "loss": 11.3891, "step": 7400 }, { "epoch": 87.0, "eval_dummy": 1.0, "eval_loss": 31.20737648010254, "eval_runtime": 20.51, "eval_samples_per_second": 2.34, "eval_steps_per_second": 0.39, "step": 7482 }, { "epoch": 87.20930232558139, "grad_norm": 17.62310218811035, "learning_rate": 2.0988372093023253e-05, "loss": 11.1347, "step": 7500 }, { "epoch": 88.0, "eval_dummy": 1.0, "eval_loss": 30.67350196838379, "eval_runtime": 20.7488, "eval_samples_per_second": 2.313, "eval_steps_per_second": 0.386, "step": 7568 }, { "epoch": 88.37209302325581, "grad_norm": 30.231889724731445, "learning_rate": 2.0601550387596896e-05, "loss": 11.1945, "step": 7600 }, { "epoch": 89.0, "eval_dummy": 1.0, "eval_loss": 31.205347061157227, "eval_runtime": 20.2969, "eval_samples_per_second": 2.365, "eval_steps_per_second": 0.394, "step": 7654 }, { "epoch": 89.53488372093024, "grad_norm": 26.26261329650879, "learning_rate": 2.0214728682170545e-05, "loss": 10.9891, "step": 7700 }, { "epoch": 90.0, "eval_dummy": 1.0, "eval_loss": 31.437280654907227, "eval_runtime": 20.9436, "eval_samples_per_second": 2.292, "eval_steps_per_second": 0.382, "step": 7740 }, { "epoch": 90.69767441860465, "grad_norm": 36.840755462646484, "learning_rate": 1.9827906976744188e-05, "loss": 11.104, "step": 7800 }, { "epoch": 91.0, "eval_dummy": 1.0, "eval_loss": 31.39463233947754, "eval_runtime": 20.2092, "eval_samples_per_second": 2.375, "eval_steps_per_second": 0.396, "step": 7826 }, { "epoch": 91.86046511627907, "grad_norm": 14.336752891540527, "learning_rate": 1.944108527131783e-05, "loss": 11.1408, "step": 7900 }, { "epoch": 92.0, "eval_dummy": 1.0, "eval_loss": 31.218612670898438, "eval_runtime": 20.5575, "eval_samples_per_second": 2.335, "eval_steps_per_second": 0.389, "step": 7912 }, { "epoch": 93.0, "eval_dummy": 1.0, "eval_loss": 29.58709716796875, "eval_runtime": 20.4638, "eval_samples_per_second": 2.346, "eval_steps_per_second": 0.391, "step": 7998 }, { "epoch": 93.02325581395348, "grad_norm": 19.522462844848633, "learning_rate": 1.9054263565891474e-05, "loss": 11.0779, "step": 8000 }, { "epoch": 94.0, "eval_dummy": 1.0, "eval_loss": 30.467092514038086, "eval_runtime": 20.9305, "eval_samples_per_second": 2.293, "eval_steps_per_second": 0.382, "step": 8084 }, { "epoch": 94.18604651162791, "grad_norm": 13.406259536743164, "learning_rate": 1.8667441860465113e-05, "loss": 11.0551, "step": 8100 }, { "epoch": 95.0, "eval_dummy": 1.0, "eval_loss": 32.01297378540039, "eval_runtime": 22.2677, "eval_samples_per_second": 2.156, "eval_steps_per_second": 0.359, "step": 8170 }, { "epoch": 95.34883720930233, "grad_norm": 27.72422218322754, "learning_rate": 1.8280620155038756e-05, "loss": 10.8809, "step": 8200 }, { "epoch": 96.0, "eval_dummy": 1.0, "eval_loss": 30.445892333984375, "eval_runtime": 19.9061, "eval_samples_per_second": 2.411, "eval_steps_per_second": 0.402, "step": 8256 }, { "epoch": 96.51162790697674, "grad_norm": 22.884096145629883, "learning_rate": 1.78937984496124e-05, "loss": 11.1123, "step": 8300 }, { "epoch": 97.0, "eval_dummy": 1.0, "eval_loss": 30.841537475585938, "eval_runtime": 20.3958, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.392, "step": 8342 }, { "epoch": 97.67441860465117, "grad_norm": 18.161767959594727, "learning_rate": 1.7506976744186042e-05, "loss": 10.7116, "step": 8400 }, { "epoch": 98.0, "eval_dummy": 1.0, "eval_loss": 31.04449462890625, "eval_runtime": 20.58, "eval_samples_per_second": 2.332, "eval_steps_per_second": 0.389, "step": 8428 }, { "epoch": 98.83720930232558, "grad_norm": 24.269365310668945, "learning_rate": 1.7120155038759692e-05, "loss": 11.0086, "step": 8500 }, { "epoch": 99.0, "eval_dummy": 1.0, "eval_loss": 31.04711151123047, "eval_runtime": 22.2487, "eval_samples_per_second": 2.157, "eval_steps_per_second": 0.36, "step": 8514 }, { "epoch": 100.0, "grad_norm": 12.588210105895996, "learning_rate": 1.6733333333333335e-05, "loss": 11.0542, "step": 8600 }, { "epoch": 100.0, "eval_dummy": 1.0, "eval_loss": 31.0217227935791, "eval_runtime": 20.2048, "eval_samples_per_second": 2.376, "eval_steps_per_second": 0.396, "step": 8600 }, { "epoch": 101.0, "eval_dummy": 1.0, "eval_loss": 31.788482666015625, "eval_runtime": 20.6031, "eval_samples_per_second": 2.33, "eval_steps_per_second": 0.388, "step": 8686 }, { "epoch": 101.16279069767442, "grad_norm": 15.751969337463379, "learning_rate": 1.6346511627906978e-05, "loss": 10.8332, "step": 8700 }, { "epoch": 102.0, "eval_dummy": 1.0, "eval_loss": 30.61908531188965, "eval_runtime": 22.0561, "eval_samples_per_second": 2.176, "eval_steps_per_second": 0.363, "step": 8772 }, { "epoch": 102.32558139534883, "grad_norm": 21.087759017944336, "learning_rate": 1.595968992248062e-05, "loss": 10.8696, "step": 8800 }, { "epoch": 103.0, "eval_dummy": 1.0, "eval_loss": 31.207502365112305, "eval_runtime": 20.2373, "eval_samples_per_second": 2.372, "eval_steps_per_second": 0.395, "step": 8858 }, { "epoch": 103.48837209302326, "grad_norm": 12.689813613891602, "learning_rate": 1.5572868217054263e-05, "loss": 10.6959, "step": 8900 }, { "epoch": 104.0, "eval_dummy": 1.0, "eval_loss": 32.079524993896484, "eval_runtime": 20.7358, "eval_samples_per_second": 2.315, "eval_steps_per_second": 0.386, "step": 8944 }, { "epoch": 104.65116279069767, "grad_norm": 15.677764892578125, "learning_rate": 1.5186046511627904e-05, "loss": 11.0688, "step": 9000 }, { "epoch": 105.0, "eval_dummy": 1.0, "eval_loss": 33.7819709777832, "eval_runtime": 21.8207, "eval_samples_per_second": 2.2, "eval_steps_per_second": 0.367, "step": 9030 }, { "epoch": 105.81395348837209, "grad_norm": 30.146947860717773, "learning_rate": 1.479922480620155e-05, "loss": 10.6762, "step": 9100 }, { "epoch": 106.0, "eval_dummy": 1.0, "eval_loss": 31.94025421142578, "eval_runtime": 21.7851, "eval_samples_per_second": 2.203, "eval_steps_per_second": 0.367, "step": 9116 }, { "epoch": 106.97674418604652, "grad_norm": 15.782742500305176, "learning_rate": 1.4412403100775194e-05, "loss": 10.8607, "step": 9200 }, { "epoch": 107.0, "eval_dummy": 1.0, "eval_loss": 33.13449478149414, "eval_runtime": 20.3683, "eval_samples_per_second": 2.357, "eval_steps_per_second": 0.393, "step": 9202 }, { "epoch": 108.0, "eval_dummy": 1.0, "eval_loss": 31.081071853637695, "eval_runtime": 20.7274, "eval_samples_per_second": 2.316, "eval_steps_per_second": 0.386, "step": 9288 }, { "epoch": 108.13953488372093, "grad_norm": 18.13740348815918, "learning_rate": 1.4025581395348838e-05, "loss": 10.7504, "step": 9300 }, { "epoch": 109.0, "eval_dummy": 1.0, "eval_loss": 31.06629753112793, "eval_runtime": 21.3819, "eval_samples_per_second": 2.245, "eval_steps_per_second": 0.374, "step": 9374 }, { "epoch": 109.30232558139535, "grad_norm": 21.32436180114746, "learning_rate": 1.3638759689922484e-05, "loss": 10.7841, "step": 9400 }, { "epoch": 110.0, "eval_dummy": 1.0, "eval_loss": 30.08406639099121, "eval_runtime": 22.1084, "eval_samples_per_second": 2.171, "eval_steps_per_second": 0.362, "step": 9460 }, { "epoch": 110.46511627906976, "grad_norm": 16.499900817871094, "learning_rate": 1.3251937984496127e-05, "loss": 10.5677, "step": 9500 }, { "epoch": 111.0, "eval_dummy": 1.0, "eval_loss": 30.81854248046875, "eval_runtime": 33.4104, "eval_samples_per_second": 1.437, "eval_steps_per_second": 0.239, "step": 9546 }, { "epoch": 111.62790697674419, "grad_norm": 13.868083000183105, "learning_rate": 1.2865116279069769e-05, "loss": 11.0266, "step": 9600 }, { "epoch": 112.0, "eval_dummy": 1.0, "eval_loss": 32.15488815307617, "eval_runtime": 20.6227, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.388, "step": 9632 }, { "epoch": 112.79069767441861, "grad_norm": 15.670105934143066, "learning_rate": 1.2478294573643411e-05, "loss": 10.5912, "step": 9700 }, { "epoch": 113.0, "eval_dummy": 1.0, "eval_loss": 32.220767974853516, "eval_runtime": 21.7899, "eval_samples_per_second": 2.203, "eval_steps_per_second": 0.367, "step": 9718 }, { "epoch": 113.95348837209302, "grad_norm": 33.69122314453125, "learning_rate": 1.2091472868217054e-05, "loss": 10.6698, "step": 9800 }, { "epoch": 114.0, "eval_dummy": 1.0, "eval_loss": 31.533737182617188, "eval_runtime": 21.6546, "eval_samples_per_second": 2.217, "eval_steps_per_second": 0.369, "step": 9804 }, { "epoch": 115.0, "eval_dummy": 1.0, "eval_loss": 32.227298736572266, "eval_runtime": 21.942, "eval_samples_per_second": 2.188, "eval_steps_per_second": 0.365, "step": 9890 }, { "epoch": 115.11627906976744, "grad_norm": 14.536725997924805, "learning_rate": 1.1704651162790697e-05, "loss": 10.6857, "step": 9900 }, { "epoch": 116.0, "eval_dummy": 1.0, "eval_loss": 31.86484718322754, "eval_runtime": 23.6138, "eval_samples_per_second": 2.033, "eval_steps_per_second": 0.339, "step": 9976 }, { "epoch": 116.27906976744185, "grad_norm": 16.94423484802246, "learning_rate": 1.131782945736434e-05, "loss": 10.5977, "step": 10000 }, { "epoch": 117.0, "eval_dummy": 1.0, "eval_loss": 31.805801391601562, "eval_runtime": 21.9482, "eval_samples_per_second": 2.187, "eval_steps_per_second": 0.364, "step": 10062 }, { "epoch": 117.44186046511628, "grad_norm": 13.750273704528809, "learning_rate": 1.0931007751937988e-05, "loss": 10.6883, "step": 10100 }, { "epoch": 118.0, "eval_dummy": 1.0, "eval_loss": 31.72540855407715, "eval_runtime": 21.0483, "eval_samples_per_second": 2.28, "eval_steps_per_second": 0.38, "step": 10148 }, { "epoch": 118.6046511627907, "grad_norm": 10.593609809875488, "learning_rate": 1.0544186046511631e-05, "loss": 10.3506, "step": 10200 }, { "epoch": 119.0, "eval_dummy": 1.0, "eval_loss": 33.029788970947266, "eval_runtime": 21.054, "eval_samples_per_second": 2.28, "eval_steps_per_second": 0.38, "step": 10234 }, { "epoch": 119.76744186046511, "grad_norm": 15.655586242675781, "learning_rate": 1.0157364341085274e-05, "loss": 10.9217, "step": 10300 }, { "epoch": 120.0, "eval_dummy": 1.0, "eval_loss": 33.34033203125, "eval_runtime": 21.2355, "eval_samples_per_second": 2.26, "eval_steps_per_second": 0.377, "step": 10320 }, { "epoch": 120.93023255813954, "grad_norm": 18.73294448852539, "learning_rate": 9.770542635658917e-06, "loss": 10.5332, "step": 10400 }, { "epoch": 121.0, "eval_dummy": 1.0, "eval_loss": 32.53839111328125, "eval_runtime": 22.6909, "eval_samples_per_second": 2.115, "eval_steps_per_second": 0.353, "step": 10406 }, { "epoch": 122.0, "eval_dummy": 1.0, "eval_loss": 32.219181060791016, "eval_runtime": 23.461, "eval_samples_per_second": 2.046, "eval_steps_per_second": 0.341, "step": 10492 }, { "epoch": 122.09302325581395, "grad_norm": 28.769685745239258, "learning_rate": 9.383720930232558e-06, "loss": 10.4658, "step": 10500 }, { "epoch": 123.0, "eval_dummy": 1.0, "eval_loss": 32.89132308959961, "eval_runtime": 20.3977, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.392, "step": 10578 }, { "epoch": 123.25581395348837, "grad_norm": 15.078568458557129, "learning_rate": 8.9968992248062e-06, "loss": 10.4877, "step": 10600 }, { "epoch": 124.0, "eval_dummy": 1.0, "eval_loss": 33.1068229675293, "eval_runtime": 21.0832, "eval_samples_per_second": 2.277, "eval_steps_per_second": 0.379, "step": 10664 }, { "epoch": 124.4186046511628, "grad_norm": 12.698410987854004, "learning_rate": 8.610077519379844e-06, "loss": 10.7404, "step": 10700 }, { "epoch": 125.0, "eval_dummy": 1.0, "eval_loss": 34.11867904663086, "eval_runtime": 20.3679, "eval_samples_per_second": 2.357, "eval_steps_per_second": 0.393, "step": 10750 }, { "epoch": 125.5813953488372, "grad_norm": 13.220843315124512, "learning_rate": 8.223255813953487e-06, "loss": 10.2195, "step": 10800 }, { "epoch": 126.0, "eval_dummy": 1.0, "eval_loss": 32.441768646240234, "eval_runtime": 22.247, "eval_samples_per_second": 2.158, "eval_steps_per_second": 0.36, "step": 10836 }, { "epoch": 126.74418604651163, "grad_norm": 10.44630241394043, "learning_rate": 7.836434108527135e-06, "loss": 10.7622, "step": 10900 }, { "epoch": 127.0, "eval_dummy": 1.0, "eval_loss": 32.29350662231445, "eval_runtime": 19.9247, "eval_samples_per_second": 2.409, "eval_steps_per_second": 0.402, "step": 10922 }, { "epoch": 127.90697674418605, "grad_norm": 9.349663734436035, "learning_rate": 7.449612403100778e-06, "loss": 10.4301, "step": 11000 }, { "epoch": 128.0, "eval_dummy": 1.0, "eval_loss": 33.241119384765625, "eval_runtime": 21.6889, "eval_samples_per_second": 2.213, "eval_steps_per_second": 0.369, "step": 11008 }, { "epoch": 129.0, "eval_dummy": 1.0, "eval_loss": 32.369197845458984, "eval_runtime": 20.627, "eval_samples_per_second": 2.327, "eval_steps_per_second": 0.388, "step": 11094 }, { "epoch": 129.06976744186048, "grad_norm": 10.043197631835938, "learning_rate": 7.0627906976744195e-06, "loss": 10.6464, "step": 11100 }, { "epoch": 130.0, "eval_dummy": 1.0, "eval_loss": 32.629695892333984, "eval_runtime": 21.5106, "eval_samples_per_second": 2.231, "eval_steps_per_second": 0.372, "step": 11180 }, { "epoch": 130.2325581395349, "grad_norm": 12.1959810256958, "learning_rate": 6.675968992248062e-06, "loss": 10.4213, "step": 11200 }, { "epoch": 131.0, "eval_dummy": 1.0, "eval_loss": 33.751277923583984, "eval_runtime": 20.3699, "eval_samples_per_second": 2.356, "eval_steps_per_second": 0.393, "step": 11266 }, { "epoch": 131.3953488372093, "grad_norm": 8.733377456665039, "learning_rate": 6.289147286821704e-06, "loss": 10.382, "step": 11300 }, { "epoch": 132.0, "eval_dummy": 1.0, "eval_loss": 32.63821029663086, "eval_runtime": 21.2405, "eval_samples_per_second": 2.26, "eval_steps_per_second": 0.377, "step": 11352 }, { "epoch": 132.5581395348837, "grad_norm": 9.42501449584961, "learning_rate": 5.902325581395348e-06, "loss": 10.6049, "step": 11400 }, { "epoch": 133.0, "eval_dummy": 1.0, "eval_loss": 33.26212692260742, "eval_runtime": 19.892, "eval_samples_per_second": 2.413, "eval_steps_per_second": 0.402, "step": 11438 }, { "epoch": 133.72093023255815, "grad_norm": 15.421104431152344, "learning_rate": 5.515503875968991e-06, "loss": 10.3039, "step": 11500 }, { "epoch": 134.0, "eval_dummy": 1.0, "eval_loss": 32.94681167602539, "eval_runtime": 20.8546, "eval_samples_per_second": 2.302, "eval_steps_per_second": 0.384, "step": 11524 }, { "epoch": 134.88372093023256, "grad_norm": 6.813089847564697, "learning_rate": 5.128682170542633e-06, "loss": 10.3088, "step": 11600 }, { "epoch": 135.0, "eval_dummy": 1.0, "eval_loss": 33.48214340209961, "eval_runtime": 20.2807, "eval_samples_per_second": 2.367, "eval_steps_per_second": 0.394, "step": 11610 }, { "epoch": 136.0, "eval_dummy": 1.0, "eval_loss": 33.48236083984375, "eval_runtime": 20.8153, "eval_samples_per_second": 2.306, "eval_steps_per_second": 0.384, "step": 11696 }, { "epoch": 136.04651162790697, "grad_norm": 15.181730270385742, "learning_rate": 4.741860465116282e-06, "loss": 10.4832, "step": 11700 }, { "epoch": 137.0, "eval_dummy": 1.0, "eval_loss": 32.93196487426758, "eval_runtime": 22.5223, "eval_samples_per_second": 2.131, "eval_steps_per_second": 0.355, "step": 11782 }, { "epoch": 137.2093023255814, "grad_norm": 44.49610137939453, "learning_rate": 4.355038759689925e-06, "loss": 10.4149, "step": 11800 }, { "epoch": 138.0, "eval_dummy": 1.0, "eval_loss": 33.88530349731445, "eval_runtime": 22.2737, "eval_samples_per_second": 2.155, "eval_steps_per_second": 0.359, "step": 11868 }, { "epoch": 138.37209302325581, "grad_norm": 9.594252586364746, "learning_rate": 3.968217054263567e-06, "loss": 10.2473, "step": 11900 }, { "epoch": 139.0, "eval_dummy": 1.0, "eval_loss": 33.59774398803711, "eval_runtime": 22.6707, "eval_samples_per_second": 2.117, "eval_steps_per_second": 0.353, "step": 11954 }, { "epoch": 139.53488372093022, "grad_norm": 22.766454696655273, "learning_rate": 3.5813953488372097e-06, "loss": 10.7137, "step": 12000 }, { "epoch": 140.0, "eval_dummy": 1.0, "eval_loss": 34.181739807128906, "eval_runtime": 20.6754, "eval_samples_per_second": 2.322, "eval_steps_per_second": 0.387, "step": 12040 }, { "epoch": 140.69767441860466, "grad_norm": 14.830927848815918, "learning_rate": 3.1945736434108526e-06, "loss": 10.2686, "step": 12100 }, { "epoch": 141.0, "eval_dummy": 1.0, "eval_loss": 34.08915328979492, "eval_runtime": 20.927, "eval_samples_per_second": 2.294, "eval_steps_per_second": 0.382, "step": 12126 }, { "epoch": 141.86046511627907, "grad_norm": 11.483986854553223, "learning_rate": 2.8077519379844955e-06, "loss": 10.2581, "step": 12200 }, { "epoch": 142.0, "eval_dummy": 1.0, "eval_loss": 34.111263275146484, "eval_runtime": 20.251, "eval_samples_per_second": 2.37, "eval_steps_per_second": 0.395, "step": 12212 }, { "epoch": 143.0, "eval_dummy": 1.0, "eval_loss": 33.910648345947266, "eval_runtime": 21.4647, "eval_samples_per_second": 2.236, "eval_steps_per_second": 0.373, "step": 12298 }, { "epoch": 143.02325581395348, "grad_norm": 9.212081909179688, "learning_rate": 2.420930232558138e-06, "loss": 10.447, "step": 12300 }, { "epoch": 144.0, "eval_dummy": 1.0, "eval_loss": 33.34696960449219, "eval_runtime": 22.5636, "eval_samples_per_second": 2.127, "eval_steps_per_second": 0.355, "step": 12384 }, { "epoch": 144.1860465116279, "grad_norm": 11.666382789611816, "learning_rate": 2.034108527131781e-06, "loss": 10.3823, "step": 12400 }, { "epoch": 145.0, "eval_dummy": 1.0, "eval_loss": 33.305511474609375, "eval_runtime": 21.2521, "eval_samples_per_second": 2.259, "eval_steps_per_second": 0.376, "step": 12470 }, { "epoch": 145.34883720930233, "grad_norm": 10.381905555725098, "learning_rate": 1.6472868217054294e-06, "loss": 10.1283, "step": 12500 }, { "epoch": 146.0, "eval_dummy": 1.0, "eval_loss": 33.67618179321289, "eval_runtime": 22.6657, "eval_samples_per_second": 2.118, "eval_steps_per_second": 0.353, "step": 12556 }, { "epoch": 146.51162790697674, "grad_norm": 7.839820861816406, "learning_rate": 1.2604651162790717e-06, "loss": 10.5364, "step": 12600 }, { "epoch": 147.0, "eval_dummy": 1.0, "eval_loss": 33.99766159057617, "eval_runtime": 21.3852, "eval_samples_per_second": 2.245, "eval_steps_per_second": 0.374, "step": 12642 }, { "epoch": 147.67441860465115, "grad_norm": 8.196636199951172, "learning_rate": 8.736434108527145e-07, "loss": 10.1257, "step": 12700 }, { "epoch": 148.0, "eval_dummy": 1.0, "eval_loss": 34.032711029052734, "eval_runtime": 22.054, "eval_samples_per_second": 2.176, "eval_steps_per_second": 0.363, "step": 12728 }, { "epoch": 148.8372093023256, "grad_norm": 7.999052047729492, "learning_rate": 4.868217054263572e-07, "loss": 10.3092, "step": 12800 }, { "epoch": 149.0, "eval_dummy": 1.0, "eval_loss": 34.11701583862305, "eval_runtime": 20.4649, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.391, "step": 12814 }, { "epoch": 150.0, "grad_norm": 14.78354263305664, "learning_rate": 9.999999999999998e-08, "loss": 10.4947, "step": 12900 }, { "epoch": 150.0, "eval_dummy": 1.0, "eval_loss": 33.87764358520508, "eval_runtime": 21.856, "eval_samples_per_second": 2.196, "eval_steps_per_second": 0.366, "step": 12900 }, { "epoch": 150.0, "step": 12900, "total_flos": 2.194725062735954e+19, "train_loss": 12.942298725926598, "train_runtime": 48343.4254, "train_samples_per_second": 1.601, "train_steps_per_second": 0.267 } ], "logging_steps": 100, "max_steps": 12900, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.194725062735954e+19, "train_batch_size": 6, "trial_name": null, "trial_params": null }