{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.40017945266935845, "eval_steps": 500, "global_step": 2676, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00014954389113204725, "grad_norm": 35.95169344376715, "learning_rate": 4.975124378109453e-08, "loss": 1.1911, "step": 1 }, { "epoch": 0.0002990877822640945, "grad_norm": 29.047342527504238, "learning_rate": 9.950248756218906e-08, "loss": 1.4707, "step": 2 }, { "epoch": 0.00044863167339614175, "grad_norm": 24.718727160032117, "learning_rate": 1.4925373134328358e-07, "loss": 0.9534, "step": 3 }, { "epoch": 0.000598175564528189, "grad_norm": 32.87218994198639, "learning_rate": 1.9900497512437812e-07, "loss": 1.2192, "step": 4 }, { "epoch": 0.0007477194556602363, "grad_norm": 25.398344980222138, "learning_rate": 2.4875621890547267e-07, "loss": 1.1835, "step": 5 }, { "epoch": 0.0008972633467922835, "grad_norm": 30.48079389364258, "learning_rate": 2.9850746268656716e-07, "loss": 1.0024, "step": 6 }, { "epoch": 0.0010468072379243307, "grad_norm": 27.780032565686206, "learning_rate": 3.4825870646766175e-07, "loss": 1.1796, "step": 7 }, { "epoch": 0.001196351129056378, "grad_norm": 33.19634259772052, "learning_rate": 3.9800995024875624e-07, "loss": 0.9585, "step": 8 }, { "epoch": 0.0013458950201884253, "grad_norm": 32.92097675417938, "learning_rate": 4.4776119402985074e-07, "loss": 1.1831, "step": 9 }, { "epoch": 0.0014954389113204726, "grad_norm": 31.267461918177617, "learning_rate": 4.975124378109453e-07, "loss": 0.9208, "step": 10 }, { "epoch": 0.0016449828024525197, "grad_norm": 31.652990928454088, "learning_rate": 5.472636815920398e-07, "loss": 0.8882, "step": 11 }, { "epoch": 0.001794526693584567, "grad_norm": 33.800482625732165, "learning_rate": 5.970149253731343e-07, "loss": 1.2138, "step": 12 }, { "epoch": 0.0019440705847166143, "grad_norm": 30.753216086819556, "learning_rate": 6.467661691542289e-07, "loss": 0.9896, "step": 13 }, { "epoch": 0.0020936144758486614, "grad_norm": 32.57679525538582, "learning_rate": 6.965174129353235e-07, "loss": 0.9195, "step": 14 }, { "epoch": 0.0022431583669807087, "grad_norm": 25.334089702892793, "learning_rate": 7.462686567164179e-07, "loss": 0.7515, "step": 15 }, { "epoch": 0.002392702258112756, "grad_norm": 22.2961872211284, "learning_rate": 7.960199004975125e-07, "loss": 0.6638, "step": 16 }, { "epoch": 0.0025422461492448033, "grad_norm": 24.245556768411276, "learning_rate": 8.457711442786071e-07, "loss": 0.7704, "step": 17 }, { "epoch": 0.0026917900403768506, "grad_norm": 19.23412917202397, "learning_rate": 8.955223880597015e-07, "loss": 0.7354, "step": 18 }, { "epoch": 0.002841333931508898, "grad_norm": 18.58051317424024, "learning_rate": 9.452736318407961e-07, "loss": 0.5749, "step": 19 }, { "epoch": 0.0029908778226409452, "grad_norm": 11.242228896944281, "learning_rate": 9.950248756218907e-07, "loss": 0.4914, "step": 20 }, { "epoch": 0.0031404217137729925, "grad_norm": 11.163527479225325, "learning_rate": 1.044776119402985e-06, "loss": 0.5823, "step": 21 }, { "epoch": 0.0032899656049050394, "grad_norm": 9.100766388616314, "learning_rate": 1.0945273631840796e-06, "loss": 0.6887, "step": 22 }, { "epoch": 0.0034395094960370867, "grad_norm": 9.371427313022828, "learning_rate": 1.1442786069651742e-06, "loss": 0.3365, "step": 23 }, { "epoch": 0.003589053387169134, "grad_norm": 6.591365654298028, "learning_rate": 1.1940298507462686e-06, "loss": 0.4092, "step": 24 }, { "epoch": 0.0037385972783011813, "grad_norm": 6.692920733889971, "learning_rate": 1.2437810945273632e-06, "loss": 0.4459, "step": 25 }, { "epoch": 0.0038881411694332286, "grad_norm": 6.609492289627464, "learning_rate": 1.2935323383084578e-06, "loss": 0.4577, "step": 26 }, { "epoch": 0.004037685060565276, "grad_norm": 4.9115623336358, "learning_rate": 1.3432835820895524e-06, "loss": 0.5349, "step": 27 }, { "epoch": 0.004187228951697323, "grad_norm": 5.117676678055004, "learning_rate": 1.393034825870647e-06, "loss": 0.5483, "step": 28 }, { "epoch": 0.0043367728428293706, "grad_norm": 5.263481949191207, "learning_rate": 1.4427860696517414e-06, "loss": 0.5991, "step": 29 }, { "epoch": 0.004486316733961417, "grad_norm": 6.131569220022702, "learning_rate": 1.4925373134328358e-06, "loss": 0.3908, "step": 30 }, { "epoch": 0.004635860625093465, "grad_norm": 5.928579435490833, "learning_rate": 1.5422885572139304e-06, "loss": 0.2084, "step": 31 }, { "epoch": 0.004785404516225512, "grad_norm": 5.916757088180695, "learning_rate": 1.592039800995025e-06, "loss": 0.3858, "step": 32 }, { "epoch": 0.00493494840735756, "grad_norm": 8.20423570651997, "learning_rate": 1.6417910447761196e-06, "loss": 0.2901, "step": 33 }, { "epoch": 0.005084492298489607, "grad_norm": 8.219360009824356, "learning_rate": 1.6915422885572142e-06, "loss": 0.3919, "step": 34 }, { "epoch": 0.005234036189621654, "grad_norm": 5.998450714995048, "learning_rate": 1.7412935323383088e-06, "loss": 0.2445, "step": 35 }, { "epoch": 0.005383580080753701, "grad_norm": 4.267389037528284, "learning_rate": 1.791044776119403e-06, "loss": 0.2062, "step": 36 }, { "epoch": 0.005533123971885748, "grad_norm": 5.463746992191978, "learning_rate": 1.8407960199004975e-06, "loss": 0.5357, "step": 37 }, { "epoch": 0.005682667863017796, "grad_norm": 4.306281637510176, "learning_rate": 1.8905472636815921e-06, "loss": 0.1867, "step": 38 }, { "epoch": 0.005832211754149843, "grad_norm": 6.551059942168939, "learning_rate": 1.9402985074626867e-06, "loss": 0.5944, "step": 39 }, { "epoch": 0.0059817556452818905, "grad_norm": 6.110559490141819, "learning_rate": 1.9900497512437813e-06, "loss": 0.6173, "step": 40 }, { "epoch": 0.006131299536413937, "grad_norm": 4.577457366278138, "learning_rate": 2.0398009950248755e-06, "loss": 0.3634, "step": 41 }, { "epoch": 0.006280843427545985, "grad_norm": 6.020057986889502, "learning_rate": 2.08955223880597e-06, "loss": 0.5398, "step": 42 }, { "epoch": 0.006430387318678032, "grad_norm": 12.119213807947853, "learning_rate": 2.1393034825870647e-06, "loss": 0.2376, "step": 43 }, { "epoch": 0.006579931209810079, "grad_norm": 4.977979102095054, "learning_rate": 2.1890547263681593e-06, "loss": 0.2455, "step": 44 }, { "epoch": 0.006729475100942127, "grad_norm": 3.4274663141099166, "learning_rate": 2.238805970149254e-06, "loss": 0.2356, "step": 45 }, { "epoch": 0.0068790189920741734, "grad_norm": 4.552279062958819, "learning_rate": 2.2885572139303485e-06, "loss": 0.1681, "step": 46 }, { "epoch": 0.007028562883206221, "grad_norm": 2.9323320786902496, "learning_rate": 2.338308457711443e-06, "loss": 0.2303, "step": 47 }, { "epoch": 0.007178106774338268, "grad_norm": 4.623033466327724, "learning_rate": 2.3880597014925373e-06, "loss": 0.2404, "step": 48 }, { "epoch": 0.007327650665470316, "grad_norm": 5.05007020882628, "learning_rate": 2.437810945273632e-06, "loss": 0.4128, "step": 49 }, { "epoch": 0.007477194556602363, "grad_norm": 2.5237349934200273, "learning_rate": 2.4875621890547264e-06, "loss": 0.2196, "step": 50 }, { "epoch": 0.00762673844773441, "grad_norm": 3.7483142878646594, "learning_rate": 2.537313432835821e-06, "loss": 0.1725, "step": 51 }, { "epoch": 0.007776282338866457, "grad_norm": 4.032155563605261, "learning_rate": 2.5870646766169156e-06, "loss": 0.3821, "step": 52 }, { "epoch": 0.007925826229998505, "grad_norm": 3.7782327104964333, "learning_rate": 2.6368159203980102e-06, "loss": 0.2207, "step": 53 }, { "epoch": 0.008075370121130552, "grad_norm": 4.816720331969929, "learning_rate": 2.686567164179105e-06, "loss": 0.2265, "step": 54 }, { "epoch": 0.008224914012262599, "grad_norm": 2.8481845548797478, "learning_rate": 2.736318407960199e-06, "loss": 0.2174, "step": 55 }, { "epoch": 0.008374457903394646, "grad_norm": 4.501151176073331, "learning_rate": 2.786069651741294e-06, "loss": 0.2306, "step": 56 }, { "epoch": 0.008524001794526694, "grad_norm": 4.326693136186164, "learning_rate": 2.835820895522388e-06, "loss": 0.4023, "step": 57 }, { "epoch": 0.008673545685658741, "grad_norm": 4.061925818141106, "learning_rate": 2.885572139303483e-06, "loss": 0.7602, "step": 58 }, { "epoch": 0.008823089576790788, "grad_norm": 6.144988240043741, "learning_rate": 2.9353233830845774e-06, "loss": 0.4451, "step": 59 }, { "epoch": 0.008972633467922835, "grad_norm": 4.985549166627373, "learning_rate": 2.9850746268656716e-06, "loss": 0.4621, "step": 60 }, { "epoch": 0.009122177359054883, "grad_norm": 3.192079125281125, "learning_rate": 3.0348258706467666e-06, "loss": 0.3694, "step": 61 }, { "epoch": 0.00927172125018693, "grad_norm": 4.653619400771914, "learning_rate": 3.0845771144278608e-06, "loss": 0.2416, "step": 62 }, { "epoch": 0.009421265141318977, "grad_norm": 3.4214006556775156, "learning_rate": 3.1343283582089558e-06, "loss": 0.4755, "step": 63 }, { "epoch": 0.009570809032451024, "grad_norm": 3.0809019894250613, "learning_rate": 3.18407960199005e-06, "loss": 0.4154, "step": 64 }, { "epoch": 0.009720352923583071, "grad_norm": 4.190290076677796, "learning_rate": 3.233830845771145e-06, "loss": 0.4362, "step": 65 }, { "epoch": 0.00986989681471512, "grad_norm": 3.1777725686355356, "learning_rate": 3.283582089552239e-06, "loss": 0.3635, "step": 66 }, { "epoch": 0.010019440705847166, "grad_norm": 2.592442539170553, "learning_rate": 3.3333333333333333e-06, "loss": 0.1739, "step": 67 }, { "epoch": 0.010168984596979213, "grad_norm": 4.610893839801018, "learning_rate": 3.3830845771144283e-06, "loss": 0.3845, "step": 68 }, { "epoch": 0.01031852848811126, "grad_norm": 2.941030939381248, "learning_rate": 3.4328358208955225e-06, "loss": 0.226, "step": 69 }, { "epoch": 0.010468072379243309, "grad_norm": 2.641062959772403, "learning_rate": 3.4825870646766175e-06, "loss": 0.2083, "step": 70 }, { "epoch": 0.010617616270375356, "grad_norm": 4.573399002022637, "learning_rate": 3.5323383084577117e-06, "loss": 0.3639, "step": 71 }, { "epoch": 0.010767160161507403, "grad_norm": 3.811597787697304, "learning_rate": 3.582089552238806e-06, "loss": 0.2046, "step": 72 }, { "epoch": 0.01091670405263945, "grad_norm": 7.593654702612937, "learning_rate": 3.631840796019901e-06, "loss": 0.3831, "step": 73 }, { "epoch": 0.011066247943771496, "grad_norm": 2.6372126137968013, "learning_rate": 3.681592039800995e-06, "loss": 0.2155, "step": 74 }, { "epoch": 0.011215791834903545, "grad_norm": 3.401033168780161, "learning_rate": 3.73134328358209e-06, "loss": 0.2439, "step": 75 }, { "epoch": 0.011365335726035592, "grad_norm": 2.8172647382036047, "learning_rate": 3.7810945273631843e-06, "loss": 0.1614, "step": 76 }, { "epoch": 0.011514879617167639, "grad_norm": 3.525793180439174, "learning_rate": 3.8308457711442784e-06, "loss": 0.2176, "step": 77 }, { "epoch": 0.011664423508299685, "grad_norm": 2.4029805525684527, "learning_rate": 3.8805970149253735e-06, "loss": 0.1893, "step": 78 }, { "epoch": 0.011813967399431732, "grad_norm": 5.727795685387504, "learning_rate": 3.930348258706468e-06, "loss": 0.5702, "step": 79 }, { "epoch": 0.011963511290563781, "grad_norm": 4.021893784746645, "learning_rate": 3.980099502487563e-06, "loss": 0.4027, "step": 80 }, { "epoch": 0.012113055181695828, "grad_norm": 2.7773808558650535, "learning_rate": 4.029850746268657e-06, "loss": 0.2963, "step": 81 }, { "epoch": 0.012262599072827875, "grad_norm": 3.4349426033049992, "learning_rate": 4.079601990049751e-06, "loss": 0.2211, "step": 82 }, { "epoch": 0.012412142963959922, "grad_norm": 4.127258766074891, "learning_rate": 4.129353233830846e-06, "loss": 0.2516, "step": 83 }, { "epoch": 0.01256168685509197, "grad_norm": 3.551977981988865, "learning_rate": 4.17910447761194e-06, "loss": 0.2206, "step": 84 }, { "epoch": 0.012711230746224017, "grad_norm": 2.988554589230421, "learning_rate": 4.228855721393035e-06, "loss": 0.366, "step": 85 }, { "epoch": 0.012860774637356064, "grad_norm": 3.256233912334862, "learning_rate": 4.278606965174129e-06, "loss": 0.341, "step": 86 }, { "epoch": 0.01301031852848811, "grad_norm": 3.917242635149468, "learning_rate": 4.3283582089552236e-06, "loss": 0.281, "step": 87 }, { "epoch": 0.013159862419620158, "grad_norm": 3.8372869351661247, "learning_rate": 4.378109452736319e-06, "loss": 0.1933, "step": 88 }, { "epoch": 0.013309406310752206, "grad_norm": 4.03192980896834, "learning_rate": 4.427860696517413e-06, "loss": 0.184, "step": 89 }, { "epoch": 0.013458950201884253, "grad_norm": 4.944440623197377, "learning_rate": 4.477611940298508e-06, "loss": 0.2406, "step": 90 }, { "epoch": 0.0136084940930163, "grad_norm": 3.2771345760625916, "learning_rate": 4.527363184079602e-06, "loss": 0.3635, "step": 91 }, { "epoch": 0.013758037984148347, "grad_norm": 2.5552685161479913, "learning_rate": 4.577114427860697e-06, "loss": 0.3581, "step": 92 }, { "epoch": 0.013907581875280395, "grad_norm": 3.825258197515859, "learning_rate": 4.626865671641791e-06, "loss": 0.2157, "step": 93 }, { "epoch": 0.014057125766412442, "grad_norm": 3.820006828326968, "learning_rate": 4.676616915422886e-06, "loss": 0.401, "step": 94 }, { "epoch": 0.01420666965754449, "grad_norm": 3.4269639891084056, "learning_rate": 4.72636815920398e-06, "loss": 0.21, "step": 95 }, { "epoch": 0.014356213548676536, "grad_norm": 3.614177044324435, "learning_rate": 4.7761194029850745e-06, "loss": 0.2305, "step": 96 }, { "epoch": 0.014505757439808583, "grad_norm": 2.8474787904051633, "learning_rate": 4.8258706467661695e-06, "loss": 0.2002, "step": 97 }, { "epoch": 0.014655301330940632, "grad_norm": 3.1529185682156333, "learning_rate": 4.875621890547264e-06, "loss": 0.3126, "step": 98 }, { "epoch": 0.014804845222072678, "grad_norm": 2.805579699726101, "learning_rate": 4.925373134328359e-06, "loss": 0.3977, "step": 99 }, { "epoch": 0.014954389113204725, "grad_norm": 2.5072872378288134, "learning_rate": 4.975124378109453e-06, "loss": 0.1986, "step": 100 }, { "epoch": 0.015103933004336772, "grad_norm": 2.8773082972301816, "learning_rate": 5.024875621890548e-06, "loss": 0.2421, "step": 101 }, { "epoch": 0.01525347689546882, "grad_norm": 2.3650776175631765, "learning_rate": 5.074626865671642e-06, "loss": 0.1864, "step": 102 }, { "epoch": 0.015403020786600868, "grad_norm": 4.721891286027898, "learning_rate": 5.124378109452737e-06, "loss": 0.2939, "step": 103 }, { "epoch": 0.015552564677732915, "grad_norm": 2.6753396233648705, "learning_rate": 5.174129353233831e-06, "loss": 0.2558, "step": 104 }, { "epoch": 0.01570210856886496, "grad_norm": 3.149876968312327, "learning_rate": 5.2238805970149255e-06, "loss": 0.3405, "step": 105 }, { "epoch": 0.01585165245999701, "grad_norm": 1.6322197066205648, "learning_rate": 5.2736318407960205e-06, "loss": 0.1453, "step": 106 }, { "epoch": 0.016001196351129055, "grad_norm": 3.3492234789043236, "learning_rate": 5.323383084577115e-06, "loss": 0.404, "step": 107 }, { "epoch": 0.016150740242261104, "grad_norm": 2.2518951047915157, "learning_rate": 5.37313432835821e-06, "loss": 0.2278, "step": 108 }, { "epoch": 0.016300284133393152, "grad_norm": 3.0471913491370404, "learning_rate": 5.422885572139304e-06, "loss": 0.265, "step": 109 }, { "epoch": 0.016449828024525198, "grad_norm": 1.6928519222295142, "learning_rate": 5.472636815920398e-06, "loss": 0.2169, "step": 110 }, { "epoch": 0.016599371915657246, "grad_norm": 3.265018826674296, "learning_rate": 5.522388059701493e-06, "loss": 0.429, "step": 111 }, { "epoch": 0.01674891580678929, "grad_norm": 2.637671664378066, "learning_rate": 5.572139303482588e-06, "loss": 0.2762, "step": 112 }, { "epoch": 0.01689845969792134, "grad_norm": 3.1617986987096134, "learning_rate": 5.621890547263682e-06, "loss": 0.4272, "step": 113 }, { "epoch": 0.01704800358905339, "grad_norm": 3.0132316717807175, "learning_rate": 5.671641791044776e-06, "loss": 0.3644, "step": 114 }, { "epoch": 0.017197547480185434, "grad_norm": 2.2850314864309813, "learning_rate": 5.721393034825871e-06, "loss": 0.1967, "step": 115 }, { "epoch": 0.017347091371317482, "grad_norm": 3.0835871860462314, "learning_rate": 5.771144278606966e-06, "loss": 0.2322, "step": 116 }, { "epoch": 0.017496635262449527, "grad_norm": 3.5275796788122893, "learning_rate": 5.820895522388061e-06, "loss": 0.3543, "step": 117 }, { "epoch": 0.017646179153581576, "grad_norm": 3.1301356173345494, "learning_rate": 5.870646766169155e-06, "loss": 0.5064, "step": 118 }, { "epoch": 0.017795723044713625, "grad_norm": 3.9689250366780313, "learning_rate": 5.920398009950249e-06, "loss": 0.8428, "step": 119 }, { "epoch": 0.01794526693584567, "grad_norm": 2.6992548320472984, "learning_rate": 5.970149253731343e-06, "loss": 0.2727, "step": 120 }, { "epoch": 0.01809481082697772, "grad_norm": 2.8823271138601414, "learning_rate": 6.019900497512439e-06, "loss": 0.3301, "step": 121 }, { "epoch": 0.018244354718109767, "grad_norm": 2.652199321292131, "learning_rate": 6.069651741293533e-06, "loss": 0.234, "step": 122 }, { "epoch": 0.018393898609241812, "grad_norm": 4.008459949806747, "learning_rate": 6.119402985074627e-06, "loss": 0.5713, "step": 123 }, { "epoch": 0.01854344250037386, "grad_norm": 2.8867543983581236, "learning_rate": 6.1691542288557215e-06, "loss": 0.2146, "step": 124 }, { "epoch": 0.018692986391505906, "grad_norm": 2.379666412119815, "learning_rate": 6.218905472636816e-06, "loss": 0.3812, "step": 125 }, { "epoch": 0.018842530282637954, "grad_norm": 2.8364015730213716, "learning_rate": 6.2686567164179116e-06, "loss": 0.3729, "step": 126 }, { "epoch": 0.018992074173770003, "grad_norm": 2.9731590306978957, "learning_rate": 6.318407960199006e-06, "loss": 0.3922, "step": 127 }, { "epoch": 0.019141618064902048, "grad_norm": 2.431931443805707, "learning_rate": 6.3681592039801e-06, "loss": 0.2316, "step": 128 }, { "epoch": 0.019291161956034097, "grad_norm": 2.5964092588685594, "learning_rate": 6.417910447761194e-06, "loss": 0.2129, "step": 129 }, { "epoch": 0.019440705847166142, "grad_norm": 4.241711858566103, "learning_rate": 6.46766169154229e-06, "loss": 0.2677, "step": 130 }, { "epoch": 0.01959024973829819, "grad_norm": 3.743763522090278, "learning_rate": 6.517412935323384e-06, "loss": 0.7324, "step": 131 }, { "epoch": 0.01973979362943024, "grad_norm": 2.325325226468886, "learning_rate": 6.567164179104478e-06, "loss": 0.2282, "step": 132 }, { "epoch": 0.019889337520562284, "grad_norm": 2.187485810642544, "learning_rate": 6.6169154228855725e-06, "loss": 0.3479, "step": 133 }, { "epoch": 0.020038881411694333, "grad_norm": 2.555235252803596, "learning_rate": 6.666666666666667e-06, "loss": 0.3084, "step": 134 }, { "epoch": 0.020188425302826378, "grad_norm": 2.1409254211343405, "learning_rate": 6.7164179104477625e-06, "loss": 0.2413, "step": 135 }, { "epoch": 0.020337969193958427, "grad_norm": 2.9475030013466292, "learning_rate": 6.766169154228857e-06, "loss": 0.5899, "step": 136 }, { "epoch": 0.020487513085090475, "grad_norm": 3.161190387153201, "learning_rate": 6.815920398009951e-06, "loss": 0.2722, "step": 137 }, { "epoch": 0.02063705697622252, "grad_norm": 3.4231688087143786, "learning_rate": 6.865671641791045e-06, "loss": 0.25, "step": 138 }, { "epoch": 0.02078660086735457, "grad_norm": 2.891852432700459, "learning_rate": 6.915422885572139e-06, "loss": 0.5206, "step": 139 }, { "epoch": 0.020936144758486618, "grad_norm": 2.4149596821734645, "learning_rate": 6.965174129353235e-06, "loss": 0.2792, "step": 140 }, { "epoch": 0.021085688649618663, "grad_norm": 2.737327253049286, "learning_rate": 7.014925373134329e-06, "loss": 0.1785, "step": 141 }, { "epoch": 0.02123523254075071, "grad_norm": 2.271710572333297, "learning_rate": 7.064676616915423e-06, "loss": 0.2216, "step": 142 }, { "epoch": 0.021384776431882756, "grad_norm": 3.123818135886555, "learning_rate": 7.114427860696518e-06, "loss": 0.5292, "step": 143 }, { "epoch": 0.021534320323014805, "grad_norm": 3.4353230085188775, "learning_rate": 7.164179104477612e-06, "loss": 0.257, "step": 144 }, { "epoch": 0.021683864214146854, "grad_norm": 3.292198842322858, "learning_rate": 7.213930348258708e-06, "loss": 0.4413, "step": 145 }, { "epoch": 0.0218334081052789, "grad_norm": 2.408669543365234, "learning_rate": 7.263681592039802e-06, "loss": 0.4034, "step": 146 }, { "epoch": 0.021982951996410947, "grad_norm": 2.918318139010717, "learning_rate": 7.313432835820896e-06, "loss": 0.1789, "step": 147 }, { "epoch": 0.022132495887542993, "grad_norm": 2.016064943310167, "learning_rate": 7.36318407960199e-06, "loss": 0.2454, "step": 148 }, { "epoch": 0.02228203977867504, "grad_norm": 3.375282717272202, "learning_rate": 7.412935323383084e-06, "loss": 0.5047, "step": 149 }, { "epoch": 0.02243158366980709, "grad_norm": 2.747548142801912, "learning_rate": 7.46268656716418e-06, "loss": 0.3193, "step": 150 }, { "epoch": 0.022581127560939135, "grad_norm": 5.014531999850111, "learning_rate": 7.512437810945274e-06, "loss": 0.5367, "step": 151 }, { "epoch": 0.022730671452071183, "grad_norm": 1.7396197448467992, "learning_rate": 7.5621890547263685e-06, "loss": 0.1602, "step": 152 }, { "epoch": 0.02288021534320323, "grad_norm": 3.9271159318267452, "learning_rate": 7.611940298507463e-06, "loss": 0.2763, "step": 153 }, { "epoch": 0.023029759234335277, "grad_norm": 2.093726492507833, "learning_rate": 7.661691542288557e-06, "loss": 0.169, "step": 154 }, { "epoch": 0.023179303125467326, "grad_norm": 1.5357011381308088, "learning_rate": 7.711442786069654e-06, "loss": 0.1619, "step": 155 }, { "epoch": 0.02332884701659937, "grad_norm": 2.3824458230974863, "learning_rate": 7.761194029850747e-06, "loss": 0.2094, "step": 156 }, { "epoch": 0.02347839090773142, "grad_norm": 2.8236663879690784, "learning_rate": 7.810945273631842e-06, "loss": 0.3426, "step": 157 }, { "epoch": 0.023627934798863465, "grad_norm": 3.1375695638809815, "learning_rate": 7.860696517412935e-06, "loss": 0.5518, "step": 158 }, { "epoch": 0.023777478689995513, "grad_norm": 3.2182906468856105, "learning_rate": 7.91044776119403e-06, "loss": 0.1995, "step": 159 }, { "epoch": 0.023927022581127562, "grad_norm": 14.749841980168513, "learning_rate": 7.960199004975125e-06, "loss": 0.5578, "step": 160 }, { "epoch": 0.024076566472259607, "grad_norm": 3.0100123201004045, "learning_rate": 8.00995024875622e-06, "loss": 0.5091, "step": 161 }, { "epoch": 0.024226110363391656, "grad_norm": 3.5091520525666433, "learning_rate": 8.059701492537314e-06, "loss": 0.5357, "step": 162 }, { "epoch": 0.024375654254523704, "grad_norm": 2.934851375582722, "learning_rate": 8.109452736318409e-06, "loss": 0.2267, "step": 163 }, { "epoch": 0.02452519814565575, "grad_norm": 2.5911339240383544, "learning_rate": 8.159203980099502e-06, "loss": 0.1782, "step": 164 }, { "epoch": 0.024674742036787798, "grad_norm": 2.847206263316536, "learning_rate": 8.208955223880599e-06, "loss": 0.2252, "step": 165 }, { "epoch": 0.024824285927919843, "grad_norm": 3.5380431553535976, "learning_rate": 8.258706467661692e-06, "loss": 0.4295, "step": 166 }, { "epoch": 0.024973829819051892, "grad_norm": 3.150492354924513, "learning_rate": 8.308457711442787e-06, "loss": 0.3276, "step": 167 }, { "epoch": 0.02512337371018394, "grad_norm": 3.114695975436696, "learning_rate": 8.35820895522388e-06, "loss": 0.5181, "step": 168 }, { "epoch": 0.025272917601315985, "grad_norm": 2.6180846619509355, "learning_rate": 8.407960199004975e-06, "loss": 0.2577, "step": 169 }, { "epoch": 0.025422461492448034, "grad_norm": 1.859950631659999, "learning_rate": 8.45771144278607e-06, "loss": 0.1838, "step": 170 }, { "epoch": 0.02557200538358008, "grad_norm": 4.092195798232618, "learning_rate": 8.507462686567165e-06, "loss": 0.2676, "step": 171 }, { "epoch": 0.025721549274712128, "grad_norm": 2.0820308098425766, "learning_rate": 8.557213930348259e-06, "loss": 0.2528, "step": 172 }, { "epoch": 0.025871093165844176, "grad_norm": 2.8153771201369087, "learning_rate": 8.606965174129354e-06, "loss": 0.3374, "step": 173 }, { "epoch": 0.02602063705697622, "grad_norm": 2.6417342231989114, "learning_rate": 8.656716417910447e-06, "loss": 0.4309, "step": 174 }, { "epoch": 0.02617018094810827, "grad_norm": 3.3553357791865825, "learning_rate": 8.706467661691544e-06, "loss": 0.279, "step": 175 }, { "epoch": 0.026319724839240315, "grad_norm": 2.5896987414147707, "learning_rate": 8.756218905472637e-06, "loss": 0.2505, "step": 176 }, { "epoch": 0.026469268730372364, "grad_norm": 15.917959164107543, "learning_rate": 8.805970149253732e-06, "loss": 0.3903, "step": 177 }, { "epoch": 0.026618812621504413, "grad_norm": 1.897502276352634, "learning_rate": 8.855721393034826e-06, "loss": 0.3051, "step": 178 }, { "epoch": 0.026768356512636458, "grad_norm": 3.498345426750877, "learning_rate": 8.905472636815922e-06, "loss": 0.8122, "step": 179 }, { "epoch": 0.026917900403768506, "grad_norm": 3.2270107650642297, "learning_rate": 8.955223880597016e-06, "loss": 0.2312, "step": 180 }, { "epoch": 0.027067444294900555, "grad_norm": 2.373617987334166, "learning_rate": 9.00497512437811e-06, "loss": 0.3553, "step": 181 }, { "epoch": 0.0272169881860326, "grad_norm": 2.022495433415561, "learning_rate": 9.054726368159204e-06, "loss": 0.3372, "step": 182 }, { "epoch": 0.02736653207716465, "grad_norm": 2.471303542690233, "learning_rate": 9.104477611940299e-06, "loss": 0.2764, "step": 183 }, { "epoch": 0.027516075968296694, "grad_norm": 2.170550660433261, "learning_rate": 9.154228855721394e-06, "loss": 0.2429, "step": 184 }, { "epoch": 0.027665619859428742, "grad_norm": 1.7750572924031363, "learning_rate": 9.203980099502489e-06, "loss": 0.1749, "step": 185 }, { "epoch": 0.02781516375056079, "grad_norm": 1.9803173977955488, "learning_rate": 9.253731343283582e-06, "loss": 0.3061, "step": 186 }, { "epoch": 0.027964707641692836, "grad_norm": 2.686793479118654, "learning_rate": 9.303482587064677e-06, "loss": 0.2704, "step": 187 }, { "epoch": 0.028114251532824885, "grad_norm": 3.0095995560762088, "learning_rate": 9.353233830845772e-06, "loss": 0.3935, "step": 188 }, { "epoch": 0.02826379542395693, "grad_norm": 3.296780241377357, "learning_rate": 9.402985074626867e-06, "loss": 0.4349, "step": 189 }, { "epoch": 0.02841333931508898, "grad_norm": 2.0473844316492262, "learning_rate": 9.45273631840796e-06, "loss": 0.3594, "step": 190 }, { "epoch": 0.028562883206221027, "grad_norm": 2.6746439974295986, "learning_rate": 9.502487562189056e-06, "loss": 0.2507, "step": 191 }, { "epoch": 0.028712427097353072, "grad_norm": 2.171372767224107, "learning_rate": 9.552238805970149e-06, "loss": 0.4442, "step": 192 }, { "epoch": 0.02886197098848512, "grad_norm": 3.412610878033882, "learning_rate": 9.601990049751244e-06, "loss": 0.5065, "step": 193 }, { "epoch": 0.029011514879617166, "grad_norm": 2.5249672849820843, "learning_rate": 9.651741293532339e-06, "loss": 0.2775, "step": 194 }, { "epoch": 0.029161058770749215, "grad_norm": 1.9244063665371054, "learning_rate": 9.701492537313434e-06, "loss": 0.2501, "step": 195 }, { "epoch": 0.029310602661881263, "grad_norm": 2.2928756876943788, "learning_rate": 9.751243781094527e-06, "loss": 0.391, "step": 196 }, { "epoch": 0.02946014655301331, "grad_norm": 3.2090175671059464, "learning_rate": 9.800995024875622e-06, "loss": 0.355, "step": 197 }, { "epoch": 0.029609690444145357, "grad_norm": 2.564275054094989, "learning_rate": 9.850746268656717e-06, "loss": 0.3824, "step": 198 }, { "epoch": 0.029759234335277406, "grad_norm": 2.2612313847384473, "learning_rate": 9.900497512437812e-06, "loss": 0.255, "step": 199 }, { "epoch": 0.02990877822640945, "grad_norm": 2.867410801811384, "learning_rate": 9.950248756218906e-06, "loss": 0.2321, "step": 200 }, { "epoch": 0.0300583221175415, "grad_norm": 2.7017080308625316, "learning_rate": 1e-05, "loss": 0.5355, "step": 201 }, { "epoch": 0.030207866008673544, "grad_norm": 1.7563631058650533, "learning_rate": 9.999999413475907e-06, "loss": 0.2366, "step": 202 }, { "epoch": 0.030357409899805593, "grad_norm": 2.7923486514729134, "learning_rate": 9.999997653903764e-06, "loss": 0.5735, "step": 203 }, { "epoch": 0.03050695379093764, "grad_norm": 2.5477270678585935, "learning_rate": 9.999994721283985e-06, "loss": 0.2316, "step": 204 }, { "epoch": 0.030656497682069687, "grad_norm": 1.6435827637040603, "learning_rate": 9.99999061561726e-06, "loss": 0.1958, "step": 205 }, { "epoch": 0.030806041573201735, "grad_norm": 4.225438559077688, "learning_rate": 9.999985336904546e-06, "loss": 0.6052, "step": 206 }, { "epoch": 0.03095558546433378, "grad_norm": 2.384218907777814, "learning_rate": 9.999978885147086e-06, "loss": 0.382, "step": 207 }, { "epoch": 0.03110512935546583, "grad_norm": 3.082533240684358, "learning_rate": 9.999971260346394e-06, "loss": 0.4615, "step": 208 }, { "epoch": 0.03125467324659788, "grad_norm": 2.126341746782405, "learning_rate": 9.999962462504259e-06, "loss": 0.3489, "step": 209 }, { "epoch": 0.03140421713772992, "grad_norm": 2.3157719584793974, "learning_rate": 9.99995249162274e-06, "loss": 0.351, "step": 210 }, { "epoch": 0.03155376102886197, "grad_norm": 3.2569828989709046, "learning_rate": 9.999941347704183e-06, "loss": 0.5452, "step": 211 }, { "epoch": 0.03170330491999402, "grad_norm": 2.4010549422177747, "learning_rate": 9.999929030751199e-06, "loss": 0.5511, "step": 212 }, { "epoch": 0.031852848811126065, "grad_norm": 2.2021354319659956, "learning_rate": 9.999915540766679e-06, "loss": 0.409, "step": 213 }, { "epoch": 0.03200239270225811, "grad_norm": 2.7467598032746467, "learning_rate": 9.999900877753786e-06, "loss": 0.2769, "step": 214 }, { "epoch": 0.03215193659339016, "grad_norm": 2.250991470386846, "learning_rate": 9.99988504171596e-06, "loss": 0.4243, "step": 215 }, { "epoch": 0.03230148048452221, "grad_norm": 7.389570164962262, "learning_rate": 9.999868032656921e-06, "loss": 0.5661, "step": 216 }, { "epoch": 0.03245102437565425, "grad_norm": 2.3232325152419904, "learning_rate": 9.999849850580653e-06, "loss": 0.3622, "step": 217 }, { "epoch": 0.032600568266786305, "grad_norm": 2.8448629192721153, "learning_rate": 9.999830495491425e-06, "loss": 0.5013, "step": 218 }, { "epoch": 0.03275011215791835, "grad_norm": 1.9203985094095042, "learning_rate": 9.99980996739378e-06, "loss": 0.2597, "step": 219 }, { "epoch": 0.032899656049050395, "grad_norm": 2.1343351176097705, "learning_rate": 9.99978826629253e-06, "loss": 0.333, "step": 220 }, { "epoch": 0.03304919994018244, "grad_norm": 2.675496675158128, "learning_rate": 9.999765392192766e-06, "loss": 0.4679, "step": 221 }, { "epoch": 0.03319874383131449, "grad_norm": 2.954897252892918, "learning_rate": 9.99974134509986e-06, "loss": 0.5779, "step": 222 }, { "epoch": 0.03334828772244654, "grad_norm": 3.164155125145253, "learning_rate": 9.999716125019448e-06, "loss": 0.5192, "step": 223 }, { "epoch": 0.03349783161357858, "grad_norm": 2.9422429580445377, "learning_rate": 9.99968973195745e-06, "loss": 0.3514, "step": 224 }, { "epoch": 0.033647375504710635, "grad_norm": 2.016818218277119, "learning_rate": 9.999662165920056e-06, "loss": 0.3657, "step": 225 }, { "epoch": 0.03379691939584268, "grad_norm": 2.805692301474297, "learning_rate": 9.999633426913733e-06, "loss": 0.1912, "step": 226 }, { "epoch": 0.033946463286974725, "grad_norm": 2.205403428118743, "learning_rate": 9.999603514945227e-06, "loss": 0.234, "step": 227 }, { "epoch": 0.03409600717810678, "grad_norm": 2.013271573198516, "learning_rate": 9.999572430021553e-06, "loss": 0.464, "step": 228 }, { "epoch": 0.03424555106923882, "grad_norm": 3.033803346792209, "learning_rate": 9.999540172150005e-06, "loss": 0.2599, "step": 229 }, { "epoch": 0.03439509496037087, "grad_norm": 2.854186400231596, "learning_rate": 9.99950674133815e-06, "loss": 0.6431, "step": 230 }, { "epoch": 0.03454463885150292, "grad_norm": 2.162434347622467, "learning_rate": 9.999472137593829e-06, "loss": 0.4779, "step": 231 }, { "epoch": 0.034694182742634964, "grad_norm": 1.4691335020169023, "learning_rate": 9.999436360925165e-06, "loss": 0.1827, "step": 232 }, { "epoch": 0.03484372663376701, "grad_norm": 1.6955188606947214, "learning_rate": 9.99939941134055e-06, "loss": 0.2336, "step": 233 }, { "epoch": 0.034993270524899055, "grad_norm": 2.0710606069082167, "learning_rate": 9.99936128884865e-06, "loss": 0.3671, "step": 234 }, { "epoch": 0.03514281441603111, "grad_norm": 2.128464465717484, "learning_rate": 9.999321993458411e-06, "loss": 0.2928, "step": 235 }, { "epoch": 0.03529235830716315, "grad_norm": 1.9685227247781487, "learning_rate": 9.999281525179054e-06, "loss": 0.185, "step": 236 }, { "epoch": 0.0354419021982952, "grad_norm": 2.3203573768463115, "learning_rate": 9.99923988402007e-06, "loss": 0.3733, "step": 237 }, { "epoch": 0.03559144608942725, "grad_norm": 2.2161639851963457, "learning_rate": 9.99919706999123e-06, "loss": 0.4, "step": 238 }, { "epoch": 0.035740989980559294, "grad_norm": 1.551687214387557, "learning_rate": 9.99915308310258e-06, "loss": 0.1723, "step": 239 }, { "epoch": 0.03589053387169134, "grad_norm": 1.9544776771870587, "learning_rate": 9.999107923364436e-06, "loss": 0.2587, "step": 240 }, { "epoch": 0.03604007776282339, "grad_norm": 2.1986380601508375, "learning_rate": 9.999061590787394e-06, "loss": 0.544, "step": 241 }, { "epoch": 0.03618962165395544, "grad_norm": 2.5816888510040457, "learning_rate": 9.999014085382326e-06, "loss": 0.4619, "step": 242 }, { "epoch": 0.03633916554508748, "grad_norm": 1.8291845348661409, "learning_rate": 9.998965407160377e-06, "loss": 0.2052, "step": 243 }, { "epoch": 0.036488709436219534, "grad_norm": 3.167062575704647, "learning_rate": 9.998915556132966e-06, "loss": 0.6123, "step": 244 }, { "epoch": 0.03663825332735158, "grad_norm": 1.8628898225455814, "learning_rate": 9.99886453231179e-06, "loss": 0.3634, "step": 245 }, { "epoch": 0.036787797218483624, "grad_norm": 1.7903762911789451, "learning_rate": 9.998812335708818e-06, "loss": 0.2162, "step": 246 }, { "epoch": 0.03693734110961567, "grad_norm": 1.3282642487848175, "learning_rate": 9.998758966336296e-06, "loss": 0.1875, "step": 247 }, { "epoch": 0.03708688500074772, "grad_norm": 1.8364953512469955, "learning_rate": 9.998704424206747e-06, "loss": 0.208, "step": 248 }, { "epoch": 0.037236428891879766, "grad_norm": 1.3941303606582691, "learning_rate": 9.998648709332965e-06, "loss": 0.1737, "step": 249 }, { "epoch": 0.03738597278301181, "grad_norm": 1.7239196409011197, "learning_rate": 9.998591821728022e-06, "loss": 0.2339, "step": 250 }, { "epoch": 0.037535516674143864, "grad_norm": 2.623262386600702, "learning_rate": 9.998533761405265e-06, "loss": 0.3988, "step": 251 }, { "epoch": 0.03768506056527591, "grad_norm": 3.0417113736320354, "learning_rate": 9.998474528378315e-06, "loss": 0.3998, "step": 252 }, { "epoch": 0.037834604456407954, "grad_norm": 2.3389769972346532, "learning_rate": 9.998414122661066e-06, "loss": 0.2157, "step": 253 }, { "epoch": 0.037984148347540006, "grad_norm": 2.776666496961099, "learning_rate": 9.998352544267696e-06, "loss": 0.5598, "step": 254 }, { "epoch": 0.03813369223867205, "grad_norm": 2.1472401976055746, "learning_rate": 9.998289793212645e-06, "loss": 0.2375, "step": 255 }, { "epoch": 0.038283236129804096, "grad_norm": 2.258529852719024, "learning_rate": 9.99822586951064e-06, "loss": 0.257, "step": 256 }, { "epoch": 0.03843278002093614, "grad_norm": 2.234662282588329, "learning_rate": 9.998160773176676e-06, "loss": 0.2513, "step": 257 }, { "epoch": 0.038582323912068194, "grad_norm": 1.557075634748184, "learning_rate": 9.998094504226025e-06, "loss": 0.2154, "step": 258 }, { "epoch": 0.03873186780320024, "grad_norm": 1.2782097805836874, "learning_rate": 9.998027062674236e-06, "loss": 0.1997, "step": 259 }, { "epoch": 0.038881411694332284, "grad_norm": 1.5754692941437902, "learning_rate": 9.997958448537129e-06, "loss": 0.2271, "step": 260 }, { "epoch": 0.039030955585464336, "grad_norm": 2.3273358127526516, "learning_rate": 9.997888661830803e-06, "loss": 0.4129, "step": 261 }, { "epoch": 0.03918049947659638, "grad_norm": 2.5932478274973705, "learning_rate": 9.997817702571631e-06, "loss": 0.2762, "step": 262 }, { "epoch": 0.039330043367728426, "grad_norm": 1.7415819067090217, "learning_rate": 9.99774557077626e-06, "loss": 0.2677, "step": 263 }, { "epoch": 0.03947958725886048, "grad_norm": 2.1983315861883974, "learning_rate": 9.997672266461613e-06, "loss": 0.3412, "step": 264 }, { "epoch": 0.03962913114999252, "grad_norm": 2.8445138272257666, "learning_rate": 9.997597789644889e-06, "loss": 0.3471, "step": 265 }, { "epoch": 0.03977867504112457, "grad_norm": 2.6658347323464575, "learning_rate": 9.997522140343558e-06, "loss": 0.3785, "step": 266 }, { "epoch": 0.03992821893225662, "grad_norm": 1.2913669477506569, "learning_rate": 9.997445318575371e-06, "loss": 0.2089, "step": 267 }, { "epoch": 0.040077762823388666, "grad_norm": 2.440102551085522, "learning_rate": 9.99736732435835e-06, "loss": 0.5639, "step": 268 }, { "epoch": 0.04022730671452071, "grad_norm": 2.252623935384866, "learning_rate": 9.997288157710795e-06, "loss": 0.447, "step": 269 }, { "epoch": 0.040376850605652756, "grad_norm": 1.9038309319538977, "learning_rate": 9.997207818651273e-06, "loss": 0.2784, "step": 270 }, { "epoch": 0.04052639449678481, "grad_norm": 2.05316637395224, "learning_rate": 9.99712630719864e-06, "loss": 0.3874, "step": 271 }, { "epoch": 0.04067593838791685, "grad_norm": 4.663034399257074, "learning_rate": 9.997043623372016e-06, "loss": 0.3558, "step": 272 }, { "epoch": 0.0408254822790489, "grad_norm": 2.0324793909935375, "learning_rate": 9.996959767190799e-06, "loss": 0.3884, "step": 273 }, { "epoch": 0.04097502617018095, "grad_norm": 2.1897027573531003, "learning_rate": 9.996874738674663e-06, "loss": 0.2372, "step": 274 }, { "epoch": 0.041124570061312996, "grad_norm": 1.9410471939157525, "learning_rate": 9.996788537843558e-06, "loss": 0.3478, "step": 275 }, { "epoch": 0.04127411395244504, "grad_norm": 3.650983914269082, "learning_rate": 9.996701164717704e-06, "loss": 0.4213, "step": 276 }, { "epoch": 0.04142365784357709, "grad_norm": 3.067988013237884, "learning_rate": 9.996612619317602e-06, "loss": 0.7209, "step": 277 }, { "epoch": 0.04157320173470914, "grad_norm": 2.5863303551652033, "learning_rate": 9.996522901664028e-06, "loss": 0.5418, "step": 278 }, { "epoch": 0.04172274562584118, "grad_norm": 2.1885641779249476, "learning_rate": 9.996432011778026e-06, "loss": 0.371, "step": 279 }, { "epoch": 0.041872289516973235, "grad_norm": 2.398824728854803, "learning_rate": 9.99633994968092e-06, "loss": 0.5508, "step": 280 }, { "epoch": 0.04202183340810528, "grad_norm": 1.5732032420608302, "learning_rate": 9.996246715394314e-06, "loss": 0.2468, "step": 281 }, { "epoch": 0.042171377299237325, "grad_norm": 2.8532279807617944, "learning_rate": 9.996152308940075e-06, "loss": 0.5503, "step": 282 }, { "epoch": 0.04232092119036937, "grad_norm": 2.4502727303222733, "learning_rate": 9.996056730340356e-06, "loss": 0.4046, "step": 283 }, { "epoch": 0.04247046508150142, "grad_norm": 1.9272098426705169, "learning_rate": 9.995959979617578e-06, "loss": 0.3906, "step": 284 }, { "epoch": 0.04262000897263347, "grad_norm": 2.290690335549339, "learning_rate": 9.995862056794441e-06, "loss": 0.2464, "step": 285 }, { "epoch": 0.04276955286376551, "grad_norm": 1.656564250859485, "learning_rate": 9.99576296189392e-06, "loss": 0.1996, "step": 286 }, { "epoch": 0.042919096754897565, "grad_norm": 2.1259148220336965, "learning_rate": 9.995662694939262e-06, "loss": 0.3994, "step": 287 }, { "epoch": 0.04306864064602961, "grad_norm": 2.286901143642134, "learning_rate": 9.99556125595399e-06, "loss": 0.4047, "step": 288 }, { "epoch": 0.043218184537161655, "grad_norm": 1.3559455912309712, "learning_rate": 9.995458644961902e-06, "loss": 0.2228, "step": 289 }, { "epoch": 0.04336772842829371, "grad_norm": 2.285750924681825, "learning_rate": 9.995354861987075e-06, "loss": 0.2367, "step": 290 }, { "epoch": 0.04351727231942575, "grad_norm": 1.923824453592428, "learning_rate": 9.995249907053854e-06, "loss": 0.3951, "step": 291 }, { "epoch": 0.0436668162105578, "grad_norm": 1.968047953500074, "learning_rate": 9.995143780186865e-06, "loss": 0.2149, "step": 292 }, { "epoch": 0.04381636010168984, "grad_norm": 2.3975790519132074, "learning_rate": 9.995036481411005e-06, "loss": 0.5312, "step": 293 }, { "epoch": 0.043965903992821895, "grad_norm": 1.9664546058841197, "learning_rate": 9.994928010751447e-06, "loss": 0.4832, "step": 294 }, { "epoch": 0.04411544788395394, "grad_norm": 2.1609011533249785, "learning_rate": 9.994818368233639e-06, "loss": 0.571, "step": 295 }, { "epoch": 0.044264991775085985, "grad_norm": 1.2099666806993736, "learning_rate": 9.994707553883305e-06, "loss": 0.1801, "step": 296 }, { "epoch": 0.04441453566621804, "grad_norm": 1.8811137964659612, "learning_rate": 9.994595567726444e-06, "loss": 0.2708, "step": 297 }, { "epoch": 0.04456407955735008, "grad_norm": 1.6387011737954997, "learning_rate": 9.994482409789329e-06, "loss": 0.245, "step": 298 }, { "epoch": 0.04471362344848213, "grad_norm": 2.4061797367092486, "learning_rate": 9.994368080098505e-06, "loss": 0.204, "step": 299 }, { "epoch": 0.04486316733961418, "grad_norm": 2.555264958903577, "learning_rate": 9.994252578680796e-06, "loss": 0.5251, "step": 300 }, { "epoch": 0.045012711230746225, "grad_norm": 3.1965886018503897, "learning_rate": 9.994135905563302e-06, "loss": 0.4353, "step": 301 }, { "epoch": 0.04516225512187827, "grad_norm": 2.390530599961774, "learning_rate": 9.994018060773396e-06, "loss": 0.4199, "step": 302 }, { "epoch": 0.04531179901301032, "grad_norm": 2.694731420269419, "learning_rate": 9.993899044338722e-06, "loss": 0.4029, "step": 303 }, { "epoch": 0.04546134290414237, "grad_norm": 2.5518583518075437, "learning_rate": 9.993778856287205e-06, "loss": 0.3712, "step": 304 }, { "epoch": 0.04561088679527441, "grad_norm": 1.958382495979976, "learning_rate": 9.99365749664704e-06, "loss": 0.3617, "step": 305 }, { "epoch": 0.04576043068640646, "grad_norm": 2.299652220902115, "learning_rate": 9.993534965446701e-06, "loss": 0.4059, "step": 306 }, { "epoch": 0.04590997457753851, "grad_norm": 4.086258301258261, "learning_rate": 9.993411262714934e-06, "loss": 0.2774, "step": 307 }, { "epoch": 0.046059518468670554, "grad_norm": 2.0081624141767156, "learning_rate": 9.993286388480763e-06, "loss": 0.2724, "step": 308 }, { "epoch": 0.0462090623598026, "grad_norm": 2.388037596587926, "learning_rate": 9.993160342773483e-06, "loss": 0.2706, "step": 309 }, { "epoch": 0.04635860625093465, "grad_norm": 1.5868739255084185, "learning_rate": 9.993033125622665e-06, "loss": 0.256, "step": 310 }, { "epoch": 0.0465081501420667, "grad_norm": 1.8286822342955051, "learning_rate": 9.992904737058157e-06, "loss": 0.209, "step": 311 }, { "epoch": 0.04665769403319874, "grad_norm": 2.2060332987484306, "learning_rate": 9.992775177110078e-06, "loss": 0.4253, "step": 312 }, { "epoch": 0.046807237924330794, "grad_norm": 1.39628419375001, "learning_rate": 9.992644445808826e-06, "loss": 0.1693, "step": 313 }, { "epoch": 0.04695678181546284, "grad_norm": 1.5668060198088787, "learning_rate": 9.99251254318507e-06, "loss": 0.24, "step": 314 }, { "epoch": 0.047106325706594884, "grad_norm": 1.998270389587923, "learning_rate": 9.992379469269758e-06, "loss": 0.2519, "step": 315 }, { "epoch": 0.04725586959772693, "grad_norm": 1.9609810436779118, "learning_rate": 9.99224522409411e-06, "loss": 0.2023, "step": 316 }, { "epoch": 0.04740541348885898, "grad_norm": 1.4580736241239847, "learning_rate": 9.992109807689619e-06, "loss": 0.2387, "step": 317 }, { "epoch": 0.04755495737999103, "grad_norm": 2.710681694340303, "learning_rate": 9.991973220088057e-06, "loss": 0.6738, "step": 318 }, { "epoch": 0.04770450127112307, "grad_norm": 1.2469776099691643, "learning_rate": 9.991835461321466e-06, "loss": 0.2013, "step": 319 }, { "epoch": 0.047854045162255124, "grad_norm": 2.128896128779159, "learning_rate": 9.99169653142217e-06, "loss": 0.3432, "step": 320 }, { "epoch": 0.04800358905338717, "grad_norm": 1.6053097848087672, "learning_rate": 9.991556430422759e-06, "loss": 0.2301, "step": 321 }, { "epoch": 0.048153132944519214, "grad_norm": 1.7774787600035602, "learning_rate": 9.991415158356106e-06, "loss": 0.2535, "step": 322 }, { "epoch": 0.048302676835651266, "grad_norm": 1.449815289318445, "learning_rate": 9.991272715255351e-06, "loss": 0.1878, "step": 323 }, { "epoch": 0.04845222072678331, "grad_norm": 1.5118547669168991, "learning_rate": 9.991129101153916e-06, "loss": 0.3186, "step": 324 }, { "epoch": 0.048601764617915356, "grad_norm": 1.461388444407636, "learning_rate": 9.99098431608549e-06, "loss": 0.1747, "step": 325 }, { "epoch": 0.04875130850904741, "grad_norm": 2.3912366570769974, "learning_rate": 9.990838360084045e-06, "loss": 0.5325, "step": 326 }, { "epoch": 0.048900852400179454, "grad_norm": 2.5611474084390937, "learning_rate": 9.990691233183823e-06, "loss": 0.2606, "step": 327 }, { "epoch": 0.0490503962913115, "grad_norm": 2.21899436894442, "learning_rate": 9.990542935419341e-06, "loss": 0.4253, "step": 328 }, { "epoch": 0.049199940182443544, "grad_norm": 1.6883179263006298, "learning_rate": 9.99039346682539e-06, "loss": 0.1768, "step": 329 }, { "epoch": 0.049349484073575596, "grad_norm": 3.2358870266119006, "learning_rate": 9.990242827437036e-06, "loss": 0.7866, "step": 330 }, { "epoch": 0.04949902796470764, "grad_norm": 2.0627143054944153, "learning_rate": 9.990091017289623e-06, "loss": 0.3286, "step": 331 }, { "epoch": 0.049648571855839686, "grad_norm": 2.1246533005850523, "learning_rate": 9.989938036418766e-06, "loss": 0.2716, "step": 332 }, { "epoch": 0.04979811574697174, "grad_norm": 2.6250279686209828, "learning_rate": 9.989783884860355e-06, "loss": 0.5058, "step": 333 }, { "epoch": 0.049947659638103784, "grad_norm": 2.3409062617647627, "learning_rate": 9.989628562650558e-06, "loss": 0.2589, "step": 334 }, { "epoch": 0.05009720352923583, "grad_norm": 1.835901073337933, "learning_rate": 9.989472069825811e-06, "loss": 0.3493, "step": 335 }, { "epoch": 0.05024674742036788, "grad_norm": 2.2454393810241298, "learning_rate": 9.989314406422835e-06, "loss": 0.4113, "step": 336 }, { "epoch": 0.050396291311499926, "grad_norm": 2.2906853778474674, "learning_rate": 9.989155572478611e-06, "loss": 0.5289, "step": 337 }, { "epoch": 0.05054583520263197, "grad_norm": 2.3899442476389665, "learning_rate": 9.98899556803041e-06, "loss": 0.2174, "step": 338 }, { "epoch": 0.05069537909376402, "grad_norm": 1.3681982854338133, "learning_rate": 9.988834393115768e-06, "loss": 0.2021, "step": 339 }, { "epoch": 0.05084492298489607, "grad_norm": 1.5118760155287632, "learning_rate": 9.988672047772497e-06, "loss": 0.1927, "step": 340 }, { "epoch": 0.05099446687602811, "grad_norm": 2.1144895431001105, "learning_rate": 9.988508532038685e-06, "loss": 0.3325, "step": 341 }, { "epoch": 0.05114401076716016, "grad_norm": 1.8616803287346595, "learning_rate": 9.988343845952697e-06, "loss": 0.3018, "step": 342 }, { "epoch": 0.05129355465829221, "grad_norm": 2.787967616575242, "learning_rate": 9.988177989553167e-06, "loss": 0.4641, "step": 343 }, { "epoch": 0.051443098549424256, "grad_norm": 2.2905797584406242, "learning_rate": 9.98801096287901e-06, "loss": 0.5336, "step": 344 }, { "epoch": 0.0515926424405563, "grad_norm": 1.769311364935245, "learning_rate": 9.987842765969408e-06, "loss": 0.2843, "step": 345 }, { "epoch": 0.05174218633168835, "grad_norm": 1.7122732613639495, "learning_rate": 9.987673398863824e-06, "loss": 0.2272, "step": 346 }, { "epoch": 0.0518917302228204, "grad_norm": 2.328359950454365, "learning_rate": 9.987502861601991e-06, "loss": 0.2645, "step": 347 }, { "epoch": 0.05204127411395244, "grad_norm": 2.208277642399548, "learning_rate": 9.987331154223922e-06, "loss": 0.5877, "step": 348 }, { "epoch": 0.052190818005084495, "grad_norm": 2.154817789687723, "learning_rate": 9.9871582767699e-06, "loss": 0.3414, "step": 349 }, { "epoch": 0.05234036189621654, "grad_norm": 2.0510314098551814, "learning_rate": 9.986984229280483e-06, "loss": 0.3981, "step": 350 }, { "epoch": 0.052489905787348586, "grad_norm": 2.346735661125246, "learning_rate": 9.986809011796503e-06, "loss": 0.6596, "step": 351 }, { "epoch": 0.05263944967848063, "grad_norm": 1.641693244293744, "learning_rate": 9.98663262435907e-06, "loss": 0.3657, "step": 352 }, { "epoch": 0.05278899356961268, "grad_norm": 2.240226359797858, "learning_rate": 9.986455067009566e-06, "loss": 0.3706, "step": 353 }, { "epoch": 0.05293853746074473, "grad_norm": 2.3791485993411357, "learning_rate": 9.986276339789648e-06, "loss": 0.5428, "step": 354 }, { "epoch": 0.05308808135187677, "grad_norm": 1.7806897327965683, "learning_rate": 9.986096442741241e-06, "loss": 0.2336, "step": 355 }, { "epoch": 0.053237625243008825, "grad_norm": 1.8563417208131827, "learning_rate": 9.98591537590656e-06, "loss": 0.2129, "step": 356 }, { "epoch": 0.05338716913414087, "grad_norm": 2.2115041121315895, "learning_rate": 9.98573313932808e-06, "loss": 0.5232, "step": 357 }, { "epoch": 0.053536713025272915, "grad_norm": 1.3693709893910027, "learning_rate": 9.985549733048556e-06, "loss": 0.3524, "step": 358 }, { "epoch": 0.05368625691640497, "grad_norm": 2.033727598383455, "learning_rate": 9.985365157111017e-06, "loss": 0.3987, "step": 359 }, { "epoch": 0.05383580080753701, "grad_norm": 2.3258255541409505, "learning_rate": 9.985179411558767e-06, "loss": 0.5489, "step": 360 }, { "epoch": 0.05398534469866906, "grad_norm": 2.0805855861837057, "learning_rate": 9.984992496435383e-06, "loss": 0.3982, "step": 361 }, { "epoch": 0.05413488858980111, "grad_norm": 1.4938394292792039, "learning_rate": 9.984804411784717e-06, "loss": 0.2279, "step": 362 }, { "epoch": 0.054284432480933155, "grad_norm": 1.935765339737269, "learning_rate": 9.984615157650896e-06, "loss": 0.2208, "step": 363 }, { "epoch": 0.0544339763720652, "grad_norm": 2.294825440673555, "learning_rate": 9.98442473407832e-06, "loss": 0.4006, "step": 364 }, { "epoch": 0.054583520263197245, "grad_norm": 1.7404498428206792, "learning_rate": 9.984233141111663e-06, "loss": 0.3859, "step": 365 }, { "epoch": 0.0547330641543293, "grad_norm": 2.382616866788976, "learning_rate": 9.984040378795879e-06, "loss": 0.5393, "step": 366 }, { "epoch": 0.05488260804546134, "grad_norm": 2.121310368782044, "learning_rate": 9.983846447176186e-06, "loss": 0.3808, "step": 367 }, { "epoch": 0.05503215193659339, "grad_norm": 1.4327836947551182, "learning_rate": 9.983651346298089e-06, "loss": 0.21, "step": 368 }, { "epoch": 0.05518169582772544, "grad_norm": 1.8551217286702022, "learning_rate": 9.983455076207353e-06, "loss": 0.3611, "step": 369 }, { "epoch": 0.055331239718857485, "grad_norm": 1.1962615317465979, "learning_rate": 9.983257636950032e-06, "loss": 0.1632, "step": 370 }, { "epoch": 0.05548078360998953, "grad_norm": 2.210937603202386, "learning_rate": 9.983059028572443e-06, "loss": 0.2054, "step": 371 }, { "epoch": 0.05563032750112158, "grad_norm": 1.3676870965949202, "learning_rate": 9.982859251121183e-06, "loss": 0.2257, "step": 372 }, { "epoch": 0.05577987139225363, "grad_norm": 1.877238753038072, "learning_rate": 9.98265830464312e-06, "loss": 0.3069, "step": 373 }, { "epoch": 0.05592941528338567, "grad_norm": 2.6215120058588743, "learning_rate": 9.9824561891854e-06, "loss": 0.3812, "step": 374 }, { "epoch": 0.056078959174517724, "grad_norm": 1.5353869053774183, "learning_rate": 9.982252904795437e-06, "loss": 0.3038, "step": 375 }, { "epoch": 0.05622850306564977, "grad_norm": 1.5387274188562523, "learning_rate": 9.98204845152093e-06, "loss": 0.1784, "step": 376 }, { "epoch": 0.056378046956781815, "grad_norm": 2.3221296907492444, "learning_rate": 9.981842829409842e-06, "loss": 0.4253, "step": 377 }, { "epoch": 0.05652759084791386, "grad_norm": 1.8464138105889263, "learning_rate": 9.981636038510414e-06, "loss": 0.2137, "step": 378 }, { "epoch": 0.05667713473904591, "grad_norm": 1.9213502252741161, "learning_rate": 9.98142807887116e-06, "loss": 0.2652, "step": 379 }, { "epoch": 0.05682667863017796, "grad_norm": 1.7697460473662174, "learning_rate": 9.981218950540874e-06, "loss": 0.2525, "step": 380 }, { "epoch": 0.05697622252131, "grad_norm": 2.001502054151958, "learning_rate": 9.981008653568613e-06, "loss": 0.3749, "step": 381 }, { "epoch": 0.057125766412442054, "grad_norm": 1.7507480997796745, "learning_rate": 9.98079718800372e-06, "loss": 0.3293, "step": 382 }, { "epoch": 0.0572753103035741, "grad_norm": 1.8995856376763527, "learning_rate": 9.980584553895805e-06, "loss": 0.2595, "step": 383 }, { "epoch": 0.057424854194706144, "grad_norm": 1.6960817341003291, "learning_rate": 9.980370751294754e-06, "loss": 0.3214, "step": 384 }, { "epoch": 0.057574398085838197, "grad_norm": 2.747620756274178, "learning_rate": 9.980155780250728e-06, "loss": 0.4678, "step": 385 }, { "epoch": 0.05772394197697024, "grad_norm": 1.429295181164985, "learning_rate": 9.979939640814158e-06, "loss": 0.3417, "step": 386 }, { "epoch": 0.05787348586810229, "grad_norm": 1.546941524577904, "learning_rate": 9.979722333035757e-06, "loss": 0.3017, "step": 387 }, { "epoch": 0.05802302975923433, "grad_norm": 2.3243262803022753, "learning_rate": 9.979503856966504e-06, "loss": 0.3906, "step": 388 }, { "epoch": 0.058172573650366384, "grad_norm": 1.5367077444523152, "learning_rate": 9.979284212657658e-06, "loss": 0.2735, "step": 389 }, { "epoch": 0.05832211754149843, "grad_norm": 1.0259751361449947, "learning_rate": 9.979063400160747e-06, "loss": 0.1788, "step": 390 }, { "epoch": 0.058471661432630474, "grad_norm": 1.7811616961442123, "learning_rate": 9.97884141952758e-06, "loss": 0.2071, "step": 391 }, { "epoch": 0.058621205323762526, "grad_norm": 2.347009922116326, "learning_rate": 9.978618270810229e-06, "loss": 0.4248, "step": 392 }, { "epoch": 0.05877074921489457, "grad_norm": 1.3076474084417338, "learning_rate": 9.978393954061052e-06, "loss": 0.1771, "step": 393 }, { "epoch": 0.05892029310602662, "grad_norm": 2.4165379692755455, "learning_rate": 9.978168469332677e-06, "loss": 0.4913, "step": 394 }, { "epoch": 0.05906983699715867, "grad_norm": 1.6584516839965744, "learning_rate": 9.977941816678e-06, "loss": 0.2292, "step": 395 }, { "epoch": 0.059219380888290714, "grad_norm": 1.3323879687206615, "learning_rate": 9.9777139961502e-06, "loss": 0.2042, "step": 396 }, { "epoch": 0.05936892477942276, "grad_norm": 1.242996863833067, "learning_rate": 9.977485007802725e-06, "loss": 0.1759, "step": 397 }, { "epoch": 0.05951846867055481, "grad_norm": 2.0289613301318057, "learning_rate": 9.977254851689297e-06, "loss": 0.3391, "step": 398 }, { "epoch": 0.059668012561686856, "grad_norm": 1.7111890076718022, "learning_rate": 9.977023527863913e-06, "loss": 0.318, "step": 399 }, { "epoch": 0.0598175564528189, "grad_norm": 2.360289838407607, "learning_rate": 9.976791036380844e-06, "loss": 0.7436, "step": 400 }, { "epoch": 0.059967100343950946, "grad_norm": 1.6556682149662436, "learning_rate": 9.976557377294634e-06, "loss": 0.3579, "step": 401 }, { "epoch": 0.060116644235083, "grad_norm": 1.9472299876725607, "learning_rate": 9.976322550660103e-06, "loss": 0.3939, "step": 402 }, { "epoch": 0.060266188126215044, "grad_norm": 1.2625006623785717, "learning_rate": 9.976086556532343e-06, "loss": 0.1777, "step": 403 }, { "epoch": 0.06041573201734709, "grad_norm": 2.142440158571368, "learning_rate": 9.975849394966721e-06, "loss": 0.4728, "step": 404 }, { "epoch": 0.06056527590847914, "grad_norm": 1.3109446375337697, "learning_rate": 9.975611066018876e-06, "loss": 0.2035, "step": 405 }, { "epoch": 0.060714819799611186, "grad_norm": 1.473069250695052, "learning_rate": 9.975371569744723e-06, "loss": 0.2502, "step": 406 }, { "epoch": 0.06086436369074323, "grad_norm": 1.4147256960977963, "learning_rate": 9.975130906200453e-06, "loss": 0.1861, "step": 407 }, { "epoch": 0.06101390758187528, "grad_norm": 1.5107559691714745, "learning_rate": 9.97488907544252e-06, "loss": 0.2309, "step": 408 }, { "epoch": 0.06116345147300733, "grad_norm": 1.5467720756101462, "learning_rate": 9.97464607752767e-06, "loss": 0.235, "step": 409 }, { "epoch": 0.061312995364139374, "grad_norm": 1.2901444374034334, "learning_rate": 9.974401912512905e-06, "loss": 0.1877, "step": 410 }, { "epoch": 0.061462539255271426, "grad_norm": 1.8751659558285558, "learning_rate": 9.974156580455512e-06, "loss": 0.2941, "step": 411 }, { "epoch": 0.06161208314640347, "grad_norm": 1.2187366523072891, "learning_rate": 9.973910081413048e-06, "loss": 0.2, "step": 412 }, { "epoch": 0.061761627037535516, "grad_norm": 2.56665763030278, "learning_rate": 9.973662415443342e-06, "loss": 0.4259, "step": 413 }, { "epoch": 0.06191117092866756, "grad_norm": 1.5201509236946156, "learning_rate": 9.973413582604502e-06, "loss": 0.2098, "step": 414 }, { "epoch": 0.06206071481979961, "grad_norm": 2.2299268067487183, "learning_rate": 9.973163582954903e-06, "loss": 0.5054, "step": 415 }, { "epoch": 0.06221025871093166, "grad_norm": 2.195400724979985, "learning_rate": 9.972912416553202e-06, "loss": 0.3856, "step": 416 }, { "epoch": 0.0623598026020637, "grad_norm": 2.3196273331545876, "learning_rate": 9.972660083458321e-06, "loss": 0.5608, "step": 417 }, { "epoch": 0.06250934649319576, "grad_norm": 1.6815269422927719, "learning_rate": 9.97240658372946e-06, "loss": 0.3682, "step": 418 }, { "epoch": 0.0626588903843278, "grad_norm": 1.7582779956751238, "learning_rate": 9.972151917426095e-06, "loss": 0.2256, "step": 419 }, { "epoch": 0.06280843427545985, "grad_norm": 1.9523974169697056, "learning_rate": 9.97189608460797e-06, "loss": 0.2303, "step": 420 }, { "epoch": 0.06295797816659189, "grad_norm": 2.120409254412015, "learning_rate": 9.97163908533511e-06, "loss": 0.2198, "step": 421 }, { "epoch": 0.06310752205772394, "grad_norm": 1.7213130956608376, "learning_rate": 9.971380919667806e-06, "loss": 0.3355, "step": 422 }, { "epoch": 0.063257065948856, "grad_norm": 1.6609701125154137, "learning_rate": 9.971121587666627e-06, "loss": 0.2354, "step": 423 }, { "epoch": 0.06340660983998804, "grad_norm": 1.2809919353271448, "learning_rate": 9.970861089392415e-06, "loss": 0.2043, "step": 424 }, { "epoch": 0.06355615373112009, "grad_norm": 1.137987748410028, "learning_rate": 9.970599424906285e-06, "loss": 0.1714, "step": 425 }, { "epoch": 0.06370569762225213, "grad_norm": 2.241505455994119, "learning_rate": 9.970336594269627e-06, "loss": 0.559, "step": 426 }, { "epoch": 0.06385524151338418, "grad_norm": 1.8145782296174282, "learning_rate": 9.970072597544102e-06, "loss": 0.4695, "step": 427 }, { "epoch": 0.06400478540451622, "grad_norm": 2.6609160560733924, "learning_rate": 9.96980743479165e-06, "loss": 0.3927, "step": 428 }, { "epoch": 0.06415432929564828, "grad_norm": 1.5902127205656447, "learning_rate": 9.969541106074477e-06, "loss": 0.3221, "step": 429 }, { "epoch": 0.06430387318678032, "grad_norm": 1.354440824254012, "learning_rate": 9.969273611455066e-06, "loss": 0.1982, "step": 430 }, { "epoch": 0.06445341707791237, "grad_norm": 2.1796464676908682, "learning_rate": 9.969004950996175e-06, "loss": 0.5947, "step": 431 }, { "epoch": 0.06460296096904442, "grad_norm": 1.6772295444343943, "learning_rate": 9.968735124760834e-06, "loss": 0.3567, "step": 432 }, { "epoch": 0.06475250486017646, "grad_norm": 2.326608368656497, "learning_rate": 9.968464132812348e-06, "loss": 0.3934, "step": 433 }, { "epoch": 0.0649020487513085, "grad_norm": 1.9737750855760885, "learning_rate": 9.968191975214293e-06, "loss": 0.3936, "step": 434 }, { "epoch": 0.06505159264244055, "grad_norm": 2.09687169461338, "learning_rate": 9.967918652030522e-06, "loss": 0.3644, "step": 435 }, { "epoch": 0.06520113653357261, "grad_norm": 2.1122151786614967, "learning_rate": 9.967644163325157e-06, "loss": 0.2169, "step": 436 }, { "epoch": 0.06535068042470465, "grad_norm": 1.8368706867911107, "learning_rate": 9.967368509162595e-06, "loss": 0.3956, "step": 437 }, { "epoch": 0.0655002243158367, "grad_norm": 1.7823169737575542, "learning_rate": 9.96709168960751e-06, "loss": 0.232, "step": 438 }, { "epoch": 0.06564976820696874, "grad_norm": 2.1565508943507194, "learning_rate": 9.966813704724844e-06, "loss": 0.2228, "step": 439 }, { "epoch": 0.06579931209810079, "grad_norm": 2.2075342060994414, "learning_rate": 9.966534554579816e-06, "loss": 0.204, "step": 440 }, { "epoch": 0.06594885598923284, "grad_norm": 2.0929887441012602, "learning_rate": 9.966254239237917e-06, "loss": 0.3946, "step": 441 }, { "epoch": 0.06609839988036488, "grad_norm": 2.0382287962872834, "learning_rate": 9.965972758764912e-06, "loss": 0.4633, "step": 442 }, { "epoch": 0.06624794377149694, "grad_norm": 1.2772439274586147, "learning_rate": 9.96569011322684e-06, "loss": 0.1784, "step": 443 }, { "epoch": 0.06639748766262898, "grad_norm": 1.1024457344648066, "learning_rate": 9.965406302690011e-06, "loss": 0.1625, "step": 444 }, { "epoch": 0.06654703155376103, "grad_norm": 1.2184559623271476, "learning_rate": 9.965121327221007e-06, "loss": 0.1959, "step": 445 }, { "epoch": 0.06669657544489307, "grad_norm": 1.9215235980087064, "learning_rate": 9.964835186886692e-06, "loss": 0.2493, "step": 446 }, { "epoch": 0.06684611933602512, "grad_norm": 2.1443052954533974, "learning_rate": 9.964547881754194e-06, "loss": 0.3611, "step": 447 }, { "epoch": 0.06699566322715717, "grad_norm": 2.6967138020110712, "learning_rate": 9.964259411890918e-06, "loss": 0.5427, "step": 448 }, { "epoch": 0.06714520711828922, "grad_norm": 1.688779610685555, "learning_rate": 9.96396977736454e-06, "loss": 0.2569, "step": 449 }, { "epoch": 0.06729475100942127, "grad_norm": 2.1241026975378694, "learning_rate": 9.963678978243014e-06, "loss": 0.3863, "step": 450 }, { "epoch": 0.06744429490055331, "grad_norm": 1.9388647656441462, "learning_rate": 9.96338701459456e-06, "loss": 0.2726, "step": 451 }, { "epoch": 0.06759383879168536, "grad_norm": 1.4657993620125664, "learning_rate": 9.963093886487683e-06, "loss": 0.2338, "step": 452 }, { "epoch": 0.0677433826828174, "grad_norm": 2.307173509923502, "learning_rate": 9.962799593991146e-06, "loss": 0.8039, "step": 453 }, { "epoch": 0.06789292657394945, "grad_norm": 1.2669540134016812, "learning_rate": 9.962504137173997e-06, "loss": 0.169, "step": 454 }, { "epoch": 0.0680424704650815, "grad_norm": 1.5981790001004936, "learning_rate": 9.962207516105552e-06, "loss": 0.2019, "step": 455 }, { "epoch": 0.06819201435621355, "grad_norm": 1.740837427237262, "learning_rate": 9.9619097308554e-06, "loss": 0.2116, "step": 456 }, { "epoch": 0.0683415582473456, "grad_norm": 1.9511590671787182, "learning_rate": 9.961610781493407e-06, "loss": 0.2611, "step": 457 }, { "epoch": 0.06849110213847764, "grad_norm": 1.9814713665794252, "learning_rate": 9.961310668089708e-06, "loss": 0.3714, "step": 458 }, { "epoch": 0.06864064602960969, "grad_norm": 2.755804773731971, "learning_rate": 9.96100939071471e-06, "loss": 0.5178, "step": 459 }, { "epoch": 0.06879018992074173, "grad_norm": 2.5378159735000225, "learning_rate": 9.960706949439101e-06, "loss": 0.7334, "step": 460 }, { "epoch": 0.06893973381187378, "grad_norm": 2.3557582569765003, "learning_rate": 9.960403344333832e-06, "loss": 0.5763, "step": 461 }, { "epoch": 0.06908927770300584, "grad_norm": 1.6501148783544786, "learning_rate": 9.960098575470131e-06, "loss": 0.3681, "step": 462 }, { "epoch": 0.06923882159413788, "grad_norm": 1.3521314881367383, "learning_rate": 9.959792642919505e-06, "loss": 0.216, "step": 463 }, { "epoch": 0.06938836548526993, "grad_norm": 1.9967115308447656, "learning_rate": 9.959485546753724e-06, "loss": 0.4411, "step": 464 }, { "epoch": 0.06953790937640197, "grad_norm": 1.6934835527025132, "learning_rate": 9.959177287044839e-06, "loss": 0.3013, "step": 465 }, { "epoch": 0.06968745326753402, "grad_norm": 2.1881268216288703, "learning_rate": 9.958867863865168e-06, "loss": 0.386, "step": 466 }, { "epoch": 0.06983699715866606, "grad_norm": 1.746249573857031, "learning_rate": 9.958557277287307e-06, "loss": 0.3486, "step": 467 }, { "epoch": 0.06998654104979811, "grad_norm": 1.3309239290400467, "learning_rate": 9.958245527384118e-06, "loss": 0.2512, "step": 468 }, { "epoch": 0.07013608494093017, "grad_norm": 1.780095751208227, "learning_rate": 9.957932614228746e-06, "loss": 0.3579, "step": 469 }, { "epoch": 0.07028562883206221, "grad_norm": 2.058627302052003, "learning_rate": 9.957618537894602e-06, "loss": 0.2234, "step": 470 }, { "epoch": 0.07043517272319426, "grad_norm": 2.1643867800571286, "learning_rate": 9.95730329845537e-06, "loss": 0.2658, "step": 471 }, { "epoch": 0.0705847166143263, "grad_norm": 1.9162877246393155, "learning_rate": 9.956986895985009e-06, "loss": 0.3514, "step": 472 }, { "epoch": 0.07073426050545835, "grad_norm": 2.0198300655217474, "learning_rate": 9.95666933055775e-06, "loss": 0.4191, "step": 473 }, { "epoch": 0.0708838043965904, "grad_norm": 1.8174642496449622, "learning_rate": 9.956350602248095e-06, "loss": 0.1802, "step": 474 }, { "epoch": 0.07103334828772245, "grad_norm": 1.7641599345266465, "learning_rate": 9.956030711130824e-06, "loss": 0.2181, "step": 475 }, { "epoch": 0.0711828921788545, "grad_norm": 1.5149058769435404, "learning_rate": 9.955709657280985e-06, "loss": 0.2068, "step": 476 }, { "epoch": 0.07133243606998654, "grad_norm": 2.14267612952952, "learning_rate": 9.955387440773902e-06, "loss": 0.2799, "step": 477 }, { "epoch": 0.07148197996111859, "grad_norm": 1.8794948861297893, "learning_rate": 9.955064061685166e-06, "loss": 0.3437, "step": 478 }, { "epoch": 0.07163152385225063, "grad_norm": 1.595856928796192, "learning_rate": 9.954739520090649e-06, "loss": 0.1741, "step": 479 }, { "epoch": 0.07178106774338268, "grad_norm": 1.4775459266699813, "learning_rate": 9.95441381606649e-06, "loss": 0.2009, "step": 480 }, { "epoch": 0.07193061163451472, "grad_norm": 1.4624583034603231, "learning_rate": 9.954086949689102e-06, "loss": 0.2413, "step": 481 }, { "epoch": 0.07208015552564678, "grad_norm": 1.5685428117813849, "learning_rate": 9.953758921035171e-06, "loss": 0.2381, "step": 482 }, { "epoch": 0.07222969941677883, "grad_norm": 2.0490413587537524, "learning_rate": 9.953429730181653e-06, "loss": 0.4092, "step": 483 }, { "epoch": 0.07237924330791087, "grad_norm": 2.605633491672469, "learning_rate": 9.953099377205786e-06, "loss": 0.56, "step": 484 }, { "epoch": 0.07252878719904292, "grad_norm": 1.6836189923086853, "learning_rate": 9.952767862185071e-06, "loss": 0.3514, "step": 485 }, { "epoch": 0.07267833109017496, "grad_norm": 2.165692386982445, "learning_rate": 9.952435185197281e-06, "loss": 0.4363, "step": 486 }, { "epoch": 0.07282787498130701, "grad_norm": 2.328987566639375, "learning_rate": 9.952101346320471e-06, "loss": 0.5953, "step": 487 }, { "epoch": 0.07297741887243907, "grad_norm": 1.857109300243422, "learning_rate": 9.951766345632957e-06, "loss": 0.4125, "step": 488 }, { "epoch": 0.07312696276357111, "grad_norm": 1.780608988332075, "learning_rate": 9.951430183213338e-06, "loss": 0.2793, "step": 489 }, { "epoch": 0.07327650665470316, "grad_norm": 1.2718866410706833, "learning_rate": 9.951092859140479e-06, "loss": 0.1878, "step": 490 }, { "epoch": 0.0734260505458352, "grad_norm": 1.389385388824981, "learning_rate": 9.95075437349352e-06, "loss": 0.1922, "step": 491 }, { "epoch": 0.07357559443696725, "grad_norm": 1.2364018773804621, "learning_rate": 9.950414726351873e-06, "loss": 0.1972, "step": 492 }, { "epoch": 0.0737251383280993, "grad_norm": 1.6438922682719497, "learning_rate": 9.95007391779522e-06, "loss": 0.3835, "step": 493 }, { "epoch": 0.07387468221923134, "grad_norm": 1.9223258334837023, "learning_rate": 9.949731947903523e-06, "loss": 0.5421, "step": 494 }, { "epoch": 0.0740242261103634, "grad_norm": 2.1294087718057955, "learning_rate": 9.949388816757009e-06, "loss": 0.6584, "step": 495 }, { "epoch": 0.07417377000149544, "grad_norm": 1.9620720670123732, "learning_rate": 9.949044524436178e-06, "loss": 0.3427, "step": 496 }, { "epoch": 0.07432331389262749, "grad_norm": 1.8767982308843718, "learning_rate": 9.948699071021806e-06, "loss": 0.2221, "step": 497 }, { "epoch": 0.07447285778375953, "grad_norm": 1.5717369659821445, "learning_rate": 9.948352456594938e-06, "loss": 0.3915, "step": 498 }, { "epoch": 0.07462240167489158, "grad_norm": 1.9105988284269253, "learning_rate": 9.948004681236896e-06, "loss": 0.4049, "step": 499 }, { "epoch": 0.07477194556602362, "grad_norm": 2.051255434710168, "learning_rate": 9.94765574502927e-06, "loss": 0.263, "step": 500 }, { "epoch": 0.07492148945715567, "grad_norm": 1.1727115808022262, "learning_rate": 9.947305648053924e-06, "loss": 0.2061, "step": 501 }, { "epoch": 0.07507103334828773, "grad_norm": 2.3851218898633566, "learning_rate": 9.946954390392995e-06, "loss": 0.3587, "step": 502 }, { "epoch": 0.07522057723941977, "grad_norm": 2.668333899893354, "learning_rate": 9.94660197212889e-06, "loss": 0.279, "step": 503 }, { "epoch": 0.07537012113055182, "grad_norm": 2.324044177768054, "learning_rate": 9.946248393344289e-06, "loss": 0.5219, "step": 504 }, { "epoch": 0.07551966502168386, "grad_norm": 2.252535927387564, "learning_rate": 9.945893654122147e-06, "loss": 0.4462, "step": 505 }, { "epoch": 0.07566920891281591, "grad_norm": 1.2553962948323492, "learning_rate": 9.945537754545689e-06, "loss": 0.1829, "step": 506 }, { "epoch": 0.07581875280394795, "grad_norm": 2.009514792075129, "learning_rate": 9.94518069469841e-06, "loss": 0.334, "step": 507 }, { "epoch": 0.07596829669508001, "grad_norm": 1.7045023449590413, "learning_rate": 9.944822474664082e-06, "loss": 0.3202, "step": 508 }, { "epoch": 0.07611784058621206, "grad_norm": 1.0508191419172128, "learning_rate": 9.944463094526747e-06, "loss": 0.205, "step": 509 }, { "epoch": 0.0762673844773441, "grad_norm": 1.6097293192900886, "learning_rate": 9.944102554370718e-06, "loss": 0.2324, "step": 510 }, { "epoch": 0.07641692836847615, "grad_norm": 1.9399148366487866, "learning_rate": 9.943740854280582e-06, "loss": 0.4526, "step": 511 }, { "epoch": 0.07656647225960819, "grad_norm": 2.0362256511499335, "learning_rate": 9.943377994341197e-06, "loss": 0.3979, "step": 512 }, { "epoch": 0.07671601615074024, "grad_norm": 1.5296316888698338, "learning_rate": 9.943013974637693e-06, "loss": 0.3789, "step": 513 }, { "epoch": 0.07686556004187228, "grad_norm": 1.496691000675503, "learning_rate": 9.942648795255473e-06, "loss": 0.2497, "step": 514 }, { "epoch": 0.07701510393300434, "grad_norm": 1.4146486247851384, "learning_rate": 9.942282456280212e-06, "loss": 0.3088, "step": 515 }, { "epoch": 0.07716464782413639, "grad_norm": 1.3671722765483707, "learning_rate": 9.941914957797855e-06, "loss": 0.2076, "step": 516 }, { "epoch": 0.07731419171526843, "grad_norm": 1.8485057563465108, "learning_rate": 9.941546299894623e-06, "loss": 0.3676, "step": 517 }, { "epoch": 0.07746373560640048, "grad_norm": 2.0438588429845255, "learning_rate": 9.941176482657005e-06, "loss": 0.4905, "step": 518 }, { "epoch": 0.07761327949753252, "grad_norm": 1.3215533906334498, "learning_rate": 9.940805506171765e-06, "loss": 0.2028, "step": 519 }, { "epoch": 0.07776282338866457, "grad_norm": 2.499241081917891, "learning_rate": 9.940433370525937e-06, "loss": 0.4323, "step": 520 }, { "epoch": 0.07791236727979663, "grad_norm": 1.4654220634749195, "learning_rate": 9.940060075806827e-06, "loss": 0.1928, "step": 521 }, { "epoch": 0.07806191117092867, "grad_norm": 2.32501667334618, "learning_rate": 9.939685622102013e-06, "loss": 0.6039, "step": 522 }, { "epoch": 0.07821145506206072, "grad_norm": 2.0353313744113644, "learning_rate": 9.939310009499348e-06, "loss": 0.434, "step": 523 }, { "epoch": 0.07836099895319276, "grad_norm": 1.5916248439200642, "learning_rate": 9.938933238086952e-06, "loss": 0.2484, "step": 524 }, { "epoch": 0.07851054284432481, "grad_norm": 1.510761606083, "learning_rate": 9.938555307953221e-06, "loss": 0.2761, "step": 525 }, { "epoch": 0.07866008673545685, "grad_norm": 1.6041562012438388, "learning_rate": 9.93817621918682e-06, "loss": 0.3032, "step": 526 }, { "epoch": 0.0788096306265889, "grad_norm": 1.5831322947558841, "learning_rate": 9.937795971876686e-06, "loss": 0.3486, "step": 527 }, { "epoch": 0.07895917451772096, "grad_norm": 2.2247878916503856, "learning_rate": 9.93741456611203e-06, "loss": 0.4087, "step": 528 }, { "epoch": 0.079108718408853, "grad_norm": 2.152252638423622, "learning_rate": 9.937032001982334e-06, "loss": 0.5629, "step": 529 }, { "epoch": 0.07925826229998505, "grad_norm": 2.0483514105705525, "learning_rate": 9.93664827957735e-06, "loss": 0.5279, "step": 530 }, { "epoch": 0.07940780619111709, "grad_norm": 1.2448870158155207, "learning_rate": 9.936263398987103e-06, "loss": 0.3744, "step": 531 }, { "epoch": 0.07955735008224914, "grad_norm": 0.9489762178863248, "learning_rate": 9.93587736030189e-06, "loss": 0.1631, "step": 532 }, { "epoch": 0.07970689397338118, "grad_norm": 1.3545590640653586, "learning_rate": 9.935490163612279e-06, "loss": 0.1975, "step": 533 }, { "epoch": 0.07985643786451324, "grad_norm": 1.3663228011672384, "learning_rate": 9.93510180900911e-06, "loss": 0.184, "step": 534 }, { "epoch": 0.08000598175564529, "grad_norm": 1.5768436668872405, "learning_rate": 9.934712296583497e-06, "loss": 0.3183, "step": 535 }, { "epoch": 0.08015552564677733, "grad_norm": 1.926347057489139, "learning_rate": 9.93432162642682e-06, "loss": 0.3305, "step": 536 }, { "epoch": 0.08030506953790938, "grad_norm": 2.0791782850566474, "learning_rate": 9.933929798630738e-06, "loss": 0.5009, "step": 537 }, { "epoch": 0.08045461342904142, "grad_norm": 2.1023331544425523, "learning_rate": 9.933536813287172e-06, "loss": 0.4292, "step": 538 }, { "epoch": 0.08060415732017347, "grad_norm": 2.8605361415271493, "learning_rate": 9.933142670488324e-06, "loss": 0.2666, "step": 539 }, { "epoch": 0.08075370121130551, "grad_norm": 2.7087693572573968, "learning_rate": 9.932747370326664e-06, "loss": 0.2544, "step": 540 }, { "epoch": 0.08090324510243757, "grad_norm": 1.5804074183588281, "learning_rate": 9.932350912894932e-06, "loss": 0.2089, "step": 541 }, { "epoch": 0.08105278899356962, "grad_norm": 1.6448934387271092, "learning_rate": 9.931953298286141e-06, "loss": 0.181, "step": 542 }, { "epoch": 0.08120233288470166, "grad_norm": 1.373017928034036, "learning_rate": 9.931554526593576e-06, "loss": 0.3218, "step": 543 }, { "epoch": 0.0813518767758337, "grad_norm": 1.4895748889012388, "learning_rate": 9.931154597910791e-06, "loss": 0.2472, "step": 544 }, { "epoch": 0.08150142066696575, "grad_norm": 2.064608760225509, "learning_rate": 9.930753512331615e-06, "loss": 0.3765, "step": 545 }, { "epoch": 0.0816509645580978, "grad_norm": 1.6526846905937504, "learning_rate": 9.930351269950144e-06, "loss": 0.3177, "step": 546 }, { "epoch": 0.08180050844922986, "grad_norm": 2.047798829134187, "learning_rate": 9.92994787086075e-06, "loss": 0.3192, "step": 547 }, { "epoch": 0.0819500523403619, "grad_norm": 2.122394373762569, "learning_rate": 9.929543315158073e-06, "loss": 0.5554, "step": 548 }, { "epoch": 0.08209959623149395, "grad_norm": 2.311960518258969, "learning_rate": 9.929137602937028e-06, "loss": 0.3797, "step": 549 }, { "epoch": 0.08224914012262599, "grad_norm": 1.8449832380251867, "learning_rate": 9.928730734292797e-06, "loss": 0.3894, "step": 550 }, { "epoch": 0.08239868401375804, "grad_norm": 1.995255157883457, "learning_rate": 9.928322709320834e-06, "loss": 0.3925, "step": 551 }, { "epoch": 0.08254822790489008, "grad_norm": 2.755405061449222, "learning_rate": 9.92791352811687e-06, "loss": 0.6899, "step": 552 }, { "epoch": 0.08269777179602213, "grad_norm": 1.2254981142470793, "learning_rate": 9.9275031907769e-06, "loss": 0.2225, "step": 553 }, { "epoch": 0.08284731568715419, "grad_norm": 1.9323036995913243, "learning_rate": 9.927091697397192e-06, "loss": 0.3865, "step": 554 }, { "epoch": 0.08299685957828623, "grad_norm": 2.0962863974348593, "learning_rate": 9.926679048074289e-06, "loss": 0.4, "step": 555 }, { "epoch": 0.08314640346941828, "grad_norm": 1.5847691098448267, "learning_rate": 9.926265242904998e-06, "loss": 0.247, "step": 556 }, { "epoch": 0.08329594736055032, "grad_norm": 2.5967594290859903, "learning_rate": 9.925850281986408e-06, "loss": 0.2083, "step": 557 }, { "epoch": 0.08344549125168237, "grad_norm": 2.0426826933231226, "learning_rate": 9.925434165415868e-06, "loss": 0.449, "step": 558 }, { "epoch": 0.08359503514281441, "grad_norm": 1.7693278888452375, "learning_rate": 9.925016893291007e-06, "loss": 0.2789, "step": 559 }, { "epoch": 0.08374457903394647, "grad_norm": 1.6227416269049326, "learning_rate": 9.924598465709717e-06, "loss": 0.2209, "step": 560 }, { "epoch": 0.08389412292507852, "grad_norm": 1.7055307729140163, "learning_rate": 9.924178882770166e-06, "loss": 0.3554, "step": 561 }, { "epoch": 0.08404366681621056, "grad_norm": 1.9245436136675982, "learning_rate": 9.923758144570792e-06, "loss": 0.5343, "step": 562 }, { "epoch": 0.0841932107073426, "grad_norm": 1.3916186974123048, "learning_rate": 9.923336251210306e-06, "loss": 0.2328, "step": 563 }, { "epoch": 0.08434275459847465, "grad_norm": 1.8724253939088875, "learning_rate": 9.92291320278769e-06, "loss": 0.2691, "step": 564 }, { "epoch": 0.0844922984896067, "grad_norm": 1.545927153493535, "learning_rate": 9.922488999402191e-06, "loss": 0.2049, "step": 565 }, { "epoch": 0.08464184238073874, "grad_norm": 2.216312298348258, "learning_rate": 9.922063641153332e-06, "loss": 0.5844, "step": 566 }, { "epoch": 0.0847913862718708, "grad_norm": 1.2444734652143745, "learning_rate": 9.921637128140909e-06, "loss": 0.2872, "step": 567 }, { "epoch": 0.08494093016300285, "grad_norm": 2.133851301389792, "learning_rate": 9.921209460464983e-06, "loss": 0.2418, "step": 568 }, { "epoch": 0.08509047405413489, "grad_norm": 1.5462263702909163, "learning_rate": 9.92078063822589e-06, "loss": 0.3438, "step": 569 }, { "epoch": 0.08524001794526694, "grad_norm": 2.341879963295622, "learning_rate": 9.920350661524237e-06, "loss": 0.5783, "step": 570 }, { "epoch": 0.08538956183639898, "grad_norm": 1.7633187330163729, "learning_rate": 9.919919530460899e-06, "loss": 0.3503, "step": 571 }, { "epoch": 0.08553910572753103, "grad_norm": 2.1676160714531107, "learning_rate": 9.919487245137024e-06, "loss": 0.2098, "step": 572 }, { "epoch": 0.08568864961866307, "grad_norm": 2.198855334486466, "learning_rate": 9.919053805654029e-06, "loss": 0.3876, "step": 573 }, { "epoch": 0.08583819350979513, "grad_norm": 1.821472616891953, "learning_rate": 9.918619212113607e-06, "loss": 0.391, "step": 574 }, { "epoch": 0.08598773740092717, "grad_norm": 1.4553776733520012, "learning_rate": 9.918183464617714e-06, "loss": 0.2032, "step": 575 }, { "epoch": 0.08613728129205922, "grad_norm": 1.5817735791823646, "learning_rate": 9.917746563268581e-06, "loss": 0.2658, "step": 576 }, { "epoch": 0.08628682518319127, "grad_norm": 2.255323258805483, "learning_rate": 9.917308508168712e-06, "loss": 0.39, "step": 577 }, { "epoch": 0.08643636907432331, "grad_norm": 1.699175902078527, "learning_rate": 9.916869299420875e-06, "loss": 0.1906, "step": 578 }, { "epoch": 0.08658591296545536, "grad_norm": 1.5572993513277051, "learning_rate": 9.916428937128117e-06, "loss": 0.3438, "step": 579 }, { "epoch": 0.08673545685658741, "grad_norm": 1.5095119263162684, "learning_rate": 9.915987421393747e-06, "loss": 0.272, "step": 580 }, { "epoch": 0.08688500074771946, "grad_norm": 2.8137128440101735, "learning_rate": 9.91554475232135e-06, "loss": 0.3833, "step": 581 }, { "epoch": 0.0870345446388515, "grad_norm": 1.845156278788705, "learning_rate": 9.915100930014786e-06, "loss": 0.4658, "step": 582 }, { "epoch": 0.08718408852998355, "grad_norm": 1.7624433765379017, "learning_rate": 9.914655954578171e-06, "loss": 0.3968, "step": 583 }, { "epoch": 0.0873336324211156, "grad_norm": 1.7915618837196812, "learning_rate": 9.914209826115906e-06, "loss": 0.4901, "step": 584 }, { "epoch": 0.08748317631224764, "grad_norm": 1.8335500777788887, "learning_rate": 9.913762544732654e-06, "loss": 0.249, "step": 585 }, { "epoch": 0.08763272020337969, "grad_norm": 1.5116580783389033, "learning_rate": 9.913314110533355e-06, "loss": 0.3999, "step": 586 }, { "epoch": 0.08778226409451174, "grad_norm": 1.9828537343745032, "learning_rate": 9.912864523623214e-06, "loss": 0.4153, "step": 587 }, { "epoch": 0.08793180798564379, "grad_norm": 1.6056147158647165, "learning_rate": 9.912413784107709e-06, "loss": 0.357, "step": 588 }, { "epoch": 0.08808135187677583, "grad_norm": 1.7642170812152784, "learning_rate": 9.911961892092587e-06, "loss": 0.3425, "step": 589 }, { "epoch": 0.08823089576790788, "grad_norm": 1.925307511563271, "learning_rate": 9.911508847683867e-06, "loss": 0.4476, "step": 590 }, { "epoch": 0.08838043965903992, "grad_norm": 1.9824372539957273, "learning_rate": 9.911054650987837e-06, "loss": 0.4597, "step": 591 }, { "epoch": 0.08852998355017197, "grad_norm": 1.5805088418089035, "learning_rate": 9.910599302111057e-06, "loss": 0.1935, "step": 592 }, { "epoch": 0.08867952744130403, "grad_norm": 2.157404890931188, "learning_rate": 9.910142801160355e-06, "loss": 0.3443, "step": 593 }, { "epoch": 0.08882907133243607, "grad_norm": 2.094900000445731, "learning_rate": 9.909685148242831e-06, "loss": 0.404, "step": 594 }, { "epoch": 0.08897861522356812, "grad_norm": 2.336415519412793, "learning_rate": 9.909226343465856e-06, "loss": 0.6382, "step": 595 }, { "epoch": 0.08912815911470016, "grad_norm": 2.0552137049182497, "learning_rate": 9.908766386937067e-06, "loss": 0.3908, "step": 596 }, { "epoch": 0.08927770300583221, "grad_norm": 1.1564393734179468, "learning_rate": 9.908305278764376e-06, "loss": 0.2457, "step": 597 }, { "epoch": 0.08942724689696425, "grad_norm": 1.8704284289450437, "learning_rate": 9.907843019055966e-06, "loss": 0.3604, "step": 598 }, { "epoch": 0.0895767907880963, "grad_norm": 1.295042190600909, "learning_rate": 9.907379607920281e-06, "loss": 0.2075, "step": 599 }, { "epoch": 0.08972633467922836, "grad_norm": 1.8305770820800886, "learning_rate": 9.90691504546605e-06, "loss": 0.2698, "step": 600 }, { "epoch": 0.0898758785703604, "grad_norm": 1.7240290275544472, "learning_rate": 9.906449331802256e-06, "loss": 0.2504, "step": 601 }, { "epoch": 0.09002542246149245, "grad_norm": 1.0036789417827203, "learning_rate": 9.905982467038167e-06, "loss": 0.195, "step": 602 }, { "epoch": 0.0901749663526245, "grad_norm": 1.6777253578130231, "learning_rate": 9.905514451283308e-06, "loss": 0.2436, "step": 603 }, { "epoch": 0.09032451024375654, "grad_norm": 1.9190873052270145, "learning_rate": 9.905045284647483e-06, "loss": 0.4006, "step": 604 }, { "epoch": 0.09047405413488858, "grad_norm": 1.77001911452716, "learning_rate": 9.904574967240764e-06, "loss": 0.3703, "step": 605 }, { "epoch": 0.09062359802602064, "grad_norm": 1.3114492277508998, "learning_rate": 9.904103499173487e-06, "loss": 0.2323, "step": 606 }, { "epoch": 0.09077314191715269, "grad_norm": 1.6694643051834908, "learning_rate": 9.90363088055627e-06, "loss": 0.2881, "step": 607 }, { "epoch": 0.09092268580828473, "grad_norm": 1.4448454411512122, "learning_rate": 9.903157111499988e-06, "loss": 0.2341, "step": 608 }, { "epoch": 0.09107222969941678, "grad_norm": 1.8302982894061834, "learning_rate": 9.902682192115795e-06, "loss": 0.3497, "step": 609 }, { "epoch": 0.09122177359054882, "grad_norm": 1.4089802820999182, "learning_rate": 9.902206122515113e-06, "loss": 0.1565, "step": 610 }, { "epoch": 0.09137131748168087, "grad_norm": 2.275670976517465, "learning_rate": 9.901728902809627e-06, "loss": 0.482, "step": 611 }, { "epoch": 0.09152086137281291, "grad_norm": 2.3916744409549997, "learning_rate": 9.901250533111301e-06, "loss": 0.539, "step": 612 }, { "epoch": 0.09167040526394497, "grad_norm": 1.110965438282227, "learning_rate": 9.900771013532367e-06, "loss": 0.2257, "step": 613 }, { "epoch": 0.09181994915507702, "grad_norm": 1.6169969209154105, "learning_rate": 9.900290344185321e-06, "loss": 0.2316, "step": 614 }, { "epoch": 0.09196949304620906, "grad_norm": 1.390950490331229, "learning_rate": 9.899808525182935e-06, "loss": 0.1735, "step": 615 }, { "epoch": 0.09211903693734111, "grad_norm": 1.26641152514348, "learning_rate": 9.899325556638247e-06, "loss": 0.2269, "step": 616 }, { "epoch": 0.09226858082847315, "grad_norm": 1.107259968960053, "learning_rate": 9.898841438664568e-06, "loss": 0.2082, "step": 617 }, { "epoch": 0.0924181247196052, "grad_norm": 1.6779136428714192, "learning_rate": 9.898356171375473e-06, "loss": 0.3744, "step": 618 }, { "epoch": 0.09256766861073726, "grad_norm": 1.8012739115801626, "learning_rate": 9.897869754884816e-06, "loss": 0.2438, "step": 619 }, { "epoch": 0.0927172125018693, "grad_norm": 1.6400812519548655, "learning_rate": 9.89738218930671e-06, "loss": 0.3692, "step": 620 }, { "epoch": 0.09286675639300135, "grad_norm": 2.7659374426954972, "learning_rate": 9.896893474755547e-06, "loss": 0.5873, "step": 621 }, { "epoch": 0.0930163002841334, "grad_norm": 3.020452608035097, "learning_rate": 9.89640361134598e-06, "loss": 0.4177, "step": 622 }, { "epoch": 0.09316584417526544, "grad_norm": 1.4907614824403637, "learning_rate": 9.895912599192937e-06, "loss": 0.2516, "step": 623 }, { "epoch": 0.09331538806639748, "grad_norm": 1.6636615032724535, "learning_rate": 9.895420438411616e-06, "loss": 0.1935, "step": 624 }, { "epoch": 0.09346493195752953, "grad_norm": 1.9719905447621995, "learning_rate": 9.89492712911748e-06, "loss": 0.2135, "step": 625 }, { "epoch": 0.09361447584866159, "grad_norm": 1.3681787330772102, "learning_rate": 9.894432671426264e-06, "loss": 0.208, "step": 626 }, { "epoch": 0.09376401973979363, "grad_norm": 2.0793649946453043, "learning_rate": 9.893937065453976e-06, "loss": 0.3719, "step": 627 }, { "epoch": 0.09391356363092568, "grad_norm": 1.685584025343787, "learning_rate": 9.893440311316887e-06, "loss": 0.2164, "step": 628 }, { "epoch": 0.09406310752205772, "grad_norm": 1.2145425693019332, "learning_rate": 9.892942409131541e-06, "loss": 0.1725, "step": 629 }, { "epoch": 0.09421265141318977, "grad_norm": 1.1438517718036314, "learning_rate": 9.892443359014752e-06, "loss": 0.2367, "step": 630 }, { "epoch": 0.09436219530432181, "grad_norm": 1.4416913213257094, "learning_rate": 9.8919431610836e-06, "loss": 0.2254, "step": 631 }, { "epoch": 0.09451173919545386, "grad_norm": 1.2656296241346114, "learning_rate": 9.891441815455436e-06, "loss": 0.2485, "step": 632 }, { "epoch": 0.09466128308658592, "grad_norm": 1.4276056880724206, "learning_rate": 9.890939322247881e-06, "loss": 0.1908, "step": 633 }, { "epoch": 0.09481082697771796, "grad_norm": 1.8185771152087218, "learning_rate": 9.890435681578827e-06, "loss": 0.2096, "step": 634 }, { "epoch": 0.09496037086885001, "grad_norm": 1.2794518689910337, "learning_rate": 9.88993089356643e-06, "loss": 0.2394, "step": 635 }, { "epoch": 0.09510991475998205, "grad_norm": 2.0227594086297738, "learning_rate": 9.88942495832912e-06, "loss": 0.59, "step": 636 }, { "epoch": 0.0952594586511141, "grad_norm": 1.3323082817593526, "learning_rate": 9.888917875985593e-06, "loss": 0.2073, "step": 637 }, { "epoch": 0.09540900254224614, "grad_norm": 1.7884206661676574, "learning_rate": 9.888409646654818e-06, "loss": 0.3897, "step": 638 }, { "epoch": 0.0955585464333782, "grad_norm": 2.124144136353745, "learning_rate": 9.887900270456025e-06, "loss": 0.5683, "step": 639 }, { "epoch": 0.09570809032451025, "grad_norm": 1.4793433841619534, "learning_rate": 9.887389747508725e-06, "loss": 0.3727, "step": 640 }, { "epoch": 0.09585763421564229, "grad_norm": 1.0661747667222115, "learning_rate": 9.88687807793269e-06, "loss": 0.1983, "step": 641 }, { "epoch": 0.09600717810677434, "grad_norm": 1.615153009655538, "learning_rate": 9.886365261847957e-06, "loss": 0.3675, "step": 642 }, { "epoch": 0.09615672199790638, "grad_norm": 1.4963878387365324, "learning_rate": 9.885851299374844e-06, "loss": 0.1805, "step": 643 }, { "epoch": 0.09630626588903843, "grad_norm": 1.8529323065992462, "learning_rate": 9.88533619063393e-06, "loss": 0.391, "step": 644 }, { "epoch": 0.09645580978017047, "grad_norm": 2.4764246014732145, "learning_rate": 9.884819935746063e-06, "loss": 0.2605, "step": 645 }, { "epoch": 0.09660535367130253, "grad_norm": 1.904672440883197, "learning_rate": 9.884302534832361e-06, "loss": 0.3935, "step": 646 }, { "epoch": 0.09675489756243458, "grad_norm": 1.9431435460380113, "learning_rate": 9.883783988014216e-06, "loss": 0.2092, "step": 647 }, { "epoch": 0.09690444145356662, "grad_norm": 2.0946695671241553, "learning_rate": 9.883264295413278e-06, "loss": 0.3957, "step": 648 }, { "epoch": 0.09705398534469867, "grad_norm": 1.0944344711946927, "learning_rate": 9.882743457151476e-06, "loss": 0.202, "step": 649 }, { "epoch": 0.09720352923583071, "grad_norm": 1.5147259026498003, "learning_rate": 9.882221473351e-06, "loss": 0.3029, "step": 650 }, { "epoch": 0.09735307312696276, "grad_norm": 1.3452835965457643, "learning_rate": 9.881698344134316e-06, "loss": 0.2159, "step": 651 }, { "epoch": 0.09750261701809482, "grad_norm": 1.7952640402406481, "learning_rate": 9.881174069624155e-06, "loss": 0.4006, "step": 652 }, { "epoch": 0.09765216090922686, "grad_norm": 2.468540255171398, "learning_rate": 9.880648649943515e-06, "loss": 0.4393, "step": 653 }, { "epoch": 0.09780170480035891, "grad_norm": 1.5332585075726441, "learning_rate": 9.880122085215664e-06, "loss": 0.2401, "step": 654 }, { "epoch": 0.09795124869149095, "grad_norm": 1.5882881108110953, "learning_rate": 9.87959437556414e-06, "loss": 0.2078, "step": 655 }, { "epoch": 0.098100792582623, "grad_norm": 1.7962702189497488, "learning_rate": 9.87906552111275e-06, "loss": 0.4793, "step": 656 }, { "epoch": 0.09825033647375504, "grad_norm": 1.860004859316795, "learning_rate": 9.878535521985568e-06, "loss": 0.2388, "step": 657 }, { "epoch": 0.09839988036488709, "grad_norm": 1.9861019609665855, "learning_rate": 9.878004378306934e-06, "loss": 0.3721, "step": 658 }, { "epoch": 0.09854942425601915, "grad_norm": 1.5404208138898199, "learning_rate": 9.877472090201463e-06, "loss": 0.3534, "step": 659 }, { "epoch": 0.09869896814715119, "grad_norm": 3.0119825067072306, "learning_rate": 9.876938657794036e-06, "loss": 0.6732, "step": 660 }, { "epoch": 0.09884851203828324, "grad_norm": 1.5069735817087104, "learning_rate": 9.876404081209796e-06, "loss": 0.4004, "step": 661 }, { "epoch": 0.09899805592941528, "grad_norm": 1.6856753387650372, "learning_rate": 9.875868360574164e-06, "loss": 0.2942, "step": 662 }, { "epoch": 0.09914759982054733, "grad_norm": 1.6896901311725145, "learning_rate": 9.875331496012822e-06, "loss": 0.239, "step": 663 }, { "epoch": 0.09929714371167937, "grad_norm": 2.2770505228904225, "learning_rate": 9.87479348765173e-06, "loss": 0.4755, "step": 664 }, { "epoch": 0.09944668760281143, "grad_norm": 1.9016485099179228, "learning_rate": 9.874254335617102e-06, "loss": 0.4645, "step": 665 }, { "epoch": 0.09959623149394348, "grad_norm": 1.6638896812103354, "learning_rate": 9.873714040035434e-06, "loss": 0.2512, "step": 666 }, { "epoch": 0.09974577538507552, "grad_norm": 1.7233554952000107, "learning_rate": 9.873172601033482e-06, "loss": 0.3958, "step": 667 }, { "epoch": 0.09989531927620757, "grad_norm": 1.7250170911584946, "learning_rate": 9.872630018738271e-06, "loss": 0.3115, "step": 668 }, { "epoch": 0.10004486316733961, "grad_norm": 1.8843746906489027, "learning_rate": 9.872086293277101e-06, "loss": 0.3789, "step": 669 }, { "epoch": 0.10019440705847166, "grad_norm": 1.943275185299739, "learning_rate": 9.871541424777534e-06, "loss": 0.4192, "step": 670 }, { "epoch": 0.1003439509496037, "grad_norm": 1.4918005726247283, "learning_rate": 9.870995413367397e-06, "loss": 0.2538, "step": 671 }, { "epoch": 0.10049349484073576, "grad_norm": 1.6441123648652987, "learning_rate": 9.870448259174791e-06, "loss": 0.2295, "step": 672 }, { "epoch": 0.1006430387318678, "grad_norm": 1.933429186975597, "learning_rate": 9.86989996232809e-06, "loss": 0.4015, "step": 673 }, { "epoch": 0.10079258262299985, "grad_norm": 1.8125640882474123, "learning_rate": 9.869350522955921e-06, "loss": 0.3807, "step": 674 }, { "epoch": 0.1009421265141319, "grad_norm": 1.9369733002230116, "learning_rate": 9.868799941187193e-06, "loss": 0.5201, "step": 675 }, { "epoch": 0.10109167040526394, "grad_norm": 1.5216959755972845, "learning_rate": 9.868248217151075e-06, "loss": 0.3624, "step": 676 }, { "epoch": 0.10124121429639599, "grad_norm": 1.532054269025379, "learning_rate": 9.867695350977009e-06, "loss": 0.2738, "step": 677 }, { "epoch": 0.10139075818752805, "grad_norm": 1.9725714032650388, "learning_rate": 9.867141342794703e-06, "loss": 0.5802, "step": 678 }, { "epoch": 0.10154030207866009, "grad_norm": 1.9107978616944274, "learning_rate": 9.86658619273413e-06, "loss": 0.482, "step": 679 }, { "epoch": 0.10168984596979214, "grad_norm": 1.919639496784501, "learning_rate": 9.866029900925535e-06, "loss": 0.3558, "step": 680 }, { "epoch": 0.10183938986092418, "grad_norm": 2.174841069849439, "learning_rate": 9.865472467499431e-06, "loss": 0.6996, "step": 681 }, { "epoch": 0.10198893375205623, "grad_norm": 2.2558702972279807, "learning_rate": 9.864913892586596e-06, "loss": 0.2397, "step": 682 }, { "epoch": 0.10213847764318827, "grad_norm": 2.1196800969183105, "learning_rate": 9.864354176318076e-06, "loss": 0.3793, "step": 683 }, { "epoch": 0.10228802153432032, "grad_norm": 2.165719475550091, "learning_rate": 9.863793318825186e-06, "loss": 0.2154, "step": 684 }, { "epoch": 0.10243756542545238, "grad_norm": 1.7513134063770632, "learning_rate": 9.86323132023951e-06, "loss": 0.3816, "step": 685 }, { "epoch": 0.10258710931658442, "grad_norm": 1.7103742255808732, "learning_rate": 9.862668180692897e-06, "loss": 0.2469, "step": 686 }, { "epoch": 0.10273665320771647, "grad_norm": 1.9784764768939407, "learning_rate": 9.862103900317467e-06, "loss": 0.2279, "step": 687 }, { "epoch": 0.10288619709884851, "grad_norm": 2.5430996153598877, "learning_rate": 9.861538479245603e-06, "loss": 0.4512, "step": 688 }, { "epoch": 0.10303574098998056, "grad_norm": 1.3584315188319882, "learning_rate": 9.86097191760996e-06, "loss": 0.2521, "step": 689 }, { "epoch": 0.1031852848811126, "grad_norm": 1.8041511333081743, "learning_rate": 9.860404215543458e-06, "loss": 0.3794, "step": 690 }, { "epoch": 0.10333482877224466, "grad_norm": 2.261581805469511, "learning_rate": 9.859835373179285e-06, "loss": 0.5264, "step": 691 }, { "epoch": 0.1034843726633767, "grad_norm": 1.4531049528328563, "learning_rate": 9.859265390650897e-06, "loss": 0.2069, "step": 692 }, { "epoch": 0.10363391655450875, "grad_norm": 1.6530791454319427, "learning_rate": 9.85869426809202e-06, "loss": 0.2304, "step": 693 }, { "epoch": 0.1037834604456408, "grad_norm": 1.5868398701857311, "learning_rate": 9.85812200563664e-06, "loss": 0.3894, "step": 694 }, { "epoch": 0.10393300433677284, "grad_norm": 1.4690408418702507, "learning_rate": 9.857548603419019e-06, "loss": 0.3383, "step": 695 }, { "epoch": 0.10408254822790489, "grad_norm": 1.868386725453433, "learning_rate": 9.856974061573682e-06, "loss": 0.4666, "step": 696 }, { "epoch": 0.10423209211903693, "grad_norm": 1.3771017197315938, "learning_rate": 9.856398380235422e-06, "loss": 0.2285, "step": 697 }, { "epoch": 0.10438163601016899, "grad_norm": 2.452990479638216, "learning_rate": 9.855821559539298e-06, "loss": 0.7219, "step": 698 }, { "epoch": 0.10453117990130104, "grad_norm": 1.8742322224001207, "learning_rate": 9.85524359962064e-06, "loss": 0.4803, "step": 699 }, { "epoch": 0.10468072379243308, "grad_norm": 1.858692042760981, "learning_rate": 9.854664500615041e-06, "loss": 0.2273, "step": 700 }, { "epoch": 0.10483026768356513, "grad_norm": 1.1355721780236596, "learning_rate": 9.854084262658365e-06, "loss": 0.1947, "step": 701 }, { "epoch": 0.10497981157469717, "grad_norm": 1.3464195395769243, "learning_rate": 9.853502885886738e-06, "loss": 0.1988, "step": 702 }, { "epoch": 0.10512935546582922, "grad_norm": 1.204875080370136, "learning_rate": 9.852920370436561e-06, "loss": 0.3027, "step": 703 }, { "epoch": 0.10527889935696126, "grad_norm": 1.3557124537174092, "learning_rate": 9.852336716444496e-06, "loss": 0.2158, "step": 704 }, { "epoch": 0.10542844324809332, "grad_norm": 1.5752529363149261, "learning_rate": 9.851751924047472e-06, "loss": 0.3324, "step": 705 }, { "epoch": 0.10557798713922537, "grad_norm": 1.7915590890665287, "learning_rate": 9.85116599338269e-06, "loss": 0.4936, "step": 706 }, { "epoch": 0.10572753103035741, "grad_norm": 1.6842493918087815, "learning_rate": 9.850578924587614e-06, "loss": 0.4249, "step": 707 }, { "epoch": 0.10587707492148946, "grad_norm": 1.781419189683173, "learning_rate": 9.849990717799975e-06, "loss": 0.2615, "step": 708 }, { "epoch": 0.1060266188126215, "grad_norm": 1.4202393409091985, "learning_rate": 9.849401373157772e-06, "loss": 0.3256, "step": 709 }, { "epoch": 0.10617616270375355, "grad_norm": 1.3714522045342281, "learning_rate": 9.84881089079927e-06, "loss": 0.219, "step": 710 }, { "epoch": 0.1063257065948856, "grad_norm": 1.7391677942386203, "learning_rate": 9.848219270863005e-06, "loss": 0.2249, "step": 711 }, { "epoch": 0.10647525048601765, "grad_norm": 1.3023890791191592, "learning_rate": 9.847626513487774e-06, "loss": 0.3693, "step": 712 }, { "epoch": 0.1066247943771497, "grad_norm": 1.7969068078667318, "learning_rate": 9.847032618812647e-06, "loss": 0.2298, "step": 713 }, { "epoch": 0.10677433826828174, "grad_norm": 2.102291030534645, "learning_rate": 9.846437586976952e-06, "loss": 0.4688, "step": 714 }, { "epoch": 0.10692388215941379, "grad_norm": 1.072288463866959, "learning_rate": 9.845841418120295e-06, "loss": 0.2023, "step": 715 }, { "epoch": 0.10707342605054583, "grad_norm": 1.3278088866624802, "learning_rate": 9.845244112382536e-06, "loss": 0.3492, "step": 716 }, { "epoch": 0.10722296994167788, "grad_norm": 1.3771047197586663, "learning_rate": 9.844645669903816e-06, "loss": 0.2152, "step": 717 }, { "epoch": 0.10737251383280993, "grad_norm": 1.243148446265919, "learning_rate": 9.844046090824533e-06, "loss": 0.2419, "step": 718 }, { "epoch": 0.10752205772394198, "grad_norm": 1.3994827626329662, "learning_rate": 9.843445375285351e-06, "loss": 0.3578, "step": 719 }, { "epoch": 0.10767160161507403, "grad_norm": 2.180600395588636, "learning_rate": 9.842843523427207e-06, "loss": 0.4159, "step": 720 }, { "epoch": 0.10782114550620607, "grad_norm": 1.853639106134475, "learning_rate": 9.842240535391301e-06, "loss": 0.3929, "step": 721 }, { "epoch": 0.10797068939733812, "grad_norm": 2.1662710209518306, "learning_rate": 9.841636411319098e-06, "loss": 0.298, "step": 722 }, { "epoch": 0.10812023328847016, "grad_norm": 1.604340910092426, "learning_rate": 9.841031151352332e-06, "loss": 0.2175, "step": 723 }, { "epoch": 0.10826977717960222, "grad_norm": 2.486345181702559, "learning_rate": 9.840424755633002e-06, "loss": 0.5179, "step": 724 }, { "epoch": 0.10841932107073426, "grad_norm": 1.415864057650498, "learning_rate": 9.83981722430338e-06, "loss": 0.3539, "step": 725 }, { "epoch": 0.10856886496186631, "grad_norm": 1.4949172725362427, "learning_rate": 9.839208557505989e-06, "loss": 0.382, "step": 726 }, { "epoch": 0.10871840885299835, "grad_norm": 2.2920695398684576, "learning_rate": 9.838598755383636e-06, "loss": 0.5086, "step": 727 }, { "epoch": 0.1088679527441304, "grad_norm": 1.6667328126020315, "learning_rate": 9.837987818079382e-06, "loss": 0.3736, "step": 728 }, { "epoch": 0.10901749663526245, "grad_norm": 1.3398213843433537, "learning_rate": 9.837375745736562e-06, "loss": 0.1827, "step": 729 }, { "epoch": 0.10916704052639449, "grad_norm": 1.9606925081810038, "learning_rate": 9.83676253849877e-06, "loss": 0.3992, "step": 730 }, { "epoch": 0.10931658441752655, "grad_norm": 1.7774772468148792, "learning_rate": 9.836148196509875e-06, "loss": 0.4769, "step": 731 }, { "epoch": 0.1094661283086586, "grad_norm": 1.9643639344581283, "learning_rate": 9.835532719914005e-06, "loss": 0.6049, "step": 732 }, { "epoch": 0.10961567219979064, "grad_norm": 2.205274189695602, "learning_rate": 9.834916108855557e-06, "loss": 0.3679, "step": 733 }, { "epoch": 0.10976521609092268, "grad_norm": 1.5293962633909763, "learning_rate": 9.834298363479193e-06, "loss": 0.3355, "step": 734 }, { "epoch": 0.10991475998205473, "grad_norm": 1.985725165029928, "learning_rate": 9.833679483929846e-06, "loss": 0.357, "step": 735 }, { "epoch": 0.11006430387318678, "grad_norm": 1.938156924090921, "learning_rate": 9.833059470352705e-06, "loss": 0.2667, "step": 736 }, { "epoch": 0.11021384776431883, "grad_norm": 1.6208704990029696, "learning_rate": 9.832438322893235e-06, "loss": 0.2751, "step": 737 }, { "epoch": 0.11036339165545088, "grad_norm": 1.6610841289834064, "learning_rate": 9.831816041697164e-06, "loss": 0.2319, "step": 738 }, { "epoch": 0.11051293554658292, "grad_norm": 2.003195385581558, "learning_rate": 9.831192626910482e-06, "loss": 0.3799, "step": 739 }, { "epoch": 0.11066247943771497, "grad_norm": 1.8861050802330894, "learning_rate": 9.83056807867945e-06, "loss": 0.4804, "step": 740 }, { "epoch": 0.11081202332884701, "grad_norm": 1.6483143403386502, "learning_rate": 9.829942397150593e-06, "loss": 0.3658, "step": 741 }, { "epoch": 0.11096156721997906, "grad_norm": 1.5438600790491723, "learning_rate": 9.829315582470702e-06, "loss": 0.2297, "step": 742 }, { "epoch": 0.1111111111111111, "grad_norm": 1.4215916414139778, "learning_rate": 9.828687634786834e-06, "loss": 0.2365, "step": 743 }, { "epoch": 0.11126065500224316, "grad_norm": 1.7761192463313074, "learning_rate": 9.828058554246309e-06, "loss": 0.3052, "step": 744 }, { "epoch": 0.11141019889337521, "grad_norm": 1.6148872971257344, "learning_rate": 9.82742834099672e-06, "loss": 0.2199, "step": 745 }, { "epoch": 0.11155974278450725, "grad_norm": 2.0162005563823646, "learning_rate": 9.826796995185916e-06, "loss": 0.3839, "step": 746 }, { "epoch": 0.1117092866756393, "grad_norm": 1.4737452330787222, "learning_rate": 9.826164516962022e-06, "loss": 0.1869, "step": 747 }, { "epoch": 0.11185883056677134, "grad_norm": 2.102437337141145, "learning_rate": 9.82553090647342e-06, "loss": 0.2615, "step": 748 }, { "epoch": 0.11200837445790339, "grad_norm": 1.7050095093194846, "learning_rate": 9.82489616386876e-06, "loss": 0.2518, "step": 749 }, { "epoch": 0.11215791834903545, "grad_norm": 1.5205595562607015, "learning_rate": 9.824260289296963e-06, "loss": 0.1792, "step": 750 }, { "epoch": 0.1123074622401675, "grad_norm": 1.7480859507023934, "learning_rate": 9.823623282907207e-06, "loss": 0.4179, "step": 751 }, { "epoch": 0.11245700613129954, "grad_norm": 1.5498394808983003, "learning_rate": 9.822985144848944e-06, "loss": 0.3358, "step": 752 }, { "epoch": 0.11260655002243158, "grad_norm": 1.6393482758244988, "learning_rate": 9.822345875271884e-06, "loss": 0.2149, "step": 753 }, { "epoch": 0.11275609391356363, "grad_norm": 1.9754923234031054, "learning_rate": 9.821705474326006e-06, "loss": 0.434, "step": 754 }, { "epoch": 0.11290563780469567, "grad_norm": 2.2364190645343154, "learning_rate": 9.821063942161558e-06, "loss": 0.2228, "step": 755 }, { "epoch": 0.11305518169582772, "grad_norm": 1.4307479990009164, "learning_rate": 9.820421278929045e-06, "loss": 0.3547, "step": 756 }, { "epoch": 0.11320472558695978, "grad_norm": 1.2078809494224174, "learning_rate": 9.819777484779242e-06, "loss": 0.2245, "step": 757 }, { "epoch": 0.11335426947809182, "grad_norm": 2.6305960032740354, "learning_rate": 9.819132559863194e-06, "loss": 0.6771, "step": 758 }, { "epoch": 0.11350381336922387, "grad_norm": 1.4792675137281683, "learning_rate": 9.818486504332203e-06, "loss": 0.318, "step": 759 }, { "epoch": 0.11365335726035591, "grad_norm": 2.127350110714495, "learning_rate": 9.817839318337839e-06, "loss": 0.4925, "step": 760 }, { "epoch": 0.11380290115148796, "grad_norm": 1.8233415508114148, "learning_rate": 9.81719100203194e-06, "loss": 0.2747, "step": 761 }, { "epoch": 0.11395244504262, "grad_norm": 2.1187219443475156, "learning_rate": 9.81654155556661e-06, "loss": 0.4595, "step": 762 }, { "epoch": 0.11410198893375205, "grad_norm": 1.5759792753813915, "learning_rate": 9.81589097909421e-06, "loss": 0.3553, "step": 763 }, { "epoch": 0.11425153282488411, "grad_norm": 1.708776908270571, "learning_rate": 9.815239272767373e-06, "loss": 0.3091, "step": 764 }, { "epoch": 0.11440107671601615, "grad_norm": 1.703775430420963, "learning_rate": 9.814586436738998e-06, "loss": 0.3728, "step": 765 }, { "epoch": 0.1145506206071482, "grad_norm": 1.6198262441466886, "learning_rate": 9.813932471162245e-06, "loss": 0.2498, "step": 766 }, { "epoch": 0.11470016449828024, "grad_norm": 1.4858642435718663, "learning_rate": 9.813277376190539e-06, "loss": 0.2299, "step": 767 }, { "epoch": 0.11484970838941229, "grad_norm": 1.8052387881768808, "learning_rate": 9.812621151977574e-06, "loss": 0.3834, "step": 768 }, { "epoch": 0.11499925228054433, "grad_norm": 1.7902664470941898, "learning_rate": 9.811963798677306e-06, "loss": 0.2282, "step": 769 }, { "epoch": 0.11514879617167639, "grad_norm": 1.6316784429425562, "learning_rate": 9.811305316443956e-06, "loss": 0.2396, "step": 770 }, { "epoch": 0.11529834006280844, "grad_norm": 1.414088700773603, "learning_rate": 9.81064570543201e-06, "loss": 0.2353, "step": 771 }, { "epoch": 0.11544788395394048, "grad_norm": 1.9219176628835946, "learning_rate": 9.80998496579622e-06, "loss": 0.3379, "step": 772 }, { "epoch": 0.11559742784507253, "grad_norm": 1.113085528787142, "learning_rate": 9.809323097691602e-06, "loss": 0.213, "step": 773 }, { "epoch": 0.11574697173620457, "grad_norm": 1.6091723360768109, "learning_rate": 9.808660101273435e-06, "loss": 0.3457, "step": 774 }, { "epoch": 0.11589651562733662, "grad_norm": 1.4626018681006754, "learning_rate": 9.807995976697267e-06, "loss": 0.1777, "step": 775 }, { "epoch": 0.11604605951846866, "grad_norm": 2.3953869334660522, "learning_rate": 9.807330724118906e-06, "loss": 0.449, "step": 776 }, { "epoch": 0.11619560340960072, "grad_norm": 1.595059614098865, "learning_rate": 9.806664343694425e-06, "loss": 0.3367, "step": 777 }, { "epoch": 0.11634514730073277, "grad_norm": 1.1067814732956414, "learning_rate": 9.805996835580169e-06, "loss": 0.1828, "step": 778 }, { "epoch": 0.11649469119186481, "grad_norm": 1.62643731894747, "learning_rate": 9.805328199932736e-06, "loss": 0.2005, "step": 779 }, { "epoch": 0.11664423508299686, "grad_norm": 1.5535907543030336, "learning_rate": 9.804658436908996e-06, "loss": 0.2635, "step": 780 }, { "epoch": 0.1167937789741289, "grad_norm": 1.0891099881486959, "learning_rate": 9.803987546666083e-06, "loss": 0.2012, "step": 781 }, { "epoch": 0.11694332286526095, "grad_norm": 1.9927493555965012, "learning_rate": 9.803315529361395e-06, "loss": 0.5297, "step": 782 }, { "epoch": 0.11709286675639301, "grad_norm": 1.6333695435696685, "learning_rate": 9.802642385152593e-06, "loss": 0.2959, "step": 783 }, { "epoch": 0.11724241064752505, "grad_norm": 2.0905230122366896, "learning_rate": 9.8019681141976e-06, "loss": 0.3662, "step": 784 }, { "epoch": 0.1173919545386571, "grad_norm": 1.3626106000772258, "learning_rate": 9.80129271665461e-06, "loss": 0.2065, "step": 785 }, { "epoch": 0.11754149842978914, "grad_norm": 1.7357589724302078, "learning_rate": 9.800616192682077e-06, "loss": 0.3269, "step": 786 }, { "epoch": 0.11769104232092119, "grad_norm": 1.3095611640218061, "learning_rate": 9.79993854243872e-06, "loss": 0.1993, "step": 787 }, { "epoch": 0.11784058621205323, "grad_norm": 2.008966146034465, "learning_rate": 9.799259766083522e-06, "loss": 0.2346, "step": 788 }, { "epoch": 0.11799013010318528, "grad_norm": 1.4166616715548845, "learning_rate": 9.798579863775733e-06, "loss": 0.2053, "step": 789 }, { "epoch": 0.11813967399431734, "grad_norm": 2.2231308106975742, "learning_rate": 9.79789883567486e-06, "loss": 0.3138, "step": 790 }, { "epoch": 0.11828921788544938, "grad_norm": 2.104337845030462, "learning_rate": 9.79721668194068e-06, "loss": 0.5896, "step": 791 }, { "epoch": 0.11843876177658143, "grad_norm": 1.3961164790172769, "learning_rate": 9.796533402733235e-06, "loss": 0.2023, "step": 792 }, { "epoch": 0.11858830566771347, "grad_norm": 1.9644410315987328, "learning_rate": 9.79584899821283e-06, "loss": 0.2389, "step": 793 }, { "epoch": 0.11873784955884552, "grad_norm": 1.911243493993425, "learning_rate": 9.795163468540028e-06, "loss": 0.2319, "step": 794 }, { "epoch": 0.11888739344997756, "grad_norm": 1.5442762653540485, "learning_rate": 9.794476813875665e-06, "loss": 0.256, "step": 795 }, { "epoch": 0.11903693734110962, "grad_norm": 1.6347185262551664, "learning_rate": 9.793789034380833e-06, "loss": 0.3659, "step": 796 }, { "epoch": 0.11918648123224167, "grad_norm": 6.93121052791821, "learning_rate": 9.793100130216895e-06, "loss": 0.3348, "step": 797 }, { "epoch": 0.11933602512337371, "grad_norm": 1.1914924746720745, "learning_rate": 9.792410101545475e-06, "loss": 0.2475, "step": 798 }, { "epoch": 0.11948556901450576, "grad_norm": 1.4728413245474197, "learning_rate": 9.791718948528457e-06, "loss": 0.3569, "step": 799 }, { "epoch": 0.1196351129056378, "grad_norm": 2.0173892018585113, "learning_rate": 9.791026671327996e-06, "loss": 0.3154, "step": 800 }, { "epoch": 0.11978465679676985, "grad_norm": 1.863844432530015, "learning_rate": 9.790333270106505e-06, "loss": 0.349, "step": 801 }, { "epoch": 0.11993420068790189, "grad_norm": 1.348620907056274, "learning_rate": 9.789638745026661e-06, "loss": 0.2553, "step": 802 }, { "epoch": 0.12008374457903395, "grad_norm": 1.7207895705367349, "learning_rate": 9.78894309625141e-06, "loss": 0.3931, "step": 803 }, { "epoch": 0.120233288470166, "grad_norm": 1.3131471894535065, "learning_rate": 9.788246323943954e-06, "loss": 0.1473, "step": 804 }, { "epoch": 0.12038283236129804, "grad_norm": 1.054327043113717, "learning_rate": 9.787548428267766e-06, "loss": 0.1945, "step": 805 }, { "epoch": 0.12053237625243009, "grad_norm": 1.9526157668969721, "learning_rate": 9.786849409386577e-06, "loss": 0.3906, "step": 806 }, { "epoch": 0.12068192014356213, "grad_norm": 1.8267497285704608, "learning_rate": 9.786149267464382e-06, "loss": 0.4193, "step": 807 }, { "epoch": 0.12083146403469418, "grad_norm": 0.8038770137897923, "learning_rate": 9.785448002665446e-06, "loss": 0.2392, "step": 808 }, { "epoch": 0.12098100792582624, "grad_norm": 1.5253624272396114, "learning_rate": 9.784745615154286e-06, "loss": 0.3366, "step": 809 }, { "epoch": 0.12113055181695828, "grad_norm": 2.093861559130543, "learning_rate": 9.784042105095694e-06, "loss": 0.4947, "step": 810 }, { "epoch": 0.12128009570809033, "grad_norm": 1.4395999315986885, "learning_rate": 9.78333747265472e-06, "loss": 0.3721, "step": 811 }, { "epoch": 0.12142963959922237, "grad_norm": 1.852326670020495, "learning_rate": 9.782631717996675e-06, "loss": 0.4779, "step": 812 }, { "epoch": 0.12157918349035442, "grad_norm": 1.6061911802246367, "learning_rate": 9.781924841287136e-06, "loss": 0.3634, "step": 813 }, { "epoch": 0.12172872738148646, "grad_norm": 1.6480313202927959, "learning_rate": 9.781216842691945e-06, "loss": 0.3486, "step": 814 }, { "epoch": 0.12187827127261851, "grad_norm": 1.7352908193060639, "learning_rate": 9.780507722377205e-06, "loss": 0.2405, "step": 815 }, { "epoch": 0.12202781516375057, "grad_norm": 1.819605505850209, "learning_rate": 9.779797480509281e-06, "loss": 0.2702, "step": 816 }, { "epoch": 0.12217735905488261, "grad_norm": 5.030925574416197, "learning_rate": 9.779086117254804e-06, "loss": 0.4802, "step": 817 }, { "epoch": 0.12232690294601466, "grad_norm": 1.4714728946101239, "learning_rate": 9.778373632780665e-06, "loss": 0.4002, "step": 818 }, { "epoch": 0.1224764468371467, "grad_norm": 1.8961195589940971, "learning_rate": 9.777660027254022e-06, "loss": 0.5022, "step": 819 }, { "epoch": 0.12262599072827875, "grad_norm": 2.1718036722230343, "learning_rate": 9.776945300842292e-06, "loss": 0.3274, "step": 820 }, { "epoch": 0.12277553461941079, "grad_norm": 1.620505499680087, "learning_rate": 9.776229453713158e-06, "loss": 0.2316, "step": 821 }, { "epoch": 0.12292507851054285, "grad_norm": 1.6978035349883904, "learning_rate": 9.775512486034564e-06, "loss": 0.3388, "step": 822 }, { "epoch": 0.1230746224016749, "grad_norm": 1.8097210824887537, "learning_rate": 9.774794397974715e-06, "loss": 0.2658, "step": 823 }, { "epoch": 0.12322416629280694, "grad_norm": 1.7832381045534218, "learning_rate": 9.774075189702085e-06, "loss": 0.236, "step": 824 }, { "epoch": 0.12337371018393899, "grad_norm": 1.5481034516154306, "learning_rate": 9.773354861385408e-06, "loss": 0.2209, "step": 825 }, { "epoch": 0.12352325407507103, "grad_norm": 1.808929914702085, "learning_rate": 9.772633413193677e-06, "loss": 0.3936, "step": 826 }, { "epoch": 0.12367279796620308, "grad_norm": 1.4632324741175244, "learning_rate": 9.771910845296151e-06, "loss": 0.1809, "step": 827 }, { "epoch": 0.12382234185733512, "grad_norm": 1.602480536861921, "learning_rate": 9.771187157862352e-06, "loss": 0.3631, "step": 828 }, { "epoch": 0.12397188574846718, "grad_norm": 1.695314807275002, "learning_rate": 9.770462351062065e-06, "loss": 0.3419, "step": 829 }, { "epoch": 0.12412142963959923, "grad_norm": 2.2542289621861262, "learning_rate": 9.769736425065333e-06, "loss": 0.4292, "step": 830 }, { "epoch": 0.12427097353073127, "grad_norm": 1.7697982755032058, "learning_rate": 9.76900938004247e-06, "loss": 0.3735, "step": 831 }, { "epoch": 0.12442051742186332, "grad_norm": 1.8120887881814032, "learning_rate": 9.768281216164045e-06, "loss": 0.3568, "step": 832 }, { "epoch": 0.12457006131299536, "grad_norm": 1.581710048140236, "learning_rate": 9.767551933600896e-06, "loss": 0.1999, "step": 833 }, { "epoch": 0.1247196052041274, "grad_norm": 1.8611636134135094, "learning_rate": 9.766821532524113e-06, "loss": 0.4111, "step": 834 }, { "epoch": 0.12486914909525945, "grad_norm": 1.3596930734620556, "learning_rate": 9.76609001310506e-06, "loss": 0.1893, "step": 835 }, { "epoch": 0.1250186929863915, "grad_norm": 1.7562268713789106, "learning_rate": 9.76535737551536e-06, "loss": 0.3948, "step": 836 }, { "epoch": 0.12516823687752354, "grad_norm": 2.0138359003020136, "learning_rate": 9.764623619926891e-06, "loss": 0.2182, "step": 837 }, { "epoch": 0.1253177807686556, "grad_norm": 1.447071144741178, "learning_rate": 9.763888746511804e-06, "loss": 0.2027, "step": 838 }, { "epoch": 0.12546732465978766, "grad_norm": 2.1652568729944734, "learning_rate": 9.763152755442504e-06, "loss": 0.6314, "step": 839 }, { "epoch": 0.1256168685509197, "grad_norm": 1.8038679466625127, "learning_rate": 9.762415646891665e-06, "loss": 0.3578, "step": 840 }, { "epoch": 0.12576641244205175, "grad_norm": 1.6058605347195138, "learning_rate": 9.761677421032218e-06, "loss": 0.4411, "step": 841 }, { "epoch": 0.12591595633318378, "grad_norm": 1.590419871100753, "learning_rate": 9.760938078037358e-06, "loss": 0.3562, "step": 842 }, { "epoch": 0.12606550022431584, "grad_norm": 2.1851801492136267, "learning_rate": 9.76019761808054e-06, "loss": 0.5822, "step": 843 }, { "epoch": 0.12621504411544787, "grad_norm": 1.5855983503039581, "learning_rate": 9.759456041335487e-06, "loss": 0.2229, "step": 844 }, { "epoch": 0.12636458800657993, "grad_norm": 1.370525319712137, "learning_rate": 9.758713347976179e-06, "loss": 0.2126, "step": 845 }, { "epoch": 0.126514131897712, "grad_norm": 6.394283226949693, "learning_rate": 9.757969538176856e-06, "loss": 0.5925, "step": 846 }, { "epoch": 0.12666367578884402, "grad_norm": 1.6599084242802136, "learning_rate": 9.757224612112026e-06, "loss": 0.2939, "step": 847 }, { "epoch": 0.12681321967997608, "grad_norm": 1.7907787465869436, "learning_rate": 9.756478569956455e-06, "loss": 0.222, "step": 848 }, { "epoch": 0.1269627635711081, "grad_norm": 2.2366122735755707, "learning_rate": 9.755731411885172e-06, "loss": 0.6684, "step": 849 }, { "epoch": 0.12711230746224017, "grad_norm": 1.2342377688986181, "learning_rate": 9.754983138073466e-06, "loss": 0.1731, "step": 850 }, { "epoch": 0.1272618513533722, "grad_norm": 1.890953555602396, "learning_rate": 9.75423374869689e-06, "loss": 0.3518, "step": 851 }, { "epoch": 0.12741139524450426, "grad_norm": 1.6475639435427634, "learning_rate": 9.75348324393126e-06, "loss": 0.2398, "step": 852 }, { "epoch": 0.12756093913563632, "grad_norm": 1.6270554683226957, "learning_rate": 9.752731623952647e-06, "loss": 0.4891, "step": 853 }, { "epoch": 0.12771048302676835, "grad_norm": 1.26855312080081, "learning_rate": 9.751978888937394e-06, "loss": 0.256, "step": 854 }, { "epoch": 0.1278600269179004, "grad_norm": 1.8185769247015318, "learning_rate": 9.751225039062096e-06, "loss": 0.4165, "step": 855 }, { "epoch": 0.12800957080903244, "grad_norm": 1.751422967721224, "learning_rate": 9.750470074503616e-06, "loss": 0.4006, "step": 856 }, { "epoch": 0.1281591147001645, "grad_norm": 1.520424463551602, "learning_rate": 9.749713995439072e-06, "loss": 0.221, "step": 857 }, { "epoch": 0.12830865859129656, "grad_norm": 1.3721869164602227, "learning_rate": 9.74895680204585e-06, "loss": 0.2902, "step": 858 }, { "epoch": 0.1284582024824286, "grad_norm": 0.9768480150555632, "learning_rate": 9.748198494501598e-06, "loss": 0.2115, "step": 859 }, { "epoch": 0.12860774637356065, "grad_norm": 1.1411458146693625, "learning_rate": 9.747439072984217e-06, "loss": 0.2657, "step": 860 }, { "epoch": 0.12875729026469268, "grad_norm": 1.351877109756975, "learning_rate": 9.746678537671876e-06, "loss": 0.1998, "step": 861 }, { "epoch": 0.12890683415582474, "grad_norm": 1.735519954859839, "learning_rate": 9.745916888743006e-06, "loss": 0.3916, "step": 862 }, { "epoch": 0.12905637804695677, "grad_norm": 1.5843585668319187, "learning_rate": 9.745154126376295e-06, "loss": 0.2412, "step": 863 }, { "epoch": 0.12920592193808883, "grad_norm": 1.2682977552751018, "learning_rate": 9.744390250750694e-06, "loss": 0.2082, "step": 864 }, { "epoch": 0.1293554658292209, "grad_norm": 1.972644277101951, "learning_rate": 9.74362526204542e-06, "loss": 0.5327, "step": 865 }, { "epoch": 0.12950500972035292, "grad_norm": 1.9379037313358354, "learning_rate": 9.74285916043994e-06, "loss": 0.5184, "step": 866 }, { "epoch": 0.12965455361148498, "grad_norm": 2.118494372996469, "learning_rate": 9.742091946113994e-06, "loss": 0.4367, "step": 867 }, { "epoch": 0.129804097502617, "grad_norm": 3.0042960877566904, "learning_rate": 9.741323619247575e-06, "loss": 0.2971, "step": 868 }, { "epoch": 0.12995364139374907, "grad_norm": 1.4822743195387478, "learning_rate": 9.740554180020944e-06, "loss": 0.2324, "step": 869 }, { "epoch": 0.1301031852848811, "grad_norm": 1.629265135056018, "learning_rate": 9.739783628614614e-06, "loss": 0.3717, "step": 870 }, { "epoch": 0.13025272917601316, "grad_norm": 2.663192450334603, "learning_rate": 9.739011965209366e-06, "loss": 0.4405, "step": 871 }, { "epoch": 0.13040227306714522, "grad_norm": 1.942071044818614, "learning_rate": 9.738239189986239e-06, "loss": 0.2141, "step": 872 }, { "epoch": 0.13055181695827725, "grad_norm": 3.5555352510097684, "learning_rate": 9.737465303126533e-06, "loss": 0.5084, "step": 873 }, { "epoch": 0.1307013608494093, "grad_norm": 131.61785973638575, "learning_rate": 9.736690304811811e-06, "loss": 0.2431, "step": 874 }, { "epoch": 0.13085090474054134, "grad_norm": 1.190310223040302, "learning_rate": 9.735914195223894e-06, "loss": 0.1586, "step": 875 }, { "epoch": 0.1310004486316734, "grad_norm": 1.8346983968963104, "learning_rate": 9.735136974544866e-06, "loss": 0.5247, "step": 876 }, { "epoch": 0.13114999252280543, "grad_norm": 1.7905067752668935, "learning_rate": 9.734358642957068e-06, "loss": 0.2645, "step": 877 }, { "epoch": 0.1312995364139375, "grad_norm": 1.9545038391601572, "learning_rate": 9.733579200643108e-06, "loss": 0.3769, "step": 878 }, { "epoch": 0.13144908030506955, "grad_norm": 2.1943279379956477, "learning_rate": 9.732798647785847e-06, "loss": 0.5142, "step": 879 }, { "epoch": 0.13159862419620158, "grad_norm": 1.8055649624971999, "learning_rate": 9.73201698456841e-06, "loss": 0.1857, "step": 880 }, { "epoch": 0.13174816808733364, "grad_norm": 1.8149442634221358, "learning_rate": 9.731234211174188e-06, "loss": 0.2233, "step": 881 }, { "epoch": 0.13189771197846567, "grad_norm": 2.080170101944024, "learning_rate": 9.73045032778682e-06, "loss": 0.3904, "step": 882 }, { "epoch": 0.13204725586959773, "grad_norm": 1.9510038015167501, "learning_rate": 9.729665334590217e-06, "loss": 0.3821, "step": 883 }, { "epoch": 0.13219679976072976, "grad_norm": 2.1650257376887545, "learning_rate": 9.728879231768547e-06, "loss": 0.2357, "step": 884 }, { "epoch": 0.13234634365186182, "grad_norm": 1.1711773735000268, "learning_rate": 9.728092019506233e-06, "loss": 0.181, "step": 885 }, { "epoch": 0.13249588754299388, "grad_norm": 1.3626230838557185, "learning_rate": 9.727303697987965e-06, "loss": 0.2283, "step": 886 }, { "epoch": 0.1326454314341259, "grad_norm": 1.9510781854659551, "learning_rate": 9.72651426739869e-06, "loss": 0.5154, "step": 887 }, { "epoch": 0.13279497532525797, "grad_norm": 0.9715289377560907, "learning_rate": 9.72572372792362e-06, "loss": 0.248, "step": 888 }, { "epoch": 0.13294451921639, "grad_norm": 2.146774938769699, "learning_rate": 9.724932079748218e-06, "loss": 0.5735, "step": 889 }, { "epoch": 0.13309406310752206, "grad_norm": 1.9362171525305378, "learning_rate": 9.724139323058213e-06, "loss": 0.4643, "step": 890 }, { "epoch": 0.13324360699865412, "grad_norm": 1.2101289634017103, "learning_rate": 9.723345458039595e-06, "loss": 0.2266, "step": 891 }, { "epoch": 0.13339315088978615, "grad_norm": 1.5975435725539564, "learning_rate": 9.722550484878612e-06, "loss": 0.2212, "step": 892 }, { "epoch": 0.1335426947809182, "grad_norm": 2.1131453338181783, "learning_rate": 9.721754403761773e-06, "loss": 0.5017, "step": 893 }, { "epoch": 0.13369223867205024, "grad_norm": 1.8229015728915987, "learning_rate": 9.720957214875846e-06, "loss": 0.3833, "step": 894 }, { "epoch": 0.1338417825631823, "grad_norm": 2.2277630697934416, "learning_rate": 9.720158918407859e-06, "loss": 0.2482, "step": 895 }, { "epoch": 0.13399132645431433, "grad_norm": 2.7493650830526413, "learning_rate": 9.719359514545097e-06, "loss": 0.3227, "step": 896 }, { "epoch": 0.1341408703454464, "grad_norm": 1.326522393176365, "learning_rate": 9.718559003475114e-06, "loss": 0.3694, "step": 897 }, { "epoch": 0.13429041423657845, "grad_norm": 2.031851235687534, "learning_rate": 9.717757385385713e-06, "loss": 0.4018, "step": 898 }, { "epoch": 0.13443995812771048, "grad_norm": 1.8621806446966467, "learning_rate": 9.716954660464962e-06, "loss": 0.4906, "step": 899 }, { "epoch": 0.13458950201884254, "grad_norm": 1.2770259017691152, "learning_rate": 9.716150828901189e-06, "loss": 0.1858, "step": 900 }, { "epoch": 0.13473904590997457, "grad_norm": 1.5913545450304718, "learning_rate": 9.71534589088298e-06, "loss": 0.197, "step": 901 }, { "epoch": 0.13488858980110663, "grad_norm": 1.2063585747949084, "learning_rate": 9.714539846599183e-06, "loss": 0.2366, "step": 902 }, { "epoch": 0.13503813369223866, "grad_norm": 2.5200659605885036, "learning_rate": 9.713732696238901e-06, "loss": 0.5999, "step": 903 }, { "epoch": 0.13518767758337072, "grad_norm": 1.9024340551353864, "learning_rate": 9.7129244399915e-06, "loss": 0.3618, "step": 904 }, { "epoch": 0.13533722147450278, "grad_norm": 2.069806998096416, "learning_rate": 9.712115078046606e-06, "loss": 0.2392, "step": 905 }, { "epoch": 0.1354867653656348, "grad_norm": 1.9190350188819136, "learning_rate": 9.711304610594104e-06, "loss": 0.4096, "step": 906 }, { "epoch": 0.13563630925676687, "grad_norm": 1.0348199460008871, "learning_rate": 9.710493037824133e-06, "loss": 0.2071, "step": 907 }, { "epoch": 0.1357858531478989, "grad_norm": 2.014537240536291, "learning_rate": 9.709680359927101e-06, "loss": 0.4374, "step": 908 }, { "epoch": 0.13593539703903096, "grad_norm": 1.7241079783150106, "learning_rate": 9.708866577093665e-06, "loss": 0.4161, "step": 909 }, { "epoch": 0.136084940930163, "grad_norm": 1.3280448342419884, "learning_rate": 9.70805168951475e-06, "loss": 0.1967, "step": 910 }, { "epoch": 0.13623448482129505, "grad_norm": 1.6364992809413539, "learning_rate": 9.707235697381536e-06, "loss": 0.3394, "step": 911 }, { "epoch": 0.1363840287124271, "grad_norm": 1.1211253304635729, "learning_rate": 9.706418600885462e-06, "loss": 0.3542, "step": 912 }, { "epoch": 0.13653357260355914, "grad_norm": 4.417441150249539, "learning_rate": 9.705600400218227e-06, "loss": 0.2605, "step": 913 }, { "epoch": 0.1366831164946912, "grad_norm": 1.6849430545358892, "learning_rate": 9.704781095571788e-06, "loss": 0.3434, "step": 914 }, { "epoch": 0.13683266038582323, "grad_norm": 1.0419590452262997, "learning_rate": 9.703960687138363e-06, "loss": 0.1759, "step": 915 }, { "epoch": 0.1369822042769553, "grad_norm": 2.3605687929632286, "learning_rate": 9.703139175110425e-06, "loss": 0.6175, "step": 916 }, { "epoch": 0.13713174816808735, "grad_norm": 1.6017722806543409, "learning_rate": 9.702316559680714e-06, "loss": 0.2687, "step": 917 }, { "epoch": 0.13728129205921938, "grad_norm": 1.3442020598442603, "learning_rate": 9.701492841042217e-06, "loss": 0.3801, "step": 918 }, { "epoch": 0.13743083595035144, "grad_norm": 1.6758219573938795, "learning_rate": 9.70066801938819e-06, "loss": 0.3869, "step": 919 }, { "epoch": 0.13758037984148347, "grad_norm": 1.787015495544375, "learning_rate": 9.699842094912146e-06, "loss": 0.319, "step": 920 }, { "epoch": 0.13772992373261553, "grad_norm": 1.8015526879631494, "learning_rate": 9.699015067807851e-06, "loss": 0.493, "step": 921 }, { "epoch": 0.13787946762374756, "grad_norm": 0.9887387021286004, "learning_rate": 9.698186938269334e-06, "loss": 0.1724, "step": 922 }, { "epoch": 0.13802901151487962, "grad_norm": 1.7253102296559673, "learning_rate": 9.697357706490885e-06, "loss": 0.5363, "step": 923 }, { "epoch": 0.13817855540601168, "grad_norm": 1.5558864293295054, "learning_rate": 9.696527372667046e-06, "loss": 0.2863, "step": 924 }, { "epoch": 0.1383280992971437, "grad_norm": 1.8210322672031793, "learning_rate": 9.695695936992624e-06, "loss": 0.4107, "step": 925 }, { "epoch": 0.13847764318827577, "grad_norm": 1.6117992353983686, "learning_rate": 9.69486339966268e-06, "loss": 0.2162, "step": 926 }, { "epoch": 0.1386271870794078, "grad_norm": 1.7018476473220923, "learning_rate": 9.694029760872539e-06, "loss": 0.3609, "step": 927 }, { "epoch": 0.13877673097053986, "grad_norm": 1.5673565467226127, "learning_rate": 9.693195020817776e-06, "loss": 0.3164, "step": 928 }, { "epoch": 0.1389262748616719, "grad_norm": 1.6536061805316273, "learning_rate": 9.69235917969423e-06, "loss": 0.5039, "step": 929 }, { "epoch": 0.13907581875280395, "grad_norm": 1.4953772716061529, "learning_rate": 9.691522237698001e-06, "loss": 0.2073, "step": 930 }, { "epoch": 0.139225362643936, "grad_norm": 1.0372555974478648, "learning_rate": 9.69068419502544e-06, "loss": 0.1904, "step": 931 }, { "epoch": 0.13937490653506804, "grad_norm": 1.2803091164977878, "learning_rate": 9.689845051873161e-06, "loss": 0.2085, "step": 932 }, { "epoch": 0.1395244504262001, "grad_norm": 1.4758036204854348, "learning_rate": 9.689004808438036e-06, "loss": 0.2012, "step": 933 }, { "epoch": 0.13967399431733213, "grad_norm": 1.6660973952553224, "learning_rate": 9.688163464917191e-06, "loss": 0.3286, "step": 934 }, { "epoch": 0.1398235382084642, "grad_norm": 1.1549059427655604, "learning_rate": 9.687321021508018e-06, "loss": 0.2267, "step": 935 }, { "epoch": 0.13997308209959622, "grad_norm": 1.272574916603474, "learning_rate": 9.686477478408159e-06, "loss": 0.1829, "step": 936 }, { "epoch": 0.14012262599072828, "grad_norm": 2.754918857840336, "learning_rate": 9.685632835815519e-06, "loss": 0.4481, "step": 937 }, { "epoch": 0.14027216988186034, "grad_norm": 1.1790985103907, "learning_rate": 9.684787093928256e-06, "loss": 0.1814, "step": 938 }, { "epoch": 0.14042171377299237, "grad_norm": 1.011660485817637, "learning_rate": 9.683940252944794e-06, "loss": 0.1863, "step": 939 }, { "epoch": 0.14057125766412443, "grad_norm": 1.3525074345715755, "learning_rate": 9.68309231306381e-06, "loss": 0.2084, "step": 940 }, { "epoch": 0.14072080155525646, "grad_norm": 1.6719478297190948, "learning_rate": 9.682243274484231e-06, "loss": 0.3459, "step": 941 }, { "epoch": 0.14087034544638852, "grad_norm": 1.5225980842484328, "learning_rate": 9.681393137405259e-06, "loss": 0.3082, "step": 942 }, { "epoch": 0.14101988933752055, "grad_norm": 1.4403779528104341, "learning_rate": 9.680541902026342e-06, "loss": 0.1952, "step": 943 }, { "epoch": 0.1411694332286526, "grad_norm": 1.7704358094140293, "learning_rate": 9.679689568547184e-06, "loss": 0.2925, "step": 944 }, { "epoch": 0.14131897711978467, "grad_norm": 1.8325825494125016, "learning_rate": 9.678836137167753e-06, "loss": 0.2354, "step": 945 }, { "epoch": 0.1414685210109167, "grad_norm": 8.228207444568621, "learning_rate": 9.677981608088274e-06, "loss": 0.1945, "step": 946 }, { "epoch": 0.14161806490204876, "grad_norm": 1.9420821742118657, "learning_rate": 9.677125981509227e-06, "loss": 0.3745, "step": 947 }, { "epoch": 0.1417676087931808, "grad_norm": 1.4287526091354055, "learning_rate": 9.676269257631348e-06, "loss": 0.159, "step": 948 }, { "epoch": 0.14191715268431285, "grad_norm": 2.2979804382628726, "learning_rate": 9.675411436655636e-06, "loss": 0.6715, "step": 949 }, { "epoch": 0.1420666965754449, "grad_norm": 1.2906292785961546, "learning_rate": 9.67455251878334e-06, "loss": 0.1863, "step": 950 }, { "epoch": 0.14221624046657694, "grad_norm": 1.542985394545003, "learning_rate": 9.673692504215974e-06, "loss": 0.276, "step": 951 }, { "epoch": 0.142365784357709, "grad_norm": 2.076155049712511, "learning_rate": 9.672831393155304e-06, "loss": 0.2878, "step": 952 }, { "epoch": 0.14251532824884103, "grad_norm": 3.2389836303118265, "learning_rate": 9.671969185803357e-06, "loss": 0.4539, "step": 953 }, { "epoch": 0.1426648721399731, "grad_norm": 0.8145950923200616, "learning_rate": 9.671105882362412e-06, "loss": 0.1916, "step": 954 }, { "epoch": 0.14281441603110512, "grad_norm": 1.7529614161433102, "learning_rate": 9.67024148303501e-06, "loss": 0.3852, "step": 955 }, { "epoch": 0.14296395992223718, "grad_norm": 1.6140653815156045, "learning_rate": 9.669375988023947e-06, "loss": 0.3317, "step": 956 }, { "epoch": 0.14311350381336924, "grad_norm": 1.5917328640289674, "learning_rate": 9.668509397532278e-06, "loss": 0.205, "step": 957 }, { "epoch": 0.14326304770450127, "grad_norm": 1.9053910041720175, "learning_rate": 9.667641711763311e-06, "loss": 0.2016, "step": 958 }, { "epoch": 0.14341259159563333, "grad_norm": 1.2223818916012819, "learning_rate": 9.666772930920614e-06, "loss": 0.1818, "step": 959 }, { "epoch": 0.14356213548676536, "grad_norm": 1.4130639929342779, "learning_rate": 9.665903055208013e-06, "loss": 0.1776, "step": 960 }, { "epoch": 0.14371167937789742, "grad_norm": 2.515402250912616, "learning_rate": 9.665032084829588e-06, "loss": 0.7429, "step": 961 }, { "epoch": 0.14386122326902945, "grad_norm": 2.118211041321287, "learning_rate": 9.66416001998968e-06, "loss": 0.5489, "step": 962 }, { "epoch": 0.1440107671601615, "grad_norm": 1.6323921378905693, "learning_rate": 9.663286860892877e-06, "loss": 0.3446, "step": 963 }, { "epoch": 0.14416031105129357, "grad_norm": 2.4057165427715335, "learning_rate": 9.662412607744036e-06, "loss": 0.2152, "step": 964 }, { "epoch": 0.1443098549424256, "grad_norm": 1.5942384074329368, "learning_rate": 9.661537260748264e-06, "loss": 0.3746, "step": 965 }, { "epoch": 0.14445939883355766, "grad_norm": 1.3815463338875527, "learning_rate": 9.660660820110926e-06, "loss": 0.2255, "step": 966 }, { "epoch": 0.1446089427246897, "grad_norm": 2.071453535788066, "learning_rate": 9.659783286037643e-06, "loss": 0.5075, "step": 967 }, { "epoch": 0.14475848661582175, "grad_norm": 1.3265840791490535, "learning_rate": 9.658904658734293e-06, "loss": 0.2295, "step": 968 }, { "epoch": 0.14490803050695378, "grad_norm": 1.5352887928749521, "learning_rate": 9.658024938407011e-06, "loss": 0.3484, "step": 969 }, { "epoch": 0.14505757439808584, "grad_norm": 1.5831033158953907, "learning_rate": 9.657144125262186e-06, "loss": 0.4039, "step": 970 }, { "epoch": 0.1452071182892179, "grad_norm": 1.6289190913913172, "learning_rate": 9.65626221950647e-06, "loss": 0.2471, "step": 971 }, { "epoch": 0.14535666218034993, "grad_norm": 1.6042830876012686, "learning_rate": 9.655379221346758e-06, "loss": 0.4886, "step": 972 }, { "epoch": 0.14550620607148199, "grad_norm": 1.2790362471165744, "learning_rate": 9.654495130990218e-06, "loss": 0.2065, "step": 973 }, { "epoch": 0.14565574996261402, "grad_norm": 1.72777953773445, "learning_rate": 9.653609948644263e-06, "loss": 0.4006, "step": 974 }, { "epoch": 0.14580529385374608, "grad_norm": 1.7629618868465593, "learning_rate": 9.652723674516566e-06, "loss": 0.2142, "step": 975 }, { "epoch": 0.14595483774487814, "grad_norm": 2.0277477662995174, "learning_rate": 9.651836308815055e-06, "loss": 0.5248, "step": 976 }, { "epoch": 0.14610438163601017, "grad_norm": 1.972708159415252, "learning_rate": 9.650947851747913e-06, "loss": 0.6236, "step": 977 }, { "epoch": 0.14625392552714223, "grad_norm": 1.760683770673202, "learning_rate": 9.650058303523583e-06, "loss": 0.3418, "step": 978 }, { "epoch": 0.14640346941827426, "grad_norm": 1.8429433949445744, "learning_rate": 9.649167664350762e-06, "loss": 0.475, "step": 979 }, { "epoch": 0.14655301330940632, "grad_norm": 1.9248727992532182, "learning_rate": 9.6482759344384e-06, "loss": 0.3368, "step": 980 }, { "epoch": 0.14670255720053835, "grad_norm": 1.9573379093599792, "learning_rate": 9.647383113995707e-06, "loss": 0.4203, "step": 981 }, { "epoch": 0.1468521010916704, "grad_norm": 2.7789135237612803, "learning_rate": 9.646489203232145e-06, "loss": 0.4581, "step": 982 }, { "epoch": 0.14700164498280247, "grad_norm": 2.5119171064254724, "learning_rate": 9.645594202357438e-06, "loss": 0.8202, "step": 983 }, { "epoch": 0.1471511888739345, "grad_norm": 1.4269567118604072, "learning_rate": 9.644698111581562e-06, "loss": 0.1954, "step": 984 }, { "epoch": 0.14730073276506656, "grad_norm": 1.5211432396508893, "learning_rate": 9.643800931114742e-06, "loss": 0.207, "step": 985 }, { "epoch": 0.1474502766561986, "grad_norm": 1.607901993958729, "learning_rate": 9.642902661167472e-06, "loss": 0.3046, "step": 986 }, { "epoch": 0.14759982054733065, "grad_norm": 2.0792720647718776, "learning_rate": 9.642003301950491e-06, "loss": 0.4314, "step": 987 }, { "epoch": 0.14774936443846268, "grad_norm": 1.6967562403841654, "learning_rate": 9.641102853674799e-06, "loss": 0.2142, "step": 988 }, { "epoch": 0.14789890832959474, "grad_norm": 1.8666018241841429, "learning_rate": 9.640201316551651e-06, "loss": 0.4817, "step": 989 }, { "epoch": 0.1480484522207268, "grad_norm": 1.6231253521213436, "learning_rate": 9.639298690792554e-06, "loss": 0.304, "step": 990 }, { "epoch": 0.14819799611185883, "grad_norm": 0.9588780548142521, "learning_rate": 9.638394976609274e-06, "loss": 0.1709, "step": 991 }, { "epoch": 0.14834754000299089, "grad_norm": 1.7153802262812925, "learning_rate": 9.637490174213828e-06, "loss": 0.1959, "step": 992 }, { "epoch": 0.14849708389412292, "grad_norm": 1.8524843327507126, "learning_rate": 9.636584283818496e-06, "loss": 0.3957, "step": 993 }, { "epoch": 0.14864662778525498, "grad_norm": 1.8045411979525945, "learning_rate": 9.635677305635807e-06, "loss": 0.2565, "step": 994 }, { "epoch": 0.148796171676387, "grad_norm": 2.1506980932575175, "learning_rate": 9.634769239878545e-06, "loss": 0.3777, "step": 995 }, { "epoch": 0.14894571556751907, "grad_norm": 2.1465696781275563, "learning_rate": 9.633860086759753e-06, "loss": 0.6056, "step": 996 }, { "epoch": 0.14909525945865112, "grad_norm": 1.3341555017873934, "learning_rate": 9.632949846492728e-06, "loss": 0.2219, "step": 997 }, { "epoch": 0.14924480334978316, "grad_norm": 1.2276798431588063, "learning_rate": 9.632038519291017e-06, "loss": 0.2074, "step": 998 }, { "epoch": 0.14939434724091522, "grad_norm": 1.53279693471178, "learning_rate": 9.63112610536843e-06, "loss": 0.4373, "step": 999 }, { "epoch": 0.14954389113204725, "grad_norm": 1.770529951492653, "learning_rate": 9.630212604939026e-06, "loss": 0.2495, "step": 1000 }, { "epoch": 0.1496934350231793, "grad_norm": 1.6948398115693124, "learning_rate": 9.629298018217123e-06, "loss": 0.467, "step": 1001 }, { "epoch": 0.14984297891431134, "grad_norm": 2.0292952301635534, "learning_rate": 9.628382345417291e-06, "loss": 0.5509, "step": 1002 }, { "epoch": 0.1499925228054434, "grad_norm": 2.4704877435341186, "learning_rate": 9.627465586754354e-06, "loss": 0.2214, "step": 1003 }, { "epoch": 0.15014206669657545, "grad_norm": 1.6257900509098846, "learning_rate": 9.626547742443394e-06, "loss": 0.3365, "step": 1004 }, { "epoch": 0.15029161058770749, "grad_norm": 1.8331565532381362, "learning_rate": 9.625628812699747e-06, "loss": 0.4435, "step": 1005 }, { "epoch": 0.15044115447883954, "grad_norm": 2.977053464112802, "learning_rate": 9.624708797739002e-06, "loss": 0.3177, "step": 1006 }, { "epoch": 0.15059069836997158, "grad_norm": 1.756417225108672, "learning_rate": 9.623787697777001e-06, "loss": 0.3878, "step": 1007 }, { "epoch": 0.15074024226110364, "grad_norm": 1.6276234348428167, "learning_rate": 9.622865513029846e-06, "loss": 0.1901, "step": 1008 }, { "epoch": 0.1508897861522357, "grad_norm": 3.3786401544653253, "learning_rate": 9.62194224371389e-06, "loss": 0.2454, "step": 1009 }, { "epoch": 0.15103933004336773, "grad_norm": 2.0516682586662456, "learning_rate": 9.621017890045739e-06, "loss": 0.7163, "step": 1010 }, { "epoch": 0.15118887393449978, "grad_norm": 1.0885572394332461, "learning_rate": 9.620092452242257e-06, "loss": 0.2232, "step": 1011 }, { "epoch": 0.15133841782563182, "grad_norm": 1.287954715641653, "learning_rate": 9.61916593052056e-06, "loss": 0.2124, "step": 1012 }, { "epoch": 0.15148796171676387, "grad_norm": 2.089649291216167, "learning_rate": 9.618238325098021e-06, "loss": 0.5129, "step": 1013 }, { "epoch": 0.1516375056078959, "grad_norm": 1.512994828933595, "learning_rate": 9.617309636192262e-06, "loss": 0.2986, "step": 1014 }, { "epoch": 0.15178704949902797, "grad_norm": 1.6901350993268085, "learning_rate": 9.616379864021163e-06, "loss": 0.1893, "step": 1015 }, { "epoch": 0.15193659339016002, "grad_norm": 1.6240769043159409, "learning_rate": 9.615449008802858e-06, "loss": 0.1984, "step": 1016 }, { "epoch": 0.15208613728129206, "grad_norm": 2.2110380817085815, "learning_rate": 9.614517070755736e-06, "loss": 0.4573, "step": 1017 }, { "epoch": 0.15223568117242411, "grad_norm": 2.3590384489615452, "learning_rate": 9.613584050098436e-06, "loss": 0.536, "step": 1018 }, { "epoch": 0.15238522506355615, "grad_norm": 1.1021064926756596, "learning_rate": 9.612649947049856e-06, "loss": 0.224, "step": 1019 }, { "epoch": 0.1525347689546882, "grad_norm": 1.73759279762283, "learning_rate": 9.611714761829146e-06, "loss": 0.2581, "step": 1020 }, { "epoch": 0.15268431284582024, "grad_norm": 1.9739121848543864, "learning_rate": 9.610778494655706e-06, "loss": 0.2116, "step": 1021 }, { "epoch": 0.1528338567369523, "grad_norm": 1.7608676639305771, "learning_rate": 9.609841145749196e-06, "loss": 0.2343, "step": 1022 }, { "epoch": 0.15298340062808435, "grad_norm": 1.8973842729049986, "learning_rate": 9.608902715329527e-06, "loss": 0.416, "step": 1023 }, { "epoch": 0.15313294451921639, "grad_norm": 1.232118282013805, "learning_rate": 9.607963203616862e-06, "loss": 0.2428, "step": 1024 }, { "epoch": 0.15328248841034844, "grad_norm": 1.65077206870404, "learning_rate": 9.607022610831623e-06, "loss": 0.395, "step": 1025 }, { "epoch": 0.15343203230148048, "grad_norm": 1.8519962462671438, "learning_rate": 9.606080937194478e-06, "loss": 0.3225, "step": 1026 }, { "epoch": 0.15358157619261253, "grad_norm": 1.5406605876000279, "learning_rate": 9.605138182926355e-06, "loss": 0.1962, "step": 1027 }, { "epoch": 0.15373112008374457, "grad_norm": 1.68724090741763, "learning_rate": 9.604194348248432e-06, "loss": 0.3412, "step": 1028 }, { "epoch": 0.15388066397487662, "grad_norm": 1.3196225832605089, "learning_rate": 9.603249433382145e-06, "loss": 0.203, "step": 1029 }, { "epoch": 0.15403020786600868, "grad_norm": 1.4552647720547889, "learning_rate": 9.602303438549177e-06, "loss": 0.2683, "step": 1030 }, { "epoch": 0.15417975175714072, "grad_norm": 1.9356496958747662, "learning_rate": 9.601356363971467e-06, "loss": 0.4085, "step": 1031 }, { "epoch": 0.15432929564827277, "grad_norm": 1.5727583829762162, "learning_rate": 9.60040820987121e-06, "loss": 0.2702, "step": 1032 }, { "epoch": 0.1544788395394048, "grad_norm": 1.9130911912208222, "learning_rate": 9.59945897647085e-06, "loss": 0.3831, "step": 1033 }, { "epoch": 0.15462838343053686, "grad_norm": 1.8240341076741502, "learning_rate": 9.59850866399309e-06, "loss": 0.4938, "step": 1034 }, { "epoch": 0.15477792732166892, "grad_norm": 3.1510439843833953, "learning_rate": 9.597557272660878e-06, "loss": 0.4082, "step": 1035 }, { "epoch": 0.15492747121280095, "grad_norm": 2.7952488908518998, "learning_rate": 9.596604802697422e-06, "loss": 1.0219, "step": 1036 }, { "epoch": 0.155077015103933, "grad_norm": 1.5968867223126475, "learning_rate": 9.595651254326179e-06, "loss": 0.3575, "step": 1037 }, { "epoch": 0.15522655899506504, "grad_norm": 1.7270374421609462, "learning_rate": 9.594696627770863e-06, "loss": 0.4184, "step": 1038 }, { "epoch": 0.1553761028861971, "grad_norm": 1.623026584906972, "learning_rate": 9.593740923255437e-06, "loss": 0.3364, "step": 1039 }, { "epoch": 0.15552564677732914, "grad_norm": 1.724702934068898, "learning_rate": 9.592784141004118e-06, "loss": 0.5197, "step": 1040 }, { "epoch": 0.1556751906684612, "grad_norm": 1.2087417414530044, "learning_rate": 9.591826281241379e-06, "loss": 0.3232, "step": 1041 }, { "epoch": 0.15582473455959325, "grad_norm": 2.0110371073704716, "learning_rate": 9.590867344191941e-06, "loss": 0.4617, "step": 1042 }, { "epoch": 0.15597427845072528, "grad_norm": 2.300355596627081, "learning_rate": 9.58990733008078e-06, "loss": 0.3483, "step": 1043 }, { "epoch": 0.15612382234185734, "grad_norm": 1.9228199791045963, "learning_rate": 9.588946239133123e-06, "loss": 0.4723, "step": 1044 }, { "epoch": 0.15627336623298937, "grad_norm": 1.3725700472934328, "learning_rate": 9.587984071574455e-06, "loss": 0.212, "step": 1045 }, { "epoch": 0.15642291012412143, "grad_norm": 1.9304969649682522, "learning_rate": 9.587020827630507e-06, "loss": 0.2317, "step": 1046 }, { "epoch": 0.15657245401525346, "grad_norm": 1.6204729661048964, "learning_rate": 9.586056507527266e-06, "loss": 0.2135, "step": 1047 }, { "epoch": 0.15672199790638552, "grad_norm": 1.2120441425361188, "learning_rate": 9.58509111149097e-06, "loss": 0.2785, "step": 1048 }, { "epoch": 0.15687154179751758, "grad_norm": 2.1856010368938135, "learning_rate": 9.584124639748114e-06, "loss": 0.4117, "step": 1049 }, { "epoch": 0.15702108568864961, "grad_norm": 1.4004860796245104, "learning_rate": 9.583157092525435e-06, "loss": 0.375, "step": 1050 }, { "epoch": 0.15717062957978167, "grad_norm": 1.1728395243844014, "learning_rate": 9.582188470049935e-06, "loss": 0.2286, "step": 1051 }, { "epoch": 0.1573201734709137, "grad_norm": 1.3417244382182179, "learning_rate": 9.58121877254886e-06, "loss": 0.2105, "step": 1052 }, { "epoch": 0.15746971736204576, "grad_norm": 1.3092141915312672, "learning_rate": 9.580248000249709e-06, "loss": 0.253, "step": 1053 }, { "epoch": 0.1576192612531778, "grad_norm": 0.8768020427524035, "learning_rate": 9.579276153380234e-06, "loss": 0.199, "step": 1054 }, { "epoch": 0.15776880514430985, "grad_norm": 1.8273743204900599, "learning_rate": 9.578303232168442e-06, "loss": 0.5377, "step": 1055 }, { "epoch": 0.1579183490354419, "grad_norm": 1.4617178204858172, "learning_rate": 9.57732923684259e-06, "loss": 0.2318, "step": 1056 }, { "epoch": 0.15806789292657394, "grad_norm": 1.7897112328181541, "learning_rate": 9.576354167631186e-06, "loss": 0.2312, "step": 1057 }, { "epoch": 0.158217436817706, "grad_norm": 2.055748147218842, "learning_rate": 9.575378024762991e-06, "loss": 0.208, "step": 1058 }, { "epoch": 0.15836698070883803, "grad_norm": 1.9825123132080376, "learning_rate": 9.574400808467015e-06, "loss": 0.4415, "step": 1059 }, { "epoch": 0.1585165245999701, "grad_norm": 1.203644734681587, "learning_rate": 9.573422518972524e-06, "loss": 0.1651, "step": 1060 }, { "epoch": 0.15866606849110212, "grad_norm": 2.143121544386558, "learning_rate": 9.572443156509035e-06, "loss": 0.2813, "step": 1061 }, { "epoch": 0.15881561238223418, "grad_norm": 1.7645898839865752, "learning_rate": 9.571462721306315e-06, "loss": 0.314, "step": 1062 }, { "epoch": 0.15896515627336624, "grad_norm": 1.6678516650591295, "learning_rate": 9.570481213594385e-06, "loss": 0.289, "step": 1063 }, { "epoch": 0.15911470016449827, "grad_norm": 1.2850170647115653, "learning_rate": 9.569498633603513e-06, "loss": 0.2024, "step": 1064 }, { "epoch": 0.15926424405563033, "grad_norm": 1.814229267711803, "learning_rate": 9.568514981564226e-06, "loss": 0.4606, "step": 1065 }, { "epoch": 0.15941378794676236, "grad_norm": 1.243462382180521, "learning_rate": 9.567530257707294e-06, "loss": 0.204, "step": 1066 }, { "epoch": 0.15956333183789442, "grad_norm": 1.2909371933651599, "learning_rate": 9.566544462263744e-06, "loss": 0.2209, "step": 1067 }, { "epoch": 0.15971287572902648, "grad_norm": 1.6817484277248902, "learning_rate": 9.565557595464854e-06, "loss": 0.2266, "step": 1068 }, { "epoch": 0.1598624196201585, "grad_norm": 1.6783737367668008, "learning_rate": 9.564569657542153e-06, "loss": 0.3126, "step": 1069 }, { "epoch": 0.16001196351129057, "grad_norm": 1.7121174458116146, "learning_rate": 9.56358064872742e-06, "loss": 0.4938, "step": 1070 }, { "epoch": 0.1601615074024226, "grad_norm": 1.4168299418446093, "learning_rate": 9.562590569252685e-06, "loss": 0.1859, "step": 1071 }, { "epoch": 0.16031105129355466, "grad_norm": 1.1574720210896832, "learning_rate": 9.561599419350233e-06, "loss": 0.2076, "step": 1072 }, { "epoch": 0.1604605951846867, "grad_norm": 1.88010212201121, "learning_rate": 9.560607199252594e-06, "loss": 0.489, "step": 1073 }, { "epoch": 0.16061013907581875, "grad_norm": 1.7557755514977327, "learning_rate": 9.559613909192553e-06, "loss": 0.2593, "step": 1074 }, { "epoch": 0.1607596829669508, "grad_norm": 2.0354173981462864, "learning_rate": 9.558619549403148e-06, "loss": 0.266, "step": 1075 }, { "epoch": 0.16090922685808284, "grad_norm": 1.984771028183608, "learning_rate": 9.557624120117663e-06, "loss": 0.1823, "step": 1076 }, { "epoch": 0.1610587707492149, "grad_norm": 1.2435802855207878, "learning_rate": 9.556627621569636e-06, "loss": 0.2133, "step": 1077 }, { "epoch": 0.16120831464034693, "grad_norm": 2.1628484109014603, "learning_rate": 9.555630053992854e-06, "loss": 0.6313, "step": 1078 }, { "epoch": 0.161357858531479, "grad_norm": 1.3025632600056798, "learning_rate": 9.554631417621358e-06, "loss": 0.1861, "step": 1079 }, { "epoch": 0.16150740242261102, "grad_norm": 1.2033276727813813, "learning_rate": 9.553631712689437e-06, "loss": 0.1411, "step": 1080 }, { "epoch": 0.16165694631374308, "grad_norm": 1.7192459951334418, "learning_rate": 9.55263093943163e-06, "loss": 0.3415, "step": 1081 }, { "epoch": 0.16180649020487514, "grad_norm": 2.195446146090599, "learning_rate": 9.55162909808273e-06, "loss": 0.5552, "step": 1082 }, { "epoch": 0.16195603409600717, "grad_norm": 1.2518961650965623, "learning_rate": 9.550626188877779e-06, "loss": 0.1376, "step": 1083 }, { "epoch": 0.16210557798713923, "grad_norm": 1.621475542449237, "learning_rate": 9.549622212052067e-06, "loss": 0.1918, "step": 1084 }, { "epoch": 0.16225512187827126, "grad_norm": 1.6022576009951304, "learning_rate": 9.548617167841139e-06, "loss": 0.4754, "step": 1085 }, { "epoch": 0.16240466576940332, "grad_norm": 2.140796057625746, "learning_rate": 9.547611056480785e-06, "loss": 0.3365, "step": 1086 }, { "epoch": 0.16255420966053535, "grad_norm": 2.1352436057574415, "learning_rate": 9.54660387820705e-06, "loss": 0.2548, "step": 1087 }, { "epoch": 0.1627037535516674, "grad_norm": 1.533857631085752, "learning_rate": 9.54559563325623e-06, "loss": 0.2295, "step": 1088 }, { "epoch": 0.16285329744279947, "grad_norm": 1.3212471086223223, "learning_rate": 9.544586321864865e-06, "loss": 0.2047, "step": 1089 }, { "epoch": 0.1630028413339315, "grad_norm": 1.0794095044455165, "learning_rate": 9.543575944269752e-06, "loss": 0.2824, "step": 1090 }, { "epoch": 0.16315238522506356, "grad_norm": 1.7780256493338729, "learning_rate": 9.542564500707934e-06, "loss": 0.3763, "step": 1091 }, { "epoch": 0.1633019291161956, "grad_norm": 1.1347608812400427, "learning_rate": 9.541551991416704e-06, "loss": 0.2046, "step": 1092 }, { "epoch": 0.16345147300732765, "grad_norm": 1.7298153575861623, "learning_rate": 9.540538416633611e-06, "loss": 0.3394, "step": 1093 }, { "epoch": 0.1636010168984597, "grad_norm": 1.822850678248627, "learning_rate": 9.539523776596446e-06, "loss": 0.4254, "step": 1094 }, { "epoch": 0.16375056078959174, "grad_norm": 1.762563084427053, "learning_rate": 9.538508071543253e-06, "loss": 0.4083, "step": 1095 }, { "epoch": 0.1639001046807238, "grad_norm": 1.653109483519857, "learning_rate": 9.537491301712328e-06, "loss": 0.3344, "step": 1096 }, { "epoch": 0.16404964857185583, "grad_norm": 2.0381344440794713, "learning_rate": 9.536473467342213e-06, "loss": 0.6724, "step": 1097 }, { "epoch": 0.1641991924629879, "grad_norm": 1.9754898794658875, "learning_rate": 9.535454568671705e-06, "loss": 0.5666, "step": 1098 }, { "epoch": 0.16434873635411992, "grad_norm": 2.3447226731261344, "learning_rate": 9.534434605939845e-06, "loss": 0.3964, "step": 1099 }, { "epoch": 0.16449828024525198, "grad_norm": 1.773763244389934, "learning_rate": 9.533413579385925e-06, "loss": 0.3883, "step": 1100 }, { "epoch": 0.16464782413638404, "grad_norm": 1.8563640201434743, "learning_rate": 9.532391489249489e-06, "loss": 0.5295, "step": 1101 }, { "epoch": 0.16479736802751607, "grad_norm": 1.610026424473214, "learning_rate": 9.53136833577033e-06, "loss": 0.2045, "step": 1102 }, { "epoch": 0.16494691191864813, "grad_norm": 1.1595609057345122, "learning_rate": 9.530344119188489e-06, "loss": 0.196, "step": 1103 }, { "epoch": 0.16509645580978016, "grad_norm": 1.380807492186586, "learning_rate": 9.529318839744257e-06, "loss": 0.1665, "step": 1104 }, { "epoch": 0.16524599970091222, "grad_norm": 1.224405894645481, "learning_rate": 9.528292497678175e-06, "loss": 0.1904, "step": 1105 }, { "epoch": 0.16539554359204425, "grad_norm": 1.6321539600343615, "learning_rate": 9.527265093231031e-06, "loss": 0.3776, "step": 1106 }, { "epoch": 0.1655450874831763, "grad_norm": 1.8242961573055154, "learning_rate": 9.526236626643867e-06, "loss": 0.2581, "step": 1107 }, { "epoch": 0.16569463137430837, "grad_norm": 1.3475980511137085, "learning_rate": 9.525207098157968e-06, "loss": 0.3415, "step": 1108 }, { "epoch": 0.1658441752654404, "grad_norm": 1.9882440245015858, "learning_rate": 9.524176508014873e-06, "loss": 0.5573, "step": 1109 }, { "epoch": 0.16599371915657246, "grad_norm": 1.5816342314828584, "learning_rate": 9.523144856456367e-06, "loss": 0.2691, "step": 1110 }, { "epoch": 0.1661432630477045, "grad_norm": 1.3850164199795663, "learning_rate": 9.522112143724489e-06, "loss": 0.3378, "step": 1111 }, { "epoch": 0.16629280693883655, "grad_norm": 1.4909627841603759, "learning_rate": 9.52107837006152e-06, "loss": 0.4023, "step": 1112 }, { "epoch": 0.16644235082996858, "grad_norm": 1.338411558324243, "learning_rate": 9.520043535709994e-06, "loss": 0.225, "step": 1113 }, { "epoch": 0.16659189472110064, "grad_norm": 1.5004852001929436, "learning_rate": 9.519007640912691e-06, "loss": 0.367, "step": 1114 }, { "epoch": 0.1667414386122327, "grad_norm": 1.3522433759924148, "learning_rate": 9.517970685912648e-06, "loss": 0.3267, "step": 1115 }, { "epoch": 0.16689098250336473, "grad_norm": 1.2771009278550414, "learning_rate": 9.516932670953137e-06, "loss": 0.2343, "step": 1116 }, { "epoch": 0.1670405263944968, "grad_norm": 1.2558718520533958, "learning_rate": 9.515893596277692e-06, "loss": 0.2146, "step": 1117 }, { "epoch": 0.16719007028562882, "grad_norm": 1.747427684722942, "learning_rate": 9.514853462130087e-06, "loss": 0.3897, "step": 1118 }, { "epoch": 0.16733961417676088, "grad_norm": 1.350710739291567, "learning_rate": 9.51381226875435e-06, "loss": 0.1962, "step": 1119 }, { "epoch": 0.16748915806789294, "grad_norm": 1.7048566484317351, "learning_rate": 9.512770016394754e-06, "loss": 0.4704, "step": 1120 }, { "epoch": 0.16763870195902497, "grad_norm": 1.1820262059156426, "learning_rate": 9.51172670529582e-06, "loss": 0.2269, "step": 1121 }, { "epoch": 0.16778824585015703, "grad_norm": 1.4968174934291172, "learning_rate": 9.51068233570232e-06, "loss": 0.2122, "step": 1122 }, { "epoch": 0.16793778974128906, "grad_norm": 1.7978736217167008, "learning_rate": 9.50963690785927e-06, "loss": 0.4423, "step": 1123 }, { "epoch": 0.16808733363242112, "grad_norm": 1.9086737935680094, "learning_rate": 9.508590422011943e-06, "loss": 0.5837, "step": 1124 }, { "epoch": 0.16823687752355315, "grad_norm": 1.809798628211579, "learning_rate": 9.507542878405853e-06, "loss": 0.4956, "step": 1125 }, { "epoch": 0.1683864214146852, "grad_norm": 2.256680269043329, "learning_rate": 9.506494277286762e-06, "loss": 0.3622, "step": 1126 }, { "epoch": 0.16853596530581727, "grad_norm": 1.7846176870962676, "learning_rate": 9.505444618900682e-06, "loss": 0.3712, "step": 1127 }, { "epoch": 0.1686855091969493, "grad_norm": 2.142681380142415, "learning_rate": 9.504393903493874e-06, "loss": 0.2523, "step": 1128 }, { "epoch": 0.16883505308808136, "grad_norm": 1.993049487456574, "learning_rate": 9.503342131312847e-06, "loss": 0.4667, "step": 1129 }, { "epoch": 0.1689845969792134, "grad_norm": 1.4929357757100867, "learning_rate": 9.502289302604355e-06, "loss": 0.2124, "step": 1130 }, { "epoch": 0.16913414087034545, "grad_norm": 1.763381368002964, "learning_rate": 9.501235417615402e-06, "loss": 0.2563, "step": 1131 }, { "epoch": 0.16928368476147748, "grad_norm": 1.879192084699487, "learning_rate": 9.50018047659324e-06, "loss": 0.6, "step": 1132 }, { "epoch": 0.16943322865260954, "grad_norm": 1.7482598713089221, "learning_rate": 9.49912447978537e-06, "loss": 0.4743, "step": 1133 }, { "epoch": 0.1695827725437416, "grad_norm": 2.413100609605461, "learning_rate": 9.498067427439535e-06, "loss": 0.5021, "step": 1134 }, { "epoch": 0.16973231643487363, "grad_norm": 1.6675332056868586, "learning_rate": 9.497009319803732e-06, "loss": 0.3704, "step": 1135 }, { "epoch": 0.1698818603260057, "grad_norm": 1.4103198339625334, "learning_rate": 9.495950157126204e-06, "loss": 0.3525, "step": 1136 }, { "epoch": 0.17003140421713772, "grad_norm": 1.7592756830906924, "learning_rate": 9.49488993965544e-06, "loss": 0.2118, "step": 1137 }, { "epoch": 0.17018094810826978, "grad_norm": 1.209600322516803, "learning_rate": 9.493828667640179e-06, "loss": 0.2156, "step": 1138 }, { "epoch": 0.1703304919994018, "grad_norm": 1.8747303848334145, "learning_rate": 9.492766341329402e-06, "loss": 0.2998, "step": 1139 }, { "epoch": 0.17048003589053387, "grad_norm": 1.9498503355521368, "learning_rate": 9.491702960972343e-06, "loss": 0.3723, "step": 1140 }, { "epoch": 0.17062957978166593, "grad_norm": 1.6726039919291162, "learning_rate": 9.490638526818482e-06, "loss": 0.408, "step": 1141 }, { "epoch": 0.17077912367279796, "grad_norm": 3.7262669891973386, "learning_rate": 9.489573039117543e-06, "loss": 0.4009, "step": 1142 }, { "epoch": 0.17092866756393002, "grad_norm": 1.8934485182919263, "learning_rate": 9.488506498119502e-06, "loss": 0.5141, "step": 1143 }, { "epoch": 0.17107821145506205, "grad_norm": 1.4710896960429016, "learning_rate": 9.487438904074581e-06, "loss": 0.3149, "step": 1144 }, { "epoch": 0.1712277553461941, "grad_norm": 0.7650448318548279, "learning_rate": 9.486370257233244e-06, "loss": 0.1787, "step": 1145 }, { "epoch": 0.17137729923732614, "grad_norm": 1.110992686058532, "learning_rate": 9.48530055784621e-06, "loss": 0.2058, "step": 1146 }, { "epoch": 0.1715268431284582, "grad_norm": 1.7079056261573908, "learning_rate": 9.484229806164435e-06, "loss": 0.2944, "step": 1147 }, { "epoch": 0.17167638701959026, "grad_norm": 1.5033153679810882, "learning_rate": 9.483158002439134e-06, "loss": 0.2221, "step": 1148 }, { "epoch": 0.1718259309107223, "grad_norm": 1.672220461796242, "learning_rate": 9.48208514692176e-06, "loss": 0.3671, "step": 1149 }, { "epoch": 0.17197547480185435, "grad_norm": 2.0626867491532885, "learning_rate": 9.481011239864014e-06, "loss": 0.3915, "step": 1150 }, { "epoch": 0.17212501869298638, "grad_norm": 1.6104087564965828, "learning_rate": 9.479936281517848e-06, "loss": 0.3084, "step": 1151 }, { "epoch": 0.17227456258411844, "grad_norm": 1.2158699033191027, "learning_rate": 9.478860272135452e-06, "loss": 0.2074, "step": 1152 }, { "epoch": 0.1724241064752505, "grad_norm": 1.5670367135087604, "learning_rate": 9.477783211969273e-06, "loss": 0.3216, "step": 1153 }, { "epoch": 0.17257365036638253, "grad_norm": 1.1456620290612602, "learning_rate": 9.476705101272e-06, "loss": 0.1631, "step": 1154 }, { "epoch": 0.1727231942575146, "grad_norm": 2.240627806354505, "learning_rate": 9.475625940296567e-06, "loss": 0.3657, "step": 1155 }, { "epoch": 0.17287273814864662, "grad_norm": 1.5745880507830639, "learning_rate": 9.474545729296152e-06, "loss": 0.2223, "step": 1156 }, { "epoch": 0.17302228203977868, "grad_norm": 2.112259044674356, "learning_rate": 9.47346446852419e-06, "loss": 0.6911, "step": 1157 }, { "epoch": 0.1731718259309107, "grad_norm": 1.5810443743533733, "learning_rate": 9.472382158234349e-06, "loss": 0.3099, "step": 1158 }, { "epoch": 0.17332136982204277, "grad_norm": 1.2778440299252458, "learning_rate": 9.47129879868055e-06, "loss": 0.2136, "step": 1159 }, { "epoch": 0.17347091371317483, "grad_norm": 1.4299544830347497, "learning_rate": 9.470214390116965e-06, "loss": 0.2145, "step": 1160 }, { "epoch": 0.17362045760430686, "grad_norm": 1.3698311545901527, "learning_rate": 9.469128932798e-06, "loss": 0.2377, "step": 1161 }, { "epoch": 0.17377000149543892, "grad_norm": 1.2952413351897127, "learning_rate": 9.468042426978319e-06, "loss": 0.2452, "step": 1162 }, { "epoch": 0.17391954538657095, "grad_norm": 1.3794054233492332, "learning_rate": 9.466954872912823e-06, "loss": 0.1923, "step": 1163 }, { "epoch": 0.174069089277703, "grad_norm": 1.1390407334512478, "learning_rate": 9.465866270856665e-06, "loss": 0.251, "step": 1164 }, { "epoch": 0.17421863316883504, "grad_norm": 2.280807269259536, "learning_rate": 9.46477662106524e-06, "loss": 0.5433, "step": 1165 }, { "epoch": 0.1743681770599671, "grad_norm": 1.4481272121275899, "learning_rate": 9.463685923794191e-06, "loss": 0.3209, "step": 1166 }, { "epoch": 0.17451772095109916, "grad_norm": 1.3194380931708358, "learning_rate": 9.462594179299408e-06, "loss": 0.2208, "step": 1167 }, { "epoch": 0.1746672648422312, "grad_norm": 2.0036222277986, "learning_rate": 9.46150138783702e-06, "loss": 0.2654, "step": 1168 }, { "epoch": 0.17481680873336325, "grad_norm": 0.8359301847512149, "learning_rate": 9.460407549663411e-06, "loss": 0.2131, "step": 1169 }, { "epoch": 0.17496635262449528, "grad_norm": 1.8746705630094216, "learning_rate": 9.459312665035203e-06, "loss": 0.2415, "step": 1170 }, { "epoch": 0.17511589651562734, "grad_norm": 2.187475783350278, "learning_rate": 9.458216734209269e-06, "loss": 0.6378, "step": 1171 }, { "epoch": 0.17526544040675937, "grad_norm": 1.3984177897039258, "learning_rate": 9.457119757442723e-06, "loss": 0.2034, "step": 1172 }, { "epoch": 0.17541498429789143, "grad_norm": 1.776218225609686, "learning_rate": 9.456021734992928e-06, "loss": 0.3717, "step": 1173 }, { "epoch": 0.1755645281890235, "grad_norm": 1.360440620305923, "learning_rate": 9.45492266711749e-06, "loss": 0.2499, "step": 1174 }, { "epoch": 0.17571407208015552, "grad_norm": 1.11686981125786, "learning_rate": 9.453822554074259e-06, "loss": 0.1718, "step": 1175 }, { "epoch": 0.17586361597128758, "grad_norm": 1.7801052355130151, "learning_rate": 9.452721396121333e-06, "loss": 0.392, "step": 1176 }, { "epoch": 0.1760131598624196, "grad_norm": 1.4238435171417776, "learning_rate": 9.451619193517057e-06, "loss": 0.4248, "step": 1177 }, { "epoch": 0.17616270375355167, "grad_norm": 1.4639661830539468, "learning_rate": 9.450515946520016e-06, "loss": 0.2049, "step": 1178 }, { "epoch": 0.17631224764468373, "grad_norm": 1.5499038616483287, "learning_rate": 9.449411655389042e-06, "loss": 0.4062, "step": 1179 }, { "epoch": 0.17646179153581576, "grad_norm": 1.7136605249554677, "learning_rate": 9.448306320383215e-06, "loss": 0.3911, "step": 1180 }, { "epoch": 0.17661133542694782, "grad_norm": 1.4594560853866743, "learning_rate": 9.447199941761852e-06, "loss": 0.2117, "step": 1181 }, { "epoch": 0.17676087931807985, "grad_norm": 1.6116818115006617, "learning_rate": 9.446092519784525e-06, "loss": 0.19, "step": 1182 }, { "epoch": 0.1769104232092119, "grad_norm": 1.5420971786965234, "learning_rate": 9.444984054711044e-06, "loss": 0.3199, "step": 1183 }, { "epoch": 0.17705996710034394, "grad_norm": 1.6968645734800356, "learning_rate": 9.443874546801465e-06, "loss": 0.3796, "step": 1184 }, { "epoch": 0.177209510991476, "grad_norm": 1.3181328359871092, "learning_rate": 9.442763996316093e-06, "loss": 0.3955, "step": 1185 }, { "epoch": 0.17735905488260806, "grad_norm": 2.8227798204695733, "learning_rate": 9.44165240351547e-06, "loss": 0.5894, "step": 1186 }, { "epoch": 0.1775085987737401, "grad_norm": 1.9231832633740291, "learning_rate": 9.440539768660386e-06, "loss": 0.3844, "step": 1187 }, { "epoch": 0.17765814266487215, "grad_norm": 1.4508084994855301, "learning_rate": 9.439426092011877e-06, "loss": 0.1983, "step": 1188 }, { "epoch": 0.17780768655600418, "grad_norm": 1.838910674129457, "learning_rate": 9.438311373831224e-06, "loss": 0.5758, "step": 1189 }, { "epoch": 0.17795723044713624, "grad_norm": 1.878745494476723, "learning_rate": 9.437195614379947e-06, "loss": 0.4892, "step": 1190 }, { "epoch": 0.17810677433826827, "grad_norm": 1.5763295601740275, "learning_rate": 9.436078813919818e-06, "loss": 0.4209, "step": 1191 }, { "epoch": 0.17825631822940033, "grad_norm": 1.939965363898736, "learning_rate": 9.434960972712846e-06, "loss": 0.4915, "step": 1192 }, { "epoch": 0.1784058621205324, "grad_norm": 1.777010297411083, "learning_rate": 9.433842091021287e-06, "loss": 0.4445, "step": 1193 }, { "epoch": 0.17855540601166442, "grad_norm": 1.8576886519979177, "learning_rate": 9.432722169107647e-06, "loss": 0.2065, "step": 1194 }, { "epoch": 0.17870494990279648, "grad_norm": 1.6414559576032928, "learning_rate": 9.431601207234663e-06, "loss": 0.2878, "step": 1195 }, { "epoch": 0.1788544937939285, "grad_norm": 1.6433452397947506, "learning_rate": 9.430479205665329e-06, "loss": 0.3933, "step": 1196 }, { "epoch": 0.17900403768506057, "grad_norm": 1.3219617553631218, "learning_rate": 9.429356164662872e-06, "loss": 0.1886, "step": 1197 }, { "epoch": 0.1791535815761926, "grad_norm": 1.3673182854086454, "learning_rate": 9.428232084490774e-06, "loss": 0.2098, "step": 1198 }, { "epoch": 0.17930312546732466, "grad_norm": 1.4932716672657123, "learning_rate": 9.427106965412752e-06, "loss": 0.1868, "step": 1199 }, { "epoch": 0.17945266935845672, "grad_norm": 1.2835655324809725, "learning_rate": 9.425980807692771e-06, "loss": 0.2841, "step": 1200 }, { "epoch": 0.17960221324958875, "grad_norm": 1.5229676329003083, "learning_rate": 9.424853611595037e-06, "loss": 0.429, "step": 1201 }, { "epoch": 0.1797517571407208, "grad_norm": 1.3834763754455093, "learning_rate": 9.423725377384e-06, "loss": 0.199, "step": 1202 }, { "epoch": 0.17990130103185284, "grad_norm": 1.172574987366, "learning_rate": 9.42259610532436e-06, "loss": 0.2422, "step": 1203 }, { "epoch": 0.1800508449229849, "grad_norm": 2.3677332220742753, "learning_rate": 9.421465795681048e-06, "loss": 0.6703, "step": 1204 }, { "epoch": 0.18020038881411693, "grad_norm": 1.743670576433428, "learning_rate": 9.420334448719251e-06, "loss": 0.3879, "step": 1205 }, { "epoch": 0.180349932705249, "grad_norm": 1.4354052350500734, "learning_rate": 9.419202064704393e-06, "loss": 0.2261, "step": 1206 }, { "epoch": 0.18049947659638105, "grad_norm": 1.5355684537494616, "learning_rate": 9.41806864390214e-06, "loss": 0.2323, "step": 1207 }, { "epoch": 0.18064902048751308, "grad_norm": 2.7749706919729067, "learning_rate": 9.416934186578403e-06, "loss": 0.2457, "step": 1208 }, { "epoch": 0.18079856437864514, "grad_norm": 1.7998786782731084, "learning_rate": 9.41579869299934e-06, "loss": 0.5115, "step": 1209 }, { "epoch": 0.18094810826977717, "grad_norm": 1.722249547477117, "learning_rate": 9.414662163431347e-06, "loss": 0.3978, "step": 1210 }, { "epoch": 0.18109765216090923, "grad_norm": 1.470878100530038, "learning_rate": 9.413524598141065e-06, "loss": 0.3655, "step": 1211 }, { "epoch": 0.1812471960520413, "grad_norm": 1.2321837235938764, "learning_rate": 9.412385997395377e-06, "loss": 0.206, "step": 1212 }, { "epoch": 0.18139673994317332, "grad_norm": 1.71695513424398, "learning_rate": 9.41124636146141e-06, "loss": 0.2988, "step": 1213 }, { "epoch": 0.18154628383430538, "grad_norm": 1.5665377801862033, "learning_rate": 9.410105690606533e-06, "loss": 0.435, "step": 1214 }, { "epoch": 0.1816958277254374, "grad_norm": 1.9574571743722469, "learning_rate": 9.40896398509836e-06, "loss": 0.3844, "step": 1215 }, { "epoch": 0.18184537161656947, "grad_norm": 1.732474617655161, "learning_rate": 9.407821245204746e-06, "loss": 0.4532, "step": 1216 }, { "epoch": 0.1819949155077015, "grad_norm": 1.6282505343946028, "learning_rate": 9.406677471193788e-06, "loss": 0.346, "step": 1217 }, { "epoch": 0.18214445939883356, "grad_norm": 1.7687288700904007, "learning_rate": 9.405532663333826e-06, "loss": 0.2398, "step": 1218 }, { "epoch": 0.18229400328996562, "grad_norm": 1.710427334466053, "learning_rate": 9.404386821893442e-06, "loss": 0.2851, "step": 1219 }, { "epoch": 0.18244354718109765, "grad_norm": 1.017825559673437, "learning_rate": 9.403239947141467e-06, "loss": 0.1898, "step": 1220 }, { "epoch": 0.1825930910722297, "grad_norm": 1.7058191164095473, "learning_rate": 9.402092039346961e-06, "loss": 0.3391, "step": 1221 }, { "epoch": 0.18274263496336174, "grad_norm": 1.2429292971081916, "learning_rate": 9.40094309877924e-06, "loss": 0.247, "step": 1222 }, { "epoch": 0.1828921788544938, "grad_norm": 1.26527696992994, "learning_rate": 9.399793125707853e-06, "loss": 0.2229, "step": 1223 }, { "epoch": 0.18304172274562583, "grad_norm": 1.150282472600963, "learning_rate": 9.398642120402596e-06, "loss": 0.2145, "step": 1224 }, { "epoch": 0.1831912666367579, "grad_norm": 1.3914149403501497, "learning_rate": 9.39749008313351e-06, "loss": 0.231, "step": 1225 }, { "epoch": 0.18334081052788995, "grad_norm": 1.3685090802839712, "learning_rate": 9.396337014170866e-06, "loss": 0.1872, "step": 1226 }, { "epoch": 0.18349035441902198, "grad_norm": 1.6709772065779387, "learning_rate": 9.395182913785192e-06, "loss": 0.2055, "step": 1227 }, { "epoch": 0.18363989831015404, "grad_norm": 2.0418194880673783, "learning_rate": 9.394027782247247e-06, "loss": 0.4888, "step": 1228 }, { "epoch": 0.18378944220128607, "grad_norm": 1.5794839342981186, "learning_rate": 9.392871619828036e-06, "loss": 0.3355, "step": 1229 }, { "epoch": 0.18393898609241813, "grad_norm": 2.365767436986478, "learning_rate": 9.39171442679881e-06, "loss": 0.4306, "step": 1230 }, { "epoch": 0.18408852998355016, "grad_norm": 1.535735557296357, "learning_rate": 9.390556203431053e-06, "loss": 0.3454, "step": 1231 }, { "epoch": 0.18423807387468222, "grad_norm": 2.0146640105762, "learning_rate": 9.3893969499965e-06, "loss": 0.5002, "step": 1232 }, { "epoch": 0.18438761776581428, "grad_norm": 1.0888630229716356, "learning_rate": 9.388236666767119e-06, "loss": 0.1717, "step": 1233 }, { "epoch": 0.1845371616569463, "grad_norm": 1.471926551369625, "learning_rate": 9.387075354015125e-06, "loss": 0.2728, "step": 1234 }, { "epoch": 0.18468670554807837, "grad_norm": 1.2418392055984802, "learning_rate": 9.385913012012972e-06, "loss": 0.2338, "step": 1235 }, { "epoch": 0.1848362494392104, "grad_norm": 1.1326547586847213, "learning_rate": 9.384749641033358e-06, "loss": 0.2014, "step": 1236 }, { "epoch": 0.18498579333034246, "grad_norm": 1.2625669973249032, "learning_rate": 9.383585241349223e-06, "loss": 0.2257, "step": 1237 }, { "epoch": 0.18513533722147452, "grad_norm": 2.0231610702160494, "learning_rate": 9.382419813233741e-06, "loss": 0.6136, "step": 1238 }, { "epoch": 0.18528488111260655, "grad_norm": 2.2265632194384035, "learning_rate": 9.381253356960339e-06, "loss": 0.379, "step": 1239 }, { "epoch": 0.1854344250037386, "grad_norm": 1.7519589257117685, "learning_rate": 9.380085872802672e-06, "loss": 0.4481, "step": 1240 }, { "epoch": 0.18558396889487064, "grad_norm": 1.8894247538731719, "learning_rate": 9.37891736103465e-06, "loss": 0.2349, "step": 1241 }, { "epoch": 0.1857335127860027, "grad_norm": 2.0840730120928153, "learning_rate": 9.377747821930411e-06, "loss": 0.386, "step": 1242 }, { "epoch": 0.18588305667713473, "grad_norm": 1.2303960642463392, "learning_rate": 9.376577255764346e-06, "loss": 0.2138, "step": 1243 }, { "epoch": 0.1860326005682668, "grad_norm": 1.0736052443136495, "learning_rate": 9.375405662811076e-06, "loss": 0.2919, "step": 1244 }, { "epoch": 0.18618214445939885, "grad_norm": 2.1694546083973236, "learning_rate": 9.37423304334547e-06, "loss": 0.4716, "step": 1245 }, { "epoch": 0.18633168835053088, "grad_norm": 1.7953994969561728, "learning_rate": 9.373059397642637e-06, "loss": 0.2303, "step": 1246 }, { "epoch": 0.18648123224166294, "grad_norm": 1.1331346308690267, "learning_rate": 9.371884725977924e-06, "loss": 0.1681, "step": 1247 }, { "epoch": 0.18663077613279497, "grad_norm": 1.8818511328803789, "learning_rate": 9.370709028626921e-06, "loss": 0.3736, "step": 1248 }, { "epoch": 0.18678032002392703, "grad_norm": 1.080778831083804, "learning_rate": 9.369532305865459e-06, "loss": 0.2155, "step": 1249 }, { "epoch": 0.18692986391505906, "grad_norm": 2.005915788639095, "learning_rate": 9.368354557969606e-06, "loss": 0.4026, "step": 1250 }, { "epoch": 0.18707940780619112, "grad_norm": 0.8756048068588032, "learning_rate": 9.367175785215674e-06, "loss": 0.183, "step": 1251 }, { "epoch": 0.18722895169732318, "grad_norm": 2.100099144522435, "learning_rate": 9.365995987880216e-06, "loss": 0.182, "step": 1252 }, { "epoch": 0.1873784955884552, "grad_norm": 1.9441741117291806, "learning_rate": 9.364815166240023e-06, "loss": 0.3865, "step": 1253 }, { "epoch": 0.18752803947958727, "grad_norm": 1.1762687046527927, "learning_rate": 9.363633320572124e-06, "loss": 0.2105, "step": 1254 }, { "epoch": 0.1876775833707193, "grad_norm": 1.170057500642311, "learning_rate": 9.362450451153795e-06, "loss": 0.201, "step": 1255 }, { "epoch": 0.18782712726185136, "grad_norm": 1.774823231357248, "learning_rate": 9.36126655826255e-06, "loss": 0.2958, "step": 1256 }, { "epoch": 0.1879766711529834, "grad_norm": 1.847386943393164, "learning_rate": 9.360081642176137e-06, "loss": 0.2783, "step": 1257 }, { "epoch": 0.18812621504411545, "grad_norm": 1.2317043700510546, "learning_rate": 9.358895703172552e-06, "loss": 0.2237, "step": 1258 }, { "epoch": 0.1882757589352475, "grad_norm": 1.4962863598933458, "learning_rate": 9.357708741530025e-06, "loss": 0.208, "step": 1259 }, { "epoch": 0.18842530282637954, "grad_norm": 1.753807685308467, "learning_rate": 9.356520757527032e-06, "loss": 0.513, "step": 1260 }, { "epoch": 0.1885748467175116, "grad_norm": 2.457507671022133, "learning_rate": 9.355331751442284e-06, "loss": 0.8743, "step": 1261 }, { "epoch": 0.18872439060864363, "grad_norm": 1.7444181168119555, "learning_rate": 9.354141723554734e-06, "loss": 0.3346, "step": 1262 }, { "epoch": 0.1888739344997757, "grad_norm": 1.971056965005781, "learning_rate": 9.35295067414357e-06, "loss": 0.2297, "step": 1263 }, { "epoch": 0.18902347839090772, "grad_norm": 1.5052086349314306, "learning_rate": 9.35175860348823e-06, "loss": 0.2149, "step": 1264 }, { "epoch": 0.18917302228203978, "grad_norm": 1.4361302390685748, "learning_rate": 9.35056551186838e-06, "loss": 0.3298, "step": 1265 }, { "epoch": 0.18932256617317184, "grad_norm": 1.4206462492110938, "learning_rate": 9.349371399563935e-06, "loss": 0.1929, "step": 1266 }, { "epoch": 0.18947211006430387, "grad_norm": 1.4913953281160535, "learning_rate": 9.348176266855042e-06, "loss": 0.2526, "step": 1267 }, { "epoch": 0.18962165395543593, "grad_norm": 2.365270322972236, "learning_rate": 9.346980114022092e-06, "loss": 0.5066, "step": 1268 }, { "epoch": 0.18977119784656796, "grad_norm": 1.8955954404187068, "learning_rate": 9.345782941345714e-06, "loss": 0.4404, "step": 1269 }, { "epoch": 0.18992074173770002, "grad_norm": 1.1285554099802715, "learning_rate": 9.344584749106775e-06, "loss": 0.2001, "step": 1270 }, { "epoch": 0.19007028562883208, "grad_norm": 1.6210163029014748, "learning_rate": 9.343385537586385e-06, "loss": 0.3274, "step": 1271 }, { "epoch": 0.1902198295199641, "grad_norm": 1.4031028340124463, "learning_rate": 9.342185307065888e-06, "loss": 0.1922, "step": 1272 }, { "epoch": 0.19036937341109617, "grad_norm": 1.5025368685887945, "learning_rate": 9.340984057826872e-06, "loss": 0.4106, "step": 1273 }, { "epoch": 0.1905189173022282, "grad_norm": 1.3363573267962257, "learning_rate": 9.339781790151159e-06, "loss": 0.2906, "step": 1274 }, { "epoch": 0.19066846119336026, "grad_norm": 2.2033082601743263, "learning_rate": 9.338578504320815e-06, "loss": 0.3913, "step": 1275 }, { "epoch": 0.1908180050844923, "grad_norm": 1.5703985946217345, "learning_rate": 9.337374200618141e-06, "loss": 0.2363, "step": 1276 }, { "epoch": 0.19096754897562435, "grad_norm": 1.8441964019229968, "learning_rate": 9.336168879325678e-06, "loss": 0.2193, "step": 1277 }, { "epoch": 0.1911170928667564, "grad_norm": 1.9461041413852502, "learning_rate": 9.334962540726208e-06, "loss": 0.3327, "step": 1278 }, { "epoch": 0.19126663675788844, "grad_norm": 1.987695873853033, "learning_rate": 9.333755185102747e-06, "loss": 0.5218, "step": 1279 }, { "epoch": 0.1914161806490205, "grad_norm": 1.9784687710756435, "learning_rate": 9.332546812738555e-06, "loss": 0.4903, "step": 1280 }, { "epoch": 0.19156572454015253, "grad_norm": 1.6276484190964966, "learning_rate": 9.331337423917126e-06, "loss": 0.3464, "step": 1281 }, { "epoch": 0.19171526843128459, "grad_norm": 1.2032949092333924, "learning_rate": 9.330127018922195e-06, "loss": 0.1529, "step": 1282 }, { "epoch": 0.19186481232241662, "grad_norm": 1.8200067255136916, "learning_rate": 9.328915598037733e-06, "loss": 0.4354, "step": 1283 }, { "epoch": 0.19201435621354868, "grad_norm": 1.4794611609702433, "learning_rate": 9.327703161547952e-06, "loss": 0.2071, "step": 1284 }, { "epoch": 0.19216390010468073, "grad_norm": 1.8032485542741101, "learning_rate": 9.326489709737303e-06, "loss": 0.3813, "step": 1285 }, { "epoch": 0.19231344399581277, "grad_norm": 1.3455083426481262, "learning_rate": 9.325275242890472e-06, "loss": 0.1853, "step": 1286 }, { "epoch": 0.19246298788694483, "grad_norm": 2.125860082250447, "learning_rate": 9.324059761292385e-06, "loss": 0.71, "step": 1287 }, { "epoch": 0.19261253177807686, "grad_norm": 1.8089276946794224, "learning_rate": 9.322843265228206e-06, "loss": 0.3672, "step": 1288 }, { "epoch": 0.19276207566920892, "grad_norm": 1.8361099385383872, "learning_rate": 9.321625754983335e-06, "loss": 0.3484, "step": 1289 }, { "epoch": 0.19291161956034095, "grad_norm": 1.6363642315445044, "learning_rate": 9.320407230843413e-06, "loss": 0.3042, "step": 1290 }, { "epoch": 0.193061163451473, "grad_norm": 2.021061654973304, "learning_rate": 9.319187693094318e-06, "loss": 0.5033, "step": 1291 }, { "epoch": 0.19321070734260506, "grad_norm": 2.62527535830696, "learning_rate": 9.317967142022163e-06, "loss": 0.275, "step": 1292 }, { "epoch": 0.1933602512337371, "grad_norm": 1.3806620083144838, "learning_rate": 9.316745577913304e-06, "loss": 0.2855, "step": 1293 }, { "epoch": 0.19350979512486916, "grad_norm": 1.7655797800670596, "learning_rate": 9.31552300105433e-06, "loss": 0.3915, "step": 1294 }, { "epoch": 0.1936593390160012, "grad_norm": 1.3465049850252158, "learning_rate": 9.314299411732069e-06, "loss": 0.249, "step": 1295 }, { "epoch": 0.19380888290713325, "grad_norm": 0.8256201784583667, "learning_rate": 9.313074810233589e-06, "loss": 0.1543, "step": 1296 }, { "epoch": 0.1939584267982653, "grad_norm": 1.9261313307855723, "learning_rate": 9.31184919684619e-06, "loss": 0.6008, "step": 1297 }, { "epoch": 0.19410797068939734, "grad_norm": 1.519619673139573, "learning_rate": 9.310622571857417e-06, "loss": 0.239, "step": 1298 }, { "epoch": 0.1942575145805294, "grad_norm": 2.0122490785681717, "learning_rate": 9.309394935555042e-06, "loss": 0.33, "step": 1299 }, { "epoch": 0.19440705847166143, "grad_norm": 1.5990433489122537, "learning_rate": 9.308166288227088e-06, "loss": 0.4012, "step": 1300 }, { "epoch": 0.19455660236279348, "grad_norm": 1.4956175342537672, "learning_rate": 9.3069366301618e-06, "loss": 0.5736, "step": 1301 }, { "epoch": 0.19470614625392552, "grad_norm": 1.2896015261249874, "learning_rate": 9.305705961647672e-06, "loss": 0.1798, "step": 1302 }, { "epoch": 0.19485569014505758, "grad_norm": 1.3006798401099697, "learning_rate": 9.304474282973432e-06, "loss": 0.3653, "step": 1303 }, { "epoch": 0.19500523403618963, "grad_norm": 2.1249741454515054, "learning_rate": 9.30324159442804e-06, "loss": 0.7342, "step": 1304 }, { "epoch": 0.19515477792732167, "grad_norm": 1.486234854981151, "learning_rate": 9.302007896300697e-06, "loss": 0.2874, "step": 1305 }, { "epoch": 0.19530432181845372, "grad_norm": 1.486723968689139, "learning_rate": 9.300773188880843e-06, "loss": 0.2301, "step": 1306 }, { "epoch": 0.19545386570958576, "grad_norm": 1.697087725237096, "learning_rate": 9.29953747245815e-06, "loss": 0.3191, "step": 1307 }, { "epoch": 0.19560340960071781, "grad_norm": 1.9495679450825656, "learning_rate": 9.29830074732253e-06, "loss": 0.3954, "step": 1308 }, { "epoch": 0.19575295349184985, "grad_norm": 1.2105140299371033, "learning_rate": 9.29706301376413e-06, "loss": 0.2557, "step": 1309 }, { "epoch": 0.1959024973829819, "grad_norm": 1.547760007057872, "learning_rate": 9.295824272073334e-06, "loss": 0.2865, "step": 1310 }, { "epoch": 0.19605204127411396, "grad_norm": 1.6181428409490188, "learning_rate": 9.294584522540766e-06, "loss": 0.3332, "step": 1311 }, { "epoch": 0.196201585165246, "grad_norm": 1.664852192256293, "learning_rate": 9.293343765457278e-06, "loss": 0.3058, "step": 1312 }, { "epoch": 0.19635112905637805, "grad_norm": 1.8608018825705885, "learning_rate": 9.292102001113968e-06, "loss": 0.3048, "step": 1313 }, { "epoch": 0.19650067294751009, "grad_norm": 1.420503009424543, "learning_rate": 9.290859229802162e-06, "loss": 0.2283, "step": 1314 }, { "epoch": 0.19665021683864214, "grad_norm": 1.996771180524021, "learning_rate": 9.289615451813428e-06, "loss": 0.4804, "step": 1315 }, { "epoch": 0.19679976072977418, "grad_norm": 1.8295883207210475, "learning_rate": 9.28837066743957e-06, "loss": 0.3065, "step": 1316 }, { "epoch": 0.19694930462090623, "grad_norm": 1.6508456488855519, "learning_rate": 9.287124876972625e-06, "loss": 0.2617, "step": 1317 }, { "epoch": 0.1970988485120383, "grad_norm": 1.7646395203323395, "learning_rate": 9.285878080704866e-06, "loss": 0.3484, "step": 1318 }, { "epoch": 0.19724839240317033, "grad_norm": 1.6976643786387164, "learning_rate": 9.284630278928805e-06, "loss": 0.4485, "step": 1319 }, { "epoch": 0.19739793629430238, "grad_norm": 1.6867112197107144, "learning_rate": 9.283381471937188e-06, "loss": 0.381, "step": 1320 }, { "epoch": 0.19754748018543442, "grad_norm": 1.650278888960391, "learning_rate": 9.282131660022997e-06, "loss": 0.2289, "step": 1321 }, { "epoch": 0.19769702407656647, "grad_norm": 2.0028177208667977, "learning_rate": 9.28088084347945e-06, "loss": 0.5132, "step": 1322 }, { "epoch": 0.19784656796769853, "grad_norm": 1.030996633782416, "learning_rate": 9.279629022600002e-06, "loss": 0.1764, "step": 1323 }, { "epoch": 0.19799611185883056, "grad_norm": 1.448960209222983, "learning_rate": 9.27837619767834e-06, "loss": 0.2575, "step": 1324 }, { "epoch": 0.19814565574996262, "grad_norm": 2.099657510604881, "learning_rate": 9.27712236900839e-06, "loss": 0.4117, "step": 1325 }, { "epoch": 0.19829519964109465, "grad_norm": 1.1059240433107884, "learning_rate": 9.27586753688431e-06, "loss": 0.2064, "step": 1326 }, { "epoch": 0.19844474353222671, "grad_norm": 0.9819176984170996, "learning_rate": 9.274611701600502e-06, "loss": 0.2357, "step": 1327 }, { "epoch": 0.19859428742335875, "grad_norm": 1.5836465571763443, "learning_rate": 9.273354863451589e-06, "loss": 0.2478, "step": 1328 }, { "epoch": 0.1987438313144908, "grad_norm": 1.8864795778055325, "learning_rate": 9.272097022732444e-06, "loss": 0.3705, "step": 1329 }, { "epoch": 0.19889337520562286, "grad_norm": 1.5474710953559745, "learning_rate": 9.270838179738164e-06, "loss": 0.1888, "step": 1330 }, { "epoch": 0.1990429190967549, "grad_norm": 1.6239105754270915, "learning_rate": 9.269578334764087e-06, "loss": 0.3698, "step": 1331 }, { "epoch": 0.19919246298788695, "grad_norm": 1.033656840947032, "learning_rate": 9.268317488105787e-06, "loss": 0.1741, "step": 1332 }, { "epoch": 0.19934200687901898, "grad_norm": 1.2644053752133695, "learning_rate": 9.267055640059068e-06, "loss": 0.3292, "step": 1333 }, { "epoch": 0.19949155077015104, "grad_norm": 2.0362957566742224, "learning_rate": 9.265792790919972e-06, "loss": 0.3348, "step": 1334 }, { "epoch": 0.19964109466128308, "grad_norm": 1.528100233715919, "learning_rate": 9.264528940984777e-06, "loss": 0.2456, "step": 1335 }, { "epoch": 0.19979063855241513, "grad_norm": 1.420463259410139, "learning_rate": 9.263264090549992e-06, "loss": 0.3396, "step": 1336 }, { "epoch": 0.1999401824435472, "grad_norm": 1.7091828149298964, "learning_rate": 9.261998239912367e-06, "loss": 0.2596, "step": 1337 }, { "epoch": 0.20008972633467922, "grad_norm": 2.259480708170105, "learning_rate": 9.26073138936888e-06, "loss": 0.4451, "step": 1338 }, { "epoch": 0.20023927022581128, "grad_norm": 1.8137629934472514, "learning_rate": 9.259463539216746e-06, "loss": 0.3549, "step": 1339 }, { "epoch": 0.20038881411694331, "grad_norm": 1.1443459601678743, "learning_rate": 9.258194689753417e-06, "loss": 0.228, "step": 1340 }, { "epoch": 0.20053835800807537, "grad_norm": 1.9827124471950097, "learning_rate": 9.256924841276576e-06, "loss": 0.2773, "step": 1341 }, { "epoch": 0.2006879018992074, "grad_norm": 1.2970841523500378, "learning_rate": 9.25565399408414e-06, "loss": 0.2164, "step": 1342 }, { "epoch": 0.20083744579033946, "grad_norm": 1.8476680005205972, "learning_rate": 9.254382148474264e-06, "loss": 0.3472, "step": 1343 }, { "epoch": 0.20098698968147152, "grad_norm": 0.9750041200477407, "learning_rate": 9.253109304745335e-06, "loss": 0.1686, "step": 1344 }, { "epoch": 0.20113653357260355, "grad_norm": 1.9603557255590418, "learning_rate": 9.251835463195977e-06, "loss": 0.5067, "step": 1345 }, { "epoch": 0.2012860774637356, "grad_norm": 1.8365618678908586, "learning_rate": 9.25056062412504e-06, "loss": 0.3701, "step": 1346 }, { "epoch": 0.20143562135486764, "grad_norm": 1.7722339453250509, "learning_rate": 9.249284787831617e-06, "loss": 0.4201, "step": 1347 }, { "epoch": 0.2015851652459997, "grad_norm": 1.6882388762311373, "learning_rate": 9.24800795461503e-06, "loss": 0.3566, "step": 1348 }, { "epoch": 0.20173470913713173, "grad_norm": 1.3408931481790511, "learning_rate": 9.246730124774839e-06, "loss": 0.3711, "step": 1349 }, { "epoch": 0.2018842530282638, "grad_norm": 1.5904902378620338, "learning_rate": 9.245451298610833e-06, "loss": 0.2285, "step": 1350 }, { "epoch": 0.20203379691939585, "grad_norm": 1.24091223117027, "learning_rate": 9.244171476423037e-06, "loss": 0.1831, "step": 1351 }, { "epoch": 0.20218334081052788, "grad_norm": 1.8606049117343595, "learning_rate": 9.24289065851171e-06, "loss": 0.2355, "step": 1352 }, { "epoch": 0.20233288470165994, "grad_norm": 1.3471226340640614, "learning_rate": 9.241608845177344e-06, "loss": 0.2296, "step": 1353 }, { "epoch": 0.20248242859279197, "grad_norm": 1.8826806706856583, "learning_rate": 9.240326036720665e-06, "loss": 0.3794, "step": 1354 }, { "epoch": 0.20263197248392403, "grad_norm": 1.6131643098193205, "learning_rate": 9.239042233442632e-06, "loss": 0.2487, "step": 1355 }, { "epoch": 0.2027815163750561, "grad_norm": 1.250046228507658, "learning_rate": 9.23775743564444e-06, "loss": 0.362, "step": 1356 }, { "epoch": 0.20293106026618812, "grad_norm": 1.680459140251771, "learning_rate": 9.236471643627512e-06, "loss": 0.218, "step": 1357 }, { "epoch": 0.20308060415732018, "grad_norm": 3.806087040662127, "learning_rate": 9.235184857693506e-06, "loss": 0.3938, "step": 1358 }, { "epoch": 0.2032301480484522, "grad_norm": 1.7243993707977876, "learning_rate": 9.233897078144317e-06, "loss": 0.2151, "step": 1359 }, { "epoch": 0.20337969193958427, "grad_norm": 1.529478064822457, "learning_rate": 9.23260830528207e-06, "loss": 0.3384, "step": 1360 }, { "epoch": 0.2035292358307163, "grad_norm": 1.4829330919870423, "learning_rate": 9.231318539409124e-06, "loss": 0.3183, "step": 1361 }, { "epoch": 0.20367877972184836, "grad_norm": 1.332269401180316, "learning_rate": 9.23002778082807e-06, "loss": 0.1772, "step": 1362 }, { "epoch": 0.20382832361298042, "grad_norm": 1.0915215645001923, "learning_rate": 9.228736029841732e-06, "loss": 0.1997, "step": 1363 }, { "epoch": 0.20397786750411245, "grad_norm": 1.159153234988241, "learning_rate": 9.227443286753167e-06, "loss": 0.1916, "step": 1364 }, { "epoch": 0.2041274113952445, "grad_norm": 1.7356076242142402, "learning_rate": 9.226149551865665e-06, "loss": 0.3072, "step": 1365 }, { "epoch": 0.20427695528637654, "grad_norm": 1.4127677237560496, "learning_rate": 9.224854825482752e-06, "loss": 0.2194, "step": 1366 }, { "epoch": 0.2044264991775086, "grad_norm": 1.5370329143503136, "learning_rate": 9.223559107908178e-06, "loss": 0.3882, "step": 1367 }, { "epoch": 0.20457604306864063, "grad_norm": 1.231450996718572, "learning_rate": 9.222262399445934e-06, "loss": 0.2097, "step": 1368 }, { "epoch": 0.2047255869597727, "grad_norm": 2.132300752161669, "learning_rate": 9.22096470040024e-06, "loss": 0.5167, "step": 1369 }, { "epoch": 0.20487513085090475, "grad_norm": 1.6056336366949036, "learning_rate": 9.219666011075548e-06, "loss": 0.4712, "step": 1370 }, { "epoch": 0.20502467474203678, "grad_norm": 1.0274444121008708, "learning_rate": 9.218366331776543e-06, "loss": 0.1489, "step": 1371 }, { "epoch": 0.20517421863316884, "grad_norm": 1.7166617308295589, "learning_rate": 9.217065662808143e-06, "loss": 0.2071, "step": 1372 }, { "epoch": 0.20532376252430087, "grad_norm": 1.678765123523581, "learning_rate": 9.215764004475496e-06, "loss": 0.3746, "step": 1373 }, { "epoch": 0.20547330641543293, "grad_norm": 1.5537365180594542, "learning_rate": 9.214461357083986e-06, "loss": 0.1505, "step": 1374 }, { "epoch": 0.20562285030656496, "grad_norm": 1.3435785882872704, "learning_rate": 9.213157720939226e-06, "loss": 0.1997, "step": 1375 }, { "epoch": 0.20577239419769702, "grad_norm": 1.5875355486569163, "learning_rate": 9.211853096347059e-06, "loss": 0.1984, "step": 1376 }, { "epoch": 0.20592193808882908, "grad_norm": 1.1101394692319788, "learning_rate": 9.210547483613566e-06, "loss": 0.1921, "step": 1377 }, { "epoch": 0.2060714819799611, "grad_norm": 1.4017863241620425, "learning_rate": 9.209240883045054e-06, "loss": 0.2018, "step": 1378 }, { "epoch": 0.20622102587109317, "grad_norm": 1.7309276589079257, "learning_rate": 9.207933294948064e-06, "loss": 0.2325, "step": 1379 }, { "epoch": 0.2063705697622252, "grad_norm": 2.2693408682674647, "learning_rate": 9.206624719629371e-06, "loss": 0.6731, "step": 1380 }, { "epoch": 0.20652011365335726, "grad_norm": 1.6617142222154033, "learning_rate": 9.205315157395978e-06, "loss": 0.2133, "step": 1381 }, { "epoch": 0.20666965754448932, "grad_norm": 1.5651100627218977, "learning_rate": 9.20400460855512e-06, "loss": 0.3562, "step": 1382 }, { "epoch": 0.20681920143562135, "grad_norm": 1.412248222099086, "learning_rate": 9.202693073414267e-06, "loss": 0.2117, "step": 1383 }, { "epoch": 0.2069687453267534, "grad_norm": 2.058721397249849, "learning_rate": 9.201380552281114e-06, "loss": 0.4157, "step": 1384 }, { "epoch": 0.20711828921788544, "grad_norm": 1.190322717831582, "learning_rate": 9.200067045463594e-06, "loss": 0.1468, "step": 1385 }, { "epoch": 0.2072678331090175, "grad_norm": 1.6727715120686848, "learning_rate": 9.198752553269867e-06, "loss": 0.3737, "step": 1386 }, { "epoch": 0.20741737700014953, "grad_norm": 1.6151445864315066, "learning_rate": 9.197437076008328e-06, "loss": 0.3504, "step": 1387 }, { "epoch": 0.2075669208912816, "grad_norm": 1.8626136343488717, "learning_rate": 9.196120613987596e-06, "loss": 0.1897, "step": 1388 }, { "epoch": 0.20771646478241365, "grad_norm": 2.1277783512048165, "learning_rate": 9.19480316751653e-06, "loss": 0.5398, "step": 1389 }, { "epoch": 0.20786600867354568, "grad_norm": 1.6518405571968426, "learning_rate": 9.193484736904214e-06, "loss": 0.3226, "step": 1390 }, { "epoch": 0.20801555256467774, "grad_norm": 1.680879037198174, "learning_rate": 9.192165322459965e-06, "loss": 0.2825, "step": 1391 }, { "epoch": 0.20816509645580977, "grad_norm": 1.7490131840067196, "learning_rate": 9.19084492449333e-06, "loss": 0.4683, "step": 1392 }, { "epoch": 0.20831464034694183, "grad_norm": 1.3086537105430815, "learning_rate": 9.189523543314087e-06, "loss": 0.2554, "step": 1393 }, { "epoch": 0.20846418423807386, "grad_norm": 12.365317544881876, "learning_rate": 9.188201179232243e-06, "loss": 0.2051, "step": 1394 }, { "epoch": 0.20861372812920592, "grad_norm": 1.6315580687346722, "learning_rate": 9.18687783255804e-06, "loss": 0.2119, "step": 1395 }, { "epoch": 0.20876327202033798, "grad_norm": 1.7045742133633315, "learning_rate": 9.185553503601948e-06, "loss": 0.2251, "step": 1396 }, { "epoch": 0.20891281591147, "grad_norm": 1.6188256816772855, "learning_rate": 9.184228192674667e-06, "loss": 0.2406, "step": 1397 }, { "epoch": 0.20906235980260207, "grad_norm": 3.936101301721377, "learning_rate": 9.182901900087124e-06, "loss": 0.4307, "step": 1398 }, { "epoch": 0.2092119036937341, "grad_norm": 2.5527420452443406, "learning_rate": 9.181574626150486e-06, "loss": 0.2308, "step": 1399 }, { "epoch": 0.20936144758486616, "grad_norm": 1.6355845634467614, "learning_rate": 9.180246371176141e-06, "loss": 0.1983, "step": 1400 }, { "epoch": 0.2095109914759982, "grad_norm": 1.6442195705711569, "learning_rate": 9.17891713547571e-06, "loss": 0.4733, "step": 1401 }, { "epoch": 0.20966053536713025, "grad_norm": 1.9389155277617691, "learning_rate": 9.177586919361043e-06, "loss": 0.3677, "step": 1402 }, { "epoch": 0.2098100792582623, "grad_norm": 1.4858954046606485, "learning_rate": 9.176255723144227e-06, "loss": 0.2122, "step": 1403 }, { "epoch": 0.20995962314939434, "grad_norm": 1.8989896403075806, "learning_rate": 9.17492354713757e-06, "loss": 0.5022, "step": 1404 }, { "epoch": 0.2101091670405264, "grad_norm": 2.032698049116247, "learning_rate": 9.173590391653612e-06, "loss": 0.5431, "step": 1405 }, { "epoch": 0.21025871093165843, "grad_norm": 1.5701626022875448, "learning_rate": 9.172256257005127e-06, "loss": 0.1862, "step": 1406 }, { "epoch": 0.2104082548227905, "grad_norm": 1.9707343228520016, "learning_rate": 9.170921143505114e-06, "loss": 0.5544, "step": 1407 }, { "epoch": 0.21055779871392252, "grad_norm": 1.3116097098782349, "learning_rate": 9.169585051466804e-06, "loss": 0.2191, "step": 1408 }, { "epoch": 0.21070734260505458, "grad_norm": 1.4155318018990113, "learning_rate": 9.168247981203657e-06, "loss": 0.2867, "step": 1409 }, { "epoch": 0.21085688649618664, "grad_norm": 1.5783535607698234, "learning_rate": 9.166909933029365e-06, "loss": 0.2115, "step": 1410 }, { "epoch": 0.21100643038731867, "grad_norm": 2.1761826396642343, "learning_rate": 9.16557090725784e-06, "loss": 0.6031, "step": 1411 }, { "epoch": 0.21115597427845073, "grad_norm": 1.5953173737260011, "learning_rate": 9.16423090420324e-06, "loss": 0.2972, "step": 1412 }, { "epoch": 0.21130551816958276, "grad_norm": 1.859606383856977, "learning_rate": 9.162889924179934e-06, "loss": 0.3625, "step": 1413 }, { "epoch": 0.21145506206071482, "grad_norm": 1.759605112492795, "learning_rate": 9.161547967502536e-06, "loss": 0.3425, "step": 1414 }, { "epoch": 0.21160460595184688, "grad_norm": 1.2621512719829986, "learning_rate": 9.160205034485875e-06, "loss": 0.2218, "step": 1415 }, { "epoch": 0.2117541498429789, "grad_norm": 1.4111741267825535, "learning_rate": 9.158861125445022e-06, "loss": 0.3193, "step": 1416 }, { "epoch": 0.21190369373411097, "grad_norm": 1.2698781251191589, "learning_rate": 9.157516240695266e-06, "loss": 0.1909, "step": 1417 }, { "epoch": 0.212053237625243, "grad_norm": 1.6960651115905963, "learning_rate": 9.156170380552134e-06, "loss": 0.3382, "step": 1418 }, { "epoch": 0.21220278151637506, "grad_norm": 2.1154692881317434, "learning_rate": 9.154823545331376e-06, "loss": 0.4082, "step": 1419 }, { "epoch": 0.2123523254075071, "grad_norm": 2.033684826459006, "learning_rate": 9.153475735348973e-06, "loss": 0.3867, "step": 1420 }, { "epoch": 0.21250186929863915, "grad_norm": 1.8325048944634288, "learning_rate": 9.152126950921135e-06, "loss": 0.2373, "step": 1421 }, { "epoch": 0.2126514131897712, "grad_norm": 2.0710673737402256, "learning_rate": 9.150777192364297e-06, "loss": 0.356, "step": 1422 }, { "epoch": 0.21280095708090324, "grad_norm": 1.3874235860299686, "learning_rate": 9.149426459995127e-06, "loss": 0.3691, "step": 1423 }, { "epoch": 0.2129505009720353, "grad_norm": 2.1249487008185803, "learning_rate": 9.14807475413052e-06, "loss": 0.3657, "step": 1424 }, { "epoch": 0.21310004486316733, "grad_norm": 1.6076473156790783, "learning_rate": 9.146722075087599e-06, "loss": 0.2973, "step": 1425 }, { "epoch": 0.2132495887542994, "grad_norm": 1.7585364821746667, "learning_rate": 9.145368423183716e-06, "loss": 0.3684, "step": 1426 }, { "epoch": 0.21339913264543142, "grad_norm": 1.5724475968533793, "learning_rate": 9.144013798736451e-06, "loss": 0.3614, "step": 1427 }, { "epoch": 0.21354867653656348, "grad_norm": 1.5213037170574124, "learning_rate": 9.142658202063613e-06, "loss": 0.331, "step": 1428 }, { "epoch": 0.21369822042769554, "grad_norm": 1.7462862535903572, "learning_rate": 9.141301633483233e-06, "loss": 0.2972, "step": 1429 }, { "epoch": 0.21384776431882757, "grad_norm": 1.3980558796390394, "learning_rate": 9.139944093313582e-06, "loss": 0.2375, "step": 1430 }, { "epoch": 0.21399730820995963, "grad_norm": 1.655121853169513, "learning_rate": 9.138585581873145e-06, "loss": 0.1952, "step": 1431 }, { "epoch": 0.21414685210109166, "grad_norm": 1.9616282412950032, "learning_rate": 9.137226099480649e-06, "loss": 0.3827, "step": 1432 }, { "epoch": 0.21429639599222372, "grad_norm": 1.5101599924640094, "learning_rate": 9.135865646455035e-06, "loss": 0.2151, "step": 1433 }, { "epoch": 0.21444593988335575, "grad_norm": 1.9492564570386166, "learning_rate": 9.134504223115483e-06, "loss": 0.5627, "step": 1434 }, { "epoch": 0.2145954837744878, "grad_norm": 1.4722879693007511, "learning_rate": 9.133141829781396e-06, "loss": 0.3629, "step": 1435 }, { "epoch": 0.21474502766561987, "grad_norm": 1.3336058131154513, "learning_rate": 9.131778466772401e-06, "loss": 0.2227, "step": 1436 }, { "epoch": 0.2148945715567519, "grad_norm": 1.557620867064145, "learning_rate": 9.130414134408358e-06, "loss": 0.1922, "step": 1437 }, { "epoch": 0.21504411544788396, "grad_norm": 1.5608371994510732, "learning_rate": 9.129048833009354e-06, "loss": 0.345, "step": 1438 }, { "epoch": 0.215193659339016, "grad_norm": 2.0372244298598026, "learning_rate": 9.127682562895701e-06, "loss": 0.1943, "step": 1439 }, { "epoch": 0.21534320323014805, "grad_norm": 1.771798287939486, "learning_rate": 9.126315324387937e-06, "loss": 0.2201, "step": 1440 }, { "epoch": 0.2154927471212801, "grad_norm": 1.320918694554743, "learning_rate": 9.124947117806833e-06, "loss": 0.3557, "step": 1441 }, { "epoch": 0.21564229101241214, "grad_norm": 1.2144354045037664, "learning_rate": 9.12357794347338e-06, "loss": 0.1677, "step": 1442 }, { "epoch": 0.2157918349035442, "grad_norm": 1.7291539866895038, "learning_rate": 9.122207801708802e-06, "loss": 0.2361, "step": 1443 }, { "epoch": 0.21594137879467623, "grad_norm": 1.525451199822209, "learning_rate": 9.120836692834547e-06, "loss": 0.2203, "step": 1444 }, { "epoch": 0.2160909226858083, "grad_norm": 1.1450235200011314, "learning_rate": 9.11946461717229e-06, "loss": 0.2239, "step": 1445 }, { "epoch": 0.21624046657694032, "grad_norm": 1.6249409995005226, "learning_rate": 9.118091575043931e-06, "loss": 0.2035, "step": 1446 }, { "epoch": 0.21639001046807238, "grad_norm": 1.5911197134726236, "learning_rate": 9.116717566771602e-06, "loss": 0.3426, "step": 1447 }, { "epoch": 0.21653955435920444, "grad_norm": 1.2768942365723772, "learning_rate": 9.115342592677658e-06, "loss": 0.3227, "step": 1448 }, { "epoch": 0.21668909825033647, "grad_norm": 2.6013675708313646, "learning_rate": 9.11396665308468e-06, "loss": 0.876, "step": 1449 }, { "epoch": 0.21683864214146853, "grad_norm": 1.5557280944558294, "learning_rate": 9.112589748315477e-06, "loss": 0.1862, "step": 1450 }, { "epoch": 0.21698818603260056, "grad_norm": 1.9084211867181637, "learning_rate": 9.111211878693084e-06, "loss": 0.4744, "step": 1451 }, { "epoch": 0.21713772992373262, "grad_norm": 1.9726349404572066, "learning_rate": 9.109833044540766e-06, "loss": 0.4964, "step": 1452 }, { "epoch": 0.21728727381486465, "grad_norm": 1.7981363223292532, "learning_rate": 9.108453246182005e-06, "loss": 0.4617, "step": 1453 }, { "epoch": 0.2174368177059967, "grad_norm": 1.4636175949338748, "learning_rate": 9.10707248394052e-06, "loss": 0.3562, "step": 1454 }, { "epoch": 0.21758636159712877, "grad_norm": 1.5023091294876543, "learning_rate": 9.105690758140247e-06, "loss": 0.1974, "step": 1455 }, { "epoch": 0.2177359054882608, "grad_norm": 1.5799772562807408, "learning_rate": 9.104308069105355e-06, "loss": 0.4225, "step": 1456 }, { "epoch": 0.21788544937939286, "grad_norm": 1.697173297157486, "learning_rate": 9.102924417160235e-06, "loss": 0.2355, "step": 1457 }, { "epoch": 0.2180349932705249, "grad_norm": 1.9831963966350166, "learning_rate": 9.101539802629506e-06, "loss": 0.6711, "step": 1458 }, { "epoch": 0.21818453716165695, "grad_norm": 1.3049711330279425, "learning_rate": 9.10015422583801e-06, "loss": 0.2306, "step": 1459 }, { "epoch": 0.21833408105278898, "grad_norm": 1.3485657787858605, "learning_rate": 9.09876768711082e-06, "loss": 0.3155, "step": 1460 }, { "epoch": 0.21848362494392104, "grad_norm": 1.430084626746487, "learning_rate": 9.097380186773225e-06, "loss": 0.1803, "step": 1461 }, { "epoch": 0.2186331688350531, "grad_norm": 1.042217037319963, "learning_rate": 9.095991725150755e-06, "loss": 0.2157, "step": 1462 }, { "epoch": 0.21878271272618513, "grad_norm": 1.6038911196277919, "learning_rate": 9.094602302569149e-06, "loss": 0.3647, "step": 1463 }, { "epoch": 0.2189322566173172, "grad_norm": 1.761061421005298, "learning_rate": 9.093211919354384e-06, "loss": 0.3914, "step": 1464 }, { "epoch": 0.21908180050844922, "grad_norm": 1.780140598214583, "learning_rate": 9.091820575832653e-06, "loss": 0.5239, "step": 1465 }, { "epoch": 0.21923134439958128, "grad_norm": 1.5056065150417166, "learning_rate": 9.090428272330381e-06, "loss": 0.3738, "step": 1466 }, { "epoch": 0.2193808882907133, "grad_norm": 1.5037400785516828, "learning_rate": 9.089035009174213e-06, "loss": 0.3748, "step": 1467 }, { "epoch": 0.21953043218184537, "grad_norm": 1.0034616725342143, "learning_rate": 9.087640786691029e-06, "loss": 0.1818, "step": 1468 }, { "epoch": 0.21967997607297743, "grad_norm": 1.8826776828221814, "learning_rate": 9.08624560520792e-06, "loss": 0.5423, "step": 1469 }, { "epoch": 0.21982951996410946, "grad_norm": 1.5762662138471795, "learning_rate": 9.08484946505221e-06, "loss": 0.362, "step": 1470 }, { "epoch": 0.21997906385524152, "grad_norm": 1.459026627139429, "learning_rate": 9.08345236655145e-06, "loss": 0.2379, "step": 1471 }, { "epoch": 0.22012860774637355, "grad_norm": 1.9767735560822688, "learning_rate": 9.082054310033412e-06, "loss": 0.2279, "step": 1472 }, { "epoch": 0.2202781516375056, "grad_norm": 1.5033186272325123, "learning_rate": 9.08065529582609e-06, "loss": 0.3475, "step": 1473 }, { "epoch": 0.22042769552863767, "grad_norm": 1.7347624363190264, "learning_rate": 9.07925532425771e-06, "loss": 0.2346, "step": 1474 }, { "epoch": 0.2205772394197697, "grad_norm": 1.311368499943375, "learning_rate": 9.077854395656719e-06, "loss": 0.2136, "step": 1475 }, { "epoch": 0.22072678331090176, "grad_norm": 2.079168631285795, "learning_rate": 9.076452510351786e-06, "loss": 0.5518, "step": 1476 }, { "epoch": 0.2208763272020338, "grad_norm": 1.7471493510688818, "learning_rate": 9.075049668671808e-06, "loss": 0.4685, "step": 1477 }, { "epoch": 0.22102587109316585, "grad_norm": 1.7503079479759727, "learning_rate": 9.073645870945904e-06, "loss": 0.3369, "step": 1478 }, { "epoch": 0.22117541498429788, "grad_norm": 1.6001592489802432, "learning_rate": 9.07224111750342e-06, "loss": 0.2451, "step": 1479 }, { "epoch": 0.22132495887542994, "grad_norm": 1.5995131942282934, "learning_rate": 9.070835408673926e-06, "loss": 0.2199, "step": 1480 }, { "epoch": 0.221474502766562, "grad_norm": 1.4586326961701683, "learning_rate": 9.06942874478721e-06, "loss": 0.374, "step": 1481 }, { "epoch": 0.22162404665769403, "grad_norm": 1.5381391273868525, "learning_rate": 9.068021126173294e-06, "loss": 0.2369, "step": 1482 }, { "epoch": 0.2217735905488261, "grad_norm": 1.96520452808454, "learning_rate": 9.066612553162417e-06, "loss": 0.2214, "step": 1483 }, { "epoch": 0.22192313443995812, "grad_norm": 1.3323608756885599, "learning_rate": 9.065203026085041e-06, "loss": 0.2335, "step": 1484 }, { "epoch": 0.22207267833109018, "grad_norm": 1.725202352638352, "learning_rate": 9.063792545271859e-06, "loss": 0.4229, "step": 1485 }, { "epoch": 0.2222222222222222, "grad_norm": 0.9584754362518301, "learning_rate": 9.062381111053781e-06, "loss": 0.1886, "step": 1486 }, { "epoch": 0.22237176611335427, "grad_norm": 1.72245020942535, "learning_rate": 9.060968723761945e-06, "loss": 0.3353, "step": 1487 }, { "epoch": 0.22252131000448633, "grad_norm": 1.6135392202141645, "learning_rate": 9.05955538372771e-06, "loss": 0.1953, "step": 1488 }, { "epoch": 0.22267085389561836, "grad_norm": 1.3152021255142217, "learning_rate": 9.058141091282656e-06, "loss": 0.1742, "step": 1489 }, { "epoch": 0.22282039778675042, "grad_norm": 2.024778297438531, "learning_rate": 9.056725846758594e-06, "loss": 0.4014, "step": 1490 }, { "epoch": 0.22296994167788245, "grad_norm": 1.6698713908455967, "learning_rate": 9.055309650487552e-06, "loss": 0.3407, "step": 1491 }, { "epoch": 0.2231194855690145, "grad_norm": 1.934590891729098, "learning_rate": 9.053892502801783e-06, "loss": 0.3714, "step": 1492 }, { "epoch": 0.22326902946014654, "grad_norm": 1.8737430969121303, "learning_rate": 9.052474404033764e-06, "loss": 0.2273, "step": 1493 }, { "epoch": 0.2234185733512786, "grad_norm": 2.1916032794360594, "learning_rate": 9.051055354516195e-06, "loss": 0.5141, "step": 1494 }, { "epoch": 0.22356811724241066, "grad_norm": 1.3808614199352094, "learning_rate": 9.049635354581998e-06, "loss": 0.22, "step": 1495 }, { "epoch": 0.2237176611335427, "grad_norm": 1.56662054094386, "learning_rate": 9.048214404564319e-06, "loss": 0.23, "step": 1496 }, { "epoch": 0.22386720502467475, "grad_norm": 1.3046782458999366, "learning_rate": 9.046792504796526e-06, "loss": 0.2448, "step": 1497 }, { "epoch": 0.22401674891580678, "grad_norm": 1.2799576312369438, "learning_rate": 9.045369655612212e-06, "loss": 0.222, "step": 1498 }, { "epoch": 0.22416629280693884, "grad_norm": 1.3122547692582682, "learning_rate": 9.043945857345189e-06, "loss": 0.2261, "step": 1499 }, { "epoch": 0.2243158366980709, "grad_norm": 1.644275160578279, "learning_rate": 9.042521110329497e-06, "loss": 0.3581, "step": 1500 }, { "epoch": 0.22446538058920293, "grad_norm": 1.3566851036629488, "learning_rate": 9.04109541489939e-06, "loss": 0.3989, "step": 1501 }, { "epoch": 0.224614924480335, "grad_norm": 1.1272465309050348, "learning_rate": 9.039668771389356e-06, "loss": 0.2049, "step": 1502 }, { "epoch": 0.22476446837146702, "grad_norm": 1.2545829340424604, "learning_rate": 9.038241180134095e-06, "loss": 0.2186, "step": 1503 }, { "epoch": 0.22491401226259908, "grad_norm": 2.458444877022837, "learning_rate": 9.036812641468535e-06, "loss": 0.9375, "step": 1504 }, { "epoch": 0.2250635561537311, "grad_norm": 1.7631503038372196, "learning_rate": 9.035383155727826e-06, "loss": 0.5211, "step": 1505 }, { "epoch": 0.22521310004486317, "grad_norm": 1.2773941362574963, "learning_rate": 9.03395272324734e-06, "loss": 0.1924, "step": 1506 }, { "epoch": 0.22536264393599523, "grad_norm": 0.8285607729871721, "learning_rate": 9.032521344362665e-06, "loss": 0.1591, "step": 1507 }, { "epoch": 0.22551218782712726, "grad_norm": 1.4527549144198617, "learning_rate": 9.031089019409622e-06, "loss": 0.3534, "step": 1508 }, { "epoch": 0.22566173171825932, "grad_norm": 1.5654074226457095, "learning_rate": 9.029655748724245e-06, "loss": 0.3845, "step": 1509 }, { "epoch": 0.22581127560939135, "grad_norm": 1.7456103899692976, "learning_rate": 9.028221532642793e-06, "loss": 0.3186, "step": 1510 }, { "epoch": 0.2259608195005234, "grad_norm": 1.9962184366897087, "learning_rate": 9.02678637150175e-06, "loss": 0.4627, "step": 1511 }, { "epoch": 0.22611036339165544, "grad_norm": 1.1115007945174045, "learning_rate": 9.025350265637816e-06, "loss": 0.2216, "step": 1512 }, { "epoch": 0.2262599072827875, "grad_norm": 1.1002289464689736, "learning_rate": 9.023913215387914e-06, "loss": 0.26, "step": 1513 }, { "epoch": 0.22640945117391956, "grad_norm": 4.707452449710372, "learning_rate": 9.022475221089194e-06, "loss": 0.2086, "step": 1514 }, { "epoch": 0.2265589950650516, "grad_norm": 1.1907478811651444, "learning_rate": 9.021036283079022e-06, "loss": 0.1712, "step": 1515 }, { "epoch": 0.22670853895618365, "grad_norm": 1.4943756417337315, "learning_rate": 9.019596401694984e-06, "loss": 0.3612, "step": 1516 }, { "epoch": 0.22685808284731568, "grad_norm": 1.8328029211172867, "learning_rate": 9.018155577274891e-06, "loss": 0.5014, "step": 1517 }, { "epoch": 0.22700762673844774, "grad_norm": 1.5769600203549734, "learning_rate": 9.016713810156777e-06, "loss": 0.3982, "step": 1518 }, { "epoch": 0.22715717062957977, "grad_norm": 1.033998264540829, "learning_rate": 9.015271100678892e-06, "loss": 0.1698, "step": 1519 }, { "epoch": 0.22730671452071183, "grad_norm": 1.4450333274104865, "learning_rate": 9.01382744917971e-06, "loss": 0.2241, "step": 1520 }, { "epoch": 0.2274562584118439, "grad_norm": 1.4009561553962124, "learning_rate": 9.012382855997925e-06, "loss": 0.318, "step": 1521 }, { "epoch": 0.22760580230297592, "grad_norm": 1.6380239076287348, "learning_rate": 9.010937321472454e-06, "loss": 0.3649, "step": 1522 }, { "epoch": 0.22775534619410798, "grad_norm": 1.8718410186357894, "learning_rate": 9.009490845942433e-06, "loss": 0.3845, "step": 1523 }, { "epoch": 0.22790489008524, "grad_norm": 1.474697832902752, "learning_rate": 9.00804342974722e-06, "loss": 0.3287, "step": 1524 }, { "epoch": 0.22805443397637207, "grad_norm": 1.9033242818165663, "learning_rate": 9.006595073226387e-06, "loss": 0.3614, "step": 1525 }, { "epoch": 0.2282039778675041, "grad_norm": 1.7576573706097778, "learning_rate": 9.005145776719742e-06, "loss": 0.4287, "step": 1526 }, { "epoch": 0.22835352175863616, "grad_norm": 1.646435656567765, "learning_rate": 9.003695540567294e-06, "loss": 0.3619, "step": 1527 }, { "epoch": 0.22850306564976822, "grad_norm": 1.6955358560275535, "learning_rate": 9.00224436510929e-06, "loss": 0.371, "step": 1528 }, { "epoch": 0.22865260954090025, "grad_norm": 1.658825841648886, "learning_rate": 9.000792250686186e-06, "loss": 0.3939, "step": 1529 }, { "epoch": 0.2288021534320323, "grad_norm": 1.5326543749267825, "learning_rate": 8.999339197638664e-06, "loss": 0.2482, "step": 1530 }, { "epoch": 0.22895169732316434, "grad_norm": 2.4642466957027174, "learning_rate": 8.99788520630762e-06, "loss": 0.4219, "step": 1531 }, { "epoch": 0.2291012412142964, "grad_norm": 1.532408533906101, "learning_rate": 8.99643027703418e-06, "loss": 0.3121, "step": 1532 }, { "epoch": 0.22925078510542846, "grad_norm": 1.068785213809154, "learning_rate": 8.994974410159682e-06, "loss": 0.2221, "step": 1533 }, { "epoch": 0.2294003289965605, "grad_norm": 1.6812794615422513, "learning_rate": 8.993517606025686e-06, "loss": 0.36, "step": 1534 }, { "epoch": 0.22954987288769255, "grad_norm": 1.3397673899297329, "learning_rate": 8.992059864973972e-06, "loss": 0.1707, "step": 1535 }, { "epoch": 0.22969941677882458, "grad_norm": 2.2086842094358445, "learning_rate": 8.990601187346542e-06, "loss": 0.2141, "step": 1536 }, { "epoch": 0.22984896066995664, "grad_norm": 1.9813196209128292, "learning_rate": 8.989141573485614e-06, "loss": 0.3664, "step": 1537 }, { "epoch": 0.22999850456108867, "grad_norm": 1.4968536528735594, "learning_rate": 8.987681023733628e-06, "loss": 0.212, "step": 1538 }, { "epoch": 0.23014804845222073, "grad_norm": 2.005340468193955, "learning_rate": 8.986219538433243e-06, "loss": 0.5171, "step": 1539 }, { "epoch": 0.23029759234335279, "grad_norm": 1.6056668375146956, "learning_rate": 8.984757117927337e-06, "loss": 0.367, "step": 1540 }, { "epoch": 0.23044713623448482, "grad_norm": 1.736972659173659, "learning_rate": 8.983293762559009e-06, "loss": 0.3054, "step": 1541 }, { "epoch": 0.23059668012561688, "grad_norm": 2.3554690220264893, "learning_rate": 8.981829472671576e-06, "loss": 0.48, "step": 1542 }, { "epoch": 0.2307462240167489, "grad_norm": 2.441414392076267, "learning_rate": 8.980364248608576e-06, "loss": 0.5507, "step": 1543 }, { "epoch": 0.23089576790788097, "grad_norm": 1.6179923206419418, "learning_rate": 8.97889809071376e-06, "loss": 0.1799, "step": 1544 }, { "epoch": 0.231045311799013, "grad_norm": 1.3522900238471818, "learning_rate": 8.977430999331108e-06, "loss": 0.2205, "step": 1545 }, { "epoch": 0.23119485569014506, "grad_norm": 1.5829130211417721, "learning_rate": 8.97596297480481e-06, "loss": 0.3255, "step": 1546 }, { "epoch": 0.23134439958127712, "grad_norm": 1.5826039702225925, "learning_rate": 8.974494017479281e-06, "loss": 0.2593, "step": 1547 }, { "epoch": 0.23149394347240915, "grad_norm": 1.498668084288196, "learning_rate": 8.973024127699152e-06, "loss": 0.3489, "step": 1548 }, { "epoch": 0.2316434873635412, "grad_norm": 1.753546348637437, "learning_rate": 8.971553305809274e-06, "loss": 0.1926, "step": 1549 }, { "epoch": 0.23179303125467324, "grad_norm": 1.434969286299199, "learning_rate": 8.970081552154714e-06, "loss": 0.3408, "step": 1550 }, { "epoch": 0.2319425751458053, "grad_norm": 0.8478352471965336, "learning_rate": 8.968608867080761e-06, "loss": 0.2296, "step": 1551 }, { "epoch": 0.23209211903693733, "grad_norm": 1.5740700313091822, "learning_rate": 8.967135250932921e-06, "loss": 0.2804, "step": 1552 }, { "epoch": 0.2322416629280694, "grad_norm": 1.5528025294287224, "learning_rate": 8.96566070405692e-06, "loss": 0.3515, "step": 1553 }, { "epoch": 0.23239120681920145, "grad_norm": 1.6156309602690202, "learning_rate": 8.964185226798696e-06, "loss": 0.3846, "step": 1554 }, { "epoch": 0.23254075071033348, "grad_norm": 1.900324205764928, "learning_rate": 8.962708819504415e-06, "loss": 0.3893, "step": 1555 }, { "epoch": 0.23269029460146554, "grad_norm": 1.6871831737787155, "learning_rate": 8.961231482520456e-06, "loss": 0.4995, "step": 1556 }, { "epoch": 0.23283983849259757, "grad_norm": 1.7659270863655478, "learning_rate": 8.959753216193415e-06, "loss": 0.3834, "step": 1557 }, { "epoch": 0.23298938238372963, "grad_norm": 1.4355628771765856, "learning_rate": 8.958274020870107e-06, "loss": 0.2458, "step": 1558 }, { "epoch": 0.23313892627486169, "grad_norm": 1.091740260422094, "learning_rate": 8.956793896897566e-06, "loss": 0.1625, "step": 1559 }, { "epoch": 0.23328847016599372, "grad_norm": 1.1925103613752697, "learning_rate": 8.955312844623045e-06, "loss": 0.1767, "step": 1560 }, { "epoch": 0.23343801405712578, "grad_norm": 1.8795430835063092, "learning_rate": 8.953830864394012e-06, "loss": 0.3983, "step": 1561 }, { "epoch": 0.2335875579482578, "grad_norm": 1.4373449366320268, "learning_rate": 8.952347956558152e-06, "loss": 0.2384, "step": 1562 }, { "epoch": 0.23373710183938987, "grad_norm": 1.4903754933686846, "learning_rate": 8.950864121463374e-06, "loss": 0.2191, "step": 1563 }, { "epoch": 0.2338866457305219, "grad_norm": 1.8744944645931436, "learning_rate": 8.949379359457795e-06, "loss": 0.4788, "step": 1564 }, { "epoch": 0.23403618962165396, "grad_norm": 2.30369719872738, "learning_rate": 8.947893670889756e-06, "loss": 0.5508, "step": 1565 }, { "epoch": 0.23418573351278602, "grad_norm": 1.5228147309739217, "learning_rate": 8.946407056107815e-06, "loss": 0.2744, "step": 1566 }, { "epoch": 0.23433527740391805, "grad_norm": 0.9855113226049245, "learning_rate": 8.944919515460746e-06, "loss": 0.1762, "step": 1567 }, { "epoch": 0.2344848212950501, "grad_norm": 2.0244671473740476, "learning_rate": 8.943431049297542e-06, "loss": 0.4977, "step": 1568 }, { "epoch": 0.23463436518618214, "grad_norm": 1.9053040838783037, "learning_rate": 8.941941657967408e-06, "loss": 0.3878, "step": 1569 }, { "epoch": 0.2347839090773142, "grad_norm": 1.2593264871558218, "learning_rate": 8.94045134181977e-06, "loss": 0.2085, "step": 1570 }, { "epoch": 0.23493345296844623, "grad_norm": 1.4409087053787786, "learning_rate": 8.938960101204273e-06, "loss": 0.392, "step": 1571 }, { "epoch": 0.23508299685957829, "grad_norm": 0.9533079750475656, "learning_rate": 8.937467936470773e-06, "loss": 0.161, "step": 1572 }, { "epoch": 0.23523254075071034, "grad_norm": 1.3187507218092258, "learning_rate": 8.935974847969352e-06, "loss": 0.228, "step": 1573 }, { "epoch": 0.23538208464184238, "grad_norm": 1.4711338301621406, "learning_rate": 8.934480836050297e-06, "loss": 0.2065, "step": 1574 }, { "epoch": 0.23553162853297444, "grad_norm": 1.6346673332024881, "learning_rate": 8.93298590106412e-06, "loss": 0.3639, "step": 1575 }, { "epoch": 0.23568117242410647, "grad_norm": 1.519040249043761, "learning_rate": 8.931490043361546e-06, "loss": 0.3451, "step": 1576 }, { "epoch": 0.23583071631523853, "grad_norm": 1.5939490670813292, "learning_rate": 8.929993263293519e-06, "loss": 0.3313, "step": 1577 }, { "epoch": 0.23598026020637056, "grad_norm": 2.4862093840935584, "learning_rate": 8.928495561211199e-06, "loss": 0.5844, "step": 1578 }, { "epoch": 0.23612980409750262, "grad_norm": 0.9227521449231808, "learning_rate": 8.92699693746596e-06, "loss": 0.1858, "step": 1579 }, { "epoch": 0.23627934798863467, "grad_norm": 1.7904495917281569, "learning_rate": 8.925497392409392e-06, "loss": 0.4007, "step": 1580 }, { "epoch": 0.2364288918797667, "grad_norm": 1.6104758349435173, "learning_rate": 8.923996926393306e-06, "loss": 0.2506, "step": 1581 }, { "epoch": 0.23657843577089877, "grad_norm": 1.162466771602011, "learning_rate": 8.922495539769722e-06, "loss": 0.2251, "step": 1582 }, { "epoch": 0.2367279796620308, "grad_norm": 1.4044418047951504, "learning_rate": 8.920993232890885e-06, "loss": 0.3529, "step": 1583 }, { "epoch": 0.23687752355316286, "grad_norm": 1.3430242838204132, "learning_rate": 8.919490006109243e-06, "loss": 0.213, "step": 1584 }, { "epoch": 0.23702706744429491, "grad_norm": 2.047091159617954, "learning_rate": 8.917985859777477e-06, "loss": 0.4145, "step": 1585 }, { "epoch": 0.23717661133542695, "grad_norm": 2.0721630966321247, "learning_rate": 8.916480794248465e-06, "loss": 0.5021, "step": 1586 }, { "epoch": 0.237326155226559, "grad_norm": 1.5816352819652573, "learning_rate": 8.914974809875317e-06, "loss": 0.3858, "step": 1587 }, { "epoch": 0.23747569911769104, "grad_norm": 1.8550669988064394, "learning_rate": 8.913467907011349e-06, "loss": 0.4832, "step": 1588 }, { "epoch": 0.2376252430088231, "grad_norm": 1.6366187681919886, "learning_rate": 8.911960086010091e-06, "loss": 0.3524, "step": 1589 }, { "epoch": 0.23777478689995513, "grad_norm": 1.4416460700943408, "learning_rate": 8.910451347225298e-06, "loss": 0.3926, "step": 1590 }, { "epoch": 0.23792433079108719, "grad_norm": 1.564579376384169, "learning_rate": 8.908941691010933e-06, "loss": 0.2312, "step": 1591 }, { "epoch": 0.23807387468221924, "grad_norm": 1.2944939224073606, "learning_rate": 8.907431117721175e-06, "loss": 0.2245, "step": 1592 }, { "epoch": 0.23822341857335128, "grad_norm": 1.6975131715816902, "learning_rate": 8.905919627710419e-06, "loss": 0.3363, "step": 1593 }, { "epoch": 0.23837296246448333, "grad_norm": 1.596361094662525, "learning_rate": 8.904407221333275e-06, "loss": 0.3658, "step": 1594 }, { "epoch": 0.23852250635561537, "grad_norm": 1.4858572761749074, "learning_rate": 8.902893898944571e-06, "loss": 0.2827, "step": 1595 }, { "epoch": 0.23867205024674742, "grad_norm": 2.4979339586019464, "learning_rate": 8.901379660899343e-06, "loss": 0.5078, "step": 1596 }, { "epoch": 0.23882159413787946, "grad_norm": 1.647641401011452, "learning_rate": 8.899864507552846e-06, "loss": 0.3907, "step": 1597 }, { "epoch": 0.23897113802901152, "grad_norm": 1.604466675493429, "learning_rate": 8.898348439260553e-06, "loss": 0.4351, "step": 1598 }, { "epoch": 0.23912068192014357, "grad_norm": 1.158380693426154, "learning_rate": 8.896831456378148e-06, "loss": 0.2045, "step": 1599 }, { "epoch": 0.2392702258112756, "grad_norm": 1.6498746618436277, "learning_rate": 8.895313559261525e-06, "loss": 0.3361, "step": 1600 }, { "epoch": 0.23941976970240766, "grad_norm": 1.237277087089687, "learning_rate": 8.893794748266801e-06, "loss": 0.2327, "step": 1601 }, { "epoch": 0.2395693135935397, "grad_norm": 1.7276131272814355, "learning_rate": 8.892275023750305e-06, "loss": 0.3676, "step": 1602 }, { "epoch": 0.23971885748467175, "grad_norm": 1.0737115286717451, "learning_rate": 8.890754386068577e-06, "loss": 0.1695, "step": 1603 }, { "epoch": 0.23986840137580379, "grad_norm": 1.7893197216212124, "learning_rate": 8.889232835578372e-06, "loss": 0.2075, "step": 1604 }, { "epoch": 0.24001794526693584, "grad_norm": 1.0316754586933763, "learning_rate": 8.887710372636662e-06, "loss": 0.1818, "step": 1605 }, { "epoch": 0.2401674891580679, "grad_norm": 1.908936676793948, "learning_rate": 8.886186997600633e-06, "loss": 0.4014, "step": 1606 }, { "epoch": 0.24031703304919994, "grad_norm": 1.8121753558090572, "learning_rate": 8.884662710827679e-06, "loss": 0.5739, "step": 1607 }, { "epoch": 0.240466576940332, "grad_norm": 1.5081453030204104, "learning_rate": 8.883137512675417e-06, "loss": 0.2058, "step": 1608 }, { "epoch": 0.24061612083146403, "grad_norm": 1.6968786810383643, "learning_rate": 8.88161140350167e-06, "loss": 0.3992, "step": 1609 }, { "epoch": 0.24076566472259608, "grad_norm": 1.0441105843798988, "learning_rate": 8.880084383664481e-06, "loss": 0.1736, "step": 1610 }, { "epoch": 0.24091520861372812, "grad_norm": 1.4531784381064643, "learning_rate": 8.8785564535221e-06, "loss": 0.2213, "step": 1611 }, { "epoch": 0.24106475250486017, "grad_norm": 1.6179328389513437, "learning_rate": 8.877027613432997e-06, "loss": 0.4397, "step": 1612 }, { "epoch": 0.24121429639599223, "grad_norm": 1.3779127226717234, "learning_rate": 8.875497863755851e-06, "loss": 0.2095, "step": 1613 }, { "epoch": 0.24136384028712426, "grad_norm": 1.4488714989705522, "learning_rate": 8.873967204849556e-06, "loss": 0.3495, "step": 1614 }, { "epoch": 0.24151338417825632, "grad_norm": 1.699118467638372, "learning_rate": 8.872435637073223e-06, "loss": 0.2042, "step": 1615 }, { "epoch": 0.24166292806938836, "grad_norm": 1.5724198212446476, "learning_rate": 8.870903160786166e-06, "loss": 0.358, "step": 1616 }, { "epoch": 0.24181247196052041, "grad_norm": 1.8735086152337022, "learning_rate": 8.869369776347923e-06, "loss": 0.352, "step": 1617 }, { "epoch": 0.24196201585165247, "grad_norm": 2.071130867923683, "learning_rate": 8.867835484118241e-06, "loss": 0.369, "step": 1618 }, { "epoch": 0.2421115597427845, "grad_norm": 1.00009701768564, "learning_rate": 8.866300284457078e-06, "loss": 0.1362, "step": 1619 }, { "epoch": 0.24226110363391656, "grad_norm": 1.2342660250726913, "learning_rate": 8.864764177724607e-06, "loss": 0.228, "step": 1620 }, { "epoch": 0.2424106475250486, "grad_norm": 1.181191216278982, "learning_rate": 8.863227164281214e-06, "loss": 0.185, "step": 1621 }, { "epoch": 0.24256019141618065, "grad_norm": 1.600687165751174, "learning_rate": 8.861689244487497e-06, "loss": 0.198, "step": 1622 }, { "epoch": 0.24270973530731269, "grad_norm": 1.8325486581074766, "learning_rate": 8.860150418704268e-06, "loss": 0.3872, "step": 1623 }, { "epoch": 0.24285927919844474, "grad_norm": 1.8934954141642377, "learning_rate": 8.858610687292548e-06, "loss": 0.4214, "step": 1624 }, { "epoch": 0.2430088230895768, "grad_norm": 2.2505887638767184, "learning_rate": 8.857070050613573e-06, "loss": 0.2427, "step": 1625 }, { "epoch": 0.24315836698070883, "grad_norm": 1.7156119494211053, "learning_rate": 8.855528509028793e-06, "loss": 0.3129, "step": 1626 }, { "epoch": 0.2433079108718409, "grad_norm": 1.1926433487919907, "learning_rate": 8.853986062899869e-06, "loss": 0.1944, "step": 1627 }, { "epoch": 0.24345745476297292, "grad_norm": 1.4955836838428689, "learning_rate": 8.852442712588671e-06, "loss": 0.4259, "step": 1628 }, { "epoch": 0.24360699865410498, "grad_norm": 1.5899010145338972, "learning_rate": 8.850898458457284e-06, "loss": 0.1922, "step": 1629 }, { "epoch": 0.24375654254523701, "grad_norm": 1.7661527081371415, "learning_rate": 8.849353300868007e-06, "loss": 0.3413, "step": 1630 }, { "epoch": 0.24390608643636907, "grad_norm": 1.0197540346721858, "learning_rate": 8.847807240183349e-06, "loss": 0.2392, "step": 1631 }, { "epoch": 0.24405563032750113, "grad_norm": 2.304126832944234, "learning_rate": 8.84626027676603e-06, "loss": 0.6652, "step": 1632 }, { "epoch": 0.24420517421863316, "grad_norm": 1.730517756971552, "learning_rate": 8.844712410978981e-06, "loss": 0.2769, "step": 1633 }, { "epoch": 0.24435471810976522, "grad_norm": 1.7941803588719758, "learning_rate": 8.843163643185347e-06, "loss": 0.5527, "step": 1634 }, { "epoch": 0.24450426200089725, "grad_norm": 1.5034412891428997, "learning_rate": 8.841613973748486e-06, "loss": 0.2425, "step": 1635 }, { "epoch": 0.2446538058920293, "grad_norm": 1.8387720627371986, "learning_rate": 8.840063403031962e-06, "loss": 0.4814, "step": 1636 }, { "epoch": 0.24480334978316134, "grad_norm": 1.6058048815286128, "learning_rate": 8.838511931399557e-06, "loss": 0.205, "step": 1637 }, { "epoch": 0.2449528936742934, "grad_norm": 1.6535570636461936, "learning_rate": 8.83695955921526e-06, "loss": 0.3842, "step": 1638 }, { "epoch": 0.24510243756542546, "grad_norm": 2.0495436648956677, "learning_rate": 8.83540628684327e-06, "loss": 0.588, "step": 1639 }, { "epoch": 0.2452519814565575, "grad_norm": 1.932080426112819, "learning_rate": 8.833852114648006e-06, "loss": 0.2566, "step": 1640 }, { "epoch": 0.24540152534768955, "grad_norm": 1.9660531070561371, "learning_rate": 8.832297042994083e-06, "loss": 0.5196, "step": 1641 }, { "epoch": 0.24555106923882158, "grad_norm": 1.4040557060017893, "learning_rate": 8.830741072246343e-06, "loss": 0.2041, "step": 1642 }, { "epoch": 0.24570061312995364, "grad_norm": 2.1028771117617637, "learning_rate": 8.829184202769828e-06, "loss": 0.2795, "step": 1643 }, { "epoch": 0.2458501570210857, "grad_norm": 1.379854611664084, "learning_rate": 8.827626434929796e-06, "loss": 0.1597, "step": 1644 }, { "epoch": 0.24599970091221773, "grad_norm": 1.5617312208997327, "learning_rate": 8.826067769091715e-06, "loss": 0.3839, "step": 1645 }, { "epoch": 0.2461492448033498, "grad_norm": 1.7647416850681912, "learning_rate": 8.824508205621263e-06, "loss": 0.373, "step": 1646 }, { "epoch": 0.24629878869448182, "grad_norm": 2.0755613511333406, "learning_rate": 8.822947744884326e-06, "loss": 0.5092, "step": 1647 }, { "epoch": 0.24644833258561388, "grad_norm": 2.235044525778246, "learning_rate": 8.821386387247006e-06, "loss": 0.6294, "step": 1648 }, { "epoch": 0.24659787647674591, "grad_norm": 1.6548207522042184, "learning_rate": 8.81982413307561e-06, "loss": 0.2872, "step": 1649 }, { "epoch": 0.24674742036787797, "grad_norm": 0.9460460785889475, "learning_rate": 8.818260982736662e-06, "loss": 0.2059, "step": 1650 }, { "epoch": 0.24689696425901003, "grad_norm": 1.596549339690058, "learning_rate": 8.816696936596887e-06, "loss": 0.2146, "step": 1651 }, { "epoch": 0.24704650815014206, "grad_norm": 1.7405302714522426, "learning_rate": 8.815131995023228e-06, "loss": 0.2358, "step": 1652 }, { "epoch": 0.24719605204127412, "grad_norm": 1.71942999566363, "learning_rate": 8.813566158382835e-06, "loss": 0.4195, "step": 1653 }, { "epoch": 0.24734559593240615, "grad_norm": 1.8118502915807313, "learning_rate": 8.81199942704307e-06, "loss": 0.3486, "step": 1654 }, { "epoch": 0.2474951398235382, "grad_norm": 1.0992755654244406, "learning_rate": 8.810431801371501e-06, "loss": 0.2063, "step": 1655 }, { "epoch": 0.24764468371467024, "grad_norm": 3.017300017050166, "learning_rate": 8.80886328173591e-06, "loss": 0.4337, "step": 1656 }, { "epoch": 0.2477942276058023, "grad_norm": 1.6022895744312384, "learning_rate": 8.807293868504282e-06, "loss": 0.3729, "step": 1657 }, { "epoch": 0.24794377149693436, "grad_norm": 1.6149891917484116, "learning_rate": 8.805723562044825e-06, "loss": 0.4056, "step": 1658 }, { "epoch": 0.2480933153880664, "grad_norm": 1.6145150498338217, "learning_rate": 8.80415236272594e-06, "loss": 0.3016, "step": 1659 }, { "epoch": 0.24824285927919845, "grad_norm": 1.1459038449035153, "learning_rate": 8.80258027091625e-06, "loss": 0.2218, "step": 1660 }, { "epoch": 0.24839240317033048, "grad_norm": 1.591564655332456, "learning_rate": 8.801007286984581e-06, "loss": 0.245, "step": 1661 }, { "epoch": 0.24854194706146254, "grad_norm": 1.7569520327773172, "learning_rate": 8.799433411299971e-06, "loss": 0.4247, "step": 1662 }, { "epoch": 0.24869149095259457, "grad_norm": 1.064692259533788, "learning_rate": 8.797858644231666e-06, "loss": 0.2029, "step": 1663 }, { "epoch": 0.24884103484372663, "grad_norm": 1.7207168422676138, "learning_rate": 8.796282986149123e-06, "loss": 0.4056, "step": 1664 }, { "epoch": 0.2489905787348587, "grad_norm": 1.8670534021913596, "learning_rate": 8.794706437422004e-06, "loss": 0.4252, "step": 1665 }, { "epoch": 0.24914012262599072, "grad_norm": 1.1847022204061899, "learning_rate": 8.793128998420183e-06, "loss": 0.2108, "step": 1666 }, { "epoch": 0.24928966651712278, "grad_norm": 1.7975419112991038, "learning_rate": 8.791550669513746e-06, "loss": 0.335, "step": 1667 }, { "epoch": 0.2494392104082548, "grad_norm": 1.5925842023054706, "learning_rate": 8.789971451072979e-06, "loss": 0.3249, "step": 1668 }, { "epoch": 0.24958875429938687, "grad_norm": 1.6010634947496465, "learning_rate": 8.788391343468385e-06, "loss": 0.395, "step": 1669 }, { "epoch": 0.2497382981905189, "grad_norm": 1.6989646502396285, "learning_rate": 8.78681034707067e-06, "loss": 0.2221, "step": 1670 }, { "epoch": 0.24988784208165096, "grad_norm": 1.1982003401309609, "learning_rate": 8.785228462250755e-06, "loss": 0.1606, "step": 1671 }, { "epoch": 0.250037385972783, "grad_norm": 1.504124635470649, "learning_rate": 8.783645689379763e-06, "loss": 0.2759, "step": 1672 }, { "epoch": 0.25018692986391505, "grad_norm": 1.1944052488188541, "learning_rate": 8.782062028829028e-06, "loss": 0.1768, "step": 1673 }, { "epoch": 0.2503364737550471, "grad_norm": 2.329098645094478, "learning_rate": 8.78047748097009e-06, "loss": 0.8662, "step": 1674 }, { "epoch": 0.25048601764617917, "grad_norm": 1.0556820313003359, "learning_rate": 8.778892046174703e-06, "loss": 0.1784, "step": 1675 }, { "epoch": 0.2506355615373112, "grad_norm": 2.158513791608413, "learning_rate": 8.777305724814823e-06, "loss": 0.2676, "step": 1676 }, { "epoch": 0.25078510542844323, "grad_norm": 1.4537227026901378, "learning_rate": 8.775718517262616e-06, "loss": 0.3745, "step": 1677 }, { "epoch": 0.2509346493195753, "grad_norm": 0.950798082633677, "learning_rate": 8.774130423890457e-06, "loss": 0.1824, "step": 1678 }, { "epoch": 0.25108419321070735, "grad_norm": 1.545457884426641, "learning_rate": 8.77254144507093e-06, "loss": 0.3852, "step": 1679 }, { "epoch": 0.2512337371018394, "grad_norm": 1.319903070406757, "learning_rate": 8.770951581176819e-06, "loss": 0.1928, "step": 1680 }, { "epoch": 0.2513832809929714, "grad_norm": 1.6348677618098524, "learning_rate": 8.769360832581127e-06, "loss": 0.2614, "step": 1681 }, { "epoch": 0.2515328248841035, "grad_norm": 1.2973642919936317, "learning_rate": 8.767769199657056e-06, "loss": 0.1926, "step": 1682 }, { "epoch": 0.25168236877523553, "grad_norm": 2.0805063872419214, "learning_rate": 8.766176682778021e-06, "loss": 0.6252, "step": 1683 }, { "epoch": 0.25183191266636756, "grad_norm": 1.5452777534643873, "learning_rate": 8.76458328231764e-06, "loss": 0.3641, "step": 1684 }, { "epoch": 0.25198145655749965, "grad_norm": 2.031081999653712, "learning_rate": 8.76298899864974e-06, "loss": 0.6777, "step": 1685 }, { "epoch": 0.2521310004486317, "grad_norm": 1.4412028187219246, "learning_rate": 8.761393832148355e-06, "loss": 0.3482, "step": 1686 }, { "epoch": 0.2522805443397637, "grad_norm": 1.756325567561018, "learning_rate": 8.759797783187728e-06, "loss": 0.486, "step": 1687 }, { "epoch": 0.25243008823089574, "grad_norm": 1.4179914068088109, "learning_rate": 8.758200852142306e-06, "loss": 0.3271, "step": 1688 }, { "epoch": 0.25257963212202783, "grad_norm": 1.5841241720765276, "learning_rate": 8.756603039386744e-06, "loss": 0.2038, "step": 1689 }, { "epoch": 0.25272917601315986, "grad_norm": 2.0246116663194687, "learning_rate": 8.755004345295906e-06, "loss": 0.5168, "step": 1690 }, { "epoch": 0.2528787199042919, "grad_norm": 1.4410735414416371, "learning_rate": 8.753404770244861e-06, "loss": 0.2331, "step": 1691 }, { "epoch": 0.253028263795424, "grad_norm": 1.6406911200868926, "learning_rate": 8.751804314608885e-06, "loss": 0.3777, "step": 1692 }, { "epoch": 0.253177807686556, "grad_norm": 1.4803860988544733, "learning_rate": 8.750202978763455e-06, "loss": 0.3677, "step": 1693 }, { "epoch": 0.25332735157768804, "grad_norm": 1.1771068735919579, "learning_rate": 8.748600763084267e-06, "loss": 0.3354, "step": 1694 }, { "epoch": 0.2534768954688201, "grad_norm": 1.364747322095615, "learning_rate": 8.746997667947215e-06, "loss": 0.2101, "step": 1695 }, { "epoch": 0.25362643935995216, "grad_norm": 1.0147659263108189, "learning_rate": 8.745393693728395e-06, "loss": 0.2104, "step": 1696 }, { "epoch": 0.2537759832510842, "grad_norm": 1.3008901060796498, "learning_rate": 8.74378884080412e-06, "loss": 0.2224, "step": 1697 }, { "epoch": 0.2539255271422162, "grad_norm": 1.5410622225873059, "learning_rate": 8.742183109550906e-06, "loss": 0.3175, "step": 1698 }, { "epoch": 0.2540750710333483, "grad_norm": 1.1271133540339682, "learning_rate": 8.740576500345465e-06, "loss": 0.2078, "step": 1699 }, { "epoch": 0.25422461492448034, "grad_norm": 1.5002212907557977, "learning_rate": 8.73896901356473e-06, "loss": 0.3457, "step": 1700 }, { "epoch": 0.2543741588156124, "grad_norm": 1.5194450466740133, "learning_rate": 8.737360649585831e-06, "loss": 0.2974, "step": 1701 }, { "epoch": 0.2545237027067444, "grad_norm": 1.541411347823068, "learning_rate": 8.735751408786106e-06, "loss": 0.1909, "step": 1702 }, { "epoch": 0.2546732465978765, "grad_norm": 1.0383311616312054, "learning_rate": 8.734141291543096e-06, "loss": 0.1968, "step": 1703 }, { "epoch": 0.2548227904890085, "grad_norm": 1.1554526724480272, "learning_rate": 8.732530298234551e-06, "loss": 0.3622, "step": 1704 }, { "epoch": 0.25497233438014055, "grad_norm": 1.5935676507788412, "learning_rate": 8.730918429238429e-06, "loss": 0.4022, "step": 1705 }, { "epoch": 0.25512187827127264, "grad_norm": 1.7826101439006568, "learning_rate": 8.729305684932884e-06, "loss": 0.4635, "step": 1706 }, { "epoch": 0.25527142216240467, "grad_norm": 1.9360511884951532, "learning_rate": 8.727692065696286e-06, "loss": 0.5279, "step": 1707 }, { "epoch": 0.2554209660535367, "grad_norm": 2.2688771091405355, "learning_rate": 8.726077571907205e-06, "loss": 0.4807, "step": 1708 }, { "epoch": 0.25557050994466873, "grad_norm": 1.1500886235737136, "learning_rate": 8.724462203944417e-06, "loss": 0.1742, "step": 1709 }, { "epoch": 0.2557200538358008, "grad_norm": 2.1581902257611505, "learning_rate": 8.7228459621869e-06, "loss": 0.2518, "step": 1710 }, { "epoch": 0.25586959772693285, "grad_norm": 1.5158538301505384, "learning_rate": 8.721228847013844e-06, "loss": 0.4283, "step": 1711 }, { "epoch": 0.2560191416180649, "grad_norm": 1.1144613603300342, "learning_rate": 8.719610858804634e-06, "loss": 0.1865, "step": 1712 }, { "epoch": 0.25616868550919697, "grad_norm": 1.712020727078079, "learning_rate": 8.717991997938872e-06, "loss": 0.4059, "step": 1713 }, { "epoch": 0.256318229400329, "grad_norm": 1.3838237147007102, "learning_rate": 8.716372264796355e-06, "loss": 0.3562, "step": 1714 }, { "epoch": 0.25646777329146103, "grad_norm": 2.009760245861866, "learning_rate": 8.71475165975709e-06, "loss": 0.6606, "step": 1715 }, { "epoch": 0.2566173171825931, "grad_norm": 1.4629059301831144, "learning_rate": 8.713130183201283e-06, "loss": 0.204, "step": 1716 }, { "epoch": 0.25676686107372515, "grad_norm": 1.712741449487023, "learning_rate": 8.711507835509352e-06, "loss": 0.3339, "step": 1717 }, { "epoch": 0.2569164049648572, "grad_norm": 1.7719128226224596, "learning_rate": 8.709884617061912e-06, "loss": 0.4942, "step": 1718 }, { "epoch": 0.2570659488559892, "grad_norm": 2.5501496729214645, "learning_rate": 8.708260528239788e-06, "loss": 0.1606, "step": 1719 }, { "epoch": 0.2572154927471213, "grad_norm": 1.3379881043667554, "learning_rate": 8.706635569424007e-06, "loss": 0.1771, "step": 1720 }, { "epoch": 0.25736503663825333, "grad_norm": 1.3113682840451848, "learning_rate": 8.705009740995798e-06, "loss": 0.1974, "step": 1721 }, { "epoch": 0.25751458052938536, "grad_norm": 1.3252114693451582, "learning_rate": 8.703383043336598e-06, "loss": 0.1857, "step": 1722 }, { "epoch": 0.25766412442051745, "grad_norm": 1.2740861405038273, "learning_rate": 8.701755476828045e-06, "loss": 0.179, "step": 1723 }, { "epoch": 0.2578136683116495, "grad_norm": 1.106234026109271, "learning_rate": 8.700127041851983e-06, "loss": 0.2017, "step": 1724 }, { "epoch": 0.2579632122027815, "grad_norm": 1.6172632969033804, "learning_rate": 8.698497738790456e-06, "loss": 0.3627, "step": 1725 }, { "epoch": 0.25811275609391354, "grad_norm": 1.1499075369100074, "learning_rate": 8.696867568025715e-06, "loss": 0.2216, "step": 1726 }, { "epoch": 0.25826229998504563, "grad_norm": 1.360793130358255, "learning_rate": 8.695236529940217e-06, "loss": 0.2168, "step": 1727 }, { "epoch": 0.25841184387617766, "grad_norm": 1.5474318214998557, "learning_rate": 8.693604624916614e-06, "loss": 0.3872, "step": 1728 }, { "epoch": 0.2585613877673097, "grad_norm": 1.603910943390027, "learning_rate": 8.691971853337772e-06, "loss": 0.3828, "step": 1729 }, { "epoch": 0.2587109316584418, "grad_norm": 1.6171942954911174, "learning_rate": 8.69033821558675e-06, "loss": 0.3922, "step": 1730 }, { "epoch": 0.2588604755495738, "grad_norm": 1.2280233394489264, "learning_rate": 8.68870371204682e-06, "loss": 0.234, "step": 1731 }, { "epoch": 0.25901001944070584, "grad_norm": 1.495515188592798, "learning_rate": 8.687068343101449e-06, "loss": 0.3597, "step": 1732 }, { "epoch": 0.25915956333183787, "grad_norm": 1.2004438690731185, "learning_rate": 8.685432109134309e-06, "loss": 0.3891, "step": 1733 }, { "epoch": 0.25930910722296996, "grad_norm": 1.1612103655288195, "learning_rate": 8.68379501052928e-06, "loss": 0.1674, "step": 1734 }, { "epoch": 0.259458651114102, "grad_norm": 1.5379960884614503, "learning_rate": 8.682157047670439e-06, "loss": 0.2165, "step": 1735 }, { "epoch": 0.259608195005234, "grad_norm": 1.256481316250901, "learning_rate": 8.68051822094207e-06, "loss": 0.1956, "step": 1736 }, { "epoch": 0.2597577388963661, "grad_norm": 1.150195060615946, "learning_rate": 8.678878530728653e-06, "loss": 0.2046, "step": 1737 }, { "epoch": 0.25990728278749814, "grad_norm": 0.9645721264599928, "learning_rate": 8.677237977414879e-06, "loss": 0.2169, "step": 1738 }, { "epoch": 0.26005682667863017, "grad_norm": 2.055210192087077, "learning_rate": 8.675596561385637e-06, "loss": 0.5667, "step": 1739 }, { "epoch": 0.2602063705697622, "grad_norm": 1.6323123777977344, "learning_rate": 8.67395428302602e-06, "loss": 0.2085, "step": 1740 }, { "epoch": 0.2603559144608943, "grad_norm": 2.0705229310389734, "learning_rate": 8.67231114272132e-06, "loss": 0.6948, "step": 1741 }, { "epoch": 0.2605054583520263, "grad_norm": 1.4253846439937559, "learning_rate": 8.670667140857034e-06, "loss": 0.2156, "step": 1742 }, { "epoch": 0.26065500224315835, "grad_norm": 1.2432278896167281, "learning_rate": 8.669022277818861e-06, "loss": 0.1668, "step": 1743 }, { "epoch": 0.26080454613429044, "grad_norm": 2.1626989048511396, "learning_rate": 8.6673765539927e-06, "loss": 0.5859, "step": 1744 }, { "epoch": 0.26095409002542247, "grad_norm": 1.3212936003748748, "learning_rate": 8.66572996976466e-06, "loss": 0.2208, "step": 1745 }, { "epoch": 0.2611036339165545, "grad_norm": 1.6949012011313151, "learning_rate": 8.66408252552104e-06, "loss": 0.3465, "step": 1746 }, { "epoch": 0.26125317780768653, "grad_norm": 1.6334321794556426, "learning_rate": 8.662434221648344e-06, "loss": 0.3536, "step": 1747 }, { "epoch": 0.2614027216988186, "grad_norm": 1.4554313973278277, "learning_rate": 8.660785058533288e-06, "loss": 0.1853, "step": 1748 }, { "epoch": 0.26155226558995065, "grad_norm": 1.4889399004698576, "learning_rate": 8.659135036562774e-06, "loss": 0.3361, "step": 1749 }, { "epoch": 0.2617018094810827, "grad_norm": 1.2820685580565203, "learning_rate": 8.657484156123916e-06, "loss": 0.2098, "step": 1750 }, { "epoch": 0.26185135337221477, "grad_norm": 1.4528196588291855, "learning_rate": 8.655832417604028e-06, "loss": 0.3716, "step": 1751 }, { "epoch": 0.2620008972633468, "grad_norm": 1.1976317675054606, "learning_rate": 8.65417982139062e-06, "loss": 0.2427, "step": 1752 }, { "epoch": 0.26215044115447883, "grad_norm": 1.5994876354129555, "learning_rate": 8.652526367871412e-06, "loss": 0.4616, "step": 1753 }, { "epoch": 0.26229998504561086, "grad_norm": 1.4225347188398327, "learning_rate": 8.650872057434316e-06, "loss": 0.2385, "step": 1754 }, { "epoch": 0.26244952893674295, "grad_norm": 1.6636967962332678, "learning_rate": 8.64921689046745e-06, "loss": 0.2447, "step": 1755 }, { "epoch": 0.262599072827875, "grad_norm": 1.5225788924725037, "learning_rate": 8.647560867359133e-06, "loss": 0.3694, "step": 1756 }, { "epoch": 0.262748616719007, "grad_norm": 1.2202525703548277, "learning_rate": 8.645903988497884e-06, "loss": 0.187, "step": 1757 }, { "epoch": 0.2628981606101391, "grad_norm": 1.9853373281678817, "learning_rate": 8.644246254272423e-06, "loss": 0.2257, "step": 1758 }, { "epoch": 0.26304770450127113, "grad_norm": 1.1859244322025415, "learning_rate": 8.64258766507167e-06, "loss": 0.2107, "step": 1759 }, { "epoch": 0.26319724839240316, "grad_norm": 1.610055608255526, "learning_rate": 8.640928221284744e-06, "loss": 0.3033, "step": 1760 }, { "epoch": 0.2633467922835352, "grad_norm": 1.322382799898529, "learning_rate": 8.63926792330097e-06, "loss": 0.219, "step": 1761 }, { "epoch": 0.2634963361746673, "grad_norm": 1.5892475425535593, "learning_rate": 8.637606771509868e-06, "loss": 0.3678, "step": 1762 }, { "epoch": 0.2636458800657993, "grad_norm": 1.6038277957567881, "learning_rate": 8.635944766301158e-06, "loss": 0.3819, "step": 1763 }, { "epoch": 0.26379542395693134, "grad_norm": 1.7817364162394536, "learning_rate": 8.634281908064767e-06, "loss": 0.3632, "step": 1764 }, { "epoch": 0.2639449678480634, "grad_norm": 1.8816182001166262, "learning_rate": 8.632618197190817e-06, "loss": 0.4874, "step": 1765 }, { "epoch": 0.26409451173919546, "grad_norm": 1.7637033301301739, "learning_rate": 8.630953634069627e-06, "loss": 0.2109, "step": 1766 }, { "epoch": 0.2642440556303275, "grad_norm": 1.4820279840766182, "learning_rate": 8.629288219091722e-06, "loss": 0.2482, "step": 1767 }, { "epoch": 0.2643935995214595, "grad_norm": 1.3257383394968065, "learning_rate": 8.627621952647825e-06, "loss": 0.2066, "step": 1768 }, { "epoch": 0.2645431434125916, "grad_norm": 1.6804248433426907, "learning_rate": 8.625954835128856e-06, "loss": 0.2436, "step": 1769 }, { "epoch": 0.26469268730372364, "grad_norm": 1.8362570342532276, "learning_rate": 8.624286866925938e-06, "loss": 0.3694, "step": 1770 }, { "epoch": 0.26484223119485567, "grad_norm": 1.5661865351400834, "learning_rate": 8.622618048430391e-06, "loss": 0.3591, "step": 1771 }, { "epoch": 0.26499177508598776, "grad_norm": 2.1673241284131044, "learning_rate": 8.62094838003374e-06, "loss": 0.5737, "step": 1772 }, { "epoch": 0.2651413189771198, "grad_norm": 1.0699484499241998, "learning_rate": 8.619277862127702e-06, "loss": 0.2189, "step": 1773 }, { "epoch": 0.2652908628682518, "grad_norm": 1.736379723484864, "learning_rate": 8.617606495104198e-06, "loss": 0.4754, "step": 1774 }, { "epoch": 0.2654404067593839, "grad_norm": 1.583421637766384, "learning_rate": 8.615934279355345e-06, "loss": 0.3366, "step": 1775 }, { "epoch": 0.26558995065051594, "grad_norm": 1.6478010096837548, "learning_rate": 8.614261215273462e-06, "loss": 0.3672, "step": 1776 }, { "epoch": 0.26573949454164797, "grad_norm": 1.9102927551172686, "learning_rate": 8.612587303251065e-06, "loss": 0.4308, "step": 1777 }, { "epoch": 0.26588903843278, "grad_norm": 1.013647409926638, "learning_rate": 8.610912543680872e-06, "loss": 0.2259, "step": 1778 }, { "epoch": 0.2660385823239121, "grad_norm": 0.9074429437734283, "learning_rate": 8.609236936955797e-06, "loss": 0.1942, "step": 1779 }, { "epoch": 0.2661881262150441, "grad_norm": 1.1128832323090567, "learning_rate": 8.607560483468953e-06, "loss": 0.2495, "step": 1780 }, { "epoch": 0.26633767010617615, "grad_norm": 1.4209004510295011, "learning_rate": 8.60588318361365e-06, "loss": 0.2153, "step": 1781 }, { "epoch": 0.26648721399730824, "grad_norm": 1.5238417321433448, "learning_rate": 8.604205037783404e-06, "loss": 0.1742, "step": 1782 }, { "epoch": 0.26663675788844027, "grad_norm": 1.2906114880614232, "learning_rate": 8.60252604637192e-06, "loss": 0.3304, "step": 1783 }, { "epoch": 0.2667863017795723, "grad_norm": 2.0207728584980726, "learning_rate": 8.600846209773107e-06, "loss": 0.7203, "step": 1784 }, { "epoch": 0.26693584567070433, "grad_norm": 1.0286990588537865, "learning_rate": 8.599165528381068e-06, "loss": 0.3377, "step": 1785 }, { "epoch": 0.2670853895618364, "grad_norm": 1.6105436646556477, "learning_rate": 8.597484002590113e-06, "loss": 0.3509, "step": 1786 }, { "epoch": 0.26723493345296845, "grad_norm": 1.655849264356554, "learning_rate": 8.595801632794738e-06, "loss": 0.3233, "step": 1787 }, { "epoch": 0.2673844773441005, "grad_norm": 1.411449436450616, "learning_rate": 8.594118419389648e-06, "loss": 0.2504, "step": 1788 }, { "epoch": 0.26753402123523257, "grad_norm": 1.3172823398923945, "learning_rate": 8.592434362769738e-06, "loss": 0.3616, "step": 1789 }, { "epoch": 0.2676835651263646, "grad_norm": 1.4726126876884136, "learning_rate": 8.590749463330105e-06, "loss": 0.1986, "step": 1790 }, { "epoch": 0.26783310901749663, "grad_norm": 1.7217144976129872, "learning_rate": 8.589063721466041e-06, "loss": 0.3317, "step": 1791 }, { "epoch": 0.26798265290862866, "grad_norm": 1.5253687117024657, "learning_rate": 8.58737713757304e-06, "loss": 0.1819, "step": 1792 }, { "epoch": 0.26813219679976075, "grad_norm": 1.0839801985857815, "learning_rate": 8.585689712046792e-06, "loss": 0.1985, "step": 1793 }, { "epoch": 0.2682817406908928, "grad_norm": 1.4726576160050187, "learning_rate": 8.58400144528318e-06, "loss": 0.3811, "step": 1794 }, { "epoch": 0.2684312845820248, "grad_norm": 1.628036147711573, "learning_rate": 8.582312337678286e-06, "loss": 0.4639, "step": 1795 }, { "epoch": 0.2685808284731569, "grad_norm": 1.8952018740494194, "learning_rate": 8.580622389628395e-06, "loss": 0.3552, "step": 1796 }, { "epoch": 0.2687303723642889, "grad_norm": 2.001984029391798, "learning_rate": 8.578931601529983e-06, "loss": 0.3726, "step": 1797 }, { "epoch": 0.26887991625542096, "grad_norm": 1.2603899746687859, "learning_rate": 8.577239973779727e-06, "loss": 0.3207, "step": 1798 }, { "epoch": 0.269029460146553, "grad_norm": 1.4854613896182869, "learning_rate": 8.575547506774498e-06, "loss": 0.2007, "step": 1799 }, { "epoch": 0.2691790040376851, "grad_norm": 1.8906751578747407, "learning_rate": 8.573854200911365e-06, "loss": 0.6032, "step": 1800 }, { "epoch": 0.2693285479288171, "grad_norm": 1.9613152098406168, "learning_rate": 8.572160056587592e-06, "loss": 0.3775, "step": 1801 }, { "epoch": 0.26947809181994914, "grad_norm": 0.9889564582361468, "learning_rate": 8.570465074200645e-06, "loss": 0.1862, "step": 1802 }, { "epoch": 0.2696276357110812, "grad_norm": 2.0931504871597015, "learning_rate": 8.568769254148182e-06, "loss": 0.5499, "step": 1803 }, { "epoch": 0.26977717960221326, "grad_norm": 1.5128120195103958, "learning_rate": 8.56707259682806e-06, "loss": 0.3961, "step": 1804 }, { "epoch": 0.2699267234933453, "grad_norm": 1.4113868405534056, "learning_rate": 8.565375102638327e-06, "loss": 0.332, "step": 1805 }, { "epoch": 0.2700762673844773, "grad_norm": 1.9594938693881467, "learning_rate": 8.563676771977234e-06, "loss": 0.5145, "step": 1806 }, { "epoch": 0.2702258112756094, "grad_norm": 1.7464884113562893, "learning_rate": 8.561977605243228e-06, "loss": 0.3668, "step": 1807 }, { "epoch": 0.27037535516674144, "grad_norm": 1.1626732860512075, "learning_rate": 8.560277602834945e-06, "loss": 0.168, "step": 1808 }, { "epoch": 0.27052489905787347, "grad_norm": 1.6748170212716917, "learning_rate": 8.558576765151227e-06, "loss": 0.2433, "step": 1809 }, { "epoch": 0.27067444294900556, "grad_norm": 2.025570121175929, "learning_rate": 8.556875092591104e-06, "loss": 0.2798, "step": 1810 }, { "epoch": 0.2708239868401376, "grad_norm": 2.374954242014533, "learning_rate": 8.555172585553804e-06, "loss": 0.4171, "step": 1811 }, { "epoch": 0.2709735307312696, "grad_norm": 1.7745787943498124, "learning_rate": 8.553469244438754e-06, "loss": 0.3595, "step": 1812 }, { "epoch": 0.27112307462240165, "grad_norm": 2.431602477728119, "learning_rate": 8.551765069645574e-06, "loss": 0.472, "step": 1813 }, { "epoch": 0.27127261851353374, "grad_norm": 1.5428274790156662, "learning_rate": 8.55006006157408e-06, "loss": 0.3913, "step": 1814 }, { "epoch": 0.27142216240466577, "grad_norm": 1.4868080151451115, "learning_rate": 8.548354220624278e-06, "loss": 0.2358, "step": 1815 }, { "epoch": 0.2715717062957978, "grad_norm": 1.5637252724169068, "learning_rate": 8.546647547196383e-06, "loss": 0.1934, "step": 1816 }, { "epoch": 0.2717212501869299, "grad_norm": 1.4784784501314225, "learning_rate": 8.544940041690792e-06, "loss": 0.5315, "step": 1817 }, { "epoch": 0.2718707940780619, "grad_norm": 1.6049147134883024, "learning_rate": 8.543231704508102e-06, "loss": 0.2277, "step": 1818 }, { "epoch": 0.27202033796919395, "grad_norm": 1.1639185886904326, "learning_rate": 8.541522536049108e-06, "loss": 0.2021, "step": 1819 }, { "epoch": 0.272169881860326, "grad_norm": 1.5080421265491435, "learning_rate": 8.539812536714796e-06, "loss": 0.1978, "step": 1820 }, { "epoch": 0.27231942575145807, "grad_norm": 0.880232094927403, "learning_rate": 8.538101706906347e-06, "loss": 0.1639, "step": 1821 }, { "epoch": 0.2724689696425901, "grad_norm": 1.4359482262535581, "learning_rate": 8.536390047025143e-06, "loss": 0.2078, "step": 1822 }, { "epoch": 0.27261851353372213, "grad_norm": 1.3134455382926835, "learning_rate": 8.53467755747275e-06, "loss": 0.3189, "step": 1823 }, { "epoch": 0.2727680574248542, "grad_norm": 1.4404618987021778, "learning_rate": 8.532964238650938e-06, "loss": 0.3408, "step": 1824 }, { "epoch": 0.27291760131598625, "grad_norm": 1.3234925937557052, "learning_rate": 8.531250090961666e-06, "loss": 0.2465, "step": 1825 }, { "epoch": 0.2730671452071183, "grad_norm": 1.678127185102635, "learning_rate": 8.52953511480709e-06, "loss": 0.5327, "step": 1826 }, { "epoch": 0.2732166890982503, "grad_norm": 1.4928358926152048, "learning_rate": 8.527819310589564e-06, "loss": 0.3706, "step": 1827 }, { "epoch": 0.2733662329893824, "grad_norm": 1.7549470816483508, "learning_rate": 8.526102678711626e-06, "loss": 0.4288, "step": 1828 }, { "epoch": 0.2735157768805144, "grad_norm": 1.5327378751610867, "learning_rate": 8.524385219576019e-06, "loss": 0.4647, "step": 1829 }, { "epoch": 0.27366532077164646, "grad_norm": 1.7197750676087826, "learning_rate": 8.522666933585672e-06, "loss": 0.3523, "step": 1830 }, { "epoch": 0.27381486466277855, "grad_norm": 1.174817852571183, "learning_rate": 8.520947821143714e-06, "loss": 0.2134, "step": 1831 }, { "epoch": 0.2739644085539106, "grad_norm": 2.0070860843733302, "learning_rate": 8.519227882653465e-06, "loss": 0.4656, "step": 1832 }, { "epoch": 0.2741139524450426, "grad_norm": 1.7275106234713773, "learning_rate": 8.517507118518436e-06, "loss": 0.5513, "step": 1833 }, { "epoch": 0.2742634963361747, "grad_norm": 1.2685207625970671, "learning_rate": 8.515785529142339e-06, "loss": 0.2491, "step": 1834 }, { "epoch": 0.2744130402273067, "grad_norm": 1.2419112651105808, "learning_rate": 8.514063114929077e-06, "loss": 0.1897, "step": 1835 }, { "epoch": 0.27456258411843876, "grad_norm": 1.1605015571551742, "learning_rate": 8.512339876282738e-06, "loss": 0.2282, "step": 1836 }, { "epoch": 0.2747121280095708, "grad_norm": 1.2889897008048643, "learning_rate": 8.510615813607617e-06, "loss": 0.3405, "step": 1837 }, { "epoch": 0.2748616719007029, "grad_norm": 1.8490938154917704, "learning_rate": 8.508890927308191e-06, "loss": 0.3821, "step": 1838 }, { "epoch": 0.2750112157918349, "grad_norm": 1.3689509544740683, "learning_rate": 8.507165217789137e-06, "loss": 0.3832, "step": 1839 }, { "epoch": 0.27516075968296694, "grad_norm": 1.6403361270537955, "learning_rate": 8.505438685455325e-06, "loss": 0.3848, "step": 1840 }, { "epoch": 0.275310303574099, "grad_norm": 1.8182842779990729, "learning_rate": 8.503711330711813e-06, "loss": 0.2549, "step": 1841 }, { "epoch": 0.27545984746523106, "grad_norm": 1.7148274136332127, "learning_rate": 8.501983153963855e-06, "loss": 0.5105, "step": 1842 }, { "epoch": 0.2756093913563631, "grad_norm": 1.537038227068178, "learning_rate": 8.500254155616901e-06, "loss": 0.3514, "step": 1843 }, { "epoch": 0.2757589352474951, "grad_norm": 1.641983169416554, "learning_rate": 8.498524336076587e-06, "loss": 0.6491, "step": 1844 }, { "epoch": 0.2759084791386272, "grad_norm": 1.250729132131747, "learning_rate": 8.49679369574875e-06, "loss": 0.2051, "step": 1845 }, { "epoch": 0.27605802302975924, "grad_norm": 1.579592841261978, "learning_rate": 8.49506223503941e-06, "loss": 0.3549, "step": 1846 }, { "epoch": 0.27620756692089127, "grad_norm": 1.1166297247129173, "learning_rate": 8.493329954354788e-06, "loss": 0.2016, "step": 1847 }, { "epoch": 0.27635711081202335, "grad_norm": 1.457594575978762, "learning_rate": 8.491596854101292e-06, "loss": 0.3752, "step": 1848 }, { "epoch": 0.2765066547031554, "grad_norm": 1.6194515688957392, "learning_rate": 8.489862934685523e-06, "loss": 0.2791, "step": 1849 }, { "epoch": 0.2766561985942874, "grad_norm": 1.189645094014293, "learning_rate": 8.488128196514279e-06, "loss": 0.1892, "step": 1850 }, { "epoch": 0.27680574248541945, "grad_norm": 1.6414546776027312, "learning_rate": 8.486392639994541e-06, "loss": 0.4263, "step": 1851 }, { "epoch": 0.27695528637655153, "grad_norm": 1.1030338687374446, "learning_rate": 8.48465626553349e-06, "loss": 0.2174, "step": 1852 }, { "epoch": 0.27710483026768357, "grad_norm": 2.062987335024284, "learning_rate": 8.482919073538498e-06, "loss": 0.5137, "step": 1853 }, { "epoch": 0.2772543741588156, "grad_norm": 1.9265333026489615, "learning_rate": 8.481181064417124e-06, "loss": 0.5053, "step": 1854 }, { "epoch": 0.2774039180499477, "grad_norm": 1.2857660275288423, "learning_rate": 8.479442238577123e-06, "loss": 0.2356, "step": 1855 }, { "epoch": 0.2775534619410797, "grad_norm": 1.4308427978938358, "learning_rate": 8.477702596426441e-06, "loss": 0.3124, "step": 1856 }, { "epoch": 0.27770300583221175, "grad_norm": 2.112210621284666, "learning_rate": 8.475962138373212e-06, "loss": 0.5583, "step": 1857 }, { "epoch": 0.2778525497233438, "grad_norm": 1.5648504781268902, "learning_rate": 8.474220864825768e-06, "loss": 0.2046, "step": 1858 }, { "epoch": 0.27800209361447586, "grad_norm": 1.0861658989898926, "learning_rate": 8.472478776192624e-06, "loss": 0.1941, "step": 1859 }, { "epoch": 0.2781516375056079, "grad_norm": 1.7593139339710724, "learning_rate": 8.470735872882498e-06, "loss": 0.3911, "step": 1860 }, { "epoch": 0.2783011813967399, "grad_norm": 1.9051060527282064, "learning_rate": 8.468992155304285e-06, "loss": 0.3441, "step": 1861 }, { "epoch": 0.278450725287872, "grad_norm": 1.2726234923585076, "learning_rate": 8.46724762386708e-06, "loss": 0.1966, "step": 1862 }, { "epoch": 0.27860026917900405, "grad_norm": 1.6275578962742665, "learning_rate": 8.465502278980168e-06, "loss": 0.2015, "step": 1863 }, { "epoch": 0.2787498130701361, "grad_norm": 1.1894457179498164, "learning_rate": 8.463756121053024e-06, "loss": 0.2064, "step": 1864 }, { "epoch": 0.2788993569612681, "grad_norm": 0.8970585928811048, "learning_rate": 8.462009150495311e-06, "loss": 0.2077, "step": 1865 }, { "epoch": 0.2790489008524002, "grad_norm": 1.4979876938013277, "learning_rate": 8.460261367716888e-06, "loss": 0.1868, "step": 1866 }, { "epoch": 0.2791984447435322, "grad_norm": 1.5952489152766243, "learning_rate": 8.458512773127801e-06, "loss": 0.3924, "step": 1867 }, { "epoch": 0.27934798863466426, "grad_norm": 1.3568222258261973, "learning_rate": 8.456763367138283e-06, "loss": 0.3057, "step": 1868 }, { "epoch": 0.27949753252579634, "grad_norm": 1.430069295266679, "learning_rate": 8.455013150158767e-06, "loss": 0.2027, "step": 1869 }, { "epoch": 0.2796470764169284, "grad_norm": 1.835468064313153, "learning_rate": 8.453262122599871e-06, "loss": 0.203, "step": 1870 }, { "epoch": 0.2797966203080604, "grad_norm": 1.16217124036999, "learning_rate": 8.451510284872397e-06, "loss": 0.2065, "step": 1871 }, { "epoch": 0.27994616419919244, "grad_norm": 1.2439706377831612, "learning_rate": 8.449757637387349e-06, "loss": 0.1854, "step": 1872 }, { "epoch": 0.2800957080903245, "grad_norm": 1.771165875925341, "learning_rate": 8.448004180555912e-06, "loss": 0.2118, "step": 1873 }, { "epoch": 0.28024525198145656, "grad_norm": 1.448237639694636, "learning_rate": 8.446249914789464e-06, "loss": 0.329, "step": 1874 }, { "epoch": 0.2803947958725886, "grad_norm": 1.9446859154931548, "learning_rate": 8.444494840499573e-06, "loss": 0.1968, "step": 1875 }, { "epoch": 0.2805443397637207, "grad_norm": 1.1282582980944778, "learning_rate": 8.442738958097998e-06, "loss": 0.1929, "step": 1876 }, { "epoch": 0.2806938836548527, "grad_norm": 0.9295466099898506, "learning_rate": 8.440982267996683e-06, "loss": 0.176, "step": 1877 }, { "epoch": 0.28084342754598474, "grad_norm": 1.6067454611327383, "learning_rate": 8.439224770607768e-06, "loss": 0.2216, "step": 1878 }, { "epoch": 0.28099297143711677, "grad_norm": 1.506148146117863, "learning_rate": 8.437466466343573e-06, "loss": 0.3125, "step": 1879 }, { "epoch": 0.28114251532824885, "grad_norm": 1.1300992074338925, "learning_rate": 8.43570735561662e-06, "loss": 0.2173, "step": 1880 }, { "epoch": 0.2812920592193809, "grad_norm": 1.0011833423935625, "learning_rate": 8.43394743883961e-06, "loss": 0.2908, "step": 1881 }, { "epoch": 0.2814416031105129, "grad_norm": 1.8315268271499454, "learning_rate": 8.432186716425438e-06, "loss": 0.3737, "step": 1882 }, { "epoch": 0.281591147001645, "grad_norm": 1.7995305921177922, "learning_rate": 8.430425188787181e-06, "loss": 0.3303, "step": 1883 }, { "epoch": 0.28174069089277703, "grad_norm": 1.2710141324651125, "learning_rate": 8.428662856338116e-06, "loss": 0.2441, "step": 1884 }, { "epoch": 0.28189023478390907, "grad_norm": 1.3946588107946323, "learning_rate": 8.426899719491703e-06, "loss": 0.2108, "step": 1885 }, { "epoch": 0.2820397786750411, "grad_norm": 1.1149474892337983, "learning_rate": 8.42513577866159e-06, "loss": 0.2215, "step": 1886 }, { "epoch": 0.2821893225661732, "grad_norm": 2.008735822937365, "learning_rate": 8.423371034261612e-06, "loss": 0.4968, "step": 1887 }, { "epoch": 0.2823388664573052, "grad_norm": 1.935748092708562, "learning_rate": 8.4216054867058e-06, "loss": 0.1889, "step": 1888 }, { "epoch": 0.28248841034843725, "grad_norm": 1.7520452919556104, "learning_rate": 8.419839136408361e-06, "loss": 0.3447, "step": 1889 }, { "epoch": 0.28263795423956933, "grad_norm": 0.8268783166552869, "learning_rate": 8.418071983783707e-06, "loss": 0.2028, "step": 1890 }, { "epoch": 0.28278749813070136, "grad_norm": 1.2060134973326222, "learning_rate": 8.416304029246422e-06, "loss": 0.2033, "step": 1891 }, { "epoch": 0.2829370420218334, "grad_norm": 1.081395124056579, "learning_rate": 8.414535273211286e-06, "loss": 0.2197, "step": 1892 }, { "epoch": 0.2830865859129655, "grad_norm": 1.6424700307942506, "learning_rate": 8.412765716093273e-06, "loss": 0.3939, "step": 1893 }, { "epoch": 0.2832361298040975, "grad_norm": 1.3786288095772328, "learning_rate": 8.410995358307528e-06, "loss": 0.187, "step": 1894 }, { "epoch": 0.28338567369522955, "grad_norm": 1.1978351240462277, "learning_rate": 8.4092242002694e-06, "loss": 0.2215, "step": 1895 }, { "epoch": 0.2835352175863616, "grad_norm": 1.2274171257303508, "learning_rate": 8.40745224239442e-06, "loss": 0.2047, "step": 1896 }, { "epoch": 0.28368476147749366, "grad_norm": 1.5902928460844958, "learning_rate": 8.405679485098304e-06, "loss": 0.3429, "step": 1897 }, { "epoch": 0.2838343053686257, "grad_norm": 1.8154747327907406, "learning_rate": 8.403905928796961e-06, "loss": 0.4344, "step": 1898 }, { "epoch": 0.2839838492597577, "grad_norm": 1.626710770634783, "learning_rate": 8.402131573906479e-06, "loss": 0.4395, "step": 1899 }, { "epoch": 0.2841333931508898, "grad_norm": 1.2949990843572656, "learning_rate": 8.400356420843144e-06, "loss": 0.2326, "step": 1900 }, { "epoch": 0.28428293704202184, "grad_norm": 1.5453342232271752, "learning_rate": 8.398580470023422e-06, "loss": 0.3238, "step": 1901 }, { "epoch": 0.2844324809331539, "grad_norm": 1.4286541357110913, "learning_rate": 8.396803721863969e-06, "loss": 0.2427, "step": 1902 }, { "epoch": 0.2845820248242859, "grad_norm": 1.830298456304548, "learning_rate": 8.395026176781627e-06, "loss": 0.3781, "step": 1903 }, { "epoch": 0.284731568715418, "grad_norm": 0.95541693717727, "learning_rate": 8.393247835193424e-06, "loss": 0.2069, "step": 1904 }, { "epoch": 0.28488111260655, "grad_norm": 1.4752609480755603, "learning_rate": 8.391468697516575e-06, "loss": 0.2127, "step": 1905 }, { "epoch": 0.28503065649768206, "grad_norm": 1.4768738670643429, "learning_rate": 8.389688764168487e-06, "loss": 0.3605, "step": 1906 }, { "epoch": 0.28518020038881414, "grad_norm": 1.2979854503198742, "learning_rate": 8.387908035566747e-06, "loss": 0.1915, "step": 1907 }, { "epoch": 0.2853297442799462, "grad_norm": 0.9902824337524958, "learning_rate": 8.38612651212913e-06, "loss": 0.2028, "step": 1908 }, { "epoch": 0.2854792881710782, "grad_norm": 1.6167134405609127, "learning_rate": 8.384344194273602e-06, "loss": 0.2436, "step": 1909 }, { "epoch": 0.28562883206221024, "grad_norm": 1.901370964967397, "learning_rate": 8.38256108241831e-06, "loss": 0.5823, "step": 1910 }, { "epoch": 0.2857783759533423, "grad_norm": 1.8663326764685, "learning_rate": 8.380777176981586e-06, "loss": 0.296, "step": 1911 }, { "epoch": 0.28592791984447435, "grad_norm": 2.010032346949138, "learning_rate": 8.378992478381957e-06, "loss": 0.3593, "step": 1912 }, { "epoch": 0.2860774637356064, "grad_norm": 1.4476840280584171, "learning_rate": 8.377206987038128e-06, "loss": 0.2342, "step": 1913 }, { "epoch": 0.28622700762673847, "grad_norm": 1.5594572863531158, "learning_rate": 8.375420703368993e-06, "loss": 0.3667, "step": 1914 }, { "epoch": 0.2863765515178705, "grad_norm": 1.6185881957264394, "learning_rate": 8.37363362779363e-06, "loss": 0.3414, "step": 1915 }, { "epoch": 0.28652609540900253, "grad_norm": 1.6321798860422365, "learning_rate": 8.371845760731305e-06, "loss": 0.4336, "step": 1916 }, { "epoch": 0.28667563930013457, "grad_norm": 1.391635511553069, "learning_rate": 8.370057102601467e-06, "loss": 0.2641, "step": 1917 }, { "epoch": 0.28682518319126665, "grad_norm": 1.6939840429561068, "learning_rate": 8.368267653823758e-06, "loss": 0.3579, "step": 1918 }, { "epoch": 0.2869747270823987, "grad_norm": 1.6529503491198996, "learning_rate": 8.366477414817993e-06, "loss": 0.3893, "step": 1919 }, { "epoch": 0.2871242709735307, "grad_norm": 1.750622438533098, "learning_rate": 8.364686386004184e-06, "loss": 0.3065, "step": 1920 }, { "epoch": 0.2872738148646628, "grad_norm": 1.5641156236161282, "learning_rate": 8.362894567802522e-06, "loss": 0.1891, "step": 1921 }, { "epoch": 0.28742335875579483, "grad_norm": 1.3418086188856035, "learning_rate": 8.361101960633384e-06, "loss": 0.2246, "step": 1922 }, { "epoch": 0.28757290264692686, "grad_norm": 1.5465668819355392, "learning_rate": 8.359308564917335e-06, "loss": 0.2342, "step": 1923 }, { "epoch": 0.2877224465380589, "grad_norm": 1.6114344336379887, "learning_rate": 8.357514381075123e-06, "loss": 0.1817, "step": 1924 }, { "epoch": 0.287871990429191, "grad_norm": 1.3668429785293057, "learning_rate": 8.355719409527676e-06, "loss": 0.3416, "step": 1925 }, { "epoch": 0.288021534320323, "grad_norm": 1.0266243878235821, "learning_rate": 8.353923650696119e-06, "loss": 0.1685, "step": 1926 }, { "epoch": 0.28817107821145505, "grad_norm": 1.7588874793595086, "learning_rate": 8.352127105001748e-06, "loss": 0.3868, "step": 1927 }, { "epoch": 0.28832062210258713, "grad_norm": 1.8522510535939594, "learning_rate": 8.350329772866054e-06, "loss": 0.2917, "step": 1928 }, { "epoch": 0.28847016599371916, "grad_norm": 1.7432437696852978, "learning_rate": 8.348531654710706e-06, "loss": 0.2314, "step": 1929 }, { "epoch": 0.2886197098848512, "grad_norm": 1.8048485192800683, "learning_rate": 8.34673275095756e-06, "loss": 0.421, "step": 1930 }, { "epoch": 0.2887692537759832, "grad_norm": 1.254687110691794, "learning_rate": 8.344933062028659e-06, "loss": 0.3086, "step": 1931 }, { "epoch": 0.2889187976671153, "grad_norm": 1.5516562911782632, "learning_rate": 8.343132588346223e-06, "loss": 0.243, "step": 1932 }, { "epoch": 0.28906834155824734, "grad_norm": 1.5234323648219494, "learning_rate": 8.341331330332665e-06, "loss": 0.3708, "step": 1933 }, { "epoch": 0.2892178854493794, "grad_norm": 1.6869191874454617, "learning_rate": 8.339529288410575e-06, "loss": 0.3664, "step": 1934 }, { "epoch": 0.28936742934051146, "grad_norm": 1.6174375460534882, "learning_rate": 8.337726463002728e-06, "loss": 0.3548, "step": 1935 }, { "epoch": 0.2895169732316435, "grad_norm": 1.3712125139885123, "learning_rate": 8.335922854532087e-06, "loss": 0.258, "step": 1936 }, { "epoch": 0.2896665171227755, "grad_norm": 1.782853458880192, "learning_rate": 8.334118463421795e-06, "loss": 0.549, "step": 1937 }, { "epoch": 0.28981606101390756, "grad_norm": 1.620144816801684, "learning_rate": 8.33231329009518e-06, "loss": 0.3349, "step": 1938 }, { "epoch": 0.28996560490503964, "grad_norm": 1.333232350703901, "learning_rate": 8.33050733497575e-06, "loss": 0.2262, "step": 1939 }, { "epoch": 0.2901151487961717, "grad_norm": 1.6837289583297483, "learning_rate": 8.328700598487203e-06, "loss": 0.4787, "step": 1940 }, { "epoch": 0.2902646926873037, "grad_norm": 1.0591641715642133, "learning_rate": 8.326893081053417e-06, "loss": 0.2385, "step": 1941 }, { "epoch": 0.2904142365784358, "grad_norm": 1.129840571438994, "learning_rate": 8.325084783098452e-06, "loss": 0.247, "step": 1942 }, { "epoch": 0.2905637804695678, "grad_norm": 1.432313552747029, "learning_rate": 8.32327570504655e-06, "loss": 0.191, "step": 1943 }, { "epoch": 0.29071332436069985, "grad_norm": 1.5356970591574979, "learning_rate": 8.32146584732214e-06, "loss": 0.187, "step": 1944 }, { "epoch": 0.2908628682518319, "grad_norm": 1.0595724079445366, "learning_rate": 8.319655210349832e-06, "loss": 0.1802, "step": 1945 }, { "epoch": 0.29101241214296397, "grad_norm": 1.6123333459960774, "learning_rate": 8.31784379455442e-06, "loss": 0.2293, "step": 1946 }, { "epoch": 0.291161956034096, "grad_norm": 1.3341178548001948, "learning_rate": 8.31603160036088e-06, "loss": 0.2359, "step": 1947 }, { "epoch": 0.29131149992522803, "grad_norm": 1.2376522352882886, "learning_rate": 8.314218628194365e-06, "loss": 0.1896, "step": 1948 }, { "epoch": 0.2914610438163601, "grad_norm": 1.679702490362655, "learning_rate": 8.312404878480222e-06, "loss": 0.3223, "step": 1949 }, { "epoch": 0.29161058770749215, "grad_norm": 1.2597407375509204, "learning_rate": 8.31059035164397e-06, "loss": 0.2175, "step": 1950 }, { "epoch": 0.2917601315986242, "grad_norm": 1.624231610744949, "learning_rate": 8.308775048111318e-06, "loss": 0.3432, "step": 1951 }, { "epoch": 0.29190967548975627, "grad_norm": 1.9330093174874856, "learning_rate": 8.306958968308152e-06, "loss": 0.2494, "step": 1952 }, { "epoch": 0.2920592193808883, "grad_norm": 1.7103715792411738, "learning_rate": 8.30514211266054e-06, "loss": 0.3095, "step": 1953 }, { "epoch": 0.29220876327202033, "grad_norm": 1.27605011963153, "learning_rate": 8.303324481594737e-06, "loss": 0.2158, "step": 1954 }, { "epoch": 0.29235830716315236, "grad_norm": 1.2814850533429023, "learning_rate": 8.301506075537173e-06, "loss": 0.2314, "step": 1955 }, { "epoch": 0.29250785105428445, "grad_norm": 1.954217300152197, "learning_rate": 8.299686894914467e-06, "loss": 0.567, "step": 1956 }, { "epoch": 0.2926573949454165, "grad_norm": 0.9650685264754246, "learning_rate": 8.297866940153416e-06, "loss": 0.2588, "step": 1957 }, { "epoch": 0.2928069388365485, "grad_norm": 1.5166783973503064, "learning_rate": 8.296046211681e-06, "loss": 0.369, "step": 1958 }, { "epoch": 0.2929564827276806, "grad_norm": 1.7977899668272896, "learning_rate": 8.294224709924373e-06, "loss": 0.5176, "step": 1959 }, { "epoch": 0.29310602661881263, "grad_norm": 1.5915279488901846, "learning_rate": 8.292402435310883e-06, "loss": 0.3614, "step": 1960 }, { "epoch": 0.29325557050994466, "grad_norm": 1.8933979183970016, "learning_rate": 8.290579388268054e-06, "loss": 0.5366, "step": 1961 }, { "epoch": 0.2934051144010767, "grad_norm": 1.8812762399598735, "learning_rate": 8.288755569223586e-06, "loss": 0.5443, "step": 1962 }, { "epoch": 0.2935546582922088, "grad_norm": 1.6643902182805455, "learning_rate": 8.286930978605366e-06, "loss": 0.3375, "step": 1963 }, { "epoch": 0.2937042021833408, "grad_norm": 1.762005285741713, "learning_rate": 8.285105616841463e-06, "loss": 0.5123, "step": 1964 }, { "epoch": 0.29385374607447284, "grad_norm": 1.6916688314394261, "learning_rate": 8.283279484360119e-06, "loss": 0.4779, "step": 1965 }, { "epoch": 0.29400328996560493, "grad_norm": 1.0908816823199547, "learning_rate": 8.281452581589768e-06, "loss": 0.3536, "step": 1966 }, { "epoch": 0.29415283385673696, "grad_norm": 1.4434211599007758, "learning_rate": 8.279624908959018e-06, "loss": 0.1613, "step": 1967 }, { "epoch": 0.294302377747869, "grad_norm": 1.8196704493501261, "learning_rate": 8.277796466896657e-06, "loss": 0.4989, "step": 1968 }, { "epoch": 0.294451921639001, "grad_norm": 1.3656918969234806, "learning_rate": 8.275967255831655e-06, "loss": 0.1918, "step": 1969 }, { "epoch": 0.2946014655301331, "grad_norm": 1.4834473723024195, "learning_rate": 8.274137276193162e-06, "loss": 0.2105, "step": 1970 }, { "epoch": 0.29475100942126514, "grad_norm": 2.0922406506784745, "learning_rate": 8.272306528410511e-06, "loss": 0.3878, "step": 1971 }, { "epoch": 0.2949005533123972, "grad_norm": 1.6479894472086396, "learning_rate": 8.270475012913212e-06, "loss": 0.3405, "step": 1972 }, { "epoch": 0.29505009720352926, "grad_norm": 1.816314957488247, "learning_rate": 8.268642730130956e-06, "loss": 0.3493, "step": 1973 }, { "epoch": 0.2951996410946613, "grad_norm": 1.7886487407400398, "learning_rate": 8.266809680493615e-06, "loss": 0.4509, "step": 1974 }, { "epoch": 0.2953491849857933, "grad_norm": 1.4476130090504609, "learning_rate": 8.264975864431239e-06, "loss": 0.3609, "step": 1975 }, { "epoch": 0.29549872887692535, "grad_norm": 1.6461788405344846, "learning_rate": 8.263141282374058e-06, "loss": 0.3404, "step": 1976 }, { "epoch": 0.29564827276805744, "grad_norm": 1.5013235196228902, "learning_rate": 8.261305934752486e-06, "loss": 0.2106, "step": 1977 }, { "epoch": 0.29579781665918947, "grad_norm": 1.4062496662139496, "learning_rate": 8.259469821997111e-06, "loss": 0.3559, "step": 1978 }, { "epoch": 0.2959473605503215, "grad_norm": 1.1293811953986368, "learning_rate": 8.257632944538704e-06, "loss": 0.2078, "step": 1979 }, { "epoch": 0.2960969044414536, "grad_norm": 1.4191657864487062, "learning_rate": 8.255795302808212e-06, "loss": 0.3335, "step": 1980 }, { "epoch": 0.2962464483325856, "grad_norm": 1.2818891440117588, "learning_rate": 8.253956897236765e-06, "loss": 0.1667, "step": 1981 }, { "epoch": 0.29639599222371765, "grad_norm": 2.032954982306054, "learning_rate": 8.25211772825567e-06, "loss": 0.3959, "step": 1982 }, { "epoch": 0.2965455361148497, "grad_norm": 2.0441581298148606, "learning_rate": 8.250277796296412e-06, "loss": 0.5914, "step": 1983 }, { "epoch": 0.29669508000598177, "grad_norm": 1.0550678485707448, "learning_rate": 8.248437101790663e-06, "loss": 0.2187, "step": 1984 }, { "epoch": 0.2968446238971138, "grad_norm": 1.714428768910598, "learning_rate": 8.24659564517026e-06, "loss": 0.3521, "step": 1985 }, { "epoch": 0.29699416778824583, "grad_norm": 1.495931419848694, "learning_rate": 8.244753426867233e-06, "loss": 0.3141, "step": 1986 }, { "epoch": 0.2971437116793779, "grad_norm": 1.4595624143758643, "learning_rate": 8.24291044731378e-06, "loss": 0.207, "step": 1987 }, { "epoch": 0.29729325557050995, "grad_norm": 1.4893368784443541, "learning_rate": 8.241066706942282e-06, "loss": 0.2096, "step": 1988 }, { "epoch": 0.297442799461642, "grad_norm": 1.5298733994754299, "learning_rate": 8.239222206185303e-06, "loss": 0.2963, "step": 1989 }, { "epoch": 0.297592343352774, "grad_norm": 1.7110744142837584, "learning_rate": 8.237376945475573e-06, "loss": 0.4879, "step": 1990 }, { "epoch": 0.2977418872439061, "grad_norm": 1.3646893921129908, "learning_rate": 8.235530925246013e-06, "loss": 0.2199, "step": 1991 }, { "epoch": 0.29789143113503813, "grad_norm": 1.5519467713508963, "learning_rate": 8.233684145929714e-06, "loss": 0.1567, "step": 1992 }, { "epoch": 0.29804097502617016, "grad_norm": 1.2022227133419618, "learning_rate": 8.231836607959953e-06, "loss": 0.3048, "step": 1993 }, { "epoch": 0.29819051891730225, "grad_norm": 1.0528429237271446, "learning_rate": 8.229988311770176e-06, "loss": 0.2089, "step": 1994 }, { "epoch": 0.2983400628084343, "grad_norm": 2.0589676323841233, "learning_rate": 8.228139257794012e-06, "loss": 0.4729, "step": 1995 }, { "epoch": 0.2984896066995663, "grad_norm": 1.699439689917369, "learning_rate": 8.226289446465269e-06, "loss": 0.197, "step": 1996 }, { "epoch": 0.29863915059069834, "grad_norm": 2.1597718626316214, "learning_rate": 8.224438878217928e-06, "loss": 0.5298, "step": 1997 }, { "epoch": 0.29878869448183043, "grad_norm": 1.5026245898132045, "learning_rate": 8.22258755348615e-06, "loss": 0.4905, "step": 1998 }, { "epoch": 0.29893823837296246, "grad_norm": 1.5917682855765136, "learning_rate": 8.220735472704278e-06, "loss": 0.2223, "step": 1999 }, { "epoch": 0.2990877822640945, "grad_norm": 1.4766123425180093, "learning_rate": 8.218882636306823e-06, "loss": 0.2059, "step": 2000 }, { "epoch": 0.2992373261552266, "grad_norm": 1.0856069277789302, "learning_rate": 8.217029044728478e-06, "loss": 0.193, "step": 2001 }, { "epoch": 0.2993868700463586, "grad_norm": 1.2329171435469741, "learning_rate": 8.215174698404118e-06, "loss": 0.2372, "step": 2002 }, { "epoch": 0.29953641393749064, "grad_norm": 1.7452583821562881, "learning_rate": 8.213319597768785e-06, "loss": 0.2098, "step": 2003 }, { "epoch": 0.2996859578286227, "grad_norm": 1.0905531929281826, "learning_rate": 8.21146374325771e-06, "loss": 0.2187, "step": 2004 }, { "epoch": 0.29983550171975476, "grad_norm": 1.816609674773371, "learning_rate": 8.209607135306287e-06, "loss": 0.3709, "step": 2005 }, { "epoch": 0.2999850456108868, "grad_norm": 1.7738216295020472, "learning_rate": 8.2077497743501e-06, "loss": 0.4486, "step": 2006 }, { "epoch": 0.3001345895020188, "grad_norm": 1.1263320970303174, "learning_rate": 8.205891660824903e-06, "loss": 0.2091, "step": 2007 }, { "epoch": 0.3002841333931509, "grad_norm": 1.501664539486306, "learning_rate": 8.204032795166625e-06, "loss": 0.2009, "step": 2008 }, { "epoch": 0.30043367728428294, "grad_norm": 2.6199919066049335, "learning_rate": 8.202173177811374e-06, "loss": 0.203, "step": 2009 }, { "epoch": 0.30058322117541497, "grad_norm": 1.625937686489783, "learning_rate": 8.200312809195436e-06, "loss": 0.3831, "step": 2010 }, { "epoch": 0.30073276506654706, "grad_norm": 1.8974675970112806, "learning_rate": 8.198451689755269e-06, "loss": 0.5418, "step": 2011 }, { "epoch": 0.3008823089576791, "grad_norm": 1.8748455619787359, "learning_rate": 8.196589819927512e-06, "loss": 0.4425, "step": 2012 }, { "epoch": 0.3010318528488111, "grad_norm": 1.6094667931646591, "learning_rate": 8.194727200148978e-06, "loss": 0.3389, "step": 2013 }, { "epoch": 0.30118139673994315, "grad_norm": 0.9707433761295292, "learning_rate": 8.192863830856652e-06, "loss": 0.1763, "step": 2014 }, { "epoch": 0.30133094063107524, "grad_norm": 1.3903941059235092, "learning_rate": 8.1909997124877e-06, "loss": 0.2339, "step": 2015 }, { "epoch": 0.30148048452220727, "grad_norm": 1.3528795750135822, "learning_rate": 8.189134845479462e-06, "loss": 0.2193, "step": 2016 }, { "epoch": 0.3016300284133393, "grad_norm": 1.7397719431371887, "learning_rate": 8.187269230269458e-06, "loss": 0.4049, "step": 2017 }, { "epoch": 0.3017795723044714, "grad_norm": 1.626350301455843, "learning_rate": 8.185402867295373e-06, "loss": 0.2327, "step": 2018 }, { "epoch": 0.3019291161956034, "grad_norm": 1.3932636681559976, "learning_rate": 8.183535756995078e-06, "loss": 0.2575, "step": 2019 }, { "epoch": 0.30207866008673545, "grad_norm": 1.7626753702911266, "learning_rate": 8.181667899806613e-06, "loss": 0.4597, "step": 2020 }, { "epoch": 0.3022282039778675, "grad_norm": 1.684539378891143, "learning_rate": 8.179799296168194e-06, "loss": 0.4608, "step": 2021 }, { "epoch": 0.30237774786899957, "grad_norm": 1.0640465826224796, "learning_rate": 8.177929946518217e-06, "loss": 0.2221, "step": 2022 }, { "epoch": 0.3025272917601316, "grad_norm": 1.5147026539291037, "learning_rate": 8.176059851295248e-06, "loss": 0.2353, "step": 2023 }, { "epoch": 0.30267683565126363, "grad_norm": 1.7530304138656525, "learning_rate": 8.174189010938028e-06, "loss": 0.4141, "step": 2024 }, { "epoch": 0.3028263795423957, "grad_norm": 1.9415201656454513, "learning_rate": 8.172317425885477e-06, "loss": 0.5352, "step": 2025 }, { "epoch": 0.30297592343352775, "grad_norm": 1.9383678107900466, "learning_rate": 8.170445096576683e-06, "loss": 0.573, "step": 2026 }, { "epoch": 0.3031254673246598, "grad_norm": 1.803033239816454, "learning_rate": 8.168572023450915e-06, "loss": 0.5225, "step": 2027 }, { "epoch": 0.3032750112157918, "grad_norm": 2.1395850335411244, "learning_rate": 8.166698206947614e-06, "loss": 0.7007, "step": 2028 }, { "epoch": 0.3034245551069239, "grad_norm": 1.5110862268685226, "learning_rate": 8.164823647506394e-06, "loss": 0.2269, "step": 2029 }, { "epoch": 0.30357409899805593, "grad_norm": 1.6974467146899468, "learning_rate": 8.162948345567048e-06, "loss": 0.3921, "step": 2030 }, { "epoch": 0.30372364288918796, "grad_norm": 1.3936816440378674, "learning_rate": 8.161072301569536e-06, "loss": 0.2049, "step": 2031 }, { "epoch": 0.30387318678032005, "grad_norm": 1.4848287127408877, "learning_rate": 8.159195515953998e-06, "loss": 0.2062, "step": 2032 }, { "epoch": 0.3040227306714521, "grad_norm": 1.2105792130825146, "learning_rate": 8.157317989160746e-06, "loss": 0.1788, "step": 2033 }, { "epoch": 0.3041722745625841, "grad_norm": 1.3820320839391493, "learning_rate": 8.155439721630265e-06, "loss": 0.2331, "step": 2034 }, { "epoch": 0.30432181845371614, "grad_norm": 2.150262549914666, "learning_rate": 8.153560713803215e-06, "loss": 0.1875, "step": 2035 }, { "epoch": 0.30447136234484823, "grad_norm": 1.0906698589335555, "learning_rate": 8.15168096612043e-06, "loss": 0.2033, "step": 2036 }, { "epoch": 0.30462090623598026, "grad_norm": 1.065733777923844, "learning_rate": 8.149800479022917e-06, "loss": 0.2001, "step": 2037 }, { "epoch": 0.3047704501271123, "grad_norm": 1.6644796466482528, "learning_rate": 8.147919252951855e-06, "loss": 0.5517, "step": 2038 }, { "epoch": 0.3049199940182444, "grad_norm": 1.5874527096275228, "learning_rate": 8.146037288348598e-06, "loss": 0.3491, "step": 2039 }, { "epoch": 0.3050695379093764, "grad_norm": 2.1160201341828513, "learning_rate": 8.144154585654675e-06, "loss": 0.4023, "step": 2040 }, { "epoch": 0.30521908180050844, "grad_norm": 1.449019528908297, "learning_rate": 8.142271145311784e-06, "loss": 0.3529, "step": 2041 }, { "epoch": 0.30536862569164047, "grad_norm": 1.0741990035582996, "learning_rate": 8.1403869677618e-06, "loss": 0.2042, "step": 2042 }, { "epoch": 0.30551816958277256, "grad_norm": 1.6081149900134495, "learning_rate": 8.138502053446766e-06, "loss": 0.3908, "step": 2043 }, { "epoch": 0.3056677134739046, "grad_norm": 2.0690220211720716, "learning_rate": 8.136616402808906e-06, "loss": 0.5508, "step": 2044 }, { "epoch": 0.3058172573650366, "grad_norm": 1.6409484884961714, "learning_rate": 8.13473001629061e-06, "loss": 0.3161, "step": 2045 }, { "epoch": 0.3059668012561687, "grad_norm": 1.858300355296472, "learning_rate": 8.132842894334438e-06, "loss": 0.5614, "step": 2046 }, { "epoch": 0.30611634514730074, "grad_norm": 1.2324767485914987, "learning_rate": 8.130955037383132e-06, "loss": 0.2132, "step": 2047 }, { "epoch": 0.30626588903843277, "grad_norm": 1.7272643652056465, "learning_rate": 8.1290664458796e-06, "loss": 0.3132, "step": 2048 }, { "epoch": 0.3064154329295648, "grad_norm": 1.3637333594365966, "learning_rate": 8.127177120266926e-06, "loss": 0.1739, "step": 2049 }, { "epoch": 0.3065649768206969, "grad_norm": 1.538728380001006, "learning_rate": 8.12528706098836e-06, "loss": 0.4072, "step": 2050 }, { "epoch": 0.3067145207118289, "grad_norm": 1.4591751944118272, "learning_rate": 8.12339626848733e-06, "loss": 0.3859, "step": 2051 }, { "epoch": 0.30686406460296095, "grad_norm": 1.9028261579580017, "learning_rate": 8.121504743207436e-06, "loss": 0.4893, "step": 2052 }, { "epoch": 0.30701360849409304, "grad_norm": 1.7379512730839208, "learning_rate": 8.119612485592442e-06, "loss": 0.3564, "step": 2053 }, { "epoch": 0.30716315238522507, "grad_norm": 1.388968630495014, "learning_rate": 8.117719496086298e-06, "loss": 0.3179, "step": 2054 }, { "epoch": 0.3073126962763571, "grad_norm": 1.403056196180425, "learning_rate": 8.115825775133112e-06, "loss": 0.352, "step": 2055 }, { "epoch": 0.30746224016748913, "grad_norm": 1.508992427712604, "learning_rate": 8.113931323177171e-06, "loss": 0.4021, "step": 2056 }, { "epoch": 0.3076117840586212, "grad_norm": 2.1741360894355015, "learning_rate": 8.112036140662931e-06, "loss": 0.467, "step": 2057 }, { "epoch": 0.30776132794975325, "grad_norm": 1.6144568383492353, "learning_rate": 8.110140228035022e-06, "loss": 0.3924, "step": 2058 }, { "epoch": 0.3079108718408853, "grad_norm": 1.4893071701125404, "learning_rate": 8.10824358573824e-06, "loss": 0.5036, "step": 2059 }, { "epoch": 0.30806041573201737, "grad_norm": 1.4815595377852415, "learning_rate": 8.10634621421756e-06, "loss": 0.3972, "step": 2060 }, { "epoch": 0.3082099596231494, "grad_norm": 1.7353454529628423, "learning_rate": 8.104448113918118e-06, "loss": 0.2237, "step": 2061 }, { "epoch": 0.30835950351428143, "grad_norm": 1.0247854772257274, "learning_rate": 8.102549285285233e-06, "loss": 0.1595, "step": 2062 }, { "epoch": 0.30850904740541346, "grad_norm": 1.6101502986170517, "learning_rate": 8.100649728764382e-06, "loss": 0.519, "step": 2063 }, { "epoch": 0.30865859129654555, "grad_norm": 1.4785633507789944, "learning_rate": 8.098749444801226e-06, "loss": 0.3757, "step": 2064 }, { "epoch": 0.3088081351876776, "grad_norm": 1.1653508922546973, "learning_rate": 8.096848433841585e-06, "loss": 0.1843, "step": 2065 }, { "epoch": 0.3089576790788096, "grad_norm": 2.23989146500782, "learning_rate": 8.094946696331454e-06, "loss": 0.5717, "step": 2066 }, { "epoch": 0.3091072229699417, "grad_norm": 1.4086002710437948, "learning_rate": 8.093044232717004e-06, "loss": 0.3305, "step": 2067 }, { "epoch": 0.30925676686107373, "grad_norm": 1.3532349541812738, "learning_rate": 8.091141043444565e-06, "loss": 0.2031, "step": 2068 }, { "epoch": 0.30940631075220576, "grad_norm": 1.5659716733424975, "learning_rate": 8.08923712896065e-06, "loss": 0.264, "step": 2069 }, { "epoch": 0.30955585464333785, "grad_norm": 1.188194674034748, "learning_rate": 8.087332489711931e-06, "loss": 0.2084, "step": 2070 }, { "epoch": 0.3097053985344699, "grad_norm": 1.3639153806477138, "learning_rate": 8.085427126145255e-06, "loss": 0.2999, "step": 2071 }, { "epoch": 0.3098549424256019, "grad_norm": 1.743026958811897, "learning_rate": 8.083521038707643e-06, "loss": 0.5313, "step": 2072 }, { "epoch": 0.31000448631673394, "grad_norm": 1.1747827948023122, "learning_rate": 8.081614227846275e-06, "loss": 0.1753, "step": 2073 }, { "epoch": 0.310154030207866, "grad_norm": 1.1941064963550336, "learning_rate": 8.079706694008512e-06, "loss": 0.2424, "step": 2074 }, { "epoch": 0.31030357409899806, "grad_norm": 1.7186546887175254, "learning_rate": 8.077798437641878e-06, "loss": 0.2263, "step": 2075 }, { "epoch": 0.3104531179901301, "grad_norm": 1.0317324156623773, "learning_rate": 8.075889459194069e-06, "loss": 0.1556, "step": 2076 }, { "epoch": 0.3106026618812622, "grad_norm": 1.5907870570782063, "learning_rate": 8.073979759112949e-06, "loss": 0.2729, "step": 2077 }, { "epoch": 0.3107522057723942, "grad_norm": 1.8674735165445506, "learning_rate": 8.072069337846554e-06, "loss": 0.3821, "step": 2078 }, { "epoch": 0.31090174966352624, "grad_norm": 1.983553521137882, "learning_rate": 8.070158195843084e-06, "loss": 0.6787, "step": 2079 }, { "epoch": 0.31105129355465827, "grad_norm": 1.0817414697908496, "learning_rate": 8.068246333550913e-06, "loss": 0.1601, "step": 2080 }, { "epoch": 0.31120083744579036, "grad_norm": 1.4614119919401352, "learning_rate": 8.066333751418582e-06, "loss": 0.2074, "step": 2081 }, { "epoch": 0.3113503813369224, "grad_norm": 1.2801507618992622, "learning_rate": 8.064420449894802e-06, "loss": 0.2384, "step": 2082 }, { "epoch": 0.3114999252280544, "grad_norm": 1.856756460170748, "learning_rate": 8.062506429428451e-06, "loss": 0.3569, "step": 2083 }, { "epoch": 0.3116494691191865, "grad_norm": 1.5574561714489221, "learning_rate": 8.060591690468579e-06, "loss": 0.2001, "step": 2084 }, { "epoch": 0.31179901301031854, "grad_norm": 0.8139823838456822, "learning_rate": 8.0586762334644e-06, "loss": 0.1493, "step": 2085 }, { "epoch": 0.31194855690145057, "grad_norm": 1.8147690398413907, "learning_rate": 8.056760058865298e-06, "loss": 0.3726, "step": 2086 }, { "epoch": 0.3120981007925826, "grad_norm": 1.3551230470716993, "learning_rate": 8.054843167120827e-06, "loss": 0.3161, "step": 2087 }, { "epoch": 0.3122476446837147, "grad_norm": 1.423454170150374, "learning_rate": 8.052925558680708e-06, "loss": 0.3286, "step": 2088 }, { "epoch": 0.3123971885748467, "grad_norm": 1.1629738020950542, "learning_rate": 8.051007233994833e-06, "loss": 0.2128, "step": 2089 }, { "epoch": 0.31254673246597875, "grad_norm": 1.0328139570935773, "learning_rate": 8.049088193513257e-06, "loss": 0.2028, "step": 2090 }, { "epoch": 0.31269627635711084, "grad_norm": 0.9561546972801208, "learning_rate": 8.047168437686204e-06, "loss": 0.1901, "step": 2091 }, { "epoch": 0.31284582024824287, "grad_norm": 1.5085687791327238, "learning_rate": 8.045247966964069e-06, "loss": 0.3978, "step": 2092 }, { "epoch": 0.3129953641393749, "grad_norm": 1.7990952747958247, "learning_rate": 8.043326781797413e-06, "loss": 0.4526, "step": 2093 }, { "epoch": 0.31314490803050693, "grad_norm": 1.6047231005366211, "learning_rate": 8.041404882636964e-06, "loss": 0.3136, "step": 2094 }, { "epoch": 0.313294451921639, "grad_norm": 1.0885180525142448, "learning_rate": 8.039482269933619e-06, "loss": 0.1868, "step": 2095 }, { "epoch": 0.31344399581277105, "grad_norm": 1.6731122777438763, "learning_rate": 8.03755894413844e-06, "loss": 0.253, "step": 2096 }, { "epoch": 0.3135935397039031, "grad_norm": 1.7388060302674344, "learning_rate": 8.03563490570266e-06, "loss": 0.3781, "step": 2097 }, { "epoch": 0.31374308359503517, "grad_norm": 1.3948567328495716, "learning_rate": 8.033710155077675e-06, "loss": 0.3065, "step": 2098 }, { "epoch": 0.3138926274861672, "grad_norm": 1.7899046169473158, "learning_rate": 8.03178469271505e-06, "loss": 0.4497, "step": 2099 }, { "epoch": 0.31404217137729923, "grad_norm": 1.1376056341100615, "learning_rate": 8.029858519066519e-06, "loss": 0.1793, "step": 2100 }, { "epoch": 0.31419171526843126, "grad_norm": 1.4405559368087475, "learning_rate": 8.027931634583978e-06, "loss": 0.1984, "step": 2101 }, { "epoch": 0.31434125915956335, "grad_norm": 1.7393459445529194, "learning_rate": 8.026004039719494e-06, "loss": 0.4943, "step": 2102 }, { "epoch": 0.3144908030506954, "grad_norm": 1.6507941300141975, "learning_rate": 8.024075734925302e-06, "loss": 0.3702, "step": 2103 }, { "epoch": 0.3146403469418274, "grad_norm": 1.175812152126952, "learning_rate": 8.022146720653797e-06, "loss": 0.2246, "step": 2104 }, { "epoch": 0.3147898908329595, "grad_norm": 1.505696425174867, "learning_rate": 8.020216997357547e-06, "loss": 0.3827, "step": 2105 }, { "epoch": 0.3149394347240915, "grad_norm": 1.5053350968315022, "learning_rate": 8.018286565489281e-06, "loss": 0.3602, "step": 2106 }, { "epoch": 0.31508897861522356, "grad_norm": 7.395352954217879, "learning_rate": 8.016355425501899e-06, "loss": 0.5292, "step": 2107 }, { "epoch": 0.3152385225063556, "grad_norm": 1.4124551721650014, "learning_rate": 8.014423577848465e-06, "loss": 0.3058, "step": 2108 }, { "epoch": 0.3153880663974877, "grad_norm": 1.7291348054164273, "learning_rate": 8.012491022982206e-06, "loss": 0.4039, "step": 2109 }, { "epoch": 0.3155376102886197, "grad_norm": 1.3762981669774148, "learning_rate": 8.010557761356523e-06, "loss": 0.3214, "step": 2110 }, { "epoch": 0.31568715417975174, "grad_norm": 1.9151602263688279, "learning_rate": 8.008623793424975e-06, "loss": 0.5339, "step": 2111 }, { "epoch": 0.3158366980708838, "grad_norm": 1.6710466712643997, "learning_rate": 8.006689119641289e-06, "loss": 0.3143, "step": 2112 }, { "epoch": 0.31598624196201586, "grad_norm": 1.2763038771322965, "learning_rate": 8.00475374045936e-06, "loss": 0.3106, "step": 2113 }, { "epoch": 0.3161357858531479, "grad_norm": 0.9519969897990209, "learning_rate": 8.002817656333246e-06, "loss": 0.1709, "step": 2114 }, { "epoch": 0.3162853297442799, "grad_norm": 1.0941737648338323, "learning_rate": 8.000880867717168e-06, "loss": 0.3558, "step": 2115 }, { "epoch": 0.316434873635412, "grad_norm": 1.6804061150860001, "learning_rate": 7.99894337506552e-06, "loss": 0.3005, "step": 2116 }, { "epoch": 0.31658441752654404, "grad_norm": 1.1783654687222251, "learning_rate": 7.997005178832853e-06, "loss": 0.2406, "step": 2117 }, { "epoch": 0.31673396141767607, "grad_norm": 1.6348023119949275, "learning_rate": 7.99506627947389e-06, "loss": 0.2475, "step": 2118 }, { "epoch": 0.31688350530880816, "grad_norm": 1.4962501892290498, "learning_rate": 7.993126677443513e-06, "loss": 0.3126, "step": 2119 }, { "epoch": 0.3170330491999402, "grad_norm": 1.1269074931566945, "learning_rate": 7.991186373196771e-06, "loss": 0.2101, "step": 2120 }, { "epoch": 0.3171825930910722, "grad_norm": 1.5451433092999418, "learning_rate": 7.989245367188877e-06, "loss": 0.4435, "step": 2121 }, { "epoch": 0.31733213698220425, "grad_norm": 1.6954504563414547, "learning_rate": 7.987303659875212e-06, "loss": 0.3777, "step": 2122 }, { "epoch": 0.31748168087333634, "grad_norm": 1.7046868376726214, "learning_rate": 7.98536125171132e-06, "loss": 0.375, "step": 2123 }, { "epoch": 0.31763122476446837, "grad_norm": 1.6583768126843783, "learning_rate": 7.983418143152906e-06, "loss": 0.1679, "step": 2124 }, { "epoch": 0.3177807686556004, "grad_norm": 1.652523256475839, "learning_rate": 7.981474334655845e-06, "loss": 0.3821, "step": 2125 }, { "epoch": 0.3179303125467325, "grad_norm": 1.6865422789510223, "learning_rate": 7.979529826676172e-06, "loss": 0.2312, "step": 2126 }, { "epoch": 0.3180798564378645, "grad_norm": 1.318155763232294, "learning_rate": 7.977584619670084e-06, "loss": 0.2417, "step": 2127 }, { "epoch": 0.31822940032899655, "grad_norm": 0.9387354141491452, "learning_rate": 7.97563871409395e-06, "loss": 0.2152, "step": 2128 }, { "epoch": 0.31837894422012863, "grad_norm": 1.4740638203723162, "learning_rate": 7.973692110404295e-06, "loss": 0.3663, "step": 2129 }, { "epoch": 0.31852848811126067, "grad_norm": 1.5147303848676803, "learning_rate": 7.971744809057815e-06, "loss": 0.2317, "step": 2130 }, { "epoch": 0.3186780320023927, "grad_norm": 1.3220893173215598, "learning_rate": 7.96979681051136e-06, "loss": 0.3673, "step": 2131 }, { "epoch": 0.31882757589352473, "grad_norm": 1.778728683314461, "learning_rate": 7.967848115221953e-06, "loss": 0.3838, "step": 2132 }, { "epoch": 0.3189771197846568, "grad_norm": 1.79666907747646, "learning_rate": 7.965898723646777e-06, "loss": 0.4668, "step": 2133 }, { "epoch": 0.31912666367578885, "grad_norm": 1.1363334854959442, "learning_rate": 7.963948636243175e-06, "loss": 0.1881, "step": 2134 }, { "epoch": 0.3192762075669209, "grad_norm": 1.2727017471208417, "learning_rate": 7.96199785346866e-06, "loss": 0.3583, "step": 2135 }, { "epoch": 0.31942575145805296, "grad_norm": 1.6879630689693983, "learning_rate": 7.960046375780903e-06, "loss": 0.528, "step": 2136 }, { "epoch": 0.319575295349185, "grad_norm": 1.441558984867543, "learning_rate": 7.958094203637738e-06, "loss": 0.3621, "step": 2137 }, { "epoch": 0.319724839240317, "grad_norm": 0.9760649090504697, "learning_rate": 7.956141337497166e-06, "loss": 0.1714, "step": 2138 }, { "epoch": 0.31987438313144906, "grad_norm": 0.9469336665426322, "learning_rate": 7.954187777817345e-06, "loss": 0.1795, "step": 2139 }, { "epoch": 0.32002392702258114, "grad_norm": 1.0476640767317869, "learning_rate": 7.952233525056603e-06, "loss": 0.1922, "step": 2140 }, { "epoch": 0.3201734709137132, "grad_norm": 1.254945459546166, "learning_rate": 7.950278579673422e-06, "loss": 0.2742, "step": 2141 }, { "epoch": 0.3203230148048452, "grad_norm": 0.7230488496853383, "learning_rate": 7.948322942126456e-06, "loss": 0.1742, "step": 2142 }, { "epoch": 0.3204725586959773, "grad_norm": 0.960182470952214, "learning_rate": 7.946366612874512e-06, "loss": 0.1851, "step": 2143 }, { "epoch": 0.3206221025871093, "grad_norm": 1.1966344295913398, "learning_rate": 7.944409592376565e-06, "loss": 0.1877, "step": 2144 }, { "epoch": 0.32077164647824136, "grad_norm": 1.9057275611187459, "learning_rate": 7.942451881091752e-06, "loss": 0.2618, "step": 2145 }, { "epoch": 0.3209211903693734, "grad_norm": 1.2274689028407764, "learning_rate": 7.94049347947937e-06, "loss": 0.1885, "step": 2146 }, { "epoch": 0.3210707342605055, "grad_norm": 1.678924591333489, "learning_rate": 7.93853438799888e-06, "loss": 0.2338, "step": 2147 }, { "epoch": 0.3212202781516375, "grad_norm": 1.7464181014190676, "learning_rate": 7.936574607109901e-06, "loss": 0.3676, "step": 2148 }, { "epoch": 0.32136982204276954, "grad_norm": 1.5398927642089624, "learning_rate": 7.934614137272218e-06, "loss": 0.2293, "step": 2149 }, { "epoch": 0.3215193659339016, "grad_norm": 1.5519843635324424, "learning_rate": 7.932652978945779e-06, "loss": 0.199, "step": 2150 }, { "epoch": 0.32166890982503366, "grad_norm": 1.1920599004222825, "learning_rate": 7.930691132590686e-06, "loss": 0.1696, "step": 2151 }, { "epoch": 0.3218184537161657, "grad_norm": 0.8145849226466982, "learning_rate": 7.928728598667211e-06, "loss": 0.1537, "step": 2152 }, { "epoch": 0.3219679976072977, "grad_norm": 1.3926444820307884, "learning_rate": 7.926765377635781e-06, "loss": 0.1999, "step": 2153 }, { "epoch": 0.3221175414984298, "grad_norm": 1.6412886866609875, "learning_rate": 7.924801469956986e-06, "loss": 0.2527, "step": 2154 }, { "epoch": 0.32226708538956184, "grad_norm": 1.5325273883442905, "learning_rate": 7.92283687609158e-06, "loss": 0.2634, "step": 2155 }, { "epoch": 0.32241662928069387, "grad_norm": 1.0316905287286056, "learning_rate": 7.920871596500473e-06, "loss": 0.174, "step": 2156 }, { "epoch": 0.32256617317182595, "grad_norm": 1.1349047741297784, "learning_rate": 7.91890563164474e-06, "loss": 0.1721, "step": 2157 }, { "epoch": 0.322715717062958, "grad_norm": 2.057460755429332, "learning_rate": 7.916938981985619e-06, "loss": 0.6121, "step": 2158 }, { "epoch": 0.32286526095409, "grad_norm": 1.236449628888766, "learning_rate": 7.914971647984494e-06, "loss": 0.1979, "step": 2159 }, { "epoch": 0.32301480484522205, "grad_norm": 1.952661192183289, "learning_rate": 7.913003630102934e-06, "loss": 0.4105, "step": 2160 }, { "epoch": 0.32316434873635413, "grad_norm": 1.6581077143849778, "learning_rate": 7.911034928802647e-06, "loss": 0.4989, "step": 2161 }, { "epoch": 0.32331389262748617, "grad_norm": 1.0964604376832452, "learning_rate": 7.909065544545511e-06, "loss": 0.2086, "step": 2162 }, { "epoch": 0.3234634365186182, "grad_norm": 1.4794895335762217, "learning_rate": 7.907095477793563e-06, "loss": 0.2232, "step": 2163 }, { "epoch": 0.3236129804097503, "grad_norm": 1.7007378332145011, "learning_rate": 7.905124729008996e-06, "loss": 0.3449, "step": 2164 }, { "epoch": 0.3237625243008823, "grad_norm": 5.5988236758577, "learning_rate": 7.903153298654173e-06, "loss": 0.2263, "step": 2165 }, { "epoch": 0.32391206819201435, "grad_norm": 1.7620287916310595, "learning_rate": 7.901181187191606e-06, "loss": 0.4116, "step": 2166 }, { "epoch": 0.3240616120831464, "grad_norm": 1.6633135381694006, "learning_rate": 7.899208395083974e-06, "loss": 0.2297, "step": 2167 }, { "epoch": 0.32421115597427846, "grad_norm": 0.9621828860582361, "learning_rate": 7.897234922794113e-06, "loss": 0.2098, "step": 2168 }, { "epoch": 0.3243606998654105, "grad_norm": 1.3513242795516456, "learning_rate": 7.895260770785014e-06, "loss": 0.3103, "step": 2169 }, { "epoch": 0.3245102437565425, "grad_norm": 0.9735967195147534, "learning_rate": 7.893285939519836e-06, "loss": 0.1699, "step": 2170 }, { "epoch": 0.3246597876476746, "grad_norm": 2.0453322690120554, "learning_rate": 7.891310429461895e-06, "loss": 0.5192, "step": 2171 }, { "epoch": 0.32480933153880664, "grad_norm": 1.4629318475963238, "learning_rate": 7.889334241074663e-06, "loss": 0.4312, "step": 2172 }, { "epoch": 0.3249588754299387, "grad_norm": 1.5333310991077456, "learning_rate": 7.887357374821768e-06, "loss": 0.2691, "step": 2173 }, { "epoch": 0.3251084193210707, "grad_norm": 1.6289642332812333, "learning_rate": 7.88537983116701e-06, "loss": 0.4631, "step": 2174 }, { "epoch": 0.3252579632122028, "grad_norm": 1.166966622363864, "learning_rate": 7.883401610574338e-06, "loss": 0.1523, "step": 2175 }, { "epoch": 0.3254075071033348, "grad_norm": 1.7643050095516288, "learning_rate": 7.881422713507857e-06, "loss": 0.4944, "step": 2176 }, { "epoch": 0.32555705099446686, "grad_norm": 2.0778286729797943, "learning_rate": 7.879443140431837e-06, "loss": 0.5285, "step": 2177 }, { "epoch": 0.32570659488559894, "grad_norm": 1.22732284237921, "learning_rate": 7.877462891810708e-06, "loss": 0.2215, "step": 2178 }, { "epoch": 0.325856138776731, "grad_norm": 1.794981875573838, "learning_rate": 7.875481968109052e-06, "loss": 0.4738, "step": 2179 }, { "epoch": 0.326005682667863, "grad_norm": 1.8625636048305199, "learning_rate": 7.873500369791615e-06, "loss": 0.5112, "step": 2180 }, { "epoch": 0.3261552265589951, "grad_norm": 1.5970308413484968, "learning_rate": 7.8715180973233e-06, "loss": 0.3491, "step": 2181 }, { "epoch": 0.3263047704501271, "grad_norm": 4.445014172618314, "learning_rate": 7.869535151169163e-06, "loss": 0.4128, "step": 2182 }, { "epoch": 0.32645431434125916, "grad_norm": 2.199504833687433, "learning_rate": 7.867551531794427e-06, "loss": 0.5042, "step": 2183 }, { "epoch": 0.3266038582323912, "grad_norm": 1.9133519547070805, "learning_rate": 7.865567239664463e-06, "loss": 0.5783, "step": 2184 }, { "epoch": 0.3267534021235233, "grad_norm": 1.847425837596219, "learning_rate": 7.86358227524481e-06, "loss": 0.5434, "step": 2185 }, { "epoch": 0.3269029460146553, "grad_norm": 1.7506088747332584, "learning_rate": 7.861596639001157e-06, "loss": 0.3848, "step": 2186 }, { "epoch": 0.32705248990578734, "grad_norm": 1.6285208330736813, "learning_rate": 7.859610331399354e-06, "loss": 0.2588, "step": 2187 }, { "epoch": 0.3272020337969194, "grad_norm": 1.0354818947520257, "learning_rate": 7.85762335290541e-06, "loss": 0.1884, "step": 2188 }, { "epoch": 0.32735157768805145, "grad_norm": 2.061471227348859, "learning_rate": 7.855635703985487e-06, "loss": 0.5502, "step": 2189 }, { "epoch": 0.3275011215791835, "grad_norm": 1.0589713974920532, "learning_rate": 7.853647385105905e-06, "loss": 0.1894, "step": 2190 }, { "epoch": 0.3276506654703155, "grad_norm": 1.27660152324297, "learning_rate": 7.851658396733148e-06, "loss": 0.2325, "step": 2191 }, { "epoch": 0.3278002093614476, "grad_norm": 1.602286282256986, "learning_rate": 7.849668739333846e-06, "loss": 0.1959, "step": 2192 }, { "epoch": 0.32794975325257963, "grad_norm": 1.3119010750867572, "learning_rate": 7.847678413374795e-06, "loss": 0.1982, "step": 2193 }, { "epoch": 0.32809929714371167, "grad_norm": 1.2840343160987466, "learning_rate": 7.845687419322945e-06, "loss": 0.2299, "step": 2194 }, { "epoch": 0.32824884103484375, "grad_norm": 1.5111699549772195, "learning_rate": 7.843695757645402e-06, "loss": 0.2698, "step": 2195 }, { "epoch": 0.3283983849259758, "grad_norm": 1.1728180317528463, "learning_rate": 7.841703428809426e-06, "loss": 0.1911, "step": 2196 }, { "epoch": 0.3285479288171078, "grad_norm": 1.623773790762259, "learning_rate": 7.839710433282441e-06, "loss": 0.4539, "step": 2197 }, { "epoch": 0.32869747270823985, "grad_norm": 1.3523478321165887, "learning_rate": 7.83771677153202e-06, "loss": 0.1695, "step": 2198 }, { "epoch": 0.32884701659937193, "grad_norm": 1.669502440977397, "learning_rate": 7.835722444025898e-06, "loss": 0.2157, "step": 2199 }, { "epoch": 0.32899656049050396, "grad_norm": 1.3304613082395382, "learning_rate": 7.83372745123196e-06, "loss": 0.1949, "step": 2200 }, { "epoch": 0.329146104381636, "grad_norm": 1.857526998824266, "learning_rate": 7.831731793618253e-06, "loss": 0.2801, "step": 2201 }, { "epoch": 0.3292956482727681, "grad_norm": 1.4165319093489228, "learning_rate": 7.829735471652978e-06, "loss": 0.3415, "step": 2202 }, { "epoch": 0.3294451921639001, "grad_norm": 1.193763566127306, "learning_rate": 7.827738485804488e-06, "loss": 0.1967, "step": 2203 }, { "epoch": 0.32959473605503214, "grad_norm": 1.4097654443065837, "learning_rate": 7.825740836541299e-06, "loss": 0.2322, "step": 2204 }, { "epoch": 0.3297442799461642, "grad_norm": 0.8446160856119209, "learning_rate": 7.823742524332074e-06, "loss": 0.1748, "step": 2205 }, { "epoch": 0.32989382383729626, "grad_norm": 1.4283812084385221, "learning_rate": 7.821743549645642e-06, "loss": 0.2485, "step": 2206 }, { "epoch": 0.3300433677284283, "grad_norm": 1.522841097169646, "learning_rate": 7.819743912950979e-06, "loss": 0.3102, "step": 2207 }, { "epoch": 0.3301929116195603, "grad_norm": 1.07026655557967, "learning_rate": 7.817743614717218e-06, "loss": 0.2192, "step": 2208 }, { "epoch": 0.3303424555106924, "grad_norm": 1.5961930831288902, "learning_rate": 7.815742655413651e-06, "loss": 0.3224, "step": 2209 }, { "epoch": 0.33049199940182444, "grad_norm": 1.4925568208469404, "learning_rate": 7.813741035509718e-06, "loss": 0.4544, "step": 2210 }, { "epoch": 0.3306415432929565, "grad_norm": 1.0278606288492513, "learning_rate": 7.811738755475024e-06, "loss": 0.1586, "step": 2211 }, { "epoch": 0.3307910871840885, "grad_norm": 1.6801015432287125, "learning_rate": 7.80973581577932e-06, "loss": 0.3744, "step": 2212 }, { "epoch": 0.3309406310752206, "grad_norm": 1.7419776704612941, "learning_rate": 7.807732216892514e-06, "loss": 0.5292, "step": 2213 }, { "epoch": 0.3310901749663526, "grad_norm": 1.2090204199623609, "learning_rate": 7.80572795928467e-06, "loss": 0.3566, "step": 2214 }, { "epoch": 0.33123971885748466, "grad_norm": 1.0659334336267288, "learning_rate": 7.803723043426008e-06, "loss": 0.1775, "step": 2215 }, { "epoch": 0.33138926274861674, "grad_norm": 1.2471584103742004, "learning_rate": 7.8017174697869e-06, "loss": 0.2591, "step": 2216 }, { "epoch": 0.3315388066397488, "grad_norm": 1.2868762000450256, "learning_rate": 7.799711238837871e-06, "loss": 0.2203, "step": 2217 }, { "epoch": 0.3316883505308808, "grad_norm": 1.597988709645616, "learning_rate": 7.797704351049604e-06, "loss": 0.4692, "step": 2218 }, { "epoch": 0.33183789442201284, "grad_norm": 1.6497915275322537, "learning_rate": 7.795696806892936e-06, "loss": 0.5004, "step": 2219 }, { "epoch": 0.3319874383131449, "grad_norm": 1.3728703674339666, "learning_rate": 7.793688606838852e-06, "loss": 0.2219, "step": 2220 }, { "epoch": 0.33213698220427695, "grad_norm": 2.095794714839839, "learning_rate": 7.791679751358497e-06, "loss": 0.3322, "step": 2221 }, { "epoch": 0.332286526095409, "grad_norm": 1.5144434482126574, "learning_rate": 7.789670240923169e-06, "loss": 0.3003, "step": 2222 }, { "epoch": 0.33243606998654107, "grad_norm": 1.5220974726280843, "learning_rate": 7.787660076004316e-06, "loss": 0.3285, "step": 2223 }, { "epoch": 0.3325856138776731, "grad_norm": 1.440338460328666, "learning_rate": 7.785649257073544e-06, "loss": 0.2101, "step": 2224 }, { "epoch": 0.33273515776880513, "grad_norm": 1.5213659837860871, "learning_rate": 7.783637784602608e-06, "loss": 0.2036, "step": 2225 }, { "epoch": 0.33288470165993717, "grad_norm": 2.991037147646805, "learning_rate": 7.781625659063423e-06, "loss": 0.6472, "step": 2226 }, { "epoch": 0.33303424555106925, "grad_norm": 1.511479617581753, "learning_rate": 7.779612880928052e-06, "loss": 0.1741, "step": 2227 }, { "epoch": 0.3331837894422013, "grad_norm": 1.4094694160926888, "learning_rate": 7.777599450668708e-06, "loss": 0.1864, "step": 2228 }, { "epoch": 0.3333333333333333, "grad_norm": 1.9589341253223518, "learning_rate": 7.775585368757767e-06, "loss": 0.2585, "step": 2229 }, { "epoch": 0.3334828772244654, "grad_norm": 1.5850528260281727, "learning_rate": 7.773570635667746e-06, "loss": 0.3359, "step": 2230 }, { "epoch": 0.33363242111559743, "grad_norm": 1.928677128362692, "learning_rate": 7.771555251871326e-06, "loss": 0.5367, "step": 2231 }, { "epoch": 0.33378196500672946, "grad_norm": 1.8147255946503564, "learning_rate": 7.769539217841333e-06, "loss": 0.2452, "step": 2232 }, { "epoch": 0.3339315088978615, "grad_norm": 1.533478905239483, "learning_rate": 7.76752253405075e-06, "loss": 0.3273, "step": 2233 }, { "epoch": 0.3340810527889936, "grad_norm": 1.3049158892916473, "learning_rate": 7.765505200972708e-06, "loss": 0.2014, "step": 2234 }, { "epoch": 0.3342305966801256, "grad_norm": 1.6006461196766226, "learning_rate": 7.763487219080492e-06, "loss": 0.3297, "step": 2235 }, { "epoch": 0.33438014057125764, "grad_norm": 1.3959531694066092, "learning_rate": 7.761468588847543e-06, "loss": 0.1987, "step": 2236 }, { "epoch": 0.33452968446238973, "grad_norm": 1.413036480020334, "learning_rate": 7.75944931074745e-06, "loss": 0.2507, "step": 2237 }, { "epoch": 0.33467922835352176, "grad_norm": 1.6143147896405532, "learning_rate": 7.757429385253955e-06, "loss": 0.365, "step": 2238 }, { "epoch": 0.3348287722446538, "grad_norm": 1.2552402165493568, "learning_rate": 7.755408812840952e-06, "loss": 0.2404, "step": 2239 }, { "epoch": 0.3349783161357859, "grad_norm": 1.884570594935436, "learning_rate": 7.753387593982485e-06, "loss": 0.4998, "step": 2240 }, { "epoch": 0.3351278600269179, "grad_norm": 1.5525626441469005, "learning_rate": 7.751365729152754e-06, "loss": 0.1856, "step": 2241 }, { "epoch": 0.33527740391804994, "grad_norm": 1.5710541861971588, "learning_rate": 7.749343218826107e-06, "loss": 0.1989, "step": 2242 }, { "epoch": 0.335426947809182, "grad_norm": 1.9778222905539145, "learning_rate": 7.747320063477045e-06, "loss": 0.602, "step": 2243 }, { "epoch": 0.33557649170031406, "grad_norm": 1.6193619290401855, "learning_rate": 7.745296263580218e-06, "loss": 0.2811, "step": 2244 }, { "epoch": 0.3357260355914461, "grad_norm": 1.9654135361350595, "learning_rate": 7.743271819610432e-06, "loss": 0.5169, "step": 2245 }, { "epoch": 0.3358755794825781, "grad_norm": 1.5452430622711606, "learning_rate": 7.741246732042638e-06, "loss": 0.345, "step": 2246 }, { "epoch": 0.3360251233737102, "grad_norm": 1.7587162322576044, "learning_rate": 7.739221001351942e-06, "loss": 0.4502, "step": 2247 }, { "epoch": 0.33617466726484224, "grad_norm": 1.9373589145144738, "learning_rate": 7.7371946280136e-06, "loss": 0.3852, "step": 2248 }, { "epoch": 0.3363242111559743, "grad_norm": 1.9597929999948922, "learning_rate": 7.73516761250302e-06, "loss": 0.3904, "step": 2249 }, { "epoch": 0.3364737550471063, "grad_norm": 1.8798765980010916, "learning_rate": 7.733139955295756e-06, "loss": 0.5182, "step": 2250 }, { "epoch": 0.3366232989382384, "grad_norm": 1.626082298346251, "learning_rate": 7.73111165686752e-06, "loss": 0.3692, "step": 2251 }, { "epoch": 0.3367728428293704, "grad_norm": 0.9953658155537086, "learning_rate": 7.72908271769417e-06, "loss": 0.1844, "step": 2252 }, { "epoch": 0.33692238672050245, "grad_norm": 1.3727196538195634, "learning_rate": 7.727053138251712e-06, "loss": 0.2179, "step": 2253 }, { "epoch": 0.33707193061163454, "grad_norm": 1.4912804935224222, "learning_rate": 7.725022919016306e-06, "loss": 0.3038, "step": 2254 }, { "epoch": 0.33722147450276657, "grad_norm": 1.5520829591707979, "learning_rate": 7.722992060464261e-06, "loss": 0.2481, "step": 2255 }, { "epoch": 0.3373710183938986, "grad_norm": 0.9325160815572334, "learning_rate": 7.720960563072035e-06, "loss": 0.1387, "step": 2256 }, { "epoch": 0.33752056228503063, "grad_norm": 1.1905265030205905, "learning_rate": 7.718928427316241e-06, "loss": 0.2595, "step": 2257 }, { "epoch": 0.3376701061761627, "grad_norm": 1.0242402171948481, "learning_rate": 7.716895653673633e-06, "loss": 0.1728, "step": 2258 }, { "epoch": 0.33781965006729475, "grad_norm": 1.9399325651192516, "learning_rate": 7.714862242621121e-06, "loss": 0.236, "step": 2259 }, { "epoch": 0.3379691939584268, "grad_norm": 1.2589445942870727, "learning_rate": 7.712828194635762e-06, "loss": 0.1777, "step": 2260 }, { "epoch": 0.33811873784955887, "grad_norm": 1.0189796376895088, "learning_rate": 7.710793510194765e-06, "loss": 0.2061, "step": 2261 }, { "epoch": 0.3382682817406909, "grad_norm": 1.3110462323223497, "learning_rate": 7.708758189775485e-06, "loss": 0.1907, "step": 2262 }, { "epoch": 0.33841782563182293, "grad_norm": 1.8977870965152843, "learning_rate": 7.706722233855428e-06, "loss": 0.4697, "step": 2263 }, { "epoch": 0.33856736952295496, "grad_norm": 1.676037286436981, "learning_rate": 7.70468564291225e-06, "loss": 0.5385, "step": 2264 }, { "epoch": 0.33871691341408705, "grad_norm": 1.8926850673510127, "learning_rate": 7.702648417423755e-06, "loss": 0.6545, "step": 2265 }, { "epoch": 0.3388664573052191, "grad_norm": 1.211799579288037, "learning_rate": 7.700610557867894e-06, "loss": 0.1933, "step": 2266 }, { "epoch": 0.3390160011963511, "grad_norm": 1.3249080152566644, "learning_rate": 7.69857206472277e-06, "loss": 0.3369, "step": 2267 }, { "epoch": 0.3391655450874832, "grad_norm": 1.197942464661273, "learning_rate": 7.696532938466631e-06, "loss": 0.2168, "step": 2268 }, { "epoch": 0.33931508897861523, "grad_norm": 1.5336422309655264, "learning_rate": 7.69449317957788e-06, "loss": 0.1862, "step": 2269 }, { "epoch": 0.33946463286974726, "grad_norm": 1.4755775481717086, "learning_rate": 7.692452788535058e-06, "loss": 0.3991, "step": 2270 }, { "epoch": 0.3396141767608793, "grad_norm": 1.2003981851028729, "learning_rate": 7.690411765816864e-06, "loss": 0.1887, "step": 2271 }, { "epoch": 0.3397637206520114, "grad_norm": 1.5232479955721698, "learning_rate": 7.688370111902141e-06, "loss": 0.3291, "step": 2272 }, { "epoch": 0.3399132645431434, "grad_norm": 1.1630358360895665, "learning_rate": 7.686327827269883e-06, "loss": 0.3571, "step": 2273 }, { "epoch": 0.34006280843427544, "grad_norm": 1.2180304152897419, "learning_rate": 7.684284912399227e-06, "loss": 0.2026, "step": 2274 }, { "epoch": 0.34021235232540753, "grad_norm": 1.2838128091625958, "learning_rate": 7.68224136776946e-06, "loss": 0.1883, "step": 2275 }, { "epoch": 0.34036189621653956, "grad_norm": 1.9043193483249239, "learning_rate": 7.680197193860019e-06, "loss": 0.5335, "step": 2276 }, { "epoch": 0.3405114401076716, "grad_norm": 1.6719584667012997, "learning_rate": 7.678152391150488e-06, "loss": 0.4823, "step": 2277 }, { "epoch": 0.3406609839988036, "grad_norm": 1.614799856208538, "learning_rate": 7.676106960120595e-06, "loss": 0.3589, "step": 2278 }, { "epoch": 0.3408105278899357, "grad_norm": 1.915461278238496, "learning_rate": 7.674060901250217e-06, "loss": 0.4702, "step": 2279 }, { "epoch": 0.34096007178106774, "grad_norm": 1.80642832565446, "learning_rate": 7.672014215019382e-06, "loss": 0.5783, "step": 2280 }, { "epoch": 0.3411096156721998, "grad_norm": 1.8185018530729478, "learning_rate": 7.66996690190826e-06, "loss": 0.2136, "step": 2281 }, { "epoch": 0.34125915956333186, "grad_norm": 1.5750847935557877, "learning_rate": 7.667918962397172e-06, "loss": 0.3577, "step": 2282 }, { "epoch": 0.3414087034544639, "grad_norm": 1.4052021520161486, "learning_rate": 7.665870396966582e-06, "loss": 0.3421, "step": 2283 }, { "epoch": 0.3415582473455959, "grad_norm": 2.4796312435140964, "learning_rate": 7.663821206097106e-06, "loss": 0.5199, "step": 2284 }, { "epoch": 0.34170779123672795, "grad_norm": 1.9135083427877795, "learning_rate": 7.661771390269506e-06, "loss": 0.5724, "step": 2285 }, { "epoch": 0.34185733512786004, "grad_norm": 1.47040509500929, "learning_rate": 7.65972094996468e-06, "loss": 0.3446, "step": 2286 }, { "epoch": 0.34200687901899207, "grad_norm": 1.8826362598607522, "learning_rate": 7.65766988566369e-06, "loss": 0.4584, "step": 2287 }, { "epoch": 0.3421564229101241, "grad_norm": 1.583414431479772, "learning_rate": 7.65561819784773e-06, "loss": 0.2571, "step": 2288 }, { "epoch": 0.3423059668012562, "grad_norm": 1.1374097342884393, "learning_rate": 7.653565886998149e-06, "loss": 0.2772, "step": 2289 }, { "epoch": 0.3424555106923882, "grad_norm": 1.6018927652316786, "learning_rate": 7.651512953596438e-06, "loss": 0.3834, "step": 2290 }, { "epoch": 0.34260505458352025, "grad_norm": 1.5898030999912747, "learning_rate": 7.649459398124233e-06, "loss": 0.268, "step": 2291 }, { "epoch": 0.3427545984746523, "grad_norm": 1.788880984497509, "learning_rate": 7.64740522106332e-06, "loss": 0.4031, "step": 2292 }, { "epoch": 0.34290414236578437, "grad_norm": 1.4103716345007062, "learning_rate": 7.645350422895627e-06, "loss": 0.34, "step": 2293 }, { "epoch": 0.3430536862569164, "grad_norm": 1.455320179410942, "learning_rate": 7.643295004103232e-06, "loss": 0.393, "step": 2294 }, { "epoch": 0.34320323014804843, "grad_norm": 1.121798327145041, "learning_rate": 7.641238965168356e-06, "loss": 0.1951, "step": 2295 }, { "epoch": 0.3433527740391805, "grad_norm": 1.959743352728859, "learning_rate": 7.639182306573362e-06, "loss": 0.5306, "step": 2296 }, { "epoch": 0.34350231793031255, "grad_norm": 1.0222053736742456, "learning_rate": 7.637125028800765e-06, "loss": 0.1877, "step": 2297 }, { "epoch": 0.3436518618214446, "grad_norm": 1.2126683631983057, "learning_rate": 7.63506713233322e-06, "loss": 0.2219, "step": 2298 }, { "epoch": 0.34380140571257667, "grad_norm": 1.182330026578451, "learning_rate": 7.633008617653531e-06, "loss": 0.2427, "step": 2299 }, { "epoch": 0.3439509496037087, "grad_norm": 1.3284239825142252, "learning_rate": 7.630949485244646e-06, "loss": 0.3342, "step": 2300 }, { "epoch": 0.34410049349484073, "grad_norm": 1.4991905889604522, "learning_rate": 7.6288897355896565e-06, "loss": 0.3277, "step": 2301 }, { "epoch": 0.34425003738597276, "grad_norm": 1.2105004198602645, "learning_rate": 7.6268293691718e-06, "loss": 0.2175, "step": 2302 }, { "epoch": 0.34439958127710485, "grad_norm": 1.137201185526714, "learning_rate": 7.624768386474456e-06, "loss": 0.1951, "step": 2303 }, { "epoch": 0.3445491251682369, "grad_norm": 1.2921483182674967, "learning_rate": 7.622706787981153e-06, "loss": 0.2069, "step": 2304 }, { "epoch": 0.3446986690593689, "grad_norm": 1.5385416209630458, "learning_rate": 7.620644574175562e-06, "loss": 0.3467, "step": 2305 }, { "epoch": 0.344848212950501, "grad_norm": 2.272200349391826, "learning_rate": 7.6185817455414975e-06, "loss": 0.4775, "step": 2306 }, { "epoch": 0.34499775684163303, "grad_norm": 1.7594806067750843, "learning_rate": 7.61651830256292e-06, "loss": 0.2168, "step": 2307 }, { "epoch": 0.34514730073276506, "grad_norm": 1.9724043128153095, "learning_rate": 7.614454245723932e-06, "loss": 0.5726, "step": 2308 }, { "epoch": 0.3452968446238971, "grad_norm": 1.4742051107263883, "learning_rate": 7.612389575508781e-06, "loss": 0.3032, "step": 2309 }, { "epoch": 0.3454463885150292, "grad_norm": 2.0620637201052325, "learning_rate": 7.610324292401861e-06, "loss": 0.309, "step": 2310 }, { "epoch": 0.3455959324061612, "grad_norm": 1.0544585597609104, "learning_rate": 7.608258396887702e-06, "loss": 0.2339, "step": 2311 }, { "epoch": 0.34574547629729324, "grad_norm": 1.2388411799285035, "learning_rate": 7.606191889450989e-06, "loss": 0.1824, "step": 2312 }, { "epoch": 0.34589502018842533, "grad_norm": 1.9886690672129868, "learning_rate": 7.604124770576539e-06, "loss": 0.3321, "step": 2313 }, { "epoch": 0.34604456407955736, "grad_norm": 1.3774072344684907, "learning_rate": 7.602057040749325e-06, "loss": 0.3824, "step": 2314 }, { "epoch": 0.3461941079706894, "grad_norm": 0.842321998785562, "learning_rate": 7.59998870045445e-06, "loss": 0.1966, "step": 2315 }, { "epoch": 0.3463436518618214, "grad_norm": 1.6133223844738274, "learning_rate": 7.597919750177168e-06, "loss": 0.3439, "step": 2316 }, { "epoch": 0.3464931957529535, "grad_norm": 2.701793044817531, "learning_rate": 7.595850190402877e-06, "loss": 0.3649, "step": 2317 }, { "epoch": 0.34664273964408554, "grad_norm": 2.042826281041951, "learning_rate": 7.593780021617115e-06, "loss": 0.5945, "step": 2318 }, { "epoch": 0.34679228353521757, "grad_norm": 1.8226181825863899, "learning_rate": 7.591709244305561e-06, "loss": 0.4039, "step": 2319 }, { "epoch": 0.34694182742634966, "grad_norm": 1.7612529231819496, "learning_rate": 7.589637858954041e-06, "loss": 0.2384, "step": 2320 }, { "epoch": 0.3470913713174817, "grad_norm": 1.2495900823088222, "learning_rate": 7.587565866048523e-06, "loss": 0.3633, "step": 2321 }, { "epoch": 0.3472409152086137, "grad_norm": 1.2456808766471323, "learning_rate": 7.5854932660751144e-06, "loss": 0.2318, "step": 2322 }, { "epoch": 0.34739045909974575, "grad_norm": 1.772092412043284, "learning_rate": 7.58342005952007e-06, "loss": 0.2514, "step": 2323 }, { "epoch": 0.34754000299087784, "grad_norm": 1.0718322965745777, "learning_rate": 7.581346246869781e-06, "loss": 0.2001, "step": 2324 }, { "epoch": 0.34768954688200987, "grad_norm": 1.9749261015333692, "learning_rate": 7.579271828610786e-06, "loss": 0.642, "step": 2325 }, { "epoch": 0.3478390907731419, "grad_norm": 3.5434908077443183, "learning_rate": 7.5771968052297605e-06, "loss": 0.2132, "step": 2326 }, { "epoch": 0.347988634664274, "grad_norm": 1.3138111751860577, "learning_rate": 7.575121177213528e-06, "loss": 0.2074, "step": 2327 }, { "epoch": 0.348138178555406, "grad_norm": 1.4549216960822273, "learning_rate": 7.573044945049051e-06, "loss": 0.2213, "step": 2328 }, { "epoch": 0.34828772244653805, "grad_norm": 1.9285857186532291, "learning_rate": 7.5709681092234315e-06, "loss": 0.5573, "step": 2329 }, { "epoch": 0.3484372663376701, "grad_norm": 1.528743992296175, "learning_rate": 7.568890670223918e-06, "loss": 0.2517, "step": 2330 }, { "epoch": 0.34858681022880217, "grad_norm": 1.3064713646436477, "learning_rate": 7.566812628537894e-06, "loss": 0.1867, "step": 2331 }, { "epoch": 0.3487363541199342, "grad_norm": 1.3819819935958808, "learning_rate": 7.56473398465289e-06, "loss": 0.2354, "step": 2332 }, { "epoch": 0.34888589801106623, "grad_norm": 1.90386469705261, "learning_rate": 7.5626547390565766e-06, "loss": 0.6091, "step": 2333 }, { "epoch": 0.3490354419021983, "grad_norm": 1.1572160857164981, "learning_rate": 7.5605748922367636e-06, "loss": 0.1634, "step": 2334 }, { "epoch": 0.34918498579333035, "grad_norm": 1.120922647106074, "learning_rate": 7.558494444681405e-06, "loss": 0.2191, "step": 2335 }, { "epoch": 0.3493345296844624, "grad_norm": 1.8933761063913324, "learning_rate": 7.556413396878593e-06, "loss": 0.4644, "step": 2336 }, { "epoch": 0.3494840735755944, "grad_norm": 2.2982217544810526, "learning_rate": 7.554331749316559e-06, "loss": 0.5312, "step": 2337 }, { "epoch": 0.3496336174667265, "grad_norm": 1.4598497201666212, "learning_rate": 7.552249502483681e-06, "loss": 0.2322, "step": 2338 }, { "epoch": 0.34978316135785853, "grad_norm": 1.0002482253315497, "learning_rate": 7.550166656868472e-06, "loss": 0.183, "step": 2339 }, { "epoch": 0.34993270524899056, "grad_norm": 1.7931068633163847, "learning_rate": 7.548083212959588e-06, "loss": 0.3655, "step": 2340 }, { "epoch": 0.35008224914012265, "grad_norm": 1.9088729651038523, "learning_rate": 7.545999171245826e-06, "loss": 0.5897, "step": 2341 }, { "epoch": 0.3502317930312547, "grad_norm": 1.6014278355250144, "learning_rate": 7.543914532216121e-06, "loss": 0.3399, "step": 2342 }, { "epoch": 0.3503813369223867, "grad_norm": 1.589397325565207, "learning_rate": 7.541829296359552e-06, "loss": 0.5276, "step": 2343 }, { "epoch": 0.35053088081351874, "grad_norm": 1.3666145854556788, "learning_rate": 7.5397434641653325e-06, "loss": 0.297, "step": 2344 }, { "epoch": 0.35068042470465083, "grad_norm": 1.1296429490766309, "learning_rate": 7.53765703612282e-06, "loss": 0.1881, "step": 2345 }, { "epoch": 0.35082996859578286, "grad_norm": 1.3370573999376023, "learning_rate": 7.535570012721509e-06, "loss": 0.1978, "step": 2346 }, { "epoch": 0.3509795124869149, "grad_norm": 1.80559813243436, "learning_rate": 7.533482394451037e-06, "loss": 0.5293, "step": 2347 }, { "epoch": 0.351129056378047, "grad_norm": 1.2601205754536828, "learning_rate": 7.531394181801182e-06, "loss": 0.3029, "step": 2348 }, { "epoch": 0.351278600269179, "grad_norm": 1.5151339748208972, "learning_rate": 7.529305375261852e-06, "loss": 0.354, "step": 2349 }, { "epoch": 0.35142814416031104, "grad_norm": 1.1154348924689401, "learning_rate": 7.52721597532311e-06, "loss": 0.1877, "step": 2350 }, { "epoch": 0.35157768805144307, "grad_norm": 1.6903766996209675, "learning_rate": 7.525125982475141e-06, "loss": 0.3838, "step": 2351 }, { "epoch": 0.35172723194257516, "grad_norm": 1.5111917986875527, "learning_rate": 7.523035397208281e-06, "loss": 0.3664, "step": 2352 }, { "epoch": 0.3518767758337072, "grad_norm": 14.577070159201043, "learning_rate": 7.520944220013002e-06, "loss": 0.2204, "step": 2353 }, { "epoch": 0.3520263197248392, "grad_norm": 1.2720960634080132, "learning_rate": 7.518852451379914e-06, "loss": 0.2156, "step": 2354 }, { "epoch": 0.3521758636159713, "grad_norm": 1.4752321481872246, "learning_rate": 7.516760091799766e-06, "loss": 0.2068, "step": 2355 }, { "epoch": 0.35232540750710334, "grad_norm": 1.2757349060760932, "learning_rate": 7.5146671417634456e-06, "loss": 0.3031, "step": 2356 }, { "epoch": 0.35247495139823537, "grad_norm": 1.1892854387567553, "learning_rate": 7.512573601761979e-06, "loss": 0.1938, "step": 2357 }, { "epoch": 0.35262449528936746, "grad_norm": 1.1695321760256765, "learning_rate": 7.5104794722865305e-06, "loss": 0.1994, "step": 2358 }, { "epoch": 0.3527740391804995, "grad_norm": 1.4157720422061164, "learning_rate": 7.5083847538284025e-06, "loss": 0.2415, "step": 2359 }, { "epoch": 0.3529235830716315, "grad_norm": 1.642650861184269, "learning_rate": 7.506289446879038e-06, "loss": 0.3864, "step": 2360 }, { "epoch": 0.35307312696276355, "grad_norm": 1.66074120706464, "learning_rate": 7.504193551930014e-06, "loss": 0.2436, "step": 2361 }, { "epoch": 0.35322267085389564, "grad_norm": 1.6293105897438254, "learning_rate": 7.502097069473051e-06, "loss": 0.4727, "step": 2362 }, { "epoch": 0.35337221474502767, "grad_norm": 2.1484736491543326, "learning_rate": 7.500000000000001e-06, "loss": 0.6399, "step": 2363 }, { "epoch": 0.3535217586361597, "grad_norm": 1.998088679617451, "learning_rate": 7.497902344002858e-06, "loss": 0.4879, "step": 2364 }, { "epoch": 0.3536713025272918, "grad_norm": 1.0466569132461796, "learning_rate": 7.495804101973751e-06, "loss": 0.175, "step": 2365 }, { "epoch": 0.3538208464184238, "grad_norm": 1.483423152464052, "learning_rate": 7.49370527440495e-06, "loss": 0.1817, "step": 2366 }, { "epoch": 0.35397039030955585, "grad_norm": 1.181675839490534, "learning_rate": 7.491605861788856e-06, "loss": 0.2158, "step": 2367 }, { "epoch": 0.3541199342006879, "grad_norm": 1.272613662606653, "learning_rate": 7.4895058646180165e-06, "loss": 0.2161, "step": 2368 }, { "epoch": 0.35426947809181997, "grad_norm": 2.249706812141276, "learning_rate": 7.487405283385109e-06, "loss": 0.4302, "step": 2369 }, { "epoch": 0.354419021982952, "grad_norm": 1.790056947576118, "learning_rate": 7.485304118582949e-06, "loss": 0.4237, "step": 2370 }, { "epoch": 0.35456856587408403, "grad_norm": 1.728469311368499, "learning_rate": 7.483202370704492e-06, "loss": 0.232, "step": 2371 }, { "epoch": 0.3547181097652161, "grad_norm": 1.7204599933992115, "learning_rate": 7.481100040242827e-06, "loss": 0.2945, "step": 2372 }, { "epoch": 0.35486765365634815, "grad_norm": 1.7017419956837787, "learning_rate": 7.478997127691181e-06, "loss": 0.324, "step": 2373 }, { "epoch": 0.3550171975474802, "grad_norm": 1.7242409041398354, "learning_rate": 7.476893633542917e-06, "loss": 0.379, "step": 2374 }, { "epoch": 0.3551667414386122, "grad_norm": 1.7352420641389674, "learning_rate": 7.474789558291537e-06, "loss": 0.3666, "step": 2375 }, { "epoch": 0.3553162853297443, "grad_norm": 2.417484284214745, "learning_rate": 7.472684902430678e-06, "loss": 0.5313, "step": 2376 }, { "epoch": 0.35546582922087633, "grad_norm": 1.9863314858317191, "learning_rate": 7.470579666454108e-06, "loss": 0.4731, "step": 2377 }, { "epoch": 0.35561537311200836, "grad_norm": 1.3767673556499953, "learning_rate": 7.46847385085574e-06, "loss": 0.2309, "step": 2378 }, { "epoch": 0.35576491700314045, "grad_norm": 2.1027176103910423, "learning_rate": 7.466367456129616e-06, "loss": 0.2419, "step": 2379 }, { "epoch": 0.3559144608942725, "grad_norm": 1.3223085016147877, "learning_rate": 7.464260482769917e-06, "loss": 0.2214, "step": 2380 }, { "epoch": 0.3560640047854045, "grad_norm": 3.308046838444924, "learning_rate": 7.462152931270961e-06, "loss": 0.1637, "step": 2381 }, { "epoch": 0.35621354867653654, "grad_norm": 1.2529639271363728, "learning_rate": 7.4600448021271975e-06, "loss": 0.209, "step": 2382 }, { "epoch": 0.3563630925676686, "grad_norm": 1.4685959227889136, "learning_rate": 7.457936095833216e-06, "loss": 0.3916, "step": 2383 }, { "epoch": 0.35651263645880066, "grad_norm": 1.762730610248373, "learning_rate": 7.455826812883738e-06, "loss": 0.2468, "step": 2384 }, { "epoch": 0.3566621803499327, "grad_norm": 1.3974489415451616, "learning_rate": 7.453716953773622e-06, "loss": 0.2198, "step": 2385 }, { "epoch": 0.3568117242410648, "grad_norm": 1.6020912414110884, "learning_rate": 7.4516065189978625e-06, "loss": 0.3663, "step": 2386 }, { "epoch": 0.3569612681321968, "grad_norm": 1.1698502771825479, "learning_rate": 7.449495509051584e-06, "loss": 0.2301, "step": 2387 }, { "epoch": 0.35711081202332884, "grad_norm": 1.6443713783244578, "learning_rate": 7.447383924430055e-06, "loss": 0.2023, "step": 2388 }, { "epoch": 0.35726035591446087, "grad_norm": 1.320656374724735, "learning_rate": 7.44527176562867e-06, "loss": 0.1615, "step": 2389 }, { "epoch": 0.35740989980559296, "grad_norm": 2.147900342981391, "learning_rate": 7.4431590331429615e-06, "loss": 0.2496, "step": 2390 }, { "epoch": 0.357559443696725, "grad_norm": 1.0493224753110673, "learning_rate": 7.441045727468601e-06, "loss": 0.174, "step": 2391 }, { "epoch": 0.357708987587857, "grad_norm": 1.4365731625537776, "learning_rate": 7.4389318491013855e-06, "loss": 0.2509, "step": 2392 }, { "epoch": 0.3578585314789891, "grad_norm": 1.3570392728095118, "learning_rate": 7.436817398537253e-06, "loss": 0.3505, "step": 2393 }, { "epoch": 0.35800807537012114, "grad_norm": 0.9836140143478714, "learning_rate": 7.434702376272275e-06, "loss": 0.1676, "step": 2394 }, { "epoch": 0.35815761926125317, "grad_norm": 1.6386886795873667, "learning_rate": 7.4325867828026555e-06, "loss": 0.3311, "step": 2395 }, { "epoch": 0.3583071631523852, "grad_norm": 1.7664542023376915, "learning_rate": 7.4304706186247344e-06, "loss": 0.4952, "step": 2396 }, { "epoch": 0.3584567070435173, "grad_norm": 1.970974898714591, "learning_rate": 7.42835388423498e-06, "loss": 0.5721, "step": 2397 }, { "epoch": 0.3586062509346493, "grad_norm": 1.234101790360312, "learning_rate": 7.426236580130004e-06, "loss": 0.1842, "step": 2398 }, { "epoch": 0.35875579482578135, "grad_norm": 0.8359881523865066, "learning_rate": 7.424118706806543e-06, "loss": 0.1574, "step": 2399 }, { "epoch": 0.35890533871691344, "grad_norm": 1.2167515845771457, "learning_rate": 7.422000264761471e-06, "loss": 0.3393, "step": 2400 }, { "epoch": 0.35905488260804547, "grad_norm": 1.1652590311365825, "learning_rate": 7.419881254491794e-06, "loss": 0.3154, "step": 2401 }, { "epoch": 0.3592044264991775, "grad_norm": 1.7451208164751588, "learning_rate": 7.417761676494654e-06, "loss": 0.3306, "step": 2402 }, { "epoch": 0.35935397039030953, "grad_norm": 1.50819933578857, "learning_rate": 7.415641531267325e-06, "loss": 0.3414, "step": 2403 }, { "epoch": 0.3595035142814416, "grad_norm": 1.159271590101521, "learning_rate": 7.4135208193072126e-06, "loss": 0.1953, "step": 2404 }, { "epoch": 0.35965305817257365, "grad_norm": 1.6763078000161467, "learning_rate": 7.411399541111855e-06, "loss": 0.3306, "step": 2405 }, { "epoch": 0.3598026020637057, "grad_norm": 1.6105048047144768, "learning_rate": 7.409277697178926e-06, "loss": 0.2899, "step": 2406 }, { "epoch": 0.35995214595483777, "grad_norm": 1.6423579180660441, "learning_rate": 7.4071552880062295e-06, "loss": 0.1941, "step": 2407 }, { "epoch": 0.3601016898459698, "grad_norm": 1.984850391003558, "learning_rate": 7.4050323140917035e-06, "loss": 0.509, "step": 2408 }, { "epoch": 0.36025123373710183, "grad_norm": 1.6162723010996198, "learning_rate": 7.402908775933419e-06, "loss": 0.3583, "step": 2409 }, { "epoch": 0.36040077762823386, "grad_norm": 1.3526627301579437, "learning_rate": 7.400784674029579e-06, "loss": 0.2228, "step": 2410 }, { "epoch": 0.36055032151936595, "grad_norm": 1.0613997053031348, "learning_rate": 7.398660008878517e-06, "loss": 0.1782, "step": 2411 }, { "epoch": 0.360699865410498, "grad_norm": 1.4156014777328207, "learning_rate": 7.396534780978699e-06, "loss": 0.3675, "step": 2412 }, { "epoch": 0.36084940930163, "grad_norm": 2.0634027441697778, "learning_rate": 7.394408990828726e-06, "loss": 0.3529, "step": 2413 }, { "epoch": 0.3609989531927621, "grad_norm": 1.6015524340522593, "learning_rate": 7.392282638927326e-06, "loss": 0.233, "step": 2414 }, { "epoch": 0.3611484970838941, "grad_norm": 1.047148380596162, "learning_rate": 7.390155725773365e-06, "loss": 0.1861, "step": 2415 }, { "epoch": 0.36129804097502616, "grad_norm": 1.6393955983196027, "learning_rate": 7.388028251865837e-06, "loss": 0.1914, "step": 2416 }, { "epoch": 0.36144758486615824, "grad_norm": 1.3926718614765299, "learning_rate": 7.385900217703865e-06, "loss": 0.186, "step": 2417 }, { "epoch": 0.3615971287572903, "grad_norm": 1.5958216662832228, "learning_rate": 7.383771623786709e-06, "loss": 0.2127, "step": 2418 }, { "epoch": 0.3617466726484223, "grad_norm": 1.956943418621964, "learning_rate": 7.381642470613758e-06, "loss": 0.5055, "step": 2419 }, { "epoch": 0.36189621653955434, "grad_norm": 2.2836322625873855, "learning_rate": 7.37951275868453e-06, "loss": 0.6337, "step": 2420 }, { "epoch": 0.3620457604306864, "grad_norm": 1.922712275476603, "learning_rate": 7.3773824884986744e-06, "loss": 0.4718, "step": 2421 }, { "epoch": 0.36219530432181846, "grad_norm": 1.01255124960959, "learning_rate": 7.375251660555978e-06, "loss": 0.1985, "step": 2422 }, { "epoch": 0.3623448482129505, "grad_norm": 1.6681331401639201, "learning_rate": 7.373120275356349e-06, "loss": 0.4116, "step": 2423 }, { "epoch": 0.3624943921040826, "grad_norm": 1.3837406537641457, "learning_rate": 7.370988333399834e-06, "loss": 0.2207, "step": 2424 }, { "epoch": 0.3626439359952146, "grad_norm": 1.559290768311182, "learning_rate": 7.3688558351866055e-06, "loss": 0.2715, "step": 2425 }, { "epoch": 0.36279347988634664, "grad_norm": 1.305139066591499, "learning_rate": 7.366722781216968e-06, "loss": 0.3011, "step": 2426 }, { "epoch": 0.36294302377747867, "grad_norm": 1.6209491159652774, "learning_rate": 7.3645891719913584e-06, "loss": 0.3261, "step": 2427 }, { "epoch": 0.36309256766861076, "grad_norm": 1.4951914962551227, "learning_rate": 7.3624550080103385e-06, "loss": 0.3393, "step": 2428 }, { "epoch": 0.3632421115597428, "grad_norm": 1.8477503569108042, "learning_rate": 7.360320289774607e-06, "loss": 0.3535, "step": 2429 }, { "epoch": 0.3633916554508748, "grad_norm": 1.8411787417724799, "learning_rate": 7.358185017784989e-06, "loss": 0.3847, "step": 2430 }, { "epoch": 0.3635411993420069, "grad_norm": 1.3297346850852803, "learning_rate": 7.356049192542439e-06, "loss": 0.1938, "step": 2431 }, { "epoch": 0.36369074323313894, "grad_norm": 1.746289398019253, "learning_rate": 7.353912814548042e-06, "loss": 0.5313, "step": 2432 }, { "epoch": 0.36384028712427097, "grad_norm": 1.4432992935580808, "learning_rate": 7.351775884303013e-06, "loss": 0.3349, "step": 2433 }, { "epoch": 0.363989831015403, "grad_norm": 0.9649898248590643, "learning_rate": 7.349638402308696e-06, "loss": 0.2208, "step": 2434 }, { "epoch": 0.3641393749065351, "grad_norm": 1.8449998111001102, "learning_rate": 7.347500369066567e-06, "loss": 0.3445, "step": 2435 }, { "epoch": 0.3642889187976671, "grad_norm": 1.7052431627521538, "learning_rate": 7.345361785078227e-06, "loss": 0.2205, "step": 2436 }, { "epoch": 0.36443846268879915, "grad_norm": 1.172361472405274, "learning_rate": 7.343222650845408e-06, "loss": 0.217, "step": 2437 }, { "epoch": 0.36458800657993123, "grad_norm": 1.6796925766446777, "learning_rate": 7.341082966869975e-06, "loss": 0.3589, "step": 2438 }, { "epoch": 0.36473755047106327, "grad_norm": 1.7021974172455587, "learning_rate": 7.3389427336539146e-06, "loss": 0.3936, "step": 2439 }, { "epoch": 0.3648870943621953, "grad_norm": 1.5350766828655338, "learning_rate": 7.336801951699348e-06, "loss": 0.4299, "step": 2440 }, { "epoch": 0.36503663825332733, "grad_norm": 2.0471270711764524, "learning_rate": 7.334660621508523e-06, "loss": 0.6307, "step": 2441 }, { "epoch": 0.3651861821444594, "grad_norm": 1.3866667939244877, "learning_rate": 7.3325187435838145e-06, "loss": 0.3435, "step": 2442 }, { "epoch": 0.36533572603559145, "grad_norm": 1.9710057444965332, "learning_rate": 7.330376318427731e-06, "loss": 0.3715, "step": 2443 }, { "epoch": 0.3654852699267235, "grad_norm": 1.9432365570364065, "learning_rate": 7.328233346542906e-06, "loss": 0.5569, "step": 2444 }, { "epoch": 0.36563481381785556, "grad_norm": 1.0580374357411853, "learning_rate": 7.326089828432097e-06, "loss": 0.2205, "step": 2445 }, { "epoch": 0.3657843577089876, "grad_norm": 1.9944374991832907, "learning_rate": 7.323945764598198e-06, "loss": 0.5225, "step": 2446 }, { "epoch": 0.3659339016001196, "grad_norm": 1.3286346243167817, "learning_rate": 7.321801155544227e-06, "loss": 0.3126, "step": 2447 }, { "epoch": 0.36608344549125166, "grad_norm": 1.797949701766733, "learning_rate": 7.319656001773326e-06, "loss": 0.5528, "step": 2448 }, { "epoch": 0.36623298938238374, "grad_norm": 1.3871718042688361, "learning_rate": 7.317510303788775e-06, "loss": 0.1986, "step": 2449 }, { "epoch": 0.3663825332735158, "grad_norm": 1.696323576703982, "learning_rate": 7.31536406209397e-06, "loss": 0.4646, "step": 2450 }, { "epoch": 0.3665320771646478, "grad_norm": 1.112116781879356, "learning_rate": 7.313217277192441e-06, "loss": 0.1698, "step": 2451 }, { "epoch": 0.3666816210557799, "grad_norm": 1.6253111202173256, "learning_rate": 7.311069949587849e-06, "loss": 0.368, "step": 2452 }, { "epoch": 0.3668311649469119, "grad_norm": 1.8012549383451812, "learning_rate": 7.308922079783972e-06, "loss": 0.4943, "step": 2453 }, { "epoch": 0.36698070883804396, "grad_norm": 1.73000381975083, "learning_rate": 7.306773668284723e-06, "loss": 0.543, "step": 2454 }, { "epoch": 0.367130252729176, "grad_norm": 1.7982349971106675, "learning_rate": 7.30462471559414e-06, "loss": 0.3192, "step": 2455 }, { "epoch": 0.3672797966203081, "grad_norm": 1.3452770378011227, "learning_rate": 7.302475222216388e-06, "loss": 0.3435, "step": 2456 }, { "epoch": 0.3674293405114401, "grad_norm": 1.673472364447151, "learning_rate": 7.300325188655762e-06, "loss": 0.2812, "step": 2457 }, { "epoch": 0.36757888440257214, "grad_norm": 1.5298434482917413, "learning_rate": 7.298174615416676e-06, "loss": 0.2351, "step": 2458 }, { "epoch": 0.3677284282937042, "grad_norm": 0.9700694807997261, "learning_rate": 7.2960235030036765e-06, "loss": 0.1807, "step": 2459 }, { "epoch": 0.36787797218483625, "grad_norm": 1.3628120017832577, "learning_rate": 7.293871851921435e-06, "loss": 0.1775, "step": 2460 }, { "epoch": 0.3680275160759683, "grad_norm": 1.331669281066242, "learning_rate": 7.29171966267475e-06, "loss": 0.1953, "step": 2461 }, { "epoch": 0.3681770599671003, "grad_norm": 1.7478823536432608, "learning_rate": 7.2895669357685465e-06, "loss": 0.402, "step": 2462 }, { "epoch": 0.3683266038582324, "grad_norm": 1.0780884237235537, "learning_rate": 7.287413671707875e-06, "loss": 0.2042, "step": 2463 }, { "epoch": 0.36847614774936444, "grad_norm": 1.39413350641363, "learning_rate": 7.285259870997911e-06, "loss": 0.2161, "step": 2464 }, { "epoch": 0.36862569164049647, "grad_norm": 1.781457240613985, "learning_rate": 7.283105534143957e-06, "loss": 0.4025, "step": 2465 }, { "epoch": 0.36877523553162855, "grad_norm": 1.298349295097918, "learning_rate": 7.280950661651443e-06, "loss": 0.2216, "step": 2466 }, { "epoch": 0.3689247794227606, "grad_norm": 1.6025246428329694, "learning_rate": 7.278795254025921e-06, "loss": 0.2132, "step": 2467 }, { "epoch": 0.3690743233138926, "grad_norm": 1.9856391732187633, "learning_rate": 7.276639311773068e-06, "loss": 0.5661, "step": 2468 }, { "epoch": 0.36922386720502465, "grad_norm": 1.901744160559575, "learning_rate": 7.274482835398695e-06, "loss": 0.5376, "step": 2469 }, { "epoch": 0.36937341109615673, "grad_norm": 1.3567504841626332, "learning_rate": 7.272325825408728e-06, "loss": 0.2408, "step": 2470 }, { "epoch": 0.36952295498728877, "grad_norm": 2.2451530510094253, "learning_rate": 7.270168282309222e-06, "loss": 0.2229, "step": 2471 }, { "epoch": 0.3696724988784208, "grad_norm": 1.3887976362795125, "learning_rate": 7.268010206606361e-06, "loss": 0.3222, "step": 2472 }, { "epoch": 0.3698220427695529, "grad_norm": 1.2797777276840512, "learning_rate": 7.265851598806446e-06, "loss": 0.2033, "step": 2473 }, { "epoch": 0.3699715866606849, "grad_norm": 1.191177092854937, "learning_rate": 7.263692459415909e-06, "loss": 0.2131, "step": 2474 }, { "epoch": 0.37012113055181695, "grad_norm": 1.5776822158264978, "learning_rate": 7.261532788941306e-06, "loss": 0.3744, "step": 2475 }, { "epoch": 0.37027067444294903, "grad_norm": 1.0854274920898752, "learning_rate": 7.259372587889314e-06, "loss": 0.233, "step": 2476 }, { "epoch": 0.37042021833408106, "grad_norm": 1.2018243893675977, "learning_rate": 7.25721185676674e-06, "loss": 0.2363, "step": 2477 }, { "epoch": 0.3705697622252131, "grad_norm": 0.9967469112175203, "learning_rate": 7.25505059608051e-06, "loss": 0.1983, "step": 2478 }, { "epoch": 0.3707193061163451, "grad_norm": 1.2775894695719086, "learning_rate": 7.252888806337678e-06, "loss": 0.208, "step": 2479 }, { "epoch": 0.3708688500074772, "grad_norm": 1.5952345161170771, "learning_rate": 7.25072648804542e-06, "loss": 0.2703, "step": 2480 }, { "epoch": 0.37101839389860924, "grad_norm": 1.1681377853119423, "learning_rate": 7.248563641711036e-06, "loss": 0.2119, "step": 2481 }, { "epoch": 0.3711679377897413, "grad_norm": 1.9271882432441645, "learning_rate": 7.2464002678419524e-06, "loss": 0.3501, "step": 2482 }, { "epoch": 0.37131748168087336, "grad_norm": 1.4801233084882979, "learning_rate": 7.244236366945715e-06, "loss": 0.2263, "step": 2483 }, { "epoch": 0.3714670255720054, "grad_norm": 1.464705478500087, "learning_rate": 7.242071939529999e-06, "loss": 0.2709, "step": 2484 }, { "epoch": 0.3716165694631374, "grad_norm": 1.5487734242577513, "learning_rate": 7.239906986102598e-06, "loss": 0.1828, "step": 2485 }, { "epoch": 0.37176611335426946, "grad_norm": 1.1511925750155394, "learning_rate": 7.237741507171432e-06, "loss": 0.1896, "step": 2486 }, { "epoch": 0.37191565724540154, "grad_norm": 1.6178218431333782, "learning_rate": 7.235575503244542e-06, "loss": 0.3265, "step": 2487 }, { "epoch": 0.3720652011365336, "grad_norm": 1.005983926413353, "learning_rate": 7.233408974830093e-06, "loss": 0.173, "step": 2488 }, { "epoch": 0.3722147450276656, "grad_norm": 1.0227776759122873, "learning_rate": 7.231241922436374e-06, "loss": 0.1948, "step": 2489 }, { "epoch": 0.3723642889187977, "grad_norm": 1.6953767597307763, "learning_rate": 7.229074346571798e-06, "loss": 0.5005, "step": 2490 }, { "epoch": 0.3725138328099297, "grad_norm": 1.5875223115043986, "learning_rate": 7.226906247744897e-06, "loss": 0.3688, "step": 2491 }, { "epoch": 0.37266337670106175, "grad_norm": 2.27185613487943, "learning_rate": 7.2247376264643294e-06, "loss": 0.4033, "step": 2492 }, { "epoch": 0.3728129205921938, "grad_norm": 1.6290529057007825, "learning_rate": 7.2225684832388745e-06, "loss": 0.3383, "step": 2493 }, { "epoch": 0.3729624644833259, "grad_norm": 1.0562741707118548, "learning_rate": 7.220398818577432e-06, "loss": 0.182, "step": 2494 }, { "epoch": 0.3731120083744579, "grad_norm": 1.3801253740702135, "learning_rate": 7.21822863298903e-06, "loss": 0.4305, "step": 2495 }, { "epoch": 0.37326155226558994, "grad_norm": 1.3558340102680362, "learning_rate": 7.216057926982811e-06, "loss": 0.2011, "step": 2496 }, { "epoch": 0.373411096156722, "grad_norm": 1.6070744311623764, "learning_rate": 7.213886701068047e-06, "loss": 0.3777, "step": 2497 }, { "epoch": 0.37356064004785405, "grad_norm": 1.0945948539403993, "learning_rate": 7.211714955754125e-06, "loss": 0.154, "step": 2498 }, { "epoch": 0.3737101839389861, "grad_norm": 1.1736153236745563, "learning_rate": 7.2095426915505605e-06, "loss": 0.1973, "step": 2499 }, { "epoch": 0.3738597278301181, "grad_norm": 1.5381442491891666, "learning_rate": 7.207369908966987e-06, "loss": 0.2319, "step": 2500 }, { "epoch": 0.3740092717212502, "grad_norm": 1.143218245070331, "learning_rate": 7.2051966085131584e-06, "loss": 0.1947, "step": 2501 }, { "epoch": 0.37415881561238223, "grad_norm": 1.756151915421153, "learning_rate": 7.203022790698954e-06, "loss": 0.4682, "step": 2502 }, { "epoch": 0.37430835950351427, "grad_norm": 1.956298123550812, "learning_rate": 7.20084845603437e-06, "loss": 0.5118, "step": 2503 }, { "epoch": 0.37445790339464635, "grad_norm": 1.9158174915897688, "learning_rate": 7.198673605029529e-06, "loss": 0.3117, "step": 2504 }, { "epoch": 0.3746074472857784, "grad_norm": 4.295053824375029, "learning_rate": 7.196498238194672e-06, "loss": 0.2234, "step": 2505 }, { "epoch": 0.3747569911769104, "grad_norm": 1.6941035724235585, "learning_rate": 7.194322356040159e-06, "loss": 0.2312, "step": 2506 }, { "epoch": 0.37490653506804245, "grad_norm": 1.5640016182855432, "learning_rate": 7.192145959076474e-06, "loss": 0.3622, "step": 2507 }, { "epoch": 0.37505607895917453, "grad_norm": 1.8466707952598986, "learning_rate": 7.1899690478142196e-06, "loss": 0.4891, "step": 2508 }, { "epoch": 0.37520562285030656, "grad_norm": 1.4306072393075668, "learning_rate": 7.18779162276412e-06, "loss": 0.2988, "step": 2509 }, { "epoch": 0.3753551667414386, "grad_norm": 1.4306403617067467, "learning_rate": 7.185613684437024e-06, "loss": 0.3184, "step": 2510 }, { "epoch": 0.3755047106325707, "grad_norm": 1.6012766699315721, "learning_rate": 7.183435233343892e-06, "loss": 0.3621, "step": 2511 }, { "epoch": 0.3756542545237027, "grad_norm": 1.2931435832729816, "learning_rate": 7.181256269995813e-06, "loss": 0.3318, "step": 2512 }, { "epoch": 0.37580379841483474, "grad_norm": 1.422093979772559, "learning_rate": 7.179076794903991e-06, "loss": 0.2357, "step": 2513 }, { "epoch": 0.3759533423059668, "grad_norm": 0.9257904335966806, "learning_rate": 7.176896808579752e-06, "loss": 0.1813, "step": 2514 }, { "epoch": 0.37610288619709886, "grad_norm": 2.037815545812073, "learning_rate": 7.174716311534542e-06, "loss": 0.2783, "step": 2515 }, { "epoch": 0.3762524300882309, "grad_norm": 1.71379595660131, "learning_rate": 7.172535304279926e-06, "loss": 0.4839, "step": 2516 }, { "epoch": 0.3764019739793629, "grad_norm": 2.00916868636801, "learning_rate": 7.170353787327593e-06, "loss": 0.6153, "step": 2517 }, { "epoch": 0.376551517870495, "grad_norm": 1.5789476931879063, "learning_rate": 7.168171761189343e-06, "loss": 0.2031, "step": 2518 }, { "epoch": 0.37670106176162704, "grad_norm": 1.684853217653668, "learning_rate": 7.165989226377103e-06, "loss": 0.3816, "step": 2519 }, { "epoch": 0.3768506056527591, "grad_norm": 1.9200462619229837, "learning_rate": 7.163806183402916e-06, "loss": 0.4962, "step": 2520 }, { "epoch": 0.3770001495438911, "grad_norm": 1.8855548807104212, "learning_rate": 7.161622632778944e-06, "loss": 0.2185, "step": 2521 }, { "epoch": 0.3771496934350232, "grad_norm": 1.6740717226550226, "learning_rate": 7.159438575017471e-06, "loss": 0.3777, "step": 2522 }, { "epoch": 0.3772992373261552, "grad_norm": 1.397870156480417, "learning_rate": 7.157254010630896e-06, "loss": 0.2426, "step": 2523 }, { "epoch": 0.37744878121728725, "grad_norm": 1.6145415521158457, "learning_rate": 7.155068940131741e-06, "loss": 0.3926, "step": 2524 }, { "epoch": 0.37759832510841934, "grad_norm": 1.7042594169803778, "learning_rate": 7.152883364032644e-06, "loss": 0.1917, "step": 2525 }, { "epoch": 0.3777478689995514, "grad_norm": 1.5289841742678918, "learning_rate": 7.15069728284636e-06, "loss": 0.351, "step": 2526 }, { "epoch": 0.3778974128906834, "grad_norm": 2.395918097586016, "learning_rate": 7.148510697085767e-06, "loss": 0.3126, "step": 2527 }, { "epoch": 0.37804695678181544, "grad_norm": 2.703467639529761, "learning_rate": 7.146323607263859e-06, "loss": 0.2623, "step": 2528 }, { "epoch": 0.3781965006729475, "grad_norm": 1.7671642324701808, "learning_rate": 7.144136013893745e-06, "loss": 0.2137, "step": 2529 }, { "epoch": 0.37834604456407955, "grad_norm": 1.932948758218732, "learning_rate": 7.141947917488663e-06, "loss": 0.3711, "step": 2530 }, { "epoch": 0.3784955884552116, "grad_norm": 1.2026669943340598, "learning_rate": 7.139759318561954e-06, "loss": 0.171, "step": 2531 }, { "epoch": 0.37864513234634367, "grad_norm": 1.7885589140732676, "learning_rate": 7.137570217627088e-06, "loss": 0.4937, "step": 2532 }, { "epoch": 0.3787946762374757, "grad_norm": 1.6235037343132346, "learning_rate": 7.135380615197649e-06, "loss": 0.4338, "step": 2533 }, { "epoch": 0.37894422012860773, "grad_norm": 1.4352072567044432, "learning_rate": 7.133190511787337e-06, "loss": 0.203, "step": 2534 }, { "epoch": 0.3790937640197398, "grad_norm": 2.021523435648341, "learning_rate": 7.130999907909972e-06, "loss": 0.4868, "step": 2535 }, { "epoch": 0.37924330791087185, "grad_norm": 1.575689088228456, "learning_rate": 7.128808804079492e-06, "loss": 0.3587, "step": 2536 }, { "epoch": 0.3793928518020039, "grad_norm": 1.538572538393702, "learning_rate": 7.126617200809951e-06, "loss": 0.3164, "step": 2537 }, { "epoch": 0.3795423956931359, "grad_norm": 2.1498396130276727, "learning_rate": 7.12442509861552e-06, "loss": 0.6093, "step": 2538 }, { "epoch": 0.379691939584268, "grad_norm": 1.5246107523171812, "learning_rate": 7.122232498010486e-06, "loss": 0.2357, "step": 2539 }, { "epoch": 0.37984148347540003, "grad_norm": 1.5356177541463252, "learning_rate": 7.120039399509257e-06, "loss": 0.2945, "step": 2540 }, { "epoch": 0.37999102736653206, "grad_norm": 1.2696297372616192, "learning_rate": 7.117845803626352e-06, "loss": 0.2568, "step": 2541 }, { "epoch": 0.38014057125766415, "grad_norm": 1.2940629840823208, "learning_rate": 7.115651710876411e-06, "loss": 0.2167, "step": 2542 }, { "epoch": 0.3802901151487962, "grad_norm": 1.6760687859259409, "learning_rate": 7.11345712177419e-06, "loss": 0.4574, "step": 2543 }, { "epoch": 0.3804396590399282, "grad_norm": 1.8274306217532401, "learning_rate": 7.111262036834559e-06, "loss": 0.1839, "step": 2544 }, { "epoch": 0.38058920293106024, "grad_norm": 1.7239605405645464, "learning_rate": 7.109066456572508e-06, "loss": 0.3814, "step": 2545 }, { "epoch": 0.38073874682219233, "grad_norm": 2.0774382697068146, "learning_rate": 7.106870381503139e-06, "loss": 0.3502, "step": 2546 }, { "epoch": 0.38088829071332436, "grad_norm": 1.6407126721162997, "learning_rate": 7.104673812141676e-06, "loss": 0.3768, "step": 2547 }, { "epoch": 0.3810378346044564, "grad_norm": 1.4264932056782313, "learning_rate": 7.102476749003453e-06, "loss": 0.2587, "step": 2548 }, { "epoch": 0.3811873784955885, "grad_norm": 1.4315314856298393, "learning_rate": 7.1002791926039204e-06, "loss": 0.1918, "step": 2549 }, { "epoch": 0.3813369223867205, "grad_norm": 2.268263838831847, "learning_rate": 7.098081143458649e-06, "loss": 0.2129, "step": 2550 }, { "epoch": 0.38148646627785254, "grad_norm": 1.4599724810600025, "learning_rate": 7.095882602083321e-06, "loss": 0.3225, "step": 2551 }, { "epoch": 0.3816360101689846, "grad_norm": 1.4198436584338292, "learning_rate": 7.0936835689937366e-06, "loss": 0.1947, "step": 2552 }, { "epoch": 0.38178555406011666, "grad_norm": 1.3581488206510595, "learning_rate": 7.09148404470581e-06, "loss": 0.2492, "step": 2553 }, { "epoch": 0.3819350979512487, "grad_norm": 1.8329873345541536, "learning_rate": 7.089284029735568e-06, "loss": 0.1999, "step": 2554 }, { "epoch": 0.3820846418423807, "grad_norm": 1.879696219955982, "learning_rate": 7.087083524599158e-06, "loss": 0.2107, "step": 2555 }, { "epoch": 0.3822341857335128, "grad_norm": 1.2252237248239084, "learning_rate": 7.08488252981284e-06, "loss": 0.1991, "step": 2556 }, { "epoch": 0.38238372962464484, "grad_norm": 1.952002278934872, "learning_rate": 7.082681045892988e-06, "loss": 0.3916, "step": 2557 }, { "epoch": 0.3825332735157769, "grad_norm": 1.7256855312550468, "learning_rate": 7.08047907335609e-06, "loss": 0.3116, "step": 2558 }, { "epoch": 0.3826828174069089, "grad_norm": 1.587038951947857, "learning_rate": 7.078276612718752e-06, "loss": 0.2224, "step": 2559 }, { "epoch": 0.382832361298041, "grad_norm": 1.8262578444211246, "learning_rate": 7.076073664497691e-06, "loss": 0.3618, "step": 2560 }, { "epoch": 0.382981905189173, "grad_norm": 2.0217394092028167, "learning_rate": 7.07387022920974e-06, "loss": 0.2429, "step": 2561 }, { "epoch": 0.38313144908030505, "grad_norm": 1.9141258813134443, "learning_rate": 7.071666307371847e-06, "loss": 0.4741, "step": 2562 }, { "epoch": 0.38328099297143714, "grad_norm": 1.432198404107898, "learning_rate": 7.069461899501073e-06, "loss": 0.1918, "step": 2563 }, { "epoch": 0.38343053686256917, "grad_norm": 1.5928510931135798, "learning_rate": 7.067257006114593e-06, "loss": 0.3226, "step": 2564 }, { "epoch": 0.3835800807537012, "grad_norm": 1.6585372345576896, "learning_rate": 7.065051627729698e-06, "loss": 0.3276, "step": 2565 }, { "epoch": 0.38372962464483323, "grad_norm": 1.2734067275418923, "learning_rate": 7.062845764863787e-06, "loss": 0.3313, "step": 2566 }, { "epoch": 0.3838791685359653, "grad_norm": 1.1374789118122002, "learning_rate": 7.0606394180343805e-06, "loss": 0.1907, "step": 2567 }, { "epoch": 0.38402871242709735, "grad_norm": 1.549559123650284, "learning_rate": 7.058432587759107e-06, "loss": 0.3915, "step": 2568 }, { "epoch": 0.3841782563182294, "grad_norm": 1.4708906613930632, "learning_rate": 7.0562252745557115e-06, "loss": 0.1992, "step": 2569 }, { "epoch": 0.38432780020936147, "grad_norm": 1.6323056255608734, "learning_rate": 7.054017478942048e-06, "loss": 0.2221, "step": 2570 }, { "epoch": 0.3844773441004935, "grad_norm": 1.5203359282551667, "learning_rate": 7.0518092014360905e-06, "loss": 0.3545, "step": 2571 }, { "epoch": 0.38462688799162553, "grad_norm": 1.159124982100583, "learning_rate": 7.0496004425559195e-06, "loss": 0.1549, "step": 2572 }, { "epoch": 0.38477643188275756, "grad_norm": 2.1232683966349586, "learning_rate": 7.047391202819734e-06, "loss": 0.6264, "step": 2573 }, { "epoch": 0.38492597577388965, "grad_norm": 1.0622764450389663, "learning_rate": 7.045181482745837e-06, "loss": 0.2163, "step": 2574 }, { "epoch": 0.3850755196650217, "grad_norm": 1.077605292843362, "learning_rate": 7.042971282852656e-06, "loss": 0.2079, "step": 2575 }, { "epoch": 0.3852250635561537, "grad_norm": 1.0767369768412656, "learning_rate": 7.040760603658723e-06, "loss": 0.2035, "step": 2576 }, { "epoch": 0.3853746074472858, "grad_norm": 1.5110599625002836, "learning_rate": 7.038549445682685e-06, "loss": 0.1561, "step": 2577 }, { "epoch": 0.38552415133841783, "grad_norm": 1.6236972538616068, "learning_rate": 7.036337809443301e-06, "loss": 0.4143, "step": 2578 }, { "epoch": 0.38567369522954986, "grad_norm": 1.6146980603793564, "learning_rate": 7.0341256954594415e-06, "loss": 0.332, "step": 2579 }, { "epoch": 0.3858232391206819, "grad_norm": 2.047834790133371, "learning_rate": 7.031913104250091e-06, "loss": 0.4432, "step": 2580 }, { "epoch": 0.385972783011814, "grad_norm": 1.5013503959890904, "learning_rate": 7.029700036334344e-06, "loss": 0.344, "step": 2581 }, { "epoch": 0.386122326902946, "grad_norm": 1.0672948108215132, "learning_rate": 7.027486492231407e-06, "loss": 0.2127, "step": 2582 }, { "epoch": 0.38627187079407804, "grad_norm": 1.6877080780242262, "learning_rate": 7.025272472460598e-06, "loss": 0.4702, "step": 2583 }, { "epoch": 0.38642141468521013, "grad_norm": 1.5687512279976832, "learning_rate": 7.02305797754135e-06, "loss": 0.345, "step": 2584 }, { "epoch": 0.38657095857634216, "grad_norm": 1.268209431704893, "learning_rate": 7.020843007993203e-06, "loss": 0.1817, "step": 2585 }, { "epoch": 0.3867205024674742, "grad_norm": 2.1026474658538508, "learning_rate": 7.018627564335813e-06, "loss": 0.3595, "step": 2586 }, { "epoch": 0.3868700463586062, "grad_norm": 1.283399204038543, "learning_rate": 7.01641164708894e-06, "loss": 0.3985, "step": 2587 }, { "epoch": 0.3870195902497383, "grad_norm": 1.5628231644223662, "learning_rate": 7.014195256772462e-06, "loss": 0.243, "step": 2588 }, { "epoch": 0.38716913414087034, "grad_norm": 1.4666711014261535, "learning_rate": 7.011978393906366e-06, "loss": 0.3757, "step": 2589 }, { "epoch": 0.3873186780320024, "grad_norm": 1.7424029593387464, "learning_rate": 7.009761059010746e-06, "loss": 0.3297, "step": 2590 }, { "epoch": 0.38746822192313446, "grad_norm": 1.793037388304316, "learning_rate": 7.007543252605815e-06, "loss": 0.1902, "step": 2591 }, { "epoch": 0.3876177658142665, "grad_norm": 1.3063151548645278, "learning_rate": 7.005324975211889e-06, "loss": 0.2154, "step": 2592 }, { "epoch": 0.3877673097053985, "grad_norm": 1.5791521412257716, "learning_rate": 7.003106227349399e-06, "loss": 0.353, "step": 2593 }, { "epoch": 0.3879168535965306, "grad_norm": 1.1004684710498887, "learning_rate": 7.0008870095388815e-06, "loss": 0.1875, "step": 2594 }, { "epoch": 0.38806639748766264, "grad_norm": 1.5070698124572854, "learning_rate": 6.998667322300989e-06, "loss": 0.3211, "step": 2595 }, { "epoch": 0.38821594137879467, "grad_norm": 1.5281886376155964, "learning_rate": 6.9964471661564815e-06, "loss": 0.2161, "step": 2596 }, { "epoch": 0.3883654852699267, "grad_norm": 1.5252156175553138, "learning_rate": 6.994226541626227e-06, "loss": 0.4639, "step": 2597 }, { "epoch": 0.3885150291610588, "grad_norm": 1.0809213992053146, "learning_rate": 6.9920054492312086e-06, "loss": 0.1773, "step": 2598 }, { "epoch": 0.3886645730521908, "grad_norm": 1.0167620236333288, "learning_rate": 6.989783889492512e-06, "loss": 0.197, "step": 2599 }, { "epoch": 0.38881411694332285, "grad_norm": 1.9918245714247895, "learning_rate": 6.98756186293134e-06, "loss": 0.2173, "step": 2600 }, { "epoch": 0.38896366083445494, "grad_norm": 1.134571833649737, "learning_rate": 6.9853393700689995e-06, "loss": 0.2177, "step": 2601 }, { "epoch": 0.38911320472558697, "grad_norm": 1.2111914722156627, "learning_rate": 6.98311641142691e-06, "loss": 0.189, "step": 2602 }, { "epoch": 0.389262748616719, "grad_norm": 1.5006726664035277, "learning_rate": 6.9808929875265974e-06, "loss": 0.3254, "step": 2603 }, { "epoch": 0.38941229250785103, "grad_norm": 1.6055883887420528, "learning_rate": 6.9786690988897e-06, "loss": 0.3344, "step": 2604 }, { "epoch": 0.3895618363989831, "grad_norm": 0.9973538599901819, "learning_rate": 6.9764447460379625e-06, "loss": 0.1393, "step": 2605 }, { "epoch": 0.38971138029011515, "grad_norm": 1.6233042755021416, "learning_rate": 6.9742199294932415e-06, "loss": 0.204, "step": 2606 }, { "epoch": 0.3898609241812472, "grad_norm": 1.3344019910552745, "learning_rate": 6.971994649777497e-06, "loss": 0.1945, "step": 2607 }, { "epoch": 0.39001046807237927, "grad_norm": 1.3137908772185043, "learning_rate": 6.969768907412804e-06, "loss": 0.1984, "step": 2608 }, { "epoch": 0.3901600119635113, "grad_norm": 1.777140683321665, "learning_rate": 6.9675427029213405e-06, "loss": 0.3008, "step": 2609 }, { "epoch": 0.39030955585464333, "grad_norm": 1.0681843692748616, "learning_rate": 6.965316036825398e-06, "loss": 0.2006, "step": 2610 }, { "epoch": 0.39045909974577536, "grad_norm": 1.8543912871172343, "learning_rate": 6.963088909647372e-06, "loss": 0.5414, "step": 2611 }, { "epoch": 0.39060864363690745, "grad_norm": 1.9579644690684397, "learning_rate": 6.960861321909769e-06, "loss": 0.2134, "step": 2612 }, { "epoch": 0.3907581875280395, "grad_norm": 1.5162323884971123, "learning_rate": 6.9586332741352025e-06, "loss": 0.4165, "step": 2613 }, { "epoch": 0.3909077314191715, "grad_norm": 1.6022574800937832, "learning_rate": 6.956404766846394e-06, "loss": 0.2272, "step": 2614 }, { "epoch": 0.3910572753103036, "grad_norm": 1.6770508498447996, "learning_rate": 6.954175800566172e-06, "loss": 0.4977, "step": 2615 }, { "epoch": 0.39120681920143563, "grad_norm": 1.8009556625869532, "learning_rate": 6.9519463758174745e-06, "loss": 0.5158, "step": 2616 }, { "epoch": 0.39135636309256766, "grad_norm": 1.3758659710920118, "learning_rate": 6.949716493123345e-06, "loss": 0.1789, "step": 2617 }, { "epoch": 0.3915059069836997, "grad_norm": 1.3547231618901736, "learning_rate": 6.947486153006937e-06, "loss": 0.3534, "step": 2618 }, { "epoch": 0.3916554508748318, "grad_norm": 1.5074942912698235, "learning_rate": 6.945255355991509e-06, "loss": 0.2497, "step": 2619 }, { "epoch": 0.3918049947659638, "grad_norm": 1.2442646683398755, "learning_rate": 6.943024102600428e-06, "loss": 0.1861, "step": 2620 }, { "epoch": 0.39195453865709584, "grad_norm": 1.5075321060331324, "learning_rate": 6.940792393357165e-06, "loss": 0.2194, "step": 2621 }, { "epoch": 0.39210408254822793, "grad_norm": 1.683076961752896, "learning_rate": 6.938560228785304e-06, "loss": 0.2156, "step": 2622 }, { "epoch": 0.39225362643935996, "grad_norm": 1.7112132072394493, "learning_rate": 6.9363276094085296e-06, "loss": 0.3958, "step": 2623 }, { "epoch": 0.392403170330492, "grad_norm": 0.7048488756753248, "learning_rate": 6.934094535750638e-06, "loss": 0.2183, "step": 2624 }, { "epoch": 0.392552714221624, "grad_norm": 1.7586652848799478, "learning_rate": 6.931861008335527e-06, "loss": 0.469, "step": 2625 }, { "epoch": 0.3927022581127561, "grad_norm": 1.1588740664998758, "learning_rate": 6.929627027687207e-06, "loss": 0.2168, "step": 2626 }, { "epoch": 0.39285180200388814, "grad_norm": 1.548868104243127, "learning_rate": 6.927392594329789e-06, "loss": 0.2312, "step": 2627 }, { "epoch": 0.39300134589502017, "grad_norm": 1.595689192189307, "learning_rate": 6.925157708787493e-06, "loss": 0.4765, "step": 2628 }, { "epoch": 0.39315088978615226, "grad_norm": 1.0277723372062315, "learning_rate": 6.922922371584647e-06, "loss": 0.1517, "step": 2629 }, { "epoch": 0.3933004336772843, "grad_norm": 1.7591622574617085, "learning_rate": 6.920686583245679e-06, "loss": 0.3411, "step": 2630 }, { "epoch": 0.3934499775684163, "grad_norm": 1.6744764890594985, "learning_rate": 6.918450344295129e-06, "loss": 0.2044, "step": 2631 }, { "epoch": 0.39359952145954835, "grad_norm": 1.4007607932440016, "learning_rate": 6.916213655257639e-06, "loss": 0.3232, "step": 2632 }, { "epoch": 0.39374906535068044, "grad_norm": 1.4204862226075206, "learning_rate": 6.9139765166579576e-06, "loss": 0.2314, "step": 2633 }, { "epoch": 0.39389860924181247, "grad_norm": 1.3405577478235213, "learning_rate": 6.9117389290209415e-06, "loss": 0.211, "step": 2634 }, { "epoch": 0.3940481531329445, "grad_norm": 1.454026439973042, "learning_rate": 6.909500892871547e-06, "loss": 0.33, "step": 2635 }, { "epoch": 0.3941976970240766, "grad_norm": 1.3439856339271192, "learning_rate": 6.907262408734842e-06, "loss": 0.1815, "step": 2636 }, { "epoch": 0.3943472409152086, "grad_norm": 1.6261472549425744, "learning_rate": 6.905023477135992e-06, "loss": 0.3739, "step": 2637 }, { "epoch": 0.39449678480634065, "grad_norm": 1.576356761161988, "learning_rate": 6.902784098600277e-06, "loss": 0.3767, "step": 2638 }, { "epoch": 0.3946463286974727, "grad_norm": 1.339777085672714, "learning_rate": 6.9005442736530745e-06, "loss": 0.1876, "step": 2639 }, { "epoch": 0.39479587258860477, "grad_norm": 1.4248530907151669, "learning_rate": 6.898304002819869e-06, "loss": 0.2061, "step": 2640 }, { "epoch": 0.3949454164797368, "grad_norm": 1.8979002423907008, "learning_rate": 6.896063286626251e-06, "loss": 0.3686, "step": 2641 }, { "epoch": 0.39509496037086883, "grad_norm": 1.4615671922332303, "learning_rate": 6.893822125597911e-06, "loss": 0.2747, "step": 2642 }, { "epoch": 0.3952445042620009, "grad_norm": 1.1050718617792725, "learning_rate": 6.891580520260649e-06, "loss": 0.1886, "step": 2643 }, { "epoch": 0.39539404815313295, "grad_norm": 1.1930459776952358, "learning_rate": 6.8893384711403675e-06, "loss": 0.1788, "step": 2644 }, { "epoch": 0.395543592044265, "grad_norm": 1.0036636367374232, "learning_rate": 6.887095978763072e-06, "loss": 0.1815, "step": 2645 }, { "epoch": 0.39569313593539707, "grad_norm": 1.751476920569849, "learning_rate": 6.884853043654876e-06, "loss": 0.3909, "step": 2646 }, { "epoch": 0.3958426798265291, "grad_norm": 1.5414130565030935, "learning_rate": 6.882609666341988e-06, "loss": 0.383, "step": 2647 }, { "epoch": 0.39599222371766113, "grad_norm": 1.7873679635826936, "learning_rate": 6.88036584735073e-06, "loss": 0.5987, "step": 2648 }, { "epoch": 0.39614176760879316, "grad_norm": 1.4503283763937924, "learning_rate": 6.878121587207522e-06, "loss": 0.1782, "step": 2649 }, { "epoch": 0.39629131149992525, "grad_norm": 1.1935592615073942, "learning_rate": 6.875876886438889e-06, "loss": 0.3002, "step": 2650 }, { "epoch": 0.3964408553910573, "grad_norm": 1.471497098484429, "learning_rate": 6.873631745571461e-06, "loss": 0.2105, "step": 2651 }, { "epoch": 0.3965903992821893, "grad_norm": 1.7129740678391, "learning_rate": 6.871386165131968e-06, "loss": 0.2233, "step": 2652 }, { "epoch": 0.3967399431733214, "grad_norm": 1.5756431198477898, "learning_rate": 6.869140145647245e-06, "loss": 0.4151, "step": 2653 }, { "epoch": 0.39688948706445343, "grad_norm": 1.238820133220739, "learning_rate": 6.866893687644232e-06, "loss": 0.2, "step": 2654 }, { "epoch": 0.39703903095558546, "grad_norm": 1.9264407972582287, "learning_rate": 6.864646791649966e-06, "loss": 0.4677, "step": 2655 }, { "epoch": 0.3971885748467175, "grad_norm": 1.550112419127813, "learning_rate": 6.862399458191593e-06, "loss": 0.3658, "step": 2656 }, { "epoch": 0.3973381187378496, "grad_norm": 1.5119124813290858, "learning_rate": 6.860151687796359e-06, "loss": 0.2068, "step": 2657 }, { "epoch": 0.3974876626289816, "grad_norm": 1.4272874845074974, "learning_rate": 6.857903480991611e-06, "loss": 0.2713, "step": 2658 }, { "epoch": 0.39763720652011364, "grad_norm": 1.6594720979073507, "learning_rate": 6.855654838304802e-06, "loss": 0.3205, "step": 2659 }, { "epoch": 0.3977867504112457, "grad_norm": 2.407352823346937, "learning_rate": 6.853405760263485e-06, "loss": 0.2088, "step": 2660 }, { "epoch": 0.39793629430237776, "grad_norm": 1.7959792599685889, "learning_rate": 6.851156247395313e-06, "loss": 0.393, "step": 2661 }, { "epoch": 0.3980858381935098, "grad_norm": 1.4851282768283998, "learning_rate": 6.848906300228047e-06, "loss": 0.2354, "step": 2662 }, { "epoch": 0.3982353820846418, "grad_norm": 1.4691880516424127, "learning_rate": 6.846655919289543e-06, "loss": 0.231, "step": 2663 }, { "epoch": 0.3983849259757739, "grad_norm": 1.5828677803244833, "learning_rate": 6.844405105107763e-06, "loss": 0.1887, "step": 2664 }, { "epoch": 0.39853446986690594, "grad_norm": 1.5541799378105448, "learning_rate": 6.842153858210772e-06, "loss": 0.3784, "step": 2665 }, { "epoch": 0.39868401375803797, "grad_norm": 1.9024949023533122, "learning_rate": 6.83990217912673e-06, "loss": 0.4826, "step": 2666 }, { "epoch": 0.39883355764917006, "grad_norm": 1.7789769644036557, "learning_rate": 6.837650068383908e-06, "loss": 0.4536, "step": 2667 }, { "epoch": 0.3989831015403021, "grad_norm": 1.6091866012593252, "learning_rate": 6.835397526510667e-06, "loss": 0.4931, "step": 2668 }, { "epoch": 0.3991326454314341, "grad_norm": 1.4778294602532251, "learning_rate": 6.83314455403548e-06, "loss": 0.3916, "step": 2669 }, { "epoch": 0.39928218932256615, "grad_norm": 1.4396363740839195, "learning_rate": 6.8308911514869125e-06, "loss": 0.2875, "step": 2670 }, { "epoch": 0.39943173321369824, "grad_norm": 1.5237349879970323, "learning_rate": 6.828637319393636e-06, "loss": 0.2372, "step": 2671 }, { "epoch": 0.39958127710483027, "grad_norm": 1.4821885218044735, "learning_rate": 6.826383058284421e-06, "loss": 0.3351, "step": 2672 }, { "epoch": 0.3997308209959623, "grad_norm": 1.6760124989546532, "learning_rate": 6.824128368688139e-06, "loss": 0.3599, "step": 2673 }, { "epoch": 0.3998803648870944, "grad_norm": 1.0076444138222016, "learning_rate": 6.821873251133764e-06, "loss": 0.1925, "step": 2674 }, { "epoch": 0.4000299087782264, "grad_norm": 1.0371695095164526, "learning_rate": 6.819617706150363e-06, "loss": 0.177, "step": 2675 }, { "epoch": 0.40017945266935845, "grad_norm": 1.2181986301270449, "learning_rate": 6.817361734267114e-06, "loss": 0.2938, "step": 2676 } ], "logging_steps": 1.0, "max_steps": 6687, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 669, "total_flos": 166488976527360.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }