{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1634, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024479804161566705, "grad_norm": 5.1137237548828125, "learning_rate": 3.0487804878048784e-07, "loss": 1.1539, "num_tokens": 579147.0, "step": 2 }, { "epoch": 0.004895960832313341, "grad_norm": 4.984861850738525, "learning_rate": 9.146341463414634e-07, "loss": 1.1347, "num_tokens": 1171666.0, "step": 4 }, { "epoch": 0.0073439412484700125, "grad_norm": 4.731399059295654, "learning_rate": 1.5243902439024391e-06, "loss": 1.1404, "num_tokens": 1782835.0, "step": 6 }, { "epoch": 0.009791921664626682, "grad_norm": 4.15858268737793, "learning_rate": 2.134146341463415e-06, "loss": 1.0869, "num_tokens": 2373836.0, "step": 8 }, { "epoch": 0.012239902080783354, "grad_norm": 2.968505382537842, "learning_rate": 2.7439024390243905e-06, "loss": 0.9824, "num_tokens": 2949918.0, "step": 10 }, { "epoch": 0.014687882496940025, "grad_norm": 1.7302820682525635, "learning_rate": 3.3536585365853664e-06, "loss": 0.8956, "num_tokens": 3550879.0, "step": 12 }, { "epoch": 0.017135862913096694, "grad_norm": 1.3977758884429932, "learning_rate": 3.9634146341463414e-06, "loss": 0.8249, "num_tokens": 4150487.0, "step": 14 }, { "epoch": 0.019583843329253364, "grad_norm": 0.877804160118103, "learning_rate": 4.573170731707317e-06, "loss": 0.7489, "num_tokens": 4732419.0, "step": 16 }, { "epoch": 0.022031823745410038, "grad_norm": 0.9915581345558167, "learning_rate": 5.182926829268292e-06, "loss": 0.736, "num_tokens": 5323835.0, "step": 18 }, { "epoch": 0.02447980416156671, "grad_norm": 0.668745219707489, "learning_rate": 5.792682926829269e-06, "loss": 0.7068, "num_tokens": 5907508.0, "step": 20 }, { "epoch": 0.02692778457772338, "grad_norm": 0.5032077431678772, "learning_rate": 6.402439024390244e-06, "loss": 0.6967, "num_tokens": 6502779.0, "step": 22 }, { "epoch": 0.02937576499388005, "grad_norm": 0.4938846826553345, "learning_rate": 7.0121951219512205e-06, "loss": 0.6396, "num_tokens": 7084297.0, "step": 24 }, { "epoch": 0.03182374541003672, "grad_norm": 0.394046425819397, "learning_rate": 7.621951219512195e-06, "loss": 0.6264, "num_tokens": 7677074.0, "step": 26 }, { "epoch": 0.03427172582619339, "grad_norm": 0.42694729566574097, "learning_rate": 8.231707317073172e-06, "loss": 0.6148, "num_tokens": 8262795.0, "step": 28 }, { "epoch": 0.03671970624235006, "grad_norm": 0.3103824853897095, "learning_rate": 8.841463414634146e-06, "loss": 0.608, "num_tokens": 8856597.0, "step": 30 }, { "epoch": 0.03916768665850673, "grad_norm": 0.3300226628780365, "learning_rate": 9.451219512195123e-06, "loss": 0.5983, "num_tokens": 9443865.0, "step": 32 }, { "epoch": 0.0416156670746634, "grad_norm": 0.28514596819877625, "learning_rate": 1.0060975609756099e-05, "loss": 0.5841, "num_tokens": 10028572.0, "step": 34 }, { "epoch": 0.044063647490820076, "grad_norm": 0.29121583700180054, "learning_rate": 1.0670731707317074e-05, "loss": 0.5891, "num_tokens": 10622144.0, "step": 36 }, { "epoch": 0.046511627906976744, "grad_norm": 0.27484285831451416, "learning_rate": 1.128048780487805e-05, "loss": 0.5551, "num_tokens": 11195500.0, "step": 38 }, { "epoch": 0.04895960832313342, "grad_norm": 0.27593994140625, "learning_rate": 1.1890243902439025e-05, "loss": 0.5492, "num_tokens": 11804642.0, "step": 40 }, { "epoch": 0.051407588739290085, "grad_norm": 0.2713501751422882, "learning_rate": 1.25e-05, "loss": 0.5408, "num_tokens": 12412613.0, "step": 42 }, { "epoch": 0.05385556915544676, "grad_norm": 0.2747846841812134, "learning_rate": 1.3109756097560976e-05, "loss": 0.535, "num_tokens": 12980451.0, "step": 44 }, { "epoch": 0.056303549571603426, "grad_norm": 0.2635088264942169, "learning_rate": 1.3719512195121953e-05, "loss": 0.5348, "num_tokens": 13573410.0, "step": 46 }, { "epoch": 0.0587515299877601, "grad_norm": 0.2709018290042877, "learning_rate": 1.4329268292682927e-05, "loss": 0.5449, "num_tokens": 14151346.0, "step": 48 }, { "epoch": 0.06119951040391677, "grad_norm": 0.2677537798881531, "learning_rate": 1.4939024390243902e-05, "loss": 0.5479, "num_tokens": 14772803.0, "step": 50 }, { "epoch": 0.06364749082007344, "grad_norm": 0.2600240409374237, "learning_rate": 1.554878048780488e-05, "loss": 0.5297, "num_tokens": 15354869.0, "step": 52 }, { "epoch": 0.06609547123623011, "grad_norm": 0.2664760649204254, "learning_rate": 1.6158536585365855e-05, "loss": 0.5248, "num_tokens": 15944249.0, "step": 54 }, { "epoch": 0.06854345165238677, "grad_norm": 0.26839882135391235, "learning_rate": 1.676829268292683e-05, "loss": 0.5017, "num_tokens": 16523611.0, "step": 56 }, { "epoch": 0.07099143206854346, "grad_norm": 0.2592765688896179, "learning_rate": 1.7378048780487806e-05, "loss": 0.5185, "num_tokens": 17110353.0, "step": 58 }, { "epoch": 0.07343941248470012, "grad_norm": 0.2596220374107361, "learning_rate": 1.798780487804878e-05, "loss": 0.5195, "num_tokens": 17682600.0, "step": 60 }, { "epoch": 0.07588739290085679, "grad_norm": 0.26976412534713745, "learning_rate": 1.8597560975609757e-05, "loss": 0.5076, "num_tokens": 18274385.0, "step": 62 }, { "epoch": 0.07833537331701346, "grad_norm": 0.270097017288208, "learning_rate": 1.9207317073170733e-05, "loss": 0.5333, "num_tokens": 18864027.0, "step": 64 }, { "epoch": 0.08078335373317014, "grad_norm": 0.2646830976009369, "learning_rate": 1.9817073170731708e-05, "loss": 0.4993, "num_tokens": 19450167.0, "step": 66 }, { "epoch": 0.0832313341493268, "grad_norm": 0.260958194732666, "learning_rate": 2.0426829268292683e-05, "loss": 0.5001, "num_tokens": 20027616.0, "step": 68 }, { "epoch": 0.08567931456548347, "grad_norm": 0.2683822214603424, "learning_rate": 2.103658536585366e-05, "loss": 0.5106, "num_tokens": 20641209.0, "step": 70 }, { "epoch": 0.08812729498164015, "grad_norm": 0.26536548137664795, "learning_rate": 2.1646341463414634e-05, "loss": 0.4927, "num_tokens": 21231993.0, "step": 72 }, { "epoch": 0.09057527539779682, "grad_norm": 0.2761287987232208, "learning_rate": 2.225609756097561e-05, "loss": 0.4868, "num_tokens": 21822237.0, "step": 74 }, { "epoch": 0.09302325581395349, "grad_norm": 0.2746763527393341, "learning_rate": 2.286585365853659e-05, "loss": 0.5097, "num_tokens": 22406364.0, "step": 76 }, { "epoch": 0.09547123623011015, "grad_norm": 0.278967946767807, "learning_rate": 2.347560975609756e-05, "loss": 0.5027, "num_tokens": 23012261.0, "step": 78 }, { "epoch": 0.09791921664626684, "grad_norm": 0.27916890382766724, "learning_rate": 2.4085365853658536e-05, "loss": 0.4991, "num_tokens": 23609152.0, "step": 80 }, { "epoch": 0.1003671970624235, "grad_norm": 0.27606090903282166, "learning_rate": 2.4695121951219512e-05, "loss": 0.4953, "num_tokens": 24195467.0, "step": 82 }, { "epoch": 0.10281517747858017, "grad_norm": 0.29051369428634644, "learning_rate": 2.530487804878049e-05, "loss": 0.4836, "num_tokens": 24776287.0, "step": 84 }, { "epoch": 0.10526315789473684, "grad_norm": 0.29912522435188293, "learning_rate": 2.5914634146341466e-05, "loss": 0.4944, "num_tokens": 25353355.0, "step": 86 }, { "epoch": 0.10771113831089352, "grad_norm": 0.3201720118522644, "learning_rate": 2.652439024390244e-05, "loss": 0.4903, "num_tokens": 25910479.0, "step": 88 }, { "epoch": 0.11015911872705018, "grad_norm": 0.30178847908973694, "learning_rate": 2.7134146341463417e-05, "loss": 0.4972, "num_tokens": 26499675.0, "step": 90 }, { "epoch": 0.11260709914320685, "grad_norm": 0.28765812516212463, "learning_rate": 2.7743902439024393e-05, "loss": 0.4786, "num_tokens": 27077382.0, "step": 92 }, { "epoch": 0.11505507955936352, "grad_norm": 0.30265727639198303, "learning_rate": 2.8353658536585365e-05, "loss": 0.4914, "num_tokens": 27670497.0, "step": 94 }, { "epoch": 0.1175030599755202, "grad_norm": 0.29097241163253784, "learning_rate": 2.896341463414634e-05, "loss": 0.4841, "num_tokens": 28263901.0, "step": 96 }, { "epoch": 0.11995104039167687, "grad_norm": 0.29328176379203796, "learning_rate": 2.9573170731707316e-05, "loss": 0.4649, "num_tokens": 28845275.0, "step": 98 }, { "epoch": 0.12239902080783353, "grad_norm": 0.31686413288116455, "learning_rate": 3.0182926829268294e-05, "loss": 0.4835, "num_tokens": 29436166.0, "step": 100 }, { "epoch": 0.12484700122399021, "grad_norm": 0.31773534417152405, "learning_rate": 3.079268292682927e-05, "loss": 0.4789, "num_tokens": 30044583.0, "step": 102 }, { "epoch": 0.12729498164014688, "grad_norm": 0.3541443943977356, "learning_rate": 3.140243902439025e-05, "loss": 0.4812, "num_tokens": 30645726.0, "step": 104 }, { "epoch": 0.12974296205630356, "grad_norm": 0.33977681398391724, "learning_rate": 3.201219512195122e-05, "loss": 0.4932, "num_tokens": 31251475.0, "step": 106 }, { "epoch": 0.13219094247246022, "grad_norm": 0.33283287286758423, "learning_rate": 3.26219512195122e-05, "loss": 0.4688, "num_tokens": 31830636.0, "step": 108 }, { "epoch": 0.1346389228886169, "grad_norm": 0.33952081203460693, "learning_rate": 3.323170731707317e-05, "loss": 0.4885, "num_tokens": 32435998.0, "step": 110 }, { "epoch": 0.13708690330477355, "grad_norm": 0.30896127223968506, "learning_rate": 3.384146341463415e-05, "loss": 0.4684, "num_tokens": 33041535.0, "step": 112 }, { "epoch": 0.13953488372093023, "grad_norm": 0.31626206636428833, "learning_rate": 3.445121951219512e-05, "loss": 0.484, "num_tokens": 33617226.0, "step": 114 }, { "epoch": 0.1419828641370869, "grad_norm": 0.3127070665359497, "learning_rate": 3.5060975609756095e-05, "loss": 0.4778, "num_tokens": 34184157.0, "step": 116 }, { "epoch": 0.14443084455324356, "grad_norm": 0.2935652434825897, "learning_rate": 3.5670731707317074e-05, "loss": 0.4657, "num_tokens": 34779714.0, "step": 118 }, { "epoch": 0.14687882496940025, "grad_norm": 0.3200523555278778, "learning_rate": 3.628048780487805e-05, "loss": 0.4665, "num_tokens": 35361759.0, "step": 120 }, { "epoch": 0.14932680538555693, "grad_norm": 0.26772016286849976, "learning_rate": 3.6890243902439025e-05, "loss": 0.4685, "num_tokens": 35960992.0, "step": 122 }, { "epoch": 0.15177478580171358, "grad_norm": 0.29133862257003784, "learning_rate": 3.7500000000000003e-05, "loss": 0.4815, "num_tokens": 36577873.0, "step": 124 }, { "epoch": 0.15422276621787026, "grad_norm": 0.3038277328014374, "learning_rate": 3.8109756097560976e-05, "loss": 0.4831, "num_tokens": 37173083.0, "step": 126 }, { "epoch": 0.15667074663402691, "grad_norm": 0.308441162109375, "learning_rate": 3.8719512195121954e-05, "loss": 0.4803, "num_tokens": 37772321.0, "step": 128 }, { "epoch": 0.1591187270501836, "grad_norm": 0.29993629455566406, "learning_rate": 3.932926829268293e-05, "loss": 0.4859, "num_tokens": 38366280.0, "step": 130 }, { "epoch": 0.16156670746634028, "grad_norm": 0.31718897819519043, "learning_rate": 3.9939024390243905e-05, "loss": 0.4955, "num_tokens": 38963886.0, "step": 132 }, { "epoch": 0.16401468788249693, "grad_norm": 0.3059256672859192, "learning_rate": 4.0548780487804884e-05, "loss": 0.4655, "num_tokens": 39590863.0, "step": 134 }, { "epoch": 0.1664626682986536, "grad_norm": 0.31588634848594666, "learning_rate": 4.1158536585365856e-05, "loss": 0.4688, "num_tokens": 40182071.0, "step": 136 }, { "epoch": 0.1689106487148103, "grad_norm": 0.27568042278289795, "learning_rate": 4.176829268292683e-05, "loss": 0.4655, "num_tokens": 40758834.0, "step": 138 }, { "epoch": 0.17135862913096694, "grad_norm": 0.2977793216705322, "learning_rate": 4.237804878048781e-05, "loss": 0.495, "num_tokens": 41366991.0, "step": 140 }, { "epoch": 0.17380660954712362, "grad_norm": 0.32999318838119507, "learning_rate": 4.298780487804878e-05, "loss": 0.4856, "num_tokens": 41970776.0, "step": 142 }, { "epoch": 0.1762545899632803, "grad_norm": 0.26864486932754517, "learning_rate": 4.359756097560976e-05, "loss": 0.4678, "num_tokens": 42545064.0, "step": 144 }, { "epoch": 0.17870257037943696, "grad_norm": 0.2972167134284973, "learning_rate": 4.420731707317074e-05, "loss": 0.4601, "num_tokens": 43097646.0, "step": 146 }, { "epoch": 0.18115055079559364, "grad_norm": 0.2935771942138672, "learning_rate": 4.481707317073171e-05, "loss": 0.4891, "num_tokens": 43704720.0, "step": 148 }, { "epoch": 0.1835985312117503, "grad_norm": 0.29881051182746887, "learning_rate": 4.542682926829269e-05, "loss": 0.4483, "num_tokens": 44291468.0, "step": 150 }, { "epoch": 0.18604651162790697, "grad_norm": 0.2607310712337494, "learning_rate": 4.603658536585366e-05, "loss": 0.4772, "num_tokens": 44889809.0, "step": 152 }, { "epoch": 0.18849449204406366, "grad_norm": 0.3168869614601135, "learning_rate": 4.664634146341464e-05, "loss": 0.4654, "num_tokens": 45471363.0, "step": 154 }, { "epoch": 0.1909424724602203, "grad_norm": 0.33748289942741394, "learning_rate": 4.725609756097561e-05, "loss": 0.4628, "num_tokens": 46044686.0, "step": 156 }, { "epoch": 0.193390452876377, "grad_norm": 0.28347885608673096, "learning_rate": 4.786585365853658e-05, "loss": 0.4504, "num_tokens": 46638282.0, "step": 158 }, { "epoch": 0.19583843329253367, "grad_norm": 0.29194921255111694, "learning_rate": 4.847560975609756e-05, "loss": 0.4776, "num_tokens": 47254725.0, "step": 160 }, { "epoch": 0.19828641370869032, "grad_norm": 0.3240629732608795, "learning_rate": 4.908536585365854e-05, "loss": 0.4624, "num_tokens": 47825874.0, "step": 162 }, { "epoch": 0.200734394124847, "grad_norm": 0.41551873087882996, "learning_rate": 4.969512195121951e-05, "loss": 0.4685, "num_tokens": 48404000.0, "step": 164 }, { "epoch": 0.20318237454100369, "grad_norm": 0.34084752202033997, "learning_rate": 4.9965986394557824e-05, "loss": 0.4821, "num_tokens": 49020620.0, "step": 166 }, { "epoch": 0.20563035495716034, "grad_norm": 0.32527878880500793, "learning_rate": 4.9897959183673474e-05, "loss": 0.4677, "num_tokens": 49610138.0, "step": 168 }, { "epoch": 0.20807833537331702, "grad_norm": 0.32203590869903564, "learning_rate": 4.982993197278912e-05, "loss": 0.472, "num_tokens": 50239651.0, "step": 170 }, { "epoch": 0.21052631578947367, "grad_norm": 0.3320338726043701, "learning_rate": 4.976190476190477e-05, "loss": 0.4576, "num_tokens": 50821539.0, "step": 172 }, { "epoch": 0.21297429620563035, "grad_norm": 0.31365767121315, "learning_rate": 4.969387755102041e-05, "loss": 0.4662, "num_tokens": 51392202.0, "step": 174 }, { "epoch": 0.21542227662178703, "grad_norm": 0.29618459939956665, "learning_rate": 4.962585034013605e-05, "loss": 0.473, "num_tokens": 51973577.0, "step": 176 }, { "epoch": 0.2178702570379437, "grad_norm": 0.2908554673194885, "learning_rate": 4.95578231292517e-05, "loss": 0.459, "num_tokens": 52557644.0, "step": 178 }, { "epoch": 0.22031823745410037, "grad_norm": 0.2914385199546814, "learning_rate": 4.9489795918367346e-05, "loss": 0.4573, "num_tokens": 53136591.0, "step": 180 }, { "epoch": 0.22276621787025705, "grad_norm": 0.3093741238117218, "learning_rate": 4.9421768707482996e-05, "loss": 0.4739, "num_tokens": 53723609.0, "step": 182 }, { "epoch": 0.2252141982864137, "grad_norm": 0.28613534569740295, "learning_rate": 4.9353741496598646e-05, "loss": 0.4756, "num_tokens": 54317296.0, "step": 184 }, { "epoch": 0.22766217870257038, "grad_norm": 0.305011123418808, "learning_rate": 4.928571428571429e-05, "loss": 0.46, "num_tokens": 54898930.0, "step": 186 }, { "epoch": 0.23011015911872704, "grad_norm": 0.30485352873802185, "learning_rate": 4.921768707482993e-05, "loss": 0.4668, "num_tokens": 55520199.0, "step": 188 }, { "epoch": 0.23255813953488372, "grad_norm": 0.27733299136161804, "learning_rate": 4.914965986394558e-05, "loss": 0.457, "num_tokens": 56148692.0, "step": 190 }, { "epoch": 0.2350061199510404, "grad_norm": 0.27111920714378357, "learning_rate": 4.9081632653061225e-05, "loss": 0.4642, "num_tokens": 56744145.0, "step": 192 }, { "epoch": 0.23745410036719705, "grad_norm": 0.275425523519516, "learning_rate": 4.9013605442176875e-05, "loss": 0.4609, "num_tokens": 57372074.0, "step": 194 }, { "epoch": 0.23990208078335373, "grad_norm": 0.28021734952926636, "learning_rate": 4.894557823129252e-05, "loss": 0.4796, "num_tokens": 57970018.0, "step": 196 }, { "epoch": 0.2423500611995104, "grad_norm": 0.2859236001968384, "learning_rate": 4.887755102040816e-05, "loss": 0.4736, "num_tokens": 58616468.0, "step": 198 }, { "epoch": 0.24479804161566707, "grad_norm": 0.31136834621429443, "learning_rate": 4.880952380952381e-05, "loss": 0.4615, "num_tokens": 59212867.0, "step": 200 }, { "epoch": 0.24724602203182375, "grad_norm": 0.3293072283267975, "learning_rate": 4.8741496598639455e-05, "loss": 0.4648, "num_tokens": 59831819.0, "step": 202 }, { "epoch": 0.24969400244798043, "grad_norm": 0.3014868497848511, "learning_rate": 4.8673469387755104e-05, "loss": 0.4486, "num_tokens": 60406569.0, "step": 204 }, { "epoch": 0.2521419828641371, "grad_norm": 0.28336650133132935, "learning_rate": 4.8605442176870754e-05, "loss": 0.4649, "num_tokens": 60992960.0, "step": 206 }, { "epoch": 0.25458996328029376, "grad_norm": 0.2801848351955414, "learning_rate": 4.85374149659864e-05, "loss": 0.4572, "num_tokens": 61582502.0, "step": 208 }, { "epoch": 0.25703794369645044, "grad_norm": 0.2692711353302002, "learning_rate": 4.846938775510204e-05, "loss": 0.4552, "num_tokens": 62170270.0, "step": 210 }, { "epoch": 0.2594859241126071, "grad_norm": 0.3017294406890869, "learning_rate": 4.840136054421769e-05, "loss": 0.4505, "num_tokens": 62762518.0, "step": 212 }, { "epoch": 0.26193390452876375, "grad_norm": 0.30879127979278564, "learning_rate": 4.8333333333333334e-05, "loss": 0.4588, "num_tokens": 63338317.0, "step": 214 }, { "epoch": 0.26438188494492043, "grad_norm": 0.28352242708206177, "learning_rate": 4.8265306122448984e-05, "loss": 0.4442, "num_tokens": 63946534.0, "step": 216 }, { "epoch": 0.2668298653610771, "grad_norm": 0.2639378607273102, "learning_rate": 4.8197278911564633e-05, "loss": 0.4443, "num_tokens": 64529245.0, "step": 218 }, { "epoch": 0.2692778457772338, "grad_norm": 0.28288203477859497, "learning_rate": 4.812925170068027e-05, "loss": 0.4682, "num_tokens": 65101588.0, "step": 220 }, { "epoch": 0.2717258261933905, "grad_norm": 0.28435492515563965, "learning_rate": 4.806122448979592e-05, "loss": 0.4561, "num_tokens": 65693459.0, "step": 222 }, { "epoch": 0.2741738066095471, "grad_norm": 0.2928950786590576, "learning_rate": 4.799319727891157e-05, "loss": 0.4608, "num_tokens": 66264575.0, "step": 224 }, { "epoch": 0.2766217870257038, "grad_norm": 0.2890413999557495, "learning_rate": 4.792517006802721e-05, "loss": 0.4391, "num_tokens": 66836693.0, "step": 226 }, { "epoch": 0.27906976744186046, "grad_norm": 0.3116688132286072, "learning_rate": 4.785714285714286e-05, "loss": 0.4621, "num_tokens": 67435169.0, "step": 228 }, { "epoch": 0.28151774785801714, "grad_norm": 0.2953718304634094, "learning_rate": 4.7789115646258506e-05, "loss": 0.4668, "num_tokens": 68047327.0, "step": 230 }, { "epoch": 0.2839657282741738, "grad_norm": 0.30888521671295166, "learning_rate": 4.772108843537415e-05, "loss": 0.4585, "num_tokens": 68652895.0, "step": 232 }, { "epoch": 0.2864137086903305, "grad_norm": 0.2853846549987793, "learning_rate": 4.76530612244898e-05, "loss": 0.4594, "num_tokens": 69234680.0, "step": 234 }, { "epoch": 0.28886168910648713, "grad_norm": 0.2881208062171936, "learning_rate": 4.758503401360544e-05, "loss": 0.4601, "num_tokens": 69826413.0, "step": 236 }, { "epoch": 0.2913096695226438, "grad_norm": 0.26556316018104553, "learning_rate": 4.751700680272109e-05, "loss": 0.4535, "num_tokens": 70441126.0, "step": 238 }, { "epoch": 0.2937576499388005, "grad_norm": 0.24764332175254822, "learning_rate": 4.744897959183674e-05, "loss": 0.4491, "num_tokens": 71026297.0, "step": 240 }, { "epoch": 0.2962056303549572, "grad_norm": 0.25014156103134155, "learning_rate": 4.738095238095238e-05, "loss": 0.4398, "num_tokens": 71615415.0, "step": 242 }, { "epoch": 0.29865361077111385, "grad_norm": 0.2763251066207886, "learning_rate": 4.731292517006803e-05, "loss": 0.4533, "num_tokens": 72187205.0, "step": 244 }, { "epoch": 0.3011015911872705, "grad_norm": 0.2749806046485901, "learning_rate": 4.724489795918368e-05, "loss": 0.4463, "num_tokens": 72766789.0, "step": 246 }, { "epoch": 0.30354957160342716, "grad_norm": 0.2782520651817322, "learning_rate": 4.717687074829932e-05, "loss": 0.4556, "num_tokens": 73367195.0, "step": 248 }, { "epoch": 0.30599755201958384, "grad_norm": 0.2629903256893158, "learning_rate": 4.710884353741497e-05, "loss": 0.4616, "num_tokens": 73931883.0, "step": 250 }, { "epoch": 0.3084455324357405, "grad_norm": 0.23975805938243866, "learning_rate": 4.7040816326530614e-05, "loss": 0.4575, "num_tokens": 74512785.0, "step": 252 }, { "epoch": 0.3108935128518972, "grad_norm": 0.26275327801704407, "learning_rate": 4.697278911564626e-05, "loss": 0.4476, "num_tokens": 75103604.0, "step": 254 }, { "epoch": 0.31334149326805383, "grad_norm": 0.2784687876701355, "learning_rate": 4.690476190476191e-05, "loss": 0.4634, "num_tokens": 75691456.0, "step": 256 }, { "epoch": 0.3157894736842105, "grad_norm": 0.27000266313552856, "learning_rate": 4.683673469387756e-05, "loss": 0.4509, "num_tokens": 76296514.0, "step": 258 }, { "epoch": 0.3182374541003672, "grad_norm": 0.27152442932128906, "learning_rate": 4.67687074829932e-05, "loss": 0.4539, "num_tokens": 76901295.0, "step": 260 }, { "epoch": 0.32068543451652387, "grad_norm": 0.28256285190582275, "learning_rate": 4.670068027210884e-05, "loss": 0.445, "num_tokens": 77473398.0, "step": 262 }, { "epoch": 0.32313341493268055, "grad_norm": 0.2900221347808838, "learning_rate": 4.663265306122449e-05, "loss": 0.4625, "num_tokens": 78068399.0, "step": 264 }, { "epoch": 0.32558139534883723, "grad_norm": 0.28292152285575867, "learning_rate": 4.6564625850340136e-05, "loss": 0.4686, "num_tokens": 78683018.0, "step": 266 }, { "epoch": 0.32802937576499386, "grad_norm": 0.2715926170349121, "learning_rate": 4.6496598639455786e-05, "loss": 0.4449, "num_tokens": 79280661.0, "step": 268 }, { "epoch": 0.33047735618115054, "grad_norm": 0.28506749868392944, "learning_rate": 4.642857142857143e-05, "loss": 0.4467, "num_tokens": 79894114.0, "step": 270 }, { "epoch": 0.3329253365973072, "grad_norm": 0.2762807309627533, "learning_rate": 4.636054421768708e-05, "loss": 0.4497, "num_tokens": 80498761.0, "step": 272 }, { "epoch": 0.3353733170134639, "grad_norm": 0.27056536078453064, "learning_rate": 4.629251700680272e-05, "loss": 0.4384, "num_tokens": 81090664.0, "step": 274 }, { "epoch": 0.3378212974296206, "grad_norm": 0.28110426664352417, "learning_rate": 4.6224489795918366e-05, "loss": 0.4466, "num_tokens": 81682310.0, "step": 276 }, { "epoch": 0.3402692778457772, "grad_norm": 0.28032049536705017, "learning_rate": 4.6156462585034015e-05, "loss": 0.4317, "num_tokens": 82257211.0, "step": 278 }, { "epoch": 0.3427172582619339, "grad_norm": 0.27031221985816956, "learning_rate": 4.6088435374149665e-05, "loss": 0.4324, "num_tokens": 82873665.0, "step": 280 }, { "epoch": 0.34516523867809057, "grad_norm": 0.30070960521698, "learning_rate": 4.602040816326531e-05, "loss": 0.4442, "num_tokens": 83459915.0, "step": 282 }, { "epoch": 0.34761321909424725, "grad_norm": 0.2606748640537262, "learning_rate": 4.595238095238095e-05, "loss": 0.4533, "num_tokens": 84064495.0, "step": 284 }, { "epoch": 0.35006119951040393, "grad_norm": 0.2483888566493988, "learning_rate": 4.58843537414966e-05, "loss": 0.4307, "num_tokens": 84650583.0, "step": 286 }, { "epoch": 0.3525091799265606, "grad_norm": 0.26000508666038513, "learning_rate": 4.5816326530612245e-05, "loss": 0.4422, "num_tokens": 85253276.0, "step": 288 }, { "epoch": 0.35495716034271724, "grad_norm": 0.26508358120918274, "learning_rate": 4.5748299319727895e-05, "loss": 0.4409, "num_tokens": 85833675.0, "step": 290 }, { "epoch": 0.3574051407588739, "grad_norm": 0.24521681666374207, "learning_rate": 4.5680272108843544e-05, "loss": 0.4342, "num_tokens": 86421330.0, "step": 292 }, { "epoch": 0.3598531211750306, "grad_norm": 0.2621183693408966, "learning_rate": 4.561224489795918e-05, "loss": 0.4465, "num_tokens": 87030151.0, "step": 294 }, { "epoch": 0.3623011015911873, "grad_norm": 0.2855012118816376, "learning_rate": 4.554421768707483e-05, "loss": 0.4248, "num_tokens": 87567827.0, "step": 296 }, { "epoch": 0.36474908200734396, "grad_norm": 0.271363765001297, "learning_rate": 4.547619047619048e-05, "loss": 0.4328, "num_tokens": 88157730.0, "step": 298 }, { "epoch": 0.3671970624235006, "grad_norm": 0.2715591788291931, "learning_rate": 4.5408163265306124e-05, "loss": 0.4452, "num_tokens": 88759382.0, "step": 300 }, { "epoch": 0.36964504283965727, "grad_norm": 0.2715524435043335, "learning_rate": 4.5340136054421774e-05, "loss": 0.4404, "num_tokens": 89342429.0, "step": 302 }, { "epoch": 0.37209302325581395, "grad_norm": 0.25128498673439026, "learning_rate": 4.527210884353742e-05, "loss": 0.4394, "num_tokens": 89934089.0, "step": 304 }, { "epoch": 0.37454100367197063, "grad_norm": 0.24225927889347076, "learning_rate": 4.520408163265306e-05, "loss": 0.4359, "num_tokens": 90525609.0, "step": 306 }, { "epoch": 0.3769889840881273, "grad_norm": 0.26876208186149597, "learning_rate": 4.513605442176871e-05, "loss": 0.4165, "num_tokens": 91116420.0, "step": 308 }, { "epoch": 0.379436964504284, "grad_norm": 0.22906406223773956, "learning_rate": 4.506802721088435e-05, "loss": 0.4394, "num_tokens": 91706872.0, "step": 310 }, { "epoch": 0.3818849449204406, "grad_norm": 0.251137375831604, "learning_rate": 4.5e-05, "loss": 0.4355, "num_tokens": 92273116.0, "step": 312 }, { "epoch": 0.3843329253365973, "grad_norm": 0.26540762186050415, "learning_rate": 4.493197278911565e-05, "loss": 0.4445, "num_tokens": 92877700.0, "step": 314 }, { "epoch": 0.386780905752754, "grad_norm": 0.26842620968818665, "learning_rate": 4.486394557823129e-05, "loss": 0.4484, "num_tokens": 93448212.0, "step": 316 }, { "epoch": 0.38922888616891066, "grad_norm": 0.28513234853744507, "learning_rate": 4.479591836734694e-05, "loss": 0.4374, "num_tokens": 94015732.0, "step": 318 }, { "epoch": 0.39167686658506734, "grad_norm": 0.2440779209136963, "learning_rate": 4.472789115646259e-05, "loss": 0.4323, "num_tokens": 94583212.0, "step": 320 }, { "epoch": 0.39412484700122397, "grad_norm": 0.25000423192977905, "learning_rate": 4.465986394557823e-05, "loss": 0.4239, "num_tokens": 95154747.0, "step": 322 }, { "epoch": 0.39657282741738065, "grad_norm": 0.2511066794395447, "learning_rate": 4.459183673469388e-05, "loss": 0.4495, "num_tokens": 95736389.0, "step": 324 }, { "epoch": 0.3990208078335373, "grad_norm": 0.2468034029006958, "learning_rate": 4.4523809523809525e-05, "loss": 0.4447, "num_tokens": 96331517.0, "step": 326 }, { "epoch": 0.401468788249694, "grad_norm": 0.2469393014907837, "learning_rate": 4.445578231292517e-05, "loss": 0.4363, "num_tokens": 96919369.0, "step": 328 }, { "epoch": 0.4039167686658507, "grad_norm": 0.25417712330818176, "learning_rate": 4.438775510204082e-05, "loss": 0.4306, "num_tokens": 97509288.0, "step": 330 }, { "epoch": 0.40636474908200737, "grad_norm": 0.24457654356956482, "learning_rate": 4.431972789115647e-05, "loss": 0.4437, "num_tokens": 98114408.0, "step": 332 }, { "epoch": 0.408812729498164, "grad_norm": 0.24999113380908966, "learning_rate": 4.425170068027211e-05, "loss": 0.4259, "num_tokens": 98713441.0, "step": 334 }, { "epoch": 0.4112607099143207, "grad_norm": 0.2625894546508789, "learning_rate": 4.418367346938776e-05, "loss": 0.4311, "num_tokens": 99327738.0, "step": 336 }, { "epoch": 0.41370869033047736, "grad_norm": 0.24420839548110962, "learning_rate": 4.4115646258503404e-05, "loss": 0.436, "num_tokens": 99919049.0, "step": 338 }, { "epoch": 0.41615667074663404, "grad_norm": 0.23604153096675873, "learning_rate": 4.404761904761905e-05, "loss": 0.425, "num_tokens": 100516376.0, "step": 340 }, { "epoch": 0.4186046511627907, "grad_norm": 0.25000831484794617, "learning_rate": 4.39795918367347e-05, "loss": 0.4389, "num_tokens": 101133012.0, "step": 342 }, { "epoch": 0.42105263157894735, "grad_norm": 0.24555934965610504, "learning_rate": 4.391156462585034e-05, "loss": 0.4274, "num_tokens": 101700842.0, "step": 344 }, { "epoch": 0.423500611995104, "grad_norm": 0.23334544897079468, "learning_rate": 4.384353741496599e-05, "loss": 0.4443, "num_tokens": 102301821.0, "step": 346 }, { "epoch": 0.4259485924112607, "grad_norm": 0.2547636330127716, "learning_rate": 4.377551020408163e-05, "loss": 0.4358, "num_tokens": 102911147.0, "step": 348 }, { "epoch": 0.4283965728274174, "grad_norm": 0.24813403189182281, "learning_rate": 4.3707482993197277e-05, "loss": 0.4147, "num_tokens": 103494243.0, "step": 350 }, { "epoch": 0.43084455324357407, "grad_norm": 0.24748776853084564, "learning_rate": 4.3639455782312926e-05, "loss": 0.4337, "num_tokens": 104100107.0, "step": 352 }, { "epoch": 0.43329253365973075, "grad_norm": 0.24889934062957764, "learning_rate": 4.3571428571428576e-05, "loss": 0.4319, "num_tokens": 104696583.0, "step": 354 }, { "epoch": 0.4357405140758874, "grad_norm": 0.24503983557224274, "learning_rate": 4.350340136054422e-05, "loss": 0.4345, "num_tokens": 105266973.0, "step": 356 }, { "epoch": 0.43818849449204406, "grad_norm": 0.2355017215013504, "learning_rate": 4.343537414965987e-05, "loss": 0.4238, "num_tokens": 105855583.0, "step": 358 }, { "epoch": 0.44063647490820074, "grad_norm": 0.21979409456253052, "learning_rate": 4.336734693877551e-05, "loss": 0.4313, "num_tokens": 106413444.0, "step": 360 }, { "epoch": 0.4430844553243574, "grad_norm": 0.23361046612262726, "learning_rate": 4.3299319727891156e-05, "loss": 0.4415, "num_tokens": 107001653.0, "step": 362 }, { "epoch": 0.4455324357405141, "grad_norm": 0.2537902891635895, "learning_rate": 4.3231292517006806e-05, "loss": 0.4331, "num_tokens": 107595819.0, "step": 364 }, { "epoch": 0.4479804161566707, "grad_norm": 0.23102331161499023, "learning_rate": 4.3163265306122455e-05, "loss": 0.4073, "num_tokens": 108198205.0, "step": 366 }, { "epoch": 0.4504283965728274, "grad_norm": 0.2286710888147354, "learning_rate": 4.30952380952381e-05, "loss": 0.4143, "num_tokens": 108768930.0, "step": 368 }, { "epoch": 0.4528763769889841, "grad_norm": 0.22512906789779663, "learning_rate": 4.302721088435374e-05, "loss": 0.4423, "num_tokens": 109393449.0, "step": 370 }, { "epoch": 0.45532435740514077, "grad_norm": 0.20240113139152527, "learning_rate": 4.295918367346939e-05, "loss": 0.4228, "num_tokens": 109998048.0, "step": 372 }, { "epoch": 0.45777233782129745, "grad_norm": 0.2297106236219406, "learning_rate": 4.2891156462585035e-05, "loss": 0.4334, "num_tokens": 110602780.0, "step": 374 }, { "epoch": 0.4602203182374541, "grad_norm": 0.2433292418718338, "learning_rate": 4.2823129251700685e-05, "loss": 0.426, "num_tokens": 111196271.0, "step": 376 }, { "epoch": 0.46266829865361075, "grad_norm": 0.24165078997612, "learning_rate": 4.275510204081633e-05, "loss": 0.4211, "num_tokens": 111762398.0, "step": 378 }, { "epoch": 0.46511627906976744, "grad_norm": 0.2309572547674179, "learning_rate": 4.268707482993197e-05, "loss": 0.4292, "num_tokens": 112344276.0, "step": 380 }, { "epoch": 0.4675642594859241, "grad_norm": 0.2697245478630066, "learning_rate": 4.261904761904762e-05, "loss": 0.4236, "num_tokens": 112929852.0, "step": 382 }, { "epoch": 0.4700122399020808, "grad_norm": 0.2398257702589035, "learning_rate": 4.2551020408163264e-05, "loss": 0.4246, "num_tokens": 113524769.0, "step": 384 }, { "epoch": 0.4724602203182375, "grad_norm": 0.24562087655067444, "learning_rate": 4.2482993197278914e-05, "loss": 0.4119, "num_tokens": 114097175.0, "step": 386 }, { "epoch": 0.4749082007343941, "grad_norm": 0.25012898445129395, "learning_rate": 4.2414965986394564e-05, "loss": 0.4349, "num_tokens": 114681195.0, "step": 388 }, { "epoch": 0.4773561811505508, "grad_norm": 0.23673886060714722, "learning_rate": 4.234693877551021e-05, "loss": 0.4387, "num_tokens": 115275959.0, "step": 390 }, { "epoch": 0.47980416156670747, "grad_norm": 0.23175287246704102, "learning_rate": 4.227891156462585e-05, "loss": 0.4152, "num_tokens": 115836837.0, "step": 392 }, { "epoch": 0.48225214198286415, "grad_norm": 0.23215539753437042, "learning_rate": 4.22108843537415e-05, "loss": 0.4344, "num_tokens": 116449305.0, "step": 394 }, { "epoch": 0.4847001223990208, "grad_norm": 0.23859956860542297, "learning_rate": 4.214285714285714e-05, "loss": 0.4177, "num_tokens": 117045707.0, "step": 396 }, { "epoch": 0.48714810281517745, "grad_norm": 0.22515934705734253, "learning_rate": 4.207482993197279e-05, "loss": 0.4263, "num_tokens": 117635477.0, "step": 398 }, { "epoch": 0.48959608323133413, "grad_norm": 0.2429385930299759, "learning_rate": 4.200680272108844e-05, "loss": 0.4302, "num_tokens": 118231246.0, "step": 400 }, { "epoch": 0.4920440636474908, "grad_norm": 0.24461030960083008, "learning_rate": 4.193877551020408e-05, "loss": 0.4231, "num_tokens": 118844143.0, "step": 402 }, { "epoch": 0.4944920440636475, "grad_norm": 0.23398247361183167, "learning_rate": 4.187074829931973e-05, "loss": 0.4192, "num_tokens": 119416215.0, "step": 404 }, { "epoch": 0.4969400244798042, "grad_norm": 0.24912168085575104, "learning_rate": 4.180272108843538e-05, "loss": 0.4264, "num_tokens": 120009624.0, "step": 406 }, { "epoch": 0.49938800489596086, "grad_norm": 0.2675349712371826, "learning_rate": 4.173469387755102e-05, "loss": 0.4115, "num_tokens": 120601206.0, "step": 408 }, { "epoch": 0.5018359853121175, "grad_norm": 0.24355578422546387, "learning_rate": 4.166666666666667e-05, "loss": 0.4343, "num_tokens": 121202335.0, "step": 410 }, { "epoch": 0.5042839657282742, "grad_norm": 0.24001996219158173, "learning_rate": 4.1598639455782315e-05, "loss": 0.4246, "num_tokens": 121787505.0, "step": 412 }, { "epoch": 0.5067319461444308, "grad_norm": 0.2392289936542511, "learning_rate": 4.153061224489796e-05, "loss": 0.424, "num_tokens": 122383008.0, "step": 414 }, { "epoch": 0.5091799265605875, "grad_norm": 0.25970658659935, "learning_rate": 4.146258503401361e-05, "loss": 0.4345, "num_tokens": 122980153.0, "step": 416 }, { "epoch": 0.5116279069767442, "grad_norm": 0.23119714856147766, "learning_rate": 4.139455782312925e-05, "loss": 0.4116, "num_tokens": 123564055.0, "step": 418 }, { "epoch": 0.5140758873929009, "grad_norm": 0.25148412585258484, "learning_rate": 4.13265306122449e-05, "loss": 0.4214, "num_tokens": 124154714.0, "step": 420 }, { "epoch": 0.5165238678090576, "grad_norm": 0.2325267493724823, "learning_rate": 4.125850340136055e-05, "loss": 0.4315, "num_tokens": 124725604.0, "step": 422 }, { "epoch": 0.5189718482252142, "grad_norm": 0.24483321607112885, "learning_rate": 4.119047619047619e-05, "loss": 0.4232, "num_tokens": 125329669.0, "step": 424 }, { "epoch": 0.5214198286413708, "grad_norm": 0.232200026512146, "learning_rate": 4.112244897959184e-05, "loss": 0.4147, "num_tokens": 125871390.0, "step": 426 }, { "epoch": 0.5238678090575275, "grad_norm": 0.22668562829494476, "learning_rate": 4.105442176870749e-05, "loss": 0.4293, "num_tokens": 126485091.0, "step": 428 }, { "epoch": 0.5263157894736842, "grad_norm": 0.22833625972270966, "learning_rate": 4.098639455782313e-05, "loss": 0.4218, "num_tokens": 127070743.0, "step": 430 }, { "epoch": 0.5287637698898409, "grad_norm": 0.2558884024620056, "learning_rate": 4.091836734693878e-05, "loss": 0.4331, "num_tokens": 127643183.0, "step": 432 }, { "epoch": 0.5312117503059975, "grad_norm": 0.2330903708934784, "learning_rate": 4.0850340136054423e-05, "loss": 0.4286, "num_tokens": 128238869.0, "step": 434 }, { "epoch": 0.5336597307221542, "grad_norm": 0.24798254668712616, "learning_rate": 4.078231292517007e-05, "loss": 0.4137, "num_tokens": 128833166.0, "step": 436 }, { "epoch": 0.5361077111383109, "grad_norm": 0.2437429279088974, "learning_rate": 4.0714285714285717e-05, "loss": 0.4324, "num_tokens": 129432358.0, "step": 438 }, { "epoch": 0.5385556915544676, "grad_norm": 0.234913632273674, "learning_rate": 4.0646258503401366e-05, "loss": 0.4059, "num_tokens": 130005589.0, "step": 440 }, { "epoch": 0.5410036719706243, "grad_norm": 0.21830596029758453, "learning_rate": 4.057823129251701e-05, "loss": 0.4399, "num_tokens": 130617968.0, "step": 442 }, { "epoch": 0.543451652386781, "grad_norm": 0.22601354122161865, "learning_rate": 4.051020408163265e-05, "loss": 0.4096, "num_tokens": 131211641.0, "step": 444 }, { "epoch": 0.5458996328029376, "grad_norm": 0.23291222751140594, "learning_rate": 4.04421768707483e-05, "loss": 0.4185, "num_tokens": 131815751.0, "step": 446 }, { "epoch": 0.5483476132190942, "grad_norm": 0.2386036366224289, "learning_rate": 4.0374149659863946e-05, "loss": 0.4246, "num_tokens": 132435428.0, "step": 448 }, { "epoch": 0.5507955936352509, "grad_norm": 0.22926120460033417, "learning_rate": 4.0306122448979596e-05, "loss": 0.4334, "num_tokens": 133026844.0, "step": 450 }, { "epoch": 0.5532435740514076, "grad_norm": 0.24068664014339447, "learning_rate": 4.023809523809524e-05, "loss": 0.4306, "num_tokens": 133610279.0, "step": 452 }, { "epoch": 0.5556915544675642, "grad_norm": 0.23355446755886078, "learning_rate": 4.017006802721089e-05, "loss": 0.4182, "num_tokens": 134186783.0, "step": 454 }, { "epoch": 0.5581395348837209, "grad_norm": 0.22524282336235046, "learning_rate": 4.010204081632653e-05, "loss": 0.403, "num_tokens": 134766199.0, "step": 456 }, { "epoch": 0.5605875152998776, "grad_norm": 0.22861827909946442, "learning_rate": 4.0034013605442175e-05, "loss": 0.4159, "num_tokens": 135355631.0, "step": 458 }, { "epoch": 0.5630354957160343, "grad_norm": 0.2309344857931137, "learning_rate": 3.9965986394557825e-05, "loss": 0.4361, "num_tokens": 135973241.0, "step": 460 }, { "epoch": 0.565483476132191, "grad_norm": 0.22461438179016113, "learning_rate": 3.9897959183673475e-05, "loss": 0.4098, "num_tokens": 136579751.0, "step": 462 }, { "epoch": 0.5679314565483476, "grad_norm": 0.23691856861114502, "learning_rate": 3.982993197278912e-05, "loss": 0.4212, "num_tokens": 137187474.0, "step": 464 }, { "epoch": 0.5703794369645043, "grad_norm": 0.24203404784202576, "learning_rate": 3.976190476190476e-05, "loss": 0.4377, "num_tokens": 137796342.0, "step": 466 }, { "epoch": 0.572827417380661, "grad_norm": 0.2416626662015915, "learning_rate": 3.969387755102041e-05, "loss": 0.4158, "num_tokens": 138415346.0, "step": 468 }, { "epoch": 0.5752753977968176, "grad_norm": 0.22454653680324554, "learning_rate": 3.9625850340136054e-05, "loss": 0.4316, "num_tokens": 139036676.0, "step": 470 }, { "epoch": 0.5777233782129743, "grad_norm": 0.233390673995018, "learning_rate": 3.9557823129251704e-05, "loss": 0.4295, "num_tokens": 139647226.0, "step": 472 }, { "epoch": 0.5801713586291309, "grad_norm": 0.229845330119133, "learning_rate": 3.9489795918367354e-05, "loss": 0.4105, "num_tokens": 140222429.0, "step": 474 }, { "epoch": 0.5826193390452876, "grad_norm": 0.2502513527870178, "learning_rate": 3.9421768707483e-05, "loss": 0.4296, "num_tokens": 140846203.0, "step": 476 }, { "epoch": 0.5850673194614443, "grad_norm": 0.21805426478385925, "learning_rate": 3.935374149659864e-05, "loss": 0.4098, "num_tokens": 141448874.0, "step": 478 }, { "epoch": 0.587515299877601, "grad_norm": 0.23197561502456665, "learning_rate": 3.928571428571429e-05, "loss": 0.4063, "num_tokens": 142029277.0, "step": 480 }, { "epoch": 0.5899632802937577, "grad_norm": 0.24564990401268005, "learning_rate": 3.921768707482993e-05, "loss": 0.4064, "num_tokens": 142624816.0, "step": 482 }, { "epoch": 0.5924112607099143, "grad_norm": 0.2349110096693039, "learning_rate": 3.914965986394558e-05, "loss": 0.4097, "num_tokens": 143212355.0, "step": 484 }, { "epoch": 0.594859241126071, "grad_norm": 0.22426576912403107, "learning_rate": 3.9081632653061226e-05, "loss": 0.428, "num_tokens": 143820960.0, "step": 486 }, { "epoch": 0.5973072215422277, "grad_norm": 0.21998530626296997, "learning_rate": 3.901360544217687e-05, "loss": 0.4256, "num_tokens": 144394056.0, "step": 488 }, { "epoch": 0.5997552019583844, "grad_norm": 0.22723303735256195, "learning_rate": 3.894557823129252e-05, "loss": 0.4133, "num_tokens": 144976355.0, "step": 490 }, { "epoch": 0.602203182374541, "grad_norm": 0.243075892329216, "learning_rate": 3.887755102040816e-05, "loss": 0.416, "num_tokens": 145575990.0, "step": 492 }, { "epoch": 0.6046511627906976, "grad_norm": 0.23288574814796448, "learning_rate": 3.880952380952381e-05, "loss": 0.4147, "num_tokens": 146180118.0, "step": 494 }, { "epoch": 0.6070991432068543, "grad_norm": 0.23967666923999786, "learning_rate": 3.874149659863946e-05, "loss": 0.4247, "num_tokens": 146763569.0, "step": 496 }, { "epoch": 0.609547123623011, "grad_norm": 0.24336597323417664, "learning_rate": 3.86734693877551e-05, "loss": 0.4123, "num_tokens": 147348892.0, "step": 498 }, { "epoch": 0.6119951040391677, "grad_norm": 0.21569854021072388, "learning_rate": 3.860544217687075e-05, "loss": 0.4149, "num_tokens": 147934106.0, "step": 500 }, { "epoch": 0.6144430844553244, "grad_norm": 0.23546361923217773, "learning_rate": 3.85374149659864e-05, "loss": 0.4331, "num_tokens": 148526284.0, "step": 502 }, { "epoch": 0.616891064871481, "grad_norm": 0.23105807602405548, "learning_rate": 3.846938775510204e-05, "loss": 0.4307, "num_tokens": 149116759.0, "step": 504 }, { "epoch": 0.6193390452876377, "grad_norm": 0.2113538384437561, "learning_rate": 3.840136054421769e-05, "loss": 0.427, "num_tokens": 149733718.0, "step": 506 }, { "epoch": 0.6217870257037944, "grad_norm": 0.21684259176254272, "learning_rate": 3.8333333333333334e-05, "loss": 0.4189, "num_tokens": 150325552.0, "step": 508 }, { "epoch": 0.6242350061199511, "grad_norm": 0.22028353810310364, "learning_rate": 3.826530612244898e-05, "loss": 0.4177, "num_tokens": 150919640.0, "step": 510 }, { "epoch": 0.6266829865361077, "grad_norm": 0.20340217649936676, "learning_rate": 3.819727891156463e-05, "loss": 0.3921, "num_tokens": 151479810.0, "step": 512 }, { "epoch": 0.6291309669522643, "grad_norm": 0.21583996713161469, "learning_rate": 3.812925170068028e-05, "loss": 0.4203, "num_tokens": 152069098.0, "step": 514 }, { "epoch": 0.631578947368421, "grad_norm": 0.21503205597400665, "learning_rate": 3.806122448979592e-05, "loss": 0.4094, "num_tokens": 152650689.0, "step": 516 }, { "epoch": 0.6340269277845777, "grad_norm": 0.22335848212242126, "learning_rate": 3.799319727891157e-05, "loss": 0.4217, "num_tokens": 153238050.0, "step": 518 }, { "epoch": 0.6364749082007344, "grad_norm": 0.2269769310951233, "learning_rate": 3.7925170068027214e-05, "loss": 0.4173, "num_tokens": 153848706.0, "step": 520 }, { "epoch": 0.6389228886168911, "grad_norm": 0.25328436493873596, "learning_rate": 3.785714285714286e-05, "loss": 0.4342, "num_tokens": 154435999.0, "step": 522 }, { "epoch": 0.6413708690330477, "grad_norm": 0.2390831559896469, "learning_rate": 3.778911564625851e-05, "loss": 0.4019, "num_tokens": 155012325.0, "step": 524 }, { "epoch": 0.6438188494492044, "grad_norm": 0.23828354477882385, "learning_rate": 3.772108843537415e-05, "loss": 0.3916, "num_tokens": 155577911.0, "step": 526 }, { "epoch": 0.6462668298653611, "grad_norm": 0.22742412984371185, "learning_rate": 3.76530612244898e-05, "loss": 0.4209, "num_tokens": 156159354.0, "step": 528 }, { "epoch": 0.6487148102815178, "grad_norm": 0.23829485476016998, "learning_rate": 3.758503401360544e-05, "loss": 0.414, "num_tokens": 156734371.0, "step": 530 }, { "epoch": 0.6511627906976745, "grad_norm": 0.23417070508003235, "learning_rate": 3.7517006802721086e-05, "loss": 0.4337, "num_tokens": 157342481.0, "step": 532 }, { "epoch": 0.653610771113831, "grad_norm": 0.21686191856861115, "learning_rate": 3.7448979591836736e-05, "loss": 0.4023, "num_tokens": 157940378.0, "step": 534 }, { "epoch": 0.6560587515299877, "grad_norm": 0.23293235898017883, "learning_rate": 3.7380952380952386e-05, "loss": 0.42, "num_tokens": 158529483.0, "step": 536 }, { "epoch": 0.6585067319461444, "grad_norm": 0.22783978283405304, "learning_rate": 3.731292517006803e-05, "loss": 0.419, "num_tokens": 159116553.0, "step": 538 }, { "epoch": 0.6609547123623011, "grad_norm": 0.22063913941383362, "learning_rate": 3.724489795918368e-05, "loss": 0.4113, "num_tokens": 159704424.0, "step": 540 }, { "epoch": 0.6634026927784578, "grad_norm": 0.22138412296772003, "learning_rate": 3.717687074829932e-05, "loss": 0.4217, "num_tokens": 160315337.0, "step": 542 }, { "epoch": 0.6658506731946144, "grad_norm": 0.2054126262664795, "learning_rate": 3.7108843537414965e-05, "loss": 0.4185, "num_tokens": 160906136.0, "step": 544 }, { "epoch": 0.6682986536107711, "grad_norm": 0.21915203332901, "learning_rate": 3.7040816326530615e-05, "loss": 0.395, "num_tokens": 161482176.0, "step": 546 }, { "epoch": 0.6707466340269278, "grad_norm": 0.22101595997810364, "learning_rate": 3.697278911564626e-05, "loss": 0.402, "num_tokens": 162066339.0, "step": 548 }, { "epoch": 0.6731946144430845, "grad_norm": 0.21050333976745605, "learning_rate": 3.690476190476191e-05, "loss": 0.4045, "num_tokens": 162686859.0, "step": 550 }, { "epoch": 0.6756425948592412, "grad_norm": 0.22473134100437164, "learning_rate": 3.683673469387755e-05, "loss": 0.4011, "num_tokens": 163261743.0, "step": 552 }, { "epoch": 0.6780905752753978, "grad_norm": 0.21602129936218262, "learning_rate": 3.67687074829932e-05, "loss": 0.4063, "num_tokens": 163864744.0, "step": 554 }, { "epoch": 0.6805385556915544, "grad_norm": 0.21802207827568054, "learning_rate": 3.6700680272108844e-05, "loss": 0.3954, "num_tokens": 164424223.0, "step": 556 }, { "epoch": 0.6829865361077111, "grad_norm": 0.21203817427158356, "learning_rate": 3.6632653061224494e-05, "loss": 0.416, "num_tokens": 165031113.0, "step": 558 }, { "epoch": 0.6854345165238678, "grad_norm": 0.21197819709777832, "learning_rate": 3.656462585034014e-05, "loss": 0.4106, "num_tokens": 165645826.0, "step": 560 }, { "epoch": 0.6878824969400245, "grad_norm": 0.22751174867153168, "learning_rate": 3.649659863945579e-05, "loss": 0.4108, "num_tokens": 166270973.0, "step": 562 }, { "epoch": 0.6903304773561811, "grad_norm": 0.23004040122032166, "learning_rate": 3.642857142857143e-05, "loss": 0.4266, "num_tokens": 166855785.0, "step": 564 }, { "epoch": 0.6927784577723378, "grad_norm": 0.20831601321697235, "learning_rate": 3.636054421768707e-05, "loss": 0.4123, "num_tokens": 167481909.0, "step": 566 }, { "epoch": 0.6952264381884945, "grad_norm": 0.21663352847099304, "learning_rate": 3.629251700680272e-05, "loss": 0.4031, "num_tokens": 168078039.0, "step": 568 }, { "epoch": 0.6976744186046512, "grad_norm": 0.22038975358009338, "learning_rate": 3.622448979591837e-05, "loss": 0.4097, "num_tokens": 168668586.0, "step": 570 }, { "epoch": 0.7001223990208079, "grad_norm": 0.2203424721956253, "learning_rate": 3.6156462585034016e-05, "loss": 0.4043, "num_tokens": 169280779.0, "step": 572 }, { "epoch": 0.7025703794369645, "grad_norm": 0.21173244714736938, "learning_rate": 3.608843537414966e-05, "loss": 0.3905, "num_tokens": 169869109.0, "step": 574 }, { "epoch": 0.7050183598531212, "grad_norm": 0.22765503823757172, "learning_rate": 3.602040816326531e-05, "loss": 0.4163, "num_tokens": 170459382.0, "step": 576 }, { "epoch": 0.7074663402692778, "grad_norm": 0.21649986505508423, "learning_rate": 3.595238095238095e-05, "loss": 0.4223, "num_tokens": 171059799.0, "step": 578 }, { "epoch": 0.7099143206854345, "grad_norm": 0.2312426120042801, "learning_rate": 3.58843537414966e-05, "loss": 0.4135, "num_tokens": 171616638.0, "step": 580 }, { "epoch": 0.7123623011015912, "grad_norm": 0.22326509654521942, "learning_rate": 3.5816326530612245e-05, "loss": 0.4168, "num_tokens": 172215523.0, "step": 582 }, { "epoch": 0.7148102815177478, "grad_norm": 0.2218017876148224, "learning_rate": 3.574829931972789e-05, "loss": 0.4051, "num_tokens": 172816261.0, "step": 584 }, { "epoch": 0.7172582619339045, "grad_norm": 0.23403960466384888, "learning_rate": 3.568027210884354e-05, "loss": 0.399, "num_tokens": 173418219.0, "step": 586 }, { "epoch": 0.7197062423500612, "grad_norm": 0.24804775416851044, "learning_rate": 3.561224489795918e-05, "loss": 0.4103, "num_tokens": 174001845.0, "step": 588 }, { "epoch": 0.7221542227662179, "grad_norm": 0.21949386596679688, "learning_rate": 3.554421768707483e-05, "loss": 0.3855, "num_tokens": 174573841.0, "step": 590 }, { "epoch": 0.7246022031823746, "grad_norm": 0.2172522097826004, "learning_rate": 3.547619047619048e-05, "loss": 0.4065, "num_tokens": 175157025.0, "step": 592 }, { "epoch": 0.7270501835985312, "grad_norm": 0.20603708922863007, "learning_rate": 3.5408163265306125e-05, "loss": 0.3932, "num_tokens": 175763839.0, "step": 594 }, { "epoch": 0.7294981640146879, "grad_norm": 0.22905442118644714, "learning_rate": 3.534013605442177e-05, "loss": 0.4014, "num_tokens": 176340303.0, "step": 596 }, { "epoch": 0.7319461444308446, "grad_norm": 0.21808220446109772, "learning_rate": 3.527210884353742e-05, "loss": 0.4157, "num_tokens": 176933693.0, "step": 598 }, { "epoch": 0.7343941248470012, "grad_norm": 0.23098190128803253, "learning_rate": 3.520408163265306e-05, "loss": 0.4062, "num_tokens": 177551417.0, "step": 600 }, { "epoch": 0.7368421052631579, "grad_norm": 0.23179644346237183, "learning_rate": 3.513605442176871e-05, "loss": 0.4002, "num_tokens": 178108215.0, "step": 602 }, { "epoch": 0.7392900856793145, "grad_norm": 0.23075342178344727, "learning_rate": 3.506802721088436e-05, "loss": 0.4061, "num_tokens": 178678758.0, "step": 604 }, { "epoch": 0.7417380660954712, "grad_norm": 0.22547698020935059, "learning_rate": 3.5e-05, "loss": 0.3978, "num_tokens": 179263018.0, "step": 606 }, { "epoch": 0.7441860465116279, "grad_norm": 0.22350163757801056, "learning_rate": 3.493197278911565e-05, "loss": 0.4063, "num_tokens": 179877385.0, "step": 608 }, { "epoch": 0.7466340269277846, "grad_norm": 0.24946771562099457, "learning_rate": 3.48639455782313e-05, "loss": 0.4058, "num_tokens": 180464341.0, "step": 610 }, { "epoch": 0.7490820073439413, "grad_norm": 0.22615167498588562, "learning_rate": 3.479591836734694e-05, "loss": 0.4118, "num_tokens": 181070260.0, "step": 612 }, { "epoch": 0.7515299877600979, "grad_norm": 0.23289895057678223, "learning_rate": 3.472789115646259e-05, "loss": 0.394, "num_tokens": 181650173.0, "step": 614 }, { "epoch": 0.7539779681762546, "grad_norm": 0.21833616495132446, "learning_rate": 3.465986394557823e-05, "loss": 0.3989, "num_tokens": 182238618.0, "step": 616 }, { "epoch": 0.7564259485924113, "grad_norm": 0.20955616235733032, "learning_rate": 3.4591836734693876e-05, "loss": 0.3918, "num_tokens": 182798204.0, "step": 618 }, { "epoch": 0.758873929008568, "grad_norm": 0.22131891548633575, "learning_rate": 3.4523809523809526e-05, "loss": 0.3987, "num_tokens": 183380029.0, "step": 620 }, { "epoch": 0.7613219094247246, "grad_norm": 0.22375357151031494, "learning_rate": 3.445578231292517e-05, "loss": 0.4066, "num_tokens": 183966255.0, "step": 622 }, { "epoch": 0.7637698898408812, "grad_norm": 0.23324353992938995, "learning_rate": 3.438775510204082e-05, "loss": 0.4143, "num_tokens": 184583415.0, "step": 624 }, { "epoch": 0.7662178702570379, "grad_norm": 0.21517063677310944, "learning_rate": 3.431972789115647e-05, "loss": 0.4112, "num_tokens": 185197955.0, "step": 626 }, { "epoch": 0.7686658506731946, "grad_norm": 0.2146264612674713, "learning_rate": 3.4251700680272105e-05, "loss": 0.3906, "num_tokens": 185785557.0, "step": 628 }, { "epoch": 0.7711138310893513, "grad_norm": 0.23694811761379242, "learning_rate": 3.4183673469387755e-05, "loss": 0.4011, "num_tokens": 186356034.0, "step": 630 }, { "epoch": 0.773561811505508, "grad_norm": 0.240101158618927, "learning_rate": 3.4115646258503405e-05, "loss": 0.4097, "num_tokens": 186944825.0, "step": 632 }, { "epoch": 0.7760097919216646, "grad_norm": 0.21157005429267883, "learning_rate": 3.404761904761905e-05, "loss": 0.4091, "num_tokens": 187555120.0, "step": 634 }, { "epoch": 0.7784577723378213, "grad_norm": 0.20811296999454498, "learning_rate": 3.39795918367347e-05, "loss": 0.4106, "num_tokens": 188158111.0, "step": 636 }, { "epoch": 0.780905752753978, "grad_norm": 0.21462666988372803, "learning_rate": 3.391156462585034e-05, "loss": 0.4132, "num_tokens": 188752896.0, "step": 638 }, { "epoch": 0.7833537331701347, "grad_norm": 0.20285725593566895, "learning_rate": 3.3843537414965984e-05, "loss": 0.4108, "num_tokens": 189348219.0, "step": 640 }, { "epoch": 0.7858017135862914, "grad_norm": 0.20694288611412048, "learning_rate": 3.3775510204081634e-05, "loss": 0.3946, "num_tokens": 189894564.0, "step": 642 }, { "epoch": 0.7882496940024479, "grad_norm": 0.19729703664779663, "learning_rate": 3.3707482993197284e-05, "loss": 0.4125, "num_tokens": 190529171.0, "step": 644 }, { "epoch": 0.7906976744186046, "grad_norm": 0.20968155562877655, "learning_rate": 3.363945578231293e-05, "loss": 0.4013, "num_tokens": 191131915.0, "step": 646 }, { "epoch": 0.7931456548347613, "grad_norm": 0.20942412316799164, "learning_rate": 3.357142857142857e-05, "loss": 0.4149, "num_tokens": 191724725.0, "step": 648 }, { "epoch": 0.795593635250918, "grad_norm": 0.20489519834518433, "learning_rate": 3.350340136054422e-05, "loss": 0.4089, "num_tokens": 192336245.0, "step": 650 }, { "epoch": 0.7980416156670747, "grad_norm": 0.2073380947113037, "learning_rate": 3.3435374149659863e-05, "loss": 0.4009, "num_tokens": 192923779.0, "step": 652 }, { "epoch": 0.8004895960832313, "grad_norm": 0.21561507880687714, "learning_rate": 3.336734693877551e-05, "loss": 0.3852, "num_tokens": 193499166.0, "step": 654 }, { "epoch": 0.802937576499388, "grad_norm": 0.21683001518249512, "learning_rate": 3.3299319727891156e-05, "loss": 0.4048, "num_tokens": 194104195.0, "step": 656 }, { "epoch": 0.8053855569155447, "grad_norm": 0.22452619671821594, "learning_rate": 3.3231292517006806e-05, "loss": 0.413, "num_tokens": 194700389.0, "step": 658 }, { "epoch": 0.8078335373317014, "grad_norm": 0.2256346493959427, "learning_rate": 3.316326530612245e-05, "loss": 0.4122, "num_tokens": 195293054.0, "step": 660 }, { "epoch": 0.8102815177478581, "grad_norm": 0.20950421690940857, "learning_rate": 3.309523809523809e-05, "loss": 0.3922, "num_tokens": 195898402.0, "step": 662 }, { "epoch": 0.8127294981640147, "grad_norm": 0.25200343132019043, "learning_rate": 3.302721088435374e-05, "loss": 0.3983, "num_tokens": 196479623.0, "step": 664 }, { "epoch": 0.8151774785801713, "grad_norm": 0.2127966731786728, "learning_rate": 3.295918367346939e-05, "loss": 0.397, "num_tokens": 197025340.0, "step": 666 }, { "epoch": 0.817625458996328, "grad_norm": 0.21392999589443207, "learning_rate": 3.2891156462585036e-05, "loss": 0.3948, "num_tokens": 197594800.0, "step": 668 }, { "epoch": 0.8200734394124847, "grad_norm": 0.22576898336410522, "learning_rate": 3.282312925170068e-05, "loss": 0.4018, "num_tokens": 198193576.0, "step": 670 }, { "epoch": 0.8225214198286414, "grad_norm": 0.20613126456737518, "learning_rate": 3.275510204081633e-05, "loss": 0.3969, "num_tokens": 198803995.0, "step": 672 }, { "epoch": 0.824969400244798, "grad_norm": 0.21142712235450745, "learning_rate": 3.268707482993197e-05, "loss": 0.4005, "num_tokens": 199416611.0, "step": 674 }, { "epoch": 0.8274173806609547, "grad_norm": 0.21077384054660797, "learning_rate": 3.261904761904762e-05, "loss": 0.3923, "num_tokens": 200007315.0, "step": 676 }, { "epoch": 0.8298653610771114, "grad_norm": 0.19479753077030182, "learning_rate": 3.255102040816327e-05, "loss": 0.3971, "num_tokens": 200608406.0, "step": 678 }, { "epoch": 0.8323133414932681, "grad_norm": 0.2116435021162033, "learning_rate": 3.2482993197278915e-05, "loss": 0.4108, "num_tokens": 201208706.0, "step": 680 }, { "epoch": 0.8347613219094248, "grad_norm": 0.21046370267868042, "learning_rate": 3.241496598639456e-05, "loss": 0.3765, "num_tokens": 201783927.0, "step": 682 }, { "epoch": 0.8372093023255814, "grad_norm": 0.20691214501857758, "learning_rate": 3.234693877551021e-05, "loss": 0.4071, "num_tokens": 202380174.0, "step": 684 }, { "epoch": 0.8396572827417381, "grad_norm": 0.21859946846961975, "learning_rate": 3.227891156462585e-05, "loss": 0.4064, "num_tokens": 202985735.0, "step": 686 }, { "epoch": 0.8421052631578947, "grad_norm": 0.20617327094078064, "learning_rate": 3.22108843537415e-05, "loss": 0.4131, "num_tokens": 203582111.0, "step": 688 }, { "epoch": 0.8445532435740514, "grad_norm": 0.21658724546432495, "learning_rate": 3.2142857142857144e-05, "loss": 0.3922, "num_tokens": 204153573.0, "step": 690 }, { "epoch": 0.847001223990208, "grad_norm": 0.20561416447162628, "learning_rate": 3.207482993197279e-05, "loss": 0.3955, "num_tokens": 204763271.0, "step": 692 }, { "epoch": 0.8494492044063647, "grad_norm": 0.21089410781860352, "learning_rate": 3.200680272108844e-05, "loss": 0.3975, "num_tokens": 205341916.0, "step": 694 }, { "epoch": 0.8518971848225214, "grad_norm": 0.21863864362239838, "learning_rate": 3.193877551020408e-05, "loss": 0.4143, "num_tokens": 205924333.0, "step": 696 }, { "epoch": 0.8543451652386781, "grad_norm": 0.21622027456760406, "learning_rate": 3.187074829931973e-05, "loss": 0.403, "num_tokens": 206489074.0, "step": 698 }, { "epoch": 0.8567931456548348, "grad_norm": 0.22788439691066742, "learning_rate": 3.180272108843538e-05, "loss": 0.3977, "num_tokens": 207076635.0, "step": 700 }, { "epoch": 0.8592411260709915, "grad_norm": 0.21112795174121857, "learning_rate": 3.1734693877551016e-05, "loss": 0.4084, "num_tokens": 207679550.0, "step": 702 }, { "epoch": 0.8616891064871481, "grad_norm": 0.22571775317192078, "learning_rate": 3.1666666666666666e-05, "loss": 0.4045, "num_tokens": 208285797.0, "step": 704 }, { "epoch": 0.8641370869033048, "grad_norm": 0.21622362732887268, "learning_rate": 3.1598639455782316e-05, "loss": 0.4063, "num_tokens": 208900402.0, "step": 706 }, { "epoch": 0.8665850673194615, "grad_norm": 0.2100430279970169, "learning_rate": 3.153061224489796e-05, "loss": 0.3924, "num_tokens": 209478996.0, "step": 708 }, { "epoch": 0.8690330477356181, "grad_norm": 0.20774699747562408, "learning_rate": 3.146258503401361e-05, "loss": 0.3922, "num_tokens": 210061792.0, "step": 710 }, { "epoch": 0.8714810281517748, "grad_norm": 0.21711339056491852, "learning_rate": 3.139455782312926e-05, "loss": 0.3986, "num_tokens": 210619894.0, "step": 712 }, { "epoch": 0.8739290085679314, "grad_norm": 0.21686255931854248, "learning_rate": 3.1326530612244895e-05, "loss": 0.3977, "num_tokens": 211195236.0, "step": 714 }, { "epoch": 0.8763769889840881, "grad_norm": 0.21434612572193146, "learning_rate": 3.1258503401360545e-05, "loss": 0.3943, "num_tokens": 211757983.0, "step": 716 }, { "epoch": 0.8788249694002448, "grad_norm": 0.22192634642124176, "learning_rate": 3.1190476190476195e-05, "loss": 0.3997, "num_tokens": 212332438.0, "step": 718 }, { "epoch": 0.8812729498164015, "grad_norm": 0.21407008171081543, "learning_rate": 3.112244897959184e-05, "loss": 0.3892, "num_tokens": 212936652.0, "step": 720 }, { "epoch": 0.8837209302325582, "grad_norm": 0.20672668516635895, "learning_rate": 3.105442176870749e-05, "loss": 0.3835, "num_tokens": 213505407.0, "step": 722 }, { "epoch": 0.8861689106487148, "grad_norm": 0.23475483059883118, "learning_rate": 3.098639455782313e-05, "loss": 0.3985, "num_tokens": 214094524.0, "step": 724 }, { "epoch": 0.8886168910648715, "grad_norm": 0.21666058897972107, "learning_rate": 3.0918367346938774e-05, "loss": 0.3771, "num_tokens": 214670619.0, "step": 726 }, { "epoch": 0.8910648714810282, "grad_norm": 0.20156654715538025, "learning_rate": 3.0850340136054424e-05, "loss": 0.3905, "num_tokens": 215236751.0, "step": 728 }, { "epoch": 0.8935128518971848, "grad_norm": 0.21652384102344513, "learning_rate": 3.078231292517007e-05, "loss": 0.3922, "num_tokens": 215828751.0, "step": 730 }, { "epoch": 0.8959608323133414, "grad_norm": 0.2145451158285141, "learning_rate": 3.071428571428572e-05, "loss": 0.3936, "num_tokens": 216449250.0, "step": 732 }, { "epoch": 0.8984088127294981, "grad_norm": 0.2073957920074463, "learning_rate": 3.064625850340136e-05, "loss": 0.3896, "num_tokens": 217044781.0, "step": 734 }, { "epoch": 0.9008567931456548, "grad_norm": 0.20258405804634094, "learning_rate": 3.0578231292517004e-05, "loss": 0.3916, "num_tokens": 217611996.0, "step": 736 }, { "epoch": 0.9033047735618115, "grad_norm": 0.20792829990386963, "learning_rate": 3.0510204081632654e-05, "loss": 0.3835, "num_tokens": 218232059.0, "step": 738 }, { "epoch": 0.9057527539779682, "grad_norm": 0.20565353333950043, "learning_rate": 3.04421768707483e-05, "loss": 0.4216, "num_tokens": 218810653.0, "step": 740 }, { "epoch": 0.9082007343941249, "grad_norm": 0.20509371161460876, "learning_rate": 3.037414965986395e-05, "loss": 0.3867, "num_tokens": 219409815.0, "step": 742 }, { "epoch": 0.9106487148102815, "grad_norm": 0.2050284892320633, "learning_rate": 3.0306122448979597e-05, "loss": 0.4019, "num_tokens": 220016419.0, "step": 744 }, { "epoch": 0.9130966952264382, "grad_norm": 0.20096543431282043, "learning_rate": 3.0238095238095236e-05, "loss": 0.3953, "num_tokens": 220599687.0, "step": 746 }, { "epoch": 0.9155446756425949, "grad_norm": 0.20031777024269104, "learning_rate": 3.0170068027210886e-05, "loss": 0.396, "num_tokens": 221170683.0, "step": 748 }, { "epoch": 0.9179926560587516, "grad_norm": 0.19940078258514404, "learning_rate": 3.0102040816326533e-05, "loss": 0.3946, "num_tokens": 221764282.0, "step": 750 }, { "epoch": 0.9204406364749081, "grad_norm": 0.19883687794208527, "learning_rate": 3.003401360544218e-05, "loss": 0.4009, "num_tokens": 222383973.0, "step": 752 }, { "epoch": 0.9228886168910648, "grad_norm": 0.19221563637256622, "learning_rate": 2.9965986394557826e-05, "loss": 0.3916, "num_tokens": 222981657.0, "step": 754 }, { "epoch": 0.9253365973072215, "grad_norm": 0.22945253551006317, "learning_rate": 2.989795918367347e-05, "loss": 0.4035, "num_tokens": 223580164.0, "step": 756 }, { "epoch": 0.9277845777233782, "grad_norm": 0.20799842476844788, "learning_rate": 2.9829931972789115e-05, "loss": 0.3924, "num_tokens": 224181708.0, "step": 758 }, { "epoch": 0.9302325581395349, "grad_norm": 0.21721278131008148, "learning_rate": 2.9761904761904762e-05, "loss": 0.4022, "num_tokens": 224750929.0, "step": 760 }, { "epoch": 0.9326805385556916, "grad_norm": 0.20762306451797485, "learning_rate": 2.9693877551020412e-05, "loss": 0.3971, "num_tokens": 225353399.0, "step": 762 }, { "epoch": 0.9351285189718482, "grad_norm": 0.2234756201505661, "learning_rate": 2.9625850340136058e-05, "loss": 0.4058, "num_tokens": 225946912.0, "step": 764 }, { "epoch": 0.9375764993880049, "grad_norm": 0.2123413234949112, "learning_rate": 2.9557823129251698e-05, "loss": 0.3872, "num_tokens": 226513122.0, "step": 766 }, { "epoch": 0.9400244798041616, "grad_norm": 0.2043527066707611, "learning_rate": 2.9489795918367348e-05, "loss": 0.3923, "num_tokens": 227097275.0, "step": 768 }, { "epoch": 0.9424724602203183, "grad_norm": 0.2188185751438141, "learning_rate": 2.9421768707482994e-05, "loss": 0.3944, "num_tokens": 227685155.0, "step": 770 }, { "epoch": 0.944920440636475, "grad_norm": 0.19682037830352783, "learning_rate": 2.935374149659864e-05, "loss": 0.3874, "num_tokens": 228268933.0, "step": 772 }, { "epoch": 0.9473684210526315, "grad_norm": 0.20665135979652405, "learning_rate": 2.9285714285714288e-05, "loss": 0.3874, "num_tokens": 228844822.0, "step": 774 }, { "epoch": 0.9498164014687882, "grad_norm": 0.2342827022075653, "learning_rate": 2.9217687074829937e-05, "loss": 0.3919, "num_tokens": 229413402.0, "step": 776 }, { "epoch": 0.9522643818849449, "grad_norm": 0.24719218909740448, "learning_rate": 2.9149659863945577e-05, "loss": 0.3878, "num_tokens": 229957992.0, "step": 778 }, { "epoch": 0.9547123623011016, "grad_norm": 0.22680489718914032, "learning_rate": 2.9081632653061224e-05, "loss": 0.4041, "num_tokens": 230534148.0, "step": 780 }, { "epoch": 0.9571603427172583, "grad_norm": 0.22106143832206726, "learning_rate": 2.9013605442176874e-05, "loss": 0.3838, "num_tokens": 231127999.0, "step": 782 }, { "epoch": 0.9596083231334149, "grad_norm": 0.20560577511787415, "learning_rate": 2.894557823129252e-05, "loss": 0.3979, "num_tokens": 231731253.0, "step": 784 }, { "epoch": 0.9620563035495716, "grad_norm": 0.20319853723049164, "learning_rate": 2.8877551020408167e-05, "loss": 0.3934, "num_tokens": 232342748.0, "step": 786 }, { "epoch": 0.9645042839657283, "grad_norm": 0.21500813961029053, "learning_rate": 2.880952380952381e-05, "loss": 0.3857, "num_tokens": 232934775.0, "step": 788 }, { "epoch": 0.966952264381885, "grad_norm": 0.2133987694978714, "learning_rate": 2.8741496598639456e-05, "loss": 0.3964, "num_tokens": 233502064.0, "step": 790 }, { "epoch": 0.9694002447980417, "grad_norm": 0.20930859446525574, "learning_rate": 2.8673469387755103e-05, "loss": 0.3881, "num_tokens": 234073172.0, "step": 792 }, { "epoch": 0.9718482252141983, "grad_norm": 0.21337440609931946, "learning_rate": 2.860544217687075e-05, "loss": 0.4044, "num_tokens": 234661532.0, "step": 794 }, { "epoch": 0.9742962056303549, "grad_norm": 0.20554158091545105, "learning_rate": 2.85374149659864e-05, "loss": 0.4069, "num_tokens": 235274908.0, "step": 796 }, { "epoch": 0.9767441860465116, "grad_norm": 0.20806728303432465, "learning_rate": 2.8469387755102046e-05, "loss": 0.4039, "num_tokens": 235855663.0, "step": 798 }, { "epoch": 0.9791921664626683, "grad_norm": 0.198395237326622, "learning_rate": 2.8401360544217685e-05, "loss": 0.3767, "num_tokens": 236428558.0, "step": 800 }, { "epoch": 0.981640146878825, "grad_norm": 0.19444723427295685, "learning_rate": 2.8333333333333335e-05, "loss": 0.3951, "num_tokens": 237023018.0, "step": 802 }, { "epoch": 0.9840881272949816, "grad_norm": 0.21214230358600616, "learning_rate": 2.8265306122448982e-05, "loss": 0.3931, "num_tokens": 237611598.0, "step": 804 }, { "epoch": 0.9865361077111383, "grad_norm": 0.20231059193611145, "learning_rate": 2.819727891156463e-05, "loss": 0.3848, "num_tokens": 238214543.0, "step": 806 }, { "epoch": 0.988984088127295, "grad_norm": 0.2049066573381424, "learning_rate": 2.8129251700680275e-05, "loss": 0.3918, "num_tokens": 238786408.0, "step": 808 }, { "epoch": 0.9914320685434517, "grad_norm": 0.21458584070205688, "learning_rate": 2.8061224489795918e-05, "loss": 0.3882, "num_tokens": 239363151.0, "step": 810 }, { "epoch": 0.9938800489596084, "grad_norm": 0.20600514113903046, "learning_rate": 2.7993197278911565e-05, "loss": 0.3941, "num_tokens": 239967829.0, "step": 812 }, { "epoch": 0.996328029375765, "grad_norm": 0.19233451783657074, "learning_rate": 2.792517006802721e-05, "loss": 0.3838, "num_tokens": 240551191.0, "step": 814 }, { "epoch": 0.9987760097919217, "grad_norm": 0.21595363318920135, "learning_rate": 2.785714285714286e-05, "loss": 0.3978, "num_tokens": 241153107.0, "step": 816 }, { "epoch": 1.0012239902080784, "grad_norm": 0.24451519548892975, "learning_rate": 2.7789115646258508e-05, "loss": 0.3578, "num_tokens": 241769492.0, "step": 818 }, { "epoch": 1.003671970624235, "grad_norm": 0.7559052109718323, "learning_rate": 2.7721088435374147e-05, "loss": 0.3211, "num_tokens": 242359313.0, "step": 820 }, { "epoch": 1.0061199510403918, "grad_norm": 0.5023258328437805, "learning_rate": 2.7653061224489797e-05, "loss": 0.3153, "num_tokens": 242911445.0, "step": 822 }, { "epoch": 1.0085679314565483, "grad_norm": 0.28131622076034546, "learning_rate": 2.7585034013605444e-05, "loss": 0.3292, "num_tokens": 243515905.0, "step": 824 }, { "epoch": 1.0110159118727051, "grad_norm": 0.2248506397008896, "learning_rate": 2.751700680272109e-05, "loss": 0.3231, "num_tokens": 244085094.0, "step": 826 }, { "epoch": 1.0134638922888617, "grad_norm": 0.2191573828458786, "learning_rate": 2.7448979591836737e-05, "loss": 0.3337, "num_tokens": 244710189.0, "step": 828 }, { "epoch": 1.0159118727050183, "grad_norm": 0.2094300538301468, "learning_rate": 2.7380952380952383e-05, "loss": 0.3124, "num_tokens": 245302719.0, "step": 830 }, { "epoch": 1.018359853121175, "grad_norm": 0.23099683225154877, "learning_rate": 2.7312925170068026e-05, "loss": 0.3282, "num_tokens": 245899907.0, "step": 832 }, { "epoch": 1.0208078335373316, "grad_norm": 0.20248746871948242, "learning_rate": 2.7244897959183673e-05, "loss": 0.3256, "num_tokens": 246514507.0, "step": 834 }, { "epoch": 1.0232558139534884, "grad_norm": 0.2067061960697174, "learning_rate": 2.7176870748299323e-05, "loss": 0.3138, "num_tokens": 247121076.0, "step": 836 }, { "epoch": 1.025703794369645, "grad_norm": 0.20735739171504974, "learning_rate": 2.710884353741497e-05, "loss": 0.3098, "num_tokens": 247731957.0, "step": 838 }, { "epoch": 1.0281517747858018, "grad_norm": 0.2064848691225052, "learning_rate": 2.7040816326530616e-05, "loss": 0.3167, "num_tokens": 248333325.0, "step": 840 }, { "epoch": 1.0305997552019583, "grad_norm": 0.21973280608654022, "learning_rate": 2.697278911564626e-05, "loss": 0.3133, "num_tokens": 248929993.0, "step": 842 }, { "epoch": 1.0330477356181151, "grad_norm": 0.20441460609436035, "learning_rate": 2.6904761904761905e-05, "loss": 0.3046, "num_tokens": 249530707.0, "step": 844 }, { "epoch": 1.0354957160342717, "grad_norm": 0.20525039732456207, "learning_rate": 2.6836734693877552e-05, "loss": 0.3171, "num_tokens": 250144988.0, "step": 846 }, { "epoch": 1.0379436964504285, "grad_norm": 0.20789970457553864, "learning_rate": 2.67687074829932e-05, "loss": 0.3213, "num_tokens": 250726834.0, "step": 848 }, { "epoch": 1.040391676866585, "grad_norm": 0.21225066483020782, "learning_rate": 2.6700680272108845e-05, "loss": 0.3237, "num_tokens": 251338286.0, "step": 850 }, { "epoch": 1.0428396572827416, "grad_norm": 0.21135969460010529, "learning_rate": 2.6632653061224488e-05, "loss": 0.3188, "num_tokens": 251938256.0, "step": 852 }, { "epoch": 1.0452876376988984, "grad_norm": 0.1959303766489029, "learning_rate": 2.6564625850340135e-05, "loss": 0.308, "num_tokens": 252528052.0, "step": 854 }, { "epoch": 1.047735618115055, "grad_norm": 0.21999941766262054, "learning_rate": 2.6496598639455785e-05, "loss": 0.3111, "num_tokens": 253131838.0, "step": 856 }, { "epoch": 1.0501835985312118, "grad_norm": 0.21813488006591797, "learning_rate": 2.642857142857143e-05, "loss": 0.3063, "num_tokens": 253683217.0, "step": 858 }, { "epoch": 1.0526315789473684, "grad_norm": 0.1996009200811386, "learning_rate": 2.6360544217687078e-05, "loss": 0.3199, "num_tokens": 254305603.0, "step": 860 }, { "epoch": 1.0550795593635252, "grad_norm": 0.20491966605186462, "learning_rate": 2.6292517006802724e-05, "loss": 0.3075, "num_tokens": 254866953.0, "step": 862 }, { "epoch": 1.0575275397796817, "grad_norm": 0.20605306327342987, "learning_rate": 2.6224489795918367e-05, "loss": 0.3212, "num_tokens": 255438135.0, "step": 864 }, { "epoch": 1.0599755201958385, "grad_norm": 0.20627039670944214, "learning_rate": 2.6156462585034014e-05, "loss": 0.3089, "num_tokens": 256042020.0, "step": 866 }, { "epoch": 1.062423500611995, "grad_norm": 0.20019806921482086, "learning_rate": 2.608843537414966e-05, "loss": 0.3128, "num_tokens": 256618543.0, "step": 868 }, { "epoch": 1.0648714810281519, "grad_norm": 0.19757170975208282, "learning_rate": 2.6020408163265307e-05, "loss": 0.3199, "num_tokens": 257226439.0, "step": 870 }, { "epoch": 1.0673194614443084, "grad_norm": 0.2104121893644333, "learning_rate": 2.5952380952380957e-05, "loss": 0.315, "num_tokens": 257822132.0, "step": 872 }, { "epoch": 1.069767441860465, "grad_norm": 0.2085297554731369, "learning_rate": 2.5884353741496596e-05, "loss": 0.3077, "num_tokens": 258411819.0, "step": 874 }, { "epoch": 1.0722154222766218, "grad_norm": 0.22473223507404327, "learning_rate": 2.5816326530612246e-05, "loss": 0.3195, "num_tokens": 258980885.0, "step": 876 }, { "epoch": 1.0746634026927784, "grad_norm": 0.22228698432445526, "learning_rate": 2.5748299319727893e-05, "loss": 0.3232, "num_tokens": 259568916.0, "step": 878 }, { "epoch": 1.0771113831089352, "grad_norm": 0.20244480669498444, "learning_rate": 2.568027210884354e-05, "loss": 0.3205, "num_tokens": 260173762.0, "step": 880 }, { "epoch": 1.0795593635250917, "grad_norm": 0.22693733870983124, "learning_rate": 2.5612244897959186e-05, "loss": 0.326, "num_tokens": 260780328.0, "step": 882 }, { "epoch": 1.0820073439412485, "grad_norm": 0.2070688158273697, "learning_rate": 2.554421768707483e-05, "loss": 0.3126, "num_tokens": 261378955.0, "step": 884 }, { "epoch": 1.084455324357405, "grad_norm": 0.2115529328584671, "learning_rate": 2.5476190476190476e-05, "loss": 0.3127, "num_tokens": 261963127.0, "step": 886 }, { "epoch": 1.086903304773562, "grad_norm": 0.19817596673965454, "learning_rate": 2.5408163265306122e-05, "loss": 0.3165, "num_tokens": 262561647.0, "step": 888 }, { "epoch": 1.0893512851897185, "grad_norm": 0.22043918073177338, "learning_rate": 2.534013605442177e-05, "loss": 0.3171, "num_tokens": 263136540.0, "step": 890 }, { "epoch": 1.091799265605875, "grad_norm": 0.22259274125099182, "learning_rate": 2.527210884353742e-05, "loss": 0.3269, "num_tokens": 263709944.0, "step": 892 }, { "epoch": 1.0942472460220318, "grad_norm": 0.2146555483341217, "learning_rate": 2.5204081632653065e-05, "loss": 0.3185, "num_tokens": 264272683.0, "step": 894 }, { "epoch": 1.0966952264381884, "grad_norm": 0.20278215408325195, "learning_rate": 2.5136054421768708e-05, "loss": 0.3151, "num_tokens": 264868896.0, "step": 896 }, { "epoch": 1.0991432068543452, "grad_norm": 0.19636254012584686, "learning_rate": 2.5068027210884355e-05, "loss": 0.3162, "num_tokens": 265469534.0, "step": 898 }, { "epoch": 1.1015911872705018, "grad_norm": 0.2220093160867691, "learning_rate": 2.5e-05, "loss": 0.3256, "num_tokens": 266079945.0, "step": 900 }, { "epoch": 1.1040391676866586, "grad_norm": 0.2047000527381897, "learning_rate": 2.4931972789115648e-05, "loss": 0.3381, "num_tokens": 266677132.0, "step": 902 }, { "epoch": 1.1064871481028151, "grad_norm": 0.20770138502120972, "learning_rate": 2.4863945578231294e-05, "loss": 0.32, "num_tokens": 267251319.0, "step": 904 }, { "epoch": 1.108935128518972, "grad_norm": 0.21177665889263153, "learning_rate": 2.479591836734694e-05, "loss": 0.3228, "num_tokens": 267851096.0, "step": 906 }, { "epoch": 1.1113831089351285, "grad_norm": 0.20156866312026978, "learning_rate": 2.4727891156462587e-05, "loss": 0.3257, "num_tokens": 268461913.0, "step": 908 }, { "epoch": 1.1138310893512853, "grad_norm": 0.2272440493106842, "learning_rate": 2.4659863945578234e-05, "loss": 0.3204, "num_tokens": 269034487.0, "step": 910 }, { "epoch": 1.1162790697674418, "grad_norm": 0.19609837234020233, "learning_rate": 2.459183673469388e-05, "loss": 0.3084, "num_tokens": 269627033.0, "step": 912 }, { "epoch": 1.1187270501835984, "grad_norm": 0.20185841619968414, "learning_rate": 2.4523809523809523e-05, "loss": 0.3038, "num_tokens": 270221053.0, "step": 914 }, { "epoch": 1.1211750305997552, "grad_norm": 0.21575827896595, "learning_rate": 2.445578231292517e-05, "loss": 0.3141, "num_tokens": 270796741.0, "step": 916 }, { "epoch": 1.1236230110159118, "grad_norm": 0.21539226174354553, "learning_rate": 2.438775510204082e-05, "loss": 0.2977, "num_tokens": 271365659.0, "step": 918 }, { "epoch": 1.1260709914320686, "grad_norm": 0.20362544059753418, "learning_rate": 2.4319727891156463e-05, "loss": 0.3148, "num_tokens": 271959464.0, "step": 920 }, { "epoch": 1.1285189718482251, "grad_norm": 0.20306700468063354, "learning_rate": 2.425170068027211e-05, "loss": 0.3123, "num_tokens": 272536606.0, "step": 922 }, { "epoch": 1.130966952264382, "grad_norm": 0.2173018753528595, "learning_rate": 2.4183673469387756e-05, "loss": 0.3041, "num_tokens": 273133730.0, "step": 924 }, { "epoch": 1.1334149326805385, "grad_norm": 0.20855848491191864, "learning_rate": 2.4115646258503403e-05, "loss": 0.3047, "num_tokens": 273707414.0, "step": 926 }, { "epoch": 1.1358629130966953, "grad_norm": 0.20593813061714172, "learning_rate": 2.404761904761905e-05, "loss": 0.3071, "num_tokens": 274288846.0, "step": 928 }, { "epoch": 1.1383108935128519, "grad_norm": 0.19493956863880157, "learning_rate": 2.3979591836734696e-05, "loss": 0.3093, "num_tokens": 274886178.0, "step": 930 }, { "epoch": 1.1407588739290087, "grad_norm": 0.20352424681186676, "learning_rate": 2.3911564625850342e-05, "loss": 0.3142, "num_tokens": 275470345.0, "step": 932 }, { "epoch": 1.1432068543451652, "grad_norm": 0.21687689423561096, "learning_rate": 2.384353741496599e-05, "loss": 0.3195, "num_tokens": 276068187.0, "step": 934 }, { "epoch": 1.1456548347613218, "grad_norm": 0.20299257338047028, "learning_rate": 2.3775510204081632e-05, "loss": 0.3092, "num_tokens": 276667953.0, "step": 936 }, { "epoch": 1.1481028151774786, "grad_norm": 0.20521888136863708, "learning_rate": 2.370748299319728e-05, "loss": 0.3204, "num_tokens": 277258175.0, "step": 938 }, { "epoch": 1.1505507955936352, "grad_norm": 0.19685983657836914, "learning_rate": 2.3639455782312928e-05, "loss": 0.3085, "num_tokens": 277836527.0, "step": 940 }, { "epoch": 1.152998776009792, "grad_norm": 0.20181462168693542, "learning_rate": 2.357142857142857e-05, "loss": 0.3187, "num_tokens": 278405534.0, "step": 942 }, { "epoch": 1.1554467564259485, "grad_norm": 0.21559888124465942, "learning_rate": 2.3503401360544218e-05, "loss": 0.3174, "num_tokens": 278993945.0, "step": 944 }, { "epoch": 1.1578947368421053, "grad_norm": 0.2135152816772461, "learning_rate": 2.3435374149659864e-05, "loss": 0.321, "num_tokens": 279568621.0, "step": 946 }, { "epoch": 1.1603427172582619, "grad_norm": 0.22846734523773193, "learning_rate": 2.336734693877551e-05, "loss": 0.3212, "num_tokens": 280153080.0, "step": 948 }, { "epoch": 1.1627906976744187, "grad_norm": 0.22005695104599, "learning_rate": 2.3299319727891157e-05, "loss": 0.3308, "num_tokens": 280750498.0, "step": 950 }, { "epoch": 1.1652386780905752, "grad_norm": 0.20092672109603882, "learning_rate": 2.3231292517006804e-05, "loss": 0.3217, "num_tokens": 281356910.0, "step": 952 }, { "epoch": 1.167686658506732, "grad_norm": 0.20536336302757263, "learning_rate": 2.316326530612245e-05, "loss": 0.321, "num_tokens": 281936839.0, "step": 954 }, { "epoch": 1.1701346389228886, "grad_norm": 0.19616661965847015, "learning_rate": 2.3095238095238097e-05, "loss": 0.3129, "num_tokens": 282545026.0, "step": 956 }, { "epoch": 1.1725826193390452, "grad_norm": 0.2042839229106903, "learning_rate": 2.3027210884353743e-05, "loss": 0.3145, "num_tokens": 283148652.0, "step": 958 }, { "epoch": 1.175030599755202, "grad_norm": 0.217295840382576, "learning_rate": 2.295918367346939e-05, "loss": 0.3158, "num_tokens": 283741655.0, "step": 960 }, { "epoch": 1.1774785801713585, "grad_norm": 0.2030820995569229, "learning_rate": 2.2891156462585033e-05, "loss": 0.3188, "num_tokens": 284338996.0, "step": 962 }, { "epoch": 1.1799265605875153, "grad_norm": 0.2045370191335678, "learning_rate": 2.282312925170068e-05, "loss": 0.3199, "num_tokens": 284937493.0, "step": 964 }, { "epoch": 1.182374541003672, "grad_norm": 0.200841024518013, "learning_rate": 2.275510204081633e-05, "loss": 0.3243, "num_tokens": 285523397.0, "step": 966 }, { "epoch": 1.1848225214198287, "grad_norm": 0.2140469253063202, "learning_rate": 2.2687074829931973e-05, "loss": 0.3232, "num_tokens": 286120001.0, "step": 968 }, { "epoch": 1.1872705018359853, "grad_norm": 0.19735847413539886, "learning_rate": 2.261904761904762e-05, "loss": 0.3141, "num_tokens": 286695816.0, "step": 970 }, { "epoch": 1.189718482252142, "grad_norm": 0.2021227777004242, "learning_rate": 2.255102040816327e-05, "loss": 0.3131, "num_tokens": 287308514.0, "step": 972 }, { "epoch": 1.1921664626682986, "grad_norm": 0.2072349190711975, "learning_rate": 2.2482993197278912e-05, "loss": 0.2992, "num_tokens": 287864124.0, "step": 974 }, { "epoch": 1.1946144430844554, "grad_norm": 0.20413769781589508, "learning_rate": 2.241496598639456e-05, "loss": 0.3117, "num_tokens": 288440548.0, "step": 976 }, { "epoch": 1.197062423500612, "grad_norm": 0.1954454928636551, "learning_rate": 2.2346938775510205e-05, "loss": 0.3138, "num_tokens": 289023705.0, "step": 978 }, { "epoch": 1.1995104039167686, "grad_norm": 0.21502931416034698, "learning_rate": 2.2278911564625852e-05, "loss": 0.3169, "num_tokens": 289592388.0, "step": 980 }, { "epoch": 1.2019583843329253, "grad_norm": 0.2102571427822113, "learning_rate": 2.2210884353741498e-05, "loss": 0.3114, "num_tokens": 290139669.0, "step": 982 }, { "epoch": 1.204406364749082, "grad_norm": 0.19803567230701447, "learning_rate": 2.214285714285714e-05, "loss": 0.2988, "num_tokens": 290703981.0, "step": 984 }, { "epoch": 1.2068543451652387, "grad_norm": 0.21732397377490997, "learning_rate": 2.207482993197279e-05, "loss": 0.3237, "num_tokens": 291284094.0, "step": 986 }, { "epoch": 1.2093023255813953, "grad_norm": 0.2095579355955124, "learning_rate": 2.2006802721088438e-05, "loss": 0.3249, "num_tokens": 291859678.0, "step": 988 }, { "epoch": 1.211750305997552, "grad_norm": 0.19027207791805267, "learning_rate": 2.193877551020408e-05, "loss": 0.3165, "num_tokens": 292461042.0, "step": 990 }, { "epoch": 1.2141982864137086, "grad_norm": 0.2084067016839981, "learning_rate": 2.187074829931973e-05, "loss": 0.3143, "num_tokens": 293014987.0, "step": 992 }, { "epoch": 1.2166462668298654, "grad_norm": 0.19558677077293396, "learning_rate": 2.1802721088435374e-05, "loss": 0.3028, "num_tokens": 293602670.0, "step": 994 }, { "epoch": 1.219094247246022, "grad_norm": 0.20325857400894165, "learning_rate": 2.173469387755102e-05, "loss": 0.3154, "num_tokens": 294198701.0, "step": 996 }, { "epoch": 1.2215422276621788, "grad_norm": 0.1984713226556778, "learning_rate": 2.1666666666666667e-05, "loss": 0.3109, "num_tokens": 294765389.0, "step": 998 }, { "epoch": 1.2239902080783354, "grad_norm": 0.21285976469516754, "learning_rate": 2.1598639455782314e-05, "loss": 0.3108, "num_tokens": 295355827.0, "step": 1000 }, { "epoch": 1.226438188494492, "grad_norm": 0.20429600775241852, "learning_rate": 2.153061224489796e-05, "loss": 0.3116, "num_tokens": 295939047.0, "step": 1002 }, { "epoch": 1.2288861689106487, "grad_norm": 0.20868054032325745, "learning_rate": 2.1462585034013607e-05, "loss": 0.317, "num_tokens": 296511108.0, "step": 1004 }, { "epoch": 1.2313341493268053, "grad_norm": 0.19811798632144928, "learning_rate": 2.1394557823129253e-05, "loss": 0.3202, "num_tokens": 297091745.0, "step": 1006 }, { "epoch": 1.233782129742962, "grad_norm": 0.19950352609157562, "learning_rate": 2.13265306122449e-05, "loss": 0.3226, "num_tokens": 297693752.0, "step": 1008 }, { "epoch": 1.2362301101591187, "grad_norm": 0.19808277487754822, "learning_rate": 2.1258503401360543e-05, "loss": 0.3164, "num_tokens": 298274684.0, "step": 1010 }, { "epoch": 1.2386780905752754, "grad_norm": 0.1930680274963379, "learning_rate": 2.1190476190476193e-05, "loss": 0.3116, "num_tokens": 298871738.0, "step": 1012 }, { "epoch": 1.241126070991432, "grad_norm": 0.202926903963089, "learning_rate": 2.112244897959184e-05, "loss": 0.3139, "num_tokens": 299465770.0, "step": 1014 }, { "epoch": 1.2435740514075888, "grad_norm": 0.20331041514873505, "learning_rate": 2.1054421768707482e-05, "loss": 0.3133, "num_tokens": 300053388.0, "step": 1016 }, { "epoch": 1.2460220318237454, "grad_norm": 0.20188011229038239, "learning_rate": 2.098639455782313e-05, "loss": 0.3174, "num_tokens": 300639945.0, "step": 1018 }, { "epoch": 1.2484700122399022, "grad_norm": 0.20790152251720428, "learning_rate": 2.091836734693878e-05, "loss": 0.3345, "num_tokens": 301230177.0, "step": 1020 }, { "epoch": 1.2509179926560587, "grad_norm": 0.19395369291305542, "learning_rate": 2.0850340136054422e-05, "loss": 0.3257, "num_tokens": 301830469.0, "step": 1022 }, { "epoch": 1.2533659730722153, "grad_norm": 0.19990698993206024, "learning_rate": 2.078231292517007e-05, "loss": 0.3085, "num_tokens": 302402485.0, "step": 1024 }, { "epoch": 1.255813953488372, "grad_norm": 0.19362080097198486, "learning_rate": 2.0714285714285718e-05, "loss": 0.3146, "num_tokens": 303004905.0, "step": 1026 }, { "epoch": 1.258261933904529, "grad_norm": 0.2057778239250183, "learning_rate": 2.064625850340136e-05, "loss": 0.3113, "num_tokens": 303599350.0, "step": 1028 }, { "epoch": 1.2607099143206855, "grad_norm": 0.20011335611343384, "learning_rate": 2.0578231292517008e-05, "loss": 0.3166, "num_tokens": 304216931.0, "step": 1030 }, { "epoch": 1.263157894736842, "grad_norm": 0.2128947675228119, "learning_rate": 2.0510204081632654e-05, "loss": 0.3143, "num_tokens": 304825541.0, "step": 1032 }, { "epoch": 1.2656058751529988, "grad_norm": 0.18962650001049042, "learning_rate": 2.04421768707483e-05, "loss": 0.3203, "num_tokens": 305415939.0, "step": 1034 }, { "epoch": 1.2680538555691554, "grad_norm": 0.19993481040000916, "learning_rate": 2.0374149659863947e-05, "loss": 0.3154, "num_tokens": 306017718.0, "step": 1036 }, { "epoch": 1.2705018359853122, "grad_norm": 0.22088322043418884, "learning_rate": 2.030612244897959e-05, "loss": 0.3207, "num_tokens": 306632370.0, "step": 1038 }, { "epoch": 1.2729498164014688, "grad_norm": 0.23077014088630676, "learning_rate": 2.023809523809524e-05, "loss": 0.3146, "num_tokens": 307220724.0, "step": 1040 }, { "epoch": 1.2753977968176256, "grad_norm": 0.243485689163208, "learning_rate": 2.0170068027210887e-05, "loss": 0.318, "num_tokens": 307807114.0, "step": 1042 }, { "epoch": 1.2778457772337821, "grad_norm": 0.19597865641117096, "learning_rate": 2.010204081632653e-05, "loss": 0.2989, "num_tokens": 308391074.0, "step": 1044 }, { "epoch": 1.2802937576499387, "grad_norm": 0.20588846504688263, "learning_rate": 2.003401360544218e-05, "loss": 0.3179, "num_tokens": 308994281.0, "step": 1046 }, { "epoch": 1.2827417380660955, "grad_norm": 0.19821475446224213, "learning_rate": 1.9965986394557823e-05, "loss": 0.3171, "num_tokens": 309584100.0, "step": 1048 }, { "epoch": 1.2851897184822523, "grad_norm": 0.20089252293109894, "learning_rate": 1.989795918367347e-05, "loss": 0.3103, "num_tokens": 310173078.0, "step": 1050 }, { "epoch": 1.2876376988984088, "grad_norm": 0.19895407557487488, "learning_rate": 1.9829931972789116e-05, "loss": 0.3143, "num_tokens": 310756371.0, "step": 1052 }, { "epoch": 1.2900856793145654, "grad_norm": 0.20435795187950134, "learning_rate": 1.9761904761904763e-05, "loss": 0.3096, "num_tokens": 311329068.0, "step": 1054 }, { "epoch": 1.2925336597307222, "grad_norm": 0.19608020782470703, "learning_rate": 1.969387755102041e-05, "loss": 0.3267, "num_tokens": 311943569.0, "step": 1056 }, { "epoch": 1.2949816401468788, "grad_norm": 0.1848510205745697, "learning_rate": 1.9625850340136056e-05, "loss": 0.2957, "num_tokens": 312529715.0, "step": 1058 }, { "epoch": 1.2974296205630356, "grad_norm": 0.20560386776924133, "learning_rate": 1.9557823129251702e-05, "loss": 0.303, "num_tokens": 313111336.0, "step": 1060 }, { "epoch": 1.2998776009791921, "grad_norm": 0.20212872326374054, "learning_rate": 1.948979591836735e-05, "loss": 0.3204, "num_tokens": 313717861.0, "step": 1062 }, { "epoch": 1.302325581395349, "grad_norm": 0.19899769127368927, "learning_rate": 1.9421768707482992e-05, "loss": 0.3095, "num_tokens": 314306581.0, "step": 1064 }, { "epoch": 1.3047735618115055, "grad_norm": 0.22068466246128082, "learning_rate": 1.9353741496598642e-05, "loss": 0.2967, "num_tokens": 314870264.0, "step": 1066 }, { "epoch": 1.307221542227662, "grad_norm": 0.2030424326658249, "learning_rate": 1.928571428571429e-05, "loss": 0.3116, "num_tokens": 315449261.0, "step": 1068 }, { "epoch": 1.3096695226438189, "grad_norm": 0.20046113431453705, "learning_rate": 1.921768707482993e-05, "loss": 0.3098, "num_tokens": 316027882.0, "step": 1070 }, { "epoch": 1.3121175030599757, "grad_norm": 0.19095462560653687, "learning_rate": 1.9149659863945578e-05, "loss": 0.3076, "num_tokens": 316628343.0, "step": 1072 }, { "epoch": 1.3145654834761322, "grad_norm": 0.21928201615810394, "learning_rate": 1.9081632653061228e-05, "loss": 0.3259, "num_tokens": 317198024.0, "step": 1074 }, { "epoch": 1.3170134638922888, "grad_norm": 0.2161685675382614, "learning_rate": 1.901360544217687e-05, "loss": 0.3138, "num_tokens": 317772116.0, "step": 1076 }, { "epoch": 1.3194614443084456, "grad_norm": 0.18717218935489655, "learning_rate": 1.8945578231292518e-05, "loss": 0.3151, "num_tokens": 318373768.0, "step": 1078 }, { "epoch": 1.3219094247246022, "grad_norm": 0.20016470551490784, "learning_rate": 1.8877551020408164e-05, "loss": 0.3146, "num_tokens": 318961371.0, "step": 1080 }, { "epoch": 1.324357405140759, "grad_norm": 0.1940920054912567, "learning_rate": 1.880952380952381e-05, "loss": 0.3111, "num_tokens": 319555666.0, "step": 1082 }, { "epoch": 1.3268053855569155, "grad_norm": 0.18782751262187958, "learning_rate": 1.8741496598639457e-05, "loss": 0.3041, "num_tokens": 320139721.0, "step": 1084 }, { "epoch": 1.3292533659730723, "grad_norm": 0.20074118673801422, "learning_rate": 1.8673469387755104e-05, "loss": 0.3199, "num_tokens": 320715131.0, "step": 1086 }, { "epoch": 1.3317013463892289, "grad_norm": 0.19014829397201538, "learning_rate": 1.860544217687075e-05, "loss": 0.3309, "num_tokens": 321315666.0, "step": 1088 }, { "epoch": 1.3341493268053854, "grad_norm": 0.20679143071174622, "learning_rate": 1.8537414965986397e-05, "loss": 0.3134, "num_tokens": 321875487.0, "step": 1090 }, { "epoch": 1.3365973072215422, "grad_norm": 0.21121996641159058, "learning_rate": 1.846938775510204e-05, "loss": 0.3027, "num_tokens": 322452818.0, "step": 1092 }, { "epoch": 1.339045287637699, "grad_norm": 0.1982714831829071, "learning_rate": 1.840136054421769e-05, "loss": 0.3071, "num_tokens": 323038736.0, "step": 1094 }, { "epoch": 1.3414932680538556, "grad_norm": 0.20844446122646332, "learning_rate": 1.8333333333333333e-05, "loss": 0.3131, "num_tokens": 323658114.0, "step": 1096 }, { "epoch": 1.3439412484700122, "grad_norm": 0.20115956664085388, "learning_rate": 1.826530612244898e-05, "loss": 0.3211, "num_tokens": 324245794.0, "step": 1098 }, { "epoch": 1.346389228886169, "grad_norm": 0.1881430596113205, "learning_rate": 1.8197278911564626e-05, "loss": 0.3111, "num_tokens": 324874276.0, "step": 1100 }, { "epoch": 1.3488372093023255, "grad_norm": 0.21025033295154572, "learning_rate": 1.8129251700680272e-05, "loss": 0.3113, "num_tokens": 325448734.0, "step": 1102 }, { "epoch": 1.3512851897184823, "grad_norm": 0.19910575449466705, "learning_rate": 1.806122448979592e-05, "loss": 0.3204, "num_tokens": 326035181.0, "step": 1104 }, { "epoch": 1.353733170134639, "grad_norm": 0.2070266753435135, "learning_rate": 1.7993197278911565e-05, "loss": 0.3179, "num_tokens": 326621091.0, "step": 1106 }, { "epoch": 1.3561811505507957, "grad_norm": 0.2014313042163849, "learning_rate": 1.7925170068027212e-05, "loss": 0.3086, "num_tokens": 327207895.0, "step": 1108 }, { "epoch": 1.3586291309669523, "grad_norm": 0.20598489046096802, "learning_rate": 1.785714285714286e-05, "loss": 0.3239, "num_tokens": 327792682.0, "step": 1110 }, { "epoch": 1.3610771113831088, "grad_norm": 0.1989157795906067, "learning_rate": 1.77891156462585e-05, "loss": 0.3264, "num_tokens": 328402200.0, "step": 1112 }, { "epoch": 1.3635250917992656, "grad_norm": 0.1887408047914505, "learning_rate": 1.772108843537415e-05, "loss": 0.2993, "num_tokens": 329018344.0, "step": 1114 }, { "epoch": 1.3659730722154224, "grad_norm": 0.19435696303844452, "learning_rate": 1.7653061224489798e-05, "loss": 0.2979, "num_tokens": 329597401.0, "step": 1116 }, { "epoch": 1.368421052631579, "grad_norm": 0.19509488344192505, "learning_rate": 1.758503401360544e-05, "loss": 0.3191, "num_tokens": 330218091.0, "step": 1118 }, { "epoch": 1.3708690330477356, "grad_norm": 0.19721415638923645, "learning_rate": 1.7517006802721088e-05, "loss": 0.3221, "num_tokens": 330784836.0, "step": 1120 }, { "epoch": 1.3733170134638923, "grad_norm": 0.1986311972141266, "learning_rate": 1.7448979591836738e-05, "loss": 0.3199, "num_tokens": 331351354.0, "step": 1122 }, { "epoch": 1.375764993880049, "grad_norm": 0.1993299126625061, "learning_rate": 1.738095238095238e-05, "loss": 0.3054, "num_tokens": 331933476.0, "step": 1124 }, { "epoch": 1.3782129742962057, "grad_norm": 0.20406125485897064, "learning_rate": 1.7312925170068027e-05, "loss": 0.3082, "num_tokens": 332523353.0, "step": 1126 }, { "epoch": 1.3806609547123623, "grad_norm": 0.19135580956935883, "learning_rate": 1.7244897959183677e-05, "loss": 0.3071, "num_tokens": 333096257.0, "step": 1128 }, { "epoch": 1.383108935128519, "grad_norm": 0.21359121799468994, "learning_rate": 1.717687074829932e-05, "loss": 0.3246, "num_tokens": 333704506.0, "step": 1130 }, { "epoch": 1.3855569155446756, "grad_norm": 0.1883375346660614, "learning_rate": 1.7108843537414967e-05, "loss": 0.3098, "num_tokens": 334279055.0, "step": 1132 }, { "epoch": 1.3880048959608322, "grad_norm": 0.19225509464740753, "learning_rate": 1.7040816326530613e-05, "loss": 0.3182, "num_tokens": 334885694.0, "step": 1134 }, { "epoch": 1.390452876376989, "grad_norm": 0.20316381752490997, "learning_rate": 1.697278911564626e-05, "loss": 0.3154, "num_tokens": 335482541.0, "step": 1136 }, { "epoch": 1.3929008567931458, "grad_norm": 0.20530924201011658, "learning_rate": 1.6904761904761906e-05, "loss": 0.3179, "num_tokens": 336090417.0, "step": 1138 }, { "epoch": 1.3953488372093024, "grad_norm": 0.20215919613838196, "learning_rate": 1.683673469387755e-05, "loss": 0.3075, "num_tokens": 336652660.0, "step": 1140 }, { "epoch": 1.397796817625459, "grad_norm": 0.19927841424942017, "learning_rate": 1.67687074829932e-05, "loss": 0.3033, "num_tokens": 337236563.0, "step": 1142 }, { "epoch": 1.4002447980416157, "grad_norm": 0.1981232911348343, "learning_rate": 1.6700680272108846e-05, "loss": 0.3087, "num_tokens": 337843323.0, "step": 1144 }, { "epoch": 1.4026927784577723, "grad_norm": 0.19219759106636047, "learning_rate": 1.663265306122449e-05, "loss": 0.3086, "num_tokens": 338432787.0, "step": 1146 }, { "epoch": 1.405140758873929, "grad_norm": 0.1915847808122635, "learning_rate": 1.656462585034014e-05, "loss": 0.3089, "num_tokens": 339035087.0, "step": 1148 }, { "epoch": 1.4075887392900857, "grad_norm": 0.19235005974769592, "learning_rate": 1.6496598639455782e-05, "loss": 0.3208, "num_tokens": 339651239.0, "step": 1150 }, { "epoch": 1.4100367197062424, "grad_norm": 0.19046078622341156, "learning_rate": 1.642857142857143e-05, "loss": 0.3069, "num_tokens": 340245882.0, "step": 1152 }, { "epoch": 1.412484700122399, "grad_norm": 0.1885019838809967, "learning_rate": 1.6360544217687075e-05, "loss": 0.309, "num_tokens": 340840812.0, "step": 1154 }, { "epoch": 1.4149326805385556, "grad_norm": 0.19766561686992645, "learning_rate": 1.629251700680272e-05, "loss": 0.3078, "num_tokens": 341421452.0, "step": 1156 }, { "epoch": 1.4173806609547124, "grad_norm": 0.19824180006980896, "learning_rate": 1.6224489795918368e-05, "loss": 0.3094, "num_tokens": 342020118.0, "step": 1158 }, { "epoch": 1.4198286413708692, "grad_norm": 0.1872597634792328, "learning_rate": 1.6156462585034015e-05, "loss": 0.3152, "num_tokens": 342641497.0, "step": 1160 }, { "epoch": 1.4222766217870257, "grad_norm": 0.19009454548358917, "learning_rate": 1.608843537414966e-05, "loss": 0.3098, "num_tokens": 343240062.0, "step": 1162 }, { "epoch": 1.4247246022031823, "grad_norm": 0.2149920016527176, "learning_rate": 1.6020408163265308e-05, "loss": 0.3095, "num_tokens": 343834833.0, "step": 1164 }, { "epoch": 1.427172582619339, "grad_norm": 0.18525025248527527, "learning_rate": 1.595238095238095e-05, "loss": 0.3033, "num_tokens": 344424327.0, "step": 1166 }, { "epoch": 1.4296205630354957, "grad_norm": 0.19682975113391876, "learning_rate": 1.58843537414966e-05, "loss": 0.3236, "num_tokens": 345023321.0, "step": 1168 }, { "epoch": 1.4320685434516525, "grad_norm": 0.1881849616765976, "learning_rate": 1.5816326530612247e-05, "loss": 0.3155, "num_tokens": 345650727.0, "step": 1170 }, { "epoch": 1.434516523867809, "grad_norm": 0.19169044494628906, "learning_rate": 1.574829931972789e-05, "loss": 0.2988, "num_tokens": 346241537.0, "step": 1172 }, { "epoch": 1.4369645042839658, "grad_norm": 0.19946441054344177, "learning_rate": 1.5680272108843537e-05, "loss": 0.3222, "num_tokens": 346834512.0, "step": 1174 }, { "epoch": 1.4394124847001224, "grad_norm": 0.2189261019229889, "learning_rate": 1.5612244897959187e-05, "loss": 0.3123, "num_tokens": 347427754.0, "step": 1176 }, { "epoch": 1.441860465116279, "grad_norm": 0.19249054789543152, "learning_rate": 1.554421768707483e-05, "loss": 0.3146, "num_tokens": 348010163.0, "step": 1178 }, { "epoch": 1.4443084455324358, "grad_norm": 0.19783203303813934, "learning_rate": 1.5476190476190476e-05, "loss": 0.3115, "num_tokens": 348586523.0, "step": 1180 }, { "epoch": 1.4467564259485923, "grad_norm": 0.19105274975299835, "learning_rate": 1.5408163265306123e-05, "loss": 0.3, "num_tokens": 349176270.0, "step": 1182 }, { "epoch": 1.4492044063647491, "grad_norm": 0.19748911261558533, "learning_rate": 1.534013605442177e-05, "loss": 0.3139, "num_tokens": 349747952.0, "step": 1184 }, { "epoch": 1.4516523867809057, "grad_norm": 0.19224189221858978, "learning_rate": 1.5272108843537416e-05, "loss": 0.3052, "num_tokens": 350345000.0, "step": 1186 }, { "epoch": 1.4541003671970625, "grad_norm": 0.20573937892913818, "learning_rate": 1.520408163265306e-05, "loss": 0.3065, "num_tokens": 350916326.0, "step": 1188 }, { "epoch": 1.456548347613219, "grad_norm": 0.18738609552383423, "learning_rate": 1.5136054421768709e-05, "loss": 0.3116, "num_tokens": 351523950.0, "step": 1190 }, { "epoch": 1.4589963280293758, "grad_norm": 0.19046533107757568, "learning_rate": 1.5068027210884356e-05, "loss": 0.3065, "num_tokens": 352134013.0, "step": 1192 }, { "epoch": 1.4614443084455324, "grad_norm": 0.19589637219905853, "learning_rate": 1.5e-05, "loss": 0.3086, "num_tokens": 352718396.0, "step": 1194 }, { "epoch": 1.4638922888616892, "grad_norm": 0.18863846361637115, "learning_rate": 1.4931972789115647e-05, "loss": 0.3167, "num_tokens": 353339745.0, "step": 1196 }, { "epoch": 1.4663402692778458, "grad_norm": 0.18888550996780396, "learning_rate": 1.4863945578231292e-05, "loss": 0.3001, "num_tokens": 353931150.0, "step": 1198 }, { "epoch": 1.4687882496940023, "grad_norm": 0.1975889354944229, "learning_rate": 1.479591836734694e-05, "loss": 0.3163, "num_tokens": 354522064.0, "step": 1200 }, { "epoch": 1.4712362301101591, "grad_norm": 0.18136459589004517, "learning_rate": 1.4727891156462586e-05, "loss": 0.2994, "num_tokens": 355117113.0, "step": 1202 }, { "epoch": 1.4736842105263157, "grad_norm": 0.19150741398334503, "learning_rate": 1.4659863945578231e-05, "loss": 0.3098, "num_tokens": 355730097.0, "step": 1204 }, { "epoch": 1.4761321909424725, "grad_norm": 0.19516117870807648, "learning_rate": 1.4591836734693878e-05, "loss": 0.306, "num_tokens": 356338488.0, "step": 1206 }, { "epoch": 1.478580171358629, "grad_norm": 0.19421236217021942, "learning_rate": 1.4523809523809526e-05, "loss": 0.3192, "num_tokens": 356939071.0, "step": 1208 }, { "epoch": 1.4810281517747859, "grad_norm": 0.1933913677930832, "learning_rate": 1.445578231292517e-05, "loss": 0.3137, "num_tokens": 357525179.0, "step": 1210 }, { "epoch": 1.4834761321909424, "grad_norm": 0.18762294948101044, "learning_rate": 1.4387755102040817e-05, "loss": 0.3147, "num_tokens": 358154615.0, "step": 1212 }, { "epoch": 1.4859241126070992, "grad_norm": 0.20127852261066437, "learning_rate": 1.4319727891156462e-05, "loss": 0.3112, "num_tokens": 358716821.0, "step": 1214 }, { "epoch": 1.4883720930232558, "grad_norm": 0.19610735774040222, "learning_rate": 1.4251700680272109e-05, "loss": 0.3091, "num_tokens": 359309088.0, "step": 1216 }, { "epoch": 1.4908200734394126, "grad_norm": 0.1936454027891159, "learning_rate": 1.4183673469387757e-05, "loss": 0.3066, "num_tokens": 359882160.0, "step": 1218 }, { "epoch": 1.4932680538555692, "grad_norm": 0.1916629821062088, "learning_rate": 1.4115646258503402e-05, "loss": 0.3134, "num_tokens": 360488038.0, "step": 1220 }, { "epoch": 1.4957160342717257, "grad_norm": 0.18758879601955414, "learning_rate": 1.4047619047619048e-05, "loss": 0.3072, "num_tokens": 361096239.0, "step": 1222 }, { "epoch": 1.4981640146878825, "grad_norm": 0.2060474306344986, "learning_rate": 1.3979591836734696e-05, "loss": 0.3054, "num_tokens": 361655687.0, "step": 1224 }, { "epoch": 1.5006119951040393, "grad_norm": 0.19584183394908905, "learning_rate": 1.391156462585034e-05, "loss": 0.3045, "num_tokens": 362235721.0, "step": 1226 }, { "epoch": 1.5030599755201959, "grad_norm": 0.19787530601024628, "learning_rate": 1.3843537414965988e-05, "loss": 0.3202, "num_tokens": 362845579.0, "step": 1228 }, { "epoch": 1.5055079559363524, "grad_norm": 0.19545872509479523, "learning_rate": 1.3775510204081633e-05, "loss": 0.304, "num_tokens": 363422588.0, "step": 1230 }, { "epoch": 1.5079559363525092, "grad_norm": 0.19126883149147034, "learning_rate": 1.3707482993197279e-05, "loss": 0.3117, "num_tokens": 364012902.0, "step": 1232 }, { "epoch": 1.5104039167686658, "grad_norm": 0.20201963186264038, "learning_rate": 1.3639455782312927e-05, "loss": 0.2967, "num_tokens": 364590381.0, "step": 1234 }, { "epoch": 1.5128518971848224, "grad_norm": 0.19686050713062286, "learning_rate": 1.357142857142857e-05, "loss": 0.3041, "num_tokens": 365171373.0, "step": 1236 }, { "epoch": 1.5152998776009792, "grad_norm": 0.19988536834716797, "learning_rate": 1.3503401360544219e-05, "loss": 0.3078, "num_tokens": 365763431.0, "step": 1238 }, { "epoch": 1.517747858017136, "grad_norm": 0.20150606334209442, "learning_rate": 1.3435374149659865e-05, "loss": 0.3119, "num_tokens": 366380291.0, "step": 1240 }, { "epoch": 1.5201958384332925, "grad_norm": 0.19926296174526215, "learning_rate": 1.336734693877551e-05, "loss": 0.311, "num_tokens": 366967811.0, "step": 1242 }, { "epoch": 1.522643818849449, "grad_norm": 0.19956232607364655, "learning_rate": 1.3299319727891158e-05, "loss": 0.311, "num_tokens": 367561596.0, "step": 1244 }, { "epoch": 1.525091799265606, "grad_norm": 0.19856999814510345, "learning_rate": 1.3231292517006805e-05, "loss": 0.2986, "num_tokens": 368150273.0, "step": 1246 }, { "epoch": 1.5275397796817627, "grad_norm": 0.19735388457775116, "learning_rate": 1.316326530612245e-05, "loss": 0.3032, "num_tokens": 368720930.0, "step": 1248 }, { "epoch": 1.5299877600979193, "grad_norm": 0.19592858850955963, "learning_rate": 1.3095238095238096e-05, "loss": 0.3172, "num_tokens": 369314143.0, "step": 1250 }, { "epoch": 1.5324357405140758, "grad_norm": 0.21145842969417572, "learning_rate": 1.3027210884353741e-05, "loss": 0.302, "num_tokens": 369888591.0, "step": 1252 }, { "epoch": 1.5348837209302326, "grad_norm": 0.19561968743801117, "learning_rate": 1.2959183673469389e-05, "loss": 0.3129, "num_tokens": 370474915.0, "step": 1254 }, { "epoch": 1.5373317013463892, "grad_norm": 0.19179736077785492, "learning_rate": 1.2891156462585036e-05, "loss": 0.2954, "num_tokens": 371038197.0, "step": 1256 }, { "epoch": 1.5397796817625458, "grad_norm": 0.19345691800117493, "learning_rate": 1.282312925170068e-05, "loss": 0.3085, "num_tokens": 371644781.0, "step": 1258 }, { "epoch": 1.5422276621787026, "grad_norm": 0.18695752322673798, "learning_rate": 1.2755102040816327e-05, "loss": 0.2969, "num_tokens": 372236254.0, "step": 1260 }, { "epoch": 1.5446756425948593, "grad_norm": 0.19838133454322815, "learning_rate": 1.2687074829931975e-05, "loss": 0.3077, "num_tokens": 372798274.0, "step": 1262 }, { "epoch": 1.547123623011016, "grad_norm": 0.19439774751663208, "learning_rate": 1.261904761904762e-05, "loss": 0.3076, "num_tokens": 373380435.0, "step": 1264 }, { "epoch": 1.5495716034271725, "grad_norm": 0.19205980002880096, "learning_rate": 1.2551020408163267e-05, "loss": 0.3019, "num_tokens": 373955134.0, "step": 1266 }, { "epoch": 1.5520195838433293, "grad_norm": 0.19605402648448944, "learning_rate": 1.2482993197278913e-05, "loss": 0.2927, "num_tokens": 374540265.0, "step": 1268 }, { "epoch": 1.554467564259486, "grad_norm": 0.19268174469470978, "learning_rate": 1.2414965986394558e-05, "loss": 0.309, "num_tokens": 375154456.0, "step": 1270 }, { "epoch": 1.5569155446756426, "grad_norm": 0.19387708604335785, "learning_rate": 1.2346938775510204e-05, "loss": 0.3106, "num_tokens": 375773065.0, "step": 1272 }, { "epoch": 1.5593635250917992, "grad_norm": 0.1833484023809433, "learning_rate": 1.2278911564625851e-05, "loss": 0.3007, "num_tokens": 376389443.0, "step": 1274 }, { "epoch": 1.561811505507956, "grad_norm": 0.20474021136760712, "learning_rate": 1.2210884353741497e-05, "loss": 0.3062, "num_tokens": 376982296.0, "step": 1276 }, { "epoch": 1.5642594859241126, "grad_norm": 0.1933048814535141, "learning_rate": 1.2142857142857144e-05, "loss": 0.3064, "num_tokens": 377584261.0, "step": 1278 }, { "epoch": 1.5667074663402691, "grad_norm": 0.18160511553287506, "learning_rate": 1.2074829931972789e-05, "loss": 0.3103, "num_tokens": 378197370.0, "step": 1280 }, { "epoch": 1.569155446756426, "grad_norm": 0.2003134787082672, "learning_rate": 1.2006802721088437e-05, "loss": 0.3101, "num_tokens": 378763480.0, "step": 1282 }, { "epoch": 1.5716034271725827, "grad_norm": 0.19226349890232086, "learning_rate": 1.1938775510204082e-05, "loss": 0.3064, "num_tokens": 379348302.0, "step": 1284 }, { "epoch": 1.5740514075887393, "grad_norm": 0.19887998700141907, "learning_rate": 1.1870748299319728e-05, "loss": 0.3039, "num_tokens": 379944029.0, "step": 1286 }, { "epoch": 1.5764993880048959, "grad_norm": 0.1928476095199585, "learning_rate": 1.1802721088435375e-05, "loss": 0.3138, "num_tokens": 380545276.0, "step": 1288 }, { "epoch": 1.5789473684210527, "grad_norm": 0.18576791882514954, "learning_rate": 1.1734693877551021e-05, "loss": 0.3016, "num_tokens": 381165952.0, "step": 1290 }, { "epoch": 1.5813953488372094, "grad_norm": 0.19314470887184143, "learning_rate": 1.1666666666666668e-05, "loss": 0.3055, "num_tokens": 381731938.0, "step": 1292 }, { "epoch": 1.583843329253366, "grad_norm": 0.18967553973197937, "learning_rate": 1.1598639455782313e-05, "loss": 0.3148, "num_tokens": 382347508.0, "step": 1294 }, { "epoch": 1.5862913096695226, "grad_norm": 0.18154136836528778, "learning_rate": 1.153061224489796e-05, "loss": 0.2934, "num_tokens": 382959697.0, "step": 1296 }, { "epoch": 1.5887392900856794, "grad_norm": 0.19338959455490112, "learning_rate": 1.1462585034013606e-05, "loss": 0.3006, "num_tokens": 383562023.0, "step": 1298 }, { "epoch": 1.591187270501836, "grad_norm": 0.184366375207901, "learning_rate": 1.1394557823129252e-05, "loss": 0.3071, "num_tokens": 384162298.0, "step": 1300 }, { "epoch": 1.5936352509179925, "grad_norm": 0.19484610855579376, "learning_rate": 1.1326530612244899e-05, "loss": 0.3044, "num_tokens": 384785150.0, "step": 1302 }, { "epoch": 1.5960832313341493, "grad_norm": 0.19148942828178406, "learning_rate": 1.1258503401360544e-05, "loss": 0.308, "num_tokens": 385378921.0, "step": 1304 }, { "epoch": 1.598531211750306, "grad_norm": 0.18350958824157715, "learning_rate": 1.1190476190476192e-05, "loss": 0.3059, "num_tokens": 385993130.0, "step": 1306 }, { "epoch": 1.6009791921664627, "grad_norm": 0.18966390192508698, "learning_rate": 1.1122448979591837e-05, "loss": 0.3015, "num_tokens": 386558774.0, "step": 1308 }, { "epoch": 1.6034271725826192, "grad_norm": 0.18658103048801422, "learning_rate": 1.1054421768707483e-05, "loss": 0.3085, "num_tokens": 387160513.0, "step": 1310 }, { "epoch": 1.605875152998776, "grad_norm": 0.18809586763381958, "learning_rate": 1.098639455782313e-05, "loss": 0.3059, "num_tokens": 387749539.0, "step": 1312 }, { "epoch": 1.6083231334149328, "grad_norm": 0.18864519894123077, "learning_rate": 1.0918367346938776e-05, "loss": 0.3014, "num_tokens": 388325169.0, "step": 1314 }, { "epoch": 1.6107711138310894, "grad_norm": 0.21195875108242035, "learning_rate": 1.0850340136054423e-05, "loss": 0.3115, "num_tokens": 388917268.0, "step": 1316 }, { "epoch": 1.613219094247246, "grad_norm": 0.19140605628490448, "learning_rate": 1.0782312925170068e-05, "loss": 0.3121, "num_tokens": 389511580.0, "step": 1318 }, { "epoch": 1.6156670746634028, "grad_norm": 0.19427762925624847, "learning_rate": 1.0714285714285714e-05, "loss": 0.3065, "num_tokens": 390087089.0, "step": 1320 }, { "epoch": 1.6181150550795593, "grad_norm": 0.19682861864566803, "learning_rate": 1.0646258503401362e-05, "loss": 0.3018, "num_tokens": 390679866.0, "step": 1322 }, { "epoch": 1.620563035495716, "grad_norm": 0.18663744628429413, "learning_rate": 1.0578231292517007e-05, "loss": 0.3013, "num_tokens": 391280514.0, "step": 1324 }, { "epoch": 1.6230110159118727, "grad_norm": 0.19748732447624207, "learning_rate": 1.0510204081632654e-05, "loss": 0.303, "num_tokens": 391889240.0, "step": 1326 }, { "epoch": 1.6254589963280295, "grad_norm": 0.1948695182800293, "learning_rate": 1.0442176870748298e-05, "loss": 0.2958, "num_tokens": 392471411.0, "step": 1328 }, { "epoch": 1.627906976744186, "grad_norm": 0.19247005879878998, "learning_rate": 1.0374149659863947e-05, "loss": 0.3032, "num_tokens": 393038825.0, "step": 1330 }, { "epoch": 1.6303549571603426, "grad_norm": 0.1950393170118332, "learning_rate": 1.0306122448979593e-05, "loss": 0.3006, "num_tokens": 393629546.0, "step": 1332 }, { "epoch": 1.6328029375764994, "grad_norm": 0.20124682784080505, "learning_rate": 1.0238095238095238e-05, "loss": 0.3153, "num_tokens": 394214582.0, "step": 1334 }, { "epoch": 1.6352509179926562, "grad_norm": 0.19822737574577332, "learning_rate": 1.0170068027210885e-05, "loss": 0.2985, "num_tokens": 394799830.0, "step": 1336 }, { "epoch": 1.6376988984088128, "grad_norm": 0.19310900568962097, "learning_rate": 1.0102040816326531e-05, "loss": 0.3117, "num_tokens": 395383444.0, "step": 1338 }, { "epoch": 1.6401468788249693, "grad_norm": 0.1926649659872055, "learning_rate": 1.0034013605442178e-05, "loss": 0.3028, "num_tokens": 395962925.0, "step": 1340 }, { "epoch": 1.6425948592411261, "grad_norm": 0.1904018372297287, "learning_rate": 9.965986394557824e-06, "loss": 0.3004, "num_tokens": 396542126.0, "step": 1342 }, { "epoch": 1.6450428396572827, "grad_norm": 0.1939857006072998, "learning_rate": 9.897959183673469e-06, "loss": 0.304, "num_tokens": 397126157.0, "step": 1344 }, { "epoch": 1.6474908200734393, "grad_norm": 0.19093608856201172, "learning_rate": 9.829931972789117e-06, "loss": 0.2931, "num_tokens": 397699343.0, "step": 1346 }, { "epoch": 1.649938800489596, "grad_norm": 0.18991172313690186, "learning_rate": 9.761904761904762e-06, "loss": 0.3039, "num_tokens": 398272984.0, "step": 1348 }, { "epoch": 1.6523867809057529, "grad_norm": 0.1857612282037735, "learning_rate": 9.693877551020408e-06, "loss": 0.2986, "num_tokens": 398864706.0, "step": 1350 }, { "epoch": 1.6548347613219094, "grad_norm": 0.19996057450771332, "learning_rate": 9.625850340136055e-06, "loss": 0.2934, "num_tokens": 399430725.0, "step": 1352 }, { "epoch": 1.657282741738066, "grad_norm": 0.18894775211811066, "learning_rate": 9.557823129251701e-06, "loss": 0.2916, "num_tokens": 400017577.0, "step": 1354 }, { "epoch": 1.6597307221542228, "grad_norm": 0.19583207368850708, "learning_rate": 9.489795918367348e-06, "loss": 0.3181, "num_tokens": 400605339.0, "step": 1356 }, { "epoch": 1.6621787025703796, "grad_norm": 0.192321315407753, "learning_rate": 9.421768707482993e-06, "loss": 0.2899, "num_tokens": 401187208.0, "step": 1358 }, { "epoch": 1.6646266829865362, "grad_norm": 0.18309707939624786, "learning_rate": 9.353741496598641e-06, "loss": 0.2947, "num_tokens": 401780772.0, "step": 1360 }, { "epoch": 1.6670746634026927, "grad_norm": 0.1917714923620224, "learning_rate": 9.285714285714286e-06, "loss": 0.3195, "num_tokens": 402372693.0, "step": 1362 }, { "epoch": 1.6695226438188495, "grad_norm": 0.20221072435379028, "learning_rate": 9.217687074829932e-06, "loss": 0.3069, "num_tokens": 402964749.0, "step": 1364 }, { "epoch": 1.671970624235006, "grad_norm": 0.19205594062805176, "learning_rate": 9.149659863945579e-06, "loss": 0.3047, "num_tokens": 403556832.0, "step": 1366 }, { "epoch": 1.6744186046511627, "grad_norm": 0.18873633444309235, "learning_rate": 9.081632653061225e-06, "loss": 0.3088, "num_tokens": 404148708.0, "step": 1368 }, { "epoch": 1.6768665850673194, "grad_norm": 0.18638372421264648, "learning_rate": 9.013605442176872e-06, "loss": 0.3041, "num_tokens": 404747610.0, "step": 1370 }, { "epoch": 1.6793145654834762, "grad_norm": 0.19129733741283417, "learning_rate": 8.945578231292517e-06, "loss": 0.3022, "num_tokens": 405325414.0, "step": 1372 }, { "epoch": 1.6817625458996328, "grad_norm": 0.19730286300182343, "learning_rate": 8.877551020408163e-06, "loss": 0.3104, "num_tokens": 405909420.0, "step": 1374 }, { "epoch": 1.6842105263157894, "grad_norm": 0.19935332238674164, "learning_rate": 8.80952380952381e-06, "loss": 0.3095, "num_tokens": 406467182.0, "step": 1376 }, { "epoch": 1.6866585067319462, "grad_norm": 0.180069699883461, "learning_rate": 8.741496598639456e-06, "loss": 0.3068, "num_tokens": 407084206.0, "step": 1378 }, { "epoch": 1.689106487148103, "grad_norm": 0.18454380333423615, "learning_rate": 8.673469387755103e-06, "loss": 0.3052, "num_tokens": 407692897.0, "step": 1380 }, { "epoch": 1.6915544675642595, "grad_norm": 0.18517330288887024, "learning_rate": 8.605442176870748e-06, "loss": 0.3048, "num_tokens": 408288198.0, "step": 1382 }, { "epoch": 1.694002447980416, "grad_norm": 0.1934649795293808, "learning_rate": 8.537414965986396e-06, "loss": 0.2994, "num_tokens": 408862070.0, "step": 1384 }, { "epoch": 1.696450428396573, "grad_norm": 0.184217169880867, "learning_rate": 8.46938775510204e-06, "loss": 0.2883, "num_tokens": 409401455.0, "step": 1386 }, { "epoch": 1.6988984088127295, "grad_norm": 0.19365178048610687, "learning_rate": 8.401360544217687e-06, "loss": 0.298, "num_tokens": 409981536.0, "step": 1388 }, { "epoch": 1.701346389228886, "grad_norm": 0.1808663010597229, "learning_rate": 8.333333333333334e-06, "loss": 0.305, "num_tokens": 410624142.0, "step": 1390 }, { "epoch": 1.7037943696450428, "grad_norm": 0.18432636559009552, "learning_rate": 8.26530612244898e-06, "loss": 0.2968, "num_tokens": 411217859.0, "step": 1392 }, { "epoch": 1.7062423500611996, "grad_norm": 0.1906876564025879, "learning_rate": 8.197278911564627e-06, "loss": 0.2989, "num_tokens": 411793639.0, "step": 1394 }, { "epoch": 1.7086903304773562, "grad_norm": 0.19258762896060944, "learning_rate": 8.129251700680272e-06, "loss": 0.3059, "num_tokens": 412382835.0, "step": 1396 }, { "epoch": 1.7111383108935128, "grad_norm": 0.1843457818031311, "learning_rate": 8.061224489795918e-06, "loss": 0.2941, "num_tokens": 412978513.0, "step": 1398 }, { "epoch": 1.7135862913096696, "grad_norm": 0.19427083432674408, "learning_rate": 7.993197278911566e-06, "loss": 0.3049, "num_tokens": 413548350.0, "step": 1400 }, { "epoch": 1.7160342717258263, "grad_norm": 0.18227939307689667, "learning_rate": 7.925170068027211e-06, "loss": 0.296, "num_tokens": 414157385.0, "step": 1402 }, { "epoch": 1.718482252141983, "grad_norm": 0.2005896121263504, "learning_rate": 7.857142857142858e-06, "loss": 0.3033, "num_tokens": 414745862.0, "step": 1404 }, { "epoch": 1.7209302325581395, "grad_norm": 0.18954558670520782, "learning_rate": 7.789115646258502e-06, "loss": 0.3048, "num_tokens": 415343977.0, "step": 1406 }, { "epoch": 1.7233782129742963, "grad_norm": 0.1845894306898117, "learning_rate": 7.72108843537415e-06, "loss": 0.2856, "num_tokens": 415928711.0, "step": 1408 }, { "epoch": 1.7258261933904528, "grad_norm": 0.17458276450634003, "learning_rate": 7.653061224489797e-06, "loss": 0.289, "num_tokens": 416548135.0, "step": 1410 }, { "epoch": 1.7282741738066094, "grad_norm": 0.18068827688694, "learning_rate": 7.585034013605442e-06, "loss": 0.3021, "num_tokens": 417136899.0, "step": 1412 }, { "epoch": 1.7307221542227662, "grad_norm": 0.19675485789775848, "learning_rate": 7.5170068027210886e-06, "loss": 0.306, "num_tokens": 417734774.0, "step": 1414 }, { "epoch": 1.733170134638923, "grad_norm": 0.1864624172449112, "learning_rate": 7.448979591836736e-06, "loss": 0.2882, "num_tokens": 418335313.0, "step": 1416 }, { "epoch": 1.7356181150550796, "grad_norm": 0.19027724862098694, "learning_rate": 7.380952380952382e-06, "loss": 0.309, "num_tokens": 418929394.0, "step": 1418 }, { "epoch": 1.7380660954712361, "grad_norm": 0.18862073123455048, "learning_rate": 7.312925170068027e-06, "loss": 0.2948, "num_tokens": 419530495.0, "step": 1420 }, { "epoch": 1.740514075887393, "grad_norm": 0.19908729195594788, "learning_rate": 7.244897959183673e-06, "loss": 0.3114, "num_tokens": 420103794.0, "step": 1422 }, { "epoch": 1.7429620563035497, "grad_norm": 0.18573208153247833, "learning_rate": 7.17687074829932e-06, "loss": 0.3169, "num_tokens": 420709158.0, "step": 1424 }, { "epoch": 1.7454100367197063, "grad_norm": 0.18519924581050873, "learning_rate": 7.108843537414967e-06, "loss": 0.3129, "num_tokens": 421336727.0, "step": 1426 }, { "epoch": 1.7478580171358629, "grad_norm": 0.186129629611969, "learning_rate": 7.0408163265306125e-06, "loss": 0.299, "num_tokens": 421928607.0, "step": 1428 }, { "epoch": 1.7503059975520197, "grad_norm": 0.18803934752941132, "learning_rate": 6.972789115646258e-06, "loss": 0.2962, "num_tokens": 422497348.0, "step": 1430 }, { "epoch": 1.7527539779681762, "grad_norm": 0.19216306507587433, "learning_rate": 6.9047619047619055e-06, "loss": 0.2975, "num_tokens": 423079979.0, "step": 1432 }, { "epoch": 1.7552019583843328, "grad_norm": 0.17923884093761444, "learning_rate": 6.836734693877551e-06, "loss": 0.3021, "num_tokens": 423687525.0, "step": 1434 }, { "epoch": 1.7576499388004896, "grad_norm": 0.17736487090587616, "learning_rate": 6.768707482993198e-06, "loss": 0.2919, "num_tokens": 424262443.0, "step": 1436 }, { "epoch": 1.7600979192166464, "grad_norm": 0.19016993045806885, "learning_rate": 6.700680272108843e-06, "loss": 0.3024, "num_tokens": 424847534.0, "step": 1438 }, { "epoch": 1.762545899632803, "grad_norm": 0.17911458015441895, "learning_rate": 6.632653061224491e-06, "loss": 0.2999, "num_tokens": 425444472.0, "step": 1440 }, { "epoch": 1.7649938800489595, "grad_norm": 0.18407844007015228, "learning_rate": 6.5646258503401364e-06, "loss": 0.3126, "num_tokens": 426049494.0, "step": 1442 }, { "epoch": 1.7674418604651163, "grad_norm": 0.18401886522769928, "learning_rate": 6.496598639455782e-06, "loss": 0.2992, "num_tokens": 426653447.0, "step": 1444 }, { "epoch": 1.769889840881273, "grad_norm": 0.18777307868003845, "learning_rate": 6.428571428571429e-06, "loss": 0.3071, "num_tokens": 427239537.0, "step": 1446 }, { "epoch": 1.7723378212974297, "grad_norm": 0.18919534981250763, "learning_rate": 6.360544217687075e-06, "loss": 0.3087, "num_tokens": 427830403.0, "step": 1448 }, { "epoch": 1.7747858017135862, "grad_norm": 0.1860041618347168, "learning_rate": 6.292517006802722e-06, "loss": 0.3092, "num_tokens": 428446453.0, "step": 1450 }, { "epoch": 1.777233782129743, "grad_norm": 0.18577466905117035, "learning_rate": 6.224489795918367e-06, "loss": 0.3038, "num_tokens": 429059159.0, "step": 1452 }, { "epoch": 1.7796817625458996, "grad_norm": 0.18103350698947906, "learning_rate": 6.156462585034014e-06, "loss": 0.3065, "num_tokens": 429673711.0, "step": 1454 }, { "epoch": 1.7821297429620562, "grad_norm": 0.18147343397140503, "learning_rate": 6.0884353741496595e-06, "loss": 0.2921, "num_tokens": 430239528.0, "step": 1456 }, { "epoch": 1.784577723378213, "grad_norm": 0.17812971770763397, "learning_rate": 6.020408163265306e-06, "loss": 0.2989, "num_tokens": 430837471.0, "step": 1458 }, { "epoch": 1.7870257037943698, "grad_norm": 0.17790105938911438, "learning_rate": 5.9523809523809525e-06, "loss": 0.2953, "num_tokens": 431441012.0, "step": 1460 }, { "epoch": 1.7894736842105263, "grad_norm": 0.20933154225349426, "learning_rate": 5.884353741496599e-06, "loss": 0.3169, "num_tokens": 432042964.0, "step": 1462 }, { "epoch": 1.791921664626683, "grad_norm": 0.1813819408416748, "learning_rate": 5.816326530612245e-06, "loss": 0.309, "num_tokens": 432654843.0, "step": 1464 }, { "epoch": 1.7943696450428397, "grad_norm": 0.1865469366312027, "learning_rate": 5.748299319727891e-06, "loss": 0.2974, "num_tokens": 433233846.0, "step": 1466 }, { "epoch": 1.7968176254589965, "grad_norm": 0.19357292354106903, "learning_rate": 5.680272108843538e-06, "loss": 0.3006, "num_tokens": 433809551.0, "step": 1468 }, { "epoch": 1.799265605875153, "grad_norm": 0.17933765053749084, "learning_rate": 5.612244897959184e-06, "loss": 0.2941, "num_tokens": 434414308.0, "step": 1470 }, { "epoch": 1.8017135862913096, "grad_norm": 0.1841447949409485, "learning_rate": 5.544217687074831e-06, "loss": 0.2983, "num_tokens": 435020164.0, "step": 1472 }, { "epoch": 1.8041615667074664, "grad_norm": 0.18815261125564575, "learning_rate": 5.4761904761904765e-06, "loss": 0.3029, "num_tokens": 435605245.0, "step": 1474 }, { "epoch": 1.806609547123623, "grad_norm": 0.19180665910243988, "learning_rate": 5.408163265306123e-06, "loss": 0.2983, "num_tokens": 436186660.0, "step": 1476 }, { "epoch": 1.8090575275397796, "grad_norm": 0.18017077445983887, "learning_rate": 5.340136054421769e-06, "loss": 0.2973, "num_tokens": 436790859.0, "step": 1478 }, { "epoch": 1.8115055079559363, "grad_norm": 0.19583097100257874, "learning_rate": 5.272108843537415e-06, "loss": 0.3078, "num_tokens": 437359974.0, "step": 1480 }, { "epoch": 1.8139534883720931, "grad_norm": 0.1831379532814026, "learning_rate": 5.204081632653062e-06, "loss": 0.2957, "num_tokens": 437955768.0, "step": 1482 }, { "epoch": 1.8164014687882497, "grad_norm": 0.1913185566663742, "learning_rate": 5.136054421768708e-06, "loss": 0.3036, "num_tokens": 438555223.0, "step": 1484 }, { "epoch": 1.8188494492044063, "grad_norm": 0.17757216095924377, "learning_rate": 5.068027210884354e-06, "loss": 0.3007, "num_tokens": 439148244.0, "step": 1486 }, { "epoch": 1.821297429620563, "grad_norm": 0.17685405910015106, "learning_rate": 5e-06, "loss": 0.2938, "num_tokens": 439759887.0, "step": 1488 }, { "epoch": 1.8237454100367199, "grad_norm": 0.17941422760486603, "learning_rate": 4.931972789115646e-06, "loss": 0.2856, "num_tokens": 440362617.0, "step": 1490 }, { "epoch": 1.8261933904528764, "grad_norm": 0.20091430842876434, "learning_rate": 4.863945578231293e-06, "loss": 0.2981, "num_tokens": 440952427.0, "step": 1492 }, { "epoch": 1.828641370869033, "grad_norm": 0.1875113993883133, "learning_rate": 4.795918367346939e-06, "loss": 0.3039, "num_tokens": 441559643.0, "step": 1494 }, { "epoch": 1.8310893512851898, "grad_norm": 0.21347779035568237, "learning_rate": 4.727891156462586e-06, "loss": 0.3038, "num_tokens": 442151992.0, "step": 1496 }, { "epoch": 1.8335373317013464, "grad_norm": 0.20305398106575012, "learning_rate": 4.659863945578231e-06, "loss": 0.2974, "num_tokens": 442736825.0, "step": 1498 }, { "epoch": 1.835985312117503, "grad_norm": 0.18330271542072296, "learning_rate": 4.591836734693878e-06, "loss": 0.3001, "num_tokens": 443350122.0, "step": 1500 }, { "epoch": 1.8384332925336597, "grad_norm": 0.17919528484344482, "learning_rate": 4.5238095238095235e-06, "loss": 0.2985, "num_tokens": 443921129.0, "step": 1502 }, { "epoch": 1.8408812729498165, "grad_norm": 0.17953024804592133, "learning_rate": 4.455782312925171e-06, "loss": 0.2877, "num_tokens": 444501391.0, "step": 1504 }, { "epoch": 1.843329253365973, "grad_norm": 0.17917679250240326, "learning_rate": 4.3877551020408165e-06, "loss": 0.3003, "num_tokens": 445096945.0, "step": 1506 }, { "epoch": 1.8457772337821297, "grad_norm": 0.1793084740638733, "learning_rate": 4.319727891156463e-06, "loss": 0.2943, "num_tokens": 445709301.0, "step": 1508 }, { "epoch": 1.8482252141982864, "grad_norm": 0.18594089150428772, "learning_rate": 4.251700680272109e-06, "loss": 0.3087, "num_tokens": 446286266.0, "step": 1510 }, { "epoch": 1.8506731946144432, "grad_norm": 0.18956145644187927, "learning_rate": 4.183673469387755e-06, "loss": 0.2836, "num_tokens": 446840279.0, "step": 1512 }, { "epoch": 1.8531211750305998, "grad_norm": 0.1839389055967331, "learning_rate": 4.115646258503402e-06, "loss": 0.3013, "num_tokens": 447457483.0, "step": 1514 }, { "epoch": 1.8555691554467564, "grad_norm": 0.17976173758506775, "learning_rate": 4.047619047619048e-06, "loss": 0.2808, "num_tokens": 448021563.0, "step": 1516 }, { "epoch": 1.8580171358629132, "grad_norm": 0.18919742107391357, "learning_rate": 3.979591836734694e-06, "loss": 0.2889, "num_tokens": 448586869.0, "step": 1518 }, { "epoch": 1.8604651162790697, "grad_norm": 0.19154924154281616, "learning_rate": 3.9115646258503405e-06, "loss": 0.3011, "num_tokens": 449141403.0, "step": 1520 }, { "epoch": 1.8629130966952263, "grad_norm": 0.18337279558181763, "learning_rate": 3.843537414965986e-06, "loss": 0.2943, "num_tokens": 449731655.0, "step": 1522 }, { "epoch": 1.865361077111383, "grad_norm": 0.184468075633049, "learning_rate": 3.775510204081633e-06, "loss": 0.3075, "num_tokens": 450331437.0, "step": 1524 }, { "epoch": 1.86780905752754, "grad_norm": 0.17996063828468323, "learning_rate": 3.7074829931972787e-06, "loss": 0.3127, "num_tokens": 450949263.0, "step": 1526 }, { "epoch": 1.8702570379436965, "grad_norm": 0.1804162710905075, "learning_rate": 3.6394557823129257e-06, "loss": 0.3005, "num_tokens": 451562816.0, "step": 1528 }, { "epoch": 1.872705018359853, "grad_norm": 0.18102356791496277, "learning_rate": 3.5714285714285714e-06, "loss": 0.2919, "num_tokens": 452147375.0, "step": 1530 }, { "epoch": 1.8751529987760098, "grad_norm": 0.18842990696430206, "learning_rate": 3.503401360544218e-06, "loss": 0.3125, "num_tokens": 452751522.0, "step": 1532 }, { "epoch": 1.8776009791921666, "grad_norm": 0.1814049929380417, "learning_rate": 3.435374149659864e-06, "loss": 0.2903, "num_tokens": 453353463.0, "step": 1534 }, { "epoch": 1.880048959608323, "grad_norm": 0.1878400444984436, "learning_rate": 3.3673469387755105e-06, "loss": 0.2972, "num_tokens": 453920518.0, "step": 1536 }, { "epoch": 1.8824969400244798, "grad_norm": 0.18526817858219147, "learning_rate": 3.2993197278911566e-06, "loss": 0.3097, "num_tokens": 454526302.0, "step": 1538 }, { "epoch": 1.8849449204406366, "grad_norm": 0.1885441541671753, "learning_rate": 3.231292517006803e-06, "loss": 0.2978, "num_tokens": 455106980.0, "step": 1540 }, { "epoch": 1.8873929008567931, "grad_norm": 0.1805495172739029, "learning_rate": 3.1632653061224488e-06, "loss": 0.2959, "num_tokens": 455714265.0, "step": 1542 }, { "epoch": 1.8898408812729497, "grad_norm": 0.1821252703666687, "learning_rate": 3.0952380952380953e-06, "loss": 0.2948, "num_tokens": 456305393.0, "step": 1544 }, { "epoch": 1.8922888616891065, "grad_norm": 0.18508251011371613, "learning_rate": 3.027210884353742e-06, "loss": 0.3022, "num_tokens": 456904664.0, "step": 1546 }, { "epoch": 1.8947368421052633, "grad_norm": 0.19073516130447388, "learning_rate": 2.959183673469388e-06, "loss": 0.3073, "num_tokens": 457465974.0, "step": 1548 }, { "epoch": 1.8971848225214198, "grad_norm": 0.18541808426380157, "learning_rate": 2.8911564625850344e-06, "loss": 0.2969, "num_tokens": 458056695.0, "step": 1550 }, { "epoch": 1.8996328029375764, "grad_norm": 0.17976205050945282, "learning_rate": 2.8231292517006805e-06, "loss": 0.3005, "num_tokens": 458648053.0, "step": 1552 }, { "epoch": 1.9020807833537332, "grad_norm": 0.18303455412387848, "learning_rate": 2.7551020408163266e-06, "loss": 0.3029, "num_tokens": 459216811.0, "step": 1554 }, { "epoch": 1.90452876376989, "grad_norm": 0.17484115064144135, "learning_rate": 2.687074829931973e-06, "loss": 0.3059, "num_tokens": 459842266.0, "step": 1556 }, { "epoch": 1.9069767441860463, "grad_norm": 0.18087215721607208, "learning_rate": 2.6190476190476192e-06, "loss": 0.2958, "num_tokens": 460423041.0, "step": 1558 }, { "epoch": 1.9094247246022031, "grad_norm": 0.1765015572309494, "learning_rate": 2.5510204081632653e-06, "loss": 0.2989, "num_tokens": 461028921.0, "step": 1560 }, { "epoch": 1.91187270501836, "grad_norm": 0.18521203100681305, "learning_rate": 2.482993197278912e-06, "loss": 0.3095, "num_tokens": 461614263.0, "step": 1562 }, { "epoch": 1.9143206854345165, "grad_norm": 0.17324110865592957, "learning_rate": 2.414965986394558e-06, "loss": 0.2984, "num_tokens": 462219672.0, "step": 1564 }, { "epoch": 1.916768665850673, "grad_norm": 0.1875661462545395, "learning_rate": 2.346938775510204e-06, "loss": 0.299, "num_tokens": 462786923.0, "step": 1566 }, { "epoch": 1.9192166462668299, "grad_norm": 0.1870165467262268, "learning_rate": 2.2789115646258505e-06, "loss": 0.3036, "num_tokens": 463378442.0, "step": 1568 }, { "epoch": 1.9216646266829867, "grad_norm": 0.18141113221645355, "learning_rate": 2.2108843537414966e-06, "loss": 0.2994, "num_tokens": 463959340.0, "step": 1570 }, { "epoch": 1.9241126070991432, "grad_norm": 0.17851807177066803, "learning_rate": 2.142857142857143e-06, "loss": 0.3003, "num_tokens": 464550595.0, "step": 1572 }, { "epoch": 1.9265605875152998, "grad_norm": 0.1808425635099411, "learning_rate": 2.0748299319727892e-06, "loss": 0.2877, "num_tokens": 465127457.0, "step": 1574 }, { "epoch": 1.9290085679314566, "grad_norm": 0.18613682687282562, "learning_rate": 2.0068027210884353e-06, "loss": 0.3018, "num_tokens": 465688708.0, "step": 1576 }, { "epoch": 1.9314565483476134, "grad_norm": 0.17576543986797333, "learning_rate": 1.938775510204082e-06, "loss": 0.3024, "num_tokens": 466310661.0, "step": 1578 }, { "epoch": 1.9339045287637697, "grad_norm": 0.21868683397769928, "learning_rate": 1.870748299319728e-06, "loss": 0.3008, "num_tokens": 466914190.0, "step": 1580 }, { "epoch": 1.9363525091799265, "grad_norm": 0.17602218687534332, "learning_rate": 1.8027210884353743e-06, "loss": 0.3076, "num_tokens": 467542417.0, "step": 1582 }, { "epoch": 1.9388004895960833, "grad_norm": 0.1808781772851944, "learning_rate": 1.7346938775510206e-06, "loss": 0.3059, "num_tokens": 468147376.0, "step": 1584 }, { "epoch": 1.9412484700122399, "grad_norm": 0.18338029086589813, "learning_rate": 1.6666666666666667e-06, "loss": 0.3034, "num_tokens": 468759897.0, "step": 1586 }, { "epoch": 1.9436964504283964, "grad_norm": 0.17754128575325012, "learning_rate": 1.598639455782313e-06, "loss": 0.3073, "num_tokens": 469367956.0, "step": 1588 }, { "epoch": 1.9461444308445532, "grad_norm": 0.17857308685779572, "learning_rate": 1.5306122448979593e-06, "loss": 0.3058, "num_tokens": 469959880.0, "step": 1590 }, { "epoch": 1.94859241126071, "grad_norm": 0.18219216167926788, "learning_rate": 1.4625850340136054e-06, "loss": 0.2934, "num_tokens": 470552183.0, "step": 1592 }, { "epoch": 1.9510403916768666, "grad_norm": 0.17829884588718414, "learning_rate": 1.3945578231292517e-06, "loss": 0.306, "num_tokens": 471124861.0, "step": 1594 }, { "epoch": 1.9534883720930232, "grad_norm": 0.17285406589508057, "learning_rate": 1.326530612244898e-06, "loss": 0.2834, "num_tokens": 471746645.0, "step": 1596 }, { "epoch": 1.95593635250918, "grad_norm": 0.1866435408592224, "learning_rate": 1.2585034013605443e-06, "loss": 0.304, "num_tokens": 472348999.0, "step": 1598 }, { "epoch": 1.9583843329253368, "grad_norm": 0.17827358841896057, "learning_rate": 1.1904761904761904e-06, "loss": 0.3023, "num_tokens": 472943557.0, "step": 1600 }, { "epoch": 1.960832313341493, "grad_norm": 0.1909848302602768, "learning_rate": 1.1224489795918367e-06, "loss": 0.2929, "num_tokens": 473517613.0, "step": 1602 }, { "epoch": 1.96328029375765, "grad_norm": 0.18143118917942047, "learning_rate": 1.054421768707483e-06, "loss": 0.3029, "num_tokens": 474128167.0, "step": 1604 }, { "epoch": 1.9657282741738067, "grad_norm": 0.183173269033432, "learning_rate": 9.863945578231293e-07, "loss": 0.2993, "num_tokens": 474698870.0, "step": 1606 }, { "epoch": 1.9681762545899633, "grad_norm": 0.17815543711185455, "learning_rate": 9.183673469387756e-07, "loss": 0.2956, "num_tokens": 475292232.0, "step": 1608 }, { "epoch": 1.9706242350061198, "grad_norm": 0.17755338549613953, "learning_rate": 8.503401360544219e-07, "loss": 0.3021, "num_tokens": 475896729.0, "step": 1610 }, { "epoch": 1.9730722154222766, "grad_norm": 0.18035194277763367, "learning_rate": 7.823129251700681e-07, "loss": 0.3013, "num_tokens": 476487140.0, "step": 1612 }, { "epoch": 1.9755201958384334, "grad_norm": 0.17756494879722595, "learning_rate": 7.142857142857143e-07, "loss": 0.3029, "num_tokens": 477092050.0, "step": 1614 }, { "epoch": 1.97796817625459, "grad_norm": 0.18439583480358124, "learning_rate": 6.462585034013605e-07, "loss": 0.2992, "num_tokens": 477694927.0, "step": 1616 }, { "epoch": 1.9804161566707466, "grad_norm": 0.18490420281887054, "learning_rate": 5.782312925170068e-07, "loss": 0.2956, "num_tokens": 478253698.0, "step": 1618 }, { "epoch": 1.9828641370869033, "grad_norm": 0.1895751804113388, "learning_rate": 5.10204081632653e-07, "loss": 0.3009, "num_tokens": 478839057.0, "step": 1620 }, { "epoch": 1.9853121175030601, "grad_norm": 0.17537181079387665, "learning_rate": 4.421768707482994e-07, "loss": 0.2972, "num_tokens": 479421346.0, "step": 1622 }, { "epoch": 1.9877600979192165, "grad_norm": 0.18184703588485718, "learning_rate": 3.741496598639456e-07, "loss": 0.2854, "num_tokens": 479966942.0, "step": 1624 }, { "epoch": 1.9902080783353733, "grad_norm": 0.17638690769672394, "learning_rate": 3.0612244897959183e-07, "loss": 0.2927, "num_tokens": 480573283.0, "step": 1626 }, { "epoch": 1.99265605875153, "grad_norm": 0.1770249903202057, "learning_rate": 2.3809523809523814e-07, "loss": 0.2896, "num_tokens": 481160679.0, "step": 1628 }, { "epoch": 1.9951040391676866, "grad_norm": 0.173421248793602, "learning_rate": 1.7006802721088437e-07, "loss": 0.2902, "num_tokens": 481747619.0, "step": 1630 }, { "epoch": 1.9975520195838432, "grad_norm": 0.17720605432987213, "learning_rate": 1.0204081632653062e-07, "loss": 0.2903, "num_tokens": 482345468.0, "step": 1632 }, { "epoch": 2.0, "grad_norm": 0.17569027841091156, "learning_rate": 3.401360544217687e-08, "loss": 0.2883, "num_tokens": 482921493.0, "step": 1634 }, { "epoch": 2.0, "step": 1634, "total_flos": 1.9956949564627354e+19, "train_loss": 0.37813405603206873, "train_runtime": 14382.5359, "train_samples_per_second": 12.717, "train_steps_per_second": 0.114 } ], "logging_steps": 2, "max_steps": 1634, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 82, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9956949564627354e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }