{ "best_global_step": 25200, "best_metric": 0.2112353891134262, "best_model_checkpoint": "saves/prompt-tuning/gemma-3-1b-it/train_boolq_1745950272/checkpoint-25200", "epoch": 18.85902876001886, "eval_steps": 200, "global_step": 40000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023573785950023575, "grad_norm": 31.988269805908203, "learning_rate": 0.29999999259779675, "loss": 10.4129, "num_input_tokens_seen": 4304, "step": 5 }, { "epoch": 0.004714757190004715, "grad_norm": 20.223493576049805, "learning_rate": 0.29999996252634736, "loss": 12.1618, "num_input_tokens_seen": 8432, "step": 10 }, { "epoch": 0.007072135785007072, "grad_norm": 4.1196746826171875, "learning_rate": 0.2999999093230187, "loss": 9.0938, "num_input_tokens_seen": 12688, "step": 15 }, { "epoch": 0.00942951438000943, "grad_norm": 3.138713836669922, "learning_rate": 0.299999832987819, "loss": 4.8869, "num_input_tokens_seen": 17360, "step": 20 }, { "epoch": 0.011786892975011787, "grad_norm": 1.5348763465881348, "learning_rate": 0.29999973352076004, "loss": 2.1921, "num_input_tokens_seen": 21808, "step": 25 }, { "epoch": 0.014144271570014143, "grad_norm": 1.0330103635787964, "learning_rate": 0.2999996109218572, "loss": 1.3614, "num_input_tokens_seen": 25952, "step": 30 }, { "epoch": 0.0165016501650165, "grad_norm": 0.6571065783500671, "learning_rate": 0.2999994651911293, "loss": 1.008, "num_input_tokens_seen": 29568, "step": 35 }, { "epoch": 0.01885902876001886, "grad_norm": 1.2071079015731812, "learning_rate": 0.2999992963285989, "loss": 0.599, "num_input_tokens_seen": 33712, "step": 40 }, { "epoch": 0.021216407355021217, "grad_norm": 0.5068820118904114, "learning_rate": 0.29999910433429194, "loss": 0.8035, "num_input_tokens_seen": 37856, "step": 45 }, { "epoch": 0.023573785950023574, "grad_norm": 0.16513298451900482, "learning_rate": 0.29999888920823814, "loss": 0.3537, "num_input_tokens_seen": 41936, "step": 50 }, { "epoch": 0.02593116454502593, "grad_norm": 0.5797112584114075, "learning_rate": 0.29999865095047057, "loss": 0.4111, "num_input_tokens_seen": 46448, "step": 55 }, { "epoch": 0.028288543140028287, "grad_norm": 0.3267705738544464, "learning_rate": 0.29999838956102604, "loss": 0.4687, "num_input_tokens_seen": 51120, "step": 60 }, { "epoch": 0.030645921735030647, "grad_norm": 0.09841182082891464, "learning_rate": 0.29999810503994484, "loss": 0.2947, "num_input_tokens_seen": 55520, "step": 65 }, { "epoch": 0.033003300330033, "grad_norm": 0.10266968607902527, "learning_rate": 0.29999779738727084, "loss": 0.3085, "num_input_tokens_seen": 60496, "step": 70 }, { "epoch": 0.03536067892503536, "grad_norm": 0.20409666001796722, "learning_rate": 0.29999746660305154, "loss": 0.2472, "num_input_tokens_seen": 64368, "step": 75 }, { "epoch": 0.03771805752003772, "grad_norm": 0.17530688643455505, "learning_rate": 0.2999971126873379, "loss": 0.5316, "num_input_tokens_seen": 68720, "step": 80 }, { "epoch": 0.040075436115040074, "grad_norm": 0.1986703872680664, "learning_rate": 0.2999967356401845, "loss": 0.3228, "num_input_tokens_seen": 72592, "step": 85 }, { "epoch": 0.042432814710042434, "grad_norm": 0.10162245482206345, "learning_rate": 0.29999633546164944, "loss": 0.3781, "num_input_tokens_seen": 76384, "step": 90 }, { "epoch": 0.04479019330504479, "grad_norm": 0.2918356657028198, "learning_rate": 0.29999591215179444, "loss": 0.3153, "num_input_tokens_seen": 80224, "step": 95 }, { "epoch": 0.04714757190004715, "grad_norm": 0.11471439898014069, "learning_rate": 0.2999954657106849, "loss": 0.2848, "num_input_tokens_seen": 84704, "step": 100 }, { "epoch": 0.04950495049504951, "grad_norm": 0.16378742456436157, "learning_rate": 0.2999949961383896, "loss": 0.2694, "num_input_tokens_seen": 88688, "step": 105 }, { "epoch": 0.05186232909005186, "grad_norm": 0.31704339385032654, "learning_rate": 0.2999945034349809, "loss": 0.6022, "num_input_tokens_seen": 93184, "step": 110 }, { "epoch": 0.05421970768505422, "grad_norm": 0.08817286789417267, "learning_rate": 0.2999939876005348, "loss": 0.3188, "num_input_tokens_seen": 98192, "step": 115 }, { "epoch": 0.056577086280056574, "grad_norm": 0.43094635009765625, "learning_rate": 0.29999344863513094, "loss": 0.4523, "num_input_tokens_seen": 102912, "step": 120 }, { "epoch": 0.058934464875058934, "grad_norm": 0.22856155037879944, "learning_rate": 0.2999928865388523, "loss": 0.4735, "num_input_tokens_seen": 108384, "step": 125 }, { "epoch": 0.061291843470061294, "grad_norm": 0.20460647344589233, "learning_rate": 0.29999230131178567, "loss": 0.3447, "num_input_tokens_seen": 113248, "step": 130 }, { "epoch": 0.06364922206506365, "grad_norm": 0.13002483546733856, "learning_rate": 0.2999916929540212, "loss": 0.4521, "num_input_tokens_seen": 117872, "step": 135 }, { "epoch": 0.066006600660066, "grad_norm": 0.1392366588115692, "learning_rate": 0.29999106146565285, "loss": 0.3571, "num_input_tokens_seen": 122176, "step": 140 }, { "epoch": 0.06836397925506836, "grad_norm": 0.13612762093544006, "learning_rate": 0.29999040684677786, "loss": 0.2907, "num_input_tokens_seen": 126432, "step": 145 }, { "epoch": 0.07072135785007072, "grad_norm": 0.08337651938199997, "learning_rate": 0.2999897290974972, "loss": 0.2691, "num_input_tokens_seen": 130576, "step": 150 }, { "epoch": 0.07307873644507308, "grad_norm": 0.046880874782800674, "learning_rate": 0.2999890282179155, "loss": 0.2996, "num_input_tokens_seen": 135184, "step": 155 }, { "epoch": 0.07543611504007544, "grad_norm": 0.1292787343263626, "learning_rate": 0.29998830420814077, "loss": 0.2249, "num_input_tokens_seen": 139248, "step": 160 }, { "epoch": 0.07779349363507779, "grad_norm": 0.07504863291978836, "learning_rate": 0.2999875570682846, "loss": 0.328, "num_input_tokens_seen": 143984, "step": 165 }, { "epoch": 0.08015087223008015, "grad_norm": 0.10210473835468292, "learning_rate": 0.2999867867984623, "loss": 0.3487, "num_input_tokens_seen": 148704, "step": 170 }, { "epoch": 0.08250825082508251, "grad_norm": 0.10031376034021378, "learning_rate": 0.29998599339879267, "loss": 0.2901, "num_input_tokens_seen": 153440, "step": 175 }, { "epoch": 0.08486562942008487, "grad_norm": 0.10620614141225815, "learning_rate": 0.29998517686939796, "loss": 0.3292, "num_input_tokens_seen": 157264, "step": 180 }, { "epoch": 0.08722300801508723, "grad_norm": 0.06989361345767975, "learning_rate": 0.29998433721040413, "loss": 0.3229, "num_input_tokens_seen": 161840, "step": 185 }, { "epoch": 0.08958038661008957, "grad_norm": 0.037209998816251755, "learning_rate": 0.29998347442194073, "loss": 0.2519, "num_input_tokens_seen": 165616, "step": 190 }, { "epoch": 0.09193776520509193, "grad_norm": 0.03803081810474396, "learning_rate": 0.2999825885041407, "loss": 0.2571, "num_input_tokens_seen": 169760, "step": 195 }, { "epoch": 0.0942951438000943, "grad_norm": 0.08795905113220215, "learning_rate": 0.29998167945714077, "loss": 0.2964, "num_input_tokens_seen": 174096, "step": 200 }, { "epoch": 0.0942951438000943, "eval_loss": 0.3283520042896271, "eval_runtime": 21.8999, "eval_samples_per_second": 43.06, "eval_steps_per_second": 21.553, "num_input_tokens_seen": 174096, "step": 200 }, { "epoch": 0.09665252239509665, "grad_norm": 0.03700771927833557, "learning_rate": 0.2999807472810811, "loss": 0.3905, "num_input_tokens_seen": 178704, "step": 205 }, { "epoch": 0.09900990099009901, "grad_norm": 0.032493144273757935, "learning_rate": 0.29997979197610536, "loss": 0.26, "num_input_tokens_seen": 182912, "step": 210 }, { "epoch": 0.10136727958510136, "grad_norm": 0.05959407612681389, "learning_rate": 0.299978813542361, "loss": 0.2928, "num_input_tokens_seen": 186880, "step": 215 }, { "epoch": 0.10372465818010372, "grad_norm": 0.08966787159442902, "learning_rate": 0.2999778119799988, "loss": 0.2034, "num_input_tokens_seen": 190976, "step": 220 }, { "epoch": 0.10608203677510608, "grad_norm": 0.06561319530010223, "learning_rate": 0.29997678728917326, "loss": 0.2247, "num_input_tokens_seen": 195136, "step": 225 }, { "epoch": 0.10843941537010844, "grad_norm": 0.06829624623060226, "learning_rate": 0.2999757394700424, "loss": 0.2597, "num_input_tokens_seen": 199376, "step": 230 }, { "epoch": 0.1107967939651108, "grad_norm": 0.33134883642196655, "learning_rate": 0.29997466852276783, "loss": 0.4023, "num_input_tokens_seen": 203104, "step": 235 }, { "epoch": 0.11315417256011315, "grad_norm": 0.1378246247768402, "learning_rate": 0.29997357444751466, "loss": 0.3438, "num_input_tokens_seen": 207440, "step": 240 }, { "epoch": 0.11551155115511551, "grad_norm": 0.1637970358133316, "learning_rate": 0.2999724572444516, "loss": 0.2485, "num_input_tokens_seen": 211520, "step": 245 }, { "epoch": 0.11786892975011787, "grad_norm": 0.33689868450164795, "learning_rate": 0.29997131691375095, "loss": 0.4969, "num_input_tokens_seen": 216224, "step": 250 }, { "epoch": 0.12022630834512023, "grad_norm": 0.17855975031852722, "learning_rate": 0.2999701534555886, "loss": 0.4601, "num_input_tokens_seen": 220352, "step": 255 }, { "epoch": 0.12258368694012259, "grad_norm": 0.12806305289268494, "learning_rate": 0.2999689668701439, "loss": 0.3606, "num_input_tokens_seen": 224352, "step": 260 }, { "epoch": 0.12494106553512493, "grad_norm": 0.04102272540330887, "learning_rate": 0.29996775715759993, "loss": 0.2911, "num_input_tokens_seen": 228240, "step": 265 }, { "epoch": 0.1272984441301273, "grad_norm": 0.029556559398770332, "learning_rate": 0.2999665243181432, "loss": 0.2242, "num_input_tokens_seen": 232304, "step": 270 }, { "epoch": 0.12965582272512965, "grad_norm": 0.04120897129178047, "learning_rate": 0.2999652683519638, "loss": 0.1807, "num_input_tokens_seen": 236464, "step": 275 }, { "epoch": 0.132013201320132, "grad_norm": 0.07952621579170227, "learning_rate": 0.29996398925925544, "loss": 0.3414, "num_input_tokens_seen": 241440, "step": 280 }, { "epoch": 0.13437057991513437, "grad_norm": 0.10197393596172333, "learning_rate": 0.2999626870402154, "loss": 0.3377, "num_input_tokens_seen": 245104, "step": 285 }, { "epoch": 0.13672795851013672, "grad_norm": 0.07283695042133331, "learning_rate": 0.29996136169504445, "loss": 0.3599, "num_input_tokens_seen": 249552, "step": 290 }, { "epoch": 0.1390853371051391, "grad_norm": 0.036499980837106705, "learning_rate": 0.29996001322394694, "loss": 0.2603, "num_input_tokens_seen": 253232, "step": 295 }, { "epoch": 0.14144271570014144, "grad_norm": 0.07302112132310867, "learning_rate": 0.29995864162713093, "loss": 0.283, "num_input_tokens_seen": 257456, "step": 300 }, { "epoch": 0.1438000942951438, "grad_norm": 0.02899056300520897, "learning_rate": 0.2999572469048079, "loss": 0.4889, "num_input_tokens_seen": 262112, "step": 305 }, { "epoch": 0.14615747289014616, "grad_norm": 0.04443933069705963, "learning_rate": 0.29995582905719287, "loss": 0.2237, "num_input_tokens_seen": 266208, "step": 310 }, { "epoch": 0.1485148514851485, "grad_norm": 0.05909077078104019, "learning_rate": 0.2999543880845046, "loss": 0.2465, "num_input_tokens_seen": 269792, "step": 315 }, { "epoch": 0.15087223008015088, "grad_norm": 0.03209182620048523, "learning_rate": 0.2999529239869652, "loss": 0.2853, "num_input_tokens_seen": 274000, "step": 320 }, { "epoch": 0.15322960867515323, "grad_norm": 0.04247352480888367, "learning_rate": 0.2999514367648005, "loss": 0.2556, "num_input_tokens_seen": 278304, "step": 325 }, { "epoch": 0.15558698727015557, "grad_norm": 0.1382414549589157, "learning_rate": 0.29994992641823987, "loss": 0.2522, "num_input_tokens_seen": 282976, "step": 330 }, { "epoch": 0.15794436586515795, "grad_norm": 0.1388586014509201, "learning_rate": 0.29994839294751613, "loss": 0.2722, "num_input_tokens_seen": 287584, "step": 335 }, { "epoch": 0.1603017444601603, "grad_norm": 0.06847704946994781, "learning_rate": 0.29994683635286584, "loss": 0.254, "num_input_tokens_seen": 293632, "step": 340 }, { "epoch": 0.16265912305516267, "grad_norm": 0.15747271478176117, "learning_rate": 0.2999452566345291, "loss": 0.2665, "num_input_tokens_seen": 297712, "step": 345 }, { "epoch": 0.16501650165016502, "grad_norm": 0.45045727491378784, "learning_rate": 0.2999436537927494, "loss": 0.6767, "num_input_tokens_seen": 302240, "step": 350 }, { "epoch": 0.16737388024516736, "grad_norm": 0.05649186298251152, "learning_rate": 0.299942027827774, "loss": 0.3598, "num_input_tokens_seen": 306336, "step": 355 }, { "epoch": 0.16973125884016974, "grad_norm": 0.06870386004447937, "learning_rate": 0.29994037873985363, "loss": 0.3187, "num_input_tokens_seen": 310864, "step": 360 }, { "epoch": 0.17208863743517208, "grad_norm": 0.08070796728134155, "learning_rate": 0.29993870652924254, "loss": 0.2599, "num_input_tokens_seen": 315072, "step": 365 }, { "epoch": 0.17444601603017446, "grad_norm": 0.09333045780658722, "learning_rate": 0.29993701119619876, "loss": 0.3068, "num_input_tokens_seen": 319552, "step": 370 }, { "epoch": 0.1768033946251768, "grad_norm": 0.05553876981139183, "learning_rate": 0.2999352927409835, "loss": 0.3616, "num_input_tokens_seen": 323952, "step": 375 }, { "epoch": 0.17916077322017915, "grad_norm": 0.04436880350112915, "learning_rate": 0.29993355116386194, "loss": 0.2284, "num_input_tokens_seen": 327856, "step": 380 }, { "epoch": 0.18151815181518152, "grad_norm": 0.011247720569372177, "learning_rate": 0.29993178646510266, "loss": 0.1212, "num_input_tokens_seen": 332384, "step": 385 }, { "epoch": 0.18387553041018387, "grad_norm": 0.05439724773168564, "learning_rate": 0.2999299986449777, "loss": 0.353, "num_input_tokens_seen": 336992, "step": 390 }, { "epoch": 0.18623290900518624, "grad_norm": 0.06764350831508636, "learning_rate": 0.29992818770376284, "loss": 0.2543, "num_input_tokens_seen": 340592, "step": 395 }, { "epoch": 0.1885902876001886, "grad_norm": 0.055654413998126984, "learning_rate": 0.29992635364173725, "loss": 0.3882, "num_input_tokens_seen": 344560, "step": 400 }, { "epoch": 0.1885902876001886, "eval_loss": 0.25987884402275085, "eval_runtime": 21.9212, "eval_samples_per_second": 43.018, "eval_steps_per_second": 21.532, "num_input_tokens_seen": 344560, "step": 400 }, { "epoch": 0.19094766619519093, "grad_norm": 0.0670490488409996, "learning_rate": 0.2999244964591839, "loss": 0.3973, "num_input_tokens_seen": 349088, "step": 405 }, { "epoch": 0.1933050447901933, "grad_norm": 0.03140193969011307, "learning_rate": 0.2999226161563891, "loss": 0.3203, "num_input_tokens_seen": 353472, "step": 410 }, { "epoch": 0.19566242338519566, "grad_norm": 0.02117753215134144, "learning_rate": 0.2999207127336429, "loss": 0.2315, "num_input_tokens_seen": 357904, "step": 415 }, { "epoch": 0.19801980198019803, "grad_norm": 0.037303805351257324, "learning_rate": 0.2999187861912387, "loss": 0.2653, "num_input_tokens_seen": 362448, "step": 420 }, { "epoch": 0.20037718057520038, "grad_norm": 0.018823646008968353, "learning_rate": 0.2999168365294737, "loss": 0.2847, "num_input_tokens_seen": 367744, "step": 425 }, { "epoch": 0.20273455917020272, "grad_norm": 0.028078552335500717, "learning_rate": 0.29991486374864856, "loss": 0.2498, "num_input_tokens_seen": 371920, "step": 430 }, { "epoch": 0.2050919377652051, "grad_norm": 0.017890460789203644, "learning_rate": 0.29991286784906745, "loss": 0.2245, "num_input_tokens_seen": 376832, "step": 435 }, { "epoch": 0.20744931636020744, "grad_norm": 0.03922800347208977, "learning_rate": 0.2999108488310382, "loss": 0.2555, "num_input_tokens_seen": 380800, "step": 440 }, { "epoch": 0.20980669495520982, "grad_norm": 0.01561557874083519, "learning_rate": 0.29990880669487213, "loss": 0.3052, "num_input_tokens_seen": 384736, "step": 445 }, { "epoch": 0.21216407355021216, "grad_norm": 0.03607628121972084, "learning_rate": 0.29990674144088425, "loss": 0.2627, "num_input_tokens_seen": 389840, "step": 450 }, { "epoch": 0.2145214521452145, "grad_norm": 0.012515510432422161, "learning_rate": 0.299904653069393, "loss": 0.2101, "num_input_tokens_seen": 393680, "step": 455 }, { "epoch": 0.21687883074021688, "grad_norm": 0.021081510931253433, "learning_rate": 0.29990254158072044, "loss": 0.2589, "num_input_tokens_seen": 397328, "step": 460 }, { "epoch": 0.21923620933521923, "grad_norm": 0.01580131985247135, "learning_rate": 0.2999004069751921, "loss": 0.2122, "num_input_tokens_seen": 401648, "step": 465 }, { "epoch": 0.2215935879302216, "grad_norm": 0.050282612442970276, "learning_rate": 0.2998982492531373, "loss": 0.3045, "num_input_tokens_seen": 405744, "step": 470 }, { "epoch": 0.22395096652522395, "grad_norm": 0.01552183460444212, "learning_rate": 0.2998960684148887, "loss": 0.2318, "num_input_tokens_seen": 410016, "step": 475 }, { "epoch": 0.2263083451202263, "grad_norm": 0.027648374438285828, "learning_rate": 0.29989386446078264, "loss": 0.2629, "num_input_tokens_seen": 413680, "step": 480 }, { "epoch": 0.22866572371522867, "grad_norm": 0.030563373118638992, "learning_rate": 0.299891637391159, "loss": 0.2688, "num_input_tokens_seen": 418064, "step": 485 }, { "epoch": 0.23102310231023102, "grad_norm": 0.03588169813156128, "learning_rate": 0.2998893872063612, "loss": 0.2644, "num_input_tokens_seen": 422400, "step": 490 }, { "epoch": 0.2333804809052334, "grad_norm": 0.012404967099428177, "learning_rate": 0.2998871139067363, "loss": 0.2515, "num_input_tokens_seen": 426688, "step": 495 }, { "epoch": 0.23573785950023574, "grad_norm": 0.0213252454996109, "learning_rate": 0.2998848174926348, "loss": 0.231, "num_input_tokens_seen": 431264, "step": 500 }, { "epoch": 0.23809523809523808, "grad_norm": 0.018889490514993668, "learning_rate": 0.2998824979644109, "loss": 0.2347, "num_input_tokens_seen": 435184, "step": 505 }, { "epoch": 0.24045261669024046, "grad_norm": 0.03740219399333, "learning_rate": 0.29988015532242224, "loss": 0.2503, "num_input_tokens_seen": 439376, "step": 510 }, { "epoch": 0.2428099952852428, "grad_norm": 0.010630019009113312, "learning_rate": 0.29987778956703015, "loss": 0.2142, "num_input_tokens_seen": 443472, "step": 515 }, { "epoch": 0.24516737388024518, "grad_norm": 0.014807060360908508, "learning_rate": 0.2998754006985994, "loss": 0.2627, "num_input_tokens_seen": 447904, "step": 520 }, { "epoch": 0.24752475247524752, "grad_norm": 0.11478953063488007, "learning_rate": 0.29987298871749846, "loss": 0.2388, "num_input_tokens_seen": 452928, "step": 525 }, { "epoch": 0.24988213107024987, "grad_norm": 0.018875492736697197, "learning_rate": 0.2998705536240992, "loss": 0.2299, "num_input_tokens_seen": 456768, "step": 530 }, { "epoch": 0.2522395096652522, "grad_norm": 0.01869705691933632, "learning_rate": 0.2998680954187772, "loss": 0.1783, "num_input_tokens_seen": 461040, "step": 535 }, { "epoch": 0.2545968882602546, "grad_norm": 0.05137253180146217, "learning_rate": 0.2998656141019115, "loss": 0.3477, "num_input_tokens_seen": 465088, "step": 540 }, { "epoch": 0.25695426685525696, "grad_norm": 0.03432634472846985, "learning_rate": 0.2998631096738848, "loss": 0.1988, "num_input_tokens_seen": 469568, "step": 545 }, { "epoch": 0.2593116454502593, "grad_norm": 0.024540293961763382, "learning_rate": 0.29986058213508326, "loss": 0.2772, "num_input_tokens_seen": 473264, "step": 550 }, { "epoch": 0.26166902404526166, "grad_norm": 0.02988324500620365, "learning_rate": 0.29985803148589674, "loss": 0.2602, "num_input_tokens_seen": 477792, "step": 555 }, { "epoch": 0.264026402640264, "grad_norm": 0.02994399517774582, "learning_rate": 0.2998554577267185, "loss": 0.252, "num_input_tokens_seen": 481856, "step": 560 }, { "epoch": 0.2663837812352664, "grad_norm": 0.03604767099022865, "learning_rate": 0.2998528608579455, "loss": 0.2275, "num_input_tokens_seen": 487024, "step": 565 }, { "epoch": 0.26874115983026875, "grad_norm": 0.054496679455041885, "learning_rate": 0.2998502408799781, "loss": 0.2532, "num_input_tokens_seen": 490816, "step": 570 }, { "epoch": 0.2710985384252711, "grad_norm": 0.05150502547621727, "learning_rate": 0.2998475977932205, "loss": 0.2135, "num_input_tokens_seen": 495696, "step": 575 }, { "epoch": 0.27345591702027344, "grad_norm": 0.06451847404241562, "learning_rate": 0.29984493159808023, "loss": 0.2666, "num_input_tokens_seen": 500800, "step": 580 }, { "epoch": 0.2758132956152758, "grad_norm": 0.07649912685155869, "learning_rate": 0.29984224229496836, "loss": 0.2301, "num_input_tokens_seen": 504864, "step": 585 }, { "epoch": 0.2781706742102782, "grad_norm": 0.0168441291898489, "learning_rate": 0.2998395298842998, "loss": 0.1898, "num_input_tokens_seen": 509104, "step": 590 }, { "epoch": 0.28052805280528054, "grad_norm": 0.015987364575266838, "learning_rate": 0.29983679436649263, "loss": 0.3044, "num_input_tokens_seen": 512720, "step": 595 }, { "epoch": 0.2828854314002829, "grad_norm": 0.027446391060948372, "learning_rate": 0.2998340357419689, "loss": 0.262, "num_input_tokens_seen": 517536, "step": 600 }, { "epoch": 0.2828854314002829, "eval_loss": 0.3626938462257385, "eval_runtime": 21.8854, "eval_samples_per_second": 43.088, "eval_steps_per_second": 21.567, "num_input_tokens_seen": 517536, "step": 600 }, { "epoch": 0.28524280999528523, "grad_norm": 0.020620601251721382, "learning_rate": 0.29983125401115385, "loss": 0.3096, "num_input_tokens_seen": 521488, "step": 605 }, { "epoch": 0.2876001885902876, "grad_norm": 0.009467436000704765, "learning_rate": 0.29982844917447654, "loss": 0.1952, "num_input_tokens_seen": 525808, "step": 610 }, { "epoch": 0.28995756718529, "grad_norm": 0.022830162197351456, "learning_rate": 0.2998256212323695, "loss": 0.2815, "num_input_tokens_seen": 529840, "step": 615 }, { "epoch": 0.2923149457802923, "grad_norm": 0.011697076261043549, "learning_rate": 0.29982277018526887, "loss": 0.2466, "num_input_tokens_seen": 534096, "step": 620 }, { "epoch": 0.29467232437529467, "grad_norm": 0.023526204749941826, "learning_rate": 0.2998198960336143, "loss": 0.2343, "num_input_tokens_seen": 538752, "step": 625 }, { "epoch": 0.297029702970297, "grad_norm": 0.016746602952480316, "learning_rate": 0.299816998777849, "loss": 0.2687, "num_input_tokens_seen": 543968, "step": 630 }, { "epoch": 0.29938708156529936, "grad_norm": 0.027757268399000168, "learning_rate": 0.2998140784184197, "loss": 0.205, "num_input_tokens_seen": 548784, "step": 635 }, { "epoch": 0.30174446016030176, "grad_norm": 0.23165562748908997, "learning_rate": 0.2998111349557769, "loss": 0.456, "num_input_tokens_seen": 552880, "step": 640 }, { "epoch": 0.3041018387553041, "grad_norm": 0.016647884622216225, "learning_rate": 0.29980816839037444, "loss": 0.2253, "num_input_tokens_seen": 557184, "step": 645 }, { "epoch": 0.30645921735030646, "grad_norm": 0.019891247153282166, "learning_rate": 0.2998051787226698, "loss": 0.2597, "num_input_tokens_seen": 561840, "step": 650 }, { "epoch": 0.3088165959453088, "grad_norm": 0.028244683519005775, "learning_rate": 0.29980216595312403, "loss": 0.2352, "num_input_tokens_seen": 566736, "step": 655 }, { "epoch": 0.31117397454031115, "grad_norm": 0.02917248010635376, "learning_rate": 0.29979913008220177, "loss": 0.2471, "num_input_tokens_seen": 570528, "step": 660 }, { "epoch": 0.31353135313531355, "grad_norm": 0.04081382229924202, "learning_rate": 0.2997960711103711, "loss": 0.2323, "num_input_tokens_seen": 574720, "step": 665 }, { "epoch": 0.3158887317303159, "grad_norm": 0.016577957198023796, "learning_rate": 0.29979298903810386, "loss": 0.2201, "num_input_tokens_seen": 578912, "step": 670 }, { "epoch": 0.31824611032531824, "grad_norm": 0.014963102526962757, "learning_rate": 0.29978988386587524, "loss": 0.2517, "num_input_tokens_seen": 584544, "step": 675 }, { "epoch": 0.3206034889203206, "grad_norm": 0.012951698154211044, "learning_rate": 0.2997867555941642, "loss": 0.1883, "num_input_tokens_seen": 588752, "step": 680 }, { "epoch": 0.32296086751532294, "grad_norm": 0.023902473971247673, "learning_rate": 0.299783604223453, "loss": 0.2882, "num_input_tokens_seen": 593024, "step": 685 }, { "epoch": 0.32531824611032534, "grad_norm": 0.022149108350276947, "learning_rate": 0.29978042975422786, "loss": 0.2218, "num_input_tokens_seen": 597280, "step": 690 }, { "epoch": 0.3276756247053277, "grad_norm": 0.02305581234395504, "learning_rate": 0.29977723218697816, "loss": 0.2336, "num_input_tokens_seen": 601904, "step": 695 }, { "epoch": 0.33003300330033003, "grad_norm": 0.011331078596413136, "learning_rate": 0.299774011522197, "loss": 0.2282, "num_input_tokens_seen": 606368, "step": 700 }, { "epoch": 0.3323903818953324, "grad_norm": 0.019083211198449135, "learning_rate": 0.29977076776038114, "loss": 0.2236, "num_input_tokens_seen": 610096, "step": 705 }, { "epoch": 0.3347477604903347, "grad_norm": 0.047596994787454605, "learning_rate": 0.2997675009020307, "loss": 0.2257, "num_input_tokens_seen": 614288, "step": 710 }, { "epoch": 0.3371051390853371, "grad_norm": 0.019462116062641144, "learning_rate": 0.2997642109476496, "loss": 0.2999, "num_input_tokens_seen": 619344, "step": 715 }, { "epoch": 0.33946251768033947, "grad_norm": 0.032761137932538986, "learning_rate": 0.299760897897745, "loss": 0.2423, "num_input_tokens_seen": 624240, "step": 720 }, { "epoch": 0.3418198962753418, "grad_norm": 0.024460729211568832, "learning_rate": 0.29975756175282803, "loss": 0.2327, "num_input_tokens_seen": 629312, "step": 725 }, { "epoch": 0.34417727487034416, "grad_norm": 0.008055005222558975, "learning_rate": 0.29975420251341306, "loss": 0.1856, "num_input_tokens_seen": 633952, "step": 730 }, { "epoch": 0.3465346534653465, "grad_norm": 0.009399060159921646, "learning_rate": 0.29975082018001814, "loss": 0.282, "num_input_tokens_seen": 638352, "step": 735 }, { "epoch": 0.3488920320603489, "grad_norm": 0.013479001820087433, "learning_rate": 0.2997474147531648, "loss": 0.2542, "num_input_tokens_seen": 642656, "step": 740 }, { "epoch": 0.35124941065535126, "grad_norm": 0.013776581734418869, "learning_rate": 0.29974398623337833, "loss": 0.2361, "num_input_tokens_seen": 647696, "step": 745 }, { "epoch": 0.3536067892503536, "grad_norm": 0.03046252578496933, "learning_rate": 0.2997405346211873, "loss": 0.3057, "num_input_tokens_seen": 652832, "step": 750 }, { "epoch": 0.35596416784535595, "grad_norm": 0.024462277069687843, "learning_rate": 0.2997370599171241, "loss": 0.2421, "num_input_tokens_seen": 656768, "step": 755 }, { "epoch": 0.3583215464403583, "grad_norm": 0.012806993909180164, "learning_rate": 0.2997335621217246, "loss": 0.2402, "num_input_tokens_seen": 660624, "step": 760 }, { "epoch": 0.3606789250353607, "grad_norm": 0.01392971258610487, "learning_rate": 0.29973004123552816, "loss": 0.1834, "num_input_tokens_seen": 665488, "step": 765 }, { "epoch": 0.36303630363036304, "grad_norm": 0.014303174801170826, "learning_rate": 0.2997264972590777, "loss": 0.2897, "num_input_tokens_seen": 669504, "step": 770 }, { "epoch": 0.3653936822253654, "grad_norm": 0.024460889399051666, "learning_rate": 0.29972293019291973, "loss": 0.3049, "num_input_tokens_seen": 674032, "step": 775 }, { "epoch": 0.36775106082036774, "grad_norm": 0.03441541641950607, "learning_rate": 0.2997193400376045, "loss": 0.2487, "num_input_tokens_seen": 678384, "step": 780 }, { "epoch": 0.3701084394153701, "grad_norm": 0.014286152087152004, "learning_rate": 0.2997157267936854, "loss": 0.2959, "num_input_tokens_seen": 682944, "step": 785 }, { "epoch": 0.3724658180103725, "grad_norm": 0.015395323745906353, "learning_rate": 0.2997120904617199, "loss": 0.2552, "num_input_tokens_seen": 687072, "step": 790 }, { "epoch": 0.37482319660537483, "grad_norm": 0.030791128054261208, "learning_rate": 0.29970843104226863, "loss": 0.2168, "num_input_tokens_seen": 691632, "step": 795 }, { "epoch": 0.3771805752003772, "grad_norm": 0.01721709966659546, "learning_rate": 0.2997047485358959, "loss": 0.1964, "num_input_tokens_seen": 696016, "step": 800 }, { "epoch": 0.3771805752003772, "eval_loss": 0.22650428116321564, "eval_runtime": 21.9206, "eval_samples_per_second": 43.019, "eval_steps_per_second": 21.532, "num_input_tokens_seen": 696016, "step": 800 }, { "epoch": 0.3795379537953795, "grad_norm": 0.032669808715581894, "learning_rate": 0.2997010429431697, "loss": 0.2377, "num_input_tokens_seen": 700240, "step": 805 }, { "epoch": 0.38189533239038187, "grad_norm": 0.014330493286252022, "learning_rate": 0.29969731426466134, "loss": 0.2736, "num_input_tokens_seen": 704448, "step": 810 }, { "epoch": 0.38425271098538427, "grad_norm": 0.01586211286485195, "learning_rate": 0.299693562500946, "loss": 0.2426, "num_input_tokens_seen": 707968, "step": 815 }, { "epoch": 0.3866100895803866, "grad_norm": 0.027338912710547447, "learning_rate": 0.29968978765260207, "loss": 0.2485, "num_input_tokens_seen": 712272, "step": 820 }, { "epoch": 0.38896746817538896, "grad_norm": 0.012939891777932644, "learning_rate": 0.2996859897202118, "loss": 0.1615, "num_input_tokens_seen": 716784, "step": 825 }, { "epoch": 0.3913248467703913, "grad_norm": 0.02211010456085205, "learning_rate": 0.2996821687043609, "loss": 0.2933, "num_input_tokens_seen": 721296, "step": 830 }, { "epoch": 0.39368222536539366, "grad_norm": 0.015524206683039665, "learning_rate": 0.2996783246056384, "loss": 0.2197, "num_input_tokens_seen": 725536, "step": 835 }, { "epoch": 0.39603960396039606, "grad_norm": 0.02145790494978428, "learning_rate": 0.29967445742463744, "loss": 0.2467, "num_input_tokens_seen": 730384, "step": 840 }, { "epoch": 0.3983969825553984, "grad_norm": 0.01908414252102375, "learning_rate": 0.29967056716195417, "loss": 0.2577, "num_input_tokens_seen": 734960, "step": 845 }, { "epoch": 0.40075436115040075, "grad_norm": 0.01674271561205387, "learning_rate": 0.2996666538181885, "loss": 0.2306, "num_input_tokens_seen": 739040, "step": 850 }, { "epoch": 0.4031117397454031, "grad_norm": 0.017238173633813858, "learning_rate": 0.29966271739394407, "loss": 0.2691, "num_input_tokens_seen": 743184, "step": 855 }, { "epoch": 0.40546911834040544, "grad_norm": 0.009943000972270966, "learning_rate": 0.29965875788982776, "loss": 0.2181, "num_input_tokens_seen": 747056, "step": 860 }, { "epoch": 0.40782649693540785, "grad_norm": 0.06390685588121414, "learning_rate": 0.2996547753064503, "loss": 0.243, "num_input_tokens_seen": 751184, "step": 865 }, { "epoch": 0.4101838755304102, "grad_norm": 0.019871389493346214, "learning_rate": 0.29965076964442583, "loss": 0.2412, "num_input_tokens_seen": 755392, "step": 870 }, { "epoch": 0.41254125412541254, "grad_norm": 0.01780155673623085, "learning_rate": 0.299646740904372, "loss": 0.2466, "num_input_tokens_seen": 759648, "step": 875 }, { "epoch": 0.4148986327204149, "grad_norm": 0.009347254410386086, "learning_rate": 0.29964268908691016, "loss": 0.2328, "num_input_tokens_seen": 763792, "step": 880 }, { "epoch": 0.41725601131541723, "grad_norm": 0.010657994076609612, "learning_rate": 0.29963861419266513, "loss": 0.2629, "num_input_tokens_seen": 768944, "step": 885 }, { "epoch": 0.41961338991041963, "grad_norm": 0.010821755044162273, "learning_rate": 0.29963451622226533, "loss": 0.2335, "num_input_tokens_seen": 773360, "step": 890 }, { "epoch": 0.421970768505422, "grad_norm": 0.21754032373428345, "learning_rate": 0.29963039517634277, "loss": 0.2682, "num_input_tokens_seen": 777808, "step": 895 }, { "epoch": 0.4243281471004243, "grad_norm": 0.1522066444158554, "learning_rate": 0.2996262510555328, "loss": 0.304, "num_input_tokens_seen": 784416, "step": 900 }, { "epoch": 0.42668552569542667, "grad_norm": 0.04216880723834038, "learning_rate": 0.2996220838604746, "loss": 0.2466, "num_input_tokens_seen": 788464, "step": 905 }, { "epoch": 0.429042904290429, "grad_norm": 0.010158684104681015, "learning_rate": 0.29961789359181085, "loss": 0.2013, "num_input_tokens_seen": 793232, "step": 910 }, { "epoch": 0.4314002828854314, "grad_norm": 0.013883883133530617, "learning_rate": 0.29961368025018764, "loss": 0.2184, "num_input_tokens_seen": 797872, "step": 915 }, { "epoch": 0.43375766148043376, "grad_norm": 0.02833622694015503, "learning_rate": 0.2996094438362548, "loss": 0.2429, "num_input_tokens_seen": 802000, "step": 920 }, { "epoch": 0.4361150400754361, "grad_norm": 0.007342576049268246, "learning_rate": 0.2996051843506657, "loss": 0.2581, "num_input_tokens_seen": 806192, "step": 925 }, { "epoch": 0.43847241867043846, "grad_norm": 0.010723326355218887, "learning_rate": 0.299600901794077, "loss": 0.259, "num_input_tokens_seen": 810032, "step": 930 }, { "epoch": 0.4408297972654408, "grad_norm": 0.009109522216022015, "learning_rate": 0.29959659616714923, "loss": 0.2424, "num_input_tokens_seen": 813776, "step": 935 }, { "epoch": 0.4431871758604432, "grad_norm": 0.023440705612301826, "learning_rate": 0.2995922674705464, "loss": 0.2327, "num_input_tokens_seen": 818272, "step": 940 }, { "epoch": 0.44554455445544555, "grad_norm": 0.006356361787766218, "learning_rate": 0.2995879157049361, "loss": 0.1758, "num_input_tokens_seen": 823504, "step": 945 }, { "epoch": 0.4479019330504479, "grad_norm": 0.004717394709587097, "learning_rate": 0.2995835408709893, "loss": 0.4254, "num_input_tokens_seen": 827280, "step": 950 }, { "epoch": 0.45025931164545024, "grad_norm": 0.024635834619402885, "learning_rate": 0.29957914296938076, "loss": 0.2381, "num_input_tokens_seen": 831072, "step": 955 }, { "epoch": 0.4526166902404526, "grad_norm": 0.018351038917899132, "learning_rate": 0.2995747220007886, "loss": 0.2596, "num_input_tokens_seen": 834928, "step": 960 }, { "epoch": 0.454974068835455, "grad_norm": 0.004620480351150036, "learning_rate": 0.2995702779658947, "loss": 0.2103, "num_input_tokens_seen": 838800, "step": 965 }, { "epoch": 0.45733144743045734, "grad_norm": 0.006804523058235645, "learning_rate": 0.29956581086538425, "loss": 0.247, "num_input_tokens_seen": 842992, "step": 970 }, { "epoch": 0.4596888260254597, "grad_norm": 0.02102152444422245, "learning_rate": 0.2995613206999462, "loss": 0.2697, "num_input_tokens_seen": 847216, "step": 975 }, { "epoch": 0.46204620462046203, "grad_norm": 0.02126171998679638, "learning_rate": 0.29955680747027297, "loss": 0.2492, "num_input_tokens_seen": 851152, "step": 980 }, { "epoch": 0.4644035832154644, "grad_norm": 0.022664852440357208, "learning_rate": 0.2995522711770607, "loss": 0.26, "num_input_tokens_seen": 855520, "step": 985 }, { "epoch": 0.4667609618104668, "grad_norm": 0.010885710828006268, "learning_rate": 0.2995477118210087, "loss": 0.2049, "num_input_tokens_seen": 860288, "step": 990 }, { "epoch": 0.4691183404054691, "grad_norm": 0.00547809200361371, "learning_rate": 0.29954312940282024, "loss": 0.209, "num_input_tokens_seen": 864848, "step": 995 }, { "epoch": 0.47147571900047147, "grad_norm": 0.0102836973965168, "learning_rate": 0.29953852392320196, "loss": 0.2543, "num_input_tokens_seen": 868992, "step": 1000 }, { "epoch": 0.47147571900047147, "eval_loss": 0.22157074511051178, "eval_runtime": 21.9029, "eval_samples_per_second": 43.054, "eval_steps_per_second": 21.55, "num_input_tokens_seen": 868992, "step": 1000 }, { "epoch": 0.4738330975954738, "grad_norm": 0.011545664630830288, "learning_rate": 0.2995338953828641, "loss": 0.2249, "num_input_tokens_seen": 872576, "step": 1005 }, { "epoch": 0.47619047619047616, "grad_norm": 0.008610946126282215, "learning_rate": 0.2995292437825204, "loss": 0.2423, "num_input_tokens_seen": 876752, "step": 1010 }, { "epoch": 0.47854785478547857, "grad_norm": 0.008063350804150105, "learning_rate": 0.29952456912288816, "loss": 0.2478, "num_input_tokens_seen": 880720, "step": 1015 }, { "epoch": 0.4809052333804809, "grad_norm": 0.010864391922950745, "learning_rate": 0.2995198714046884, "loss": 0.2403, "num_input_tokens_seen": 884528, "step": 1020 }, { "epoch": 0.48326261197548326, "grad_norm": 0.00809762068092823, "learning_rate": 0.2995151506286454, "loss": 0.234, "num_input_tokens_seen": 889360, "step": 1025 }, { "epoch": 0.4856199905704856, "grad_norm": 0.01353034283965826, "learning_rate": 0.2995104067954873, "loss": 0.2257, "num_input_tokens_seen": 893504, "step": 1030 }, { "epoch": 0.48797736916548795, "grad_norm": 0.006150421686470509, "learning_rate": 0.2995056399059456, "loss": 0.2528, "num_input_tokens_seen": 897392, "step": 1035 }, { "epoch": 0.49033474776049035, "grad_norm": 0.008271138183772564, "learning_rate": 0.2995008499607554, "loss": 0.2202, "num_input_tokens_seen": 901952, "step": 1040 }, { "epoch": 0.4926921263554927, "grad_norm": 0.010571779683232307, "learning_rate": 0.2994960369606554, "loss": 0.2182, "num_input_tokens_seen": 906720, "step": 1045 }, { "epoch": 0.49504950495049505, "grad_norm": 0.013821846805512905, "learning_rate": 0.2994912009063878, "loss": 0.2535, "num_input_tokens_seen": 911456, "step": 1050 }, { "epoch": 0.4974068835454974, "grad_norm": 0.015538020059466362, "learning_rate": 0.29948634179869843, "loss": 0.2251, "num_input_tokens_seen": 915760, "step": 1055 }, { "epoch": 0.49976426214049974, "grad_norm": 0.022615710273385048, "learning_rate": 0.29948145963833656, "loss": 0.2744, "num_input_tokens_seen": 920016, "step": 1060 }, { "epoch": 0.5021216407355021, "grad_norm": 0.009418646804988384, "learning_rate": 0.29947655442605514, "loss": 0.2259, "num_input_tokens_seen": 924720, "step": 1065 }, { "epoch": 0.5044790193305044, "grad_norm": 0.014893700368702412, "learning_rate": 0.2994716261626106, "loss": 0.2194, "num_input_tokens_seen": 928208, "step": 1070 }, { "epoch": 0.5068363979255068, "grad_norm": 0.010897199623286724, "learning_rate": 0.2994666748487629, "loss": 0.1953, "num_input_tokens_seen": 932064, "step": 1075 }, { "epoch": 0.5091937765205092, "grad_norm": 0.011832601390779018, "learning_rate": 0.2994617004852756, "loss": 0.3198, "num_input_tokens_seen": 935712, "step": 1080 }, { "epoch": 0.5115511551155115, "grad_norm": 0.00727560929954052, "learning_rate": 0.2994567030729159, "loss": 0.1944, "num_input_tokens_seen": 940096, "step": 1085 }, { "epoch": 0.5139085337105139, "grad_norm": 0.010667894966900349, "learning_rate": 0.29945168261245436, "loss": 0.2348, "num_input_tokens_seen": 943872, "step": 1090 }, { "epoch": 0.5162659123055162, "grad_norm": 0.01917083188891411, "learning_rate": 0.29944663910466524, "loss": 0.2307, "num_input_tokens_seen": 949296, "step": 1095 }, { "epoch": 0.5186232909005186, "grad_norm": 0.01053266879171133, "learning_rate": 0.2994415725503263, "loss": 0.1967, "num_input_tokens_seen": 953040, "step": 1100 }, { "epoch": 0.520980669495521, "grad_norm": 0.02649979293346405, "learning_rate": 0.29943648295021885, "loss": 0.2724, "num_input_tokens_seen": 957600, "step": 1105 }, { "epoch": 0.5233380480905233, "grad_norm": 0.004224368836730719, "learning_rate": 0.2994313703051278, "loss": 0.2845, "num_input_tokens_seen": 962496, "step": 1110 }, { "epoch": 0.5256954266855257, "grad_norm": 0.00910792127251625, "learning_rate": 0.29942623461584156, "loss": 0.2659, "num_input_tokens_seen": 966688, "step": 1115 }, { "epoch": 0.528052805280528, "grad_norm": 0.006304461974650621, "learning_rate": 0.29942107588315214, "loss": 0.2732, "num_input_tokens_seen": 969984, "step": 1120 }, { "epoch": 0.5304101838755304, "grad_norm": 0.0066589065827429295, "learning_rate": 0.29941589410785513, "loss": 0.2305, "num_input_tokens_seen": 975280, "step": 1125 }, { "epoch": 0.5327675624705328, "grad_norm": 0.02922910265624523, "learning_rate": 0.29941068929074954, "loss": 0.208, "num_input_tokens_seen": 980064, "step": 1130 }, { "epoch": 0.5351249410655351, "grad_norm": 0.011774340644478798, "learning_rate": 0.2994054614326381, "loss": 0.254, "num_input_tokens_seen": 985024, "step": 1135 }, { "epoch": 0.5374823196605375, "grad_norm": 0.011308026500046253, "learning_rate": 0.29940021053432686, "loss": 0.2832, "num_input_tokens_seen": 989504, "step": 1140 }, { "epoch": 0.5398396982555398, "grad_norm": 0.0053458791226148605, "learning_rate": 0.29939493659662575, "loss": 0.2444, "num_input_tokens_seen": 994448, "step": 1145 }, { "epoch": 0.5421970768505422, "grad_norm": 0.013303283601999283, "learning_rate": 0.299389639620348, "loss": 0.2416, "num_input_tokens_seen": 998976, "step": 1150 }, { "epoch": 0.5445544554455446, "grad_norm": 0.004726475104689598, "learning_rate": 0.29938431960631046, "loss": 0.2453, "num_input_tokens_seen": 1003136, "step": 1155 }, { "epoch": 0.5469118340405469, "grad_norm": 0.003945779521018267, "learning_rate": 0.2993789765553335, "loss": 0.2349, "num_input_tokens_seen": 1006512, "step": 1160 }, { "epoch": 0.5492692126355493, "grad_norm": 0.0076730637811124325, "learning_rate": 0.2993736104682412, "loss": 0.2334, "num_input_tokens_seen": 1010576, "step": 1165 }, { "epoch": 0.5516265912305516, "grad_norm": 0.011619127355515957, "learning_rate": 0.299368221345861, "loss": 0.2155, "num_input_tokens_seen": 1014928, "step": 1170 }, { "epoch": 0.553983969825554, "grad_norm": 0.007023296318948269, "learning_rate": 0.29936280918902397, "loss": 0.1792, "num_input_tokens_seen": 1019680, "step": 1175 }, { "epoch": 0.5563413484205564, "grad_norm": 0.01198938675224781, "learning_rate": 0.2993573739985648, "loss": 0.2527, "num_input_tokens_seen": 1024112, "step": 1180 }, { "epoch": 0.5586987270155587, "grad_norm": 0.017957186326384544, "learning_rate": 0.2993519157753216, "loss": 0.2582, "num_input_tokens_seen": 1028768, "step": 1185 }, { "epoch": 0.5610561056105611, "grad_norm": 0.0077983723022043705, "learning_rate": 0.2993464345201361, "loss": 0.2404, "num_input_tokens_seen": 1032288, "step": 1190 }, { "epoch": 0.5634134842055634, "grad_norm": 0.019709035754203796, "learning_rate": 0.2993409302338536, "loss": 0.2598, "num_input_tokens_seen": 1036368, "step": 1195 }, { "epoch": 0.5657708628005658, "grad_norm": 0.007961167953908443, "learning_rate": 0.2993354029173229, "loss": 0.2859, "num_input_tokens_seen": 1040544, "step": 1200 }, { "epoch": 0.5657708628005658, "eval_loss": 0.22787168622016907, "eval_runtime": 21.921, "eval_samples_per_second": 43.018, "eval_steps_per_second": 21.532, "num_input_tokens_seen": 1040544, "step": 1200 }, { "epoch": 0.5681282413955682, "grad_norm": 0.01525899674743414, "learning_rate": 0.2993298525713965, "loss": 0.3122, "num_input_tokens_seen": 1044576, "step": 1205 }, { "epoch": 0.5704856199905705, "grad_norm": 0.003580045886337757, "learning_rate": 0.29932427919693017, "loss": 0.2322, "num_input_tokens_seen": 1049408, "step": 1210 }, { "epoch": 0.5728429985855729, "grad_norm": 0.00673142122104764, "learning_rate": 0.2993186827947834, "loss": 0.2054, "num_input_tokens_seen": 1053424, "step": 1215 }, { "epoch": 0.5752003771805752, "grad_norm": 0.006981938146054745, "learning_rate": 0.2993130633658194, "loss": 0.2239, "num_input_tokens_seen": 1058128, "step": 1220 }, { "epoch": 0.5775577557755776, "grad_norm": 0.005521902348846197, "learning_rate": 0.29930742091090456, "loss": 0.2125, "num_input_tokens_seen": 1062112, "step": 1225 }, { "epoch": 0.57991513437058, "grad_norm": 0.006411140784621239, "learning_rate": 0.29930175543090914, "loss": 0.2233, "num_input_tokens_seen": 1066544, "step": 1230 }, { "epoch": 0.5822725129655822, "grad_norm": 0.0065961722284555435, "learning_rate": 0.2992960669267068, "loss": 0.1323, "num_input_tokens_seen": 1071280, "step": 1235 }, { "epoch": 0.5846298915605846, "grad_norm": 0.010791966691613197, "learning_rate": 0.29929035539917476, "loss": 0.3092, "num_input_tokens_seen": 1075568, "step": 1240 }, { "epoch": 0.5869872701555869, "grad_norm": 0.004317036829888821, "learning_rate": 0.2992846208491938, "loss": 0.2783, "num_input_tokens_seen": 1079648, "step": 1245 }, { "epoch": 0.5893446487505893, "grad_norm": 0.010308889672160149, "learning_rate": 0.2992788632776483, "loss": 0.2229, "num_input_tokens_seen": 1084080, "step": 1250 }, { "epoch": 0.5917020273455917, "grad_norm": 0.00871394481509924, "learning_rate": 0.29927308268542613, "loss": 0.2297, "num_input_tokens_seen": 1087680, "step": 1255 }, { "epoch": 0.594059405940594, "grad_norm": 0.010654519312083721, "learning_rate": 0.2992672790734187, "loss": 0.2159, "num_input_tokens_seen": 1092080, "step": 1260 }, { "epoch": 0.5964167845355964, "grad_norm": 0.012754036113619804, "learning_rate": 0.299261452442521, "loss": 0.2184, "num_input_tokens_seen": 1096288, "step": 1265 }, { "epoch": 0.5987741631305987, "grad_norm": 0.004326207097619772, "learning_rate": 0.29925560279363167, "loss": 0.2332, "num_input_tokens_seen": 1099776, "step": 1270 }, { "epoch": 0.6011315417256011, "grad_norm": 0.011387339793145657, "learning_rate": 0.29924973012765266, "loss": 0.2524, "num_input_tokens_seen": 1103440, "step": 1275 }, { "epoch": 0.6034889203206035, "grad_norm": 0.00819116085767746, "learning_rate": 0.29924383444548974, "loss": 0.2292, "num_input_tokens_seen": 1107344, "step": 1280 }, { "epoch": 0.6058462989156058, "grad_norm": 0.021335134282708168, "learning_rate": 0.299237915748052, "loss": 0.2507, "num_input_tokens_seen": 1111728, "step": 1285 }, { "epoch": 0.6082036775106082, "grad_norm": 0.007584770210087299, "learning_rate": 0.2992319740362522, "loss": 0.2722, "num_input_tokens_seen": 1115376, "step": 1290 }, { "epoch": 0.6105610561056105, "grad_norm": 0.008759505115449429, "learning_rate": 0.2992260093110066, "loss": 0.2248, "num_input_tokens_seen": 1119184, "step": 1295 }, { "epoch": 0.6129184347006129, "grad_norm": 0.007545670494437218, "learning_rate": 0.2992200215732352, "loss": 0.2334, "num_input_tokens_seen": 1123600, "step": 1300 }, { "epoch": 0.6152758132956153, "grad_norm": 0.006894233636558056, "learning_rate": 0.2992140108238611, "loss": 0.236, "num_input_tokens_seen": 1127408, "step": 1305 }, { "epoch": 0.6176331918906176, "grad_norm": 0.012917310930788517, "learning_rate": 0.2992079770638115, "loss": 0.2289, "num_input_tokens_seen": 1131872, "step": 1310 }, { "epoch": 0.61999057048562, "grad_norm": 0.01972266100347042, "learning_rate": 0.29920192029401677, "loss": 0.2335, "num_input_tokens_seen": 1138416, "step": 1315 }, { "epoch": 0.6223479490806223, "grad_norm": 0.005994094535708427, "learning_rate": 0.2991958405154109, "loss": 0.2291, "num_input_tokens_seen": 1142144, "step": 1320 }, { "epoch": 0.6247053276756247, "grad_norm": 0.00641050236299634, "learning_rate": 0.29918973772893154, "loss": 0.2399, "num_input_tokens_seen": 1146544, "step": 1325 }, { "epoch": 0.6270627062706271, "grad_norm": 0.010463126935064793, "learning_rate": 0.29918361193551973, "loss": 0.2255, "num_input_tokens_seen": 1150784, "step": 1330 }, { "epoch": 0.6294200848656294, "grad_norm": 0.005755925551056862, "learning_rate": 0.29917746313612026, "loss": 0.222, "num_input_tokens_seen": 1155120, "step": 1335 }, { "epoch": 0.6317774634606318, "grad_norm": 0.005667397752404213, "learning_rate": 0.29917129133168124, "loss": 0.2233, "num_input_tokens_seen": 1160256, "step": 1340 }, { "epoch": 0.6341348420556341, "grad_norm": 0.004874077159911394, "learning_rate": 0.2991650965231546, "loss": 0.1927, "num_input_tokens_seen": 1164544, "step": 1345 }, { "epoch": 0.6364922206506365, "grad_norm": 0.010840351693332195, "learning_rate": 0.29915887871149544, "loss": 0.2075, "num_input_tokens_seen": 1168464, "step": 1350 }, { "epoch": 0.6388495992456389, "grad_norm": 0.009263534098863602, "learning_rate": 0.2991526378976628, "loss": 0.2149, "num_input_tokens_seen": 1171888, "step": 1355 }, { "epoch": 0.6412069778406412, "grad_norm": 0.005568221677094698, "learning_rate": 0.29914637408261896, "loss": 0.2563, "num_input_tokens_seen": 1175920, "step": 1360 }, { "epoch": 0.6435643564356436, "grad_norm": 0.004826083779335022, "learning_rate": 0.29914008726733, "loss": 0.3011, "num_input_tokens_seen": 1179792, "step": 1365 }, { "epoch": 0.6459217350306459, "grad_norm": 0.0208759605884552, "learning_rate": 0.2991337774527653, "loss": 0.2366, "num_input_tokens_seen": 1185264, "step": 1370 }, { "epoch": 0.6482791136256483, "grad_norm": 0.0211018193513155, "learning_rate": 0.2991274446398981, "loss": 0.2531, "num_input_tokens_seen": 1189472, "step": 1375 }, { "epoch": 0.6506364922206507, "grad_norm": 0.017729585990309715, "learning_rate": 0.29912108882970484, "loss": 0.2272, "num_input_tokens_seen": 1193104, "step": 1380 }, { "epoch": 0.652993870815653, "grad_norm": 0.02269987016916275, "learning_rate": 0.2991147100231657, "loss": 0.245, "num_input_tokens_seen": 1197856, "step": 1385 }, { "epoch": 0.6553512494106554, "grad_norm": 0.012412120588123798, "learning_rate": 0.2991083082212644, "loss": 0.2131, "num_input_tokens_seen": 1203328, "step": 1390 }, { "epoch": 0.6577086280056577, "grad_norm": 0.014815357513725758, "learning_rate": 0.2991018834249881, "loss": 0.3288, "num_input_tokens_seen": 1207376, "step": 1395 }, { "epoch": 0.6600660066006601, "grad_norm": 0.00789864081889391, "learning_rate": 0.29909543563532764, "loss": 0.2272, "num_input_tokens_seen": 1211680, "step": 1400 }, { "epoch": 0.6600660066006601, "eval_loss": 0.3682122528553009, "eval_runtime": 21.9505, "eval_samples_per_second": 42.96, "eval_steps_per_second": 21.503, "num_input_tokens_seen": 1211680, "step": 1400 }, { "epoch": 0.6624233851956625, "grad_norm": 0.007755184546113014, "learning_rate": 0.29908896485327746, "loss": 0.2966, "num_input_tokens_seen": 1215600, "step": 1405 }, { "epoch": 0.6647807637906648, "grad_norm": 0.009833133779466152, "learning_rate": 0.29908247107983527, "loss": 0.258, "num_input_tokens_seen": 1220256, "step": 1410 }, { "epoch": 0.6671381423856672, "grad_norm": 0.011940604075789452, "learning_rate": 0.29907595431600253, "loss": 0.2248, "num_input_tokens_seen": 1224464, "step": 1415 }, { "epoch": 0.6694955209806694, "grad_norm": 0.012441758066415787, "learning_rate": 0.29906941456278424, "loss": 0.246, "num_input_tokens_seen": 1228464, "step": 1420 }, { "epoch": 0.6718528995756718, "grad_norm": 0.015507173724472523, "learning_rate": 0.2990628518211889, "loss": 0.2411, "num_input_tokens_seen": 1232640, "step": 1425 }, { "epoch": 0.6742102781706742, "grad_norm": 0.0043488843366503716, "learning_rate": 0.2990562660922286, "loss": 0.2264, "num_input_tokens_seen": 1237248, "step": 1430 }, { "epoch": 0.6765676567656765, "grad_norm": 0.008267401717603207, "learning_rate": 0.2990496573769189, "loss": 0.222, "num_input_tokens_seen": 1241056, "step": 1435 }, { "epoch": 0.6789250353606789, "grad_norm": 0.014699293300509453, "learning_rate": 0.29904302567627894, "loss": 0.1974, "num_input_tokens_seen": 1245296, "step": 1440 }, { "epoch": 0.6812824139556812, "grad_norm": 0.027788080275058746, "learning_rate": 0.2990363709913314, "loss": 0.1928, "num_input_tokens_seen": 1249600, "step": 1445 }, { "epoch": 0.6836397925506836, "grad_norm": 0.005524135194718838, "learning_rate": 0.29902969332310264, "loss": 0.2234, "num_input_tokens_seen": 1253616, "step": 1450 }, { "epoch": 0.685997171145686, "grad_norm": 0.014030787162482738, "learning_rate": 0.2990229926726223, "loss": 0.2734, "num_input_tokens_seen": 1257392, "step": 1455 }, { "epoch": 0.6883545497406883, "grad_norm": 0.009144607000052929, "learning_rate": 0.29901626904092365, "loss": 0.2246, "num_input_tokens_seen": 1261776, "step": 1460 }, { "epoch": 0.6907119283356907, "grad_norm": 0.016202040016651154, "learning_rate": 0.2990095224290438, "loss": 0.2583, "num_input_tokens_seen": 1265888, "step": 1465 }, { "epoch": 0.693069306930693, "grad_norm": 0.005822889506816864, "learning_rate": 0.29900275283802297, "loss": 0.2408, "num_input_tokens_seen": 1269712, "step": 1470 }, { "epoch": 0.6954266855256954, "grad_norm": 0.00962250679731369, "learning_rate": 0.2989959602689051, "loss": 0.2166, "num_input_tokens_seen": 1274464, "step": 1475 }, { "epoch": 0.6977840641206978, "grad_norm": 0.006010857410728931, "learning_rate": 0.2989891447227379, "loss": 0.1959, "num_input_tokens_seen": 1278944, "step": 1480 }, { "epoch": 0.7001414427157001, "grad_norm": 0.016445614397525787, "learning_rate": 0.29898230620057215, "loss": 0.2415, "num_input_tokens_seen": 1282608, "step": 1485 }, { "epoch": 0.7024988213107025, "grad_norm": 0.004994721617549658, "learning_rate": 0.2989754447034626, "loss": 0.2072, "num_input_tokens_seen": 1286480, "step": 1490 }, { "epoch": 0.7048561999057048, "grad_norm": 0.01699483022093773, "learning_rate": 0.2989685602324673, "loss": 0.2332, "num_input_tokens_seen": 1290560, "step": 1495 }, { "epoch": 0.7072135785007072, "grad_norm": 0.004955081734806299, "learning_rate": 0.298961652788648, "loss": 0.2175, "num_input_tokens_seen": 1294240, "step": 1500 }, { "epoch": 0.7095709570957096, "grad_norm": 0.004256641026586294, "learning_rate": 0.29895472237306986, "loss": 0.2333, "num_input_tokens_seen": 1298512, "step": 1505 }, { "epoch": 0.7119283356907119, "grad_norm": 0.007530726958066225, "learning_rate": 0.29894776898680164, "loss": 0.2444, "num_input_tokens_seen": 1302736, "step": 1510 }, { "epoch": 0.7142857142857143, "grad_norm": 0.012377893552184105, "learning_rate": 0.29894079263091566, "loss": 0.225, "num_input_tokens_seen": 1307376, "step": 1515 }, { "epoch": 0.7166430928807166, "grad_norm": 0.019442012533545494, "learning_rate": 0.2989337933064877, "loss": 0.1936, "num_input_tokens_seen": 1311760, "step": 1520 }, { "epoch": 0.719000471475719, "grad_norm": 0.008796768262982368, "learning_rate": 0.29892677101459725, "loss": 0.2063, "num_input_tokens_seen": 1315200, "step": 1525 }, { "epoch": 0.7213578500707214, "grad_norm": 0.005080565810203552, "learning_rate": 0.2989197257563272, "loss": 0.2352, "num_input_tokens_seen": 1319008, "step": 1530 }, { "epoch": 0.7237152286657237, "grad_norm": 0.005311491433531046, "learning_rate": 0.2989126575327639, "loss": 0.2439, "num_input_tokens_seen": 1323696, "step": 1535 }, { "epoch": 0.7260726072607261, "grad_norm": 0.0038522803224623203, "learning_rate": 0.29890556634499754, "loss": 0.2362, "num_input_tokens_seen": 1327264, "step": 1540 }, { "epoch": 0.7284299858557284, "grad_norm": 0.007134600542485714, "learning_rate": 0.2988984521941216, "loss": 0.2052, "num_input_tokens_seen": 1331200, "step": 1545 }, { "epoch": 0.7307873644507308, "grad_norm": 0.1557055562734604, "learning_rate": 0.29889131508123307, "loss": 0.3011, "num_input_tokens_seen": 1336144, "step": 1550 }, { "epoch": 0.7331447430457332, "grad_norm": 0.010834681801497936, "learning_rate": 0.2988841550074327, "loss": 0.24, "num_input_tokens_seen": 1340688, "step": 1555 }, { "epoch": 0.7355021216407355, "grad_norm": 0.017351586371660233, "learning_rate": 0.2988769719738246, "loss": 0.2442, "num_input_tokens_seen": 1344960, "step": 1560 }, { "epoch": 0.7378595002357379, "grad_norm": 0.008527003228664398, "learning_rate": 0.29886976598151666, "loss": 0.2186, "num_input_tokens_seen": 1349968, "step": 1565 }, { "epoch": 0.7402168788307402, "grad_norm": 0.014329357072710991, "learning_rate": 0.29886253703161986, "loss": 0.3611, "num_input_tokens_seen": 1354240, "step": 1570 }, { "epoch": 0.7425742574257426, "grad_norm": 0.019205156713724136, "learning_rate": 0.29885528512524917, "loss": 0.2181, "num_input_tokens_seen": 1359232, "step": 1575 }, { "epoch": 0.744931636020745, "grad_norm": 0.007677373010665178, "learning_rate": 0.29884801026352287, "loss": 0.1565, "num_input_tokens_seen": 1363408, "step": 1580 }, { "epoch": 0.7472890146157473, "grad_norm": 0.04532676562666893, "learning_rate": 0.2988407124475629, "loss": 0.3095, "num_input_tokens_seen": 1368176, "step": 1585 }, { "epoch": 0.7496463932107497, "grad_norm": 0.010856139473617077, "learning_rate": 0.2988333916784945, "loss": 0.2164, "num_input_tokens_seen": 1372752, "step": 1590 }, { "epoch": 0.752003771805752, "grad_norm": 0.02400929108262062, "learning_rate": 0.2988260479574468, "loss": 0.3062, "num_input_tokens_seen": 1376640, "step": 1595 }, { "epoch": 0.7543611504007544, "grad_norm": 0.01615690067410469, "learning_rate": 0.2988186812855523, "loss": 0.2348, "num_input_tokens_seen": 1381792, "step": 1600 }, { "epoch": 0.7543611504007544, "eval_loss": 0.3155163824558258, "eval_runtime": 21.8857, "eval_samples_per_second": 43.087, "eval_steps_per_second": 21.567, "num_input_tokens_seen": 1381792, "step": 1600 }, { "epoch": 0.7567185289957568, "grad_norm": 0.007281403988599777, "learning_rate": 0.29881129166394693, "loss": 0.2299, "num_input_tokens_seen": 1385968, "step": 1605 }, { "epoch": 0.759075907590759, "grad_norm": 0.005476027727127075, "learning_rate": 0.29880387909377026, "loss": 0.2251, "num_input_tokens_seen": 1390256, "step": 1610 }, { "epoch": 0.7614332861857614, "grad_norm": 0.005885435733944178, "learning_rate": 0.2987964435761655, "loss": 0.2186, "num_input_tokens_seen": 1394272, "step": 1615 }, { "epoch": 0.7637906647807637, "grad_norm": 0.005694640800356865, "learning_rate": 0.29878898511227925, "loss": 0.236, "num_input_tokens_seen": 1398672, "step": 1620 }, { "epoch": 0.7661480433757661, "grad_norm": 0.008330917917191982, "learning_rate": 0.2987815037032617, "loss": 0.1737, "num_input_tokens_seen": 1403168, "step": 1625 }, { "epoch": 0.7685054219707685, "grad_norm": 0.01563160866498947, "learning_rate": 0.29877399935026655, "loss": 0.2326, "num_input_tokens_seen": 1407792, "step": 1630 }, { "epoch": 0.7708628005657708, "grad_norm": 0.009742502123117447, "learning_rate": 0.2987664720544511, "loss": 0.2272, "num_input_tokens_seen": 1412080, "step": 1635 }, { "epoch": 0.7732201791607732, "grad_norm": 0.006067799869924784, "learning_rate": 0.2987589218169761, "loss": 0.228, "num_input_tokens_seen": 1416752, "step": 1640 }, { "epoch": 0.7755775577557755, "grad_norm": 0.0038220356218516827, "learning_rate": 0.29875134863900604, "loss": 0.2281, "num_input_tokens_seen": 1421728, "step": 1645 }, { "epoch": 0.7779349363507779, "grad_norm": 0.013462711125612259, "learning_rate": 0.29874375252170865, "loss": 0.2093, "num_input_tokens_seen": 1425744, "step": 1650 }, { "epoch": 0.7802923149457803, "grad_norm": 0.01707606017589569, "learning_rate": 0.2987361334662553, "loss": 0.2286, "num_input_tokens_seen": 1430528, "step": 1655 }, { "epoch": 0.7826496935407826, "grad_norm": 0.008576737716794014, "learning_rate": 0.29872849147382113, "loss": 0.271, "num_input_tokens_seen": 1435376, "step": 1660 }, { "epoch": 0.785007072135785, "grad_norm": 0.01290939748287201, "learning_rate": 0.2987208265455845, "loss": 0.2282, "num_input_tokens_seen": 1440272, "step": 1665 }, { "epoch": 0.7873644507307873, "grad_norm": 0.012482976540923119, "learning_rate": 0.29871313868272753, "loss": 0.2511, "num_input_tokens_seen": 1444384, "step": 1670 }, { "epoch": 0.7897218293257897, "grad_norm": 0.004670243710279465, "learning_rate": 0.29870542788643567, "loss": 0.2415, "num_input_tokens_seen": 1449056, "step": 1675 }, { "epoch": 0.7920792079207921, "grad_norm": 0.00447361683472991, "learning_rate": 0.2986976941578981, "loss": 0.2365, "num_input_tokens_seen": 1453104, "step": 1680 }, { "epoch": 0.7944365865157944, "grad_norm": 0.01498006284236908, "learning_rate": 0.29868993749830747, "loss": 0.2347, "num_input_tokens_seen": 1457520, "step": 1685 }, { "epoch": 0.7967939651107968, "grad_norm": 0.004394884686917067, "learning_rate": 0.2986821579088598, "loss": 0.2238, "num_input_tokens_seen": 1461984, "step": 1690 }, { "epoch": 0.7991513437057991, "grad_norm": 0.009351997636258602, "learning_rate": 0.29867435539075504, "loss": 0.2366, "num_input_tokens_seen": 1465952, "step": 1695 }, { "epoch": 0.8015087223008015, "grad_norm": 0.004777699243277311, "learning_rate": 0.2986665299451963, "loss": 0.2232, "num_input_tokens_seen": 1470480, "step": 1700 }, { "epoch": 0.8038661008958039, "grad_norm": 0.005686374381184578, "learning_rate": 0.29865868157339037, "loss": 0.2233, "num_input_tokens_seen": 1474736, "step": 1705 }, { "epoch": 0.8062234794908062, "grad_norm": 0.006763003300875425, "learning_rate": 0.2986508102765476, "loss": 0.2117, "num_input_tokens_seen": 1478944, "step": 1710 }, { "epoch": 0.8085808580858086, "grad_norm": 0.027879104018211365, "learning_rate": 0.2986429160558818, "loss": 0.2072, "num_input_tokens_seen": 1484288, "step": 1715 }, { "epoch": 0.8109382366808109, "grad_norm": 0.0064238267950713634, "learning_rate": 0.2986349989126104, "loss": 0.3049, "num_input_tokens_seen": 1489008, "step": 1720 }, { "epoch": 0.8132956152758133, "grad_norm": 0.016541101038455963, "learning_rate": 0.29862705884795426, "loss": 0.2446, "num_input_tokens_seen": 1494672, "step": 1725 }, { "epoch": 0.8156529938708157, "grad_norm": 0.012036236003041267, "learning_rate": 0.2986190958631379, "loss": 0.2432, "num_input_tokens_seen": 1499072, "step": 1730 }, { "epoch": 0.818010372465818, "grad_norm": 0.01372484304010868, "learning_rate": 0.29861110995938933, "loss": 0.2346, "num_input_tokens_seen": 1502816, "step": 1735 }, { "epoch": 0.8203677510608204, "grad_norm": 0.0069134049117565155, "learning_rate": 0.29860310113794, "loss": 0.2642, "num_input_tokens_seen": 1506736, "step": 1740 }, { "epoch": 0.8227251296558227, "grad_norm": 0.01307134609669447, "learning_rate": 0.29859506940002506, "loss": 0.2348, "num_input_tokens_seen": 1510992, "step": 1745 }, { "epoch": 0.8250825082508251, "grad_norm": 0.005410636775195599, "learning_rate": 0.298587014746883, "loss": 0.2195, "num_input_tokens_seen": 1515056, "step": 1750 }, { "epoch": 0.8274398868458275, "grad_norm": 0.01701577752828598, "learning_rate": 0.298578937179756, "loss": 0.2452, "num_input_tokens_seen": 1520048, "step": 1755 }, { "epoch": 0.8297972654408298, "grad_norm": 0.013892080634832382, "learning_rate": 0.29857083669988976, "loss": 0.2283, "num_input_tokens_seen": 1524176, "step": 1760 }, { "epoch": 0.8321546440358322, "grad_norm": 0.015122687444090843, "learning_rate": 0.29856271330853346, "loss": 0.2267, "num_input_tokens_seen": 1528784, "step": 1765 }, { "epoch": 0.8345120226308345, "grad_norm": 0.008332401514053345, "learning_rate": 0.2985545670069398, "loss": 0.2393, "num_input_tokens_seen": 1533328, "step": 1770 }, { "epoch": 0.8368694012258369, "grad_norm": 0.011506418697535992, "learning_rate": 0.29854639779636505, "loss": 0.2347, "num_input_tokens_seen": 1538064, "step": 1775 }, { "epoch": 0.8392267798208393, "grad_norm": 0.004224782343953848, "learning_rate": 0.298538205678069, "loss": 0.2159, "num_input_tokens_seen": 1541952, "step": 1780 }, { "epoch": 0.8415841584158416, "grad_norm": 0.005852525122463703, "learning_rate": 0.298529990653315, "loss": 0.2365, "num_input_tokens_seen": 1546032, "step": 1785 }, { "epoch": 0.843941537010844, "grad_norm": 0.003291608765721321, "learning_rate": 0.29852175272336984, "loss": 0.2267, "num_input_tokens_seen": 1550480, "step": 1790 }, { "epoch": 0.8462989156058462, "grad_norm": 0.01006405521184206, "learning_rate": 0.29851349188950405, "loss": 0.2215, "num_input_tokens_seen": 1554848, "step": 1795 }, { "epoch": 0.8486562942008486, "grad_norm": 0.0038961637765169144, "learning_rate": 0.2985052081529914, "loss": 0.2577, "num_input_tokens_seen": 1559456, "step": 1800 }, { "epoch": 0.8486562942008486, "eval_loss": 0.22190262377262115, "eval_runtime": 21.9289, "eval_samples_per_second": 43.003, "eval_steps_per_second": 21.524, "num_input_tokens_seen": 1559456, "step": 1800 }, { "epoch": 0.851013672795851, "grad_norm": 0.008501654490828514, "learning_rate": 0.29849690151510944, "loss": 0.2235, "num_input_tokens_seen": 1563360, "step": 1805 }, { "epoch": 0.8533710513908533, "grad_norm": 0.003951655235141516, "learning_rate": 0.2984885719771392, "loss": 0.192, "num_input_tokens_seen": 1567232, "step": 1810 }, { "epoch": 0.8557284299858557, "grad_norm": 0.005035221576690674, "learning_rate": 0.2984802195403651, "loss": 0.2581, "num_input_tokens_seen": 1571392, "step": 1815 }, { "epoch": 0.858085808580858, "grad_norm": 0.005520319566130638, "learning_rate": 0.2984718442060752, "loss": 0.2337, "num_input_tokens_seen": 1575600, "step": 1820 }, { "epoch": 0.8604431871758604, "grad_norm": 0.15033912658691406, "learning_rate": 0.2984634459755611, "loss": 0.2505, "num_input_tokens_seen": 1580640, "step": 1825 }, { "epoch": 0.8628005657708628, "grad_norm": 0.012041726149618626, "learning_rate": 0.29845502485011793, "loss": 0.2359, "num_input_tokens_seen": 1584656, "step": 1830 }, { "epoch": 0.8651579443658651, "grad_norm": 0.009560885839164257, "learning_rate": 0.2984465808310444, "loss": 0.2254, "num_input_tokens_seen": 1588976, "step": 1835 }, { "epoch": 0.8675153229608675, "grad_norm": 0.0041495999321341515, "learning_rate": 0.29843811391964253, "loss": 0.1744, "num_input_tokens_seen": 1592928, "step": 1840 }, { "epoch": 0.8698727015558698, "grad_norm": 0.004277557134628296, "learning_rate": 0.2984296241172182, "loss": 0.2359, "num_input_tokens_seen": 1596880, "step": 1845 }, { "epoch": 0.8722300801508722, "grad_norm": 0.00727043254300952, "learning_rate": 0.29842111142508043, "loss": 0.1977, "num_input_tokens_seen": 1601440, "step": 1850 }, { "epoch": 0.8745874587458746, "grad_norm": 0.005497789941728115, "learning_rate": 0.29841257584454217, "loss": 0.2538, "num_input_tokens_seen": 1606032, "step": 1855 }, { "epoch": 0.8769448373408769, "grad_norm": 0.009605045430362225, "learning_rate": 0.29840401737691963, "loss": 0.2496, "num_input_tokens_seen": 1610720, "step": 1860 }, { "epoch": 0.8793022159358793, "grad_norm": 0.01595224067568779, "learning_rate": 0.29839543602353263, "loss": 0.2506, "num_input_tokens_seen": 1615520, "step": 1865 }, { "epoch": 0.8816595945308816, "grad_norm": 0.005318209063261747, "learning_rate": 0.2983868317857046, "loss": 0.2333, "num_input_tokens_seen": 1620416, "step": 1870 }, { "epoch": 0.884016973125884, "grad_norm": 0.006282396614551544, "learning_rate": 0.2983782046647623, "loss": 0.2395, "num_input_tokens_seen": 1624368, "step": 1875 }, { "epoch": 0.8863743517208864, "grad_norm": 0.010913478210568428, "learning_rate": 0.2983695546620362, "loss": 0.2408, "num_input_tokens_seen": 1628656, "step": 1880 }, { "epoch": 0.8887317303158887, "grad_norm": 0.005084115080535412, "learning_rate": 0.2983608817788603, "loss": 0.2264, "num_input_tokens_seen": 1633072, "step": 1885 }, { "epoch": 0.8910891089108911, "grad_norm": 0.011594077572226524, "learning_rate": 0.29835218601657193, "loss": 0.234, "num_input_tokens_seen": 1637328, "step": 1890 }, { "epoch": 0.8934464875058934, "grad_norm": 0.007585883606225252, "learning_rate": 0.29834346737651224, "loss": 0.2136, "num_input_tokens_seen": 1642304, "step": 1895 }, { "epoch": 0.8958038661008958, "grad_norm": 0.008697092533111572, "learning_rate": 0.29833472586002563, "loss": 0.2273, "num_input_tokens_seen": 1646656, "step": 1900 }, { "epoch": 0.8981612446958982, "grad_norm": 0.003734774421900511, "learning_rate": 0.29832596146846024, "loss": 0.2073, "num_input_tokens_seen": 1650720, "step": 1905 }, { "epoch": 0.9005186232909005, "grad_norm": 0.003477261168882251, "learning_rate": 0.2983171742031676, "loss": 0.2737, "num_input_tokens_seen": 1655488, "step": 1910 }, { "epoch": 0.9028760018859029, "grad_norm": 0.005279069300740957, "learning_rate": 0.2983083640655028, "loss": 0.2003, "num_input_tokens_seen": 1660016, "step": 1915 }, { "epoch": 0.9052333804809052, "grad_norm": 0.004082551691681147, "learning_rate": 0.29829953105682455, "loss": 0.2306, "num_input_tokens_seen": 1664448, "step": 1920 }, { "epoch": 0.9075907590759076, "grad_norm": 0.016790742054581642, "learning_rate": 0.29829067517849495, "loss": 0.2208, "num_input_tokens_seen": 1669312, "step": 1925 }, { "epoch": 0.90994813767091, "grad_norm": 0.0040323324501514435, "learning_rate": 0.2982817964318797, "loss": 0.2253, "num_input_tokens_seen": 1673424, "step": 1930 }, { "epoch": 0.9123055162659123, "grad_norm": 0.004748993087559938, "learning_rate": 0.298272894818348, "loss": 0.2231, "num_input_tokens_seen": 1677760, "step": 1935 }, { "epoch": 0.9146628948609147, "grad_norm": 0.004319407511502504, "learning_rate": 0.2982639703392726, "loss": 0.2162, "num_input_tokens_seen": 1682064, "step": 1940 }, { "epoch": 0.917020273455917, "grad_norm": 0.004806531593203545, "learning_rate": 0.29825502299602974, "loss": 0.2543, "num_input_tokens_seen": 1686752, "step": 1945 }, { "epoch": 0.9193776520509194, "grad_norm": 0.0034314647782593966, "learning_rate": 0.2982460527899993, "loss": 0.1798, "num_input_tokens_seen": 1691232, "step": 1950 }, { "epoch": 0.9217350306459218, "grad_norm": 0.004376592114567757, "learning_rate": 0.29823705972256453, "loss": 0.3034, "num_input_tokens_seen": 1695440, "step": 1955 }, { "epoch": 0.9240924092409241, "grad_norm": 0.0073643955402076244, "learning_rate": 0.2982280437951123, "loss": 0.2547, "num_input_tokens_seen": 1699344, "step": 1960 }, { "epoch": 0.9264497878359265, "grad_norm": 0.011101096868515015, "learning_rate": 0.298219005009033, "loss": 0.2375, "num_input_tokens_seen": 1704560, "step": 1965 }, { "epoch": 0.9288071664309288, "grad_norm": 0.007933859713375568, "learning_rate": 0.29820994336572043, "loss": 0.2094, "num_input_tokens_seen": 1709312, "step": 1970 }, { "epoch": 0.9311645450259312, "grad_norm": 0.007413928396999836, "learning_rate": 0.2982008588665721, "loss": 0.2066, "num_input_tokens_seen": 1714064, "step": 1975 }, { "epoch": 0.9335219236209336, "grad_norm": 0.005915017798542976, "learning_rate": 0.2981917515129889, "loss": 0.2916, "num_input_tokens_seen": 1718688, "step": 1980 }, { "epoch": 0.9358793022159358, "grad_norm": 0.0036994311958551407, "learning_rate": 0.2981826213063753, "loss": 0.2458, "num_input_tokens_seen": 1723744, "step": 1985 }, { "epoch": 0.9382366808109383, "grad_norm": 0.0039099776186048985, "learning_rate": 0.2981734682481394, "loss": 0.2364, "num_input_tokens_seen": 1727824, "step": 1990 }, { "epoch": 0.9405940594059405, "grad_norm": 0.0028941142372787, "learning_rate": 0.29816429233969255, "loss": 0.2007, "num_input_tokens_seen": 1731728, "step": 1995 }, { "epoch": 0.9429514380009429, "grad_norm": 0.01276901550590992, "learning_rate": 0.2981550935824499, "loss": 0.3163, "num_input_tokens_seen": 1735840, "step": 2000 }, { "epoch": 0.9429514380009429, "eval_loss": 0.22208459675312042, "eval_runtime": 21.9025, "eval_samples_per_second": 43.054, "eval_steps_per_second": 21.55, "num_input_tokens_seen": 1735840, "step": 2000 }, { "epoch": 0.9453088165959453, "grad_norm": 0.002622877014800906, "learning_rate": 0.29814587197783, "loss": 0.2442, "num_input_tokens_seen": 1739856, "step": 2005 }, { "epoch": 0.9476661951909476, "grad_norm": 0.005940092261880636, "learning_rate": 0.29813662752725495, "loss": 0.245, "num_input_tokens_seen": 1745168, "step": 2010 }, { "epoch": 0.95002357378595, "grad_norm": 0.005352016072720289, "learning_rate": 0.29812736023215025, "loss": 0.2275, "num_input_tokens_seen": 1749936, "step": 2015 }, { "epoch": 0.9523809523809523, "grad_norm": 0.006731404457241297, "learning_rate": 0.29811807009394514, "loss": 0.2488, "num_input_tokens_seen": 1754432, "step": 2020 }, { "epoch": 0.9547383309759547, "grad_norm": 0.004008446354418993, "learning_rate": 0.2981087571140723, "loss": 0.2155, "num_input_tokens_seen": 1758960, "step": 2025 }, { "epoch": 0.9570957095709571, "grad_norm": 0.0027782730758190155, "learning_rate": 0.2980994212939678, "loss": 0.1574, "num_input_tokens_seen": 1762704, "step": 2030 }, { "epoch": 0.9594530881659594, "grad_norm": 0.0029837798792868853, "learning_rate": 0.2980900626350715, "loss": 0.2808, "num_input_tokens_seen": 1767632, "step": 2035 }, { "epoch": 0.9618104667609618, "grad_norm": 0.0028322862926870584, "learning_rate": 0.29808068113882646, "loss": 0.2378, "num_input_tokens_seen": 1772032, "step": 2040 }, { "epoch": 0.9641678453559641, "grad_norm": 0.004754126071929932, "learning_rate": 0.2980712768066795, "loss": 0.2333, "num_input_tokens_seen": 1776112, "step": 2045 }, { "epoch": 0.9665252239509665, "grad_norm": 0.005631719250231981, "learning_rate": 0.2980618496400809, "loss": 0.234, "num_input_tokens_seen": 1780480, "step": 2050 }, { "epoch": 0.9688826025459689, "grad_norm": 0.004796988796442747, "learning_rate": 0.2980523996404844, "loss": 0.2292, "num_input_tokens_seen": 1784672, "step": 2055 }, { "epoch": 0.9712399811409712, "grad_norm": 0.007834039628505707, "learning_rate": 0.2980429268093473, "loss": 0.2086, "num_input_tokens_seen": 1789056, "step": 2060 }, { "epoch": 0.9735973597359736, "grad_norm": 0.010467850603163242, "learning_rate": 0.29803343114813047, "loss": 0.2401, "num_input_tokens_seen": 1793776, "step": 2065 }, { "epoch": 0.9759547383309759, "grad_norm": 0.004168745595961809, "learning_rate": 0.2980239126582983, "loss": 0.2563, "num_input_tokens_seen": 1798928, "step": 2070 }, { "epoch": 0.9783121169259783, "grad_norm": 0.011788712814450264, "learning_rate": 0.2980143713413186, "loss": 0.2368, "num_input_tokens_seen": 1803120, "step": 2075 }, { "epoch": 0.9806694955209807, "grad_norm": 0.005848876666277647, "learning_rate": 0.29800480719866274, "loss": 0.2363, "num_input_tokens_seen": 1807664, "step": 2080 }, { "epoch": 0.983026874115983, "grad_norm": 0.007544418331235647, "learning_rate": 0.2979952202318057, "loss": 0.2454, "num_input_tokens_seen": 1811344, "step": 2085 }, { "epoch": 0.9853842527109854, "grad_norm": 0.003753883996978402, "learning_rate": 0.2979856104422259, "loss": 0.2234, "num_input_tokens_seen": 1815472, "step": 2090 }, { "epoch": 0.9877416313059877, "grad_norm": 0.002804399933665991, "learning_rate": 0.2979759778314052, "loss": 0.2049, "num_input_tokens_seen": 1819744, "step": 2095 }, { "epoch": 0.9900990099009901, "grad_norm": 0.006288627162575722, "learning_rate": 0.2979663224008292, "loss": 0.2779, "num_input_tokens_seen": 1823792, "step": 2100 }, { "epoch": 0.9924563884959925, "grad_norm": 0.007085269317030907, "learning_rate": 0.2979566441519868, "loss": 0.2267, "num_input_tokens_seen": 1827632, "step": 2105 }, { "epoch": 0.9948137670909948, "grad_norm": 0.004938343074172735, "learning_rate": 0.29794694308637054, "loss": 0.2501, "num_input_tokens_seen": 1831376, "step": 2110 }, { "epoch": 0.9971711456859972, "grad_norm": 0.002113934373483062, "learning_rate": 0.2979372192054764, "loss": 0.2306, "num_input_tokens_seen": 1835840, "step": 2115 }, { "epoch": 0.9995285242809995, "grad_norm": 0.008228533901274204, "learning_rate": 0.297927472510804, "loss": 0.2152, "num_input_tokens_seen": 1840144, "step": 2120 }, { "epoch": 1.0018859028760019, "grad_norm": 0.0043676975183188915, "learning_rate": 0.29791770300385634, "loss": 0.2713, "num_input_tokens_seen": 1844720, "step": 2125 }, { "epoch": 1.0042432814710043, "grad_norm": 0.007573768496513367, "learning_rate": 0.29790791068614003, "loss": 0.2381, "num_input_tokens_seen": 1848592, "step": 2130 }, { "epoch": 1.0066006600660067, "grad_norm": 0.005674374755471945, "learning_rate": 0.2978980955591652, "loss": 0.2286, "num_input_tokens_seen": 1853344, "step": 2135 }, { "epoch": 1.0089580386610089, "grad_norm": 0.0025785774923861027, "learning_rate": 0.2978882576244454, "loss": 0.1867, "num_input_tokens_seen": 1858352, "step": 2140 }, { "epoch": 1.0113154172560113, "grad_norm": 0.0030189198441803455, "learning_rate": 0.2978783968834978, "loss": 0.2361, "num_input_tokens_seen": 1863120, "step": 2145 }, { "epoch": 1.0136727958510137, "grad_norm": 0.004002026282250881, "learning_rate": 0.29786851333784303, "loss": 0.1929, "num_input_tokens_seen": 1868640, "step": 2150 }, { "epoch": 1.016030174446016, "grad_norm": 0.004960834514349699, "learning_rate": 0.2978586069890053, "loss": 0.205, "num_input_tokens_seen": 1873088, "step": 2155 }, { "epoch": 1.0183875530410185, "grad_norm": 0.0032031885348260403, "learning_rate": 0.29784867783851227, "loss": 0.2988, "num_input_tokens_seen": 1877552, "step": 2160 }, { "epoch": 1.0207449316360206, "grad_norm": 0.005285789258778095, "learning_rate": 0.2978387258878951, "loss": 0.2183, "num_input_tokens_seen": 1882240, "step": 2165 }, { "epoch": 1.023102310231023, "grad_norm": 0.006060976069420576, "learning_rate": 0.29782875113868856, "loss": 0.2511, "num_input_tokens_seen": 1887120, "step": 2170 }, { "epoch": 1.0254596888260255, "grad_norm": 0.005063164047896862, "learning_rate": 0.2978187535924309, "loss": 0.2577, "num_input_tokens_seen": 1891312, "step": 2175 }, { "epoch": 1.0278170674210279, "grad_norm": 0.006553592626005411, "learning_rate": 0.29780873325066376, "loss": 0.2084, "num_input_tokens_seen": 1895360, "step": 2180 }, { "epoch": 1.0301744460160303, "grad_norm": 0.019467391073703766, "learning_rate": 0.2977986901149325, "loss": 0.2477, "num_input_tokens_seen": 1898704, "step": 2185 }, { "epoch": 1.0325318246110324, "grad_norm": 0.0025658246595412493, "learning_rate": 0.29778862418678587, "loss": 0.2296, "num_input_tokens_seen": 1903328, "step": 2190 }, { "epoch": 1.0348892032060348, "grad_norm": 0.003836560994386673, "learning_rate": 0.29777853546777616, "loss": 0.2438, "num_input_tokens_seen": 1907024, "step": 2195 }, { "epoch": 1.0372465818010372, "grad_norm": 0.002883586334064603, "learning_rate": 0.2977684239594592, "loss": 0.2406, "num_input_tokens_seen": 1910848, "step": 2200 }, { "epoch": 1.0372465818010372, "eval_loss": 0.24526606500148773, "eval_runtime": 21.9613, "eval_samples_per_second": 42.939, "eval_steps_per_second": 21.492, "num_input_tokens_seen": 1910848, "step": 2200 }, { "epoch": 1.0396039603960396, "grad_norm": 0.004581039305776358, "learning_rate": 0.29775828966339424, "loss": 0.2259, "num_input_tokens_seen": 1914672, "step": 2205 }, { "epoch": 1.041961338991042, "grad_norm": 0.007325735408812761, "learning_rate": 0.29774813258114424, "loss": 0.2587, "num_input_tokens_seen": 1918880, "step": 2210 }, { "epoch": 1.0443187175860442, "grad_norm": 0.004092403221875429, "learning_rate": 0.29773795271427544, "loss": 0.2302, "num_input_tokens_seen": 1923024, "step": 2215 }, { "epoch": 1.0466760961810466, "grad_norm": 0.002851843135431409, "learning_rate": 0.2977277500643577, "loss": 0.2074, "num_input_tokens_seen": 1927360, "step": 2220 }, { "epoch": 1.049033474776049, "grad_norm": 0.0043287393637001514, "learning_rate": 0.29771752463296447, "loss": 0.2367, "num_input_tokens_seen": 1931776, "step": 2225 }, { "epoch": 1.0513908533710514, "grad_norm": 0.003814509604126215, "learning_rate": 0.29770727642167266, "loss": 0.245, "num_input_tokens_seen": 1935376, "step": 2230 }, { "epoch": 1.0537482319660538, "grad_norm": 0.0050248634070158005, "learning_rate": 0.29769700543206257, "loss": 0.2181, "num_input_tokens_seen": 1939552, "step": 2235 }, { "epoch": 1.056105610561056, "grad_norm": 0.002869442105293274, "learning_rate": 0.2976867116657182, "loss": 0.2103, "num_input_tokens_seen": 1944192, "step": 2240 }, { "epoch": 1.0584629891560584, "grad_norm": 0.0022990142460912466, "learning_rate": 0.2976763951242269, "loss": 0.1913, "num_input_tokens_seen": 1948656, "step": 2245 }, { "epoch": 1.0608203677510608, "grad_norm": 0.013879192061722279, "learning_rate": 0.29766605580917965, "loss": 0.195, "num_input_tokens_seen": 1953344, "step": 2250 }, { "epoch": 1.0631777463460632, "grad_norm": 0.00637099938467145, "learning_rate": 0.29765569372217093, "loss": 0.2582, "num_input_tokens_seen": 1957392, "step": 2255 }, { "epoch": 1.0655351249410656, "grad_norm": 0.00394192710518837, "learning_rate": 0.2976453088647987, "loss": 0.1951, "num_input_tokens_seen": 1961440, "step": 2260 }, { "epoch": 1.0678925035360678, "grad_norm": 0.003272519912570715, "learning_rate": 0.2976349012386644, "loss": 0.1994, "num_input_tokens_seen": 1965504, "step": 2265 }, { "epoch": 1.0702498821310702, "grad_norm": 0.0030303613748401403, "learning_rate": 0.29762447084537297, "loss": 0.2081, "num_input_tokens_seen": 1969152, "step": 2270 }, { "epoch": 1.0726072607260726, "grad_norm": 0.0029132701456546783, "learning_rate": 0.29761401768653306, "loss": 0.2274, "num_input_tokens_seen": 1973952, "step": 2275 }, { "epoch": 1.074964639321075, "grad_norm": 0.0034993591252714396, "learning_rate": 0.29760354176375653, "loss": 0.256, "num_input_tokens_seen": 1977520, "step": 2280 }, { "epoch": 1.0773220179160774, "grad_norm": 0.007542883511632681, "learning_rate": 0.29759304307865897, "loss": 0.2172, "num_input_tokens_seen": 1981664, "step": 2285 }, { "epoch": 1.0796793965110796, "grad_norm": 0.004839949309825897, "learning_rate": 0.2975825216328594, "loss": 0.2284, "num_input_tokens_seen": 1986304, "step": 2290 }, { "epoch": 1.082036775106082, "grad_norm": 0.003058667993173003, "learning_rate": 0.2975719774279804, "loss": 0.2233, "num_input_tokens_seen": 1990400, "step": 2295 }, { "epoch": 1.0843941537010844, "grad_norm": 0.0036426065489649773, "learning_rate": 0.29756141046564794, "loss": 0.2132, "num_input_tokens_seen": 1994368, "step": 2300 }, { "epoch": 1.0867515322960868, "grad_norm": 0.0032126728910952806, "learning_rate": 0.2975508207474916, "loss": 0.245, "num_input_tokens_seen": 1999104, "step": 2305 }, { "epoch": 1.0891089108910892, "grad_norm": 0.0036312832962721586, "learning_rate": 0.2975402082751445, "loss": 0.2277, "num_input_tokens_seen": 2003248, "step": 2310 }, { "epoch": 1.0914662894860914, "grad_norm": 0.00797110516577959, "learning_rate": 0.29752957305024313, "loss": 0.2149, "num_input_tokens_seen": 2007040, "step": 2315 }, { "epoch": 1.0938236680810938, "grad_norm": 0.007661684416234493, "learning_rate": 0.2975189150744277, "loss": 0.2225, "num_input_tokens_seen": 2010720, "step": 2320 }, { "epoch": 1.0961810466760962, "grad_norm": 0.004179884679615498, "learning_rate": 0.29750823434934165, "loss": 0.2142, "num_input_tokens_seen": 2014752, "step": 2325 }, { "epoch": 1.0985384252710986, "grad_norm": 0.004948524758219719, "learning_rate": 0.29749753087663217, "loss": 0.233, "num_input_tokens_seen": 2019632, "step": 2330 }, { "epoch": 1.100895803866101, "grad_norm": 0.00805339403450489, "learning_rate": 0.29748680465794985, "loss": 0.2286, "num_input_tokens_seen": 2023712, "step": 2335 }, { "epoch": 1.1032531824611032, "grad_norm": 0.003974564839154482, "learning_rate": 0.29747605569494884, "loss": 0.2345, "num_input_tokens_seen": 2027680, "step": 2340 }, { "epoch": 1.1056105610561056, "grad_norm": 0.007854514755308628, "learning_rate": 0.29746528398928673, "loss": 0.241, "num_input_tokens_seen": 2032704, "step": 2345 }, { "epoch": 1.107967939651108, "grad_norm": 0.007960119284689426, "learning_rate": 0.2974544895426247, "loss": 0.2517, "num_input_tokens_seen": 2036160, "step": 2350 }, { "epoch": 1.1103253182461104, "grad_norm": 0.008488879539072514, "learning_rate": 0.29744367235662733, "loss": 0.239, "num_input_tokens_seen": 2040240, "step": 2355 }, { "epoch": 1.1126826968411128, "grad_norm": 0.020646855235099792, "learning_rate": 0.29743283243296276, "loss": 0.2545, "num_input_tokens_seen": 2046224, "step": 2360 }, { "epoch": 1.115040075436115, "grad_norm": 0.005945291370153427, "learning_rate": 0.29742196977330276, "loss": 0.226, "num_input_tokens_seen": 2050400, "step": 2365 }, { "epoch": 1.1173974540311173, "grad_norm": 0.010240446776151657, "learning_rate": 0.2974110843793223, "loss": 0.2393, "num_input_tokens_seen": 2055904, "step": 2370 }, { "epoch": 1.1197548326261197, "grad_norm": 0.011086949147284031, "learning_rate": 0.2974001762527002, "loss": 0.2397, "num_input_tokens_seen": 2060208, "step": 2375 }, { "epoch": 1.1221122112211221, "grad_norm": 0.011045214720070362, "learning_rate": 0.2973892453951186, "loss": 0.2349, "num_input_tokens_seen": 2064128, "step": 2380 }, { "epoch": 1.1244695898161245, "grad_norm": 0.053821492940187454, "learning_rate": 0.2973782918082631, "loss": 0.2441, "num_input_tokens_seen": 2068448, "step": 2385 }, { "epoch": 1.1268269684111267, "grad_norm": 0.009985426440834999, "learning_rate": 0.29736731549382295, "loss": 0.2119, "num_input_tokens_seen": 2072784, "step": 2390 }, { "epoch": 1.1291843470061291, "grad_norm": 0.01207641139626503, "learning_rate": 0.2973563164534908, "loss": 0.2299, "num_input_tokens_seen": 2076656, "step": 2395 }, { "epoch": 1.1315417256011315, "grad_norm": 0.004354174714535475, "learning_rate": 0.29734529468896287, "loss": 0.1931, "num_input_tokens_seen": 2081696, "step": 2400 }, { "epoch": 1.1315417256011315, "eval_loss": 0.22883723676204681, "eval_runtime": 21.9429, "eval_samples_per_second": 42.975, "eval_steps_per_second": 21.51, "num_input_tokens_seen": 2081696, "step": 2400 }, { "epoch": 1.133899104196134, "grad_norm": 0.009082062169909477, "learning_rate": 0.2973342502019388, "loss": 0.1991, "num_input_tokens_seen": 2085824, "step": 2405 }, { "epoch": 1.1362564827911363, "grad_norm": 0.006109756883233786, "learning_rate": 0.2973231829941219, "loss": 0.2215, "num_input_tokens_seen": 2090032, "step": 2410 }, { "epoch": 1.1386138613861387, "grad_norm": 0.008330211974680424, "learning_rate": 0.2973120930672188, "loss": 0.2293, "num_input_tokens_seen": 2093968, "step": 2415 }, { "epoch": 1.140971239981141, "grad_norm": 0.005231280345469713, "learning_rate": 0.2973009804229397, "loss": 0.2146, "num_input_tokens_seen": 2098352, "step": 2420 }, { "epoch": 1.1433286185761433, "grad_norm": 0.008973370306193829, "learning_rate": 0.29728984506299827, "loss": 0.2486, "num_input_tokens_seen": 2102624, "step": 2425 }, { "epoch": 1.1456859971711457, "grad_norm": 0.007892332971096039, "learning_rate": 0.2972786869891118, "loss": 0.2478, "num_input_tokens_seen": 2106672, "step": 2430 }, { "epoch": 1.1480433757661481, "grad_norm": 0.006100038066506386, "learning_rate": 0.29726750620300096, "loss": 0.2144, "num_input_tokens_seen": 2111472, "step": 2435 }, { "epoch": 1.1504007543611503, "grad_norm": 0.006571868900209665, "learning_rate": 0.29725630270639003, "loss": 0.2417, "num_input_tokens_seen": 2116624, "step": 2440 }, { "epoch": 1.1527581329561527, "grad_norm": 0.006925846450030804, "learning_rate": 0.2972450765010067, "loss": 0.2399, "num_input_tokens_seen": 2121456, "step": 2445 }, { "epoch": 1.155115511551155, "grad_norm": 0.004056949634104967, "learning_rate": 0.29723382758858213, "loss": 0.2124, "num_input_tokens_seen": 2125808, "step": 2450 }, { "epoch": 1.1574728901461575, "grad_norm": 0.00938568264245987, "learning_rate": 0.29722255597085107, "loss": 0.2284, "num_input_tokens_seen": 2131184, "step": 2455 }, { "epoch": 1.15983026874116, "grad_norm": 0.010969400405883789, "learning_rate": 0.2972112616495518, "loss": 0.2507, "num_input_tokens_seen": 2135600, "step": 2460 }, { "epoch": 1.1621876473361623, "grad_norm": 0.009343928657472134, "learning_rate": 0.297199944626426, "loss": 0.2142, "num_input_tokens_seen": 2140848, "step": 2465 }, { "epoch": 1.1645450259311645, "grad_norm": 0.01069515198469162, "learning_rate": 0.2971886049032189, "loss": 0.2194, "num_input_tokens_seen": 2144624, "step": 2470 }, { "epoch": 1.166902404526167, "grad_norm": 0.004001745954155922, "learning_rate": 0.29717724248167926, "loss": 0.2042, "num_input_tokens_seen": 2148752, "step": 2475 }, { "epoch": 1.1692597831211693, "grad_norm": 0.006407513283193111, "learning_rate": 0.29716585736355927, "loss": 0.2258, "num_input_tokens_seen": 2152624, "step": 2480 }, { "epoch": 1.1716171617161717, "grad_norm": 0.0050386106595396996, "learning_rate": 0.2971544495506147, "loss": 0.1263, "num_input_tokens_seen": 2157200, "step": 2485 }, { "epoch": 1.1739745403111739, "grad_norm": 0.013103808276355267, "learning_rate": 0.2971430190446048, "loss": 0.2598, "num_input_tokens_seen": 2161664, "step": 2490 }, { "epoch": 1.1763319189061763, "grad_norm": 0.0070473793894052505, "learning_rate": 0.2971315658472921, "loss": 0.2257, "num_input_tokens_seen": 2166464, "step": 2495 }, { "epoch": 1.1786892975011787, "grad_norm": 0.0071532223373651505, "learning_rate": 0.2971200899604431, "loss": 0.2438, "num_input_tokens_seen": 2170304, "step": 2500 }, { "epoch": 1.181046676096181, "grad_norm": 0.01070361863821745, "learning_rate": 0.29710859138582735, "loss": 0.2402, "num_input_tokens_seen": 2175696, "step": 2505 }, { "epoch": 1.1834040546911835, "grad_norm": 0.00840723142027855, "learning_rate": 0.29709707012521813, "loss": 0.2467, "num_input_tokens_seen": 2180240, "step": 2510 }, { "epoch": 1.1857614332861859, "grad_norm": 0.02045406401157379, "learning_rate": 0.29708552618039213, "loss": 0.269, "num_input_tokens_seen": 2184736, "step": 2515 }, { "epoch": 1.188118811881188, "grad_norm": 0.008667693473398685, "learning_rate": 0.2970739595531296, "loss": 0.2022, "num_input_tokens_seen": 2189072, "step": 2520 }, { "epoch": 1.1904761904761905, "grad_norm": 0.004271699581295252, "learning_rate": 0.2970623702452143, "loss": 0.2602, "num_input_tokens_seen": 2193680, "step": 2525 }, { "epoch": 1.1928335690711929, "grad_norm": 0.005421479698270559, "learning_rate": 0.2970507582584334, "loss": 0.2283, "num_input_tokens_seen": 2197968, "step": 2530 }, { "epoch": 1.1951909476661953, "grad_norm": 0.009646003134548664, "learning_rate": 0.2970391235945776, "loss": 0.2308, "num_input_tokens_seen": 2202096, "step": 2535 }, { "epoch": 1.1975483262611974, "grad_norm": 0.004087352659553289, "learning_rate": 0.2970274662554412, "loss": 0.2276, "num_input_tokens_seen": 2205728, "step": 2540 }, { "epoch": 1.1999057048561999, "grad_norm": 0.0040877158753573895, "learning_rate": 0.2970157862428218, "loss": 0.2234, "num_input_tokens_seen": 2210128, "step": 2545 }, { "epoch": 1.2022630834512023, "grad_norm": 0.0038329861126840115, "learning_rate": 0.2970040835585206, "loss": 0.2037, "num_input_tokens_seen": 2214336, "step": 2550 }, { "epoch": 1.2046204620462047, "grad_norm": 0.005805865861475468, "learning_rate": 0.2969923582043424, "loss": 0.2345, "num_input_tokens_seen": 2217968, "step": 2555 }, { "epoch": 1.206977840641207, "grad_norm": 0.0071100834757089615, "learning_rate": 0.2969806101820953, "loss": 0.1823, "num_input_tokens_seen": 2221888, "step": 2560 }, { "epoch": 1.2093352192362095, "grad_norm": 0.009370304644107819, "learning_rate": 0.2969688394935911, "loss": 0.2358, "num_input_tokens_seen": 2226000, "step": 2565 }, { "epoch": 1.2116925978312116, "grad_norm": 0.005700339563190937, "learning_rate": 0.2969570461406449, "loss": 0.262, "num_input_tokens_seen": 2230160, "step": 2570 }, { "epoch": 1.214049976426214, "grad_norm": 0.0029907983262091875, "learning_rate": 0.29694523012507534, "loss": 0.2123, "num_input_tokens_seen": 2234192, "step": 2575 }, { "epoch": 1.2164073550212164, "grad_norm": 0.007089912425726652, "learning_rate": 0.2969333914487048, "loss": 0.244, "num_input_tokens_seen": 2238400, "step": 2580 }, { "epoch": 1.2187647336162188, "grad_norm": 0.0034851315431296825, "learning_rate": 0.2969215301133587, "loss": 0.2459, "num_input_tokens_seen": 2242608, "step": 2585 }, { "epoch": 1.221122112211221, "grad_norm": 0.0042421650141477585, "learning_rate": 0.29690964612086634, "loss": 0.2201, "num_input_tokens_seen": 2247104, "step": 2590 }, { "epoch": 1.2234794908062234, "grad_norm": 0.003272887784987688, "learning_rate": 0.2968977394730604, "loss": 0.2121, "num_input_tokens_seen": 2251328, "step": 2595 }, { "epoch": 1.2258368694012258, "grad_norm": 0.005509421695023775, "learning_rate": 0.296885810171777, "loss": 0.2177, "num_input_tokens_seen": 2255952, "step": 2600 }, { "epoch": 1.2258368694012258, "eval_loss": 0.2199508547782898, "eval_runtime": 21.883, "eval_samples_per_second": 43.093, "eval_steps_per_second": 21.569, "num_input_tokens_seen": 2255952, "step": 2600 }, { "epoch": 1.2281942479962282, "grad_norm": 0.002827633870765567, "learning_rate": 0.2968738582188558, "loss": 0.1916, "num_input_tokens_seen": 2260160, "step": 2605 }, { "epoch": 1.2305516265912306, "grad_norm": 0.0024956511333584785, "learning_rate": 0.2968618836161399, "loss": 0.2045, "num_input_tokens_seen": 2264016, "step": 2610 }, { "epoch": 1.232909005186233, "grad_norm": 0.005286719184368849, "learning_rate": 0.296849886365476, "loss": 0.2092, "num_input_tokens_seen": 2267856, "step": 2615 }, { "epoch": 1.2352663837812352, "grad_norm": 0.0038813233841210604, "learning_rate": 0.2968378664687142, "loss": 0.2899, "num_input_tokens_seen": 2271808, "step": 2620 }, { "epoch": 1.2376237623762376, "grad_norm": 0.004714853595942259, "learning_rate": 0.296825823927708, "loss": 0.2136, "num_input_tokens_seen": 2276208, "step": 2625 }, { "epoch": 1.23998114097124, "grad_norm": 0.003636020701378584, "learning_rate": 0.29681375874431476, "loss": 0.2322, "num_input_tokens_seen": 2279968, "step": 2630 }, { "epoch": 1.2423385195662424, "grad_norm": 0.004139642231166363, "learning_rate": 0.29680167092039483, "loss": 0.2354, "num_input_tokens_seen": 2284208, "step": 2635 }, { "epoch": 1.2446958981612446, "grad_norm": 0.007399015594273806, "learning_rate": 0.2967895604578125, "loss": 0.2104, "num_input_tokens_seen": 2288944, "step": 2640 }, { "epoch": 1.247053276756247, "grad_norm": 0.002826611278578639, "learning_rate": 0.2967774273584352, "loss": 0.2437, "num_input_tokens_seen": 2293008, "step": 2645 }, { "epoch": 1.2494106553512494, "grad_norm": 0.00905651692301035, "learning_rate": 0.2967652716241342, "loss": 0.2396, "num_input_tokens_seen": 2298240, "step": 2650 }, { "epoch": 1.2517680339462518, "grad_norm": 0.00439093355089426, "learning_rate": 0.29675309325678384, "loss": 0.2495, "num_input_tokens_seen": 2302832, "step": 2655 }, { "epoch": 1.2541254125412542, "grad_norm": 0.00828529056161642, "learning_rate": 0.29674089225826233, "loss": 0.2587, "num_input_tokens_seen": 2306576, "step": 2660 }, { "epoch": 1.2564827911362566, "grad_norm": 0.002483899239450693, "learning_rate": 0.29672866863045116, "loss": 0.2288, "num_input_tokens_seen": 2310784, "step": 2665 }, { "epoch": 1.2588401697312588, "grad_norm": 0.007964280433952808, "learning_rate": 0.2967164223752354, "loss": 0.2377, "num_input_tokens_seen": 2314912, "step": 2670 }, { "epoch": 1.2611975483262612, "grad_norm": 0.013144832104444504, "learning_rate": 0.2967041534945035, "loss": 0.2334, "num_input_tokens_seen": 2319344, "step": 2675 }, { "epoch": 1.2635549269212636, "grad_norm": 0.005364317912608385, "learning_rate": 0.2966918619901476, "loss": 0.2073, "num_input_tokens_seen": 2323568, "step": 2680 }, { "epoch": 1.265912305516266, "grad_norm": 0.004201872274279594, "learning_rate": 0.2966795478640631, "loss": 0.1986, "num_input_tokens_seen": 2328048, "step": 2685 }, { "epoch": 1.2682696841112682, "grad_norm": 0.011573298834264278, "learning_rate": 0.29666721111814903, "loss": 0.2755, "num_input_tokens_seen": 2331712, "step": 2690 }, { "epoch": 1.2706270627062706, "grad_norm": 0.011270738206803799, "learning_rate": 0.2966548517543079, "loss": 0.2599, "num_input_tokens_seen": 2335808, "step": 2695 }, { "epoch": 1.272984441301273, "grad_norm": 0.01084190420806408, "learning_rate": 0.29664246977444564, "loss": 0.2541, "num_input_tokens_seen": 2340032, "step": 2700 }, { "epoch": 1.2753418198962754, "grad_norm": 0.005595136433839798, "learning_rate": 0.2966300651804717, "loss": 0.2465, "num_input_tokens_seen": 2344096, "step": 2705 }, { "epoch": 1.2776991984912778, "grad_norm": 0.006088255438953638, "learning_rate": 0.296617637974299, "loss": 0.2076, "num_input_tokens_seen": 2348096, "step": 2710 }, { "epoch": 1.2800565770862802, "grad_norm": 0.004901167005300522, "learning_rate": 0.2966051881578441, "loss": 0.2311, "num_input_tokens_seen": 2352304, "step": 2715 }, { "epoch": 1.2824139556812824, "grad_norm": 0.007242193445563316, "learning_rate": 0.29659271573302676, "loss": 0.2566, "num_input_tokens_seen": 2356736, "step": 2720 }, { "epoch": 1.2847713342762848, "grad_norm": 0.006228514015674591, "learning_rate": 0.2965802207017705, "loss": 0.2045, "num_input_tokens_seen": 2361088, "step": 2725 }, { "epoch": 1.2871287128712872, "grad_norm": 0.005607002880424261, "learning_rate": 0.2965677030660021, "loss": 0.1976, "num_input_tokens_seen": 2364976, "step": 2730 }, { "epoch": 1.2894860914662896, "grad_norm": 0.005090066231787205, "learning_rate": 0.2965551628276521, "loss": 0.2165, "num_input_tokens_seen": 2369792, "step": 2735 }, { "epoch": 1.2918434700612917, "grad_norm": 0.01625673845410347, "learning_rate": 0.29654259998865423, "loss": 0.2577, "num_input_tokens_seen": 2374064, "step": 2740 }, { "epoch": 1.2942008486562941, "grad_norm": 0.0029056889470666647, "learning_rate": 0.2965300145509458, "loss": 0.2539, "num_input_tokens_seen": 2378624, "step": 2745 }, { "epoch": 1.2965582272512965, "grad_norm": 0.0027273704763501883, "learning_rate": 0.2965174065164678, "loss": 0.2294, "num_input_tokens_seen": 2383088, "step": 2750 }, { "epoch": 1.298915605846299, "grad_norm": 0.004698619712144136, "learning_rate": 0.2965047758871644, "loss": 0.2362, "num_input_tokens_seen": 2387952, "step": 2755 }, { "epoch": 1.3012729844413014, "grad_norm": 0.0027133452240377665, "learning_rate": 0.2964921226649835, "loss": 0.2409, "num_input_tokens_seen": 2391984, "step": 2760 }, { "epoch": 1.3036303630363038, "grad_norm": 0.004819863010197878, "learning_rate": 0.2964794468518763, "loss": 0.2342, "num_input_tokens_seen": 2396944, "step": 2765 }, { "epoch": 1.305987741631306, "grad_norm": 0.0041705030016601086, "learning_rate": 0.2964667484497977, "loss": 0.224, "num_input_tokens_seen": 2401200, "step": 2770 }, { "epoch": 1.3083451202263083, "grad_norm": 0.0023280326277017593, "learning_rate": 0.29645402746070587, "loss": 0.2027, "num_input_tokens_seen": 2405936, "step": 2775 }, { "epoch": 1.3107024988213107, "grad_norm": 0.006026082206517458, "learning_rate": 0.2964412838865625, "loss": 0.2429, "num_input_tokens_seen": 2410256, "step": 2780 }, { "epoch": 1.3130598774163131, "grad_norm": 0.008059031330049038, "learning_rate": 0.29642851772933293, "loss": 0.2647, "num_input_tokens_seen": 2415136, "step": 2785 }, { "epoch": 1.3154172560113153, "grad_norm": 0.005650942679494619, "learning_rate": 0.29641572899098567, "loss": 0.225, "num_input_tokens_seen": 2418976, "step": 2790 }, { "epoch": 1.3177746346063177, "grad_norm": 0.004146013408899307, "learning_rate": 0.29640291767349314, "loss": 0.2144, "num_input_tokens_seen": 2422768, "step": 2795 }, { "epoch": 1.3201320132013201, "grad_norm": 0.006985373329371214, "learning_rate": 0.2963900837788308, "loss": 0.2397, "num_input_tokens_seen": 2427152, "step": 2800 }, { "epoch": 1.3201320132013201, "eval_loss": 0.21909011900424957, "eval_runtime": 21.8711, "eval_samples_per_second": 43.116, "eval_steps_per_second": 21.581, "num_input_tokens_seen": 2427152, "step": 2800 }, { "epoch": 1.3224893917963225, "grad_norm": 0.006013814825564623, "learning_rate": 0.2963772273089779, "loss": 0.2284, "num_input_tokens_seen": 2431568, "step": 2805 }, { "epoch": 1.324846770391325, "grad_norm": 0.0063807847909629345, "learning_rate": 0.2963643482659171, "loss": 0.2246, "num_input_tokens_seen": 2435680, "step": 2810 }, { "epoch": 1.3272041489863273, "grad_norm": 0.0033262642100453377, "learning_rate": 0.2963514466516345, "loss": 0.241, "num_input_tokens_seen": 2439456, "step": 2815 }, { "epoch": 1.3295615275813295, "grad_norm": 0.003506594803184271, "learning_rate": 0.2963385224681196, "loss": 0.2324, "num_input_tokens_seen": 2444048, "step": 2820 }, { "epoch": 1.331918906176332, "grad_norm": 0.0029124070424586535, "learning_rate": 0.29632557571736556, "loss": 0.1984, "num_input_tokens_seen": 2449792, "step": 2825 }, { "epoch": 1.3342762847713343, "grad_norm": 0.007220807950943708, "learning_rate": 0.2963126064013689, "loss": 0.2138, "num_input_tokens_seen": 2454688, "step": 2830 }, { "epoch": 1.3366336633663367, "grad_norm": 0.007118983659893274, "learning_rate": 0.29629961452212966, "loss": 0.2404, "num_input_tokens_seen": 2458480, "step": 2835 }, { "epoch": 1.338991041961339, "grad_norm": 0.003447006456553936, "learning_rate": 0.2962866000816513, "loss": 0.2265, "num_input_tokens_seen": 2462656, "step": 2840 }, { "epoch": 1.3413484205563413, "grad_norm": 0.0035183795262128115, "learning_rate": 0.2962735630819409, "loss": 0.2203, "num_input_tokens_seen": 2466960, "step": 2845 }, { "epoch": 1.3437057991513437, "grad_norm": 0.003590841544792056, "learning_rate": 0.2962605035250089, "loss": 0.2224, "num_input_tokens_seen": 2472096, "step": 2850 }, { "epoch": 1.346063177746346, "grad_norm": 0.0050714765675365925, "learning_rate": 0.29624742141286914, "loss": 0.2143, "num_input_tokens_seen": 2476624, "step": 2855 }, { "epoch": 1.3484205563413485, "grad_norm": 0.008710482157766819, "learning_rate": 0.29623431674753925, "loss": 0.2095, "num_input_tokens_seen": 2480400, "step": 2860 }, { "epoch": 1.350777934936351, "grad_norm": 0.00221889466047287, "learning_rate": 0.29622118953103993, "loss": 0.2403, "num_input_tokens_seen": 2484416, "step": 2865 }, { "epoch": 1.353135313531353, "grad_norm": 0.0035844026133418083, "learning_rate": 0.2962080397653957, "loss": 0.2343, "num_input_tokens_seen": 2489040, "step": 2870 }, { "epoch": 1.3554926921263555, "grad_norm": 0.00659441901370883, "learning_rate": 0.29619486745263435, "loss": 0.2413, "num_input_tokens_seen": 2493328, "step": 2875 }, { "epoch": 1.3578500707213579, "grad_norm": 0.005109064280986786, "learning_rate": 0.2961816725947873, "loss": 0.2496, "num_input_tokens_seen": 2496928, "step": 2880 }, { "epoch": 1.3602074493163603, "grad_norm": 0.0039333379827439785, "learning_rate": 0.29616845519388924, "loss": 0.2291, "num_input_tokens_seen": 2502416, "step": 2885 }, { "epoch": 1.3625648279113625, "grad_norm": 0.008502107113599777, "learning_rate": 0.2961552152519785, "loss": 0.2331, "num_input_tokens_seen": 2507296, "step": 2890 }, { "epoch": 1.3649222065063649, "grad_norm": 0.008395630866289139, "learning_rate": 0.29614195277109695, "loss": 0.2518, "num_input_tokens_seen": 2511264, "step": 2895 }, { "epoch": 1.3672795851013673, "grad_norm": 0.0033641664776951075, "learning_rate": 0.2961286677532897, "loss": 0.2394, "num_input_tokens_seen": 2515616, "step": 2900 }, { "epoch": 1.3696369636963697, "grad_norm": 0.005667990539222956, "learning_rate": 0.2961153602006055, "loss": 0.2214, "num_input_tokens_seen": 2520320, "step": 2905 }, { "epoch": 1.371994342291372, "grad_norm": 0.009371232241392136, "learning_rate": 0.29610203011509656, "loss": 0.2466, "num_input_tokens_seen": 2524624, "step": 2910 }, { "epoch": 1.3743517208863745, "grad_norm": 0.00920378603041172, "learning_rate": 0.29608867749881856, "loss": 0.2427, "num_input_tokens_seen": 2529632, "step": 2915 }, { "epoch": 1.3767090994813767, "grad_norm": 0.006576639134436846, "learning_rate": 0.29607530235383067, "loss": 0.2258, "num_input_tokens_seen": 2533664, "step": 2920 }, { "epoch": 1.379066478076379, "grad_norm": 0.005481430795043707, "learning_rate": 0.2960619046821954, "loss": 0.2244, "num_input_tokens_seen": 2537792, "step": 2925 }, { "epoch": 1.3814238566713815, "grad_norm": 0.0025994509924203157, "learning_rate": 0.2960484844859789, "loss": 0.1885, "num_input_tokens_seen": 2542368, "step": 2930 }, { "epoch": 1.3837812352663839, "grad_norm": 0.005182967986911535, "learning_rate": 0.29603504176725076, "loss": 0.2249, "num_input_tokens_seen": 2546512, "step": 2935 }, { "epoch": 1.386138613861386, "grad_norm": 0.007806459907442331, "learning_rate": 0.296021576528084, "loss": 0.2492, "num_input_tokens_seen": 2550400, "step": 2940 }, { "epoch": 1.3884959924563884, "grad_norm": 0.005865577608346939, "learning_rate": 0.29600808877055507, "loss": 0.2041, "num_input_tokens_seen": 2554480, "step": 2945 }, { "epoch": 1.3908533710513908, "grad_norm": 0.007396891713142395, "learning_rate": 0.29599457849674404, "loss": 0.2373, "num_input_tokens_seen": 2558576, "step": 2950 }, { "epoch": 1.3932107496463932, "grad_norm": 0.005331482272595167, "learning_rate": 0.2959810457087343, "loss": 0.2488, "num_input_tokens_seen": 2562768, "step": 2955 }, { "epoch": 1.3955681282413956, "grad_norm": 0.0031363656744360924, "learning_rate": 0.2959674904086128, "loss": 0.2439, "num_input_tokens_seen": 2566832, "step": 2960 }, { "epoch": 1.397925506836398, "grad_norm": 0.0062361364252865314, "learning_rate": 0.2959539125984699, "loss": 0.2227, "num_input_tokens_seen": 2571824, "step": 2965 }, { "epoch": 1.4002828854314002, "grad_norm": 0.008645808324217796, "learning_rate": 0.2959403122803996, "loss": 0.2126, "num_input_tokens_seen": 2576160, "step": 2970 }, { "epoch": 1.4026402640264026, "grad_norm": 0.002940503181889653, "learning_rate": 0.2959266894564991, "loss": 0.1898, "num_input_tokens_seen": 2580208, "step": 2975 }, { "epoch": 1.404997642621405, "grad_norm": 0.0023231273517012596, "learning_rate": 0.2959130441288692, "loss": 0.2433, "num_input_tokens_seen": 2583952, "step": 2980 }, { "epoch": 1.4073550212164074, "grad_norm": 0.0023128725588321686, "learning_rate": 0.2958993762996143, "loss": 0.1761, "num_input_tokens_seen": 2587856, "step": 2985 }, { "epoch": 1.4097123998114096, "grad_norm": 0.006500492803752422, "learning_rate": 0.2958856859708421, "loss": 0.213, "num_input_tokens_seen": 2592784, "step": 2990 }, { "epoch": 1.412069778406412, "grad_norm": 0.005657103378325701, "learning_rate": 0.2958719731446638, "loss": 0.1953, "num_input_tokens_seen": 2596864, "step": 2995 }, { "epoch": 1.4144271570014144, "grad_norm": 0.002139743883162737, "learning_rate": 0.29585823782319404, "loss": 0.1923, "num_input_tokens_seen": 2601296, "step": 3000 }, { "epoch": 1.4144271570014144, "eval_loss": 0.22986771166324615, "eval_runtime": 21.9265, "eval_samples_per_second": 43.007, "eval_steps_per_second": 21.527, "num_input_tokens_seen": 2601296, "step": 3000 }, { "epoch": 1.4167845355964168, "grad_norm": 0.006443341728299856, "learning_rate": 0.2958444800085511, "loss": 0.2067, "num_input_tokens_seen": 2604832, "step": 3005 }, { "epoch": 1.4191419141914192, "grad_norm": 0.005475982092320919, "learning_rate": 0.2958306997028565, "loss": 0.2021, "num_input_tokens_seen": 2608896, "step": 3010 }, { "epoch": 1.4214992927864216, "grad_norm": 0.003305422607809305, "learning_rate": 0.2958168969082354, "loss": 0.2454, "num_input_tokens_seen": 2612960, "step": 3015 }, { "epoch": 1.4238566713814238, "grad_norm": 0.003398105502128601, "learning_rate": 0.2958030716268164, "loss": 0.2323, "num_input_tokens_seen": 2616976, "step": 3020 }, { "epoch": 1.4262140499764262, "grad_norm": 0.007261589169502258, "learning_rate": 0.2957892238607314, "loss": 0.2097, "num_input_tokens_seen": 2620928, "step": 3025 }, { "epoch": 1.4285714285714286, "grad_norm": 0.002714838832616806, "learning_rate": 0.2957753536121161, "loss": 0.2191, "num_input_tokens_seen": 2625328, "step": 3030 }, { "epoch": 1.430928807166431, "grad_norm": 0.0059647406451404095, "learning_rate": 0.29576146088310923, "loss": 0.1983, "num_input_tokens_seen": 2629824, "step": 3035 }, { "epoch": 1.4332861857614332, "grad_norm": 0.0041071041487157345, "learning_rate": 0.2957475456758533, "loss": 0.2821, "num_input_tokens_seen": 2633568, "step": 3040 }, { "epoch": 1.4356435643564356, "grad_norm": 0.005326292477548122, "learning_rate": 0.2957336079924944, "loss": 0.1413, "num_input_tokens_seen": 2638400, "step": 3045 }, { "epoch": 1.438000942951438, "grad_norm": 0.009144378826022148, "learning_rate": 0.2957196478351816, "loss": 0.2384, "num_input_tokens_seen": 2643200, "step": 3050 }, { "epoch": 1.4403583215464404, "grad_norm": 0.0028494142461568117, "learning_rate": 0.295705665206068, "loss": 0.1904, "num_input_tokens_seen": 2647120, "step": 3055 }, { "epoch": 1.4427157001414428, "grad_norm": 0.011923309415578842, "learning_rate": 0.2956916601073097, "loss": 0.2623, "num_input_tokens_seen": 2651088, "step": 3060 }, { "epoch": 1.4450730787364452, "grad_norm": 0.005525130778551102, "learning_rate": 0.29567763254106655, "loss": 0.257, "num_input_tokens_seen": 2655616, "step": 3065 }, { "epoch": 1.4474304573314474, "grad_norm": 0.003473920514807105, "learning_rate": 0.29566358250950175, "loss": 0.231, "num_input_tokens_seen": 2659680, "step": 3070 }, { "epoch": 1.4497878359264498, "grad_norm": 0.004484968725591898, "learning_rate": 0.295649510014782, "loss": 0.2706, "num_input_tokens_seen": 2664048, "step": 3075 }, { "epoch": 1.4521452145214522, "grad_norm": 0.005521675106137991, "learning_rate": 0.2956354150590775, "loss": 0.228, "num_input_tokens_seen": 2668464, "step": 3080 }, { "epoch": 1.4545025931164546, "grad_norm": 0.009474598802626133, "learning_rate": 0.2956212976445618, "loss": 0.2345, "num_input_tokens_seen": 2672528, "step": 3085 }, { "epoch": 1.4568599717114568, "grad_norm": 0.005432508420199156, "learning_rate": 0.295607157773412, "loss": 0.1872, "num_input_tokens_seen": 2677152, "step": 3090 }, { "epoch": 1.4592173503064592, "grad_norm": 0.0029296104330569506, "learning_rate": 0.2955929954478087, "loss": 0.1983, "num_input_tokens_seen": 2681600, "step": 3095 }, { "epoch": 1.4615747289014616, "grad_norm": 0.013688499107956886, "learning_rate": 0.29557881066993585, "loss": 0.2312, "num_input_tokens_seen": 2686096, "step": 3100 }, { "epoch": 1.463932107496464, "grad_norm": 0.0052721574902534485, "learning_rate": 0.29556460344198093, "loss": 0.3167, "num_input_tokens_seen": 2690048, "step": 3105 }, { "epoch": 1.4662894860914664, "grad_norm": 0.005750676151365042, "learning_rate": 0.29555037376613486, "loss": 0.2292, "num_input_tokens_seen": 2693824, "step": 3110 }, { "epoch": 1.4686468646864688, "grad_norm": 0.0030463619623333216, "learning_rate": 0.29553612164459203, "loss": 0.238, "num_input_tokens_seen": 2698448, "step": 3115 }, { "epoch": 1.471004243281471, "grad_norm": 0.0037915317807346582, "learning_rate": 0.29552184707955037, "loss": 0.2368, "num_input_tokens_seen": 2702480, "step": 3120 }, { "epoch": 1.4733616218764733, "grad_norm": 0.004207046236842871, "learning_rate": 0.29550755007321117, "loss": 0.2277, "num_input_tokens_seen": 2706640, "step": 3125 }, { "epoch": 1.4757190004714758, "grad_norm": 0.003830442437902093, "learning_rate": 0.29549323062777916, "loss": 0.2376, "num_input_tokens_seen": 2711056, "step": 3130 }, { "epoch": 1.4780763790664782, "grad_norm": 0.003404313465580344, "learning_rate": 0.29547888874546263, "loss": 0.2209, "num_input_tokens_seen": 2715024, "step": 3135 }, { "epoch": 1.4804337576614803, "grad_norm": 0.0019108677515760064, "learning_rate": 0.2954645244284732, "loss": 0.2028, "num_input_tokens_seen": 2719120, "step": 3140 }, { "epoch": 1.4827911362564827, "grad_norm": 0.010326757095754147, "learning_rate": 0.2954501376790261, "loss": 0.2393, "num_input_tokens_seen": 2722992, "step": 3145 }, { "epoch": 1.4851485148514851, "grad_norm": 0.006713184993714094, "learning_rate": 0.29543572849933997, "loss": 0.22, "num_input_tokens_seen": 2727408, "step": 3150 }, { "epoch": 1.4875058934464875, "grad_norm": 0.005542915314435959, "learning_rate": 0.2954212968916368, "loss": 0.1752, "num_input_tokens_seen": 2731008, "step": 3155 }, { "epoch": 1.48986327204149, "grad_norm": 0.004811374936252832, "learning_rate": 0.29540684285814217, "loss": 0.2303, "num_input_tokens_seen": 2735456, "step": 3160 }, { "epoch": 1.4922206506364923, "grad_norm": 0.004465905018150806, "learning_rate": 0.2953923664010851, "loss": 0.2616, "num_input_tokens_seen": 2740544, "step": 3165 }, { "epoch": 1.4945780292314945, "grad_norm": 0.009649048559367657, "learning_rate": 0.295377867522698, "loss": 0.2241, "num_input_tokens_seen": 2745424, "step": 3170 }, { "epoch": 1.496935407826497, "grad_norm": 0.003139410400763154, "learning_rate": 0.2953633462252168, "loss": 0.2369, "num_input_tokens_seen": 2749536, "step": 3175 }, { "epoch": 1.4992927864214993, "grad_norm": 0.006606172304600477, "learning_rate": 0.2953488025108809, "loss": 0.2072, "num_input_tokens_seen": 2755152, "step": 3180 }, { "epoch": 1.5016501650165015, "grad_norm": 0.005086277145892382, "learning_rate": 0.295334236381933, "loss": 0.1791, "num_input_tokens_seen": 2760736, "step": 3185 }, { "epoch": 1.504007543611504, "grad_norm": 0.014225717633962631, "learning_rate": 0.29531964784061954, "loss": 0.2315, "num_input_tokens_seen": 2765776, "step": 3190 }, { "epoch": 1.5063649222065063, "grad_norm": 0.011708388105034828, "learning_rate": 0.2953050368891902, "loss": 0.303, "num_input_tokens_seen": 2770272, "step": 3195 }, { "epoch": 1.5087223008015087, "grad_norm": 0.0027984611224383116, "learning_rate": 0.29529040352989805, "loss": 0.2004, "num_input_tokens_seen": 2774672, "step": 3200 }, { "epoch": 1.5087223008015087, "eval_loss": 0.21937692165374756, "eval_runtime": 21.8923, "eval_samples_per_second": 43.075, "eval_steps_per_second": 21.56, "num_input_tokens_seen": 2774672, "step": 3200 }, { "epoch": 1.511079679396511, "grad_norm": 0.0024444940499961376, "learning_rate": 0.29527574776499993, "loss": 0.2198, "num_input_tokens_seen": 2779232, "step": 3205 }, { "epoch": 1.5134370579915135, "grad_norm": 0.002786466386169195, "learning_rate": 0.2952610695967558, "loss": 0.2304, "num_input_tokens_seen": 2784320, "step": 3210 }, { "epoch": 1.515794436586516, "grad_norm": 0.003062780015170574, "learning_rate": 0.29524636902742935, "loss": 0.2211, "num_input_tokens_seen": 2788400, "step": 3215 }, { "epoch": 1.5181518151815183, "grad_norm": 0.002640972612425685, "learning_rate": 0.2952316460592875, "loss": 0.2342, "num_input_tokens_seen": 2792896, "step": 3220 }, { "epoch": 1.5205091937765205, "grad_norm": 0.0037808935157954693, "learning_rate": 0.29521690069460066, "loss": 0.2308, "num_input_tokens_seen": 2796640, "step": 3225 }, { "epoch": 1.522866572371523, "grad_norm": 0.005789860617369413, "learning_rate": 0.29520213293564285, "loss": 0.2109, "num_input_tokens_seen": 2800752, "step": 3230 }, { "epoch": 1.525223950966525, "grad_norm": 0.007786838803440332, "learning_rate": 0.29518734278469144, "loss": 0.2254, "num_input_tokens_seen": 2804464, "step": 3235 }, { "epoch": 1.5275813295615275, "grad_norm": 0.003705211915075779, "learning_rate": 0.29517253024402723, "loss": 0.2017, "num_input_tokens_seen": 2809008, "step": 3240 }, { "epoch": 1.5299387081565299, "grad_norm": 0.0026761337649077177, "learning_rate": 0.2951576953159345, "loss": 0.2272, "num_input_tokens_seen": 2813344, "step": 3245 }, { "epoch": 1.5322960867515323, "grad_norm": 0.0026641436852514744, "learning_rate": 0.29514283800270097, "loss": 0.2781, "num_input_tokens_seen": 2817520, "step": 3250 }, { "epoch": 1.5346534653465347, "grad_norm": 0.004338329657912254, "learning_rate": 0.2951279583066179, "loss": 0.2222, "num_input_tokens_seen": 2822160, "step": 3255 }, { "epoch": 1.537010843941537, "grad_norm": 0.004540656693279743, "learning_rate": 0.2951130562299798, "loss": 0.2358, "num_input_tokens_seen": 2826624, "step": 3260 }, { "epoch": 1.5393682225365395, "grad_norm": 0.0059455507434904575, "learning_rate": 0.29509813177508487, "loss": 0.2393, "num_input_tokens_seen": 2830448, "step": 3265 }, { "epoch": 1.541725601131542, "grad_norm": 0.0031284764409065247, "learning_rate": 0.2950831849442346, "loss": 0.2331, "num_input_tokens_seen": 2834192, "step": 3270 }, { "epoch": 1.544082979726544, "grad_norm": 0.00307730445638299, "learning_rate": 0.2950682157397339, "loss": 0.2348, "num_input_tokens_seen": 2838736, "step": 3275 }, { "epoch": 1.5464403583215465, "grad_norm": 0.0033250777050852776, "learning_rate": 0.2950532241638914, "loss": 0.2159, "num_input_tokens_seen": 2842896, "step": 3280 }, { "epoch": 1.5487977369165487, "grad_norm": 0.001959958579391241, "learning_rate": 0.2950382102190188, "loss": 0.2272, "num_input_tokens_seen": 2846496, "step": 3285 }, { "epoch": 1.551155115511551, "grad_norm": 0.0016896735178306699, "learning_rate": 0.2950231739074316, "loss": 0.226, "num_input_tokens_seen": 2851568, "step": 3290 }, { "epoch": 1.5535124941065535, "grad_norm": 0.003085876815021038, "learning_rate": 0.29500811523144843, "loss": 0.2137, "num_input_tokens_seen": 2855984, "step": 3295 }, { "epoch": 1.5558698727015559, "grad_norm": 0.003959935624152422, "learning_rate": 0.2949930341933917, "loss": 0.2393, "num_input_tokens_seen": 2860912, "step": 3300 }, { "epoch": 1.5582272512965583, "grad_norm": 0.003104042261838913, "learning_rate": 0.29497793079558693, "loss": 0.2169, "num_input_tokens_seen": 2865312, "step": 3305 }, { "epoch": 1.5605846298915607, "grad_norm": 0.0034342617727816105, "learning_rate": 0.2949628050403633, "loss": 0.231, "num_input_tokens_seen": 2869200, "step": 3310 }, { "epoch": 1.562942008486563, "grad_norm": 0.005453883204609156, "learning_rate": 0.2949476569300535, "loss": 0.2159, "num_input_tokens_seen": 2872640, "step": 3315 }, { "epoch": 1.5652993870815655, "grad_norm": 0.0025117069017142057, "learning_rate": 0.29493248646699344, "loss": 0.1731, "num_input_tokens_seen": 2876848, "step": 3320 }, { "epoch": 1.5676567656765676, "grad_norm": 0.0020055363420397043, "learning_rate": 0.29491729365352265, "loss": 0.2449, "num_input_tokens_seen": 2881904, "step": 3325 }, { "epoch": 1.57001414427157, "grad_norm": 0.0038363824132829905, "learning_rate": 0.29490207849198397, "loss": 0.2299, "num_input_tokens_seen": 2886192, "step": 3330 }, { "epoch": 1.5723715228665722, "grad_norm": 0.006472740322351456, "learning_rate": 0.29488684098472384, "loss": 0.2158, "num_input_tokens_seen": 2890928, "step": 3335 }, { "epoch": 1.5747289014615746, "grad_norm": 0.004765896126627922, "learning_rate": 0.2948715811340921, "loss": 0.182, "num_input_tokens_seen": 2895488, "step": 3340 }, { "epoch": 1.577086280056577, "grad_norm": 0.004921776708215475, "learning_rate": 0.294856298942442, "loss": 0.1941, "num_input_tokens_seen": 2899712, "step": 3345 }, { "epoch": 1.5794436586515794, "grad_norm": 0.00912700779736042, "learning_rate": 0.2948409944121302, "loss": 0.2422, "num_input_tokens_seen": 2903728, "step": 3350 }, { "epoch": 1.5818010372465818, "grad_norm": 0.002673919079825282, "learning_rate": 0.29482566754551687, "loss": 0.249, "num_input_tokens_seen": 2907792, "step": 3355 }, { "epoch": 1.5841584158415842, "grad_norm": 0.0032459553331136703, "learning_rate": 0.2948103183449656, "loss": 0.2284, "num_input_tokens_seen": 2912352, "step": 3360 }, { "epoch": 1.5865157944365866, "grad_norm": 0.006294066086411476, "learning_rate": 0.2947949468128435, "loss": 0.2191, "num_input_tokens_seen": 2916432, "step": 3365 }, { "epoch": 1.588873173031589, "grad_norm": 0.002189614111557603, "learning_rate": 0.2947795529515209, "loss": 0.23, "num_input_tokens_seen": 2920416, "step": 3370 }, { "epoch": 1.5912305516265912, "grad_norm": 0.007093979511409998, "learning_rate": 0.29476413676337193, "loss": 0.2505, "num_input_tokens_seen": 2924048, "step": 3375 }, { "epoch": 1.5935879302215936, "grad_norm": 0.002555058803409338, "learning_rate": 0.2947486982507738, "loss": 0.2413, "num_input_tokens_seen": 2927760, "step": 3380 }, { "epoch": 1.5959453088165958, "grad_norm": 0.00733768567442894, "learning_rate": 0.29473323741610735, "loss": 0.2432, "num_input_tokens_seen": 2932448, "step": 3385 }, { "epoch": 1.5983026874115982, "grad_norm": 0.005436216481029987, "learning_rate": 0.2947177542617569, "loss": 0.2511, "num_input_tokens_seen": 2936800, "step": 3390 }, { "epoch": 1.6006600660066006, "grad_norm": 0.0027357100043445826, "learning_rate": 0.2947022487901101, "loss": 0.2087, "num_input_tokens_seen": 2941136, "step": 3395 }, { "epoch": 1.603017444601603, "grad_norm": 0.002693792339414358, "learning_rate": 0.2946867210035581, "loss": 0.214, "num_input_tokens_seen": 2944896, "step": 3400 }, { "epoch": 1.603017444601603, "eval_loss": 0.21779467165470123, "eval_runtime": 21.9033, "eval_samples_per_second": 43.053, "eval_steps_per_second": 21.549, "num_input_tokens_seen": 2944896, "step": 3400 }, { "epoch": 1.6053748231966054, "grad_norm": 0.0027324312832206488, "learning_rate": 0.2946711709044954, "loss": 0.232, "num_input_tokens_seen": 2949328, "step": 3405 }, { "epoch": 1.6077322017916078, "grad_norm": 0.0024546149652451277, "learning_rate": 0.2946555984953202, "loss": 0.201, "num_input_tokens_seen": 2953440, "step": 3410 }, { "epoch": 1.6100895803866102, "grad_norm": 0.003707753261551261, "learning_rate": 0.2946400037784338, "loss": 0.2107, "num_input_tokens_seen": 2958048, "step": 3415 }, { "epoch": 1.6124469589816126, "grad_norm": 0.0018873336957767606, "learning_rate": 0.29462438675624114, "loss": 0.2459, "num_input_tokens_seen": 2962064, "step": 3420 }, { "epoch": 1.6148043375766148, "grad_norm": 0.0025487891398370266, "learning_rate": 0.2946087474311506, "loss": 0.2161, "num_input_tokens_seen": 2966208, "step": 3425 }, { "epoch": 1.6171617161716172, "grad_norm": 0.0032318630255758762, "learning_rate": 0.294593085805574, "loss": 0.2324, "num_input_tokens_seen": 2970720, "step": 3430 }, { "epoch": 1.6195190947666194, "grad_norm": 0.004803461022675037, "learning_rate": 0.2945774018819264, "loss": 0.2263, "num_input_tokens_seen": 2974704, "step": 3435 }, { "epoch": 1.6218764733616218, "grad_norm": 0.0028848457150161266, "learning_rate": 0.2945616956626266, "loss": 0.2225, "num_input_tokens_seen": 2978496, "step": 3440 }, { "epoch": 1.6242338519566242, "grad_norm": 0.002597562503069639, "learning_rate": 0.2945459671500966, "loss": 0.2044, "num_input_tokens_seen": 2983088, "step": 3445 }, { "epoch": 1.6265912305516266, "grad_norm": 0.003441445529460907, "learning_rate": 0.2945302163467621, "loss": 0.2585, "num_input_tokens_seen": 2987632, "step": 3450 }, { "epoch": 1.628948609146629, "grad_norm": 0.002385909203439951, "learning_rate": 0.2945144432550519, "loss": 0.1919, "num_input_tokens_seen": 2991520, "step": 3455 }, { "epoch": 1.6313059877416314, "grad_norm": 0.009133965708315372, "learning_rate": 0.29449864787739843, "loss": 0.2424, "num_input_tokens_seen": 2995888, "step": 3460 }, { "epoch": 1.6336633663366338, "grad_norm": 0.0037928381934762, "learning_rate": 0.2944828302162376, "loss": 0.2338, "num_input_tokens_seen": 2999952, "step": 3465 }, { "epoch": 1.6360207449316362, "grad_norm": 0.0028389464132487774, "learning_rate": 0.2944669902740087, "loss": 0.219, "num_input_tokens_seen": 3003440, "step": 3470 }, { "epoch": 1.6383781235266384, "grad_norm": 0.006033920217305422, "learning_rate": 0.2944511280531544, "loss": 0.2345, "num_input_tokens_seen": 3008240, "step": 3475 }, { "epoch": 1.6407355021216408, "grad_norm": 0.0031067850068211555, "learning_rate": 0.29443524355612083, "loss": 0.2276, "num_input_tokens_seen": 3012672, "step": 3480 }, { "epoch": 1.643092880716643, "grad_norm": 0.003312419867143035, "learning_rate": 0.29441933678535764, "loss": 0.2141, "num_input_tokens_seen": 3018048, "step": 3485 }, { "epoch": 1.6454502593116453, "grad_norm": 0.004910258110612631, "learning_rate": 0.29440340774331786, "loss": 0.2392, "num_input_tokens_seen": 3021616, "step": 3490 }, { "epoch": 1.6478076379066477, "grad_norm": 0.008694070391356945, "learning_rate": 0.2943874564324579, "loss": 0.2496, "num_input_tokens_seen": 3026240, "step": 3495 }, { "epoch": 1.6501650165016502, "grad_norm": 0.007466481067240238, "learning_rate": 0.2943714828552376, "loss": 0.22, "num_input_tokens_seen": 3030512, "step": 3500 }, { "epoch": 1.6525223950966526, "grad_norm": 0.006169352680444717, "learning_rate": 0.29435548701412045, "loss": 0.2322, "num_input_tokens_seen": 3034656, "step": 3505 }, { "epoch": 1.654879773691655, "grad_norm": 0.004392172209918499, "learning_rate": 0.2943394689115731, "loss": 0.201, "num_input_tokens_seen": 3038592, "step": 3510 }, { "epoch": 1.6572371522866574, "grad_norm": 0.0062384638004004955, "learning_rate": 0.29432342855006577, "loss": 0.2792, "num_input_tokens_seen": 3042576, "step": 3515 }, { "epoch": 1.6595945308816598, "grad_norm": 0.004291083198040724, "learning_rate": 0.294307365932072, "loss": 0.2427, "num_input_tokens_seen": 3046704, "step": 3520 }, { "epoch": 1.661951909476662, "grad_norm": 0.0059023890644311905, "learning_rate": 0.294291281060069, "loss": 0.2121, "num_input_tokens_seen": 3051648, "step": 3525 }, { "epoch": 1.6643092880716643, "grad_norm": 0.007602191064506769, "learning_rate": 0.29427517393653724, "loss": 0.2449, "num_input_tokens_seen": 3056384, "step": 3530 }, { "epoch": 1.6666666666666665, "grad_norm": 0.0028451127000153065, "learning_rate": 0.29425904456396046, "loss": 0.2153, "num_input_tokens_seen": 3060368, "step": 3535 }, { "epoch": 1.669024045261669, "grad_norm": 0.003465003566816449, "learning_rate": 0.2942428929448262, "loss": 0.2145, "num_input_tokens_seen": 3063968, "step": 3540 }, { "epoch": 1.6713814238566713, "grad_norm": 0.004281880799680948, "learning_rate": 0.2942267190816252, "loss": 0.1448, "num_input_tokens_seen": 3067984, "step": 3545 }, { "epoch": 1.6737388024516737, "grad_norm": 0.0018966725328937173, "learning_rate": 0.2942105229768516, "loss": 0.1877, "num_input_tokens_seen": 3072288, "step": 3550 }, { "epoch": 1.6760961810466761, "grad_norm": 0.0031444847118109465, "learning_rate": 0.29419430463300306, "loss": 0.17, "num_input_tokens_seen": 3076448, "step": 3555 }, { "epoch": 1.6784535596416785, "grad_norm": 0.0053614722564816475, "learning_rate": 0.2941780640525808, "loss": 0.2677, "num_input_tokens_seen": 3081072, "step": 3560 }, { "epoch": 1.680810938236681, "grad_norm": 0.002356649376451969, "learning_rate": 0.2941618012380891, "loss": 0.2071, "num_input_tokens_seen": 3086160, "step": 3565 }, { "epoch": 1.6831683168316833, "grad_norm": 0.004262336064130068, "learning_rate": 0.29414551619203605, "loss": 0.182, "num_input_tokens_seen": 3090608, "step": 3570 }, { "epoch": 1.6855256954266855, "grad_norm": 0.004276457708328962, "learning_rate": 0.29412920891693295, "loss": 0.1906, "num_input_tokens_seen": 3095504, "step": 3575 }, { "epoch": 1.687883074021688, "grad_norm": 0.0033935445826500654, "learning_rate": 0.2941128794152946, "loss": 0.2702, "num_input_tokens_seen": 3100336, "step": 3580 }, { "epoch": 1.69024045261669, "grad_norm": 0.002060750499367714, "learning_rate": 0.2940965276896392, "loss": 0.2611, "num_input_tokens_seen": 3104624, "step": 3585 }, { "epoch": 1.6925978312116925, "grad_norm": 0.00911793764680624, "learning_rate": 0.2940801537424884, "loss": 0.2432, "num_input_tokens_seen": 3109184, "step": 3590 }, { "epoch": 1.694955209806695, "grad_norm": 0.0062773083336651325, "learning_rate": 0.2940637575763673, "loss": 0.2471, "num_input_tokens_seen": 3113248, "step": 3595 }, { "epoch": 1.6973125884016973, "grad_norm": 0.0036801807582378387, "learning_rate": 0.2940473391938043, "loss": 0.2293, "num_input_tokens_seen": 3117216, "step": 3600 }, { "epoch": 1.6973125884016973, "eval_loss": 0.23388880491256714, "eval_runtime": 21.9072, "eval_samples_per_second": 43.045, "eval_steps_per_second": 21.545, "num_input_tokens_seen": 3117216, "step": 3600 }, { "epoch": 1.6996699669966997, "grad_norm": 0.003441160311922431, "learning_rate": 0.29403089859733145, "loss": 0.2278, "num_input_tokens_seen": 3122032, "step": 3605 }, { "epoch": 1.702027345591702, "grad_norm": 0.0024703308008611202, "learning_rate": 0.294014435789484, "loss": 0.2191, "num_input_tokens_seen": 3126112, "step": 3610 }, { "epoch": 1.7043847241867045, "grad_norm": 0.002759689697995782, "learning_rate": 0.2939979507728007, "loss": 0.1957, "num_input_tokens_seen": 3131088, "step": 3615 }, { "epoch": 1.706742102781707, "grad_norm": 0.002302541397511959, "learning_rate": 0.2939814435498239, "loss": 0.1843, "num_input_tokens_seen": 3135632, "step": 3620 }, { "epoch": 1.709099481376709, "grad_norm": 0.007768738549202681, "learning_rate": 0.29396491412309905, "loss": 0.3017, "num_input_tokens_seen": 3139712, "step": 3625 }, { "epoch": 1.7114568599717115, "grad_norm": 0.003679306013509631, "learning_rate": 0.2939483624951753, "loss": 0.2353, "num_input_tokens_seen": 3143680, "step": 3630 }, { "epoch": 1.7138142385667137, "grad_norm": 0.008586880750954151, "learning_rate": 0.2939317886686051, "loss": 0.2396, "num_input_tokens_seen": 3148416, "step": 3635 }, { "epoch": 1.716171617161716, "grad_norm": 0.00738940667361021, "learning_rate": 0.2939151926459443, "loss": 0.2404, "num_input_tokens_seen": 3152592, "step": 3640 }, { "epoch": 1.7185289957567185, "grad_norm": 0.0029339324682950974, "learning_rate": 0.2938985744297522, "loss": 0.2343, "num_input_tokens_seen": 3156816, "step": 3645 }, { "epoch": 1.7208863743517209, "grad_norm": 0.0028502715285867453, "learning_rate": 0.29388193402259166, "loss": 0.2268, "num_input_tokens_seen": 3160672, "step": 3650 }, { "epoch": 1.7232437529467233, "grad_norm": 0.0030473375227302313, "learning_rate": 0.29386527142702873, "loss": 0.2258, "num_input_tokens_seen": 3164176, "step": 3655 }, { "epoch": 1.7256011315417257, "grad_norm": 0.005669806618243456, "learning_rate": 0.293848586645633, "loss": 0.2657, "num_input_tokens_seen": 3169184, "step": 3660 }, { "epoch": 1.727958510136728, "grad_norm": 0.0027305183466523886, "learning_rate": 0.2938318796809775, "loss": 0.2651, "num_input_tokens_seen": 3173056, "step": 3665 }, { "epoch": 1.7303158887317305, "grad_norm": 0.0023854009341448545, "learning_rate": 0.29381515053563867, "loss": 0.2436, "num_input_tokens_seen": 3177904, "step": 3670 }, { "epoch": 1.7326732673267327, "grad_norm": 0.007409377954900265, "learning_rate": 0.29379839921219636, "loss": 0.2237, "num_input_tokens_seen": 3182448, "step": 3675 }, { "epoch": 1.735030645921735, "grad_norm": 0.003367022145539522, "learning_rate": 0.2937816257132338, "loss": 0.2486, "num_input_tokens_seen": 3187056, "step": 3680 }, { "epoch": 1.7373880245167372, "grad_norm": 0.005471038166433573, "learning_rate": 0.2937648300413376, "loss": 0.2217, "num_input_tokens_seen": 3191120, "step": 3685 }, { "epoch": 1.7397454031117396, "grad_norm": 0.002517069224268198, "learning_rate": 0.293748012199098, "loss": 0.2406, "num_input_tokens_seen": 3194992, "step": 3690 }, { "epoch": 1.742102781706742, "grad_norm": 0.00541115365922451, "learning_rate": 0.29373117218910844, "loss": 0.2079, "num_input_tokens_seen": 3199040, "step": 3695 }, { "epoch": 1.7444601603017444, "grad_norm": 0.005660151597112417, "learning_rate": 0.2937143100139659, "loss": 0.2484, "num_input_tokens_seen": 3203392, "step": 3700 }, { "epoch": 1.7468175388967468, "grad_norm": 0.002180088311433792, "learning_rate": 0.29369742567627083, "loss": 0.2297, "num_input_tokens_seen": 3207296, "step": 3705 }, { "epoch": 1.7491749174917492, "grad_norm": 0.0022596705239266157, "learning_rate": 0.29368051917862675, "loss": 0.2124, "num_input_tokens_seen": 3211696, "step": 3710 }, { "epoch": 1.7515322960867516, "grad_norm": 0.0023608021438121796, "learning_rate": 0.2936635905236411, "loss": 0.2204, "num_input_tokens_seen": 3215920, "step": 3715 }, { "epoch": 1.753889674681754, "grad_norm": 0.002080093137919903, "learning_rate": 0.2936466397139244, "loss": 0.2412, "num_input_tokens_seen": 3219680, "step": 3720 }, { "epoch": 1.7562470532767562, "grad_norm": 0.00564040569588542, "learning_rate": 0.2936296667520907, "loss": 0.2454, "num_input_tokens_seen": 3223440, "step": 3725 }, { "epoch": 1.7586044318717586, "grad_norm": 0.002177602844312787, "learning_rate": 0.2936126716407574, "loss": 0.2338, "num_input_tokens_seen": 3227680, "step": 3730 }, { "epoch": 1.7609618104667608, "grad_norm": 0.0021530482918024063, "learning_rate": 0.29359565438254537, "loss": 0.2395, "num_input_tokens_seen": 3231728, "step": 3735 }, { "epoch": 1.7633191890617632, "grad_norm": 0.004011159762740135, "learning_rate": 0.29357861498007887, "loss": 0.2233, "num_input_tokens_seen": 3236112, "step": 3740 }, { "epoch": 1.7656765676567656, "grad_norm": 0.003476114943623543, "learning_rate": 0.29356155343598567, "loss": 0.2307, "num_input_tokens_seen": 3240704, "step": 3745 }, { "epoch": 1.768033946251768, "grad_norm": 0.002762726740911603, "learning_rate": 0.2935444697528968, "loss": 0.2196, "num_input_tokens_seen": 3244608, "step": 3750 }, { "epoch": 1.7703913248467704, "grad_norm": 0.003950927406549454, "learning_rate": 0.2935273639334468, "loss": 0.2121, "num_input_tokens_seen": 3249104, "step": 3755 }, { "epoch": 1.7727487034417728, "grad_norm": 0.0017755895387381315, "learning_rate": 0.29351023598027365, "loss": 0.1752, "num_input_tokens_seen": 3252656, "step": 3760 }, { "epoch": 1.7751060820367752, "grad_norm": 0.003332328051328659, "learning_rate": 0.2934930858960186, "loss": 0.1603, "num_input_tokens_seen": 3256304, "step": 3765 }, { "epoch": 1.7774634606317776, "grad_norm": 0.008481809869408607, "learning_rate": 0.29347591368332643, "loss": 0.2919, "num_input_tokens_seen": 3261104, "step": 3770 }, { "epoch": 1.7798208392267798, "grad_norm": 0.0029833847656846046, "learning_rate": 0.2934587193448454, "loss": 0.2307, "num_input_tokens_seen": 3264896, "step": 3775 }, { "epoch": 1.7821782178217822, "grad_norm": 0.00415279995650053, "learning_rate": 0.29344150288322696, "loss": 0.2344, "num_input_tokens_seen": 3270368, "step": 3780 }, { "epoch": 1.7845355964167844, "grad_norm": 0.003057242138311267, "learning_rate": 0.2934242643011263, "loss": 0.2507, "num_input_tokens_seen": 3275200, "step": 3785 }, { "epoch": 1.7868929750117868, "grad_norm": 0.0028615714982151985, "learning_rate": 0.2934070036012016, "loss": 0.2273, "num_input_tokens_seen": 3279552, "step": 3790 }, { "epoch": 1.7892503536067892, "grad_norm": 0.005318064242601395, "learning_rate": 0.29338972078611475, "loss": 0.1883, "num_input_tokens_seen": 3283712, "step": 3795 }, { "epoch": 1.7916077322017916, "grad_norm": 0.0028827835340052843, "learning_rate": 0.2933724158585311, "loss": 0.2782, "num_input_tokens_seen": 3287952, "step": 3800 }, { "epoch": 1.7916077322017916, "eval_loss": 0.21904699504375458, "eval_runtime": 21.949, "eval_samples_per_second": 42.963, "eval_steps_per_second": 21.504, "num_input_tokens_seen": 3287952, "step": 3800 }, { "epoch": 1.793965110796794, "grad_norm": 0.0036394246853888035, "learning_rate": 0.29335508882111916, "loss": 0.2082, "num_input_tokens_seen": 3292448, "step": 3805 }, { "epoch": 1.7963224893917964, "grad_norm": 0.0035431573633104563, "learning_rate": 0.29333773967655097, "loss": 0.203, "num_input_tokens_seen": 3297120, "step": 3810 }, { "epoch": 1.7986798679867988, "grad_norm": 0.003945441916584969, "learning_rate": 0.2933203684275021, "loss": 0.2455, "num_input_tokens_seen": 3300880, "step": 3815 }, { "epoch": 1.8010372465818012, "grad_norm": 0.006601437926292419, "learning_rate": 0.2933029750766513, "loss": 0.2134, "num_input_tokens_seen": 3304864, "step": 3820 }, { "epoch": 1.8033946251768034, "grad_norm": 0.007759299594908953, "learning_rate": 0.2932855596266809, "loss": 0.2645, "num_input_tokens_seen": 3309168, "step": 3825 }, { "epoch": 1.8057520037718058, "grad_norm": 0.004086011089384556, "learning_rate": 0.2932681220802765, "loss": 0.2295, "num_input_tokens_seen": 3313808, "step": 3830 }, { "epoch": 1.808109382366808, "grad_norm": 0.006060452666133642, "learning_rate": 0.2932506624401274, "loss": 0.2379, "num_input_tokens_seen": 3318656, "step": 3835 }, { "epoch": 1.8104667609618104, "grad_norm": 0.0024422467686235905, "learning_rate": 0.29323318070892584, "loss": 0.2442, "num_input_tokens_seen": 3322656, "step": 3840 }, { "epoch": 1.8128241395568128, "grad_norm": 0.007358995731920004, "learning_rate": 0.29321567688936784, "loss": 0.2194, "num_input_tokens_seen": 3327024, "step": 3845 }, { "epoch": 1.8151815181518152, "grad_norm": 0.007445203140377998, "learning_rate": 0.29319815098415275, "loss": 0.1795, "num_input_tokens_seen": 3332512, "step": 3850 }, { "epoch": 1.8175388967468176, "grad_norm": 0.0037934784777462482, "learning_rate": 0.2931806029959832, "loss": 0.1654, "num_input_tokens_seen": 3336992, "step": 3855 }, { "epoch": 1.81989627534182, "grad_norm": 0.008467917330563068, "learning_rate": 0.29316303292756535, "loss": 0.2031, "num_input_tokens_seen": 3342160, "step": 3860 }, { "epoch": 1.8222536539368224, "grad_norm": 0.0024131883401423693, "learning_rate": 0.29314544078160876, "loss": 0.2826, "num_input_tokens_seen": 3346688, "step": 3865 }, { "epoch": 1.8246110325318248, "grad_norm": 0.00276275840587914, "learning_rate": 0.2931278265608263, "loss": 0.2029, "num_input_tokens_seen": 3350768, "step": 3870 }, { "epoch": 1.826968411126827, "grad_norm": 0.0057418146170675755, "learning_rate": 0.29311019026793433, "loss": 0.2209, "num_input_tokens_seen": 3355072, "step": 3875 }, { "epoch": 1.8293257897218294, "grad_norm": 0.0031369219068437815, "learning_rate": 0.29309253190565254, "loss": 0.2296, "num_input_tokens_seen": 3359664, "step": 3880 }, { "epoch": 1.8316831683168315, "grad_norm": 0.002749488689005375, "learning_rate": 0.2930748514767042, "loss": 0.229, "num_input_tokens_seen": 3363744, "step": 3885 }, { "epoch": 1.834040546911834, "grad_norm": 0.002402936341241002, "learning_rate": 0.29305714898381574, "loss": 0.2326, "num_input_tokens_seen": 3368464, "step": 3890 }, { "epoch": 1.8363979255068363, "grad_norm": 0.0023259713780134916, "learning_rate": 0.29303942442971714, "loss": 0.2365, "num_input_tokens_seen": 3373520, "step": 3895 }, { "epoch": 1.8387553041018387, "grad_norm": 0.005345901940017939, "learning_rate": 0.2930216778171417, "loss": 0.2337, "num_input_tokens_seen": 3377808, "step": 3900 }, { "epoch": 1.8411126826968411, "grad_norm": 0.004513942636549473, "learning_rate": 0.2930039091488263, "loss": 0.1906, "num_input_tokens_seen": 3382080, "step": 3905 }, { "epoch": 1.8434700612918435, "grad_norm": 0.004008403513580561, "learning_rate": 0.29298611842751093, "loss": 0.2193, "num_input_tokens_seen": 3386736, "step": 3910 }, { "epoch": 1.845827439886846, "grad_norm": 0.009298855438828468, "learning_rate": 0.29296830565593923, "loss": 0.1996, "num_input_tokens_seen": 3391184, "step": 3915 }, { "epoch": 1.8481848184818483, "grad_norm": 0.0063628763891756535, "learning_rate": 0.2929504708368582, "loss": 0.2709, "num_input_tokens_seen": 3395328, "step": 3920 }, { "epoch": 1.8505421970768505, "grad_norm": 0.008808179758489132, "learning_rate": 0.29293261397301806, "loss": 0.2453, "num_input_tokens_seen": 3400192, "step": 3925 }, { "epoch": 1.852899575671853, "grad_norm": 0.009236971847712994, "learning_rate": 0.29291473506717275, "loss": 0.23, "num_input_tokens_seen": 3404544, "step": 3930 }, { "epoch": 1.855256954266855, "grad_norm": 0.0059799086302518845, "learning_rate": 0.29289683412207923, "loss": 0.2382, "num_input_tokens_seen": 3408912, "step": 3935 }, { "epoch": 1.8576143328618575, "grad_norm": 0.006135224364697933, "learning_rate": 0.29287891114049813, "loss": 0.2203, "num_input_tokens_seen": 3413008, "step": 3940 }, { "epoch": 1.85997171145686, "grad_norm": 0.005332720931619406, "learning_rate": 0.29286096612519347, "loss": 0.2154, "num_input_tokens_seen": 3417056, "step": 3945 }, { "epoch": 1.8623290900518623, "grad_norm": 0.009877987205982208, "learning_rate": 0.2928429990789325, "loss": 0.2509, "num_input_tokens_seen": 3421744, "step": 3950 }, { "epoch": 1.8646864686468647, "grad_norm": 0.0034489184617996216, "learning_rate": 0.29282501000448596, "loss": 0.1824, "num_input_tokens_seen": 3425696, "step": 3955 }, { "epoch": 1.8670438472418671, "grad_norm": 0.00753172067925334, "learning_rate": 0.2928069989046281, "loss": 0.209, "num_input_tokens_seen": 3431088, "step": 3960 }, { "epoch": 1.8694012258368695, "grad_norm": 0.006011512130498886, "learning_rate": 0.2927889657821363, "loss": 0.2106, "num_input_tokens_seen": 3435872, "step": 3965 }, { "epoch": 1.871758604431872, "grad_norm": 0.004222434479743242, "learning_rate": 0.2927709106397916, "loss": 0.2594, "num_input_tokens_seen": 3440016, "step": 3970 }, { "epoch": 1.874115983026874, "grad_norm": 0.005142661277204752, "learning_rate": 0.29275283348037834, "loss": 0.2112, "num_input_tokens_seen": 3444096, "step": 3975 }, { "epoch": 1.8764733616218765, "grad_norm": 0.0034148667473345995, "learning_rate": 0.29273473430668423, "loss": 0.2224, "num_input_tokens_seen": 3448448, "step": 3980 }, { "epoch": 1.8788307402168787, "grad_norm": 0.00545239495113492, "learning_rate": 0.2927166131215003, "loss": 0.2295, "num_input_tokens_seen": 3452320, "step": 3985 }, { "epoch": 1.881188118811881, "grad_norm": 0.005119252949953079, "learning_rate": 0.2926984699276212, "loss": 0.2469, "num_input_tokens_seen": 3456576, "step": 3990 }, { "epoch": 1.8835454974068835, "grad_norm": 0.010816463269293308, "learning_rate": 0.29268030472784473, "loss": 0.226, "num_input_tokens_seen": 3460288, "step": 3995 }, { "epoch": 1.8859028760018859, "grad_norm": 0.003116845153272152, "learning_rate": 0.2926621175249723, "loss": 0.1944, "num_input_tokens_seen": 3464640, "step": 4000 }, { "epoch": 1.8859028760018859, "eval_loss": 0.22011378407478333, "eval_runtime": 21.8911, "eval_samples_per_second": 43.077, "eval_steps_per_second": 21.561, "num_input_tokens_seen": 3464640, "step": 4000 }, { "epoch": 1.8882602545968883, "grad_norm": 0.0037346116732805967, "learning_rate": 0.29264390832180853, "loss": 0.2565, "num_input_tokens_seen": 3468608, "step": 4005 }, { "epoch": 1.8906176331918907, "grad_norm": 0.0036061585415154696, "learning_rate": 0.29262567712116144, "loss": 0.2024, "num_input_tokens_seen": 3472704, "step": 4010 }, { "epoch": 1.892975011786893, "grad_norm": 0.004128908738493919, "learning_rate": 0.29260742392584266, "loss": 0.2105, "num_input_tokens_seen": 3477024, "step": 4015 }, { "epoch": 1.8953323903818955, "grad_norm": 0.003922776784747839, "learning_rate": 0.292589148738667, "loss": 0.191, "num_input_tokens_seen": 3481328, "step": 4020 }, { "epoch": 1.8976897689768977, "grad_norm": 0.0026220264844596386, "learning_rate": 0.2925708515624527, "loss": 0.2685, "num_input_tokens_seen": 3485440, "step": 4025 }, { "epoch": 1.9000471475719, "grad_norm": 0.010648313909769058, "learning_rate": 0.29255253240002144, "loss": 0.2179, "num_input_tokens_seen": 3490576, "step": 4030 }, { "epoch": 1.9024045261669023, "grad_norm": 0.0029913729522377253, "learning_rate": 0.2925341912541983, "loss": 0.2254, "num_input_tokens_seen": 3495568, "step": 4035 }, { "epoch": 1.9047619047619047, "grad_norm": 0.003488389542326331, "learning_rate": 0.2925158281278116, "loss": 0.2206, "num_input_tokens_seen": 3499280, "step": 4040 }, { "epoch": 1.907119283356907, "grad_norm": 0.0034548789262771606, "learning_rate": 0.29249744302369324, "loss": 0.2283, "num_input_tokens_seen": 3504048, "step": 4045 }, { "epoch": 1.9094766619519095, "grad_norm": 0.019813381135463715, "learning_rate": 0.29247903594467844, "loss": 0.2328, "num_input_tokens_seen": 3508000, "step": 4050 }, { "epoch": 1.9118340405469119, "grad_norm": 0.010443145409226418, "learning_rate": 0.2924606068936058, "loss": 0.2234, "num_input_tokens_seen": 3511904, "step": 4055 }, { "epoch": 1.9141914191419143, "grad_norm": 0.018534457311034203, "learning_rate": 0.2924421558733173, "loss": 0.2508, "num_input_tokens_seen": 3516736, "step": 4060 }, { "epoch": 1.9165487977369167, "grad_norm": 0.022313378751277924, "learning_rate": 0.2924236828866583, "loss": 0.2132, "num_input_tokens_seen": 3520800, "step": 4065 }, { "epoch": 1.918906176331919, "grad_norm": 0.009341590106487274, "learning_rate": 0.29240518793647763, "loss": 0.1472, "num_input_tokens_seen": 3524784, "step": 4070 }, { "epoch": 1.9212635549269212, "grad_norm": 0.01987406425178051, "learning_rate": 0.29238667102562743, "loss": 0.3822, "num_input_tokens_seen": 3528992, "step": 4075 }, { "epoch": 1.9236209335219236, "grad_norm": 0.016693130135536194, "learning_rate": 0.29236813215696317, "loss": 0.2562, "num_input_tokens_seen": 3533344, "step": 4080 }, { "epoch": 1.9259783121169258, "grad_norm": 0.01725536212325096, "learning_rate": 0.2923495713333439, "loss": 0.2533, "num_input_tokens_seen": 3538112, "step": 4085 }, { "epoch": 1.9283356907119282, "grad_norm": 0.009405343793332577, "learning_rate": 0.29233098855763173, "loss": 0.2534, "num_input_tokens_seen": 3542608, "step": 4090 }, { "epoch": 1.9306930693069306, "grad_norm": 0.010560731403529644, "learning_rate": 0.29231238383269254, "loss": 0.2213, "num_input_tokens_seen": 3547344, "step": 4095 }, { "epoch": 1.933050447901933, "grad_norm": 0.03204645216464996, "learning_rate": 0.2922937571613954, "loss": 0.3757, "num_input_tokens_seen": 3551376, "step": 4100 }, { "epoch": 1.9354078264969354, "grad_norm": 0.017623309046030045, "learning_rate": 0.29227510854661265, "loss": 0.2976, "num_input_tokens_seen": 3555744, "step": 4105 }, { "epoch": 1.9377652050919378, "grad_norm": 0.008795729838311672, "learning_rate": 0.29225643799122025, "loss": 0.256, "num_input_tokens_seen": 3560256, "step": 4110 }, { "epoch": 1.9401225836869402, "grad_norm": 0.012652077712118626, "learning_rate": 0.2922377454980974, "loss": 0.273, "num_input_tokens_seen": 3564096, "step": 4115 }, { "epoch": 1.9424799622819426, "grad_norm": 0.01688888669013977, "learning_rate": 0.29221903107012676, "loss": 0.2421, "num_input_tokens_seen": 3567888, "step": 4120 }, { "epoch": 1.9448373408769448, "grad_norm": 1.7488819360733032, "learning_rate": 0.29220029471019426, "loss": 0.3146, "num_input_tokens_seen": 3571776, "step": 4125 }, { "epoch": 1.9471947194719472, "grad_norm": 0.021230120211839676, "learning_rate": 0.2921815364211893, "loss": 0.2769, "num_input_tokens_seen": 3576528, "step": 4130 }, { "epoch": 1.9495520980669494, "grad_norm": 0.32730531692504883, "learning_rate": 0.29216275620600474, "loss": 0.4779, "num_input_tokens_seen": 3580352, "step": 4135 }, { "epoch": 1.9519094766619518, "grad_norm": 0.05166056007146835, "learning_rate": 0.29214395406753657, "loss": 0.5019, "num_input_tokens_seen": 3586272, "step": 4140 }, { "epoch": 1.9542668552569542, "grad_norm": 0.1959736943244934, "learning_rate": 0.2921251300086844, "loss": 0.3479, "num_input_tokens_seen": 3590144, "step": 4145 }, { "epoch": 1.9566242338519566, "grad_norm": 0.03841518610715866, "learning_rate": 0.2921062840323511, "loss": 0.2291, "num_input_tokens_seen": 3594656, "step": 4150 }, { "epoch": 1.958981612446959, "grad_norm": 0.013489766977727413, "learning_rate": 0.29208741614144307, "loss": 0.3711, "num_input_tokens_seen": 3599536, "step": 4155 }, { "epoch": 1.9613389910419614, "grad_norm": 0.01739419996738434, "learning_rate": 0.2920685263388698, "loss": 0.3241, "num_input_tokens_seen": 3605136, "step": 4160 }, { "epoch": 1.9636963696369638, "grad_norm": 0.007645237725228071, "learning_rate": 0.2920496146275445, "loss": 0.2107, "num_input_tokens_seen": 3609728, "step": 4165 }, { "epoch": 1.9660537482319662, "grad_norm": 0.015126136131584644, "learning_rate": 0.29203068101038343, "loss": 0.2291, "num_input_tokens_seen": 3614320, "step": 4170 }, { "epoch": 1.9684111268269684, "grad_norm": 0.021407537162303925, "learning_rate": 0.2920117254903065, "loss": 0.3333, "num_input_tokens_seen": 3618400, "step": 4175 }, { "epoch": 1.9707685054219708, "grad_norm": 0.0060995579697191715, "learning_rate": 0.29199274807023695, "loss": 0.2546, "num_input_tokens_seen": 3622080, "step": 4180 }, { "epoch": 1.973125884016973, "grad_norm": 0.025952722877264023, "learning_rate": 0.29197374875310117, "loss": 0.3482, "num_input_tokens_seen": 3626656, "step": 4185 }, { "epoch": 1.9754832626119754, "grad_norm": 0.009526766836643219, "learning_rate": 0.2919547275418292, "loss": 0.2332, "num_input_tokens_seen": 3630912, "step": 4190 }, { "epoch": 1.9778406412069778, "grad_norm": 0.00838377233594656, "learning_rate": 0.29193568443935436, "loss": 0.2395, "num_input_tokens_seen": 3634992, "step": 4195 }, { "epoch": 1.9801980198019802, "grad_norm": 0.004952629096806049, "learning_rate": 0.2919166194486133, "loss": 0.2094, "num_input_tokens_seen": 3638880, "step": 4200 }, { "epoch": 1.9801980198019802, "eval_loss": 0.22351698577404022, "eval_runtime": 21.8933, "eval_samples_per_second": 43.072, "eval_steps_per_second": 21.559, "num_input_tokens_seen": 3638880, "step": 4200 }, { "epoch": 1.9825553983969826, "grad_norm": 0.008834758773446083, "learning_rate": 0.2918975325725461, "loss": 0.2112, "num_input_tokens_seen": 3643232, "step": 4205 }, { "epoch": 1.984912776991985, "grad_norm": 0.008356036618351936, "learning_rate": 0.29187842381409607, "loss": 0.2477, "num_input_tokens_seen": 3647728, "step": 4210 }, { "epoch": 1.9872701555869874, "grad_norm": 0.0049659935757517815, "learning_rate": 0.29185929317621023, "loss": 0.2318, "num_input_tokens_seen": 3651248, "step": 4215 }, { "epoch": 1.9896275341819898, "grad_norm": 0.004459875635802746, "learning_rate": 0.29184014066183867, "loss": 0.2309, "num_input_tokens_seen": 3655712, "step": 4220 }, { "epoch": 1.991984912776992, "grad_norm": 0.09021452069282532, "learning_rate": 0.2918209662739349, "loss": 0.3733, "num_input_tokens_seen": 3660336, "step": 4225 }, { "epoch": 1.9943422913719944, "grad_norm": 0.019175348803400993, "learning_rate": 0.29180177001545593, "loss": 0.2613, "num_input_tokens_seen": 3664880, "step": 4230 }, { "epoch": 1.9966996699669965, "grad_norm": 0.03967911750078201, "learning_rate": 0.29178255188936203, "loss": 0.2643, "num_input_tokens_seen": 3669216, "step": 4235 }, { "epoch": 1.999057048561999, "grad_norm": 0.008160686120390892, "learning_rate": 0.2917633118986169, "loss": 0.199, "num_input_tokens_seen": 3673840, "step": 4240 }, { "epoch": 2.0014144271570014, "grad_norm": 0.0055540394969284534, "learning_rate": 0.2917440500461875, "loss": 0.2959, "num_input_tokens_seen": 3678416, "step": 4245 }, { "epoch": 2.0037718057520038, "grad_norm": 0.005497124511748552, "learning_rate": 0.29172476633504435, "loss": 0.2011, "num_input_tokens_seen": 3682656, "step": 4250 }, { "epoch": 2.006129184347006, "grad_norm": 0.004424527287483215, "learning_rate": 0.2917054607681612, "loss": 0.2217, "num_input_tokens_seen": 3687472, "step": 4255 }, { "epoch": 2.0084865629420086, "grad_norm": 0.012912609614431858, "learning_rate": 0.29168613334851523, "loss": 0.2363, "num_input_tokens_seen": 3691168, "step": 4260 }, { "epoch": 2.010843941537011, "grad_norm": 0.007412537932395935, "learning_rate": 0.2916667840790869, "loss": 0.218, "num_input_tokens_seen": 3695312, "step": 4265 }, { "epoch": 2.0132013201320134, "grad_norm": 0.005817023571580648, "learning_rate": 0.2916474129628603, "loss": 0.163, "num_input_tokens_seen": 3699744, "step": 4270 }, { "epoch": 2.0155586987270158, "grad_norm": 0.00935419648885727, "learning_rate": 0.29162802000282245, "loss": 0.3135, "num_input_tokens_seen": 3704416, "step": 4275 }, { "epoch": 2.0179160773220177, "grad_norm": 0.00395997567102313, "learning_rate": 0.2916086052019642, "loss": 0.2077, "num_input_tokens_seen": 3709344, "step": 4280 }, { "epoch": 2.02027345591702, "grad_norm": 0.0050944071263074875, "learning_rate": 0.2915891685632794, "loss": 0.242, "num_input_tokens_seen": 3714368, "step": 4285 }, { "epoch": 2.0226308345120225, "grad_norm": 0.00501962611451745, "learning_rate": 0.29156971008976545, "loss": 0.2324, "num_input_tokens_seen": 3718688, "step": 4290 }, { "epoch": 2.024988213107025, "grad_norm": 0.004900871776044369, "learning_rate": 0.2915502297844232, "loss": 0.229, "num_input_tokens_seen": 3723120, "step": 4295 }, { "epoch": 2.0273455917020273, "grad_norm": 0.0026056463830173016, "learning_rate": 0.2915307276502566, "loss": 0.22, "num_input_tokens_seen": 3727312, "step": 4300 }, { "epoch": 2.0297029702970297, "grad_norm": 0.0133593138307333, "learning_rate": 0.29151120369027334, "loss": 0.2473, "num_input_tokens_seen": 3731696, "step": 4305 }, { "epoch": 2.032060348892032, "grad_norm": 0.002827686257660389, "learning_rate": 0.29149165790748405, "loss": 0.236, "num_input_tokens_seen": 3735968, "step": 4310 }, { "epoch": 2.0344177274870345, "grad_norm": 0.004360114224255085, "learning_rate": 0.291472090304903, "loss": 0.235, "num_input_tokens_seen": 3740256, "step": 4315 }, { "epoch": 2.036775106082037, "grad_norm": 0.010880178771913052, "learning_rate": 0.2914525008855478, "loss": 0.2443, "num_input_tokens_seen": 3744752, "step": 4320 }, { "epoch": 2.0391324846770393, "grad_norm": 0.004991465248167515, "learning_rate": 0.2914328896524394, "loss": 0.2475, "num_input_tokens_seen": 3749248, "step": 4325 }, { "epoch": 2.0414898632720413, "grad_norm": 0.003895543748512864, "learning_rate": 0.291413256608602, "loss": 0.1126, "num_input_tokens_seen": 3753440, "step": 4330 }, { "epoch": 2.0438472418670437, "grad_norm": 0.0074448008090257645, "learning_rate": 0.29139360175706336, "loss": 0.3136, "num_input_tokens_seen": 3758272, "step": 4335 }, { "epoch": 2.046204620462046, "grad_norm": 0.007494518533349037, "learning_rate": 0.2913739251008544, "loss": 0.1993, "num_input_tokens_seen": 3762128, "step": 4340 }, { "epoch": 2.0485619990570485, "grad_norm": 0.00447418075054884, "learning_rate": 0.29135422664300964, "loss": 0.2308, "num_input_tokens_seen": 3766432, "step": 4345 }, { "epoch": 2.050919377652051, "grad_norm": 0.006357372272759676, "learning_rate": 0.29133450638656677, "loss": 0.2466, "num_input_tokens_seen": 3770768, "step": 4350 }, { "epoch": 2.0532767562470533, "grad_norm": 0.006542697548866272, "learning_rate": 0.2913147643345669, "loss": 0.2448, "num_input_tokens_seen": 3774752, "step": 4355 }, { "epoch": 2.0556341348420557, "grad_norm": 0.004987667314708233, "learning_rate": 0.29129500049005447, "loss": 0.2046, "num_input_tokens_seen": 3778576, "step": 4360 }, { "epoch": 2.057991513437058, "grad_norm": 0.004037009086459875, "learning_rate": 0.2912752148560773, "loss": 0.2197, "num_input_tokens_seen": 3782384, "step": 4365 }, { "epoch": 2.0603488920320605, "grad_norm": 0.0024866056628525257, "learning_rate": 0.2912554074356866, "loss": 0.2868, "num_input_tokens_seen": 3786512, "step": 4370 }, { "epoch": 2.062706270627063, "grad_norm": 0.0036324032116681337, "learning_rate": 0.2912355782319371, "loss": 0.2465, "num_input_tokens_seen": 3790416, "step": 4375 }, { "epoch": 2.065063649222065, "grad_norm": 0.00657393503934145, "learning_rate": 0.2912157272478864, "loss": 0.2717, "num_input_tokens_seen": 3794992, "step": 4380 }, { "epoch": 2.0674210278170673, "grad_norm": 0.003984222188591957, "learning_rate": 0.291195854486596, "loss": 0.2342, "num_input_tokens_seen": 3799376, "step": 4385 }, { "epoch": 2.0697784064120697, "grad_norm": 0.002631403971463442, "learning_rate": 0.2911759599511305, "loss": 0.2178, "num_input_tokens_seen": 3803824, "step": 4390 }, { "epoch": 2.072135785007072, "grad_norm": 0.00347145670093596, "learning_rate": 0.29115604364455777, "loss": 0.1021, "num_input_tokens_seen": 3808576, "step": 4395 }, { "epoch": 2.0744931636020745, "grad_norm": 0.006253664381802082, "learning_rate": 0.2911361055699493, "loss": 0.3115, "num_input_tokens_seen": 3812624, "step": 4400 }, { "epoch": 2.0744931636020745, "eval_loss": 0.25132954120635986, "eval_runtime": 21.9168, "eval_samples_per_second": 43.026, "eval_steps_per_second": 21.536, "num_input_tokens_seen": 3812624, "step": 4400 }, { "epoch": 2.076850542197077, "grad_norm": 0.0021109282970428467, "learning_rate": 0.2911161457303797, "loss": 0.2398, "num_input_tokens_seen": 3816576, "step": 4405 }, { "epoch": 2.0792079207920793, "grad_norm": 0.0023716092109680176, "learning_rate": 0.291096164128927, "loss": 0.2462, "num_input_tokens_seen": 3821328, "step": 4410 }, { "epoch": 2.0815652993870817, "grad_norm": 0.005214781034737825, "learning_rate": 0.2910761607686727, "loss": 0.2291, "num_input_tokens_seen": 3825488, "step": 4415 }, { "epoch": 2.083922677982084, "grad_norm": 0.001994062913581729, "learning_rate": 0.2910561356527016, "loss": 0.2421, "num_input_tokens_seen": 3829712, "step": 4420 }, { "epoch": 2.0862800565770865, "grad_norm": 0.008380950428545475, "learning_rate": 0.2910360887841017, "loss": 0.2618, "num_input_tokens_seen": 3833936, "step": 4425 }, { "epoch": 2.0886374351720884, "grad_norm": 0.003959075082093477, "learning_rate": 0.2910160201659645, "loss": 0.2076, "num_input_tokens_seen": 3838816, "step": 4430 }, { "epoch": 2.090994813767091, "grad_norm": 0.0017179262358695269, "learning_rate": 0.29099592980138494, "loss": 0.1584, "num_input_tokens_seen": 3843136, "step": 4435 }, { "epoch": 2.0933521923620932, "grad_norm": 0.0018219833727926016, "learning_rate": 0.29097581769346115, "loss": 0.2341, "num_input_tokens_seen": 3848144, "step": 4440 }, { "epoch": 2.0957095709570956, "grad_norm": 0.003199170809239149, "learning_rate": 0.29095568384529463, "loss": 0.271, "num_input_tokens_seen": 3853584, "step": 4445 }, { "epoch": 2.098066949552098, "grad_norm": 0.0019119078060612082, "learning_rate": 0.2909355282599903, "loss": 0.2257, "num_input_tokens_seen": 3857808, "step": 4450 }, { "epoch": 2.1004243281471005, "grad_norm": 0.0033691106364130974, "learning_rate": 0.29091535094065635, "loss": 0.2283, "num_input_tokens_seen": 3861840, "step": 4455 }, { "epoch": 2.102781706742103, "grad_norm": 0.0015494617400690913, "learning_rate": 0.2908951518904045, "loss": 0.198, "num_input_tokens_seen": 3866128, "step": 4460 }, { "epoch": 2.1051390853371053, "grad_norm": 0.0022173866163939238, "learning_rate": 0.29087493111234963, "loss": 0.1705, "num_input_tokens_seen": 3870816, "step": 4465 }, { "epoch": 2.1074964639321077, "grad_norm": 0.010074043646454811, "learning_rate": 0.29085468860961, "loss": 0.2587, "num_input_tokens_seen": 3875600, "step": 4470 }, { "epoch": 2.10985384252711, "grad_norm": 0.0018980744061991572, "learning_rate": 0.2908344243853073, "loss": 0.2334, "num_input_tokens_seen": 3879552, "step": 4475 }, { "epoch": 2.112211221122112, "grad_norm": 0.002582897897809744, "learning_rate": 0.2908141384425666, "loss": 0.1987, "num_input_tokens_seen": 3883808, "step": 4480 }, { "epoch": 2.1145685997171144, "grad_norm": 0.0039053717628121376, "learning_rate": 0.2907938307845161, "loss": 0.2096, "num_input_tokens_seen": 3888752, "step": 4485 }, { "epoch": 2.116925978312117, "grad_norm": 0.003342817071825266, "learning_rate": 0.2907735014142876, "loss": 0.25, "num_input_tokens_seen": 3893376, "step": 4490 }, { "epoch": 2.119283356907119, "grad_norm": 0.0026601029094308615, "learning_rate": 0.2907531503350161, "loss": 0.2259, "num_input_tokens_seen": 3896800, "step": 4495 }, { "epoch": 2.1216407355021216, "grad_norm": 0.00332530215382576, "learning_rate": 0.29073277754983995, "loss": 0.2275, "num_input_tokens_seen": 3900832, "step": 4500 }, { "epoch": 2.123998114097124, "grad_norm": 0.0022627972066402435, "learning_rate": 0.290712383061901, "loss": 0.238, "num_input_tokens_seen": 3904880, "step": 4505 }, { "epoch": 2.1263554926921264, "grad_norm": 0.0037707414012402296, "learning_rate": 0.2906919668743443, "loss": 0.2182, "num_input_tokens_seen": 3909488, "step": 4510 }, { "epoch": 2.128712871287129, "grad_norm": 0.009261597879230976, "learning_rate": 0.29067152899031823, "loss": 0.2514, "num_input_tokens_seen": 3913824, "step": 4515 }, { "epoch": 2.1310702498821312, "grad_norm": 0.008174052461981773, "learning_rate": 0.2906510694129746, "loss": 0.2616, "num_input_tokens_seen": 3917872, "step": 4520 }, { "epoch": 2.1334276284771336, "grad_norm": 0.004461281467229128, "learning_rate": 0.2906305881454685, "loss": 0.2378, "num_input_tokens_seen": 3922384, "step": 4525 }, { "epoch": 2.1357850070721356, "grad_norm": 0.0032474680338054895, "learning_rate": 0.2906100851909585, "loss": 0.2401, "num_input_tokens_seen": 3927216, "step": 4530 }, { "epoch": 2.138142385667138, "grad_norm": 0.0038526973221451044, "learning_rate": 0.29058956055260626, "loss": 0.2268, "num_input_tokens_seen": 3931584, "step": 4535 }, { "epoch": 2.1404997642621404, "grad_norm": 0.010077448561787605, "learning_rate": 0.2905690142335771, "loss": 0.2424, "num_input_tokens_seen": 3936224, "step": 4540 }, { "epoch": 2.142857142857143, "grad_norm": 0.007212420925498009, "learning_rate": 0.29054844623703946, "loss": 0.2253, "num_input_tokens_seen": 3940880, "step": 4545 }, { "epoch": 2.145214521452145, "grad_norm": 0.004083470907062292, "learning_rate": 0.2905278565661651, "loss": 0.2448, "num_input_tokens_seen": 3945136, "step": 4550 }, { "epoch": 2.1475719000471476, "grad_norm": 0.002446418860927224, "learning_rate": 0.2905072452241293, "loss": 0.2267, "num_input_tokens_seen": 3948992, "step": 4555 }, { "epoch": 2.14992927864215, "grad_norm": 0.006347887217998505, "learning_rate": 0.2904866122141106, "loss": 0.2588, "num_input_tokens_seen": 3953200, "step": 4560 }, { "epoch": 2.1522866572371524, "grad_norm": 0.00412141066044569, "learning_rate": 0.2904659575392908, "loss": 0.2444, "num_input_tokens_seen": 3957136, "step": 4565 }, { "epoch": 2.154644035832155, "grad_norm": 0.004330959636718035, "learning_rate": 0.2904452812028551, "loss": 0.2367, "num_input_tokens_seen": 3961232, "step": 4570 }, { "epoch": 2.157001414427157, "grad_norm": 0.00738293444737792, "learning_rate": 0.2904245832079922, "loss": 0.236, "num_input_tokens_seen": 3964784, "step": 4575 }, { "epoch": 2.159358793022159, "grad_norm": 0.008906913921236992, "learning_rate": 0.29040386355789377, "loss": 0.2667, "num_input_tokens_seen": 3969648, "step": 4580 }, { "epoch": 2.1617161716171616, "grad_norm": 0.00525430915877223, "learning_rate": 0.29038312225575524, "loss": 0.2167, "num_input_tokens_seen": 3973680, "step": 4585 }, { "epoch": 2.164073550212164, "grad_norm": 0.004818788729608059, "learning_rate": 0.29036235930477505, "loss": 0.1991, "num_input_tokens_seen": 3977840, "step": 4590 }, { "epoch": 2.1664309288071664, "grad_norm": 0.0020086332224309444, "learning_rate": 0.29034157470815514, "loss": 0.2443, "num_input_tokens_seen": 3981920, "step": 4595 }, { "epoch": 2.1687883074021688, "grad_norm": 0.004461600910872221, "learning_rate": 0.2903207684691008, "loss": 0.2463, "num_input_tokens_seen": 3986544, "step": 4600 }, { "epoch": 2.1687883074021688, "eval_loss": 0.25094202160835266, "eval_runtime": 21.9135, "eval_samples_per_second": 43.033, "eval_steps_per_second": 21.539, "num_input_tokens_seen": 3986544, "step": 4600 }, { "epoch": 2.171145685997171, "grad_norm": 0.00898516084998846, "learning_rate": 0.29029994059082054, "loss": 0.2276, "num_input_tokens_seen": 3990896, "step": 4605 }, { "epoch": 2.1735030645921736, "grad_norm": 0.004172341898083687, "learning_rate": 0.2902790910765264, "loss": 0.3738, "num_input_tokens_seen": 3995296, "step": 4610 }, { "epoch": 2.175860443187176, "grad_norm": 0.0023360135965049267, "learning_rate": 0.29025821992943346, "loss": 0.2422, "num_input_tokens_seen": 3999728, "step": 4615 }, { "epoch": 2.1782178217821784, "grad_norm": 0.004342793952673674, "learning_rate": 0.29023732715276046, "loss": 0.2131, "num_input_tokens_seen": 4004160, "step": 4620 }, { "epoch": 2.1805752003771808, "grad_norm": 0.0015650418354198337, "learning_rate": 0.2902164127497293, "loss": 0.2393, "num_input_tokens_seen": 4008928, "step": 4625 }, { "epoch": 2.1829325789721827, "grad_norm": 0.004077928606420755, "learning_rate": 0.2901954767235652, "loss": 0.2254, "num_input_tokens_seen": 4012928, "step": 4630 }, { "epoch": 2.185289957567185, "grad_norm": 0.005993286147713661, "learning_rate": 0.2901745190774968, "loss": 0.2167, "num_input_tokens_seen": 4017376, "step": 4635 }, { "epoch": 2.1876473361621875, "grad_norm": 0.0023205699399113655, "learning_rate": 0.290153539814756, "loss": 0.2384, "num_input_tokens_seen": 4021904, "step": 4640 }, { "epoch": 2.19000471475719, "grad_norm": 0.0025256965309381485, "learning_rate": 0.2901325389385781, "loss": 0.2021, "num_input_tokens_seen": 4026368, "step": 4645 }, { "epoch": 2.1923620933521923, "grad_norm": 0.0029006185941398144, "learning_rate": 0.2901115164522016, "loss": 0.2535, "num_input_tokens_seen": 4030464, "step": 4650 }, { "epoch": 2.1947194719471947, "grad_norm": 0.0019522409420460463, "learning_rate": 0.29009047235886865, "loss": 0.2154, "num_input_tokens_seen": 4034560, "step": 4655 }, { "epoch": 2.197076850542197, "grad_norm": 0.0028362476732581854, "learning_rate": 0.2900694066618243, "loss": 0.2273, "num_input_tokens_seen": 4038432, "step": 4660 }, { "epoch": 2.1994342291371995, "grad_norm": 0.0031916063744574785, "learning_rate": 0.2900483193643172, "loss": 0.2179, "num_input_tokens_seen": 4042432, "step": 4665 }, { "epoch": 2.201791607732202, "grad_norm": 0.0029954013880342245, "learning_rate": 0.29002721046959934, "loss": 0.2076, "num_input_tokens_seen": 4046416, "step": 4670 }, { "epoch": 2.2041489863272044, "grad_norm": 0.010346957482397556, "learning_rate": 0.29000607998092587, "loss": 0.287, "num_input_tokens_seen": 4050208, "step": 4675 }, { "epoch": 2.2065063649222063, "grad_norm": 0.0069283414632081985, "learning_rate": 0.2899849279015555, "loss": 0.2286, "num_input_tokens_seen": 4054928, "step": 4680 }, { "epoch": 2.2088637435172087, "grad_norm": 0.004666629247367382, "learning_rate": 0.28996375423475007, "loss": 0.2268, "num_input_tokens_seen": 4059360, "step": 4685 }, { "epoch": 2.211221122112211, "grad_norm": 0.002812155056744814, "learning_rate": 0.28994255898377486, "loss": 0.2029, "num_input_tokens_seen": 4063424, "step": 4690 }, { "epoch": 2.2135785007072135, "grad_norm": 0.0015963328769430518, "learning_rate": 0.2899213421518984, "loss": 0.225, "num_input_tokens_seen": 4067248, "step": 4695 }, { "epoch": 2.215935879302216, "grad_norm": 0.003561423858627677, "learning_rate": 0.2899001037423926, "loss": 0.2338, "num_input_tokens_seen": 4070992, "step": 4700 }, { "epoch": 2.2182932578972183, "grad_norm": 0.005524960346519947, "learning_rate": 0.28987884375853273, "loss": 0.2118, "num_input_tokens_seen": 4075712, "step": 4705 }, { "epoch": 2.2206506364922207, "grad_norm": 0.0031697568483650684, "learning_rate": 0.2898575622035974, "loss": 0.2407, "num_input_tokens_seen": 4079744, "step": 4710 }, { "epoch": 2.223008015087223, "grad_norm": 0.002284466754645109, "learning_rate": 0.2898362590808683, "loss": 0.2387, "num_input_tokens_seen": 4083776, "step": 4715 }, { "epoch": 2.2253653936822255, "grad_norm": 0.004113307222723961, "learning_rate": 0.2898149343936308, "loss": 0.2354, "num_input_tokens_seen": 4087232, "step": 4720 }, { "epoch": 2.227722772277228, "grad_norm": 0.014603461138904095, "learning_rate": 0.2897935881451734, "loss": 0.2069, "num_input_tokens_seen": 4091840, "step": 4725 }, { "epoch": 2.23008015087223, "grad_norm": 0.004420225974172354, "learning_rate": 0.28977222033878797, "loss": 0.2364, "num_input_tokens_seen": 4096176, "step": 4730 }, { "epoch": 2.2324375294672323, "grad_norm": 0.0038562866393476725, "learning_rate": 0.28975083097776966, "loss": 0.2378, "num_input_tokens_seen": 4100080, "step": 4735 }, { "epoch": 2.2347949080622347, "grad_norm": 0.0030011695344001055, "learning_rate": 0.28972942006541696, "loss": 0.2129, "num_input_tokens_seen": 4106304, "step": 4740 }, { "epoch": 2.237152286657237, "grad_norm": 0.003008002880960703, "learning_rate": 0.2897079876050318, "loss": 0.2591, "num_input_tokens_seen": 4111664, "step": 4745 }, { "epoch": 2.2395096652522395, "grad_norm": 0.002006371971219778, "learning_rate": 0.2896865335999192, "loss": 0.2336, "num_input_tokens_seen": 4115936, "step": 4750 }, { "epoch": 2.241867043847242, "grad_norm": 0.00842322688549757, "learning_rate": 0.28966505805338777, "loss": 0.2465, "num_input_tokens_seen": 4120320, "step": 4755 }, { "epoch": 2.2442244224422443, "grad_norm": 0.0027929209172725677, "learning_rate": 0.2896435609687492, "loss": 0.2396, "num_input_tokens_seen": 4124928, "step": 4760 }, { "epoch": 2.2465818010372467, "grad_norm": 0.0038449065759778023, "learning_rate": 0.2896220423493187, "loss": 0.2222, "num_input_tokens_seen": 4128672, "step": 4765 }, { "epoch": 2.248939179632249, "grad_norm": 0.004670136142522097, "learning_rate": 0.28960050219841466, "loss": 0.2256, "num_input_tokens_seen": 4132848, "step": 4770 }, { "epoch": 2.251296558227251, "grad_norm": 0.00642580958083272, "learning_rate": 0.28957894051935884, "loss": 0.2578, "num_input_tokens_seen": 4136416, "step": 4775 }, { "epoch": 2.2536539368222535, "grad_norm": 0.0026334747672080994, "learning_rate": 0.2895573573154764, "loss": 0.1895, "num_input_tokens_seen": 4141040, "step": 4780 }, { "epoch": 2.256011315417256, "grad_norm": 0.00287412921898067, "learning_rate": 0.28953575259009556, "loss": 0.2193, "num_input_tokens_seen": 4145600, "step": 4785 }, { "epoch": 2.2583686940122583, "grad_norm": 0.004092961084097624, "learning_rate": 0.2895141263465482, "loss": 0.1857, "num_input_tokens_seen": 4150336, "step": 4790 }, { "epoch": 2.2607260726072607, "grad_norm": 0.0011726474622264504, "learning_rate": 0.28949247858816934, "loss": 0.2378, "num_input_tokens_seen": 4154192, "step": 4795 }, { "epoch": 2.263083451202263, "grad_norm": 0.004114753101021051, "learning_rate": 0.2894708093182973, "loss": 0.1836, "num_input_tokens_seen": 4158272, "step": 4800 }, { "epoch": 2.263083451202263, "eval_loss": 0.22281309962272644, "eval_runtime": 21.8831, "eval_samples_per_second": 43.093, "eval_steps_per_second": 21.569, "num_input_tokens_seen": 4158272, "step": 4800 }, { "epoch": 2.2654408297972655, "grad_norm": 0.0053550382144749165, "learning_rate": 0.2894491185402737, "loss": 0.2881, "num_input_tokens_seen": 4162816, "step": 4805 }, { "epoch": 2.267798208392268, "grad_norm": 0.002786647994071245, "learning_rate": 0.2894274062574437, "loss": 0.2427, "num_input_tokens_seen": 4166432, "step": 4810 }, { "epoch": 2.2701555869872703, "grad_norm": 0.0018372753402218223, "learning_rate": 0.2894056724731554, "loss": 0.2897, "num_input_tokens_seen": 4170416, "step": 4815 }, { "epoch": 2.2725129655822727, "grad_norm": 0.002965539461001754, "learning_rate": 0.28938391719076056, "loss": 0.2495, "num_input_tokens_seen": 4174528, "step": 4820 }, { "epoch": 2.274870344177275, "grad_norm": 0.0033832949120551348, "learning_rate": 0.28936214041361413, "loss": 0.2152, "num_input_tokens_seen": 4178944, "step": 4825 }, { "epoch": 2.2772277227722775, "grad_norm": 0.001343653304502368, "learning_rate": 0.2893403421450743, "loss": 0.165, "num_input_tokens_seen": 4183344, "step": 4830 }, { "epoch": 2.2795851013672794, "grad_norm": 0.01073263306170702, "learning_rate": 0.2893185223885026, "loss": 0.2894, "num_input_tokens_seen": 4187376, "step": 4835 }, { "epoch": 2.281942479962282, "grad_norm": 0.005695690400898457, "learning_rate": 0.289296681147264, "loss": 0.2753, "num_input_tokens_seen": 4191680, "step": 4840 }, { "epoch": 2.2842998585572842, "grad_norm": 0.003689198289066553, "learning_rate": 0.28927481842472663, "loss": 0.2471, "num_input_tokens_seen": 4196080, "step": 4845 }, { "epoch": 2.2866572371522866, "grad_norm": 0.00356759550049901, "learning_rate": 0.28925293422426207, "loss": 0.2367, "num_input_tokens_seen": 4200720, "step": 4850 }, { "epoch": 2.289014615747289, "grad_norm": 0.0024006229359656572, "learning_rate": 0.28923102854924504, "loss": 0.2357, "num_input_tokens_seen": 4205120, "step": 4855 }, { "epoch": 2.2913719943422914, "grad_norm": 0.0019002433400601149, "learning_rate": 0.2892091014030537, "loss": 0.268, "num_input_tokens_seen": 4208320, "step": 4860 }, { "epoch": 2.293729372937294, "grad_norm": 0.002187137957662344, "learning_rate": 0.2891871527890696, "loss": 0.2109, "num_input_tokens_seen": 4212976, "step": 4865 }, { "epoch": 2.2960867515322962, "grad_norm": 0.00423853937536478, "learning_rate": 0.2891651827106773, "loss": 0.1861, "num_input_tokens_seen": 4218096, "step": 4870 }, { "epoch": 2.298444130127298, "grad_norm": 0.0038500367663800716, "learning_rate": 0.2891431911712651, "loss": 0.225, "num_input_tokens_seen": 4222176, "step": 4875 }, { "epoch": 2.3008015087223006, "grad_norm": 0.008027452975511551, "learning_rate": 0.2891211781742241, "loss": 0.2495, "num_input_tokens_seen": 4226064, "step": 4880 }, { "epoch": 2.303158887317303, "grad_norm": 0.002122135367244482, "learning_rate": 0.2890991437229492, "loss": 0.2412, "num_input_tokens_seen": 4229744, "step": 4885 }, { "epoch": 2.3055162659123054, "grad_norm": 0.0014693469274789095, "learning_rate": 0.2890770878208383, "loss": 0.2333, "num_input_tokens_seen": 4233744, "step": 4890 }, { "epoch": 2.307873644507308, "grad_norm": 0.0016319864662364125, "learning_rate": 0.28905501047129273, "loss": 0.228, "num_input_tokens_seen": 4238336, "step": 4895 }, { "epoch": 2.31023102310231, "grad_norm": 0.0023906566202640533, "learning_rate": 0.289032911677717, "loss": 0.1994, "num_input_tokens_seen": 4242176, "step": 4900 }, { "epoch": 2.3125884016973126, "grad_norm": 0.0014183217426761985, "learning_rate": 0.28901079144351915, "loss": 0.2208, "num_input_tokens_seen": 4246832, "step": 4905 }, { "epoch": 2.314945780292315, "grad_norm": 0.003341552335768938, "learning_rate": 0.2889886497721103, "loss": 0.2184, "num_input_tokens_seen": 4251424, "step": 4910 }, { "epoch": 2.3173031588873174, "grad_norm": 0.0015684900572523475, "learning_rate": 0.28896648666690505, "loss": 0.2582, "num_input_tokens_seen": 4256128, "step": 4915 }, { "epoch": 2.31966053748232, "grad_norm": 0.003913819789886475, "learning_rate": 0.2889443021313212, "loss": 0.2243, "num_input_tokens_seen": 4260384, "step": 4920 }, { "epoch": 2.322017916077322, "grad_norm": 0.0025833460967987776, "learning_rate": 0.28892209616877984, "loss": 0.2178, "num_input_tokens_seen": 4264480, "step": 4925 }, { "epoch": 2.3243752946723246, "grad_norm": 0.0018669584533199668, "learning_rate": 0.28889986878270546, "loss": 0.217, "num_input_tokens_seen": 4267776, "step": 4930 }, { "epoch": 2.3267326732673266, "grad_norm": 0.0012349698226898909, "learning_rate": 0.28887761997652583, "loss": 0.2002, "num_input_tokens_seen": 4271856, "step": 4935 }, { "epoch": 2.329090051862329, "grad_norm": 0.005057424772530794, "learning_rate": 0.2888553497536719, "loss": 0.2814, "num_input_tokens_seen": 4275568, "step": 4940 }, { "epoch": 2.3314474304573314, "grad_norm": 0.0019691118504852057, "learning_rate": 0.2888330581175781, "loss": 0.2148, "num_input_tokens_seen": 4279632, "step": 4945 }, { "epoch": 2.333804809052334, "grad_norm": 0.0019750020001083612, "learning_rate": 0.28881074507168203, "loss": 0.2271, "num_input_tokens_seen": 4284016, "step": 4950 }, { "epoch": 2.336162187647336, "grad_norm": 0.0015469721984118223, "learning_rate": 0.2887884106194247, "loss": 0.2383, "num_input_tokens_seen": 4288032, "step": 4955 }, { "epoch": 2.3385195662423386, "grad_norm": 0.0036088062915951014, "learning_rate": 0.28876605476425027, "loss": 0.2417, "num_input_tokens_seen": 4292496, "step": 4960 }, { "epoch": 2.340876944837341, "grad_norm": 0.0014500333927571774, "learning_rate": 0.2887436775096064, "loss": 0.2406, "num_input_tokens_seen": 4296752, "step": 4965 }, { "epoch": 2.3432343234323434, "grad_norm": 0.0021647480316460133, "learning_rate": 0.2887212788589439, "loss": 0.2279, "num_input_tokens_seen": 4302048, "step": 4970 }, { "epoch": 2.3455917020273453, "grad_norm": 0.004856151062995195, "learning_rate": 0.2886988588157169, "loss": 0.2267, "num_input_tokens_seen": 4307088, "step": 4975 }, { "epoch": 2.3479490806223478, "grad_norm": 0.0021772270556539297, "learning_rate": 0.28867641738338284, "loss": 0.2154, "num_input_tokens_seen": 4311184, "step": 4980 }, { "epoch": 2.35030645921735, "grad_norm": 0.0025714533403515816, "learning_rate": 0.2886539545654026, "loss": 0.2352, "num_input_tokens_seen": 4315104, "step": 4985 }, { "epoch": 2.3526638378123526, "grad_norm": 0.002195181557908654, "learning_rate": 0.28863147036524006, "loss": 0.2169, "num_input_tokens_seen": 4319424, "step": 4990 }, { "epoch": 2.355021216407355, "grad_norm": 0.0022226301953196526, "learning_rate": 0.2886089647863626, "loss": 0.2252, "num_input_tokens_seen": 4323712, "step": 4995 }, { "epoch": 2.3573785950023574, "grad_norm": 0.0020217387937009335, "learning_rate": 0.288586437832241, "loss": 0.2252, "num_input_tokens_seen": 4328240, "step": 5000 }, { "epoch": 2.3573785950023574, "eval_loss": 0.22035084664821625, "eval_runtime": 21.9674, "eval_samples_per_second": 42.927, "eval_steps_per_second": 21.486, "num_input_tokens_seen": 4328240, "step": 5000 }, { "epoch": 2.3597359735973598, "grad_norm": 0.005042655859142542, "learning_rate": 0.28856388950634904, "loss": 0.2226, "num_input_tokens_seen": 4331856, "step": 5005 }, { "epoch": 2.362093352192362, "grad_norm": 0.002474566688761115, "learning_rate": 0.288541319812164, "loss": 0.233, "num_input_tokens_seen": 4336288, "step": 5010 }, { "epoch": 2.3644507307873646, "grad_norm": 0.0017734490102156997, "learning_rate": 0.2885187287531665, "loss": 0.218, "num_input_tokens_seen": 4340016, "step": 5015 }, { "epoch": 2.366808109382367, "grad_norm": 0.00244091241620481, "learning_rate": 0.2884961163328402, "loss": 0.2516, "num_input_tokens_seen": 4345280, "step": 5020 }, { "epoch": 2.3691654879773694, "grad_norm": 0.009196504950523376, "learning_rate": 0.28847348255467237, "loss": 0.2308, "num_input_tokens_seen": 4350496, "step": 5025 }, { "epoch": 2.3715228665723718, "grad_norm": 0.00429969048127532, "learning_rate": 0.28845082742215333, "loss": 0.1995, "num_input_tokens_seen": 4355024, "step": 5030 }, { "epoch": 2.3738802451673737, "grad_norm": 0.003474870231002569, "learning_rate": 0.2884281509387769, "loss": 0.2311, "num_input_tokens_seen": 4359136, "step": 5035 }, { "epoch": 2.376237623762376, "grad_norm": 0.0012972319964319468, "learning_rate": 0.2884054531080399, "loss": 0.2012, "num_input_tokens_seen": 4363264, "step": 5040 }, { "epoch": 2.3785950023573785, "grad_norm": 0.0036207414232194424, "learning_rate": 0.28838273393344277, "loss": 0.2261, "num_input_tokens_seen": 4367840, "step": 5045 }, { "epoch": 2.380952380952381, "grad_norm": 0.01373889297246933, "learning_rate": 0.288359993418489, "loss": 0.2305, "num_input_tokens_seen": 4372848, "step": 5050 }, { "epoch": 2.3833097595473833, "grad_norm": 0.0014804807724431157, "learning_rate": 0.28833723156668556, "loss": 0.2357, "num_input_tokens_seen": 4376624, "step": 5055 }, { "epoch": 2.3856671381423857, "grad_norm": 0.0035263265017420053, "learning_rate": 0.2883144483815425, "loss": 0.2485, "num_input_tokens_seen": 4380944, "step": 5060 }, { "epoch": 2.388024516737388, "grad_norm": 0.005051702260971069, "learning_rate": 0.28829164386657335, "loss": 0.2377, "num_input_tokens_seen": 4384880, "step": 5065 }, { "epoch": 2.3903818953323905, "grad_norm": 0.008408577181398869, "learning_rate": 0.28826881802529486, "loss": 0.2289, "num_input_tokens_seen": 4389184, "step": 5070 }, { "epoch": 2.3927392739273925, "grad_norm": 0.0017322440398856997, "learning_rate": 0.28824597086122705, "loss": 0.2552, "num_input_tokens_seen": 4393424, "step": 5075 }, { "epoch": 2.395096652522395, "grad_norm": 0.008912693709135056, "learning_rate": 0.28822310237789317, "loss": 0.246, "num_input_tokens_seen": 4397680, "step": 5080 }, { "epoch": 2.3974540311173973, "grad_norm": 0.005066584795713425, "learning_rate": 0.2882002125788199, "loss": 0.2224, "num_input_tokens_seen": 4402288, "step": 5085 }, { "epoch": 2.3998114097123997, "grad_norm": 0.009886041283607483, "learning_rate": 0.2881773014675371, "loss": 0.3468, "num_input_tokens_seen": 4406528, "step": 5090 }, { "epoch": 2.402168788307402, "grad_norm": 0.0060807447880506516, "learning_rate": 0.288154369047578, "loss": 0.2512, "num_input_tokens_seen": 4411392, "step": 5095 }, { "epoch": 2.4045261669024045, "grad_norm": 0.006759710144251585, "learning_rate": 0.28813141532247905, "loss": 0.2544, "num_input_tokens_seen": 4415600, "step": 5100 }, { "epoch": 2.406883545497407, "grad_norm": 0.0015812201891094446, "learning_rate": 0.28810844029578, "loss": 0.1868, "num_input_tokens_seen": 4420176, "step": 5105 }, { "epoch": 2.4092409240924093, "grad_norm": 0.002313104458153248, "learning_rate": 0.2880854439710238, "loss": 0.2231, "num_input_tokens_seen": 4423968, "step": 5110 }, { "epoch": 2.4115983026874117, "grad_norm": 0.00309342285618186, "learning_rate": 0.28806242635175694, "loss": 0.287, "num_input_tokens_seen": 4429840, "step": 5115 }, { "epoch": 2.413955681282414, "grad_norm": 0.0055124773643910885, "learning_rate": 0.2880393874415289, "loss": 0.2546, "num_input_tokens_seen": 4434592, "step": 5120 }, { "epoch": 2.4163130598774165, "grad_norm": 0.008148107677698135, "learning_rate": 0.2880163272438926, "loss": 0.2653, "num_input_tokens_seen": 4439632, "step": 5125 }, { "epoch": 2.418670438472419, "grad_norm": 0.004936034791171551, "learning_rate": 0.2879932457624042, "loss": 0.2439, "num_input_tokens_seen": 4444176, "step": 5130 }, { "epoch": 2.421027817067421, "grad_norm": 0.0019003687193617225, "learning_rate": 0.2879701430006232, "loss": 0.2445, "num_input_tokens_seen": 4448640, "step": 5135 }, { "epoch": 2.4233851956624233, "grad_norm": 0.006011665798723698, "learning_rate": 0.28794701896211233, "loss": 0.2702, "num_input_tokens_seen": 4453456, "step": 5140 }, { "epoch": 2.4257425742574257, "grad_norm": 0.0029711551032960415, "learning_rate": 0.28792387365043753, "loss": 0.2383, "num_input_tokens_seen": 4456736, "step": 5145 }, { "epoch": 2.428099952852428, "grad_norm": 0.003661549184471369, "learning_rate": 0.28790070706916815, "loss": 0.2408, "num_input_tokens_seen": 4462368, "step": 5150 }, { "epoch": 2.4304573314474305, "grad_norm": 0.003339531598612666, "learning_rate": 0.2878775192218768, "loss": 0.2297, "num_input_tokens_seen": 4468560, "step": 5155 }, { "epoch": 2.432814710042433, "grad_norm": 0.005055107641965151, "learning_rate": 0.2878543101121393, "loss": 0.2261, "num_input_tokens_seen": 4473184, "step": 5160 }, { "epoch": 2.4351720886374353, "grad_norm": 0.00307185179553926, "learning_rate": 0.28783107974353483, "loss": 0.2413, "num_input_tokens_seen": 4477344, "step": 5165 }, { "epoch": 2.4375294672324377, "grad_norm": 0.0029498999938368797, "learning_rate": 0.2878078281196457, "loss": 0.2132, "num_input_tokens_seen": 4481904, "step": 5170 }, { "epoch": 2.4398868458274396, "grad_norm": 0.001580321229994297, "learning_rate": 0.28778455524405777, "loss": 0.2763, "num_input_tokens_seen": 4485648, "step": 5175 }, { "epoch": 2.442244224422442, "grad_norm": 0.0066320146434009075, "learning_rate": 0.2877612611203598, "loss": 0.2596, "num_input_tokens_seen": 4489376, "step": 5180 }, { "epoch": 2.4446016030174444, "grad_norm": 0.0028738700784742832, "learning_rate": 0.28773794575214423, "loss": 0.2316, "num_input_tokens_seen": 4493872, "step": 5185 }, { "epoch": 2.446958981612447, "grad_norm": 0.0021227935794740915, "learning_rate": 0.28771460914300645, "loss": 0.2201, "num_input_tokens_seen": 4498016, "step": 5190 }, { "epoch": 2.4493163602074493, "grad_norm": 0.0033159791491925716, "learning_rate": 0.2876912512965454, "loss": 0.2363, "num_input_tokens_seen": 4503504, "step": 5195 }, { "epoch": 2.4516737388024517, "grad_norm": 0.004337094724178314, "learning_rate": 0.287667872216363, "loss": 0.2139, "num_input_tokens_seen": 4507760, "step": 5200 }, { "epoch": 2.4516737388024517, "eval_loss": 0.21937020123004913, "eval_runtime": 21.9146, "eval_samples_per_second": 43.031, "eval_steps_per_second": 21.538, "num_input_tokens_seen": 4507760, "step": 5200 }, { "epoch": 2.454031117397454, "grad_norm": 0.0028338846750557423, "learning_rate": 0.2876444719060647, "loss": 0.2255, "num_input_tokens_seen": 4512336, "step": 5205 }, { "epoch": 2.4563884959924565, "grad_norm": 0.0054929559119045734, "learning_rate": 0.287621050369259, "loss": 0.2433, "num_input_tokens_seen": 4515888, "step": 5210 }, { "epoch": 2.458745874587459, "grad_norm": 0.002135999035090208, "learning_rate": 0.28759760760955794, "loss": 0.2303, "num_input_tokens_seen": 4520320, "step": 5215 }, { "epoch": 2.4611032531824613, "grad_norm": 0.00253095431253314, "learning_rate": 0.2875741436305766, "loss": 0.2405, "num_input_tokens_seen": 4524544, "step": 5220 }, { "epoch": 2.4634606317774637, "grad_norm": 0.0063372207805514336, "learning_rate": 0.28755065843593347, "loss": 0.233, "num_input_tokens_seen": 4528464, "step": 5225 }, { "epoch": 2.465818010372466, "grad_norm": 0.005203817505389452, "learning_rate": 0.2875271520292502, "loss": 0.2161, "num_input_tokens_seen": 4532512, "step": 5230 }, { "epoch": 2.468175388967468, "grad_norm": 0.0031608010176569223, "learning_rate": 0.28750362441415184, "loss": 0.2236, "num_input_tokens_seen": 4536992, "step": 5235 }, { "epoch": 2.4705327675624704, "grad_norm": 0.004605791065841913, "learning_rate": 0.28748007559426664, "loss": 0.2237, "num_input_tokens_seen": 4542336, "step": 5240 }, { "epoch": 2.472890146157473, "grad_norm": 0.005680263042449951, "learning_rate": 0.2874565055732261, "loss": 0.2598, "num_input_tokens_seen": 4547808, "step": 5245 }, { "epoch": 2.4752475247524752, "grad_norm": 0.003358143847435713, "learning_rate": 0.28743291435466495, "loss": 0.219, "num_input_tokens_seen": 4552768, "step": 5250 }, { "epoch": 2.4776049033474776, "grad_norm": 0.0024696127511560917, "learning_rate": 0.2874093019422214, "loss": 0.2323, "num_input_tokens_seen": 4557072, "step": 5255 }, { "epoch": 2.47996228194248, "grad_norm": 0.004188489634543657, "learning_rate": 0.28738566833953666, "loss": 0.2473, "num_input_tokens_seen": 4561488, "step": 5260 }, { "epoch": 2.4823196605374824, "grad_norm": 0.002688789274543524, "learning_rate": 0.28736201355025537, "loss": 0.2241, "num_input_tokens_seen": 4565952, "step": 5265 }, { "epoch": 2.484677039132485, "grad_norm": 0.00432366831228137, "learning_rate": 0.28733833757802535, "loss": 0.2061, "num_input_tokens_seen": 4570944, "step": 5270 }, { "epoch": 2.487034417727487, "grad_norm": 0.002762431278824806, "learning_rate": 0.28731464042649785, "loss": 0.2399, "num_input_tokens_seen": 4575936, "step": 5275 }, { "epoch": 2.489391796322489, "grad_norm": 0.0021695769391953945, "learning_rate": 0.2872909220993271, "loss": 0.2282, "num_input_tokens_seen": 4580560, "step": 5280 }, { "epoch": 2.4917491749174916, "grad_norm": 0.002247544936835766, "learning_rate": 0.287267182600171, "loss": 0.1939, "num_input_tokens_seen": 4584800, "step": 5285 }, { "epoch": 2.494106553512494, "grad_norm": 0.0033613366540521383, "learning_rate": 0.2872434219326902, "loss": 0.2252, "num_input_tokens_seen": 4590368, "step": 5290 }, { "epoch": 2.4964639321074964, "grad_norm": 0.0024813413619995117, "learning_rate": 0.28721964010054907, "loss": 0.2655, "num_input_tokens_seen": 4594480, "step": 5295 }, { "epoch": 2.498821310702499, "grad_norm": 0.0018892339430749416, "learning_rate": 0.28719583710741503, "loss": 0.225, "num_input_tokens_seen": 4598880, "step": 5300 }, { "epoch": 2.501178689297501, "grad_norm": 0.002346500987187028, "learning_rate": 0.28717201295695877, "loss": 0.2216, "num_input_tokens_seen": 4602960, "step": 5305 }, { "epoch": 2.5035360678925036, "grad_norm": 0.001800031866878271, "learning_rate": 0.28714816765285434, "loss": 0.2214, "num_input_tokens_seen": 4607296, "step": 5310 }, { "epoch": 2.505893446487506, "grad_norm": 0.0015908876666799188, "learning_rate": 0.28712430119877896, "loss": 0.2157, "num_input_tokens_seen": 4611872, "step": 5315 }, { "epoch": 2.5082508250825084, "grad_norm": 0.0014735175063833594, "learning_rate": 0.28710041359841304, "loss": 0.2142, "num_input_tokens_seen": 4615568, "step": 5320 }, { "epoch": 2.510608203677511, "grad_norm": 0.0029877829365432262, "learning_rate": 0.28707650485544056, "loss": 0.1762, "num_input_tokens_seen": 4619520, "step": 5325 }, { "epoch": 2.512965582272513, "grad_norm": 0.00255657359957695, "learning_rate": 0.28705257497354836, "loss": 0.2291, "num_input_tokens_seen": 4623232, "step": 5330 }, { "epoch": 2.515322960867515, "grad_norm": 0.003087384393438697, "learning_rate": 0.28702862395642675, "loss": 0.2588, "num_input_tokens_seen": 4627312, "step": 5335 }, { "epoch": 2.5176803394625176, "grad_norm": 0.0013465832453221083, "learning_rate": 0.28700465180776935, "loss": 0.2035, "num_input_tokens_seen": 4631552, "step": 5340 }, { "epoch": 2.52003771805752, "grad_norm": 0.0020368569530546665, "learning_rate": 0.2869806585312729, "loss": 0.2362, "num_input_tokens_seen": 4635568, "step": 5345 }, { "epoch": 2.5223950966525224, "grad_norm": 0.004941504914313555, "learning_rate": 0.28695664413063754, "loss": 0.2234, "num_input_tokens_seen": 4639968, "step": 5350 }, { "epoch": 2.5247524752475248, "grad_norm": 0.004805352073162794, "learning_rate": 0.28693260860956654, "loss": 0.228, "num_input_tokens_seen": 4643536, "step": 5355 }, { "epoch": 2.527109853842527, "grad_norm": 0.004428005311638117, "learning_rate": 0.2869085519717665, "loss": 0.2771, "num_input_tokens_seen": 4648256, "step": 5360 }, { "epoch": 2.5294672324375296, "grad_norm": 0.0065118190832436085, "learning_rate": 0.28688447422094726, "loss": 0.2483, "num_input_tokens_seen": 4651712, "step": 5365 }, { "epoch": 2.531824611032532, "grad_norm": 0.0033065923489630222, "learning_rate": 0.2868603753608219, "loss": 0.2619, "num_input_tokens_seen": 4656128, "step": 5370 }, { "epoch": 2.534181989627534, "grad_norm": 0.00488028209656477, "learning_rate": 0.28683625539510665, "loss": 0.2399, "num_input_tokens_seen": 4660080, "step": 5375 }, { "epoch": 2.5365393682225363, "grad_norm": 0.002418127143755555, "learning_rate": 0.28681211432752135, "loss": 0.2249, "num_input_tokens_seen": 4664800, "step": 5380 }, { "epoch": 2.5388967468175387, "grad_norm": 0.002541935071349144, "learning_rate": 0.2867879521617887, "loss": 0.2387, "num_input_tokens_seen": 4669040, "step": 5385 }, { "epoch": 2.541254125412541, "grad_norm": 0.0015006556641310453, "learning_rate": 0.28676376890163485, "loss": 0.2397, "num_input_tokens_seen": 4673712, "step": 5390 }, { "epoch": 2.5436115040075435, "grad_norm": 0.0017031584866344929, "learning_rate": 0.2867395645507891, "loss": 0.2078, "num_input_tokens_seen": 4677344, "step": 5395 }, { "epoch": 2.545968882602546, "grad_norm": 0.0020535781513899565, "learning_rate": 0.2867153391129842, "loss": 0.2029, "num_input_tokens_seen": 4681664, "step": 5400 }, { "epoch": 2.545968882602546, "eval_loss": 0.2191556841135025, "eval_runtime": 21.8946, "eval_samples_per_second": 43.07, "eval_steps_per_second": 21.558, "num_input_tokens_seen": 4681664, "step": 5400 }, { "epoch": 2.5483262611975483, "grad_norm": 0.0016599649097770452, "learning_rate": 0.28669109259195585, "loss": 0.2069, "num_input_tokens_seen": 4685776, "step": 5405 }, { "epoch": 2.5506836397925507, "grad_norm": 0.0031225637067109346, "learning_rate": 0.2866668249914433, "loss": 0.1904, "num_input_tokens_seen": 4690416, "step": 5410 }, { "epoch": 2.553041018387553, "grad_norm": 0.0018391337944194674, "learning_rate": 0.2866425363151889, "loss": 0.1837, "num_input_tokens_seen": 4695328, "step": 5415 }, { "epoch": 2.5553983969825556, "grad_norm": 0.0030899751000106335, "learning_rate": 0.2866182265669382, "loss": 0.2126, "num_input_tokens_seen": 4698960, "step": 5420 }, { "epoch": 2.557755775577558, "grad_norm": 0.0010326363844797015, "learning_rate": 0.28659389575044014, "loss": 0.1468, "num_input_tokens_seen": 4703024, "step": 5425 }, { "epoch": 2.5601131541725604, "grad_norm": 0.0011398557107895613, "learning_rate": 0.28656954386944683, "loss": 0.2017, "num_input_tokens_seen": 4707040, "step": 5430 }, { "epoch": 2.5624705327675623, "grad_norm": 0.002205483615398407, "learning_rate": 0.28654517092771353, "loss": 0.2757, "num_input_tokens_seen": 4710896, "step": 5435 }, { "epoch": 2.5648279113625647, "grad_norm": 0.0039968714118003845, "learning_rate": 0.286520776928999, "loss": 0.1771, "num_input_tokens_seen": 4715456, "step": 5440 }, { "epoch": 2.567185289957567, "grad_norm": 0.001598479226231575, "learning_rate": 0.286496361877065, "loss": 0.238, "num_input_tokens_seen": 4720128, "step": 5445 }, { "epoch": 2.5695426685525695, "grad_norm": 0.004140135832130909, "learning_rate": 0.28647192577567676, "loss": 0.1952, "num_input_tokens_seen": 4725024, "step": 5450 }, { "epoch": 2.571900047147572, "grad_norm": 0.0015684018144384027, "learning_rate": 0.28644746862860254, "loss": 0.2627, "num_input_tokens_seen": 4730592, "step": 5455 }, { "epoch": 2.5742574257425743, "grad_norm": 0.001649559591896832, "learning_rate": 0.2864229904396139, "loss": 0.2376, "num_input_tokens_seen": 4734640, "step": 5460 }, { "epoch": 2.5766148043375767, "grad_norm": 0.001824932754971087, "learning_rate": 0.28639849121248573, "loss": 0.2111, "num_input_tokens_seen": 4738928, "step": 5465 }, { "epoch": 2.578972182932579, "grad_norm": 0.0038809129036962986, "learning_rate": 0.28637397095099615, "loss": 0.2245, "num_input_tokens_seen": 4743520, "step": 5470 }, { "epoch": 2.581329561527581, "grad_norm": 0.0017663838807493448, "learning_rate": 0.28634942965892646, "loss": 0.2502, "num_input_tokens_seen": 4746976, "step": 5475 }, { "epoch": 2.5836869401225835, "grad_norm": 0.0020731070544570684, "learning_rate": 0.28632486734006124, "loss": 0.2029, "num_input_tokens_seen": 4751152, "step": 5480 }, { "epoch": 2.586044318717586, "grad_norm": 0.0020378485787659883, "learning_rate": 0.28630028399818835, "loss": 0.2128, "num_input_tokens_seen": 4756624, "step": 5485 }, { "epoch": 2.5884016973125883, "grad_norm": 0.004616464953869581, "learning_rate": 0.2862756796370987, "loss": 0.2073, "num_input_tokens_seen": 4760416, "step": 5490 }, { "epoch": 2.5907590759075907, "grad_norm": 0.004779658280313015, "learning_rate": 0.2862510542605868, "loss": 0.232, "num_input_tokens_seen": 4765360, "step": 5495 }, { "epoch": 2.593116454502593, "grad_norm": 0.0019398464355617762, "learning_rate": 0.2862264078724501, "loss": 0.176, "num_input_tokens_seen": 4769264, "step": 5500 }, { "epoch": 2.5954738330975955, "grad_norm": 0.003660802496597171, "learning_rate": 0.28620174047648933, "loss": 0.2454, "num_input_tokens_seen": 4774064, "step": 5505 }, { "epoch": 2.597831211692598, "grad_norm": 0.003288082079961896, "learning_rate": 0.2861770520765086, "loss": 0.2096, "num_input_tokens_seen": 4777968, "step": 5510 }, { "epoch": 2.6001885902876003, "grad_norm": 0.002040012739598751, "learning_rate": 0.2861523426763151, "loss": 0.2291, "num_input_tokens_seen": 4781952, "step": 5515 }, { "epoch": 2.6025459688826027, "grad_norm": 0.0022936842869967222, "learning_rate": 0.2861276122797194, "loss": 0.2605, "num_input_tokens_seen": 4786448, "step": 5520 }, { "epoch": 2.604903347477605, "grad_norm": 0.0027759710792452097, "learning_rate": 0.28610286089053516, "loss": 0.2248, "num_input_tokens_seen": 4790512, "step": 5525 }, { "epoch": 2.6072607260726075, "grad_norm": 0.0029014230240136385, "learning_rate": 0.28607808851257943, "loss": 0.2307, "num_input_tokens_seen": 4794832, "step": 5530 }, { "epoch": 2.6096181046676095, "grad_norm": 0.0020303779747337103, "learning_rate": 0.28605329514967237, "loss": 0.2046, "num_input_tokens_seen": 4798688, "step": 5535 }, { "epoch": 2.611975483262612, "grad_norm": 0.005810845643281937, "learning_rate": 0.2860284808056374, "loss": 0.2725, "num_input_tokens_seen": 4802880, "step": 5540 }, { "epoch": 2.6143328618576143, "grad_norm": 0.0018301663221791387, "learning_rate": 0.28600364548430135, "loss": 0.2574, "num_input_tokens_seen": 4807776, "step": 5545 }, { "epoch": 2.6166902404526167, "grad_norm": 0.0049169957637786865, "learning_rate": 0.28597878918949393, "loss": 0.2651, "num_input_tokens_seen": 4812320, "step": 5550 }, { "epoch": 2.619047619047619, "grad_norm": 0.003385751973837614, "learning_rate": 0.2859539119250485, "loss": 0.2476, "num_input_tokens_seen": 4816464, "step": 5555 }, { "epoch": 2.6214049976426215, "grad_norm": 0.002661591162905097, "learning_rate": 0.2859290136948013, "loss": 0.2377, "num_input_tokens_seen": 4821168, "step": 5560 }, { "epoch": 2.623762376237624, "grad_norm": 0.006644025910645723, "learning_rate": 0.28590409450259197, "loss": 0.2105, "num_input_tokens_seen": 4826176, "step": 5565 }, { "epoch": 2.6261197548326263, "grad_norm": 0.003102001966908574, "learning_rate": 0.28587915435226346, "loss": 0.211, "num_input_tokens_seen": 4829856, "step": 5570 }, { "epoch": 2.6284771334276282, "grad_norm": 0.0029168615583330393, "learning_rate": 0.2858541932476617, "loss": 0.1973, "num_input_tokens_seen": 4834288, "step": 5575 }, { "epoch": 2.6308345120226306, "grad_norm": 0.003176958067342639, "learning_rate": 0.2858292111926361, "loss": 0.2102, "num_input_tokens_seen": 4838704, "step": 5580 }, { "epoch": 2.633191890617633, "grad_norm": 0.0017621120205149055, "learning_rate": 0.28580420819103924, "loss": 0.2216, "num_input_tokens_seen": 4843008, "step": 5585 }, { "epoch": 2.6355492692126354, "grad_norm": 0.002810442354530096, "learning_rate": 0.2857791842467269, "loss": 0.238, "num_input_tokens_seen": 4847760, "step": 5590 }, { "epoch": 2.637906647807638, "grad_norm": 0.0016119919018819928, "learning_rate": 0.2857541393635579, "loss": 0.2175, "num_input_tokens_seen": 4852352, "step": 5595 }, { "epoch": 2.6402640264026402, "grad_norm": 0.001806200947612524, "learning_rate": 0.2857290735453948, "loss": 0.2325, "num_input_tokens_seen": 4856928, "step": 5600 }, { "epoch": 2.6402640264026402, "eval_loss": 0.21884675323963165, "eval_runtime": 21.9152, "eval_samples_per_second": 43.03, "eval_steps_per_second": 21.538, "num_input_tokens_seen": 4856928, "step": 5600 }, { "epoch": 2.6426214049976426, "grad_norm": 0.001278447569347918, "learning_rate": 0.28570398679610276, "loss": 0.1866, "num_input_tokens_seen": 4861408, "step": 5605 }, { "epoch": 2.644978783592645, "grad_norm": 0.0029608765617012978, "learning_rate": 0.2856788791195506, "loss": 0.1827, "num_input_tokens_seen": 4865600, "step": 5610 }, { "epoch": 2.6473361621876474, "grad_norm": 0.0031368094496428967, "learning_rate": 0.28565375051961023, "loss": 0.2295, "num_input_tokens_seen": 4869744, "step": 5615 }, { "epoch": 2.64969354078265, "grad_norm": 0.0014416318153962493, "learning_rate": 0.28562860100015686, "loss": 0.2418, "num_input_tokens_seen": 4874448, "step": 5620 }, { "epoch": 2.6520509193776522, "grad_norm": 0.0018157470040023327, "learning_rate": 0.2856034305650687, "loss": 0.2428, "num_input_tokens_seen": 4878128, "step": 5625 }, { "epoch": 2.6544082979726547, "grad_norm": 0.0031846179626882076, "learning_rate": 0.28557823921822756, "loss": 0.2381, "num_input_tokens_seen": 4882048, "step": 5630 }, { "epoch": 2.6567656765676566, "grad_norm": 0.0014706782530993223, "learning_rate": 0.2855530269635181, "loss": 0.2457, "num_input_tokens_seen": 4886432, "step": 5635 }, { "epoch": 2.659123055162659, "grad_norm": 0.0021466852631419897, "learning_rate": 0.2855277938048284, "loss": 0.2219, "num_input_tokens_seen": 4891008, "step": 5640 }, { "epoch": 2.6614804337576614, "grad_norm": 0.0025439192540943623, "learning_rate": 0.2855025397460498, "loss": 0.2348, "num_input_tokens_seen": 4894960, "step": 5645 }, { "epoch": 2.663837812352664, "grad_norm": 0.0031455811113119125, "learning_rate": 0.28547726479107666, "loss": 0.2245, "num_input_tokens_seen": 4899952, "step": 5650 }, { "epoch": 2.666195190947666, "grad_norm": 0.0015372470952570438, "learning_rate": 0.2854519689438068, "loss": 0.2428, "num_input_tokens_seen": 4903712, "step": 5655 }, { "epoch": 2.6685525695426686, "grad_norm": 0.0034673293121159077, "learning_rate": 0.2854266522081412, "loss": 0.2161, "num_input_tokens_seen": 4907488, "step": 5660 }, { "epoch": 2.670909948137671, "grad_norm": 0.0037774250376969576, "learning_rate": 0.28540131458798385, "loss": 0.2032, "num_input_tokens_seen": 4911648, "step": 5665 }, { "epoch": 2.6732673267326734, "grad_norm": 0.0016357001150026917, "learning_rate": 0.28537595608724226, "loss": 0.2255, "num_input_tokens_seen": 4916560, "step": 5670 }, { "epoch": 2.6756247053276754, "grad_norm": 0.0021887377370148897, "learning_rate": 0.28535057670982705, "loss": 0.1899, "num_input_tokens_seen": 4920608, "step": 5675 }, { "epoch": 2.677982083922678, "grad_norm": 0.0022791773080825806, "learning_rate": 0.285325176459652, "loss": 0.2664, "num_input_tokens_seen": 4924672, "step": 5680 }, { "epoch": 2.68033946251768, "grad_norm": 0.0015386005397886038, "learning_rate": 0.28529975534063406, "loss": 0.2085, "num_input_tokens_seen": 4928816, "step": 5685 }, { "epoch": 2.6826968411126826, "grad_norm": 0.0017380303470417857, "learning_rate": 0.2852743133566936, "loss": 0.2277, "num_input_tokens_seen": 4933728, "step": 5690 }, { "epoch": 2.685054219707685, "grad_norm": 0.0013048667460680008, "learning_rate": 0.2852488505117541, "loss": 0.2238, "num_input_tokens_seen": 4937184, "step": 5695 }, { "epoch": 2.6874115983026874, "grad_norm": 0.002688546199351549, "learning_rate": 0.28522336680974214, "loss": 0.2407, "num_input_tokens_seen": 4942016, "step": 5700 }, { "epoch": 2.68976897689769, "grad_norm": 0.0024779201485216618, "learning_rate": 0.2851978622545877, "loss": 0.2237, "num_input_tokens_seen": 4945632, "step": 5705 }, { "epoch": 2.692126355492692, "grad_norm": 0.003268840489909053, "learning_rate": 0.285172336850224, "loss": 0.2091, "num_input_tokens_seen": 4949408, "step": 5710 }, { "epoch": 2.6944837340876946, "grad_norm": 0.0023994813673198223, "learning_rate": 0.2851467906005871, "loss": 0.2774, "num_input_tokens_seen": 4953616, "step": 5715 }, { "epoch": 2.696841112682697, "grad_norm": 0.0040252176113426685, "learning_rate": 0.28512122350961683, "loss": 0.2192, "num_input_tokens_seen": 4958304, "step": 5720 }, { "epoch": 2.6991984912776994, "grad_norm": 0.0016280869022011757, "learning_rate": 0.2850956355812559, "loss": 0.237, "num_input_tokens_seen": 4962576, "step": 5725 }, { "epoch": 2.701555869872702, "grad_norm": 0.0020573402289301157, "learning_rate": 0.28507002681945015, "loss": 0.2291, "num_input_tokens_seen": 4966864, "step": 5730 }, { "epoch": 2.7039132484677038, "grad_norm": 0.0045327614061534405, "learning_rate": 0.28504439722814895, "loss": 0.2345, "num_input_tokens_seen": 4971024, "step": 5735 }, { "epoch": 2.706270627062706, "grad_norm": 0.001511972164735198, "learning_rate": 0.28501874681130457, "loss": 0.2274, "num_input_tokens_seen": 4975392, "step": 5740 }, { "epoch": 2.7086280056577086, "grad_norm": 0.0014178297715261579, "learning_rate": 0.2849930755728727, "loss": 0.222, "num_input_tokens_seen": 4979056, "step": 5745 }, { "epoch": 2.710985384252711, "grad_norm": 0.0014265916543081403, "learning_rate": 0.28496738351681217, "loss": 0.2211, "num_input_tokens_seen": 4983296, "step": 5750 }, { "epoch": 2.7133427628477134, "grad_norm": 0.0016009480459615588, "learning_rate": 0.284941670647085, "loss": 0.1962, "num_input_tokens_seen": 4987456, "step": 5755 }, { "epoch": 2.7157001414427158, "grad_norm": 0.0014200917212292552, "learning_rate": 0.2849159369676563, "loss": 0.2124, "num_input_tokens_seen": 4991200, "step": 5760 }, { "epoch": 2.718057520037718, "grad_norm": 0.00255211372859776, "learning_rate": 0.2848901824824948, "loss": 0.1846, "num_input_tokens_seen": 4995008, "step": 5765 }, { "epoch": 2.7204148986327206, "grad_norm": 0.002506986493244767, "learning_rate": 0.284864407195572, "loss": 0.2369, "num_input_tokens_seen": 4999056, "step": 5770 }, { "epoch": 2.7227722772277225, "grad_norm": 0.0012661832151934505, "learning_rate": 0.28483861111086284, "loss": 0.2409, "num_input_tokens_seen": 5003616, "step": 5775 }, { "epoch": 2.725129655822725, "grad_norm": 0.0027129659429192543, "learning_rate": 0.2848127942323453, "loss": 0.2663, "num_input_tokens_seen": 5008816, "step": 5780 }, { "epoch": 2.7274870344177273, "grad_norm": 0.004870220087468624, "learning_rate": 0.2847869565640007, "loss": 0.2071, "num_input_tokens_seen": 5013008, "step": 5785 }, { "epoch": 2.7298444130127297, "grad_norm": 0.0013695033267140388, "learning_rate": 0.2847610981098136, "loss": 0.2235, "num_input_tokens_seen": 5017072, "step": 5790 }, { "epoch": 2.732201791607732, "grad_norm": 0.003665443742647767, "learning_rate": 0.2847352188737716, "loss": 0.2283, "num_input_tokens_seen": 5021024, "step": 5795 }, { "epoch": 2.7345591702027345, "grad_norm": 0.0037875790148973465, "learning_rate": 0.2847093188598658, "loss": 0.2387, "num_input_tokens_seen": 5024976, "step": 5800 }, { "epoch": 2.7345591702027345, "eval_loss": 0.21710790693759918, "eval_runtime": 21.9232, "eval_samples_per_second": 43.014, "eval_steps_per_second": 21.53, "num_input_tokens_seen": 5024976, "step": 5800 }, { "epoch": 2.736916548797737, "grad_norm": 0.0016749076312407851, "learning_rate": 0.28468339807209003, "loss": 0.2337, "num_input_tokens_seen": 5030048, "step": 5805 }, { "epoch": 2.7392739273927393, "grad_norm": 0.002567852148786187, "learning_rate": 0.2846574565144418, "loss": 0.2268, "num_input_tokens_seen": 5034192, "step": 5810 }, { "epoch": 2.7416313059877417, "grad_norm": 0.002483132528141141, "learning_rate": 0.28463149419092154, "loss": 0.2259, "num_input_tokens_seen": 5037888, "step": 5815 }, { "epoch": 2.743988684582744, "grad_norm": 0.0014428104041144252, "learning_rate": 0.284605511105533, "loss": 0.2336, "num_input_tokens_seen": 5042496, "step": 5820 }, { "epoch": 2.7463460631777465, "grad_norm": 0.0015334381023421884, "learning_rate": 0.28457950726228315, "loss": 0.1897, "num_input_tokens_seen": 5047424, "step": 5825 }, { "epoch": 2.748703441772749, "grad_norm": 0.0014513073256239295, "learning_rate": 0.28455348266518193, "loss": 0.1898, "num_input_tokens_seen": 5051536, "step": 5830 }, { "epoch": 2.751060820367751, "grad_norm": 0.004799570422619581, "learning_rate": 0.28452743731824287, "loss": 0.2929, "num_input_tokens_seen": 5055648, "step": 5835 }, { "epoch": 2.7534181989627533, "grad_norm": 0.0016081944340839982, "learning_rate": 0.28450137122548236, "loss": 0.1953, "num_input_tokens_seen": 5060176, "step": 5840 }, { "epoch": 2.7557755775577557, "grad_norm": 0.0018668733537197113, "learning_rate": 0.2844752843909201, "loss": 0.2237, "num_input_tokens_seen": 5065312, "step": 5845 }, { "epoch": 2.758132956152758, "grad_norm": 0.0045110383071005344, "learning_rate": 0.28444917681857923, "loss": 0.2221, "num_input_tokens_seen": 5069712, "step": 5850 }, { "epoch": 2.7604903347477605, "grad_norm": 0.0019985593389719725, "learning_rate": 0.28442304851248557, "loss": 0.2423, "num_input_tokens_seen": 5073936, "step": 5855 }, { "epoch": 2.762847713342763, "grad_norm": 0.004507777281105518, "learning_rate": 0.2843968994766686, "loss": 0.2105, "num_input_tokens_seen": 5078112, "step": 5860 }, { "epoch": 2.7652050919377653, "grad_norm": 0.0049678729847073555, "learning_rate": 0.28437072971516075, "loss": 0.2275, "num_input_tokens_seen": 5082000, "step": 5865 }, { "epoch": 2.7675624705327677, "grad_norm": 0.003286338411271572, "learning_rate": 0.2843445392319979, "loss": 0.1998, "num_input_tokens_seen": 5086064, "step": 5870 }, { "epoch": 2.7699198491277697, "grad_norm": 0.0027955747209489346, "learning_rate": 0.28431832803121865, "loss": 0.2615, "num_input_tokens_seen": 5090256, "step": 5875 }, { "epoch": 2.772277227722772, "grad_norm": 0.0017423109384253621, "learning_rate": 0.28429209611686534, "loss": 0.232, "num_input_tokens_seen": 5094880, "step": 5880 }, { "epoch": 2.7746346063177745, "grad_norm": 0.0011339137563481927, "learning_rate": 0.28426584349298323, "loss": 0.2358, "num_input_tokens_seen": 5099136, "step": 5885 }, { "epoch": 2.776991984912777, "grad_norm": 0.0017370324349030852, "learning_rate": 0.2842395701636207, "loss": 0.2199, "num_input_tokens_seen": 5103152, "step": 5890 }, { "epoch": 2.7793493635077793, "grad_norm": 0.00155707448720932, "learning_rate": 0.28421327613282954, "loss": 0.2313, "num_input_tokens_seen": 5107776, "step": 5895 }, { "epoch": 2.7817067421027817, "grad_norm": 0.0020146346651017666, "learning_rate": 0.28418696140466454, "loss": 0.2392, "num_input_tokens_seen": 5112480, "step": 5900 }, { "epoch": 2.784064120697784, "grad_norm": 0.0038995950017124414, "learning_rate": 0.2841606259831838, "loss": 0.2261, "num_input_tokens_seen": 5116672, "step": 5905 }, { "epoch": 2.7864214992927865, "grad_norm": 0.003765585832297802, "learning_rate": 0.2841342698724486, "loss": 0.225, "num_input_tokens_seen": 5121072, "step": 5910 }, { "epoch": 2.788778877887789, "grad_norm": 0.004106113687157631, "learning_rate": 0.28410789307652334, "loss": 0.2476, "num_input_tokens_seen": 5125568, "step": 5915 }, { "epoch": 2.7911362564827913, "grad_norm": 0.002156180329620838, "learning_rate": 0.2840814955994756, "loss": 0.2297, "num_input_tokens_seen": 5130560, "step": 5920 }, { "epoch": 2.7934936350777937, "grad_norm": 0.004338707309216261, "learning_rate": 0.2840550774453763, "loss": 0.2204, "num_input_tokens_seen": 5134976, "step": 5925 }, { "epoch": 2.795851013672796, "grad_norm": 0.0024962443858385086, "learning_rate": 0.28402863861829947, "loss": 0.2124, "num_input_tokens_seen": 5139072, "step": 5930 }, { "epoch": 2.798208392267798, "grad_norm": 0.0019814539700746536, "learning_rate": 0.2840021791223222, "loss": 0.2309, "num_input_tokens_seen": 5143776, "step": 5935 }, { "epoch": 2.8005657708628005, "grad_norm": 0.003615087131038308, "learning_rate": 0.2839756989615249, "loss": 0.2338, "num_input_tokens_seen": 5148080, "step": 5940 }, { "epoch": 2.802923149457803, "grad_norm": 0.0017136948881670833, "learning_rate": 0.28394919813999125, "loss": 0.2026, "num_input_tokens_seen": 5153120, "step": 5945 }, { "epoch": 2.8052805280528053, "grad_norm": 0.0014885151758790016, "learning_rate": 0.28392267666180787, "loss": 0.2032, "num_input_tokens_seen": 5157648, "step": 5950 }, { "epoch": 2.8076379066478077, "grad_norm": 0.0025144980754703283, "learning_rate": 0.2838961345310648, "loss": 0.2391, "num_input_tokens_seen": 5162448, "step": 5955 }, { "epoch": 2.80999528524281, "grad_norm": 0.002820675726979971, "learning_rate": 0.2838695717518552, "loss": 0.1649, "num_input_tokens_seen": 5167216, "step": 5960 }, { "epoch": 2.8123526638378125, "grad_norm": 0.0013507091207429767, "learning_rate": 0.28384298832827526, "loss": 0.2027, "num_input_tokens_seen": 5171568, "step": 5965 }, { "epoch": 2.814710042432815, "grad_norm": 0.0037363776937127113, "learning_rate": 0.28381638426442457, "loss": 0.2122, "num_input_tokens_seen": 5175888, "step": 5970 }, { "epoch": 2.817067421027817, "grad_norm": 0.005171388387680054, "learning_rate": 0.2837897595644057, "loss": 0.2315, "num_input_tokens_seen": 5179856, "step": 5975 }, { "epoch": 2.8194247996228192, "grad_norm": 0.0018650018610060215, "learning_rate": 0.28376311423232475, "loss": 0.2499, "num_input_tokens_seen": 5184144, "step": 5980 }, { "epoch": 2.8217821782178216, "grad_norm": 0.004200541414320469, "learning_rate": 0.2837364482722905, "loss": 0.2283, "num_input_tokens_seen": 5189376, "step": 5985 }, { "epoch": 2.824139556812824, "grad_norm": 0.0052624656818807125, "learning_rate": 0.28370976168841533, "loss": 0.2458, "num_input_tokens_seen": 5193568, "step": 5990 }, { "epoch": 2.8264969354078264, "grad_norm": 0.004971596412360668, "learning_rate": 0.2836830544848146, "loss": 0.2204, "num_input_tokens_seen": 5198368, "step": 5995 }, { "epoch": 2.828854314002829, "grad_norm": 0.0026749190874397755, "learning_rate": 0.2836563266656069, "loss": 0.3, "num_input_tokens_seen": 5202368, "step": 6000 }, { "epoch": 2.828854314002829, "eval_loss": 0.2189222127199173, "eval_runtime": 21.9478, "eval_samples_per_second": 42.966, "eval_steps_per_second": 21.506, "num_input_tokens_seen": 5202368, "step": 6000 }, { "epoch": 2.8312116925978312, "grad_norm": 0.003617185400798917, "learning_rate": 0.283629578234914, "loss": 0.1939, "num_input_tokens_seen": 5206368, "step": 6005 }, { "epoch": 2.8335690711928336, "grad_norm": 0.0013957360060885549, "learning_rate": 0.2836028091968608, "loss": 0.1953, "num_input_tokens_seen": 5210160, "step": 6010 }, { "epoch": 2.835926449787836, "grad_norm": 0.001142903114669025, "learning_rate": 0.28357601955557554, "loss": 0.1664, "num_input_tokens_seen": 5214704, "step": 6015 }, { "epoch": 2.8382838283828384, "grad_norm": 0.00453013414517045, "learning_rate": 0.2835492093151894, "loss": 0.287, "num_input_tokens_seen": 5218896, "step": 6020 }, { "epoch": 2.840641206977841, "grad_norm": 0.001208429574035108, "learning_rate": 0.2835223784798369, "loss": 0.1667, "num_input_tokens_seen": 5223232, "step": 6025 }, { "epoch": 2.8429985855728432, "grad_norm": 0.0019413250265643, "learning_rate": 0.2834955270536557, "loss": 0.253, "num_input_tokens_seen": 5227536, "step": 6030 }, { "epoch": 2.845355964167845, "grad_norm": 0.001728573814034462, "learning_rate": 0.2834686550407866, "loss": 0.2015, "num_input_tokens_seen": 5232512, "step": 6035 }, { "epoch": 2.8477133427628476, "grad_norm": 0.0021283936221152544, "learning_rate": 0.28344176244537367, "loss": 0.199, "num_input_tokens_seen": 5237472, "step": 6040 }, { "epoch": 2.85007072135785, "grad_norm": 0.0024944115430116653, "learning_rate": 0.28341484927156396, "loss": 0.2717, "num_input_tokens_seen": 5241904, "step": 6045 }, { "epoch": 2.8524280999528524, "grad_norm": 0.001851448556408286, "learning_rate": 0.28338791552350795, "loss": 0.2384, "num_input_tokens_seen": 5245712, "step": 6050 }, { "epoch": 2.854785478547855, "grad_norm": 0.0036855118814855814, "learning_rate": 0.28336096120535914, "loss": 0.2067, "num_input_tokens_seen": 5250704, "step": 6055 }, { "epoch": 2.857142857142857, "grad_norm": 0.001916488166898489, "learning_rate": 0.2833339863212741, "loss": 0.2289, "num_input_tokens_seen": 5255056, "step": 6060 }, { "epoch": 2.8595002357378596, "grad_norm": 0.0038230468053370714, "learning_rate": 0.28330699087541283, "loss": 0.2099, "num_input_tokens_seen": 5259792, "step": 6065 }, { "epoch": 2.861857614332862, "grad_norm": 0.0024399685207754374, "learning_rate": 0.2832799748719384, "loss": 0.2364, "num_input_tokens_seen": 5263968, "step": 6070 }, { "epoch": 2.864214992927864, "grad_norm": 0.0018647192046046257, "learning_rate": 0.28325293831501686, "loss": 0.2164, "num_input_tokens_seen": 5268112, "step": 6075 }, { "epoch": 2.8665723715228664, "grad_norm": 0.002013539196923375, "learning_rate": 0.2832258812088177, "loss": 0.2359, "num_input_tokens_seen": 5272576, "step": 6080 }, { "epoch": 2.8689297501178688, "grad_norm": 0.0029203263111412525, "learning_rate": 0.2831988035575134, "loss": 0.234, "num_input_tokens_seen": 5276608, "step": 6085 }, { "epoch": 2.871287128712871, "grad_norm": 0.0020659794099628925, "learning_rate": 0.28317170536527975, "loss": 0.2156, "num_input_tokens_seen": 5281344, "step": 6090 }, { "epoch": 2.8736445073078736, "grad_norm": 0.0035701135639101267, "learning_rate": 0.2831445866362956, "loss": 0.2057, "num_input_tokens_seen": 5285088, "step": 6095 }, { "epoch": 2.876001885902876, "grad_norm": 0.0022322030272334814, "learning_rate": 0.2831174473747429, "loss": 0.1958, "num_input_tokens_seen": 5289648, "step": 6100 }, { "epoch": 2.8783592644978784, "grad_norm": 0.005428030155599117, "learning_rate": 0.2830902875848071, "loss": 0.2557, "num_input_tokens_seen": 5294688, "step": 6105 }, { "epoch": 2.880716643092881, "grad_norm": 0.0017749059479683638, "learning_rate": 0.28306310727067635, "loss": 0.2181, "num_input_tokens_seen": 5299264, "step": 6110 }, { "epoch": 2.883074021687883, "grad_norm": 0.0055986144579946995, "learning_rate": 0.2830359064365423, "loss": 0.2958, "num_input_tokens_seen": 5303664, "step": 6115 }, { "epoch": 2.8854314002828856, "grad_norm": 0.002622120315209031, "learning_rate": 0.28300868508659965, "loss": 0.2345, "num_input_tokens_seen": 5308448, "step": 6120 }, { "epoch": 2.887788778877888, "grad_norm": 0.0019304427551105618, "learning_rate": 0.28298144322504626, "loss": 0.2325, "num_input_tokens_seen": 5312080, "step": 6125 }, { "epoch": 2.8901461574728904, "grad_norm": 0.0015272133750841022, "learning_rate": 0.2829541808560832, "loss": 0.2294, "num_input_tokens_seen": 5315936, "step": 6130 }, { "epoch": 2.8925035360678923, "grad_norm": 0.0020525394938886166, "learning_rate": 0.2829268979839146, "loss": 0.2345, "num_input_tokens_seen": 5320640, "step": 6135 }, { "epoch": 2.8948609146628947, "grad_norm": 0.001981817651540041, "learning_rate": 0.2828995946127479, "loss": 0.2238, "num_input_tokens_seen": 5324816, "step": 6140 }, { "epoch": 2.897218293257897, "grad_norm": 0.002629582304507494, "learning_rate": 0.2828722707467936, "loss": 0.2301, "num_input_tokens_seen": 5328512, "step": 6145 }, { "epoch": 2.8995756718528995, "grad_norm": 0.0041525582782924175, "learning_rate": 0.2828449263902653, "loss": 0.2219, "num_input_tokens_seen": 5332848, "step": 6150 }, { "epoch": 2.901933050447902, "grad_norm": 0.0021330085583031178, "learning_rate": 0.28281756154738, "loss": 0.2268, "num_input_tokens_seen": 5337920, "step": 6155 }, { "epoch": 2.9042904290429044, "grad_norm": 0.0022140624932944775, "learning_rate": 0.28279017622235764, "loss": 0.205, "num_input_tokens_seen": 5342432, "step": 6160 }, { "epoch": 2.9066478076379068, "grad_norm": 0.002364426851272583, "learning_rate": 0.28276277041942127, "loss": 0.2196, "num_input_tokens_seen": 5346464, "step": 6165 }, { "epoch": 2.909005186232909, "grad_norm": 0.0017965259030461311, "learning_rate": 0.2827353441427974, "loss": 0.256, "num_input_tokens_seen": 5350944, "step": 6170 }, { "epoch": 2.911362564827911, "grad_norm": 0.002053775591775775, "learning_rate": 0.2827078973967153, "loss": 0.1755, "num_input_tokens_seen": 5354912, "step": 6175 }, { "epoch": 2.9137199434229135, "grad_norm": 0.0017593017546460032, "learning_rate": 0.2826804301854078, "loss": 0.2561, "num_input_tokens_seen": 5359456, "step": 6180 }, { "epoch": 2.916077322017916, "grad_norm": 0.0038356217555701733, "learning_rate": 0.2826529425131105, "loss": 0.2129, "num_input_tokens_seen": 5364336, "step": 6185 }, { "epoch": 2.9184347006129183, "grad_norm": 0.003234922420233488, "learning_rate": 0.2826254343840625, "loss": 0.2141, "num_input_tokens_seen": 5368416, "step": 6190 }, { "epoch": 2.9207920792079207, "grad_norm": 0.0017645374173298478, "learning_rate": 0.2825979058025059, "loss": 0.2454, "num_input_tokens_seen": 5372848, "step": 6195 }, { "epoch": 2.923149457802923, "grad_norm": 0.0019703989382833242, "learning_rate": 0.2825703567726858, "loss": 0.2308, "num_input_tokens_seen": 5377360, "step": 6200 }, { "epoch": 2.923149457802923, "eval_loss": 0.24366508424282074, "eval_runtime": 21.9044, "eval_samples_per_second": 43.051, "eval_steps_per_second": 21.548, "num_input_tokens_seen": 5377360, "step": 6200 }, { "epoch": 2.9255068363979255, "grad_norm": 0.002375092590227723, "learning_rate": 0.2825427872988508, "loss": 0.248, "num_input_tokens_seen": 5381136, "step": 6205 }, { "epoch": 2.927864214992928, "grad_norm": 0.003724662121385336, "learning_rate": 0.28251519738525227, "loss": 0.2553, "num_input_tokens_seen": 5385296, "step": 6210 }, { "epoch": 2.9302215935879303, "grad_norm": 0.0024688835255801678, "learning_rate": 0.28248758703614507, "loss": 0.2244, "num_input_tokens_seen": 5389728, "step": 6215 }, { "epoch": 2.9325789721829327, "grad_norm": 0.002140147378668189, "learning_rate": 0.28245995625578696, "loss": 0.2352, "num_input_tokens_seen": 5393984, "step": 6220 }, { "epoch": 2.934936350777935, "grad_norm": 0.002051682909950614, "learning_rate": 0.282432305048439, "loss": 0.222, "num_input_tokens_seen": 5398048, "step": 6225 }, { "epoch": 2.9372937293729375, "grad_norm": 0.00481885951012373, "learning_rate": 0.28240463341836536, "loss": 0.2513, "num_input_tokens_seen": 5402384, "step": 6230 }, { "epoch": 2.9396511079679395, "grad_norm": 0.0016708329785615206, "learning_rate": 0.2823769413698334, "loss": 0.2155, "num_input_tokens_seen": 5406752, "step": 6235 }, { "epoch": 2.942008486562942, "grad_norm": 0.002223607385531068, "learning_rate": 0.2823492289071135, "loss": 0.2241, "num_input_tokens_seen": 5411120, "step": 6240 }, { "epoch": 2.9443658651579443, "grad_norm": 0.005155738443136215, "learning_rate": 0.2823214960344793, "loss": 0.2766, "num_input_tokens_seen": 5414992, "step": 6245 }, { "epoch": 2.9467232437529467, "grad_norm": 0.00252379453741014, "learning_rate": 0.28229374275620756, "loss": 0.2301, "num_input_tokens_seen": 5419808, "step": 6250 }, { "epoch": 2.949080622347949, "grad_norm": 0.0022023646160960197, "learning_rate": 0.28226596907657814, "loss": 0.1918, "num_input_tokens_seen": 5423840, "step": 6255 }, { "epoch": 2.9514380009429515, "grad_norm": 0.003191194264218211, "learning_rate": 0.28223817499987414, "loss": 0.2423, "num_input_tokens_seen": 5428208, "step": 6260 }, { "epoch": 2.953795379537954, "grad_norm": 0.005027868784964085, "learning_rate": 0.2822103605303818, "loss": 0.2772, "num_input_tokens_seen": 5433744, "step": 6265 }, { "epoch": 2.9561527581329563, "grad_norm": 0.0029444764368236065, "learning_rate": 0.2821825256723903, "loss": 0.2364, "num_input_tokens_seen": 5437568, "step": 6270 }, { "epoch": 2.9585101367279583, "grad_norm": 0.004891206976026297, "learning_rate": 0.2821546704301923, "loss": 0.2418, "num_input_tokens_seen": 5442016, "step": 6275 }, { "epoch": 2.9608675153229607, "grad_norm": 0.0017353928415104747, "learning_rate": 0.2821267948080834, "loss": 0.2242, "num_input_tokens_seen": 5445984, "step": 6280 }, { "epoch": 2.963224893917963, "grad_norm": 0.002085199346765876, "learning_rate": 0.28209889881036226, "loss": 0.2513, "num_input_tokens_seen": 5450192, "step": 6285 }, { "epoch": 2.9655822725129655, "grad_norm": 0.005125412251800299, "learning_rate": 0.28207098244133094, "loss": 0.2382, "num_input_tokens_seen": 5455088, "step": 6290 }, { "epoch": 2.967939651107968, "grad_norm": 0.005192582029849291, "learning_rate": 0.2820430457052943, "loss": 0.252, "num_input_tokens_seen": 5459152, "step": 6295 }, { "epoch": 2.9702970297029703, "grad_norm": 0.0017111661145463586, "learning_rate": 0.28201508860656077, "loss": 0.2398, "num_input_tokens_seen": 5462448, "step": 6300 }, { "epoch": 2.9726544082979727, "grad_norm": 0.004531386308372021, "learning_rate": 0.2819871111494415, "loss": 0.2472, "num_input_tokens_seen": 5466768, "step": 6305 }, { "epoch": 2.975011786892975, "grad_norm": 0.001943724462762475, "learning_rate": 0.28195911333825113, "loss": 0.2381, "num_input_tokens_seen": 5470448, "step": 6310 }, { "epoch": 2.9773691654879775, "grad_norm": 0.0014728866517543793, "learning_rate": 0.28193109517730713, "loss": 0.2441, "num_input_tokens_seen": 5475104, "step": 6315 }, { "epoch": 2.97972654408298, "grad_norm": 0.0012446832843124866, "learning_rate": 0.2819030566709303, "loss": 0.2283, "num_input_tokens_seen": 5479632, "step": 6320 }, { "epoch": 2.9820839226779823, "grad_norm": 0.0017083230195567012, "learning_rate": 0.2818749978234445, "loss": 0.1867, "num_input_tokens_seen": 5483776, "step": 6325 }, { "epoch": 2.9844413012729847, "grad_norm": 0.0021480051800608635, "learning_rate": 0.2818469186391768, "loss": 0.26, "num_input_tokens_seen": 5487952, "step": 6330 }, { "epoch": 2.9867986798679866, "grad_norm": 0.0018338409718126059, "learning_rate": 0.28181881912245743, "loss": 0.2199, "num_input_tokens_seen": 5491840, "step": 6335 }, { "epoch": 2.989156058462989, "grad_norm": 0.001991529716178775, "learning_rate": 0.2817906992776195, "loss": 0.236, "num_input_tokens_seen": 5496976, "step": 6340 }, { "epoch": 2.9915134370579914, "grad_norm": 0.002213717671111226, "learning_rate": 0.28176255910899967, "loss": 0.226, "num_input_tokens_seen": 5500800, "step": 6345 }, { "epoch": 2.993870815652994, "grad_norm": 0.0012342786649242043, "learning_rate": 0.2817343986209373, "loss": 0.2369, "num_input_tokens_seen": 5504688, "step": 6350 }, { "epoch": 2.9962281942479962, "grad_norm": 0.002489026403054595, "learning_rate": 0.2817062178177753, "loss": 0.2348, "num_input_tokens_seen": 5508864, "step": 6355 }, { "epoch": 2.9985855728429986, "grad_norm": 0.002911111107096076, "learning_rate": 0.2816780167038593, "loss": 0.2398, "num_input_tokens_seen": 5512784, "step": 6360 }, { "epoch": 3.000942951438001, "grad_norm": 0.002814113162457943, "learning_rate": 0.28164979528353834, "loss": 0.2374, "num_input_tokens_seen": 5516928, "step": 6365 }, { "epoch": 3.0033003300330035, "grad_norm": 0.0022455279249697924, "learning_rate": 0.28162155356116453, "loss": 0.2349, "num_input_tokens_seen": 5521024, "step": 6370 }, { "epoch": 3.005657708628006, "grad_norm": 0.006765482947230339, "learning_rate": 0.28159329154109314, "loss": 0.2362, "num_input_tokens_seen": 5526416, "step": 6375 }, { "epoch": 3.008015087223008, "grad_norm": 0.0020693691913038492, "learning_rate": 0.28156500922768246, "loss": 0.2298, "num_input_tokens_seen": 5531440, "step": 6380 }, { "epoch": 3.01037246581801, "grad_norm": 0.0027854018844664097, "learning_rate": 0.28153670662529406, "loss": 0.2256, "num_input_tokens_seen": 5535920, "step": 6385 }, { "epoch": 3.0127298444130126, "grad_norm": 0.003534893272444606, "learning_rate": 0.28150838373829246, "loss": 0.2172, "num_input_tokens_seen": 5540432, "step": 6390 }, { "epoch": 3.015087223008015, "grad_norm": 0.0021791390608996153, "learning_rate": 0.2814800405710455, "loss": 0.2345, "num_input_tokens_seen": 5546240, "step": 6395 }, { "epoch": 3.0174446016030174, "grad_norm": 0.0028235982172191143, "learning_rate": 0.2814516771279239, "loss": 0.2332, "num_input_tokens_seen": 5550480, "step": 6400 }, { "epoch": 3.0174446016030174, "eval_loss": 0.21977819502353668, "eval_runtime": 21.8991, "eval_samples_per_second": 43.061, "eval_steps_per_second": 21.553, "num_input_tokens_seen": 5550480, "step": 6400 }, { "epoch": 3.01980198019802, "grad_norm": 0.004244680982083082, "learning_rate": 0.28142329341330186, "loss": 0.2365, "num_input_tokens_seen": 5554144, "step": 6405 }, { "epoch": 3.022159358793022, "grad_norm": 0.0022190907038748264, "learning_rate": 0.2813948894315564, "loss": 0.2199, "num_input_tokens_seen": 5558128, "step": 6410 }, { "epoch": 3.0245167373880246, "grad_norm": 0.0020653260871767998, "learning_rate": 0.2813664651870677, "loss": 0.2466, "num_input_tokens_seen": 5562896, "step": 6415 }, { "epoch": 3.026874115983027, "grad_norm": 0.0012289370642974973, "learning_rate": 0.28133802068421926, "loss": 0.2336, "num_input_tokens_seen": 5567184, "step": 6420 }, { "epoch": 3.0292314945780294, "grad_norm": 0.0032292315736413, "learning_rate": 0.28130955592739754, "loss": 0.2127, "num_input_tokens_seen": 5571984, "step": 6425 }, { "epoch": 3.0315888731730314, "grad_norm": 0.0021824019495397806, "learning_rate": 0.2812810709209922, "loss": 0.2302, "num_input_tokens_seen": 5576528, "step": 6430 }, { "epoch": 3.033946251768034, "grad_norm": 0.0033686519600450993, "learning_rate": 0.2812525656693959, "loss": 0.2384, "num_input_tokens_seen": 5581632, "step": 6435 }, { "epoch": 3.036303630363036, "grad_norm": 0.002757749520242214, "learning_rate": 0.28122404017700453, "loss": 0.2092, "num_input_tokens_seen": 5586720, "step": 6440 }, { "epoch": 3.0386610089580386, "grad_norm": 0.0032411690335720778, "learning_rate": 0.2811954944482171, "loss": 0.1811, "num_input_tokens_seen": 5590256, "step": 6445 }, { "epoch": 3.041018387553041, "grad_norm": 0.0028251393232494593, "learning_rate": 0.2811669284874358, "loss": 0.1974, "num_input_tokens_seen": 5594912, "step": 6450 }, { "epoch": 3.0433757661480434, "grad_norm": 0.00428414810448885, "learning_rate": 0.2811383422990657, "loss": 0.2046, "num_input_tokens_seen": 5599248, "step": 6455 }, { "epoch": 3.045733144743046, "grad_norm": 0.00322744925506413, "learning_rate": 0.2811097358875152, "loss": 0.2642, "num_input_tokens_seen": 5603840, "step": 6460 }, { "epoch": 3.048090523338048, "grad_norm": 0.007903686724603176, "learning_rate": 0.2810811092571959, "loss": 0.2476, "num_input_tokens_seen": 5608688, "step": 6465 }, { "epoch": 3.0504479019330506, "grad_norm": 0.003340390743687749, "learning_rate": 0.28105246241252224, "loss": 0.2277, "num_input_tokens_seen": 5612896, "step": 6470 }, { "epoch": 3.052805280528053, "grad_norm": 0.004942734260112047, "learning_rate": 0.28102379535791194, "loss": 0.2363, "num_input_tokens_seen": 5617232, "step": 6475 }, { "epoch": 3.055162659123055, "grad_norm": 0.0014105342561379075, "learning_rate": 0.2809951080977859, "loss": 0.2238, "num_input_tokens_seen": 5621216, "step": 6480 }, { "epoch": 3.0575200377180574, "grad_norm": 0.0012574895517900586, "learning_rate": 0.28096640063656797, "loss": 0.2254, "num_input_tokens_seen": 5625664, "step": 6485 }, { "epoch": 3.0598774163130598, "grad_norm": 0.0010611230973154306, "learning_rate": 0.2809376729786852, "loss": 0.2511, "num_input_tokens_seen": 5629472, "step": 6490 }, { "epoch": 3.062234794908062, "grad_norm": 0.0030765223782509565, "learning_rate": 0.28090892512856785, "loss": 0.2279, "num_input_tokens_seen": 5633936, "step": 6495 }, { "epoch": 3.0645921735030646, "grad_norm": 0.0016410150565207005, "learning_rate": 0.2808801570906491, "loss": 0.2347, "num_input_tokens_seen": 5638112, "step": 6500 }, { "epoch": 3.066949552098067, "grad_norm": 0.00168778991792351, "learning_rate": 0.2808513688693654, "loss": 0.2093, "num_input_tokens_seen": 5642112, "step": 6505 }, { "epoch": 3.0693069306930694, "grad_norm": 0.00450535025447607, "learning_rate": 0.28082256046915627, "loss": 0.1896, "num_input_tokens_seen": 5646384, "step": 6510 }, { "epoch": 3.0716643092880718, "grad_norm": 0.0023907064460217953, "learning_rate": 0.28079373189446427, "loss": 0.2515, "num_input_tokens_seen": 5650912, "step": 6515 }, { "epoch": 3.074021687883074, "grad_norm": 0.0010663475841283798, "learning_rate": 0.28076488314973513, "loss": 0.1886, "num_input_tokens_seen": 5655472, "step": 6520 }, { "epoch": 3.0763790664780766, "grad_norm": 0.00347104761749506, "learning_rate": 0.28073601423941774, "loss": 0.2565, "num_input_tokens_seen": 5659296, "step": 6525 }, { "epoch": 3.0787364450730785, "grad_norm": 0.0025123795494437218, "learning_rate": 0.28070712516796403, "loss": 0.2324, "num_input_tokens_seen": 5663920, "step": 6530 }, { "epoch": 3.081093823668081, "grad_norm": 0.002451820531859994, "learning_rate": 0.28067821593982906, "loss": 0.2412, "num_input_tokens_seen": 5667840, "step": 6535 }, { "epoch": 3.0834512022630833, "grad_norm": 0.004395333118736744, "learning_rate": 0.28064928655947097, "loss": 0.2486, "num_input_tokens_seen": 5672656, "step": 6540 }, { "epoch": 3.0858085808580857, "grad_norm": 0.0016445236979052424, "learning_rate": 0.28062033703135103, "loss": 0.2204, "num_input_tokens_seen": 5677040, "step": 6545 }, { "epoch": 3.088165959453088, "grad_norm": 0.0038351030088961124, "learning_rate": 0.2805913673599337, "loss": 0.2683, "num_input_tokens_seen": 5680528, "step": 6550 }, { "epoch": 3.0905233380480905, "grad_norm": 0.002283252077177167, "learning_rate": 0.2805623775496864, "loss": 0.2099, "num_input_tokens_seen": 5685248, "step": 6555 }, { "epoch": 3.092880716643093, "grad_norm": 0.001359613612294197, "learning_rate": 0.2805333676050797, "loss": 0.2573, "num_input_tokens_seen": 5688976, "step": 6560 }, { "epoch": 3.0952380952380953, "grad_norm": 0.0030670214910060167, "learning_rate": 0.2805043375305873, "loss": 0.2179, "num_input_tokens_seen": 5692960, "step": 6565 }, { "epoch": 3.0975954738330977, "grad_norm": 0.0019961423240602016, "learning_rate": 0.2804752873306861, "loss": 0.2336, "num_input_tokens_seen": 5697248, "step": 6570 }, { "epoch": 3.0999528524281, "grad_norm": 0.004063866566866636, "learning_rate": 0.2804462170098559, "loss": 0.2256, "num_input_tokens_seen": 5701328, "step": 6575 }, { "epoch": 3.102310231023102, "grad_norm": 0.002048760186880827, "learning_rate": 0.2804171265725797, "loss": 0.2063, "num_input_tokens_seen": 5706528, "step": 6580 }, { "epoch": 3.1046676096181045, "grad_norm": 0.00381287420168519, "learning_rate": 0.28038801602334373, "loss": 0.2236, "num_input_tokens_seen": 5710752, "step": 6585 }, { "epoch": 3.107024988213107, "grad_norm": 0.0024239225313067436, "learning_rate": 0.28035888536663717, "loss": 0.2235, "num_input_tokens_seen": 5715520, "step": 6590 }, { "epoch": 3.1093823668081093, "grad_norm": 0.0027272114530205727, "learning_rate": 0.2803297346069522, "loss": 0.2289, "num_input_tokens_seen": 5719760, "step": 6595 }, { "epoch": 3.1117397454031117, "grad_norm": 0.003628681180998683, "learning_rate": 0.28030056374878437, "loss": 0.2422, "num_input_tokens_seen": 5724080, "step": 6600 }, { "epoch": 3.1117397454031117, "eval_loss": 0.22279927134513855, "eval_runtime": 21.9125, "eval_samples_per_second": 43.035, "eval_steps_per_second": 21.54, "num_input_tokens_seen": 5724080, "step": 6600 }, { "epoch": 3.114097123998114, "grad_norm": 0.001797323115170002, "learning_rate": 0.2802713727966321, "loss": 0.2068, "num_input_tokens_seen": 5728128, "step": 6605 }, { "epoch": 3.1164545025931165, "grad_norm": 0.0011620894074440002, "learning_rate": 0.28024216175499717, "loss": 0.2115, "num_input_tokens_seen": 5732576, "step": 6610 }, { "epoch": 3.118811881188119, "grad_norm": 0.003845749655738473, "learning_rate": 0.2802129306283841, "loss": 0.2201, "num_input_tokens_seen": 5735888, "step": 6615 }, { "epoch": 3.1211692597831213, "grad_norm": 0.0012406290043145418, "learning_rate": 0.28018367942130074, "loss": 0.2567, "num_input_tokens_seen": 5739968, "step": 6620 }, { "epoch": 3.1235266383781237, "grad_norm": 0.0015564817003905773, "learning_rate": 0.28015440813825804, "loss": 0.2291, "num_input_tokens_seen": 5743824, "step": 6625 }, { "epoch": 3.1258840169731257, "grad_norm": 0.003645627060905099, "learning_rate": 0.28012511678377006, "loss": 0.2465, "num_input_tokens_seen": 5748000, "step": 6630 }, { "epoch": 3.128241395568128, "grad_norm": 0.0034321791026741266, "learning_rate": 0.28009580536235373, "loss": 0.228, "num_input_tokens_seen": 5752384, "step": 6635 }, { "epoch": 3.1305987741631305, "grad_norm": 0.00308528495952487, "learning_rate": 0.28006647387852934, "loss": 0.2198, "num_input_tokens_seen": 5757216, "step": 6640 }, { "epoch": 3.132956152758133, "grad_norm": 0.0016433403361588717, "learning_rate": 0.28003712233682015, "loss": 0.226, "num_input_tokens_seen": 5762368, "step": 6645 }, { "epoch": 3.1353135313531353, "grad_norm": 0.0017365470994263887, "learning_rate": 0.2800077507417526, "loss": 0.222, "num_input_tokens_seen": 5766592, "step": 6650 }, { "epoch": 3.1376709099481377, "grad_norm": 0.001847127452492714, "learning_rate": 0.2799783590978561, "loss": 0.2358, "num_input_tokens_seen": 5770624, "step": 6655 }, { "epoch": 3.14002828854314, "grad_norm": 0.0014438219368457794, "learning_rate": 0.2799489474096632, "loss": 0.2483, "num_input_tokens_seen": 5775296, "step": 6660 }, { "epoch": 3.1423856671381425, "grad_norm": 0.0018508339999243617, "learning_rate": 0.27991951568170953, "loss": 0.23, "num_input_tokens_seen": 5779280, "step": 6665 }, { "epoch": 3.144743045733145, "grad_norm": 0.003142034634947777, "learning_rate": 0.2798900639185339, "loss": 0.2174, "num_input_tokens_seen": 5783856, "step": 6670 }, { "epoch": 3.1471004243281473, "grad_norm": 0.001180998864583671, "learning_rate": 0.2798605921246781, "loss": 0.1774, "num_input_tokens_seen": 5788336, "step": 6675 }, { "epoch": 3.1494578029231493, "grad_norm": 0.0009475816041231155, "learning_rate": 0.2798311003046871, "loss": 0.2162, "num_input_tokens_seen": 5793040, "step": 6680 }, { "epoch": 3.1518151815181517, "grad_norm": 0.0027856584638357162, "learning_rate": 0.2798015884631089, "loss": 0.2485, "num_input_tokens_seen": 5798576, "step": 6685 }, { "epoch": 3.154172560113154, "grad_norm": 0.0022963955998420715, "learning_rate": 0.27977205660449445, "loss": 0.2207, "num_input_tokens_seen": 5803760, "step": 6690 }, { "epoch": 3.1565299387081565, "grad_norm": 0.002540385816246271, "learning_rate": 0.2797425047333981, "loss": 0.2336, "num_input_tokens_seen": 5808192, "step": 6695 }, { "epoch": 3.158887317303159, "grad_norm": 0.0018690063152462244, "learning_rate": 0.27971293285437715, "loss": 0.2156, "num_input_tokens_seen": 5812400, "step": 6700 }, { "epoch": 3.1612446958981613, "grad_norm": 0.0016668778844177723, "learning_rate": 0.2796833409719918, "loss": 0.2239, "num_input_tokens_seen": 5817216, "step": 6705 }, { "epoch": 3.1636020744931637, "grad_norm": 0.0016467575915157795, "learning_rate": 0.27965372909080566, "loss": 0.2063, "num_input_tokens_seen": 5820944, "step": 6710 }, { "epoch": 3.165959453088166, "grad_norm": 0.0010513500310480595, "learning_rate": 0.27962409721538506, "loss": 0.2492, "num_input_tokens_seen": 5824912, "step": 6715 }, { "epoch": 3.1683168316831685, "grad_norm": 0.003182976972311735, "learning_rate": 0.27959444535029976, "loss": 0.2705, "num_input_tokens_seen": 5828944, "step": 6720 }, { "epoch": 3.170674210278171, "grad_norm": 0.0019462519558146596, "learning_rate": 0.27956477350012243, "loss": 0.2414, "num_input_tokens_seen": 5832608, "step": 6725 }, { "epoch": 3.173031588873173, "grad_norm": 0.0011494166683405638, "learning_rate": 0.27953508166942875, "loss": 0.248, "num_input_tokens_seen": 5836608, "step": 6730 }, { "epoch": 3.1753889674681752, "grad_norm": 0.0018198189791291952, "learning_rate": 0.27950536986279767, "loss": 0.2303, "num_input_tokens_seen": 5840416, "step": 6735 }, { "epoch": 3.1777463460631776, "grad_norm": 0.0017197655979543924, "learning_rate": 0.2794756380848111, "loss": 0.2412, "num_input_tokens_seen": 5844032, "step": 6740 }, { "epoch": 3.18010372465818, "grad_norm": 0.0013186497380957007, "learning_rate": 0.279445886340054, "loss": 0.1994, "num_input_tokens_seen": 5848336, "step": 6745 }, { "epoch": 3.1824611032531824, "grad_norm": 0.0009659620700404048, "learning_rate": 0.27941611463311455, "loss": 0.2229, "num_input_tokens_seen": 5852976, "step": 6750 }, { "epoch": 3.184818481848185, "grad_norm": 0.0025989271234720945, "learning_rate": 0.2793863229685839, "loss": 0.2748, "num_input_tokens_seen": 5857552, "step": 6755 }, { "epoch": 3.1871758604431872, "grad_norm": 0.001701919361948967, "learning_rate": 0.27935651135105627, "loss": 0.2513, "num_input_tokens_seen": 5862336, "step": 6760 }, { "epoch": 3.1895332390381896, "grad_norm": 0.003486581612378359, "learning_rate": 0.279326679785129, "loss": 0.2313, "num_input_tokens_seen": 5866368, "step": 6765 }, { "epoch": 3.191890617633192, "grad_norm": 0.0012486559571698308, "learning_rate": 0.2792968282754024, "loss": 0.2137, "num_input_tokens_seen": 5871600, "step": 6770 }, { "epoch": 3.1942479962281944, "grad_norm": 0.002585432957857847, "learning_rate": 0.2792669568264801, "loss": 0.2432, "num_input_tokens_seen": 5876560, "step": 6775 }, { "epoch": 3.1966053748231964, "grad_norm": 0.0017338250763714314, "learning_rate": 0.27923706544296856, "loss": 0.2075, "num_input_tokens_seen": 5880256, "step": 6780 }, { "epoch": 3.198962753418199, "grad_norm": 0.001195418299175799, "learning_rate": 0.2792071541294775, "loss": 0.2042, "num_input_tokens_seen": 5884096, "step": 6785 }, { "epoch": 3.201320132013201, "grad_norm": 0.0014828711282461882, "learning_rate": 0.27917722289061947, "loss": 0.2303, "num_input_tokens_seen": 5888256, "step": 6790 }, { "epoch": 3.2036775106082036, "grad_norm": 0.0013843594351783395, "learning_rate": 0.27914727173101034, "loss": 0.221, "num_input_tokens_seen": 5892768, "step": 6795 }, { "epoch": 3.206034889203206, "grad_norm": 0.0013452813727781177, "learning_rate": 0.279117300655269, "loss": 0.2444, "num_input_tokens_seen": 5896688, "step": 6800 }, { "epoch": 3.206034889203206, "eval_loss": 0.22003114223480225, "eval_runtime": 21.9705, "eval_samples_per_second": 42.921, "eval_steps_per_second": 21.483, "num_input_tokens_seen": 5896688, "step": 6800 }, { "epoch": 3.2083922677982084, "grad_norm": 0.0028393915854394436, "learning_rate": 0.2790873096680173, "loss": 0.2168, "num_input_tokens_seen": 5901520, "step": 6805 }, { "epoch": 3.210749646393211, "grad_norm": 0.0011864913394674659, "learning_rate": 0.2790572987738802, "loss": 0.2215, "num_input_tokens_seen": 5905920, "step": 6810 }, { "epoch": 3.213107024988213, "grad_norm": 0.002324012340977788, "learning_rate": 0.27902726797748584, "loss": 0.1941, "num_input_tokens_seen": 5909840, "step": 6815 }, { "epoch": 3.2154644035832156, "grad_norm": 0.0019916014280170202, "learning_rate": 0.2789972172834652, "loss": 0.1844, "num_input_tokens_seen": 5914752, "step": 6820 }, { "epoch": 3.217821782178218, "grad_norm": 0.002011052565649152, "learning_rate": 0.2789671466964527, "loss": 0.2439, "num_input_tokens_seen": 5919424, "step": 6825 }, { "epoch": 3.22017916077322, "grad_norm": 0.0011960356496274471, "learning_rate": 0.2789370562210854, "loss": 0.2498, "num_input_tokens_seen": 5924432, "step": 6830 }, { "epoch": 3.2225365393682224, "grad_norm": 0.0015195002779364586, "learning_rate": 0.27890694586200376, "loss": 0.2208, "num_input_tokens_seen": 5928848, "step": 6835 }, { "epoch": 3.2248939179632248, "grad_norm": 0.0013107417616993189, "learning_rate": 0.2788768156238511, "loss": 0.2411, "num_input_tokens_seen": 5933152, "step": 6840 }, { "epoch": 3.227251296558227, "grad_norm": 0.0017446527490392327, "learning_rate": 0.27884666551127385, "loss": 0.2343, "num_input_tokens_seen": 5937120, "step": 6845 }, { "epoch": 3.2296086751532296, "grad_norm": 0.002317015780135989, "learning_rate": 0.2788164955289217, "loss": 0.2509, "num_input_tokens_seen": 5941248, "step": 6850 }, { "epoch": 3.231966053748232, "grad_norm": 0.0017115857917815447, "learning_rate": 0.27878630568144697, "loss": 0.2325, "num_input_tokens_seen": 5945472, "step": 6855 }, { "epoch": 3.2343234323432344, "grad_norm": 0.0012043580645695329, "learning_rate": 0.2787560959735056, "loss": 0.2135, "num_input_tokens_seen": 5949888, "step": 6860 }, { "epoch": 3.236680810938237, "grad_norm": 0.0025454151909798384, "learning_rate": 0.27872586640975616, "loss": 0.249, "num_input_tokens_seen": 5953632, "step": 6865 }, { "epoch": 3.239038189533239, "grad_norm": 0.001849035150371492, "learning_rate": 0.27869561699486045, "loss": 0.2182, "num_input_tokens_seen": 5957808, "step": 6870 }, { "epoch": 3.2413955681282416, "grad_norm": 0.0020729422103613615, "learning_rate": 0.2786653477334833, "loss": 0.2328, "num_input_tokens_seen": 5961360, "step": 6875 }, { "epoch": 3.2437529467232435, "grad_norm": 0.002039064420387149, "learning_rate": 0.2786350586302926, "loss": 0.2327, "num_input_tokens_seen": 5966528, "step": 6880 }, { "epoch": 3.246110325318246, "grad_norm": 0.0016803611069917679, "learning_rate": 0.27860474968995935, "loss": 0.2383, "num_input_tokens_seen": 5970400, "step": 6885 }, { "epoch": 3.2484677039132484, "grad_norm": 0.0013120239600539207, "learning_rate": 0.27857442091715756, "loss": 0.2004, "num_input_tokens_seen": 5975616, "step": 6890 }, { "epoch": 3.2508250825082508, "grad_norm": 0.0033225726801902056, "learning_rate": 0.27854407231656425, "loss": 0.2458, "num_input_tokens_seen": 5979856, "step": 6895 }, { "epoch": 3.253182461103253, "grad_norm": 0.04170410707592964, "learning_rate": 0.2785137038928596, "loss": 0.292, "num_input_tokens_seen": 5985008, "step": 6900 }, { "epoch": 3.2555398396982556, "grad_norm": 0.001562706078402698, "learning_rate": 0.27848331565072687, "loss": 0.2531, "num_input_tokens_seen": 5990464, "step": 6905 }, { "epoch": 3.257897218293258, "grad_norm": 0.001959222834557295, "learning_rate": 0.27845290759485225, "loss": 0.2153, "num_input_tokens_seen": 5995024, "step": 6910 }, { "epoch": 3.2602545968882604, "grad_norm": 0.0030264370143413544, "learning_rate": 0.278422479729925, "loss": 0.2367, "num_input_tokens_seen": 5999488, "step": 6915 }, { "epoch": 3.2626119754832628, "grad_norm": 0.00212351162917912, "learning_rate": 0.2783920320606375, "loss": 0.2244, "num_input_tokens_seen": 6003536, "step": 6920 }, { "epoch": 3.264969354078265, "grad_norm": 0.0015657602343708277, "learning_rate": 0.2783615645916852, "loss": 0.2367, "num_input_tokens_seen": 6008432, "step": 6925 }, { "epoch": 3.2673267326732676, "grad_norm": 0.0020443841349333525, "learning_rate": 0.2783310773277666, "loss": 0.2359, "num_input_tokens_seen": 6012016, "step": 6930 }, { "epoch": 3.2696841112682695, "grad_norm": 0.0026490509044378996, "learning_rate": 0.2783005702735831, "loss": 0.2279, "num_input_tokens_seen": 6017088, "step": 6935 }, { "epoch": 3.272041489863272, "grad_norm": 0.003506440669298172, "learning_rate": 0.2782700434338394, "loss": 0.2326, "num_input_tokens_seen": 6021360, "step": 6940 }, { "epoch": 3.2743988684582743, "grad_norm": 0.002338959136977792, "learning_rate": 0.278239496813243, "loss": 0.2397, "num_input_tokens_seen": 6025504, "step": 6945 }, { "epoch": 3.2767562470532767, "grad_norm": 0.002607919042930007, "learning_rate": 0.27820893041650463, "loss": 0.2349, "num_input_tokens_seen": 6029824, "step": 6950 }, { "epoch": 3.279113625648279, "grad_norm": 0.0027701915241777897, "learning_rate": 0.27817834424833804, "loss": 0.2246, "num_input_tokens_seen": 6034432, "step": 6955 }, { "epoch": 3.2814710042432815, "grad_norm": 0.0016291236970573664, "learning_rate": 0.27814773831345996, "loss": 0.2174, "num_input_tokens_seen": 6038448, "step": 6960 }, { "epoch": 3.283828382838284, "grad_norm": 0.0015494412509724498, "learning_rate": 0.2781171126165902, "loss": 0.218, "num_input_tokens_seen": 6042256, "step": 6965 }, { "epoch": 3.2861857614332863, "grad_norm": 0.0030371190514415503, "learning_rate": 0.2780864671624517, "loss": 0.2768, "num_input_tokens_seen": 6046064, "step": 6970 }, { "epoch": 3.2885431400282887, "grad_norm": 0.003019363386556506, "learning_rate": 0.27805580195577034, "loss": 0.2107, "num_input_tokens_seen": 6050064, "step": 6975 }, { "epoch": 3.2909005186232907, "grad_norm": 0.00227131019346416, "learning_rate": 0.2780251170012751, "loss": 0.231, "num_input_tokens_seen": 6054176, "step": 6980 }, { "epoch": 3.293257897218293, "grad_norm": 0.0017293553100898862, "learning_rate": 0.27799441230369787, "loss": 0.2244, "num_input_tokens_seen": 6058368, "step": 6985 }, { "epoch": 3.2956152758132955, "grad_norm": 0.0032269610092043877, "learning_rate": 0.27796368786777387, "loss": 0.2286, "num_input_tokens_seen": 6062272, "step": 6990 }, { "epoch": 3.297972654408298, "grad_norm": 0.003913054242730141, "learning_rate": 0.277932943698241, "loss": 0.2252, "num_input_tokens_seen": 6065888, "step": 6995 }, { "epoch": 3.3003300330033003, "grad_norm": 0.005522516090422869, "learning_rate": 0.2779021797998406, "loss": 0.2385, "num_input_tokens_seen": 6070544, "step": 7000 }, { "epoch": 3.3003300330033003, "eval_loss": 0.21738483011722565, "eval_runtime": 21.8656, "eval_samples_per_second": 43.127, "eval_steps_per_second": 21.586, "num_input_tokens_seen": 6070544, "step": 7000 }, { "epoch": 3.3026874115983027, "grad_norm": 0.003493984928354621, "learning_rate": 0.2778713961773167, "loss": 0.2342, "num_input_tokens_seen": 6075024, "step": 7005 }, { "epoch": 3.305044790193305, "grad_norm": 0.0025782797019928694, "learning_rate": 0.2778405928354166, "loss": 0.1719, "num_input_tokens_seen": 6079120, "step": 7010 }, { "epoch": 3.3074021687883075, "grad_norm": 0.0037321806885302067, "learning_rate": 0.27780976977889055, "loss": 0.2646, "num_input_tokens_seen": 6082512, "step": 7015 }, { "epoch": 3.30975954738331, "grad_norm": 0.003182155778631568, "learning_rate": 0.27777892701249185, "loss": 0.2501, "num_input_tokens_seen": 6086992, "step": 7020 }, { "epoch": 3.3121169259783123, "grad_norm": 0.0037068689707666636, "learning_rate": 0.2777480645409768, "loss": 0.2501, "num_input_tokens_seen": 6091984, "step": 7025 }, { "epoch": 3.3144743045733147, "grad_norm": 0.0015096596907824278, "learning_rate": 0.27771718236910486, "loss": 0.2286, "num_input_tokens_seen": 6095632, "step": 7030 }, { "epoch": 3.3168316831683167, "grad_norm": 0.0020208507776260376, "learning_rate": 0.27768628050163835, "loss": 0.2363, "num_input_tokens_seen": 6100656, "step": 7035 }, { "epoch": 3.319189061763319, "grad_norm": 0.0013854358112439513, "learning_rate": 0.2776553589433428, "loss": 0.2264, "num_input_tokens_seen": 6105248, "step": 7040 }, { "epoch": 3.3215464403583215, "grad_norm": 0.0016285768942907453, "learning_rate": 0.27762441769898666, "loss": 0.2162, "num_input_tokens_seen": 6109184, "step": 7045 }, { "epoch": 3.323903818953324, "grad_norm": 0.0013114437460899353, "learning_rate": 0.2775934567733415, "loss": 0.2452, "num_input_tokens_seen": 6113696, "step": 7050 }, { "epoch": 3.3262611975483263, "grad_norm": 0.0021063259337097406, "learning_rate": 0.2775624761711819, "loss": 0.2284, "num_input_tokens_seen": 6118080, "step": 7055 }, { "epoch": 3.3286185761433287, "grad_norm": 0.001402810332365334, "learning_rate": 0.2775314758972854, "loss": 0.2188, "num_input_tokens_seen": 6122576, "step": 7060 }, { "epoch": 3.330975954738331, "grad_norm": 0.002076199045404792, "learning_rate": 0.2775004559564327, "loss": 0.2239, "num_input_tokens_seen": 6126816, "step": 7065 }, { "epoch": 3.3333333333333335, "grad_norm": 0.0018991362303495407, "learning_rate": 0.2774694163534073, "loss": 0.2513, "num_input_tokens_seen": 6131184, "step": 7070 }, { "epoch": 3.335690711928336, "grad_norm": 0.0017437102505937219, "learning_rate": 0.27743835709299614, "loss": 0.2232, "num_input_tokens_seen": 6135424, "step": 7075 }, { "epoch": 3.338048090523338, "grad_norm": 0.0017402650555595756, "learning_rate": 0.2774072781799888, "loss": 0.2185, "num_input_tokens_seen": 6139744, "step": 7080 }, { "epoch": 3.3404054691183402, "grad_norm": 0.001483156462199986, "learning_rate": 0.27737617961917804, "loss": 0.2011, "num_input_tokens_seen": 6143584, "step": 7085 }, { "epoch": 3.3427628477133426, "grad_norm": 0.0016855536960065365, "learning_rate": 0.27734506141535964, "loss": 0.2503, "num_input_tokens_seen": 6148512, "step": 7090 }, { "epoch": 3.345120226308345, "grad_norm": 0.0013885904336348176, "learning_rate": 0.2773139235733325, "loss": 0.2341, "num_input_tokens_seen": 6152928, "step": 7095 }, { "epoch": 3.3474776049033474, "grad_norm": 0.0015265572583302855, "learning_rate": 0.2772827660978984, "loss": 0.2251, "num_input_tokens_seen": 6156992, "step": 7100 }, { "epoch": 3.34983498349835, "grad_norm": 0.0019305471796542406, "learning_rate": 0.27725158899386226, "loss": 0.2316, "num_input_tokens_seen": 6160880, "step": 7105 }, { "epoch": 3.3521923620933523, "grad_norm": 0.0010126088745892048, "learning_rate": 0.27722039226603196, "loss": 0.2223, "num_input_tokens_seen": 6165120, "step": 7110 }, { "epoch": 3.3545497406883547, "grad_norm": 0.0018479155842214823, "learning_rate": 0.2771891759192184, "loss": 0.2329, "num_input_tokens_seen": 6169648, "step": 7115 }, { "epoch": 3.356907119283357, "grad_norm": 0.002539560664445162, "learning_rate": 0.2771579399582355, "loss": 0.2213, "num_input_tokens_seen": 6173552, "step": 7120 }, { "epoch": 3.3592644978783595, "grad_norm": 0.004805716220289469, "learning_rate": 0.2771266843879004, "loss": 0.2238, "num_input_tokens_seen": 6178144, "step": 7125 }, { "epoch": 3.361621876473362, "grad_norm": 0.001320306328125298, "learning_rate": 0.2770954092130329, "loss": 0.2167, "num_input_tokens_seen": 6182448, "step": 7130 }, { "epoch": 3.363979255068364, "grad_norm": 0.0015767494915053248, "learning_rate": 0.27706411443845613, "loss": 0.2556, "num_input_tokens_seen": 6186768, "step": 7135 }, { "epoch": 3.366336633663366, "grad_norm": 0.0031018180307000875, "learning_rate": 0.27703280006899617, "loss": 0.2155, "num_input_tokens_seen": 6191120, "step": 7140 }, { "epoch": 3.3686940122583686, "grad_norm": 0.0015641606878489256, "learning_rate": 0.277001466109482, "loss": 0.2382, "num_input_tokens_seen": 6194688, "step": 7145 }, { "epoch": 3.371051390853371, "grad_norm": 0.0015605957014486194, "learning_rate": 0.2769701125647458, "loss": 0.2206, "num_input_tokens_seen": 6199520, "step": 7150 }, { "epoch": 3.3734087694483734, "grad_norm": 0.0027610515244305134, "learning_rate": 0.27693873943962266, "loss": 0.263, "num_input_tokens_seen": 6203456, "step": 7155 }, { "epoch": 3.375766148043376, "grad_norm": 0.0019244597060605884, "learning_rate": 0.2769073467389506, "loss": 0.2171, "num_input_tokens_seen": 6208640, "step": 7160 }, { "epoch": 3.3781235266383782, "grad_norm": 0.0013964230893179774, "learning_rate": 0.2768759344675709, "loss": 0.2093, "num_input_tokens_seen": 6213392, "step": 7165 }, { "epoch": 3.3804809052333806, "grad_norm": 0.0013954725582152605, "learning_rate": 0.27684450263032767, "loss": 0.2274, "num_input_tokens_seen": 6217952, "step": 7170 }, { "epoch": 3.382838283828383, "grad_norm": 0.0012220799690112472, "learning_rate": 0.2768130512320682, "loss": 0.2229, "num_input_tokens_seen": 6222256, "step": 7175 }, { "epoch": 3.385195662423385, "grad_norm": 0.005545579828321934, "learning_rate": 0.27678158027764244, "loss": 0.2269, "num_input_tokens_seen": 6226496, "step": 7180 }, { "epoch": 3.3875530410183874, "grad_norm": 0.001451921765692532, "learning_rate": 0.27675008977190385, "loss": 0.1661, "num_input_tokens_seen": 6231312, "step": 7185 }, { "epoch": 3.38991041961339, "grad_norm": 0.0010781936580315232, "learning_rate": 0.2767185797197086, "loss": 0.2262, "num_input_tokens_seen": 6236048, "step": 7190 }, { "epoch": 3.392267798208392, "grad_norm": 0.002580445259809494, "learning_rate": 0.2766870501259159, "loss": 0.2799, "num_input_tokens_seen": 6239840, "step": 7195 }, { "epoch": 3.3946251768033946, "grad_norm": 0.0026025348342955112, "learning_rate": 0.276655500995388, "loss": 0.2365, "num_input_tokens_seen": 6244624, "step": 7200 }, { "epoch": 3.3946251768033946, "eval_loss": 0.2422030121088028, "eval_runtime": 21.8902, "eval_samples_per_second": 43.079, "eval_steps_per_second": 21.562, "num_input_tokens_seen": 6244624, "step": 7200 }, { "epoch": 3.396982555398397, "grad_norm": 0.002089216373860836, "learning_rate": 0.27662393233299015, "loss": 0.2468, "num_input_tokens_seen": 6248816, "step": 7205 }, { "epoch": 3.3993399339933994, "grad_norm": 0.001023127930238843, "learning_rate": 0.27659234414359074, "loss": 0.2213, "num_input_tokens_seen": 6252736, "step": 7210 }, { "epoch": 3.401697312588402, "grad_norm": 0.0016543582314625382, "learning_rate": 0.27656073643206097, "loss": 0.2212, "num_input_tokens_seen": 6256704, "step": 7215 }, { "epoch": 3.404054691183404, "grad_norm": 0.0019142511300742626, "learning_rate": 0.27652910920327517, "loss": 0.2214, "num_input_tokens_seen": 6260096, "step": 7220 }, { "epoch": 3.4064120697784066, "grad_norm": 0.001541539211757481, "learning_rate": 0.2764974624621107, "loss": 0.2095, "num_input_tokens_seen": 6264032, "step": 7225 }, { "epoch": 3.408769448373409, "grad_norm": 0.0027256172616034746, "learning_rate": 0.2764657962134479, "loss": 0.2085, "num_input_tokens_seen": 6268096, "step": 7230 }, { "epoch": 3.411126826968411, "grad_norm": 0.002020112005993724, "learning_rate": 0.27643411046217, "loss": 0.198, "num_input_tokens_seen": 6272848, "step": 7235 }, { "epoch": 3.4134842055634134, "grad_norm": 0.0038403095677495003, "learning_rate": 0.27640240521316334, "loss": 0.2779, "num_input_tokens_seen": 6277616, "step": 7240 }, { "epoch": 3.4158415841584158, "grad_norm": 0.0017760127084329724, "learning_rate": 0.2763706804713174, "loss": 0.2113, "num_input_tokens_seen": 6282768, "step": 7245 }, { "epoch": 3.418198962753418, "grad_norm": 0.0017612036317586899, "learning_rate": 0.2763389362415245, "loss": 0.2208, "num_input_tokens_seen": 6287680, "step": 7250 }, { "epoch": 3.4205563413484206, "grad_norm": 0.0023202865850180387, "learning_rate": 0.27630717252867987, "loss": 0.1709, "num_input_tokens_seen": 6292032, "step": 7255 }, { "epoch": 3.422913719943423, "grad_norm": 0.003048066282644868, "learning_rate": 0.276275389337682, "loss": 0.1757, "num_input_tokens_seen": 6296176, "step": 7260 }, { "epoch": 3.4252710985384254, "grad_norm": 0.0016517804469913244, "learning_rate": 0.2762435866734322, "loss": 0.2036, "num_input_tokens_seen": 6300896, "step": 7265 }, { "epoch": 3.4276284771334278, "grad_norm": 0.002743727993220091, "learning_rate": 0.27621176454083485, "loss": 0.2664, "num_input_tokens_seen": 6304912, "step": 7270 }, { "epoch": 3.42998585572843, "grad_norm": 0.0014227003557607532, "learning_rate": 0.2761799229447973, "loss": 0.2139, "num_input_tokens_seen": 6309328, "step": 7275 }, { "epoch": 3.432343234323432, "grad_norm": 0.0024173902347683907, "learning_rate": 0.27614806189023006, "loss": 0.2174, "num_input_tokens_seen": 6314192, "step": 7280 }, { "epoch": 3.4347006129184345, "grad_norm": 0.002194293076172471, "learning_rate": 0.27611618138204636, "loss": 0.2102, "num_input_tokens_seen": 6318464, "step": 7285 }, { "epoch": 3.437057991513437, "grad_norm": 0.003310837782919407, "learning_rate": 0.2760842814251626, "loss": 0.1846, "num_input_tokens_seen": 6322560, "step": 7290 }, { "epoch": 3.4394153701084393, "grad_norm": 0.0026300654280930758, "learning_rate": 0.2760523620244982, "loss": 0.2021, "num_input_tokens_seen": 6326320, "step": 7295 }, { "epoch": 3.4417727487034417, "grad_norm": 0.0044151959009468555, "learning_rate": 0.27602042318497544, "loss": 0.233, "num_input_tokens_seen": 6330192, "step": 7300 }, { "epoch": 3.444130127298444, "grad_norm": 0.0018041933653876185, "learning_rate": 0.2759884649115198, "loss": 0.2056, "num_input_tokens_seen": 6334288, "step": 7305 }, { "epoch": 3.4464875058934465, "grad_norm": 0.0023777398746460676, "learning_rate": 0.2759564872090596, "loss": 0.2035, "num_input_tokens_seen": 6338400, "step": 7310 }, { "epoch": 3.448844884488449, "grad_norm": 0.002719591837376356, "learning_rate": 0.2759244900825262, "loss": 0.2184, "num_input_tokens_seen": 6342976, "step": 7315 }, { "epoch": 3.4512022630834513, "grad_norm": 0.00211810739710927, "learning_rate": 0.2758924735368539, "loss": 0.234, "num_input_tokens_seen": 6346944, "step": 7320 }, { "epoch": 3.4535596416784538, "grad_norm": 0.0014107570750638843, "learning_rate": 0.27586043757698014, "loss": 0.2192, "num_input_tokens_seen": 6351424, "step": 7325 }, { "epoch": 3.455917020273456, "grad_norm": 0.0032008313573896885, "learning_rate": 0.27582838220784534, "loss": 0.1935, "num_input_tokens_seen": 6355392, "step": 7330 }, { "epoch": 3.458274398868458, "grad_norm": 0.0025628795847296715, "learning_rate": 0.27579630743439265, "loss": 0.1731, "num_input_tokens_seen": 6359040, "step": 7335 }, { "epoch": 3.4606317774634605, "grad_norm": 0.0017497598892077804, "learning_rate": 0.2757642132615686, "loss": 0.2006, "num_input_tokens_seen": 6363280, "step": 7340 }, { "epoch": 3.462989156058463, "grad_norm": 0.002716874470934272, "learning_rate": 0.2757320996943223, "loss": 0.2698, "num_input_tokens_seen": 6368224, "step": 7345 }, { "epoch": 3.4653465346534653, "grad_norm": 0.002742862794548273, "learning_rate": 0.2756999667376062, "loss": 0.2664, "num_input_tokens_seen": 6372240, "step": 7350 }, { "epoch": 3.4677039132484677, "grad_norm": 0.003939539194107056, "learning_rate": 0.2756678143963756, "loss": 0.2445, "num_input_tokens_seen": 6376352, "step": 7355 }, { "epoch": 3.47006129184347, "grad_norm": 0.004275953397154808, "learning_rate": 0.2756356426755888, "loss": 0.242, "num_input_tokens_seen": 6380864, "step": 7360 }, { "epoch": 3.4724186704384725, "grad_norm": 0.0020021451637148857, "learning_rate": 0.27560345158020705, "loss": 0.2079, "num_input_tokens_seen": 6385984, "step": 7365 }, { "epoch": 3.474776049033475, "grad_norm": 0.0025579510256648064, "learning_rate": 0.27557124111519465, "loss": 0.2258, "num_input_tokens_seen": 6390864, "step": 7370 }, { "epoch": 3.4771334276284773, "grad_norm": 0.001775909448042512, "learning_rate": 0.27553901128551883, "loss": 0.2774, "num_input_tokens_seen": 6394768, "step": 7375 }, { "epoch": 3.4794908062234793, "grad_norm": 0.00175549800042063, "learning_rate": 0.2755067620961498, "loss": 0.2605, "num_input_tokens_seen": 6399344, "step": 7380 }, { "epoch": 3.4818481848184817, "grad_norm": 0.00382578675635159, "learning_rate": 0.27547449355206094, "loss": 0.2383, "num_input_tokens_seen": 6403904, "step": 7385 }, { "epoch": 3.484205563413484, "grad_norm": 0.0027317763306200504, "learning_rate": 0.2754422056582283, "loss": 0.2525, "num_input_tokens_seen": 6408240, "step": 7390 }, { "epoch": 3.4865629420084865, "grad_norm": 0.0036534599494189024, "learning_rate": 0.27540989841963115, "loss": 0.2305, "num_input_tokens_seen": 6412352, "step": 7395 }, { "epoch": 3.488920320603489, "grad_norm": 0.0017173553351312876, "learning_rate": 0.27537757184125167, "loss": 0.2499, "num_input_tokens_seen": 6416176, "step": 7400 }, { "epoch": 3.488920320603489, "eval_loss": 0.22008851170539856, "eval_runtime": 21.9493, "eval_samples_per_second": 42.963, "eval_steps_per_second": 21.504, "num_input_tokens_seen": 6416176, "step": 7400 }, { "epoch": 3.4912776991984913, "grad_norm": 0.0015983363846316934, "learning_rate": 0.275345225928075, "loss": 0.2229, "num_input_tokens_seen": 6420416, "step": 7405 }, { "epoch": 3.4936350777934937, "grad_norm": 0.0015859893755987287, "learning_rate": 0.2753128606850893, "loss": 0.2134, "num_input_tokens_seen": 6424592, "step": 7410 }, { "epoch": 3.495992456388496, "grad_norm": 0.0019025172805413604, "learning_rate": 0.2752804761172858, "loss": 0.2288, "num_input_tokens_seen": 6429328, "step": 7415 }, { "epoch": 3.4983498349834985, "grad_norm": 0.0035407382529228926, "learning_rate": 0.27524807222965836, "loss": 0.249, "num_input_tokens_seen": 6433776, "step": 7420 }, { "epoch": 3.500707213578501, "grad_norm": 0.002059070859104395, "learning_rate": 0.27521564902720436, "loss": 0.2117, "num_input_tokens_seen": 6437840, "step": 7425 }, { "epoch": 3.5030645921735033, "grad_norm": 0.0020002226810902357, "learning_rate": 0.2751832065149236, "loss": 0.2144, "num_input_tokens_seen": 6441744, "step": 7430 }, { "epoch": 3.5054219707685053, "grad_norm": 0.002228444442152977, "learning_rate": 0.2751507446978193, "loss": 0.1877, "num_input_tokens_seen": 6445440, "step": 7435 }, { "epoch": 3.5077793493635077, "grad_norm": 0.0012377174571156502, "learning_rate": 0.2751182635808974, "loss": 0.1826, "num_input_tokens_seen": 6449888, "step": 7440 }, { "epoch": 3.51013672795851, "grad_norm": 0.005509630311280489, "learning_rate": 0.27508576316916694, "loss": 0.2039, "num_input_tokens_seen": 6453808, "step": 7445 }, { "epoch": 3.5124941065535125, "grad_norm": 0.0023684920743107796, "learning_rate": 0.2750532434676399, "loss": 0.2596, "num_input_tokens_seen": 6457696, "step": 7450 }, { "epoch": 3.514851485148515, "grad_norm": 0.0028551174327731133, "learning_rate": 0.27502070448133115, "loss": 0.1979, "num_input_tokens_seen": 6462960, "step": 7455 }, { "epoch": 3.5172088637435173, "grad_norm": 0.001305055688135326, "learning_rate": 0.2749881462152587, "loss": 0.2025, "num_input_tokens_seen": 6467536, "step": 7460 }, { "epoch": 3.5195662423385197, "grad_norm": 0.004031731281429529, "learning_rate": 0.2749555686744434, "loss": 0.2452, "num_input_tokens_seen": 6471616, "step": 7465 }, { "epoch": 3.521923620933522, "grad_norm": 0.001082683796994388, "learning_rate": 0.2749229718639091, "loss": 0.2318, "num_input_tokens_seen": 6475824, "step": 7470 }, { "epoch": 3.524280999528524, "grad_norm": 0.0013505717506632209, "learning_rate": 0.27489035578868265, "loss": 0.2329, "num_input_tokens_seen": 6479808, "step": 7475 }, { "epoch": 3.5266383781235264, "grad_norm": 0.0033814788330346346, "learning_rate": 0.2748577204537939, "loss": 0.2364, "num_input_tokens_seen": 6484048, "step": 7480 }, { "epoch": 3.528995756718529, "grad_norm": 0.0017407538834959269, "learning_rate": 0.2748250658642756, "loss": 0.2321, "num_input_tokens_seen": 6488784, "step": 7485 }, { "epoch": 3.5313531353135312, "grad_norm": 0.0013526425464078784, "learning_rate": 0.2747923920251634, "loss": 0.2098, "num_input_tokens_seen": 6493264, "step": 7490 }, { "epoch": 3.5337105139085336, "grad_norm": 0.0012853119987994432, "learning_rate": 0.27475969894149627, "loss": 0.2469, "num_input_tokens_seen": 6498368, "step": 7495 }, { "epoch": 3.536067892503536, "grad_norm": 0.0012513651745393872, "learning_rate": 0.2747269866183156, "loss": 0.2355, "num_input_tokens_seen": 6502416, "step": 7500 }, { "epoch": 3.5384252710985384, "grad_norm": 0.0013078675838187337, "learning_rate": 0.27469425506066625, "loss": 0.2035, "num_input_tokens_seen": 6507264, "step": 7505 }, { "epoch": 3.540782649693541, "grad_norm": 0.0013946976978331804, "learning_rate": 0.27466150427359576, "loss": 0.2354, "num_input_tokens_seen": 6511280, "step": 7510 }, { "epoch": 3.5431400282885432, "grad_norm": 0.001269750064238906, "learning_rate": 0.2746287342621547, "loss": 0.2088, "num_input_tokens_seen": 6515312, "step": 7515 }, { "epoch": 3.5454974068835456, "grad_norm": 0.0023586887400597334, "learning_rate": 0.2745959450313966, "loss": 0.1954, "num_input_tokens_seen": 6519312, "step": 7520 }, { "epoch": 3.547854785478548, "grad_norm": 0.0049217464402318, "learning_rate": 0.27456313658637804, "loss": 0.2771, "num_input_tokens_seen": 6524416, "step": 7525 }, { "epoch": 3.5502121640735504, "grad_norm": 0.001406246330589056, "learning_rate": 0.27453030893215846, "loss": 0.2125, "num_input_tokens_seen": 6528720, "step": 7530 }, { "epoch": 3.5525695426685524, "grad_norm": 0.001640518195927143, "learning_rate": 0.2744974620738003, "loss": 0.2326, "num_input_tokens_seen": 6533296, "step": 7535 }, { "epoch": 3.554926921263555, "grad_norm": 0.0025143090169876814, "learning_rate": 0.27446459601636897, "loss": 0.2319, "num_input_tokens_seen": 6537792, "step": 7540 }, { "epoch": 3.557284299858557, "grad_norm": 0.0016905986703932285, "learning_rate": 0.2744317107649328, "loss": 0.2246, "num_input_tokens_seen": 6542208, "step": 7545 }, { "epoch": 3.5596416784535596, "grad_norm": 0.0020396022591739893, "learning_rate": 0.2743988063245631, "loss": 0.258, "num_input_tokens_seen": 6546496, "step": 7550 }, { "epoch": 3.561999057048562, "grad_norm": 0.002980960300192237, "learning_rate": 0.2743658827003342, "loss": 0.2256, "num_input_tokens_seen": 6550288, "step": 7555 }, { "epoch": 3.5643564356435644, "grad_norm": 0.002304153284057975, "learning_rate": 0.27433293989732327, "loss": 0.203, "num_input_tokens_seen": 6554416, "step": 7560 }, { "epoch": 3.566713814238567, "grad_norm": 0.001734415884129703, "learning_rate": 0.27429997792061056, "loss": 0.2125, "num_input_tokens_seen": 6559232, "step": 7565 }, { "epoch": 3.569071192833569, "grad_norm": 0.0008995212847366929, "learning_rate": 0.27426699677527927, "loss": 0.2433, "num_input_tokens_seen": 6563008, "step": 7570 }, { "epoch": 3.571428571428571, "grad_norm": 0.001496580196544528, "learning_rate": 0.2742339964664154, "loss": 0.2617, "num_input_tokens_seen": 6566848, "step": 7575 }, { "epoch": 3.5737859500235736, "grad_norm": 0.0024066439364105463, "learning_rate": 0.274200976999108, "loss": 0.2322, "num_input_tokens_seen": 6571664, "step": 7580 }, { "epoch": 3.576143328618576, "grad_norm": 0.0023740872275084257, "learning_rate": 0.27416793837844916, "loss": 0.2047, "num_input_tokens_seen": 6575616, "step": 7585 }, { "epoch": 3.5785007072135784, "grad_norm": 0.0024361128453165293, "learning_rate": 0.27413488060953384, "loss": 0.2437, "num_input_tokens_seen": 6579280, "step": 7590 }, { "epoch": 3.580858085808581, "grad_norm": 0.0012804957805201411, "learning_rate": 0.27410180369745996, "loss": 0.2294, "num_input_tokens_seen": 6583472, "step": 7595 }, { "epoch": 3.583215464403583, "grad_norm": 0.0010128882713615894, "learning_rate": 0.27406870764732844, "loss": 0.2372, "num_input_tokens_seen": 6587616, "step": 7600 }, { "epoch": 3.583215464403583, "eval_loss": 0.22464415431022644, "eval_runtime": 21.9075, "eval_samples_per_second": 43.045, "eval_steps_per_second": 21.545, "num_input_tokens_seen": 6587616, "step": 7600 }, { "epoch": 3.5855728429985856, "grad_norm": 0.0010941236978396773, "learning_rate": 0.27403559246424297, "loss": 0.2348, "num_input_tokens_seen": 6591680, "step": 7605 }, { "epoch": 3.587930221593588, "grad_norm": 0.0013178616063669324, "learning_rate": 0.2740024581533105, "loss": 0.2158, "num_input_tokens_seen": 6595344, "step": 7610 }, { "epoch": 3.5902876001885904, "grad_norm": 0.0009237665217369795, "learning_rate": 0.2739693047196406, "loss": 0.1956, "num_input_tokens_seen": 6598832, "step": 7615 }, { "epoch": 3.592644978783593, "grad_norm": 0.0018076588166877627, "learning_rate": 0.27393613216834606, "loss": 0.2635, "num_input_tokens_seen": 6603056, "step": 7620 }, { "epoch": 3.595002357378595, "grad_norm": 0.002524383831769228, "learning_rate": 0.2739029405045424, "loss": 0.2527, "num_input_tokens_seen": 6607040, "step": 7625 }, { "epoch": 3.5973597359735976, "grad_norm": 0.002406864194199443, "learning_rate": 0.2738697297333483, "loss": 0.1826, "num_input_tokens_seen": 6611840, "step": 7630 }, { "epoch": 3.5997171145685996, "grad_norm": 0.0014414064353331923, "learning_rate": 0.2738364998598852, "loss": 0.2329, "num_input_tokens_seen": 6616608, "step": 7635 }, { "epoch": 3.602074493163602, "grad_norm": 0.0011891728499904275, "learning_rate": 0.27380325088927765, "loss": 0.2146, "num_input_tokens_seen": 6620624, "step": 7640 }, { "epoch": 3.6044318717586044, "grad_norm": 0.001107415766455233, "learning_rate": 0.27376998282665294, "loss": 0.2238, "num_input_tokens_seen": 6624656, "step": 7645 }, { "epoch": 3.6067892503536068, "grad_norm": 0.0012750504538416862, "learning_rate": 0.27373669567714154, "loss": 0.2206, "num_input_tokens_seen": 6628976, "step": 7650 }, { "epoch": 3.609146628948609, "grad_norm": 0.0017991869244724512, "learning_rate": 0.27370338944587663, "loss": 0.2313, "num_input_tokens_seen": 6633744, "step": 7655 }, { "epoch": 3.6115040075436116, "grad_norm": 0.0022403558250516653, "learning_rate": 0.27367006413799455, "loss": 0.2156, "num_input_tokens_seen": 6638256, "step": 7660 }, { "epoch": 3.613861386138614, "grad_norm": 0.0018583911005407572, "learning_rate": 0.2736367197586345, "loss": 0.2335, "num_input_tokens_seen": 6643408, "step": 7665 }, { "epoch": 3.6162187647336164, "grad_norm": 0.0026185179594904184, "learning_rate": 0.2736033563129385, "loss": 0.2347, "num_input_tokens_seen": 6647744, "step": 7670 }, { "epoch": 3.6185761433286183, "grad_norm": 0.00175188307184726, "learning_rate": 0.27356997380605164, "loss": 0.2146, "num_input_tokens_seen": 6652448, "step": 7675 }, { "epoch": 3.6209335219236207, "grad_norm": 0.0016861824551597238, "learning_rate": 0.27353657224312194, "loss": 0.198, "num_input_tokens_seen": 6656576, "step": 7680 }, { "epoch": 3.623290900518623, "grad_norm": 0.002153982874006033, "learning_rate": 0.2735031516293004, "loss": 0.21, "num_input_tokens_seen": 6661088, "step": 7685 }, { "epoch": 3.6256482791136255, "grad_norm": 0.0012487162603065372, "learning_rate": 0.2734697119697408, "loss": 0.2118, "num_input_tokens_seen": 6665008, "step": 7690 }, { "epoch": 3.628005657708628, "grad_norm": 0.0018250724533572793, "learning_rate": 0.27343625326959997, "loss": 0.1919, "num_input_tokens_seen": 6668880, "step": 7695 }, { "epoch": 3.6303630363036303, "grad_norm": 0.0011959741823375225, "learning_rate": 0.27340277553403775, "loss": 0.238, "num_input_tokens_seen": 6673984, "step": 7700 }, { "epoch": 3.6327204148986327, "grad_norm": 0.0015136012807488441, "learning_rate": 0.2733692787682167, "loss": 0.1635, "num_input_tokens_seen": 6678080, "step": 7705 }, { "epoch": 3.635077793493635, "grad_norm": 0.0023738176096230745, "learning_rate": 0.27333576297730255, "loss": 0.1934, "num_input_tokens_seen": 6682416, "step": 7710 }, { "epoch": 3.6374351720886375, "grad_norm": 0.0035761655308306217, "learning_rate": 0.2733022281664638, "loss": 0.2469, "num_input_tokens_seen": 6687072, "step": 7715 }, { "epoch": 3.63979255068364, "grad_norm": 0.0020619500428438187, "learning_rate": 0.273268674340872, "loss": 0.1793, "num_input_tokens_seen": 6691824, "step": 7720 }, { "epoch": 3.6421499292786423, "grad_norm": 0.033405501395463943, "learning_rate": 0.27323510150570146, "loss": 0.2553, "num_input_tokens_seen": 6696128, "step": 7725 }, { "epoch": 3.6445073078736447, "grad_norm": 0.0029146000742912292, "learning_rate": 0.27320150966612966, "loss": 0.2503, "num_input_tokens_seen": 6700192, "step": 7730 }, { "epoch": 3.6468646864686467, "grad_norm": 0.004211536142975092, "learning_rate": 0.2731678988273368, "loss": 0.2344, "num_input_tokens_seen": 6704064, "step": 7735 }, { "epoch": 3.649222065063649, "grad_norm": 0.0037623511161655188, "learning_rate": 0.27313426899450605, "loss": 0.2263, "num_input_tokens_seen": 6708944, "step": 7740 }, { "epoch": 3.6515794436586515, "grad_norm": 0.0022754922974854708, "learning_rate": 0.27310062017282366, "loss": 0.2291, "num_input_tokens_seen": 6713184, "step": 7745 }, { "epoch": 3.653936822253654, "grad_norm": 0.15484841167926788, "learning_rate": 0.2730669523674787, "loss": 0.3082, "num_input_tokens_seen": 6717424, "step": 7750 }, { "epoch": 3.6562942008486563, "grad_norm": 0.004124440718442202, "learning_rate": 0.2730332655836631, "loss": 0.1846, "num_input_tokens_seen": 6720960, "step": 7755 }, { "epoch": 3.6586515794436587, "grad_norm": 0.009590270929038525, "learning_rate": 0.2729995598265718, "loss": 0.3028, "num_input_tokens_seen": 6724800, "step": 7760 }, { "epoch": 3.661008958038661, "grad_norm": 0.004252584185451269, "learning_rate": 0.2729658351014027, "loss": 0.2277, "num_input_tokens_seen": 6728928, "step": 7765 }, { "epoch": 3.6633663366336635, "grad_norm": 0.004570796620100737, "learning_rate": 0.27293209141335656, "loss": 0.2213, "num_input_tokens_seen": 6732736, "step": 7770 }, { "epoch": 3.6657237152286655, "grad_norm": 0.005625327117741108, "learning_rate": 0.27289832876763703, "loss": 0.2587, "num_input_tokens_seen": 6737328, "step": 7775 }, { "epoch": 3.668081093823668, "grad_norm": 0.0033283380325883627, "learning_rate": 0.27286454716945074, "loss": 0.2251, "num_input_tokens_seen": 6741760, "step": 7780 }, { "epoch": 3.6704384724186703, "grad_norm": 0.0032676574774086475, "learning_rate": 0.27283074662400725, "loss": 0.2363, "num_input_tokens_seen": 6745952, "step": 7785 }, { "epoch": 3.6727958510136727, "grad_norm": 0.004196304362267256, "learning_rate": 0.2727969271365191, "loss": 0.2778, "num_input_tokens_seen": 6750096, "step": 7790 }, { "epoch": 3.675153229608675, "grad_norm": 0.03748945891857147, "learning_rate": 0.2727630887122016, "loss": 0.2278, "num_input_tokens_seen": 6754704, "step": 7795 }, { "epoch": 3.6775106082036775, "grad_norm": 0.035863518714904785, "learning_rate": 0.27272923135627314, "loss": 0.2897, "num_input_tokens_seen": 6759696, "step": 7800 }, { "epoch": 3.6775106082036775, "eval_loss": 0.3597075641155243, "eval_runtime": 21.8614, "eval_samples_per_second": 43.135, "eval_steps_per_second": 21.591, "num_input_tokens_seen": 6759696, "step": 7800 }, { "epoch": 3.67986798679868, "grad_norm": 0.033299729228019714, "learning_rate": 0.2726953550739548, "loss": 0.3237, "num_input_tokens_seen": 6763664, "step": 7805 }, { "epoch": 3.6822253653936823, "grad_norm": 0.010386921465396881, "learning_rate": 0.27266145987047086, "loss": 0.3984, "num_input_tokens_seen": 6767296, "step": 7810 }, { "epoch": 3.6845827439886847, "grad_norm": 0.018354682251811028, "learning_rate": 0.27262754575104836, "loss": 0.2373, "num_input_tokens_seen": 6771728, "step": 7815 }, { "epoch": 3.686940122583687, "grad_norm": 0.12378855794668198, "learning_rate": 0.27259361272091726, "loss": 0.6709, "num_input_tokens_seen": 6775968, "step": 7820 }, { "epoch": 3.6892975011786895, "grad_norm": 0.07291828840970993, "learning_rate": 0.27255966078531046, "loss": 0.2807, "num_input_tokens_seen": 6780384, "step": 7825 }, { "epoch": 3.691654879773692, "grad_norm": 0.008483180776238441, "learning_rate": 0.2725256899494638, "loss": 0.2658, "num_input_tokens_seen": 6784592, "step": 7830 }, { "epoch": 3.694012258368694, "grad_norm": 0.005903953220695257, "learning_rate": 0.272491700218616, "loss": 0.1911, "num_input_tokens_seen": 6788592, "step": 7835 }, { "epoch": 3.6963696369636962, "grad_norm": 0.02551133930683136, "learning_rate": 0.27245769159800876, "loss": 0.2617, "num_input_tokens_seen": 6792384, "step": 7840 }, { "epoch": 3.6987270155586986, "grad_norm": 0.011712886393070221, "learning_rate": 0.2724236640928865, "loss": 0.3135, "num_input_tokens_seen": 6796480, "step": 7845 }, { "epoch": 3.701084394153701, "grad_norm": 0.010079601779580116, "learning_rate": 0.27238961770849673, "loss": 0.2164, "num_input_tokens_seen": 6800400, "step": 7850 }, { "epoch": 3.7034417727487035, "grad_norm": 0.007175520062446594, "learning_rate": 0.27235555245008997, "loss": 0.2937, "num_input_tokens_seen": 6804896, "step": 7855 }, { "epoch": 3.705799151343706, "grad_norm": 0.009018326178193092, "learning_rate": 0.2723214683229193, "loss": 0.2472, "num_input_tokens_seen": 6809216, "step": 7860 }, { "epoch": 3.7081565299387083, "grad_norm": 0.009131485596299171, "learning_rate": 0.27228736533224107, "loss": 0.2829, "num_input_tokens_seen": 6813904, "step": 7865 }, { "epoch": 3.7105139085337107, "grad_norm": 0.007141607813537121, "learning_rate": 0.27225324348331437, "loss": 0.2614, "num_input_tokens_seen": 6818128, "step": 7870 }, { "epoch": 3.7128712871287126, "grad_norm": 0.0032803788781166077, "learning_rate": 0.27221910278140116, "loss": 0.2453, "num_input_tokens_seen": 6823136, "step": 7875 }, { "epoch": 3.715228665723715, "grad_norm": 0.002921412466093898, "learning_rate": 0.2721849432317664, "loss": 0.2982, "num_input_tokens_seen": 6827760, "step": 7880 }, { "epoch": 3.7175860443187174, "grad_norm": 0.0025941256899386644, "learning_rate": 0.2721507648396779, "loss": 0.2711, "num_input_tokens_seen": 6832144, "step": 7885 }, { "epoch": 3.71994342291372, "grad_norm": 0.005638516042381525, "learning_rate": 0.27211656761040653, "loss": 0.2318, "num_input_tokens_seen": 6835760, "step": 7890 }, { "epoch": 3.7223008015087222, "grad_norm": 0.01043060701340437, "learning_rate": 0.2720823515492257, "loss": 0.3034, "num_input_tokens_seen": 6840128, "step": 7895 }, { "epoch": 3.7246581801037246, "grad_norm": 0.0058115446008741856, "learning_rate": 0.27204811666141215, "loss": 0.2203, "num_input_tokens_seen": 6843920, "step": 7900 }, { "epoch": 3.727015558698727, "grad_norm": 0.005477519705891609, "learning_rate": 0.2720138629522452, "loss": 0.2485, "num_input_tokens_seen": 6848304, "step": 7905 }, { "epoch": 3.7293729372937294, "grad_norm": 0.009948208928108215, "learning_rate": 0.2719795904270073, "loss": 0.2686, "num_input_tokens_seen": 6852544, "step": 7910 }, { "epoch": 3.731730315888732, "grad_norm": 0.0054732575081288815, "learning_rate": 0.2719452990909837, "loss": 0.2363, "num_input_tokens_seen": 6857696, "step": 7915 }, { "epoch": 3.7340876944837342, "grad_norm": 0.00380932935513556, "learning_rate": 0.2719109889494625, "loss": 0.353, "num_input_tokens_seen": 6862496, "step": 7920 }, { "epoch": 3.7364450730787366, "grad_norm": 0.005342503078281879, "learning_rate": 0.27187666000773475, "loss": 0.2175, "num_input_tokens_seen": 6867056, "step": 7925 }, { "epoch": 3.738802451673739, "grad_norm": 0.00863492488861084, "learning_rate": 0.2718423122710944, "loss": 0.2141, "num_input_tokens_seen": 6872144, "step": 7930 }, { "epoch": 3.741159830268741, "grad_norm": 0.009092372842133045, "learning_rate": 0.2718079457448384, "loss": 0.293, "num_input_tokens_seen": 6876464, "step": 7935 }, { "epoch": 3.7435172088637434, "grad_norm": 0.004403768107295036, "learning_rate": 0.27177356043426637, "loss": 0.3639, "num_input_tokens_seen": 6882096, "step": 7940 }, { "epoch": 3.745874587458746, "grad_norm": 0.008505245670676231, "learning_rate": 0.27173915634468104, "loss": 0.2598, "num_input_tokens_seen": 6886048, "step": 7945 }, { "epoch": 3.748231966053748, "grad_norm": 0.0038329889066517353, "learning_rate": 0.27170473348138796, "loss": 0.258, "num_input_tokens_seen": 6890976, "step": 7950 }, { "epoch": 3.7505893446487506, "grad_norm": 0.0030589252710342407, "learning_rate": 0.27167029184969554, "loss": 0.2262, "num_input_tokens_seen": 6895440, "step": 7955 }, { "epoch": 3.752946723243753, "grad_norm": 0.002406762447208166, "learning_rate": 0.27163583145491504, "loss": 0.2236, "num_input_tokens_seen": 6899824, "step": 7960 }, { "epoch": 3.7553041018387554, "grad_norm": 0.0025360353756695986, "learning_rate": 0.2716013523023608, "loss": 0.2213, "num_input_tokens_seen": 6903632, "step": 7965 }, { "epoch": 3.757661480433758, "grad_norm": 0.0026745933573693037, "learning_rate": 0.27156685439734995, "loss": 0.2319, "num_input_tokens_seen": 6907648, "step": 7970 }, { "epoch": 3.7600188590287598, "grad_norm": 0.0028864555060863495, "learning_rate": 0.2715323377452024, "loss": 0.2014, "num_input_tokens_seen": 6912080, "step": 7975 }, { "epoch": 3.762376237623762, "grad_norm": 0.0023386471439152956, "learning_rate": 0.2714978023512411, "loss": 0.1671, "num_input_tokens_seen": 6916400, "step": 7980 }, { "epoch": 3.7647336162187646, "grad_norm": 0.0016304695745930076, "learning_rate": 0.2714632482207918, "loss": 0.258, "num_input_tokens_seen": 6920064, "step": 7985 }, { "epoch": 3.767090994813767, "grad_norm": 0.003582676174119115, "learning_rate": 0.2714286753591833, "loss": 0.2236, "num_input_tokens_seen": 6924256, "step": 7990 }, { "epoch": 3.7694483734087694, "grad_norm": 0.00515202758833766, "learning_rate": 0.27139408377174706, "loss": 0.2583, "num_input_tokens_seen": 6928624, "step": 7995 }, { "epoch": 3.7718057520037718, "grad_norm": 0.0033301750663667917, "learning_rate": 0.27135947346381756, "loss": 0.236, "num_input_tokens_seen": 6932384, "step": 8000 }, { "epoch": 3.7718057520037718, "eval_loss": 0.22836388647556305, "eval_runtime": 21.8833, "eval_samples_per_second": 43.092, "eval_steps_per_second": 21.569, "num_input_tokens_seen": 6932384, "step": 8000 }, { "epoch": 3.774163130598774, "grad_norm": 0.004053758457303047, "learning_rate": 0.2713248444407322, "loss": 0.2354, "num_input_tokens_seen": 6937344, "step": 8005 }, { "epoch": 3.7765205091937766, "grad_norm": 0.0029634456150233746, "learning_rate": 0.27129019670783106, "loss": 0.2114, "num_input_tokens_seen": 6942160, "step": 8010 }, { "epoch": 3.778877887788779, "grad_norm": 0.0019901327323168516, "learning_rate": 0.27125553027045746, "loss": 0.2104, "num_input_tokens_seen": 6946544, "step": 8015 }, { "epoch": 3.7812352663837814, "grad_norm": 0.003887371625751257, "learning_rate": 0.2712208451339572, "loss": 0.1887, "num_input_tokens_seen": 6951264, "step": 8020 }, { "epoch": 3.783592644978784, "grad_norm": 0.006146032828837633, "learning_rate": 0.27118614130367935, "loss": 0.2622, "num_input_tokens_seen": 6955280, "step": 8025 }, { "epoch": 3.785950023573786, "grad_norm": 0.002621586201712489, "learning_rate": 0.2711514187849756, "loss": 0.1861, "num_input_tokens_seen": 6960096, "step": 8030 }, { "epoch": 3.7883074021687886, "grad_norm": 0.002455734880641103, "learning_rate": 0.27111667758320057, "loss": 0.1973, "num_input_tokens_seen": 6963936, "step": 8035 }, { "epoch": 3.7906647807637905, "grad_norm": 0.002987583866342902, "learning_rate": 0.27108191770371176, "loss": 0.2013, "num_input_tokens_seen": 6968640, "step": 8040 }, { "epoch": 3.793022159358793, "grad_norm": 0.0023241937160491943, "learning_rate": 0.2710471391518697, "loss": 0.2857, "num_input_tokens_seen": 6972752, "step": 8045 }, { "epoch": 3.7953795379537953, "grad_norm": 0.005478262901306152, "learning_rate": 0.2710123419330375, "loss": 0.1962, "num_input_tokens_seen": 6976576, "step": 8050 }, { "epoch": 3.7977369165487977, "grad_norm": 0.0811113715171814, "learning_rate": 0.2709775260525816, "loss": 0.369, "num_input_tokens_seen": 6981408, "step": 8055 }, { "epoch": 3.8000942951438, "grad_norm": 0.006993884686380625, "learning_rate": 0.27094269151587075, "loss": 0.2418, "num_input_tokens_seen": 6985840, "step": 8060 }, { "epoch": 3.8024516737388026, "grad_norm": 0.0054725706577301025, "learning_rate": 0.27090783832827703, "loss": 0.229, "num_input_tokens_seen": 6989632, "step": 8065 }, { "epoch": 3.804809052333805, "grad_norm": 0.0058980234898626804, "learning_rate": 0.2708729664951753, "loss": 0.1913, "num_input_tokens_seen": 6993312, "step": 8070 }, { "epoch": 3.807166430928807, "grad_norm": 0.01407616026699543, "learning_rate": 0.27083807602194304, "loss": 0.2563, "num_input_tokens_seen": 6997824, "step": 8075 }, { "epoch": 3.8095238095238093, "grad_norm": 0.0160497035831213, "learning_rate": 0.270803166913961, "loss": 0.2232, "num_input_tokens_seen": 7002272, "step": 8080 }, { "epoch": 3.8118811881188117, "grad_norm": 0.01089609507471323, "learning_rate": 0.27076823917661247, "loss": 0.3546, "num_input_tokens_seen": 7006672, "step": 8085 }, { "epoch": 3.814238566713814, "grad_norm": 0.014443318359553814, "learning_rate": 0.2707332928152838, "loss": 0.2792, "num_input_tokens_seen": 7011024, "step": 8090 }, { "epoch": 3.8165959453088165, "grad_norm": 0.016217920929193497, "learning_rate": 0.2706983278353641, "loss": 0.2565, "num_input_tokens_seen": 7015264, "step": 8095 }, { "epoch": 3.818953323903819, "grad_norm": 0.015500267967581749, "learning_rate": 0.27066334424224553, "loss": 0.2381, "num_input_tokens_seen": 7019616, "step": 8100 }, { "epoch": 3.8213107024988213, "grad_norm": 0.010520357638597488, "learning_rate": 0.27062834204132297, "loss": 0.4027, "num_input_tokens_seen": 7023984, "step": 8105 }, { "epoch": 3.8236680810938237, "grad_norm": 0.016029974445700645, "learning_rate": 0.27059332123799407, "loss": 0.2592, "num_input_tokens_seen": 7027648, "step": 8110 }, { "epoch": 3.826025459688826, "grad_norm": 0.008358299732208252, "learning_rate": 0.27055828183765956, "loss": 0.1755, "num_input_tokens_seen": 7031744, "step": 8115 }, { "epoch": 3.8283828382838285, "grad_norm": 0.03197292983531952, "learning_rate": 0.270523223845723, "loss": 0.3782, "num_input_tokens_seen": 7036624, "step": 8120 }, { "epoch": 3.830740216878831, "grad_norm": 0.01838686503469944, "learning_rate": 0.2704881472675907, "loss": 0.2611, "num_input_tokens_seen": 7040960, "step": 8125 }, { "epoch": 3.8330975954738333, "grad_norm": 0.012036159634590149, "learning_rate": 0.270453052108672, "loss": 0.2003, "num_input_tokens_seen": 7045024, "step": 8130 }, { "epoch": 3.8354549740688357, "grad_norm": 0.02660859189927578, "learning_rate": 0.2704179383743789, "loss": 0.2147, "num_input_tokens_seen": 7049360, "step": 8135 }, { "epoch": 3.8378123526638377, "grad_norm": 0.023135241121053696, "learning_rate": 0.27038280607012644, "loss": 0.3311, "num_input_tokens_seen": 7053968, "step": 8140 }, { "epoch": 3.84016973125884, "grad_norm": 0.011876881122589111, "learning_rate": 0.27034765520133247, "loss": 0.2808, "num_input_tokens_seen": 7057520, "step": 8145 }, { "epoch": 3.8425271098538425, "grad_norm": 0.013870195485651493, "learning_rate": 0.2703124857734177, "loss": 0.2482, "num_input_tokens_seen": 7061728, "step": 8150 }, { "epoch": 3.844884488448845, "grad_norm": 0.010514350607991219, "learning_rate": 0.27027729779180565, "loss": 0.2692, "num_input_tokens_seen": 7065680, "step": 8155 }, { "epoch": 3.8472418670438473, "grad_norm": 0.01690591685473919, "learning_rate": 0.27024209126192283, "loss": 0.2613, "num_input_tokens_seen": 7069904, "step": 8160 }, { "epoch": 3.8495992456388497, "grad_norm": 0.009164371527731419, "learning_rate": 0.2702068661891984, "loss": 0.2686, "num_input_tokens_seen": 7074224, "step": 8165 }, { "epoch": 3.851956624233852, "grad_norm": 0.00950782559812069, "learning_rate": 0.2701716225790647, "loss": 0.2371, "num_input_tokens_seen": 7077968, "step": 8170 }, { "epoch": 3.854314002828854, "grad_norm": 0.00806693360209465, "learning_rate": 0.27013636043695655, "loss": 0.2105, "num_input_tokens_seen": 7081552, "step": 8175 }, { "epoch": 3.8566713814238565, "grad_norm": 0.01922430843114853, "learning_rate": 0.27010107976831194, "loss": 0.2657, "num_input_tokens_seen": 7086160, "step": 8180 }, { "epoch": 3.859028760018859, "grad_norm": 0.014402499422430992, "learning_rate": 0.2700657805785715, "loss": 0.229, "num_input_tokens_seen": 7090416, "step": 8185 }, { "epoch": 3.8613861386138613, "grad_norm": 0.016360608860850334, "learning_rate": 0.2700304628731789, "loss": 0.2475, "num_input_tokens_seen": 7094304, "step": 8190 }, { "epoch": 3.8637435172088637, "grad_norm": 0.014377044513821602, "learning_rate": 0.26999512665758046, "loss": 0.2466, "num_input_tokens_seen": 7099264, "step": 8195 }, { "epoch": 3.866100895803866, "grad_norm": 0.029197312891483307, "learning_rate": 0.2699597719372256, "loss": 0.2549, "num_input_tokens_seen": 7103328, "step": 8200 }, { "epoch": 3.866100895803866, "eval_loss": 0.24831777811050415, "eval_runtime": 21.9653, "eval_samples_per_second": 42.931, "eval_steps_per_second": 21.488, "num_input_tokens_seen": 7103328, "step": 8200 }, { "epoch": 3.8684582743988685, "grad_norm": 0.03602661192417145, "learning_rate": 0.26992439871756635, "loss": 0.2549, "num_input_tokens_seen": 7107472, "step": 8205 }, { "epoch": 3.870815652993871, "grad_norm": 0.06549565494060516, "learning_rate": 0.2698890070040578, "loss": 0.2201, "num_input_tokens_seen": 7111776, "step": 8210 }, { "epoch": 3.8731730315888733, "grad_norm": 0.08627652376890182, "learning_rate": 0.2698535968021577, "loss": 0.3053, "num_input_tokens_seen": 7115616, "step": 8215 }, { "epoch": 3.8755304101838757, "grad_norm": 0.059709735214710236, "learning_rate": 0.26981816811732684, "loss": 0.2824, "num_input_tokens_seen": 7119808, "step": 8220 }, { "epoch": 3.877887788778878, "grad_norm": 0.07356496155261993, "learning_rate": 0.26978272095502875, "loss": 0.4549, "num_input_tokens_seen": 7124064, "step": 8225 }, { "epoch": 3.8802451673738805, "grad_norm": 0.06219466030597687, "learning_rate": 0.26974725532072974, "loss": 0.2641, "num_input_tokens_seen": 7128848, "step": 8230 }, { "epoch": 3.882602545968883, "grad_norm": 0.1410224735736847, "learning_rate": 0.26971177121989914, "loss": 0.2735, "num_input_tokens_seen": 7134128, "step": 8235 }, { "epoch": 3.884959924563885, "grad_norm": 0.02292437106370926, "learning_rate": 0.2696762686580091, "loss": 0.2825, "num_input_tokens_seen": 7138912, "step": 8240 }, { "epoch": 3.8873173031588872, "grad_norm": 0.045064736157655716, "learning_rate": 0.26964074764053436, "loss": 0.3228, "num_input_tokens_seen": 7143792, "step": 8245 }, { "epoch": 3.8896746817538896, "grad_norm": 0.09522506594657898, "learning_rate": 0.2696052081729529, "loss": 0.3348, "num_input_tokens_seen": 7148368, "step": 8250 }, { "epoch": 3.892032060348892, "grad_norm": 0.018252654001116753, "learning_rate": 0.2695696502607453, "loss": 0.3503, "num_input_tokens_seen": 7154208, "step": 8255 }, { "epoch": 3.8943894389438944, "grad_norm": 0.013888959772884846, "learning_rate": 0.26953407390939504, "loss": 0.3341, "num_input_tokens_seen": 7158960, "step": 8260 }, { "epoch": 3.896746817538897, "grad_norm": 0.1465916484594345, "learning_rate": 0.26949847912438835, "loss": 0.326, "num_input_tokens_seen": 7164688, "step": 8265 }, { "epoch": 3.8991041961338992, "grad_norm": 0.007146978750824928, "learning_rate": 0.26946286591121454, "loss": 0.1702, "num_input_tokens_seen": 7168944, "step": 8270 }, { "epoch": 3.901461574728901, "grad_norm": 0.007197447121143341, "learning_rate": 0.2694272342753655, "loss": 0.5296, "num_input_tokens_seen": 7173648, "step": 8275 }, { "epoch": 3.9038189533239036, "grad_norm": 0.013607836328446865, "learning_rate": 0.26939158422233617, "loss": 0.2428, "num_input_tokens_seen": 7177888, "step": 8280 }, { "epoch": 3.906176331918906, "grad_norm": 0.012997593730688095, "learning_rate": 0.26935591575762413, "loss": 0.2243, "num_input_tokens_seen": 7181984, "step": 8285 }, { "epoch": 3.9085337105139084, "grad_norm": 0.02256922610104084, "learning_rate": 0.26932022888672996, "loss": 0.2657, "num_input_tokens_seen": 7186528, "step": 8290 }, { "epoch": 3.910891089108911, "grad_norm": 0.01380437146872282, "learning_rate": 0.26928452361515703, "loss": 0.4025, "num_input_tokens_seen": 7191296, "step": 8295 }, { "epoch": 3.913248467703913, "grad_norm": 0.016456304118037224, "learning_rate": 0.26924879994841155, "loss": 0.2605, "num_input_tokens_seen": 7194944, "step": 8300 }, { "epoch": 3.9156058462989156, "grad_norm": 0.011400873772799969, "learning_rate": 0.2692130578920025, "loss": 0.3116, "num_input_tokens_seen": 7198784, "step": 8305 }, { "epoch": 3.917963224893918, "grad_norm": 0.00675022229552269, "learning_rate": 0.26917729745144187, "loss": 0.2345, "num_input_tokens_seen": 7202752, "step": 8310 }, { "epoch": 3.9203206034889204, "grad_norm": 0.00681073684245348, "learning_rate": 0.2691415186322443, "loss": 0.2231, "num_input_tokens_seen": 7207680, "step": 8315 }, { "epoch": 3.922677982083923, "grad_norm": 0.007269169203937054, "learning_rate": 0.2691057214399273, "loss": 0.2017, "num_input_tokens_seen": 7211280, "step": 8320 }, { "epoch": 3.9250353606789252, "grad_norm": 0.01001354493200779, "learning_rate": 0.2690699058800113, "loss": 0.2628, "num_input_tokens_seen": 7215696, "step": 8325 }, { "epoch": 3.9273927392739276, "grad_norm": 0.017319487407803535, "learning_rate": 0.2690340719580194, "loss": 0.3087, "num_input_tokens_seen": 7219104, "step": 8330 }, { "epoch": 3.92975011786893, "grad_norm": 0.0163667444139719, "learning_rate": 0.2689982196794778, "loss": 0.2799, "num_input_tokens_seen": 7223504, "step": 8335 }, { "epoch": 3.932107496463932, "grad_norm": 0.008726874366402626, "learning_rate": 0.2689623490499153, "loss": 0.1567, "num_input_tokens_seen": 7227424, "step": 8340 }, { "epoch": 3.9344648750589344, "grad_norm": 0.014816123992204666, "learning_rate": 0.2689264600748636, "loss": 0.2103, "num_input_tokens_seen": 7231632, "step": 8345 }, { "epoch": 3.936822253653937, "grad_norm": 0.006456134375184774, "learning_rate": 0.26889055275985724, "loss": 0.1497, "num_input_tokens_seen": 7235712, "step": 8350 }, { "epoch": 3.939179632248939, "grad_norm": 0.00960117019712925, "learning_rate": 0.2688546271104335, "loss": 0.2162, "num_input_tokens_seen": 7240384, "step": 8355 }, { "epoch": 3.9415370108439416, "grad_norm": 0.012548311613500118, "learning_rate": 0.26881868313213275, "loss": 0.2795, "num_input_tokens_seen": 7244320, "step": 8360 }, { "epoch": 3.943894389438944, "grad_norm": 0.00907771848142147, "learning_rate": 0.2687827208304978, "loss": 0.258, "num_input_tokens_seen": 7248528, "step": 8365 }, { "epoch": 3.9462517680339464, "grad_norm": 0.008674316108226776, "learning_rate": 0.26874674021107464, "loss": 0.296, "num_input_tokens_seen": 7252352, "step": 8370 }, { "epoch": 3.9486091466289484, "grad_norm": 0.007676490116864443, "learning_rate": 0.2687107412794118, "loss": 0.2533, "num_input_tokens_seen": 7257152, "step": 8375 }, { "epoch": 3.9509665252239508, "grad_norm": 0.008198868483304977, "learning_rate": 0.26867472404106096, "loss": 0.2688, "num_input_tokens_seen": 7261136, "step": 8380 }, { "epoch": 3.953323903818953, "grad_norm": 0.01284998469054699, "learning_rate": 0.26863868850157624, "loss": 0.2246, "num_input_tokens_seen": 7265040, "step": 8385 }, { "epoch": 3.9556812824139556, "grad_norm": 0.01035616360604763, "learning_rate": 0.26860263466651485, "loss": 0.2298, "num_input_tokens_seen": 7269104, "step": 8390 }, { "epoch": 3.958038661008958, "grad_norm": 0.015500301495194435, "learning_rate": 0.26856656254143674, "loss": 0.2724, "num_input_tokens_seen": 7272512, "step": 8395 }, { "epoch": 3.9603960396039604, "grad_norm": 0.014138370752334595, "learning_rate": 0.2685304721319047, "loss": 0.264, "num_input_tokens_seen": 7276304, "step": 8400 }, { "epoch": 3.9603960396039604, "eval_loss": 0.29675740003585815, "eval_runtime": 21.8918, "eval_samples_per_second": 43.076, "eval_steps_per_second": 21.561, "num_input_tokens_seen": 7276304, "step": 8400 }, { "epoch": 3.9627534181989628, "grad_norm": 0.014727059751749039, "learning_rate": 0.2684943634434843, "loss": 0.3367, "num_input_tokens_seen": 7281440, "step": 8405 }, { "epoch": 3.965110796793965, "grad_norm": 0.007395980414003134, "learning_rate": 0.268458236481744, "loss": 0.2271, "num_input_tokens_seen": 7286192, "step": 8410 }, { "epoch": 3.9674681753889676, "grad_norm": 0.007851596921682358, "learning_rate": 0.2684220912522549, "loss": 0.2327, "num_input_tokens_seen": 7290224, "step": 8415 }, { "epoch": 3.96982555398397, "grad_norm": 0.005390584003180265, "learning_rate": 0.2683859277605913, "loss": 0.2506, "num_input_tokens_seen": 7294352, "step": 8420 }, { "epoch": 3.9721829325789724, "grad_norm": 0.006175574380904436, "learning_rate": 0.2683497460123298, "loss": 0.1862, "num_input_tokens_seen": 7299472, "step": 8425 }, { "epoch": 3.9745403111739748, "grad_norm": 0.010258338414132595, "learning_rate": 0.26831354601305013, "loss": 0.2085, "num_input_tokens_seen": 7303280, "step": 8430 }, { "epoch": 3.976897689768977, "grad_norm": 0.0073865302838385105, "learning_rate": 0.26827732776833496, "loss": 0.1748, "num_input_tokens_seen": 7307504, "step": 8435 }, { "epoch": 3.979255068363979, "grad_norm": 0.013866053894162178, "learning_rate": 0.26824109128376944, "loss": 0.2461, "num_input_tokens_seen": 7312192, "step": 8440 }, { "epoch": 3.9816124469589815, "grad_norm": 0.012387150898575783, "learning_rate": 0.2682048365649417, "loss": 0.2446, "num_input_tokens_seen": 7316352, "step": 8445 }, { "epoch": 3.983969825553984, "grad_norm": 0.01737266406416893, "learning_rate": 0.2681685636174428, "loss": 0.2515, "num_input_tokens_seen": 7321056, "step": 8450 }, { "epoch": 3.9863272041489863, "grad_norm": 0.016208503395318985, "learning_rate": 0.2681322724468663, "loss": 0.2272, "num_input_tokens_seen": 7324880, "step": 8455 }, { "epoch": 3.9886845827439887, "grad_norm": 0.013225015252828598, "learning_rate": 0.2680959630588089, "loss": 0.2416, "num_input_tokens_seen": 7328976, "step": 8460 }, { "epoch": 3.991041961338991, "grad_norm": 0.006945619825273752, "learning_rate": 0.26805963545886985, "loss": 0.2131, "num_input_tokens_seen": 7334784, "step": 8465 }, { "epoch": 3.9933993399339935, "grad_norm": 0.01150052435696125, "learning_rate": 0.26802328965265143, "loss": 0.3147, "num_input_tokens_seen": 7338528, "step": 8470 }, { "epoch": 3.9957567185289955, "grad_norm": 0.009082772769033909, "learning_rate": 0.26798692564575854, "loss": 0.2217, "num_input_tokens_seen": 7342560, "step": 8475 }, { "epoch": 3.998114097123998, "grad_norm": 0.00817931815981865, "learning_rate": 0.26795054344379904, "loss": 0.2414, "num_input_tokens_seen": 7346944, "step": 8480 }, { "epoch": 4.000471475719, "grad_norm": 0.005739480257034302, "learning_rate": 0.2679141430523835, "loss": 0.2225, "num_input_tokens_seen": 7350464, "step": 8485 }, { "epoch": 4.002828854314003, "grad_norm": 0.0051119523122906685, "learning_rate": 0.2678777244771252, "loss": 0.1819, "num_input_tokens_seen": 7354608, "step": 8490 }, { "epoch": 4.005186232909005, "grad_norm": 0.008346782065927982, "learning_rate": 0.2678412877236405, "loss": 0.2394, "num_input_tokens_seen": 7358480, "step": 8495 }, { "epoch": 4.0075436115040075, "grad_norm": 0.00833813101053238, "learning_rate": 0.2678048327975484, "loss": 0.2242, "num_input_tokens_seen": 7362608, "step": 8500 }, { "epoch": 4.00990099009901, "grad_norm": 0.005893635097891092, "learning_rate": 0.2677683597044706, "loss": 0.1832, "num_input_tokens_seen": 7367008, "step": 8505 }, { "epoch": 4.012258368694012, "grad_norm": 0.012804664671421051, "learning_rate": 0.2677318684500318, "loss": 0.2407, "num_input_tokens_seen": 7371376, "step": 8510 }, { "epoch": 4.014615747289015, "grad_norm": 0.008774391375482082, "learning_rate": 0.2676953590398593, "loss": 0.3487, "num_input_tokens_seen": 7375424, "step": 8515 }, { "epoch": 4.016973125884017, "grad_norm": 0.007615596055984497, "learning_rate": 0.2676588314795834, "loss": 0.2801, "num_input_tokens_seen": 7379472, "step": 8520 }, { "epoch": 4.0193305044790195, "grad_norm": 0.006788135506212711, "learning_rate": 0.26762228577483715, "loss": 0.2678, "num_input_tokens_seen": 7383408, "step": 8525 }, { "epoch": 4.021687883074022, "grad_norm": 0.0068372697569429874, "learning_rate": 0.2675857219312563, "loss": 0.1936, "num_input_tokens_seen": 7387280, "step": 8530 }, { "epoch": 4.024045261669024, "grad_norm": 0.008132080547511578, "learning_rate": 0.2675491399544794, "loss": 0.3034, "num_input_tokens_seen": 7391808, "step": 8535 }, { "epoch": 4.026402640264027, "grad_norm": 0.006972379982471466, "learning_rate": 0.2675125398501479, "loss": 0.2488, "num_input_tokens_seen": 7395728, "step": 8540 }, { "epoch": 4.028760018859029, "grad_norm": 0.0056379069574177265, "learning_rate": 0.26747592162390604, "loss": 0.1933, "num_input_tokens_seen": 7399760, "step": 8545 }, { "epoch": 4.0311173974540315, "grad_norm": 0.006394523195922375, "learning_rate": 0.26743928528140076, "loss": 0.2501, "num_input_tokens_seen": 7404896, "step": 8550 }, { "epoch": 4.033474776049033, "grad_norm": 0.004622991196811199, "learning_rate": 0.26740263082828186, "loss": 0.2193, "num_input_tokens_seen": 7408928, "step": 8555 }, { "epoch": 4.035832154644035, "grad_norm": 0.00964755192399025, "learning_rate": 0.2673659582702019, "loss": 0.2628, "num_input_tokens_seen": 7413392, "step": 8560 }, { "epoch": 4.038189533239038, "grad_norm": 0.007488161791115999, "learning_rate": 0.2673292676128163, "loss": 0.284, "num_input_tokens_seen": 7417424, "step": 8565 }, { "epoch": 4.04054691183404, "grad_norm": 0.007395235355943441, "learning_rate": 0.2672925588617831, "loss": 0.2138, "num_input_tokens_seen": 7422128, "step": 8570 }, { "epoch": 4.042904290429043, "grad_norm": 0.003505491651594639, "learning_rate": 0.2672558320227634, "loss": 0.2952, "num_input_tokens_seen": 7426512, "step": 8575 }, { "epoch": 4.045261669024045, "grad_norm": 0.005421205423772335, "learning_rate": 0.2672190871014209, "loss": 0.2716, "num_input_tokens_seen": 7430496, "step": 8580 }, { "epoch": 4.0476190476190474, "grad_norm": 0.005314737558364868, "learning_rate": 0.267182324103422, "loss": 0.2261, "num_input_tokens_seen": 7435552, "step": 8585 }, { "epoch": 4.04997642621405, "grad_norm": 0.006585229653865099, "learning_rate": 0.2671455430344362, "loss": 0.2543, "num_input_tokens_seen": 7439360, "step": 8590 }, { "epoch": 4.052333804809052, "grad_norm": 0.0045740315690636635, "learning_rate": 0.2671087439001355, "loss": 0.2301, "num_input_tokens_seen": 7443728, "step": 8595 }, { "epoch": 4.054691183404055, "grad_norm": 0.0055543044582009315, "learning_rate": 0.2670719267061948, "loss": 0.238, "num_input_tokens_seen": 7448112, "step": 8600 }, { "epoch": 4.054691183404055, "eval_loss": 0.22166672348976135, "eval_runtime": 21.9215, "eval_samples_per_second": 43.017, "eval_steps_per_second": 21.531, "num_input_tokens_seen": 7448112, "step": 8600 }, { "epoch": 4.057048561999057, "grad_norm": 0.003249850356951356, "learning_rate": 0.2670350914582918, "loss": 0.1931, "num_input_tokens_seen": 7452768, "step": 8605 }, { "epoch": 4.0594059405940595, "grad_norm": 0.010159089230000973, "learning_rate": 0.26699823816210694, "loss": 0.222, "num_input_tokens_seen": 7456800, "step": 8610 }, { "epoch": 4.061763319189062, "grad_norm": 0.004173879977315664, "learning_rate": 0.26696136682332344, "loss": 0.2533, "num_input_tokens_seen": 7460960, "step": 8615 }, { "epoch": 4.064120697784064, "grad_norm": 0.0071021756157279015, "learning_rate": 0.2669244774476274, "loss": 0.2223, "num_input_tokens_seen": 7465536, "step": 8620 }, { "epoch": 4.066478076379067, "grad_norm": 0.005173949524760246, "learning_rate": 0.2668875700407075, "loss": 0.2279, "num_input_tokens_seen": 7470400, "step": 8625 }, { "epoch": 4.068835454974069, "grad_norm": 0.004146794322878122, "learning_rate": 0.26685064460825547, "loss": 0.2217, "num_input_tokens_seen": 7474432, "step": 8630 }, { "epoch": 4.0711928335690715, "grad_norm": 0.006195591762661934, "learning_rate": 0.26681370115596553, "loss": 0.2398, "num_input_tokens_seen": 7478576, "step": 8635 }, { "epoch": 4.073550212164074, "grad_norm": 0.004962254781275988, "learning_rate": 0.26677673968953497, "loss": 0.2017, "num_input_tokens_seen": 7483120, "step": 8640 }, { "epoch": 4.075907590759076, "grad_norm": 0.010424282401800156, "learning_rate": 0.2667397602146636, "loss": 0.1808, "num_input_tokens_seen": 7487120, "step": 8645 }, { "epoch": 4.078264969354079, "grad_norm": 0.010652782395482063, "learning_rate": 0.2667027627370542, "loss": 0.2842, "num_input_tokens_seen": 7491776, "step": 8650 }, { "epoch": 4.08062234794908, "grad_norm": 0.005829522851854563, "learning_rate": 0.26666574726241216, "loss": 0.2031, "num_input_tokens_seen": 7496320, "step": 8655 }, { "epoch": 4.082979726544083, "grad_norm": 0.00436852453276515, "learning_rate": 0.2666287137964458, "loss": 0.2026, "num_input_tokens_seen": 7500144, "step": 8660 }, { "epoch": 4.085337105139085, "grad_norm": 0.0058376905508339405, "learning_rate": 0.26659166234486614, "loss": 0.2418, "num_input_tokens_seen": 7504432, "step": 8665 }, { "epoch": 4.087694483734087, "grad_norm": 0.002846485236659646, "learning_rate": 0.2665545929133869, "loss": 0.2202, "num_input_tokens_seen": 7508864, "step": 8670 }, { "epoch": 4.09005186232909, "grad_norm": 0.004907501861453056, "learning_rate": 0.2665175055077248, "loss": 0.2182, "num_input_tokens_seen": 7512544, "step": 8675 }, { "epoch": 4.092409240924092, "grad_norm": 0.0071626207791268826, "learning_rate": 0.2664804001335991, "loss": 0.2472, "num_input_tokens_seen": 7517328, "step": 8680 }, { "epoch": 4.094766619519095, "grad_norm": 0.004120876081287861, "learning_rate": 0.26644327679673185, "loss": 0.2148, "num_input_tokens_seen": 7521264, "step": 8685 }, { "epoch": 4.097123998114097, "grad_norm": 0.004130437038838863, "learning_rate": 0.26640613550284803, "loss": 0.2607, "num_input_tokens_seen": 7525824, "step": 8690 }, { "epoch": 4.099481376709099, "grad_norm": 0.0052009327337145805, "learning_rate": 0.26636897625767525, "loss": 0.2361, "num_input_tokens_seen": 7530672, "step": 8695 }, { "epoch": 4.101838755304102, "grad_norm": 0.005387585144490004, "learning_rate": 0.266331799066944, "loss": 0.2445, "num_input_tokens_seen": 7536336, "step": 8700 }, { "epoch": 4.104196133899104, "grad_norm": 0.007412449922412634, "learning_rate": 0.2662946039363874, "loss": 0.2239, "num_input_tokens_seen": 7540256, "step": 8705 }, { "epoch": 4.106553512494107, "grad_norm": 0.00426905881613493, "learning_rate": 0.2662573908717414, "loss": 0.2304, "num_input_tokens_seen": 7544160, "step": 8710 }, { "epoch": 4.108910891089109, "grad_norm": 0.0033299934584647417, "learning_rate": 0.2662201598787447, "loss": 0.2283, "num_input_tokens_seen": 7547904, "step": 8715 }, { "epoch": 4.111268269684111, "grad_norm": 0.007426364812999964, "learning_rate": 0.2661829109631389, "loss": 0.2298, "num_input_tokens_seen": 7551840, "step": 8720 }, { "epoch": 4.113625648279114, "grad_norm": 0.005416298285126686, "learning_rate": 0.26614564413066816, "loss": 0.2085, "num_input_tokens_seen": 7556896, "step": 8725 }, { "epoch": 4.115983026874116, "grad_norm": 0.005559032782912254, "learning_rate": 0.2661083593870795, "loss": 0.2377, "num_input_tokens_seen": 7561712, "step": 8730 }, { "epoch": 4.118340405469119, "grad_norm": 0.005237642675638199, "learning_rate": 0.26607105673812276, "loss": 0.2378, "num_input_tokens_seen": 7566528, "step": 8735 }, { "epoch": 4.120697784064121, "grad_norm": 0.0057446761056780815, "learning_rate": 0.2660337361895504, "loss": 0.2234, "num_input_tokens_seen": 7571664, "step": 8740 }, { "epoch": 4.123055162659123, "grad_norm": 0.0062489863485097885, "learning_rate": 0.26599639774711775, "loss": 0.2449, "num_input_tokens_seen": 7576048, "step": 8745 }, { "epoch": 4.125412541254126, "grad_norm": 0.004194039385765791, "learning_rate": 0.2659590414165829, "loss": 0.2086, "num_input_tokens_seen": 7581424, "step": 8750 }, { "epoch": 4.127769919849127, "grad_norm": 0.00473930174484849, "learning_rate": 0.2659216672037066, "loss": 0.2, "num_input_tokens_seen": 7585232, "step": 8755 }, { "epoch": 4.13012729844413, "grad_norm": 0.0042151473462581635, "learning_rate": 0.26588427511425244, "loss": 0.2943, "num_input_tokens_seen": 7588976, "step": 8760 }, { "epoch": 4.132484677039132, "grad_norm": 0.004632531199604273, "learning_rate": 0.26584686515398676, "loss": 0.2196, "num_input_tokens_seen": 7593120, "step": 8765 }, { "epoch": 4.1348420556341345, "grad_norm": 0.0064179552718997, "learning_rate": 0.2658094373286787, "loss": 0.2504, "num_input_tokens_seen": 7597488, "step": 8770 }, { "epoch": 4.137199434229137, "grad_norm": 0.003998386207967997, "learning_rate": 0.2657719916441, "loss": 0.2224, "num_input_tokens_seen": 7602464, "step": 8775 }, { "epoch": 4.139556812824139, "grad_norm": 0.00582402478903532, "learning_rate": 0.2657345281060253, "loss": 0.2189, "num_input_tokens_seen": 7606032, "step": 8780 }, { "epoch": 4.141914191419142, "grad_norm": 0.006836670450866222, "learning_rate": 0.26569704672023203, "loss": 0.3364, "num_input_tokens_seen": 7610416, "step": 8785 }, { "epoch": 4.144271570014144, "grad_norm": 0.009202091954648495, "learning_rate": 0.26565954749250015, "loss": 0.1929, "num_input_tokens_seen": 7614896, "step": 8790 }, { "epoch": 4.1466289486091465, "grad_norm": 0.013122500851750374, "learning_rate": 0.2656220304286126, "loss": 0.3037, "num_input_tokens_seen": 7619440, "step": 8795 }, { "epoch": 4.148986327204149, "grad_norm": 0.006890695542097092, "learning_rate": 0.265584495534355, "loss": 0.2153, "num_input_tokens_seen": 7623632, "step": 8800 }, { "epoch": 4.148986327204149, "eval_loss": 0.23249080777168274, "eval_runtime": 21.8923, "eval_samples_per_second": 43.075, "eval_steps_per_second": 21.56, "num_input_tokens_seen": 7623632, "step": 8800 }, { "epoch": 4.151343705799151, "grad_norm": 0.005266821477562189, "learning_rate": 0.2655469428155156, "loss": 0.2366, "num_input_tokens_seen": 7627584, "step": 8805 }, { "epoch": 4.153701084394154, "grad_norm": 0.004813721869140863, "learning_rate": 0.2655093722778856, "loss": 0.2195, "num_input_tokens_seen": 7631888, "step": 8810 }, { "epoch": 4.156058462989156, "grad_norm": 0.003676759544759989, "learning_rate": 0.2654717839272588, "loss": 0.2348, "num_input_tokens_seen": 7636096, "step": 8815 }, { "epoch": 4.158415841584159, "grad_norm": 0.006451107095927, "learning_rate": 0.2654341777694318, "loss": 0.2527, "num_input_tokens_seen": 7639424, "step": 8820 }, { "epoch": 4.160773220179161, "grad_norm": 0.0058168284595012665, "learning_rate": 0.265396553810204, "loss": 0.2548, "num_input_tokens_seen": 7643392, "step": 8825 }, { "epoch": 4.163130598774163, "grad_norm": 0.023541085422039032, "learning_rate": 0.26535891205537737, "loss": 0.2505, "num_input_tokens_seen": 7648496, "step": 8830 }, { "epoch": 4.165487977369166, "grad_norm": 0.005607908125966787, "learning_rate": 0.26532125251075683, "loss": 0.2364, "num_input_tokens_seen": 7652768, "step": 8835 }, { "epoch": 4.167845355964168, "grad_norm": 0.007601615507155657, "learning_rate": 0.26528357518214996, "loss": 0.2246, "num_input_tokens_seen": 7657312, "step": 8840 }, { "epoch": 4.170202734559171, "grad_norm": 0.0064124311320483685, "learning_rate": 0.26524588007536704, "loss": 0.2606, "num_input_tokens_seen": 7661824, "step": 8845 }, { "epoch": 4.172560113154173, "grad_norm": 0.0065261246636509895, "learning_rate": 0.26520816719622115, "loss": 0.2169, "num_input_tokens_seen": 7665680, "step": 8850 }, { "epoch": 4.174917491749175, "grad_norm": 0.007443987298756838, "learning_rate": 0.2651704365505281, "loss": 0.2304, "num_input_tokens_seen": 7670240, "step": 8855 }, { "epoch": 4.177274870344177, "grad_norm": 0.007404308300465345, "learning_rate": 0.26513268814410634, "loss": 0.2396, "num_input_tokens_seen": 7674256, "step": 8860 }, { "epoch": 4.179632248939179, "grad_norm": 0.004552786238491535, "learning_rate": 0.2650949219827773, "loss": 0.2485, "num_input_tokens_seen": 7678768, "step": 8865 }, { "epoch": 4.181989627534182, "grad_norm": 0.006495942827314138, "learning_rate": 0.26505713807236486, "loss": 0.2535, "num_input_tokens_seen": 7682800, "step": 8870 }, { "epoch": 4.184347006129184, "grad_norm": 0.043057870119810104, "learning_rate": 0.26501933641869585, "loss": 0.2321, "num_input_tokens_seen": 7687296, "step": 8875 }, { "epoch": 4.1867043847241865, "grad_norm": 0.0072715566493570805, "learning_rate": 0.26498151702759976, "loss": 0.2309, "num_input_tokens_seen": 7691568, "step": 8880 }, { "epoch": 4.189061763319189, "grad_norm": 0.0038313947152346373, "learning_rate": 0.2649436799049088, "loss": 0.2061, "num_input_tokens_seen": 7695616, "step": 8885 }, { "epoch": 4.191419141914191, "grad_norm": 0.006389549467712641, "learning_rate": 0.2649058250564579, "loss": 0.2271, "num_input_tokens_seen": 7700144, "step": 8890 }, { "epoch": 4.193776520509194, "grad_norm": 0.013330805115401745, "learning_rate": 0.26486795248808476, "loss": 0.2588, "num_input_tokens_seen": 7705216, "step": 8895 }, { "epoch": 4.196133899104196, "grad_norm": 0.008232085965573788, "learning_rate": 0.2648300622056298, "loss": 0.2501, "num_input_tokens_seen": 7709936, "step": 8900 }, { "epoch": 4.1984912776991985, "grad_norm": 0.008924928493797779, "learning_rate": 0.2647921542149363, "loss": 0.2461, "num_input_tokens_seen": 7714416, "step": 8905 }, { "epoch": 4.200848656294201, "grad_norm": 0.006211942061781883, "learning_rate": 0.26475422852185, "loss": 0.2365, "num_input_tokens_seen": 7718208, "step": 8910 }, { "epoch": 4.203206034889203, "grad_norm": 0.005423965398222208, "learning_rate": 0.2647162851322196, "loss": 0.2159, "num_input_tokens_seen": 7723008, "step": 8915 }, { "epoch": 4.205563413484206, "grad_norm": 0.0064941635355353355, "learning_rate": 0.2646783240518964, "loss": 0.2626, "num_input_tokens_seen": 7726864, "step": 8920 }, { "epoch": 4.207920792079208, "grad_norm": 0.005787142086774111, "learning_rate": 0.26464034528673447, "loss": 0.2282, "num_input_tokens_seen": 7731872, "step": 8925 }, { "epoch": 4.2102781706742105, "grad_norm": 0.007065621670335531, "learning_rate": 0.26460234884259065, "loss": 0.242, "num_input_tokens_seen": 7736864, "step": 8930 }, { "epoch": 4.212635549269213, "grad_norm": 0.005654407199472189, "learning_rate": 0.2645643347253245, "loss": 0.1892, "num_input_tokens_seen": 7740992, "step": 8935 }, { "epoch": 4.214992927864215, "grad_norm": 0.003399886889383197, "learning_rate": 0.2645263029407982, "loss": 0.1955, "num_input_tokens_seen": 7745168, "step": 8940 }, { "epoch": 4.217350306459218, "grad_norm": 0.00637801131233573, "learning_rate": 0.2644882534948767, "loss": 0.2852, "num_input_tokens_seen": 7749008, "step": 8945 }, { "epoch": 4.21970768505422, "grad_norm": 0.007995901629328728, "learning_rate": 0.2644501863934278, "loss": 0.2239, "num_input_tokens_seen": 7754176, "step": 8950 }, { "epoch": 4.222065063649222, "grad_norm": 0.006450950633734465, "learning_rate": 0.26441210164232193, "loss": 0.2259, "num_input_tokens_seen": 7759216, "step": 8955 }, { "epoch": 4.224422442244224, "grad_norm": 0.011865093372762203, "learning_rate": 0.26437399924743216, "loss": 0.2312, "num_input_tokens_seen": 7763376, "step": 8960 }, { "epoch": 4.226779820839226, "grad_norm": 0.004237552173435688, "learning_rate": 0.26433587921463436, "loss": 0.2051, "num_input_tokens_seen": 7767584, "step": 8965 }, { "epoch": 4.229137199434229, "grad_norm": 0.006064979825168848, "learning_rate": 0.2642977415498072, "loss": 0.2225, "num_input_tokens_seen": 7772192, "step": 8970 }, { "epoch": 4.231494578029231, "grad_norm": 0.003420487279072404, "learning_rate": 0.26425958625883195, "loss": 0.2387, "num_input_tokens_seen": 7776000, "step": 8975 }, { "epoch": 4.233851956624234, "grad_norm": 0.006010773591697216, "learning_rate": 0.2642214133475926, "loss": 0.2304, "num_input_tokens_seen": 7780272, "step": 8980 }, { "epoch": 4.236209335219236, "grad_norm": 0.006011554505676031, "learning_rate": 0.26418322282197587, "loss": 0.2068, "num_input_tokens_seen": 7784992, "step": 8985 }, { "epoch": 4.238566713814238, "grad_norm": 0.006112040486186743, "learning_rate": 0.2641450146878714, "loss": 0.2522, "num_input_tokens_seen": 7789888, "step": 8990 }, { "epoch": 4.240924092409241, "grad_norm": 0.005789645481854677, "learning_rate": 0.26410678895117107, "loss": 0.2106, "num_input_tokens_seen": 7794464, "step": 8995 }, { "epoch": 4.243281471004243, "grad_norm": 0.003842594102025032, "learning_rate": 0.26406854561777, "loss": 0.1812, "num_input_tokens_seen": 7799248, "step": 9000 }, { "epoch": 4.243281471004243, "eval_loss": 0.2273428738117218, "eval_runtime": 21.9331, "eval_samples_per_second": 42.994, "eval_steps_per_second": 21.52, "num_input_tokens_seen": 7799248, "step": 9000 }, { "epoch": 4.245638849599246, "grad_norm": 0.003607124090194702, "learning_rate": 0.26403028469356576, "loss": 0.2979, "num_input_tokens_seen": 7803184, "step": 9005 }, { "epoch": 4.247996228194248, "grad_norm": 0.007619247771799564, "learning_rate": 0.2639920061844585, "loss": 0.2008, "num_input_tokens_seen": 7808064, "step": 9010 }, { "epoch": 4.2503536067892504, "grad_norm": 0.00533232931047678, "learning_rate": 0.2639537100963515, "loss": 0.2172, "num_input_tokens_seen": 7812352, "step": 9015 }, { "epoch": 4.252710985384253, "grad_norm": 0.0036311037838459015, "learning_rate": 0.26391539643515033, "loss": 0.239, "num_input_tokens_seen": 7816400, "step": 9020 }, { "epoch": 4.255068363979255, "grad_norm": 0.005975732579827309, "learning_rate": 0.26387706520676346, "loss": 0.2182, "num_input_tokens_seen": 7819936, "step": 9025 }, { "epoch": 4.257425742574258, "grad_norm": 0.004812866915017366, "learning_rate": 0.26383871641710205, "loss": 0.25, "num_input_tokens_seen": 7823984, "step": 9030 }, { "epoch": 4.25978312116926, "grad_norm": 0.004944560118019581, "learning_rate": 0.26380035007208, "loss": 0.233, "num_input_tokens_seen": 7828624, "step": 9035 }, { "epoch": 4.2621404997642625, "grad_norm": 0.014386586844921112, "learning_rate": 0.26376196617761394, "loss": 0.2312, "num_input_tokens_seen": 7833136, "step": 9040 }, { "epoch": 4.264497878359265, "grad_norm": 0.005070526152849197, "learning_rate": 0.263723564739623, "loss": 0.2349, "num_input_tokens_seen": 7837360, "step": 9045 }, { "epoch": 4.266855256954267, "grad_norm": 0.0032284704502671957, "learning_rate": 0.2636851457640293, "loss": 0.2279, "num_input_tokens_seen": 7841408, "step": 9050 }, { "epoch": 4.26921263554927, "grad_norm": 0.0033750662114471197, "learning_rate": 0.26364670925675737, "loss": 0.1933, "num_input_tokens_seen": 7845056, "step": 9055 }, { "epoch": 4.271570014144271, "grad_norm": 0.0025347196497023106, "learning_rate": 0.2636082552237347, "loss": 0.2461, "num_input_tokens_seen": 7850144, "step": 9060 }, { "epoch": 4.273927392739274, "grad_norm": 0.0034134548623114824, "learning_rate": 0.26356978367089146, "loss": 0.2183, "num_input_tokens_seen": 7855024, "step": 9065 }, { "epoch": 4.276284771334276, "grad_norm": 0.009256832301616669, "learning_rate": 0.26353129460416036, "loss": 0.2574, "num_input_tokens_seen": 7859680, "step": 9070 }, { "epoch": 4.278642149929278, "grad_norm": 0.006364412605762482, "learning_rate": 0.2634927880294769, "loss": 0.2227, "num_input_tokens_seen": 7863776, "step": 9075 }, { "epoch": 4.280999528524281, "grad_norm": 0.006698859389871359, "learning_rate": 0.26345426395277927, "loss": 0.2382, "num_input_tokens_seen": 7868192, "step": 9080 }, { "epoch": 4.283356907119283, "grad_norm": 0.0031029193196445704, "learning_rate": 0.2634157223800084, "loss": 0.2241, "num_input_tokens_seen": 7872720, "step": 9085 }, { "epoch": 4.285714285714286, "grad_norm": 0.0047925915569067, "learning_rate": 0.26337716331710787, "loss": 0.2237, "num_input_tokens_seen": 7876880, "step": 9090 }, { "epoch": 4.288071664309288, "grad_norm": 0.0026141153648495674, "learning_rate": 0.2633385867700239, "loss": 0.2333, "num_input_tokens_seen": 7881184, "step": 9095 }, { "epoch": 4.29042904290429, "grad_norm": 0.005250149872153997, "learning_rate": 0.2632999927447056, "loss": 0.2601, "num_input_tokens_seen": 7886144, "step": 9100 }, { "epoch": 4.292786421499293, "grad_norm": 0.004535315558314323, "learning_rate": 0.2632613812471046, "loss": 0.2493, "num_input_tokens_seen": 7890272, "step": 9105 }, { "epoch": 4.295143800094295, "grad_norm": 0.0031777212861925364, "learning_rate": 0.2632227522831753, "loss": 0.207, "num_input_tokens_seen": 7894112, "step": 9110 }, { "epoch": 4.297501178689298, "grad_norm": 0.003022312419489026, "learning_rate": 0.26318410585887475, "loss": 0.2334, "num_input_tokens_seen": 7897920, "step": 9115 }, { "epoch": 4.2998585572843, "grad_norm": 0.004160049371421337, "learning_rate": 0.2631454419801627, "loss": 0.2082, "num_input_tokens_seen": 7901888, "step": 9120 }, { "epoch": 4.302215935879302, "grad_norm": 0.004063785076141357, "learning_rate": 0.2631067606530016, "loss": 0.3817, "num_input_tokens_seen": 7906512, "step": 9125 }, { "epoch": 4.304573314474305, "grad_norm": 0.0037633036263287067, "learning_rate": 0.2630680618833567, "loss": 0.2283, "num_input_tokens_seen": 7910848, "step": 9130 }, { "epoch": 4.306930693069307, "grad_norm": 0.006465422920882702, "learning_rate": 0.26302934567719566, "loss": 0.2735, "num_input_tokens_seen": 7915536, "step": 9135 }, { "epoch": 4.30928807166431, "grad_norm": 0.002964041195809841, "learning_rate": 0.2629906120404892, "loss": 0.2263, "num_input_tokens_seen": 7919904, "step": 9140 }, { "epoch": 4.311645450259312, "grad_norm": 0.0063500129617750645, "learning_rate": 0.26295186097921036, "loss": 0.237, "num_input_tokens_seen": 7925472, "step": 9145 }, { "epoch": 4.314002828854314, "grad_norm": 0.007083000149577856, "learning_rate": 0.2629130924993351, "loss": 0.2339, "num_input_tokens_seen": 7929152, "step": 9150 }, { "epoch": 4.316360207449316, "grad_norm": 0.005854646675288677, "learning_rate": 0.2628743066068421, "loss": 0.2349, "num_input_tokens_seen": 7933408, "step": 9155 }, { "epoch": 4.318717586044318, "grad_norm": 0.0035294583067297935, "learning_rate": 0.26283550330771244, "loss": 0.2103, "num_input_tokens_seen": 7937584, "step": 9160 }, { "epoch": 4.321074964639321, "grad_norm": 0.00582813611254096, "learning_rate": 0.2627966826079303, "loss": 0.2359, "num_input_tokens_seen": 7941616, "step": 9165 }, { "epoch": 4.323432343234323, "grad_norm": 0.004016651306301355, "learning_rate": 0.26275784451348216, "loss": 0.2085, "num_input_tokens_seen": 7946288, "step": 9170 }, { "epoch": 4.3257897218293255, "grad_norm": 0.0025169069413095713, "learning_rate": 0.2627189890303574, "loss": 0.1711, "num_input_tokens_seen": 7950496, "step": 9175 }, { "epoch": 4.328147100424328, "grad_norm": 0.002617758233100176, "learning_rate": 0.262680116164548, "loss": 0.2152, "num_input_tokens_seen": 7955056, "step": 9180 }, { "epoch": 4.33050447901933, "grad_norm": 0.004490070976316929, "learning_rate": 0.2626412259220487, "loss": 0.3224, "num_input_tokens_seen": 7959520, "step": 9185 }, { "epoch": 4.332861857614333, "grad_norm": 0.0029117155354470015, "learning_rate": 0.2626023183088568, "loss": 0.232, "num_input_tokens_seen": 7964944, "step": 9190 }, { "epoch": 4.335219236209335, "grad_norm": 0.005904405377805233, "learning_rate": 0.26256339333097234, "loss": 0.257, "num_input_tokens_seen": 7969600, "step": 9195 }, { "epoch": 4.3375766148043375, "grad_norm": 0.0056585026904940605, "learning_rate": 0.2625244509943981, "loss": 0.2833, "num_input_tokens_seen": 7974368, "step": 9200 }, { "epoch": 4.3375766148043375, "eval_loss": 0.2250385880470276, "eval_runtime": 21.8981, "eval_samples_per_second": 43.063, "eval_steps_per_second": 21.554, "num_input_tokens_seen": 7974368, "step": 9200 }, { "epoch": 4.33993399339934, "grad_norm": 0.0029976810328662395, "learning_rate": 0.2624854913051395, "loss": 0.2323, "num_input_tokens_seen": 7979056, "step": 9205 }, { "epoch": 4.342291371994342, "grad_norm": 0.0024618261959403753, "learning_rate": 0.26244651426920446, "loss": 0.1943, "num_input_tokens_seen": 7982976, "step": 9210 }, { "epoch": 4.344648750589345, "grad_norm": 0.003350294893607497, "learning_rate": 0.26240751989260386, "loss": 0.2648, "num_input_tokens_seen": 7987152, "step": 9215 }, { "epoch": 4.347006129184347, "grad_norm": 0.00448601134121418, "learning_rate": 0.2623685081813511, "loss": 0.2773, "num_input_tokens_seen": 7991024, "step": 9220 }, { "epoch": 4.3493635077793495, "grad_norm": 0.003992486745119095, "learning_rate": 0.2623294791414623, "loss": 0.2521, "num_input_tokens_seen": 7995344, "step": 9225 }, { "epoch": 4.351720886374352, "grad_norm": 0.0038789138197898865, "learning_rate": 0.26229043277895614, "loss": 0.2408, "num_input_tokens_seen": 7999584, "step": 9230 }, { "epoch": 4.354078264969354, "grad_norm": 0.003746961010619998, "learning_rate": 0.2622513690998542, "loss": 0.2147, "num_input_tokens_seen": 8003920, "step": 9235 }, { "epoch": 4.356435643564357, "grad_norm": 0.004681269638240337, "learning_rate": 0.26221228811018044, "loss": 0.2417, "num_input_tokens_seen": 8008032, "step": 9240 }, { "epoch": 4.358793022159359, "grad_norm": 0.005208021495491266, "learning_rate": 0.2621731898159617, "loss": 0.2336, "num_input_tokens_seen": 8013088, "step": 9245 }, { "epoch": 4.3611504007543616, "grad_norm": 0.0037083022762089968, "learning_rate": 0.26213407422322743, "loss": 0.2163, "num_input_tokens_seen": 8018240, "step": 9250 }, { "epoch": 4.363507779349364, "grad_norm": 0.0032967613078653812, "learning_rate": 0.2620949413380098, "loss": 0.2131, "num_input_tokens_seen": 8022032, "step": 9255 }, { "epoch": 4.3658651579443655, "grad_norm": 0.003296947106719017, "learning_rate": 0.26205579116634353, "loss": 0.2466, "num_input_tokens_seen": 8026400, "step": 9260 }, { "epoch": 4.368222536539368, "grad_norm": 0.0055676912888884544, "learning_rate": 0.26201662371426604, "loss": 0.191, "num_input_tokens_seen": 8030672, "step": 9265 }, { "epoch": 4.37057991513437, "grad_norm": 0.003398091997951269, "learning_rate": 0.2619774389878175, "loss": 0.1649, "num_input_tokens_seen": 8035136, "step": 9270 }, { "epoch": 4.372937293729373, "grad_norm": 0.001919765374623239, "learning_rate": 0.2619382369930407, "loss": 0.279, "num_input_tokens_seen": 8039152, "step": 9275 }, { "epoch": 4.375294672324375, "grad_norm": 0.00324031850323081, "learning_rate": 0.261899017735981, "loss": 0.2124, "num_input_tokens_seen": 8043344, "step": 9280 }, { "epoch": 4.3776520509193775, "grad_norm": 0.005271621979773045, "learning_rate": 0.2618597812226866, "loss": 0.2193, "num_input_tokens_seen": 8047536, "step": 9285 }, { "epoch": 4.38000942951438, "grad_norm": 0.005476982332766056, "learning_rate": 0.2618205274592082, "loss": 0.2479, "num_input_tokens_seen": 8051920, "step": 9290 }, { "epoch": 4.382366808109382, "grad_norm": 0.004577161278575659, "learning_rate": 0.2617812564515992, "loss": 0.2462, "num_input_tokens_seen": 8056416, "step": 9295 }, { "epoch": 4.384724186704385, "grad_norm": 0.002623429987579584, "learning_rate": 0.2617419682059158, "loss": 0.2454, "num_input_tokens_seen": 8060576, "step": 9300 }, { "epoch": 4.387081565299387, "grad_norm": 0.003817977849394083, "learning_rate": 0.26170266272821663, "loss": 0.2118, "num_input_tokens_seen": 8065024, "step": 9305 }, { "epoch": 4.3894389438943895, "grad_norm": 0.0037288011517375708, "learning_rate": 0.26166334002456315, "loss": 0.2342, "num_input_tokens_seen": 8070416, "step": 9310 }, { "epoch": 4.391796322489392, "grad_norm": 0.0017615173710510135, "learning_rate": 0.2616240001010194, "loss": 0.2295, "num_input_tokens_seen": 8074608, "step": 9315 }, { "epoch": 4.394153701084394, "grad_norm": 0.004655566532164812, "learning_rate": 0.26158464296365197, "loss": 0.2591, "num_input_tokens_seen": 8078976, "step": 9320 }, { "epoch": 4.396511079679397, "grad_norm": 0.0037334789521992207, "learning_rate": 0.2615452686185304, "loss": 0.2048, "num_input_tokens_seen": 8082880, "step": 9325 }, { "epoch": 4.398868458274399, "grad_norm": 0.0060518281534314156, "learning_rate": 0.26150587707172673, "loss": 0.2288, "num_input_tokens_seen": 8086144, "step": 9330 }, { "epoch": 4.4012258368694015, "grad_norm": 0.004281120374798775, "learning_rate": 0.2614664683293154, "loss": 0.2206, "num_input_tokens_seen": 8090960, "step": 9335 }, { "epoch": 4.403583215464404, "grad_norm": 0.00545575050637126, "learning_rate": 0.26142704239737397, "loss": 0.2332, "num_input_tokens_seen": 8095072, "step": 9340 }, { "epoch": 4.405940594059406, "grad_norm": 0.002180506708100438, "learning_rate": 0.26138759928198235, "loss": 0.2409, "num_input_tokens_seen": 8099248, "step": 9345 }, { "epoch": 4.408297972654409, "grad_norm": 0.0035082832910120487, "learning_rate": 0.26134813898922304, "loss": 0.2314, "num_input_tokens_seen": 8103616, "step": 9350 }, { "epoch": 4.41065535124941, "grad_norm": 0.00253653759136796, "learning_rate": 0.26130866152518145, "loss": 0.2184, "num_input_tokens_seen": 8107648, "step": 9355 }, { "epoch": 4.413012729844413, "grad_norm": 0.0032756056170910597, "learning_rate": 0.2612691668959455, "loss": 0.2087, "num_input_tokens_seen": 8112080, "step": 9360 }, { "epoch": 4.415370108439415, "grad_norm": 0.00418089097365737, "learning_rate": 0.2612296551076057, "loss": 0.2375, "num_input_tokens_seen": 8116784, "step": 9365 }, { "epoch": 4.417727487034417, "grad_norm": 0.0038985793944448233, "learning_rate": 0.26119012616625525, "loss": 0.1829, "num_input_tokens_seen": 8120880, "step": 9370 }, { "epoch": 4.42008486562942, "grad_norm": 0.002612902782857418, "learning_rate": 0.26115058007799, "loss": 0.2374, "num_input_tokens_seen": 8125120, "step": 9375 }, { "epoch": 4.422442244224422, "grad_norm": 0.003643402364104986, "learning_rate": 0.26111101684890864, "loss": 0.2426, "num_input_tokens_seen": 8129088, "step": 9380 }, { "epoch": 4.424799622819425, "grad_norm": 0.00430586701259017, "learning_rate": 0.26107143648511205, "loss": 0.231, "num_input_tokens_seen": 8134192, "step": 9385 }, { "epoch": 4.427157001414427, "grad_norm": 0.004517610650509596, "learning_rate": 0.2610318389927042, "loss": 0.2043, "num_input_tokens_seen": 8138080, "step": 9390 }, { "epoch": 4.429514380009429, "grad_norm": 0.005822936072945595, "learning_rate": 0.26099222437779146, "loss": 0.2243, "num_input_tokens_seen": 8142304, "step": 9395 }, { "epoch": 4.431871758604432, "grad_norm": 0.005944953765720129, "learning_rate": 0.26095259264648285, "loss": 0.2609, "num_input_tokens_seen": 8146384, "step": 9400 }, { "epoch": 4.431871758604432, "eval_loss": 0.22689031064510345, "eval_runtime": 21.9136, "eval_samples_per_second": 43.033, "eval_steps_per_second": 21.539, "num_input_tokens_seen": 8146384, "step": 9400 }, { "epoch": 4.434229137199434, "grad_norm": 0.004055943340063095, "learning_rate": 0.2609129438048902, "loss": 0.2265, "num_input_tokens_seen": 8150448, "step": 9405 }, { "epoch": 4.436586515794437, "grad_norm": 0.0041501265950500965, "learning_rate": 0.2608732778591278, "loss": 0.2223, "num_input_tokens_seen": 8154784, "step": 9410 }, { "epoch": 4.438943894389439, "grad_norm": 0.004107818007469177, "learning_rate": 0.2608335948153126, "loss": 0.2159, "num_input_tokens_seen": 8159008, "step": 9415 }, { "epoch": 4.441301272984441, "grad_norm": 0.004072488751262426, "learning_rate": 0.26079389467956426, "loss": 0.2152, "num_input_tokens_seen": 8163488, "step": 9420 }, { "epoch": 4.443658651579444, "grad_norm": 0.003146030707284808, "learning_rate": 0.26075417745800505, "loss": 0.2487, "num_input_tokens_seen": 8167440, "step": 9425 }, { "epoch": 4.446016030174446, "grad_norm": 0.004799408372491598, "learning_rate": 0.26071444315675985, "loss": 0.2174, "num_input_tokens_seen": 8172336, "step": 9430 }, { "epoch": 4.448373408769449, "grad_norm": 0.0035962937399744987, "learning_rate": 0.2606746917819562, "loss": 0.1783, "num_input_tokens_seen": 8176160, "step": 9435 }, { "epoch": 4.450730787364451, "grad_norm": 0.00557676050812006, "learning_rate": 0.2606349233397242, "loss": 0.2094, "num_input_tokens_seen": 8179952, "step": 9440 }, { "epoch": 4.4530881659594534, "grad_norm": 0.00624077906832099, "learning_rate": 0.26059513783619676, "loss": 0.2379, "num_input_tokens_seen": 8184048, "step": 9445 }, { "epoch": 4.455445544554456, "grad_norm": 0.010251578874886036, "learning_rate": 0.26055533527750924, "loss": 0.255, "num_input_tokens_seen": 8188368, "step": 9450 }, { "epoch": 4.457802923149458, "grad_norm": 0.039820559322834015, "learning_rate": 0.26051551566979964, "loss": 0.2355, "num_input_tokens_seen": 8194000, "step": 9455 }, { "epoch": 4.46016030174446, "grad_norm": 0.009643997997045517, "learning_rate": 0.26047567901920876, "loss": 0.4538, "num_input_tokens_seen": 8201072, "step": 9460 }, { "epoch": 4.462517680339462, "grad_norm": 0.005420724395662546, "learning_rate": 0.2604358253318798, "loss": 0.1938, "num_input_tokens_seen": 8205680, "step": 9465 }, { "epoch": 4.464875058934465, "grad_norm": 0.01084974780678749, "learning_rate": 0.26039595461395876, "loss": 0.3105, "num_input_tokens_seen": 8210464, "step": 9470 }, { "epoch": 4.467232437529467, "grad_norm": 0.01789839379489422, "learning_rate": 0.26035606687159424, "loss": 0.2718, "num_input_tokens_seen": 8214752, "step": 9475 }, { "epoch": 4.469589816124469, "grad_norm": 0.005781130399554968, "learning_rate": 0.26031616211093733, "loss": 0.2405, "num_input_tokens_seen": 8218848, "step": 9480 }, { "epoch": 4.471947194719472, "grad_norm": 0.006741950288414955, "learning_rate": 0.26027624033814195, "loss": 0.2172, "num_input_tokens_seen": 8223200, "step": 9485 }, { "epoch": 4.474304573314474, "grad_norm": 0.005049307364970446, "learning_rate": 0.2602363015593645, "loss": 0.1959, "num_input_tokens_seen": 8227280, "step": 9490 }, { "epoch": 4.476661951909477, "grad_norm": 0.009826654568314552, "learning_rate": 0.26019634578076395, "loss": 0.2397, "num_input_tokens_seen": 8232304, "step": 9495 }, { "epoch": 4.479019330504479, "grad_norm": 0.009060743264853954, "learning_rate": 0.26015637300850214, "loss": 0.2526, "num_input_tokens_seen": 8236592, "step": 9500 }, { "epoch": 4.481376709099481, "grad_norm": 0.0037591268774122, "learning_rate": 0.26011638324874325, "loss": 0.1722, "num_input_tokens_seen": 8240800, "step": 9505 }, { "epoch": 4.483734087694484, "grad_norm": 0.003495309967547655, "learning_rate": 0.2600763765076543, "loss": 0.2709, "num_input_tokens_seen": 8244816, "step": 9510 }, { "epoch": 4.486091466289486, "grad_norm": 0.005895252339541912, "learning_rate": 0.2600363527914048, "loss": 0.2274, "num_input_tokens_seen": 8249136, "step": 9515 }, { "epoch": 4.488448844884489, "grad_norm": 0.004823778290301561, "learning_rate": 0.25999631210616686, "loss": 0.2103, "num_input_tokens_seen": 8253536, "step": 9520 }, { "epoch": 4.490806223479491, "grad_norm": 0.005318429786711931, "learning_rate": 0.25995625445811527, "loss": 0.2288, "num_input_tokens_seen": 8257504, "step": 9525 }, { "epoch": 4.493163602074493, "grad_norm": 0.005329417996108532, "learning_rate": 0.2599161798534275, "loss": 0.237, "num_input_tokens_seen": 8261728, "step": 9530 }, { "epoch": 4.495520980669496, "grad_norm": 0.006978516932576895, "learning_rate": 0.25987608829828346, "loss": 0.2462, "num_input_tokens_seen": 8266592, "step": 9535 }, { "epoch": 4.497878359264498, "grad_norm": 0.00986330397427082, "learning_rate": 0.25983597979886586, "loss": 0.2445, "num_input_tokens_seen": 8270112, "step": 9540 }, { "epoch": 4.500235737859501, "grad_norm": 0.004314525052905083, "learning_rate": 0.2597958543613599, "loss": 0.2615, "num_input_tokens_seen": 8274656, "step": 9545 }, { "epoch": 4.502593116454502, "grad_norm": 0.006108290050178766, "learning_rate": 0.25975571199195335, "loss": 0.2265, "num_input_tokens_seen": 8279040, "step": 9550 }, { "epoch": 4.5049504950495045, "grad_norm": 0.02270430140197277, "learning_rate": 0.25971555269683677, "loss": 0.252, "num_input_tokens_seen": 8283088, "step": 9555 }, { "epoch": 4.507307873644507, "grad_norm": 0.004617371596395969, "learning_rate": 0.25967537648220324, "loss": 0.2596, "num_input_tokens_seen": 8287664, "step": 9560 }, { "epoch": 4.509665252239509, "grad_norm": 0.005759835243225098, "learning_rate": 0.2596351833542483, "loss": 0.2208, "num_input_tokens_seen": 8291872, "step": 9565 }, { "epoch": 4.512022630834512, "grad_norm": 0.006046622525900602, "learning_rate": 0.25959497331917036, "loss": 0.2352, "num_input_tokens_seen": 8295744, "step": 9570 }, { "epoch": 4.514380009429514, "grad_norm": 0.00503824558109045, "learning_rate": 0.2595547463831703, "loss": 0.2361, "num_input_tokens_seen": 8300016, "step": 9575 }, { "epoch": 4.5167373880245165, "grad_norm": 0.007392666768282652, "learning_rate": 0.25951450255245156, "loss": 0.2336, "num_input_tokens_seen": 8303840, "step": 9580 }, { "epoch": 4.519094766619519, "grad_norm": 0.007078335154801607, "learning_rate": 0.2594742418332203, "loss": 0.2507, "num_input_tokens_seen": 8307472, "step": 9585 }, { "epoch": 4.521452145214521, "grad_norm": 0.0046226088888943195, "learning_rate": 0.2594339642316852, "loss": 0.197, "num_input_tokens_seen": 8312128, "step": 9590 }, { "epoch": 4.523809523809524, "grad_norm": 0.010770881548523903, "learning_rate": 0.2593936697540576, "loss": 0.3129, "num_input_tokens_seen": 8316672, "step": 9595 }, { "epoch": 4.526166902404526, "grad_norm": 0.004187664482742548, "learning_rate": 0.2593533584065514, "loss": 0.2603, "num_input_tokens_seen": 8321456, "step": 9600 }, { "epoch": 4.526166902404526, "eval_loss": 0.22926250100135803, "eval_runtime": 21.8861, "eval_samples_per_second": 43.087, "eval_steps_per_second": 21.566, "num_input_tokens_seen": 8321456, "step": 9600 }, { "epoch": 4.5285242809995285, "grad_norm": 0.004887247458100319, "learning_rate": 0.2593130301953831, "loss": 0.2147, "num_input_tokens_seen": 8326384, "step": 9605 }, { "epoch": 4.530881659594531, "grad_norm": 0.0026440327055752277, "learning_rate": 0.2592726851267718, "loss": 0.2067, "num_input_tokens_seen": 8329920, "step": 9610 }, { "epoch": 4.533239038189533, "grad_norm": 0.004833553917706013, "learning_rate": 0.2592323232069393, "loss": 0.2381, "num_input_tokens_seen": 8334832, "step": 9615 }, { "epoch": 4.535596416784536, "grad_norm": 0.005247662775218487, "learning_rate": 0.25919194444210986, "loss": 0.2769, "num_input_tokens_seen": 8339584, "step": 9620 }, { "epoch": 4.537953795379538, "grad_norm": 0.003497653640806675, "learning_rate": 0.2591515488385103, "loss": 0.2397, "num_input_tokens_seen": 8343552, "step": 9625 }, { "epoch": 4.5403111739745405, "grad_norm": 0.003319311188533902, "learning_rate": 0.2591111364023704, "loss": 0.2314, "num_input_tokens_seen": 8348160, "step": 9630 }, { "epoch": 4.542668552569543, "grad_norm": 0.0033221289049834013, "learning_rate": 0.259070707139922, "loss": 0.2282, "num_input_tokens_seen": 8352080, "step": 9635 }, { "epoch": 4.545025931164545, "grad_norm": 0.0030070925131440163, "learning_rate": 0.25903026105739985, "loss": 0.2247, "num_input_tokens_seen": 8356576, "step": 9640 }, { "epoch": 4.547383309759548, "grad_norm": 0.003090352052822709, "learning_rate": 0.2589897981610413, "loss": 0.2339, "num_input_tokens_seen": 8361328, "step": 9645 }, { "epoch": 4.54974068835455, "grad_norm": 0.0038421889767050743, "learning_rate": 0.2589493184570863, "loss": 0.2487, "num_input_tokens_seen": 8365744, "step": 9650 }, { "epoch": 4.5520980669495525, "grad_norm": 0.005688461475074291, "learning_rate": 0.25890882195177717, "loss": 0.2455, "num_input_tokens_seen": 8369984, "step": 9655 }, { "epoch": 4.554455445544555, "grad_norm": 0.005062142387032509, "learning_rate": 0.25886830865135907, "loss": 0.2418, "num_input_tokens_seen": 8374432, "step": 9660 }, { "epoch": 4.5568128241395565, "grad_norm": 0.00412049749866128, "learning_rate": 0.25882777856207967, "loss": 0.2452, "num_input_tokens_seen": 8378144, "step": 9665 }, { "epoch": 4.559170202734559, "grad_norm": 0.003634281689301133, "learning_rate": 0.2587872316901892, "loss": 0.2149, "num_input_tokens_seen": 8383056, "step": 9670 }, { "epoch": 4.561527581329561, "grad_norm": 0.006366658490151167, "learning_rate": 0.25874666804194046, "loss": 0.2425, "num_input_tokens_seen": 8387696, "step": 9675 }, { "epoch": 4.563884959924564, "grad_norm": 0.003567873500287533, "learning_rate": 0.258706087623589, "loss": 0.1813, "num_input_tokens_seen": 8391712, "step": 9680 }, { "epoch": 4.566242338519566, "grad_norm": 0.0028828075155615807, "learning_rate": 0.25866549044139264, "loss": 0.1598, "num_input_tokens_seen": 8395360, "step": 9685 }, { "epoch": 4.5685997171145685, "grad_norm": 0.003961341921240091, "learning_rate": 0.25862487650161214, "loss": 0.26, "num_input_tokens_seen": 8399152, "step": 9690 }, { "epoch": 4.570957095709571, "grad_norm": 0.003979521803557873, "learning_rate": 0.2585842458105106, "loss": 0.2226, "num_input_tokens_seen": 8403232, "step": 9695 }, { "epoch": 4.573314474304573, "grad_norm": 0.0027804041747003794, "learning_rate": 0.2585435983743538, "loss": 0.1773, "num_input_tokens_seen": 8407712, "step": 9700 }, { "epoch": 4.575671852899576, "grad_norm": 0.0028531397692859173, "learning_rate": 0.2585029341994101, "loss": 0.1727, "num_input_tokens_seen": 8412192, "step": 9705 }, { "epoch": 4.578029231494578, "grad_norm": 0.0029682388994842768, "learning_rate": 0.2584622532919504, "loss": 0.2049, "num_input_tokens_seen": 8416144, "step": 9710 }, { "epoch": 4.5803866100895805, "grad_norm": 0.003179004881531, "learning_rate": 0.2584215556582482, "loss": 0.2312, "num_input_tokens_seen": 8420096, "step": 9715 }, { "epoch": 4.582743988684583, "grad_norm": 0.002416998613625765, "learning_rate": 0.25838084130457967, "loss": 0.1769, "num_input_tokens_seen": 8423664, "step": 9720 }, { "epoch": 4.585101367279585, "grad_norm": 0.003860981436446309, "learning_rate": 0.2583401102372234, "loss": 0.2452, "num_input_tokens_seen": 8428112, "step": 9725 }, { "epoch": 4.587458745874588, "grad_norm": 0.005476855207234621, "learning_rate": 0.2582993624624606, "loss": 0.2191, "num_input_tokens_seen": 8432224, "step": 9730 }, { "epoch": 4.58981612446959, "grad_norm": 0.002516490640118718, "learning_rate": 0.25825859798657513, "loss": 0.2393, "num_input_tokens_seen": 8436816, "step": 9735 }, { "epoch": 4.5921735030645925, "grad_norm": 0.0033048244658857584, "learning_rate": 0.25821781681585343, "loss": 0.2307, "num_input_tokens_seen": 8441200, "step": 9740 }, { "epoch": 4.594530881659595, "grad_norm": 0.0030626184307038784, "learning_rate": 0.2581770189565844, "loss": 0.2231, "num_input_tokens_seen": 8446096, "step": 9745 }, { "epoch": 4.596888260254596, "grad_norm": 0.003267318708822131, "learning_rate": 0.25813620441505963, "loss": 0.1796, "num_input_tokens_seen": 8450800, "step": 9750 }, { "epoch": 4.599245638849599, "grad_norm": 0.0036412018816918135, "learning_rate": 0.2580953731975732, "loss": 0.236, "num_input_tokens_seen": 8455296, "step": 9755 }, { "epoch": 4.601603017444601, "grad_norm": 0.00294292438775301, "learning_rate": 0.2580545253104218, "loss": 0.2127, "num_input_tokens_seen": 8459472, "step": 9760 }, { "epoch": 4.603960396039604, "grad_norm": 0.0035328813828527927, "learning_rate": 0.2580136607599047, "loss": 0.248, "num_input_tokens_seen": 8463280, "step": 9765 }, { "epoch": 4.606317774634606, "grad_norm": 0.0023099060636013746, "learning_rate": 0.2579727795523238, "loss": 0.2591, "num_input_tokens_seen": 8467360, "step": 9770 }, { "epoch": 4.608675153229608, "grad_norm": 0.004458182957023382, "learning_rate": 0.25793188169398334, "loss": 0.2515, "num_input_tokens_seen": 8471152, "step": 9775 }, { "epoch": 4.611032531824611, "grad_norm": 0.0029415234457701445, "learning_rate": 0.25789096719119037, "loss": 0.2369, "num_input_tokens_seen": 8474912, "step": 9780 }, { "epoch": 4.613389910419613, "grad_norm": 0.004502985626459122, "learning_rate": 0.2578500360502544, "loss": 0.2517, "num_input_tokens_seen": 8478464, "step": 9785 }, { "epoch": 4.615747289014616, "grad_norm": 0.0023702296894043684, "learning_rate": 0.2578090882774876, "loss": 0.193, "num_input_tokens_seen": 8482656, "step": 9790 }, { "epoch": 4.618104667609618, "grad_norm": 0.003934910520911217, "learning_rate": 0.25776812387920456, "loss": 0.2942, "num_input_tokens_seen": 8486848, "step": 9795 }, { "epoch": 4.62046204620462, "grad_norm": 0.0023481135722249746, "learning_rate": 0.2577271428617225, "loss": 0.2278, "num_input_tokens_seen": 8490096, "step": 9800 }, { "epoch": 4.62046204620462, "eval_loss": 0.22128428518772125, "eval_runtime": 21.9274, "eval_samples_per_second": 43.006, "eval_steps_per_second": 21.526, "num_input_tokens_seen": 8490096, "step": 9800 }, { "epoch": 4.622819424799623, "grad_norm": 0.003357951296493411, "learning_rate": 0.25768614523136124, "loss": 0.2353, "num_input_tokens_seen": 8494800, "step": 9805 }, { "epoch": 4.625176803394625, "grad_norm": 0.0028887705411762, "learning_rate": 0.25764513099444314, "loss": 0.2115, "num_input_tokens_seen": 8498928, "step": 9810 }, { "epoch": 4.627534181989628, "grad_norm": 0.0038435584865510464, "learning_rate": 0.25760410015729307, "loss": 0.2105, "num_input_tokens_seen": 8503264, "step": 9815 }, { "epoch": 4.62989156058463, "grad_norm": 0.003056455636397004, "learning_rate": 0.2575630527262385, "loss": 0.1997, "num_input_tokens_seen": 8506624, "step": 9820 }, { "epoch": 4.632248939179632, "grad_norm": 0.006272217258810997, "learning_rate": 0.25752198870760945, "loss": 0.2357, "num_input_tokens_seen": 8511440, "step": 9825 }, { "epoch": 4.634606317774635, "grad_norm": 0.0039057552348822355, "learning_rate": 0.2574809081077386, "loss": 0.2347, "num_input_tokens_seen": 8515584, "step": 9830 }, { "epoch": 4.636963696369637, "grad_norm": 0.003493529511615634, "learning_rate": 0.257439810932961, "loss": 0.2357, "num_input_tokens_seen": 8519280, "step": 9835 }, { "epoch": 4.63932107496464, "grad_norm": 0.008781437762081623, "learning_rate": 0.2573986971896144, "loss": 0.2397, "num_input_tokens_seen": 8522816, "step": 9840 }, { "epoch": 4.641678453559642, "grad_norm": 0.0033617110457271338, "learning_rate": 0.257357566884039, "loss": 0.2296, "num_input_tokens_seen": 8526720, "step": 9845 }, { "epoch": 4.644035832154644, "grad_norm": 0.0022055399604141712, "learning_rate": 0.25731642002257765, "loss": 0.2069, "num_input_tokens_seen": 8531808, "step": 9850 }, { "epoch": 4.646393210749647, "grad_norm": 0.00842844694852829, "learning_rate": 0.25727525661157574, "loss": 0.2492, "num_input_tokens_seen": 8536000, "step": 9855 }, { "epoch": 4.648750589344649, "grad_norm": 0.0037738282699137926, "learning_rate": 0.2572340766573811, "loss": 0.2444, "num_input_tokens_seen": 8539776, "step": 9860 }, { "epoch": 4.651107967939651, "grad_norm": 0.0038234624080359936, "learning_rate": 0.25719288016634434, "loss": 0.1719, "num_input_tokens_seen": 8544384, "step": 9865 }, { "epoch": 4.653465346534653, "grad_norm": 0.003316260641440749, "learning_rate": 0.25715166714481835, "loss": 0.1947, "num_input_tokens_seen": 8548464, "step": 9870 }, { "epoch": 4.655822725129656, "grad_norm": 0.006922050379216671, "learning_rate": 0.2571104375991587, "loss": 0.2491, "num_input_tokens_seen": 8552496, "step": 9875 }, { "epoch": 4.658180103724658, "grad_norm": 0.002857015235349536, "learning_rate": 0.2570691915357236, "loss": 0.2446, "num_input_tokens_seen": 8557104, "step": 9880 }, { "epoch": 4.66053748231966, "grad_norm": 0.0035318995360285044, "learning_rate": 0.2570279289608736, "loss": 0.2364, "num_input_tokens_seen": 8561568, "step": 9885 }, { "epoch": 4.662894860914663, "grad_norm": 0.0036992570385336876, "learning_rate": 0.256986649880972, "loss": 0.2249, "num_input_tokens_seen": 8565760, "step": 9890 }, { "epoch": 4.665252239509665, "grad_norm": 0.003426705254241824, "learning_rate": 0.25694535430238447, "loss": 0.2438, "num_input_tokens_seen": 8571344, "step": 9895 }, { "epoch": 4.667609618104668, "grad_norm": 0.0063272821716964245, "learning_rate": 0.25690404223147933, "loss": 0.2213, "num_input_tokens_seen": 8576432, "step": 9900 }, { "epoch": 4.66996699669967, "grad_norm": 0.0038960715755820274, "learning_rate": 0.2568627136746275, "loss": 0.2623, "num_input_tokens_seen": 8580640, "step": 9905 }, { "epoch": 4.672324375294672, "grad_norm": 0.004203151445835829, "learning_rate": 0.25682136863820226, "loss": 0.2446, "num_input_tokens_seen": 8585392, "step": 9910 }, { "epoch": 4.674681753889675, "grad_norm": 0.004907167516648769, "learning_rate": 0.25678000712857957, "loss": 0.2304, "num_input_tokens_seen": 8589488, "step": 9915 }, { "epoch": 4.677039132484677, "grad_norm": 0.004104039166122675, "learning_rate": 0.2567386291521379, "loss": 0.2174, "num_input_tokens_seen": 8593936, "step": 9920 }, { "epoch": 4.67939651107968, "grad_norm": 0.002158834831789136, "learning_rate": 0.2566972347152583, "loss": 0.2293, "num_input_tokens_seen": 8598976, "step": 9925 }, { "epoch": 4.681753889674682, "grad_norm": 0.003145094495266676, "learning_rate": 0.2566558238243242, "loss": 0.2081, "num_input_tokens_seen": 8603312, "step": 9930 }, { "epoch": 4.684111268269684, "grad_norm": 0.0036779381334781647, "learning_rate": 0.25661439648572176, "loss": 0.2235, "num_input_tokens_seen": 8607680, "step": 9935 }, { "epoch": 4.686468646864687, "grad_norm": 0.0023090678732842207, "learning_rate": 0.25657295270583963, "loss": 0.2296, "num_input_tokens_seen": 8611216, "step": 9940 }, { "epoch": 4.688826025459689, "grad_norm": 0.0047711776569485664, "learning_rate": 0.25653149249106894, "loss": 0.2636, "num_input_tokens_seen": 8615744, "step": 9945 }, { "epoch": 4.691183404054691, "grad_norm": 0.0027853306382894516, "learning_rate": 0.25649001584780323, "loss": 0.1936, "num_input_tokens_seen": 8620416, "step": 9950 }, { "epoch": 4.693540782649693, "grad_norm": 0.00333112059161067, "learning_rate": 0.2564485227824389, "loss": 0.2227, "num_input_tokens_seen": 8624480, "step": 9955 }, { "epoch": 4.6958981612446955, "grad_norm": 0.003930002450942993, "learning_rate": 0.25640701330137466, "loss": 0.1872, "num_input_tokens_seen": 8629120, "step": 9960 }, { "epoch": 4.698255539839698, "grad_norm": 0.0029448510613292456, "learning_rate": 0.2563654874110117, "loss": 0.1782, "num_input_tokens_seen": 8633744, "step": 9965 }, { "epoch": 4.7006129184347, "grad_norm": 0.004129595123231411, "learning_rate": 0.256323945117754, "loss": 0.3511, "num_input_tokens_seen": 8638912, "step": 9970 }, { "epoch": 4.702970297029703, "grad_norm": 0.00583377992734313, "learning_rate": 0.2562823864280078, "loss": 0.2579, "num_input_tokens_seen": 8643360, "step": 9975 }, { "epoch": 4.705327675624705, "grad_norm": 0.0033459847327321768, "learning_rate": 0.25624081134818194, "loss": 0.2358, "num_input_tokens_seen": 8647680, "step": 9980 }, { "epoch": 4.7076850542197075, "grad_norm": 0.010874238796532154, "learning_rate": 0.2561992198846879, "loss": 0.2614, "num_input_tokens_seen": 8652288, "step": 9985 }, { "epoch": 4.71004243281471, "grad_norm": 0.006391587201505899, "learning_rate": 0.25615761204393955, "loss": 0.2564, "num_input_tokens_seen": 8656768, "step": 9990 }, { "epoch": 4.712399811409712, "grad_norm": 0.014513351023197174, "learning_rate": 0.2561159878323534, "loss": 0.2325, "num_input_tokens_seen": 8661920, "step": 9995 }, { "epoch": 4.714757190004715, "grad_norm": 0.009979763068258762, "learning_rate": 0.2560743472563483, "loss": 0.2126, "num_input_tokens_seen": 8665904, "step": 10000 }, { "epoch": 4.714757190004715, "eval_loss": 0.22703884541988373, "eval_runtime": 21.9521, "eval_samples_per_second": 42.957, "eval_steps_per_second": 21.501, "num_input_tokens_seen": 8665904, "step": 10000 }, { "epoch": 4.717114568599717, "grad_norm": 0.0031102376524358988, "learning_rate": 0.25603269032234593, "loss": 0.206, "num_input_tokens_seen": 8669696, "step": 10005 }, { "epoch": 4.7194719471947195, "grad_norm": 0.002624074462801218, "learning_rate": 0.2559910170367702, "loss": 0.2055, "num_input_tokens_seen": 8673440, "step": 10010 }, { "epoch": 4.721829325789722, "grad_norm": 0.0036257850006222725, "learning_rate": 0.2559493274060477, "loss": 0.1968, "num_input_tokens_seen": 8677504, "step": 10015 }, { "epoch": 4.724186704384724, "grad_norm": 0.010346390306949615, "learning_rate": 0.2559076214366074, "loss": 0.2598, "num_input_tokens_seen": 8682544, "step": 10020 }, { "epoch": 4.726544082979727, "grad_norm": 0.004050659481436014, "learning_rate": 0.25586589913488106, "loss": 0.2369, "num_input_tokens_seen": 8686576, "step": 10025 }, { "epoch": 4.728901461574729, "grad_norm": 0.007511364761739969, "learning_rate": 0.2558241605073026, "loss": 0.2446, "num_input_tokens_seen": 8690032, "step": 10030 }, { "epoch": 4.7312588401697315, "grad_norm": 0.006256521679461002, "learning_rate": 0.25578240556030873, "loss": 0.2592, "num_input_tokens_seen": 8694320, "step": 10035 }, { "epoch": 4.733616218764734, "grad_norm": 0.006115077529102564, "learning_rate": 0.2557406343003386, "loss": 0.2225, "num_input_tokens_seen": 8698736, "step": 10040 }, { "epoch": 4.735973597359736, "grad_norm": 0.006495185662060976, "learning_rate": 0.25569884673383375, "loss": 0.235, "num_input_tokens_seen": 8702816, "step": 10045 }, { "epoch": 4.738330975954739, "grad_norm": 0.006397186312824488, "learning_rate": 0.25565704286723856, "loss": 0.2272, "num_input_tokens_seen": 8707568, "step": 10050 }, { "epoch": 4.740688354549741, "grad_norm": 0.006227480247616768, "learning_rate": 0.25561522270699955, "loss": 0.2294, "num_input_tokens_seen": 8711792, "step": 10055 }, { "epoch": 4.7430457331447435, "grad_norm": 0.012340952642261982, "learning_rate": 0.25557338625956594, "loss": 0.2274, "num_input_tokens_seen": 8715872, "step": 10060 }, { "epoch": 4.745403111739745, "grad_norm": 0.005235287360846996, "learning_rate": 0.25553153353138947, "loss": 0.2481, "num_input_tokens_seen": 8719776, "step": 10065 }, { "epoch": 4.7477604903347475, "grad_norm": 0.005572305992245674, "learning_rate": 0.2554896645289243, "loss": 0.2194, "num_input_tokens_seen": 8723888, "step": 10070 }, { "epoch": 4.75011786892975, "grad_norm": 0.0073296381160616875, "learning_rate": 0.2554477792586272, "loss": 0.1983, "num_input_tokens_seen": 8728048, "step": 10075 }, { "epoch": 4.752475247524752, "grad_norm": 0.002989573869854212, "learning_rate": 0.25540587772695744, "loss": 0.1702, "num_input_tokens_seen": 8733280, "step": 10080 }, { "epoch": 4.754832626119755, "grad_norm": 0.005363411735743284, "learning_rate": 0.2553639599403767, "loss": 0.3059, "num_input_tokens_seen": 8737872, "step": 10085 }, { "epoch": 4.757190004714757, "grad_norm": 0.004296573810279369, "learning_rate": 0.2553220259053493, "loss": 0.2908, "num_input_tokens_seen": 8742208, "step": 10090 }, { "epoch": 4.7595473833097595, "grad_norm": 0.004309081472456455, "learning_rate": 0.2552800756283419, "loss": 0.2834, "num_input_tokens_seen": 8746144, "step": 10095 }, { "epoch": 4.761904761904762, "grad_norm": 0.0028175327461212873, "learning_rate": 0.25523810911582373, "loss": 0.2627, "num_input_tokens_seen": 8750000, "step": 10100 }, { "epoch": 4.764262140499764, "grad_norm": 0.002214773092418909, "learning_rate": 0.25519612637426675, "loss": 0.234, "num_input_tokens_seen": 8754080, "step": 10105 }, { "epoch": 4.766619519094767, "grad_norm": 0.00444077979773283, "learning_rate": 0.25515412741014504, "loss": 0.2597, "num_input_tokens_seen": 8758176, "step": 10110 }, { "epoch": 4.768976897689769, "grad_norm": 0.0037436443381011486, "learning_rate": 0.2551121122299355, "loss": 0.2135, "num_input_tokens_seen": 8762496, "step": 10115 }, { "epoch": 4.7713342762847715, "grad_norm": 0.0017685518832877278, "learning_rate": 0.2550700808401173, "loss": 0.2585, "num_input_tokens_seen": 8766992, "step": 10120 }, { "epoch": 4.773691654879774, "grad_norm": 0.001823101774789393, "learning_rate": 0.2550280332471722, "loss": 0.232, "num_input_tokens_seen": 8771360, "step": 10125 }, { "epoch": 4.776049033474776, "grad_norm": 0.0034805668983608484, "learning_rate": 0.2549859694575845, "loss": 0.231, "num_input_tokens_seen": 8775952, "step": 10130 }, { "epoch": 4.778406412069779, "grad_norm": 0.0030522567685693502, "learning_rate": 0.254943889477841, "loss": 0.2085, "num_input_tokens_seen": 8780464, "step": 10135 }, { "epoch": 4.780763790664781, "grad_norm": 0.0026623026933521032, "learning_rate": 0.25490179331443097, "loss": 0.2525, "num_input_tokens_seen": 8785264, "step": 10140 }, { "epoch": 4.7831211692597835, "grad_norm": 0.00334662152454257, "learning_rate": 0.25485968097384615, "loss": 0.2419, "num_input_tokens_seen": 8789904, "step": 10145 }, { "epoch": 4.785478547854785, "grad_norm": 0.003158440813422203, "learning_rate": 0.25481755246258075, "loss": 0.2221, "num_input_tokens_seen": 8794832, "step": 10150 }, { "epoch": 4.787835926449787, "grad_norm": 0.002311042742803693, "learning_rate": 0.2547754077871315, "loss": 0.2286, "num_input_tokens_seen": 8799680, "step": 10155 }, { "epoch": 4.79019330504479, "grad_norm": 0.0026144045405089855, "learning_rate": 0.25473324695399774, "loss": 0.2185, "num_input_tokens_seen": 8804256, "step": 10160 }, { "epoch": 4.792550683639792, "grad_norm": 0.0015132286353036761, "learning_rate": 0.25469106996968105, "loss": 0.2467, "num_input_tokens_seen": 8808496, "step": 10165 }, { "epoch": 4.794908062234795, "grad_norm": 0.005861798767000437, "learning_rate": 0.2546488768406858, "loss": 0.2271, "num_input_tokens_seen": 8812880, "step": 10170 }, { "epoch": 4.797265440829797, "grad_norm": 0.003185089910402894, "learning_rate": 0.25460666757351863, "loss": 0.2423, "num_input_tokens_seen": 8816720, "step": 10175 }, { "epoch": 4.799622819424799, "grad_norm": 0.0015774901257827878, "learning_rate": 0.25456444217468877, "loss": 0.2387, "num_input_tokens_seen": 8821040, "step": 10180 }, { "epoch": 4.801980198019802, "grad_norm": 0.002794100670143962, "learning_rate": 0.25452220065070785, "loss": 0.2122, "num_input_tokens_seen": 8825536, "step": 10185 }, { "epoch": 4.804337576614804, "grad_norm": 0.0017370900604873896, "learning_rate": 0.2544799430080901, "loss": 0.2491, "num_input_tokens_seen": 8829584, "step": 10190 }, { "epoch": 4.806694955209807, "grad_norm": 0.00214237323962152, "learning_rate": 0.2544376692533522, "loss": 0.3126, "num_input_tokens_seen": 8833904, "step": 10195 }, { "epoch": 4.809052333804809, "grad_norm": 0.001375768450088799, "learning_rate": 0.2543953793930132, "loss": 0.2166, "num_input_tokens_seen": 8837712, "step": 10200 }, { "epoch": 4.809052333804809, "eval_loss": 0.22068026661872864, "eval_runtime": 21.8971, "eval_samples_per_second": 43.065, "eval_steps_per_second": 21.555, "num_input_tokens_seen": 8837712, "step": 10200 }, { "epoch": 4.811409712399811, "grad_norm": 0.0020371403079479933, "learning_rate": 0.2543530734335948, "loss": 0.2026, "num_input_tokens_seen": 8842464, "step": 10205 }, { "epoch": 4.813767090994814, "grad_norm": 0.004257515538483858, "learning_rate": 0.2543107513816211, "loss": 0.2787, "num_input_tokens_seen": 8846768, "step": 10210 }, { "epoch": 4.816124469589816, "grad_norm": 0.0015303095569834113, "learning_rate": 0.25426841324361865, "loss": 0.2431, "num_input_tokens_seen": 8850944, "step": 10215 }, { "epoch": 4.818481848184819, "grad_norm": 0.002723381156101823, "learning_rate": 0.2542260590261166, "loss": 0.2516, "num_input_tokens_seen": 8855264, "step": 10220 }, { "epoch": 4.820839226779821, "grad_norm": 0.001737845130264759, "learning_rate": 0.2541836887356465, "loss": 0.242, "num_input_tokens_seen": 8859424, "step": 10225 }, { "epoch": 4.823196605374823, "grad_norm": 0.0038533592596650124, "learning_rate": 0.2541413023787423, "loss": 0.2771, "num_input_tokens_seen": 8864272, "step": 10230 }, { "epoch": 4.825553983969826, "grad_norm": 0.0027567155193537474, "learning_rate": 0.2540988999619405, "loss": 0.2575, "num_input_tokens_seen": 8868304, "step": 10235 }, { "epoch": 4.827911362564828, "grad_norm": 0.004842326510697603, "learning_rate": 0.25405648149178023, "loss": 0.2454, "num_input_tokens_seen": 8872832, "step": 10240 }, { "epoch": 4.830268741159831, "grad_norm": 0.001968837110325694, "learning_rate": 0.2540140469748028, "loss": 0.2221, "num_input_tokens_seen": 8877472, "step": 10245 }, { "epoch": 4.832626119754833, "grad_norm": 0.0018205288797616959, "learning_rate": 0.25397159641755224, "loss": 0.2036, "num_input_tokens_seen": 8882304, "step": 10250 }, { "epoch": 4.834983498349835, "grad_norm": 0.0015791808255016804, "learning_rate": 0.2539291298265749, "loss": 0.2284, "num_input_tokens_seen": 8886272, "step": 10255 }, { "epoch": 4.837340876944838, "grad_norm": 0.004217464942485094, "learning_rate": 0.2538866472084197, "loss": 0.2075, "num_input_tokens_seen": 8890368, "step": 10260 }, { "epoch": 4.839698255539839, "grad_norm": 0.001711296266876161, "learning_rate": 0.25384414856963794, "loss": 0.2283, "num_input_tokens_seen": 8894480, "step": 10265 }, { "epoch": 4.842055634134842, "grad_norm": 0.0027310035657137632, "learning_rate": 0.25380163391678356, "loss": 0.2166, "num_input_tokens_seen": 8898560, "step": 10270 }, { "epoch": 4.844413012729844, "grad_norm": 0.0016945303650572896, "learning_rate": 0.2537591032564127, "loss": 0.235, "num_input_tokens_seen": 8902608, "step": 10275 }, { "epoch": 4.8467703913248465, "grad_norm": 0.003419552929699421, "learning_rate": 0.25371655659508424, "loss": 0.2296, "num_input_tokens_seen": 8907040, "step": 10280 }, { "epoch": 4.849127769919849, "grad_norm": 0.0016422135522589087, "learning_rate": 0.25367399393935935, "loss": 0.2255, "num_input_tokens_seen": 8910704, "step": 10285 }, { "epoch": 4.851485148514851, "grad_norm": 0.002946835942566395, "learning_rate": 0.25363141529580174, "loss": 0.2231, "num_input_tokens_seen": 8915072, "step": 10290 }, { "epoch": 4.853842527109854, "grad_norm": 0.00183379917871207, "learning_rate": 0.2535888206709776, "loss": 0.2194, "num_input_tokens_seen": 8918976, "step": 10295 }, { "epoch": 4.856199905704856, "grad_norm": 0.001559894299134612, "learning_rate": 0.2535462100714555, "loss": 0.1925, "num_input_tokens_seen": 8923568, "step": 10300 }, { "epoch": 4.858557284299859, "grad_norm": 0.0023183412849903107, "learning_rate": 0.2535035835038066, "loss": 0.2546, "num_input_tokens_seen": 8927968, "step": 10305 }, { "epoch": 4.860914662894861, "grad_norm": 0.0023057523649185896, "learning_rate": 0.2534609409746044, "loss": 0.2439, "num_input_tokens_seen": 8932480, "step": 10310 }, { "epoch": 4.863272041489863, "grad_norm": 0.0016307109035551548, "learning_rate": 0.253418282490425, "loss": 0.216, "num_input_tokens_seen": 8936624, "step": 10315 }, { "epoch": 4.865629420084866, "grad_norm": 0.0017056745709851384, "learning_rate": 0.2533756080578467, "loss": 0.2387, "num_input_tokens_seen": 8941264, "step": 10320 }, { "epoch": 4.867986798679868, "grad_norm": 0.0024380641989409924, "learning_rate": 0.25333291768345056, "loss": 0.2222, "num_input_tokens_seen": 8945600, "step": 10325 }, { "epoch": 4.870344177274871, "grad_norm": 0.003554953495040536, "learning_rate": 0.25329021137381996, "loss": 0.2312, "num_input_tokens_seen": 8949616, "step": 10330 }, { "epoch": 4.872701555869873, "grad_norm": 0.0019154661567881703, "learning_rate": 0.25324748913554074, "loss": 0.2353, "num_input_tokens_seen": 8954672, "step": 10335 }, { "epoch": 4.875058934464875, "grad_norm": 0.0030683274380862713, "learning_rate": 0.2532047509752013, "loss": 0.2227, "num_input_tokens_seen": 8958576, "step": 10340 }, { "epoch": 4.877416313059878, "grad_norm": 0.0014877381036058068, "learning_rate": 0.25316199689939217, "loss": 0.2154, "num_input_tokens_seen": 8962880, "step": 10345 }, { "epoch": 4.879773691654879, "grad_norm": 0.0021655242890119553, "learning_rate": 0.2531192269147068, "loss": 0.2098, "num_input_tokens_seen": 8967952, "step": 10350 }, { "epoch": 4.882131070249882, "grad_norm": 0.0028664893470704556, "learning_rate": 0.2530764410277407, "loss": 0.2058, "num_input_tokens_seen": 8972160, "step": 10355 }, { "epoch": 4.884488448844884, "grad_norm": 0.0036216829903423786, "learning_rate": 0.25303363924509203, "loss": 0.2506, "num_input_tokens_seen": 8975888, "step": 10360 }, { "epoch": 4.8868458274398865, "grad_norm": 0.0016360852168872952, "learning_rate": 0.25299082157336145, "loss": 0.2178, "num_input_tokens_seen": 8980752, "step": 10365 }, { "epoch": 4.889203206034889, "grad_norm": 0.0016494934679940343, "learning_rate": 0.2529479880191519, "loss": 0.2383, "num_input_tokens_seen": 8984704, "step": 10370 }, { "epoch": 4.891560584629891, "grad_norm": 0.0020402274094522, "learning_rate": 0.2529051385890689, "loss": 0.2389, "num_input_tokens_seen": 8988432, "step": 10375 }, { "epoch": 4.893917963224894, "grad_norm": 0.0018654990708455443, "learning_rate": 0.2528622732897203, "loss": 0.2138, "num_input_tokens_seen": 8992704, "step": 10380 }, { "epoch": 4.896275341819896, "grad_norm": 0.0018442028667777777, "learning_rate": 0.25281939212771654, "loss": 0.2047, "num_input_tokens_seen": 8996560, "step": 10385 }, { "epoch": 4.8986327204148985, "grad_norm": 0.0025038737803697586, "learning_rate": 0.2527764951096704, "loss": 0.2352, "num_input_tokens_seen": 9001280, "step": 10390 }, { "epoch": 4.900990099009901, "grad_norm": 0.0023145771119743586, "learning_rate": 0.2527335822421971, "loss": 0.255, "num_input_tokens_seen": 9005616, "step": 10395 }, { "epoch": 4.903347477604903, "grad_norm": 0.0055069103837013245, "learning_rate": 0.25269065353191444, "loss": 0.2579, "num_input_tokens_seen": 9010400, "step": 10400 }, { "epoch": 4.903347477604903, "eval_loss": 0.24866719543933868, "eval_runtime": 21.911, "eval_samples_per_second": 43.038, "eval_steps_per_second": 21.542, "num_input_tokens_seen": 9010400, "step": 10400 }, { "epoch": 4.905704856199906, "grad_norm": 0.004870031028985977, "learning_rate": 0.2526477089854425, "loss": 0.2441, "num_input_tokens_seen": 9014832, "step": 10405 }, { "epoch": 4.908062234794908, "grad_norm": 0.002974500646814704, "learning_rate": 0.25260474860940385, "loss": 0.2375, "num_input_tokens_seen": 9019648, "step": 10410 }, { "epoch": 4.9104196133899105, "grad_norm": 0.001540199969895184, "learning_rate": 0.2525617724104236, "loss": 0.2121, "num_input_tokens_seen": 9024000, "step": 10415 }, { "epoch": 4.912776991984913, "grad_norm": 0.0016707731410861015, "learning_rate": 0.25251878039512915, "loss": 0.2241, "num_input_tokens_seen": 9028352, "step": 10420 }, { "epoch": 4.915134370579915, "grad_norm": 0.0013953062007203698, "learning_rate": 0.25247577257015047, "loss": 0.1846, "num_input_tokens_seen": 9032080, "step": 10425 }, { "epoch": 4.917491749174918, "grad_norm": 0.0015885273460298777, "learning_rate": 0.2524327489421198, "loss": 0.2745, "num_input_tokens_seen": 9036864, "step": 10430 }, { "epoch": 4.91984912776992, "grad_norm": 0.0033220627810806036, "learning_rate": 0.25238970951767203, "loss": 0.233, "num_input_tokens_seen": 9040800, "step": 10435 }, { "epoch": 4.9222065063649225, "grad_norm": 0.002091937931254506, "learning_rate": 0.25234665430344433, "loss": 0.2386, "num_input_tokens_seen": 9044864, "step": 10440 }, { "epoch": 4.924563884959925, "grad_norm": 0.004154612310230732, "learning_rate": 0.2523035833060764, "loss": 0.229, "num_input_tokens_seen": 9049024, "step": 10445 }, { "epoch": 4.926921263554927, "grad_norm": 0.003014535177499056, "learning_rate": 0.2522604965322103, "loss": 0.2057, "num_input_tokens_seen": 9053248, "step": 10450 }, { "epoch": 4.92927864214993, "grad_norm": 0.002263062633574009, "learning_rate": 0.25221739398849047, "loss": 0.2329, "num_input_tokens_seen": 9057376, "step": 10455 }, { "epoch": 4.931636020744932, "grad_norm": 0.0018450884381309152, "learning_rate": 0.252174275681564, "loss": 0.2425, "num_input_tokens_seen": 9061024, "step": 10460 }, { "epoch": 4.933993399339934, "grad_norm": 0.0014395025791600347, "learning_rate": 0.2521311416180802, "loss": 0.2082, "num_input_tokens_seen": 9065664, "step": 10465 }, { "epoch": 4.936350777934936, "grad_norm": 0.003026581835001707, "learning_rate": 0.25208799180469094, "loss": 0.2503, "num_input_tokens_seen": 9069968, "step": 10470 }, { "epoch": 4.938708156529938, "grad_norm": 0.0015050627989694476, "learning_rate": 0.2520448262480504, "loss": 0.2301, "num_input_tokens_seen": 9074432, "step": 10475 }, { "epoch": 4.941065535124941, "grad_norm": 0.0021908835042268038, "learning_rate": 0.25200164495481525, "loss": 0.2265, "num_input_tokens_seen": 9078816, "step": 10480 }, { "epoch": 4.943422913719943, "grad_norm": 0.003680185414850712, "learning_rate": 0.25195844793164474, "loss": 0.2227, "num_input_tokens_seen": 9083712, "step": 10485 }, { "epoch": 4.945780292314946, "grad_norm": 0.0018576316069811583, "learning_rate": 0.2519152351852001, "loss": 0.2193, "num_input_tokens_seen": 9088880, "step": 10490 }, { "epoch": 4.948137670909948, "grad_norm": 0.0018354365602135658, "learning_rate": 0.25187200672214555, "loss": 0.2327, "num_input_tokens_seen": 9093760, "step": 10495 }, { "epoch": 4.9504950495049505, "grad_norm": 0.0014261736068874598, "learning_rate": 0.2518287625491473, "loss": 0.1943, "num_input_tokens_seen": 9098944, "step": 10500 }, { "epoch": 4.952852428099953, "grad_norm": 0.0025744158774614334, "learning_rate": 0.25178550267287425, "loss": 0.1975, "num_input_tokens_seen": 9102640, "step": 10505 }, { "epoch": 4.955209806694955, "grad_norm": 0.00256257108412683, "learning_rate": 0.2517422270999976, "loss": 0.2771, "num_input_tokens_seen": 9107072, "step": 10510 }, { "epoch": 4.957567185289958, "grad_norm": 0.0013107958948239684, "learning_rate": 0.2516989358371909, "loss": 0.2114, "num_input_tokens_seen": 9111216, "step": 10515 }, { "epoch": 4.95992456388496, "grad_norm": 0.0016697305254638195, "learning_rate": 0.25165562889113025, "loss": 0.2262, "num_input_tokens_seen": 9116352, "step": 10520 }, { "epoch": 4.9622819424799625, "grad_norm": 0.0015130338724702597, "learning_rate": 0.2516123062684942, "loss": 0.2282, "num_input_tokens_seen": 9120528, "step": 10525 }, { "epoch": 4.964639321074965, "grad_norm": 0.0024297789204865694, "learning_rate": 0.25156896797596356, "loss": 0.2233, "num_input_tokens_seen": 9124560, "step": 10530 }, { "epoch": 4.966996699669967, "grad_norm": 0.0021261225920170546, "learning_rate": 0.2515256140202216, "loss": 0.2211, "num_input_tokens_seen": 9129120, "step": 10535 }, { "epoch": 4.96935407826497, "grad_norm": 0.001633642241358757, "learning_rate": 0.25148224440795425, "loss": 0.2366, "num_input_tokens_seen": 9132960, "step": 10540 }, { "epoch": 4.971711456859972, "grad_norm": 0.001610517967492342, "learning_rate": 0.2514388591458494, "loss": 0.212, "num_input_tokens_seen": 9137216, "step": 10545 }, { "epoch": 4.974068835454974, "grad_norm": 0.0018851879285648465, "learning_rate": 0.2513954582405977, "loss": 0.2295, "num_input_tokens_seen": 9141776, "step": 10550 }, { "epoch": 4.976426214049976, "grad_norm": 0.0026965998113155365, "learning_rate": 0.2513520416988922, "loss": 0.2289, "num_input_tokens_seen": 9145696, "step": 10555 }, { "epoch": 4.978783592644978, "grad_norm": 0.00170559820253402, "learning_rate": 0.2513086095274281, "loss": 0.2249, "num_input_tokens_seen": 9149904, "step": 10560 }, { "epoch": 4.981140971239981, "grad_norm": 0.001909121172502637, "learning_rate": 0.25126516173290336, "loss": 0.2299, "num_input_tokens_seen": 9154544, "step": 10565 }, { "epoch": 4.983498349834983, "grad_norm": 0.0016581149538978934, "learning_rate": 0.2512216983220181, "loss": 0.2198, "num_input_tokens_seen": 9158768, "step": 10570 }, { "epoch": 4.985855728429986, "grad_norm": 0.0031685964204370975, "learning_rate": 0.25117821930147494, "loss": 0.2033, "num_input_tokens_seen": 9163232, "step": 10575 }, { "epoch": 4.988213107024988, "grad_norm": 0.002248473232612014, "learning_rate": 0.2511347246779788, "loss": 0.1533, "num_input_tokens_seen": 9167456, "step": 10580 }, { "epoch": 4.99057048561999, "grad_norm": 0.005142302718013525, "learning_rate": 0.25109121445823723, "loss": 0.3131, "num_input_tokens_seen": 9172288, "step": 10585 }, { "epoch": 4.992927864214993, "grad_norm": 0.001775240758433938, "learning_rate": 0.25104768864896004, "loss": 0.2691, "num_input_tokens_seen": 9176736, "step": 10590 }, { "epoch": 4.995285242809995, "grad_norm": 0.0018103191396221519, "learning_rate": 0.2510041472568594, "loss": 0.2095, "num_input_tokens_seen": 9180880, "step": 10595 }, { "epoch": 4.997642621404998, "grad_norm": 0.001738940947689116, "learning_rate": 0.25096059028864987, "loss": 0.2256, "num_input_tokens_seen": 9185584, "step": 10600 }, { "epoch": 4.997642621404998, "eval_loss": 0.22060146927833557, "eval_runtime": 21.9577, "eval_samples_per_second": 42.946, "eval_steps_per_second": 21.496, "num_input_tokens_seen": 9185584, "step": 10600 }, { "epoch": 5.0, "grad_norm": 0.0017867760034278035, "learning_rate": 0.25091701775104863, "loss": 0.2195, "num_input_tokens_seen": 9189824, "step": 10605 }, { "epoch": 5.002357378595002, "grad_norm": 0.0036467607133090496, "learning_rate": 0.250873429650775, "loss": 0.2468, "num_input_tokens_seen": 9194368, "step": 10610 }, { "epoch": 5.004714757190005, "grad_norm": 0.0017336803721264005, "learning_rate": 0.25082982599455095, "loss": 0.2228, "num_input_tokens_seen": 9199328, "step": 10615 }, { "epoch": 5.007072135785007, "grad_norm": 0.0011912789195775986, "learning_rate": 0.2507862067891006, "loss": 0.2274, "num_input_tokens_seen": 9202896, "step": 10620 }, { "epoch": 5.00942951438001, "grad_norm": 0.0023062515538185835, "learning_rate": 0.25074257204115064, "loss": 0.2302, "num_input_tokens_seen": 9207088, "step": 10625 }, { "epoch": 5.011786892975012, "grad_norm": 0.001548964879475534, "learning_rate": 0.25069892175742997, "loss": 0.2188, "num_input_tokens_seen": 9211488, "step": 10630 }, { "epoch": 5.014144271570014, "grad_norm": 0.0010844528442248702, "learning_rate": 0.25065525594467014, "loss": 0.2214, "num_input_tokens_seen": 9215024, "step": 10635 }, { "epoch": 5.016501650165017, "grad_norm": 0.0024387016892433167, "learning_rate": 0.2506115746096049, "loss": 0.1965, "num_input_tokens_seen": 9218928, "step": 10640 }, { "epoch": 5.018859028760019, "grad_norm": 0.0014010679442435503, "learning_rate": 0.25056787775897055, "loss": 0.2221, "num_input_tokens_seen": 9222992, "step": 10645 }, { "epoch": 5.021216407355022, "grad_norm": 0.0014597059926018119, "learning_rate": 0.2505241653995056, "loss": 0.1899, "num_input_tokens_seen": 9227264, "step": 10650 }, { "epoch": 5.023573785950024, "grad_norm": 0.001042926567606628, "learning_rate": 0.25048043753795113, "loss": 0.1737, "num_input_tokens_seen": 9231808, "step": 10655 }, { "epoch": 5.0259311645450255, "grad_norm": 0.0010163764236494899, "learning_rate": 0.2504366941810504, "loss": 0.1785, "num_input_tokens_seen": 9237072, "step": 10660 }, { "epoch": 5.028288543140028, "grad_norm": 0.003876360598951578, "learning_rate": 0.2503929353355493, "loss": 0.2262, "num_input_tokens_seen": 9241808, "step": 10665 }, { "epoch": 5.03064592173503, "grad_norm": 0.002344526583328843, "learning_rate": 0.250349161008196, "loss": 0.1764, "num_input_tokens_seen": 9245840, "step": 10670 }, { "epoch": 5.033003300330033, "grad_norm": 0.0014534560032188892, "learning_rate": 0.2503053712057409, "loss": 0.2453, "num_input_tokens_seen": 9249728, "step": 10675 }, { "epoch": 5.035360678925035, "grad_norm": 0.002834377344697714, "learning_rate": 0.25026156593493715, "loss": 0.2956, "num_input_tokens_seen": 9254080, "step": 10680 }, { "epoch": 5.0377180575200375, "grad_norm": 0.0026421749498695135, "learning_rate": 0.2502177452025399, "loss": 0.2512, "num_input_tokens_seen": 9257952, "step": 10685 }, { "epoch": 5.04007543611504, "grad_norm": 0.0031543264631181955, "learning_rate": 0.25017390901530695, "loss": 0.265, "num_input_tokens_seen": 9261568, "step": 10690 }, { "epoch": 5.042432814710042, "grad_norm": 0.0021345820277929306, "learning_rate": 0.2501300573799984, "loss": 0.2388, "num_input_tokens_seen": 9266656, "step": 10695 }, { "epoch": 5.044790193305045, "grad_norm": 0.0032463984098285437, "learning_rate": 0.2500861903033766, "loss": 0.2296, "num_input_tokens_seen": 9271424, "step": 10700 }, { "epoch": 5.047147571900047, "grad_norm": 0.004770686384290457, "learning_rate": 0.25004230779220654, "loss": 0.2875, "num_input_tokens_seen": 9275696, "step": 10705 }, { "epoch": 5.0495049504950495, "grad_norm": 0.0016110694268718362, "learning_rate": 0.24999840985325542, "loss": 0.1924, "num_input_tokens_seen": 9280176, "step": 10710 }, { "epoch": 5.051862329090052, "grad_norm": 0.002559440676122904, "learning_rate": 0.24995449649329285, "loss": 0.2475, "num_input_tokens_seen": 9284528, "step": 10715 }, { "epoch": 5.054219707685054, "grad_norm": 0.0018230616115033627, "learning_rate": 0.2499105677190908, "loss": 0.2346, "num_input_tokens_seen": 9288912, "step": 10720 }, { "epoch": 5.056577086280057, "grad_norm": 0.0021518957801163197, "learning_rate": 0.24986662353742364, "loss": 0.2208, "num_input_tokens_seen": 9293120, "step": 10725 }, { "epoch": 5.058934464875059, "grad_norm": 0.0015826355665922165, "learning_rate": 0.24982266395506814, "loss": 0.2214, "num_input_tokens_seen": 9297088, "step": 10730 }, { "epoch": 5.061291843470062, "grad_norm": 0.001485955435782671, "learning_rate": 0.2497786889788034, "loss": 0.2091, "num_input_tokens_seen": 9300784, "step": 10735 }, { "epoch": 5.063649222065064, "grad_norm": 0.002488410333171487, "learning_rate": 0.24973469861541095, "loss": 0.2272, "num_input_tokens_seen": 9305200, "step": 10740 }, { "epoch": 5.066006600660066, "grad_norm": 0.0023279262240976095, "learning_rate": 0.24969069287167456, "loss": 0.213, "num_input_tokens_seen": 9309568, "step": 10745 }, { "epoch": 5.068363979255069, "grad_norm": 0.0015863998560234904, "learning_rate": 0.2496466717543806, "loss": 0.1581, "num_input_tokens_seen": 9314048, "step": 10750 }, { "epoch": 5.07072135785007, "grad_norm": 0.001054070657119155, "learning_rate": 0.24960263527031762, "loss": 0.2223, "num_input_tokens_seen": 9318416, "step": 10755 }, { "epoch": 5.073078736445073, "grad_norm": 0.0022177959326654673, "learning_rate": 0.24955858342627657, "loss": 0.2321, "num_input_tokens_seen": 9322880, "step": 10760 }, { "epoch": 5.075436115040075, "grad_norm": 0.005402293987572193, "learning_rate": 0.24951451622905083, "loss": 0.2653, "num_input_tokens_seen": 9327584, "step": 10765 }, { "epoch": 5.0777934936350775, "grad_norm": 0.004284504801034927, "learning_rate": 0.24947043368543612, "loss": 0.2304, "num_input_tokens_seen": 9331984, "step": 10770 }, { "epoch": 5.08015087223008, "grad_norm": 0.0022657192312180996, "learning_rate": 0.2494263358022305, "loss": 0.2208, "num_input_tokens_seen": 9336384, "step": 10775 }, { "epoch": 5.082508250825082, "grad_norm": 0.002195975976064801, "learning_rate": 0.24938222258623444, "loss": 0.2254, "num_input_tokens_seen": 9340352, "step": 10780 }, { "epoch": 5.084865629420085, "grad_norm": 0.0014843599637970328, "learning_rate": 0.24933809404425075, "loss": 0.2218, "num_input_tokens_seen": 9344656, "step": 10785 }, { "epoch": 5.087223008015087, "grad_norm": 0.0013890491100028157, "learning_rate": 0.24929395018308453, "loss": 0.1925, "num_input_tokens_seen": 9349920, "step": 10790 }, { "epoch": 5.0895803866100895, "grad_norm": 0.0023382019717246294, "learning_rate": 0.24924979100954348, "loss": 0.2345, "num_input_tokens_seen": 9354032, "step": 10795 }, { "epoch": 5.091937765205092, "grad_norm": 0.0033197090961039066, "learning_rate": 0.24920561653043735, "loss": 0.2219, "num_input_tokens_seen": 9358160, "step": 10800 }, { "epoch": 5.091937765205092, "eval_loss": 0.22373640537261963, "eval_runtime": 21.9153, "eval_samples_per_second": 43.029, "eval_steps_per_second": 21.537, "num_input_tokens_seen": 9358160, "step": 10800 }, { "epoch": 5.094295143800094, "grad_norm": 0.0014296055305749178, "learning_rate": 0.24916142675257846, "loss": 0.2433, "num_input_tokens_seen": 9362512, "step": 10805 }, { "epoch": 5.096652522395097, "grad_norm": 0.002428998937830329, "learning_rate": 0.24911722168278144, "loss": 0.2581, "num_input_tokens_seen": 9366928, "step": 10810 }, { "epoch": 5.099009900990099, "grad_norm": 0.0014241700991988182, "learning_rate": 0.24907300132786328, "loss": 0.2217, "num_input_tokens_seen": 9371136, "step": 10815 }, { "epoch": 5.1013672795851015, "grad_norm": 0.001293994253501296, "learning_rate": 0.24902876569464322, "loss": 0.2266, "num_input_tokens_seen": 9375264, "step": 10820 }, { "epoch": 5.103724658180104, "grad_norm": 0.0021044372115284204, "learning_rate": 0.24898451478994305, "loss": 0.2349, "num_input_tokens_seen": 9380128, "step": 10825 }, { "epoch": 5.106082036775106, "grad_norm": 0.0018773652845993638, "learning_rate": 0.2489402486205868, "loss": 0.2334, "num_input_tokens_seen": 9384512, "step": 10830 }, { "epoch": 5.108439415370109, "grad_norm": 0.001599509734660387, "learning_rate": 0.24889596719340085, "loss": 0.2391, "num_input_tokens_seen": 9389424, "step": 10835 }, { "epoch": 5.110796793965111, "grad_norm": 0.001800917205400765, "learning_rate": 0.24885167051521392, "loss": 0.2362, "num_input_tokens_seen": 9393344, "step": 10840 }, { "epoch": 5.1131541725601135, "grad_norm": 0.0018363536801189184, "learning_rate": 0.24880735859285716, "loss": 0.2372, "num_input_tokens_seen": 9397904, "step": 10845 }, { "epoch": 5.115511551155116, "grad_norm": 0.0012843572767451406, "learning_rate": 0.24876303143316406, "loss": 0.226, "num_input_tokens_seen": 9401776, "step": 10850 }, { "epoch": 5.117868929750118, "grad_norm": 0.0014027270954102278, "learning_rate": 0.24871868904297031, "loss": 0.2172, "num_input_tokens_seen": 9405936, "step": 10855 }, { "epoch": 5.12022630834512, "grad_norm": 0.0011954716173931956, "learning_rate": 0.24867433142911416, "loss": 0.189, "num_input_tokens_seen": 9410640, "step": 10860 }, { "epoch": 5.122583686940122, "grad_norm": 0.003320219460874796, "learning_rate": 0.24862995859843612, "loss": 0.2892, "num_input_tokens_seen": 9415120, "step": 10865 }, { "epoch": 5.124941065535125, "grad_norm": 0.0019702287390828133, "learning_rate": 0.24858557055777897, "loss": 0.208, "num_input_tokens_seen": 9419248, "step": 10870 }, { "epoch": 5.127298444130127, "grad_norm": 0.0027095286641269922, "learning_rate": 0.24854116731398793, "loss": 0.22, "num_input_tokens_seen": 9424032, "step": 10875 }, { "epoch": 5.129655822725129, "grad_norm": 0.0013799879234284163, "learning_rate": 0.24849674887391052, "loss": 0.2408, "num_input_tokens_seen": 9428400, "step": 10880 }, { "epoch": 5.132013201320132, "grad_norm": 0.002387586748227477, "learning_rate": 0.2484523152443967, "loss": 0.2271, "num_input_tokens_seen": 9432944, "step": 10885 }, { "epoch": 5.134370579915134, "grad_norm": 0.0014568858314305544, "learning_rate": 0.24840786643229862, "loss": 0.2392, "num_input_tokens_seen": 9437792, "step": 10890 }, { "epoch": 5.136727958510137, "grad_norm": 0.001729584881104529, "learning_rate": 0.2483634024444709, "loss": 0.2245, "num_input_tokens_seen": 9441600, "step": 10895 }, { "epoch": 5.139085337105139, "grad_norm": 0.0015277467900887132, "learning_rate": 0.24831892328777033, "loss": 0.2117, "num_input_tokens_seen": 9446064, "step": 10900 }, { "epoch": 5.141442715700141, "grad_norm": 0.0015412027714774013, "learning_rate": 0.2482744289690563, "loss": 0.257, "num_input_tokens_seen": 9451536, "step": 10905 }, { "epoch": 5.143800094295144, "grad_norm": 0.0018606801750138402, "learning_rate": 0.2482299194951903, "loss": 0.2441, "num_input_tokens_seen": 9455904, "step": 10910 }, { "epoch": 5.146157472890146, "grad_norm": 0.0035080555826425552, "learning_rate": 0.2481853948730363, "loss": 0.2345, "num_input_tokens_seen": 9461824, "step": 10915 }, { "epoch": 5.148514851485149, "grad_norm": 0.0033204746432602406, "learning_rate": 0.24814085510946052, "loss": 0.2334, "num_input_tokens_seen": 9465440, "step": 10920 }, { "epoch": 5.150872230080151, "grad_norm": 0.002022932982072234, "learning_rate": 0.24809630021133158, "loss": 0.2345, "num_input_tokens_seen": 9470400, "step": 10925 }, { "epoch": 5.1532296086751535, "grad_norm": 0.0012817138340324163, "learning_rate": 0.24805173018552037, "loss": 0.2248, "num_input_tokens_seen": 9474496, "step": 10930 }, { "epoch": 5.155586987270156, "grad_norm": 0.004304759204387665, "learning_rate": 0.2480071450389002, "loss": 0.2364, "num_input_tokens_seen": 9478784, "step": 10935 }, { "epoch": 5.157944365865158, "grad_norm": 0.0015767533332109451, "learning_rate": 0.24796254477834662, "loss": 0.2313, "num_input_tokens_seen": 9482448, "step": 10940 }, { "epoch": 5.160301744460161, "grad_norm": 0.0025463809724897146, "learning_rate": 0.24791792941073754, "loss": 0.2257, "num_input_tokens_seen": 9486784, "step": 10945 }, { "epoch": 5.162659123055163, "grad_norm": 0.0018110288074240088, "learning_rate": 0.2478732989429533, "loss": 0.2466, "num_input_tokens_seen": 9491216, "step": 10950 }, { "epoch": 5.165016501650165, "grad_norm": 0.0022011820692569017, "learning_rate": 0.24782865338187632, "loss": 0.2259, "num_input_tokens_seen": 9495536, "step": 10955 }, { "epoch": 5.167373880245167, "grad_norm": 0.0015602281782776117, "learning_rate": 0.2477839927343916, "loss": 0.2068, "num_input_tokens_seen": 9499552, "step": 10960 }, { "epoch": 5.169731258840169, "grad_norm": 0.002355400938540697, "learning_rate": 0.2477393170073864, "loss": 0.1968, "num_input_tokens_seen": 9503712, "step": 10965 }, { "epoch": 5.172088637435172, "grad_norm": 0.0022267696913331747, "learning_rate": 0.2476946262077503, "loss": 0.1574, "num_input_tokens_seen": 9508544, "step": 10970 }, { "epoch": 5.174446016030174, "grad_norm": 0.00301539758220315, "learning_rate": 0.24764992034237507, "loss": 0.1935, "num_input_tokens_seen": 9512704, "step": 10975 }, { "epoch": 5.176803394625177, "grad_norm": 0.001113447593525052, "learning_rate": 0.24760519941815498, "loss": 0.1519, "num_input_tokens_seen": 9517216, "step": 10980 }, { "epoch": 5.179160773220179, "grad_norm": 0.003405479248613119, "learning_rate": 0.2475604634419866, "loss": 0.3768, "num_input_tokens_seen": 9522416, "step": 10985 }, { "epoch": 5.181518151815181, "grad_norm": 0.0032278024591505527, "learning_rate": 0.24751571242076872, "loss": 0.2224, "num_input_tokens_seen": 9526400, "step": 10990 }, { "epoch": 5.183875530410184, "grad_norm": 0.0029130878392606974, "learning_rate": 0.2474709463614025, "loss": 0.2318, "num_input_tokens_seen": 9531536, "step": 10995 }, { "epoch": 5.186232909005186, "grad_norm": 0.001300496980547905, "learning_rate": 0.24742616527079145, "loss": 0.2263, "num_input_tokens_seen": 9535520, "step": 11000 }, { "epoch": 5.186232909005186, "eval_loss": 0.22649720311164856, "eval_runtime": 21.9432, "eval_samples_per_second": 42.975, "eval_steps_per_second": 21.51, "num_input_tokens_seen": 9535520, "step": 11000 }, { "epoch": 5.188590287600189, "grad_norm": 0.0025213537737727165, "learning_rate": 0.24738136915584139, "loss": 0.2392, "num_input_tokens_seen": 9539696, "step": 11005 }, { "epoch": 5.190947666195191, "grad_norm": 0.003846134524792433, "learning_rate": 0.24733655802346047, "loss": 0.2325, "num_input_tokens_seen": 9543792, "step": 11010 }, { "epoch": 5.193305044790193, "grad_norm": 0.0015698407078161836, "learning_rate": 0.24729173188055906, "loss": 0.2052, "num_input_tokens_seen": 9548896, "step": 11015 }, { "epoch": 5.195662423385196, "grad_norm": 0.0019078021869063377, "learning_rate": 0.24724689073404996, "loss": 0.1879, "num_input_tokens_seen": 9552944, "step": 11020 }, { "epoch": 5.198019801980198, "grad_norm": 0.0019758448470383883, "learning_rate": 0.24720203459084822, "loss": 0.2087, "num_input_tokens_seen": 9557840, "step": 11025 }, { "epoch": 5.200377180575201, "grad_norm": 0.0024560161400586367, "learning_rate": 0.24715716345787123, "loss": 0.2588, "num_input_tokens_seen": 9562064, "step": 11030 }, { "epoch": 5.202734559170203, "grad_norm": 0.0019600773230195045, "learning_rate": 0.2471122773420387, "loss": 0.2117, "num_input_tokens_seen": 9566832, "step": 11035 }, { "epoch": 5.205091937765205, "grad_norm": 0.0015101187163963914, "learning_rate": 0.24706737625027259, "loss": 0.2467, "num_input_tokens_seen": 9571520, "step": 11040 }, { "epoch": 5.207449316360208, "grad_norm": 0.0017958359094336629, "learning_rate": 0.24702246018949725, "loss": 0.2279, "num_input_tokens_seen": 9576272, "step": 11045 }, { "epoch": 5.20980669495521, "grad_norm": 0.0020629449281841516, "learning_rate": 0.2469775291666393, "loss": 0.23, "num_input_tokens_seen": 9580704, "step": 11050 }, { "epoch": 5.212164073550213, "grad_norm": 0.002542410744354129, "learning_rate": 0.24693258318862765, "loss": 0.2315, "num_input_tokens_seen": 9585712, "step": 11055 }, { "epoch": 5.214521452145214, "grad_norm": 0.0016957283951342106, "learning_rate": 0.2468876222623935, "loss": 0.2166, "num_input_tokens_seen": 9590208, "step": 11060 }, { "epoch": 5.2168788307402165, "grad_norm": 0.0017622999148443341, "learning_rate": 0.2468426463948705, "loss": 0.2073, "num_input_tokens_seen": 9594672, "step": 11065 }, { "epoch": 5.219236209335219, "grad_norm": 0.001091672107577324, "learning_rate": 0.24679765559299438, "loss": 0.2344, "num_input_tokens_seen": 9599856, "step": 11070 }, { "epoch": 5.221593587930221, "grad_norm": 0.002150794258341193, "learning_rate": 0.24675264986370332, "loss": 0.1879, "num_input_tokens_seen": 9603440, "step": 11075 }, { "epoch": 5.223950966525224, "grad_norm": 0.0035143110435456038, "learning_rate": 0.2467076292139378, "loss": 0.2845, "num_input_tokens_seen": 9606960, "step": 11080 }, { "epoch": 5.226308345120226, "grad_norm": 0.0012609228724613786, "learning_rate": 0.24666259365064055, "loss": 0.2166, "num_input_tokens_seen": 9610752, "step": 11085 }, { "epoch": 5.2286657237152285, "grad_norm": 0.0017185568576678634, "learning_rate": 0.24661754318075663, "loss": 0.232, "num_input_tokens_seen": 9614880, "step": 11090 }, { "epoch": 5.231023102310231, "grad_norm": 0.0016587821301072836, "learning_rate": 0.2465724778112334, "loss": 0.2389, "num_input_tokens_seen": 9619408, "step": 11095 }, { "epoch": 5.233380480905233, "grad_norm": 0.002515598898753524, "learning_rate": 0.24652739754902042, "loss": 0.2414, "num_input_tokens_seen": 9623216, "step": 11100 }, { "epoch": 5.235737859500236, "grad_norm": 0.00206698733381927, "learning_rate": 0.24648230240106975, "loss": 0.2596, "num_input_tokens_seen": 9628032, "step": 11105 }, { "epoch": 5.238095238095238, "grad_norm": 0.0016193678602576256, "learning_rate": 0.2464371923743356, "loss": 0.2367, "num_input_tokens_seen": 9633072, "step": 11110 }, { "epoch": 5.2404526166902405, "grad_norm": 0.0012147077359259129, "learning_rate": 0.24639206747577444, "loss": 0.181, "num_input_tokens_seen": 9637600, "step": 11115 }, { "epoch": 5.242809995285243, "grad_norm": 0.001052511390298605, "learning_rate": 0.24634692771234515, "loss": 0.2052, "num_input_tokens_seen": 9641504, "step": 11120 }, { "epoch": 5.245167373880245, "grad_norm": 0.0011815240141004324, "learning_rate": 0.2463017730910088, "loss": 0.2075, "num_input_tokens_seen": 9645696, "step": 11125 }, { "epoch": 5.247524752475248, "grad_norm": 0.004300659988075495, "learning_rate": 0.2462566036187289, "loss": 0.2364, "num_input_tokens_seen": 9650528, "step": 11130 }, { "epoch": 5.24988213107025, "grad_norm": 0.002963155275210738, "learning_rate": 0.24621141930247106, "loss": 0.3011, "num_input_tokens_seen": 9653888, "step": 11135 }, { "epoch": 5.2522395096652525, "grad_norm": 0.0019008241361007094, "learning_rate": 0.2461662201492033, "loss": 0.2131, "num_input_tokens_seen": 9658768, "step": 11140 }, { "epoch": 5.254596888260255, "grad_norm": 0.0018571895780041814, "learning_rate": 0.24612100616589586, "loss": 0.2131, "num_input_tokens_seen": 9662976, "step": 11145 }, { "epoch": 5.256954266855257, "grad_norm": 0.0022801333107054234, "learning_rate": 0.24607577735952135, "loss": 0.17, "num_input_tokens_seen": 9667824, "step": 11150 }, { "epoch": 5.259311645450259, "grad_norm": 0.003788426984101534, "learning_rate": 0.24603053373705464, "loss": 0.2448, "num_input_tokens_seen": 9672864, "step": 11155 }, { "epoch": 5.261669024045261, "grad_norm": 0.002434577327221632, "learning_rate": 0.2459852753054728, "loss": 0.2744, "num_input_tokens_seen": 9676944, "step": 11160 }, { "epoch": 5.264026402640264, "grad_norm": 0.0022088466212153435, "learning_rate": 0.24594000207175526, "loss": 0.2326, "num_input_tokens_seen": 9681024, "step": 11165 }, { "epoch": 5.266383781235266, "grad_norm": 0.0017202841117978096, "learning_rate": 0.2458947140428838, "loss": 0.2408, "num_input_tokens_seen": 9685936, "step": 11170 }, { "epoch": 5.2687411598302685, "grad_norm": 0.004185760859400034, "learning_rate": 0.24584941122584233, "loss": 0.2475, "num_input_tokens_seen": 9689376, "step": 11175 }, { "epoch": 5.271098538425271, "grad_norm": 0.0024647661484777927, "learning_rate": 0.24580409362761713, "loss": 0.2344, "num_input_tokens_seen": 9693728, "step": 11180 }, { "epoch": 5.273455917020273, "grad_norm": 0.001956983469426632, "learning_rate": 0.2457587612551967, "loss": 0.2366, "num_input_tokens_seen": 9697952, "step": 11185 }, { "epoch": 5.275813295615276, "grad_norm": 0.0020128004252910614, "learning_rate": 0.24571341411557193, "loss": 0.192, "num_input_tokens_seen": 9701840, "step": 11190 }, { "epoch": 5.278170674210278, "grad_norm": 0.0027684196829795837, "learning_rate": 0.2456680522157359, "loss": 0.2245, "num_input_tokens_seen": 9705248, "step": 11195 }, { "epoch": 5.2805280528052805, "grad_norm": 0.0020245984196662903, "learning_rate": 0.245622675562684, "loss": 0.185, "num_input_tokens_seen": 9709232, "step": 11200 }, { "epoch": 5.2805280528052805, "eval_loss": 0.22369828820228577, "eval_runtime": 21.9379, "eval_samples_per_second": 42.985, "eval_steps_per_second": 21.515, "num_input_tokens_seen": 9709232, "step": 11200 }, { "epoch": 5.282885431400283, "grad_norm": 0.0013713166117668152, "learning_rate": 0.24557728416341384, "loss": 0.1959, "num_input_tokens_seen": 9712928, "step": 11205 }, { "epoch": 5.285242809995285, "grad_norm": 0.004494520835578442, "learning_rate": 0.24553187802492538, "loss": 0.2637, "num_input_tokens_seen": 9717616, "step": 11210 }, { "epoch": 5.287600188590288, "grad_norm": 0.0011971939820796251, "learning_rate": 0.24548645715422074, "loss": 0.1907, "num_input_tokens_seen": 9721968, "step": 11215 }, { "epoch": 5.28995756718529, "grad_norm": 0.0020063628908246756, "learning_rate": 0.2454410215583045, "loss": 0.1842, "num_input_tokens_seen": 9726688, "step": 11220 }, { "epoch": 5.2923149457802925, "grad_norm": 0.0018448256887495518, "learning_rate": 0.24539557124418332, "loss": 0.2395, "num_input_tokens_seen": 9730912, "step": 11225 }, { "epoch": 5.294672324375295, "grad_norm": 0.004227141849696636, "learning_rate": 0.24535010621886624, "loss": 0.2137, "num_input_tokens_seen": 9735136, "step": 11230 }, { "epoch": 5.297029702970297, "grad_norm": 0.003618549322709441, "learning_rate": 0.2453046264893646, "loss": 0.2505, "num_input_tokens_seen": 9739568, "step": 11235 }, { "epoch": 5.2993870815653, "grad_norm": 0.003519725287333131, "learning_rate": 0.24525913206269184, "loss": 0.2172, "num_input_tokens_seen": 9744976, "step": 11240 }, { "epoch": 5.301744460160302, "grad_norm": 0.00260105705820024, "learning_rate": 0.2452136229458638, "loss": 0.2164, "num_input_tokens_seen": 9748864, "step": 11245 }, { "epoch": 5.3041018387553045, "grad_norm": 0.0012512417742982507, "learning_rate": 0.24516809914589857, "loss": 0.2193, "num_input_tokens_seen": 9753840, "step": 11250 }, { "epoch": 5.306459217350307, "grad_norm": 0.0020031060557812452, "learning_rate": 0.2451225606698165, "loss": 0.214, "num_input_tokens_seen": 9758048, "step": 11255 }, { "epoch": 5.308816595945308, "grad_norm": 0.0031816039700061083, "learning_rate": 0.2450770075246402, "loss": 0.209, "num_input_tokens_seen": 9762000, "step": 11260 }, { "epoch": 5.311173974540311, "grad_norm": 0.0019977495539933443, "learning_rate": 0.24503143971739455, "loss": 0.2172, "num_input_tokens_seen": 9766112, "step": 11265 }, { "epoch": 5.313531353135313, "grad_norm": 0.0014568602200597525, "learning_rate": 0.24498585725510663, "loss": 0.2284, "num_input_tokens_seen": 9770160, "step": 11270 }, { "epoch": 5.315888731730316, "grad_norm": 0.0016768709756433964, "learning_rate": 0.24494026014480583, "loss": 0.2067, "num_input_tokens_seen": 9774816, "step": 11275 }, { "epoch": 5.318246110325318, "grad_norm": 0.0015292579773813486, "learning_rate": 0.24489464839352387, "loss": 0.2267, "num_input_tokens_seen": 9778672, "step": 11280 }, { "epoch": 5.32060348892032, "grad_norm": 0.0012707986170426011, "learning_rate": 0.2448490220082946, "loss": 0.2421, "num_input_tokens_seen": 9782000, "step": 11285 }, { "epoch": 5.322960867515323, "grad_norm": 0.0020803394727408886, "learning_rate": 0.24480338099615415, "loss": 0.2306, "num_input_tokens_seen": 9786048, "step": 11290 }, { "epoch": 5.325318246110325, "grad_norm": 0.0019657425582408905, "learning_rate": 0.244757725364141, "loss": 0.225, "num_input_tokens_seen": 9790592, "step": 11295 }, { "epoch": 5.327675624705328, "grad_norm": 0.001957591623067856, "learning_rate": 0.24471205511929583, "loss": 0.2278, "num_input_tokens_seen": 9794752, "step": 11300 }, { "epoch": 5.33003300330033, "grad_norm": 0.0019496369641274214, "learning_rate": 0.24466637026866145, "loss": 0.208, "num_input_tokens_seen": 9799664, "step": 11305 }, { "epoch": 5.332390381895332, "grad_norm": 0.0014565993333235383, "learning_rate": 0.2446206708192832, "loss": 0.2128, "num_input_tokens_seen": 9804048, "step": 11310 }, { "epoch": 5.334747760490335, "grad_norm": 0.0026904584374278784, "learning_rate": 0.2445749567782084, "loss": 0.2055, "num_input_tokens_seen": 9808992, "step": 11315 }, { "epoch": 5.337105139085337, "grad_norm": 0.0033633927814662457, "learning_rate": 0.2445292281524868, "loss": 0.2635, "num_input_tokens_seen": 9813104, "step": 11320 }, { "epoch": 5.33946251768034, "grad_norm": 0.008481680415570736, "learning_rate": 0.24448348494917022, "loss": 0.2266, "num_input_tokens_seen": 9817456, "step": 11325 }, { "epoch": 5.341819896275342, "grad_norm": 0.002741200150921941, "learning_rate": 0.24443772717531295, "loss": 0.2308, "num_input_tokens_seen": 9822336, "step": 11330 }, { "epoch": 5.344177274870344, "grad_norm": 0.0037526795640587807, "learning_rate": 0.24439195483797138, "loss": 0.2327, "num_input_tokens_seen": 9827168, "step": 11335 }, { "epoch": 5.346534653465347, "grad_norm": 0.0030065695755183697, "learning_rate": 0.24434616794420416, "loss": 0.2331, "num_input_tokens_seen": 9830928, "step": 11340 }, { "epoch": 5.348892032060349, "grad_norm": 0.0017801984213292599, "learning_rate": 0.24430036650107223, "loss": 0.2388, "num_input_tokens_seen": 9834848, "step": 11345 }, { "epoch": 5.351249410655352, "grad_norm": 0.0017228694632649422, "learning_rate": 0.2442545505156387, "loss": 0.1973, "num_input_tokens_seen": 9838512, "step": 11350 }, { "epoch": 5.353606789250353, "grad_norm": 0.003936372697353363, "learning_rate": 0.24420871999496904, "loss": 0.2755, "num_input_tokens_seen": 9842640, "step": 11355 }, { "epoch": 5.355964167845356, "grad_norm": 0.003623110009357333, "learning_rate": 0.24416287494613084, "loss": 0.2492, "num_input_tokens_seen": 9847904, "step": 11360 }, { "epoch": 5.358321546440358, "grad_norm": 0.0025485383812338114, "learning_rate": 0.24411701537619399, "loss": 0.2245, "num_input_tokens_seen": 9851712, "step": 11365 }, { "epoch": 5.36067892503536, "grad_norm": 0.001961366506293416, "learning_rate": 0.24407114129223062, "loss": 0.2012, "num_input_tokens_seen": 9854816, "step": 11370 }, { "epoch": 5.363036303630363, "grad_norm": 0.002851983066648245, "learning_rate": 0.2440252527013151, "loss": 0.2313, "num_input_tokens_seen": 9859664, "step": 11375 }, { "epoch": 5.365393682225365, "grad_norm": 0.004325107205659151, "learning_rate": 0.24397934961052403, "loss": 0.2275, "num_input_tokens_seen": 9863440, "step": 11380 }, { "epoch": 5.367751060820368, "grad_norm": 0.0026469980366528034, "learning_rate": 0.24393343202693618, "loss": 0.2088, "num_input_tokens_seen": 9867600, "step": 11385 }, { "epoch": 5.37010843941537, "grad_norm": 0.003807930974289775, "learning_rate": 0.2438874999576327, "loss": 0.2024, "num_input_tokens_seen": 9872208, "step": 11390 }, { "epoch": 5.372465818010372, "grad_norm": 0.002371760318055749, "learning_rate": 0.24384155340969688, "loss": 0.1965, "num_input_tokens_seen": 9876544, "step": 11395 }, { "epoch": 5.374823196605375, "grad_norm": 0.0037363239098340273, "learning_rate": 0.24379559239021423, "loss": 0.1847, "num_input_tokens_seen": 9880896, "step": 11400 }, { "epoch": 5.374823196605375, "eval_loss": 0.2308521419763565, "eval_runtime": 21.9422, "eval_samples_per_second": 42.976, "eval_steps_per_second": 21.511, "num_input_tokens_seen": 9880896, "step": 11400 }, { "epoch": 5.377180575200377, "grad_norm": 0.0032352821435779333, "learning_rate": 0.2437496169062725, "loss": 0.219, "num_input_tokens_seen": 9885840, "step": 11405 }, { "epoch": 5.37953795379538, "grad_norm": 0.004335922654718161, "learning_rate": 0.24370362696496176, "loss": 0.2603, "num_input_tokens_seen": 9889856, "step": 11410 }, { "epoch": 5.381895332390382, "grad_norm": 0.005886504892259836, "learning_rate": 0.24365762257337417, "loss": 0.2326, "num_input_tokens_seen": 9894976, "step": 11415 }, { "epoch": 5.384252710985384, "grad_norm": 0.0021525488700717688, "learning_rate": 0.2436116037386042, "loss": 0.2253, "num_input_tokens_seen": 9898720, "step": 11420 }, { "epoch": 5.386610089580387, "grad_norm": 0.002636562567204237, "learning_rate": 0.24356557046774852, "loss": 0.2387, "num_input_tokens_seen": 9902144, "step": 11425 }, { "epoch": 5.388967468175389, "grad_norm": 0.001826910418458283, "learning_rate": 0.24351952276790606, "loss": 0.2275, "num_input_tokens_seen": 9906016, "step": 11430 }, { "epoch": 5.391324846770392, "grad_norm": 0.002310969168320298, "learning_rate": 0.24347346064617797, "loss": 0.236, "num_input_tokens_seen": 9910160, "step": 11435 }, { "epoch": 5.393682225365394, "grad_norm": 0.002616526558995247, "learning_rate": 0.24342738410966758, "loss": 0.2108, "num_input_tokens_seen": 9914992, "step": 11440 }, { "epoch": 5.396039603960396, "grad_norm": 0.0016470474656671286, "learning_rate": 0.24338129316548046, "loss": 0.211, "num_input_tokens_seen": 9919824, "step": 11445 }, { "epoch": 5.398396982555399, "grad_norm": 0.002278881845995784, "learning_rate": 0.24333518782072444, "loss": 0.2238, "num_input_tokens_seen": 9923184, "step": 11450 }, { "epoch": 5.400754361150401, "grad_norm": 0.00238321116194129, "learning_rate": 0.24328906808250952, "loss": 0.2425, "num_input_tokens_seen": 9926976, "step": 11455 }, { "epoch": 5.403111739745403, "grad_norm": 0.0017931893235072494, "learning_rate": 0.243242933957948, "loss": 0.2062, "num_input_tokens_seen": 9931280, "step": 11460 }, { "epoch": 5.405469118340405, "grad_norm": 0.003933845087885857, "learning_rate": 0.24319678545415427, "loss": 0.2389, "num_input_tokens_seen": 9935376, "step": 11465 }, { "epoch": 5.4078264969354075, "grad_norm": 0.002394163515418768, "learning_rate": 0.24315062257824507, "loss": 0.2208, "num_input_tokens_seen": 9939616, "step": 11470 }, { "epoch": 5.41018387553041, "grad_norm": 0.0014431895688176155, "learning_rate": 0.24310444533733921, "loss": 0.1998, "num_input_tokens_seen": 9943808, "step": 11475 }, { "epoch": 5.412541254125412, "grad_norm": 0.0032012120354920626, "learning_rate": 0.2430582537385579, "loss": 0.2508, "num_input_tokens_seen": 9948352, "step": 11480 }, { "epoch": 5.414898632720415, "grad_norm": 0.002077660523355007, "learning_rate": 0.2430120477890244, "loss": 0.1297, "num_input_tokens_seen": 9952704, "step": 11485 }, { "epoch": 5.417256011315417, "grad_norm": 0.002724386751651764, "learning_rate": 0.24296582749586426, "loss": 0.2199, "num_input_tokens_seen": 9957440, "step": 11490 }, { "epoch": 5.4196133899104195, "grad_norm": 0.0017438612412661314, "learning_rate": 0.24291959286620526, "loss": 0.1892, "num_input_tokens_seen": 9961248, "step": 11495 }, { "epoch": 5.421970768505422, "grad_norm": 0.001281241187825799, "learning_rate": 0.24287334390717738, "loss": 0.2052, "num_input_tokens_seen": 9965520, "step": 11500 }, { "epoch": 5.424328147100424, "grad_norm": 0.00312383403070271, "learning_rate": 0.24282708062591268, "loss": 0.2256, "num_input_tokens_seen": 9970192, "step": 11505 }, { "epoch": 5.426685525695427, "grad_norm": 0.002079016761854291, "learning_rate": 0.24278080302954563, "loss": 0.2035, "num_input_tokens_seen": 9974320, "step": 11510 }, { "epoch": 5.429042904290429, "grad_norm": 0.0014294630382210016, "learning_rate": 0.24273451112521283, "loss": 0.1906, "num_input_tokens_seen": 9978752, "step": 11515 }, { "epoch": 5.4314002828854315, "grad_norm": 0.0014103431021794677, "learning_rate": 0.242688204920053, "loss": 0.2584, "num_input_tokens_seen": 9982768, "step": 11520 }, { "epoch": 5.433757661480434, "grad_norm": 0.001641866285353899, "learning_rate": 0.24264188442120715, "loss": 0.2205, "num_input_tokens_seen": 9987088, "step": 11525 }, { "epoch": 5.436115040075436, "grad_norm": 0.0017979093827307224, "learning_rate": 0.24259554963581853, "loss": 0.2195, "num_input_tokens_seen": 9991600, "step": 11530 }, { "epoch": 5.438472418670439, "grad_norm": 0.0029355494771152735, "learning_rate": 0.24254920057103257, "loss": 0.2077, "num_input_tokens_seen": 9996464, "step": 11535 }, { "epoch": 5.440829797265441, "grad_norm": 0.007207668386399746, "learning_rate": 0.24250283723399685, "loss": 0.2368, "num_input_tokens_seen": 10001024, "step": 11540 }, { "epoch": 5.4431871758604435, "grad_norm": 0.001982905901968479, "learning_rate": 0.24245645963186108, "loss": 0.2347, "num_input_tokens_seen": 10005488, "step": 11545 }, { "epoch": 5.445544554455446, "grad_norm": 0.002146466402336955, "learning_rate": 0.2424100677717774, "loss": 0.2111, "num_input_tokens_seen": 10009808, "step": 11550 }, { "epoch": 5.4479019330504475, "grad_norm": 0.0023647614289075136, "learning_rate": 0.24236366166090004, "loss": 0.1846, "num_input_tokens_seen": 10013664, "step": 11555 }, { "epoch": 5.45025931164545, "grad_norm": 0.0027295362669974566, "learning_rate": 0.24231724130638527, "loss": 0.2002, "num_input_tokens_seen": 10018080, "step": 11560 }, { "epoch": 5.452616690240452, "grad_norm": 0.003507294226437807, "learning_rate": 0.2422708067153917, "loss": 0.1185, "num_input_tokens_seen": 10022272, "step": 11565 }, { "epoch": 5.454974068835455, "grad_norm": 0.00338916527107358, "learning_rate": 0.24222435789508026, "loss": 0.1491, "num_input_tokens_seen": 10027568, "step": 11570 }, { "epoch": 5.457331447430457, "grad_norm": 0.001665207790210843, "learning_rate": 0.24217789485261387, "loss": 0.3225, "num_input_tokens_seen": 10032192, "step": 11575 }, { "epoch": 5.4596888260254595, "grad_norm": 0.0014746248489245772, "learning_rate": 0.2421314175951577, "loss": 0.1864, "num_input_tokens_seen": 10036048, "step": 11580 }, { "epoch": 5.462046204620462, "grad_norm": 0.0025962763465940952, "learning_rate": 0.2420849261298791, "loss": 0.2641, "num_input_tokens_seen": 10041088, "step": 11585 }, { "epoch": 5.464403583215464, "grad_norm": 0.0024121664464473724, "learning_rate": 0.24203842046394775, "loss": 0.2086, "num_input_tokens_seen": 10044688, "step": 11590 }, { "epoch": 5.466760961810467, "grad_norm": 0.0016893640859052539, "learning_rate": 0.24199190060453535, "loss": 0.2387, "num_input_tokens_seen": 10048928, "step": 11595 }, { "epoch": 5.469118340405469, "grad_norm": 0.00309431622736156, "learning_rate": 0.2419453665588158, "loss": 0.2223, "num_input_tokens_seen": 10053056, "step": 11600 }, { "epoch": 5.469118340405469, "eval_loss": 0.22025780379772186, "eval_runtime": 21.8714, "eval_samples_per_second": 43.116, "eval_steps_per_second": 21.581, "num_input_tokens_seen": 10053056, "step": 11600 }, { "epoch": 5.4714757190004715, "grad_norm": 0.002789038699120283, "learning_rate": 0.24189881833396523, "loss": 0.2049, "num_input_tokens_seen": 10057584, "step": 11605 }, { "epoch": 5.473833097595474, "grad_norm": 0.003727812087163329, "learning_rate": 0.24185225593716203, "loss": 0.2558, "num_input_tokens_seen": 10062016, "step": 11610 }, { "epoch": 5.476190476190476, "grad_norm": 0.002059090416878462, "learning_rate": 0.2418056793755867, "loss": 0.2088, "num_input_tokens_seen": 10067376, "step": 11615 }, { "epoch": 5.478547854785479, "grad_norm": 0.005041768774390221, "learning_rate": 0.24175908865642187, "loss": 0.2837, "num_input_tokens_seen": 10071568, "step": 11620 }, { "epoch": 5.480905233380481, "grad_norm": 0.00348680280148983, "learning_rate": 0.24171248378685248, "loss": 0.2069, "num_input_tokens_seen": 10074976, "step": 11625 }, { "epoch": 5.4832626119754835, "grad_norm": 0.005859711207449436, "learning_rate": 0.24166586477406554, "loss": 0.2314, "num_input_tokens_seen": 10079776, "step": 11630 }, { "epoch": 5.485619990570486, "grad_norm": 0.004862848203629255, "learning_rate": 0.24161923162525034, "loss": 0.1996, "num_input_tokens_seen": 10084416, "step": 11635 }, { "epoch": 5.487977369165488, "grad_norm": 0.011332573369145393, "learning_rate": 0.2415725843475982, "loss": 0.2579, "num_input_tokens_seen": 10089536, "step": 11640 }, { "epoch": 5.490334747760491, "grad_norm": 0.0020823704544454813, "learning_rate": 0.24152592294830286, "loss": 0.2346, "num_input_tokens_seen": 10094112, "step": 11645 }, { "epoch": 5.492692126355493, "grad_norm": 0.001893607317470014, "learning_rate": 0.24147924743455995, "loss": 0.2308, "num_input_tokens_seen": 10098336, "step": 11650 }, { "epoch": 5.4950495049504955, "grad_norm": 0.0020629488863050938, "learning_rate": 0.24143255781356754, "loss": 0.2235, "num_input_tokens_seen": 10102144, "step": 11655 }, { "epoch": 5.497406883545497, "grad_norm": 0.004417514428496361, "learning_rate": 0.24138585409252566, "loss": 0.2239, "num_input_tokens_seen": 10106240, "step": 11660 }, { "epoch": 5.499764262140499, "grad_norm": 0.00349277607165277, "learning_rate": 0.24133913627863662, "loss": 0.2486, "num_input_tokens_seen": 10111296, "step": 11665 }, { "epoch": 5.502121640735502, "grad_norm": 0.0022747863549739122, "learning_rate": 0.241292404379105, "loss": 0.2318, "num_input_tokens_seen": 10115280, "step": 11670 }, { "epoch": 5.504479019330504, "grad_norm": 0.003932808060199022, "learning_rate": 0.24124565840113735, "loss": 0.2376, "num_input_tokens_seen": 10118896, "step": 11675 }, { "epoch": 5.506836397925507, "grad_norm": 0.00474990950897336, "learning_rate": 0.2411988983519425, "loss": 0.2583, "num_input_tokens_seen": 10123360, "step": 11680 }, { "epoch": 5.509193776520509, "grad_norm": 0.003137154970318079, "learning_rate": 0.24115212423873145, "loss": 0.2141, "num_input_tokens_seen": 10127584, "step": 11685 }, { "epoch": 5.511551155115511, "grad_norm": 0.002722302684560418, "learning_rate": 0.24110533606871737, "loss": 0.2196, "num_input_tokens_seen": 10132480, "step": 11690 }, { "epoch": 5.513908533710514, "grad_norm": 0.004700688645243645, "learning_rate": 0.24105853384911552, "loss": 0.2324, "num_input_tokens_seen": 10137040, "step": 11695 }, { "epoch": 5.516265912305516, "grad_norm": 0.003957216162234545, "learning_rate": 0.24101171758714346, "loss": 0.2514, "num_input_tokens_seen": 10141984, "step": 11700 }, { "epoch": 5.518623290900519, "grad_norm": 0.004483755677938461, "learning_rate": 0.24096488729002086, "loss": 0.246, "num_input_tokens_seen": 10146880, "step": 11705 }, { "epoch": 5.520980669495521, "grad_norm": 0.0032473516184836626, "learning_rate": 0.24091804296496946, "loss": 0.2421, "num_input_tokens_seen": 10150880, "step": 11710 }, { "epoch": 5.523338048090523, "grad_norm": 0.0028087906539440155, "learning_rate": 0.2408711846192133, "loss": 0.2552, "num_input_tokens_seen": 10155184, "step": 11715 }, { "epoch": 5.525695426685526, "grad_norm": 0.003793142270296812, "learning_rate": 0.24082431225997855, "loss": 0.1796, "num_input_tokens_seen": 10159040, "step": 11720 }, { "epoch": 5.528052805280528, "grad_norm": 0.002156143309548497, "learning_rate": 0.24077742589449344, "loss": 0.2584, "num_input_tokens_seen": 10162880, "step": 11725 }, { "epoch": 5.530410183875531, "grad_norm": 0.002776357578113675, "learning_rate": 0.24073052552998844, "loss": 0.1528, "num_input_tokens_seen": 10167664, "step": 11730 }, { "epoch": 5.532767562470533, "grad_norm": 0.0022845561616122723, "learning_rate": 0.2406836111736963, "loss": 0.239, "num_input_tokens_seen": 10171808, "step": 11735 }, { "epoch": 5.535124941065535, "grad_norm": 0.004417682997882366, "learning_rate": 0.2406366828328517, "loss": 0.1832, "num_input_tokens_seen": 10176064, "step": 11740 }, { "epoch": 5.537482319660538, "grad_norm": 0.003715238533914089, "learning_rate": 0.2405897405146915, "loss": 0.2835, "num_input_tokens_seen": 10181328, "step": 11745 }, { "epoch": 5.539839698255539, "grad_norm": 0.0050498549826443195, "learning_rate": 0.240542784226455, "loss": 0.237, "num_input_tokens_seen": 10185088, "step": 11750 }, { "epoch": 5.542197076850542, "grad_norm": 0.0045802053064107895, "learning_rate": 0.24049581397538328, "loss": 0.2427, "num_input_tokens_seen": 10189440, "step": 11755 }, { "epoch": 5.544554455445544, "grad_norm": 0.003715166822075844, "learning_rate": 0.24044882976871984, "loss": 0.2302, "num_input_tokens_seen": 10194096, "step": 11760 }, { "epoch": 5.5469118340405466, "grad_norm": 0.003746785456314683, "learning_rate": 0.2404018316137102, "loss": 0.2324, "num_input_tokens_seen": 10198240, "step": 11765 }, { "epoch": 5.549269212635549, "grad_norm": 0.0029834131710231304, "learning_rate": 0.24035481951760204, "loss": 0.1971, "num_input_tokens_seen": 10202640, "step": 11770 }, { "epoch": 5.551626591230551, "grad_norm": 0.0029536550864577293, "learning_rate": 0.2403077934876452, "loss": 0.1963, "num_input_tokens_seen": 10206672, "step": 11775 }, { "epoch": 5.553983969825554, "grad_norm": 0.0037315948866307735, "learning_rate": 0.2402607535310918, "loss": 0.209, "num_input_tokens_seen": 10210720, "step": 11780 }, { "epoch": 5.556341348420556, "grad_norm": 0.002326107118278742, "learning_rate": 0.2402136996551959, "loss": 0.2134, "num_input_tokens_seen": 10215152, "step": 11785 }, { "epoch": 5.558698727015559, "grad_norm": 0.0046619838103652, "learning_rate": 0.24016663186721376, "loss": 0.2405, "num_input_tokens_seen": 10219536, "step": 11790 }, { "epoch": 5.561056105610561, "grad_norm": 0.0037690310273319483, "learning_rate": 0.24011955017440395, "loss": 0.1996, "num_input_tokens_seen": 10225104, "step": 11795 }, { "epoch": 5.563413484205563, "grad_norm": 0.004508114419877529, "learning_rate": 0.24007245458402696, "loss": 0.2419, "num_input_tokens_seen": 10229152, "step": 11800 }, { "epoch": 5.563413484205563, "eval_loss": 0.22546260058879852, "eval_runtime": 21.8944, "eval_samples_per_second": 43.07, "eval_steps_per_second": 21.558, "num_input_tokens_seen": 10229152, "step": 11800 }, { "epoch": 5.565770862800566, "grad_norm": 0.003787621157243848, "learning_rate": 0.2400253451033456, "loss": 0.241, "num_input_tokens_seen": 10233136, "step": 11805 }, { "epoch": 5.568128241395568, "grad_norm": 0.003572129411622882, "learning_rate": 0.23997822173962463, "loss": 0.2237, "num_input_tokens_seen": 10237696, "step": 11810 }, { "epoch": 5.570485619990571, "grad_norm": 0.0032877796329557896, "learning_rate": 0.23993108450013118, "loss": 0.2348, "num_input_tokens_seen": 10241504, "step": 11815 }, { "epoch": 5.572842998585573, "grad_norm": 0.0030637597665190697, "learning_rate": 0.2398839333921343, "loss": 0.2539, "num_input_tokens_seen": 10246064, "step": 11820 }, { "epoch": 5.575200377180575, "grad_norm": 0.0040597496554255486, "learning_rate": 0.23983676842290536, "loss": 0.2322, "num_input_tokens_seen": 10250736, "step": 11825 }, { "epoch": 5.577557755775578, "grad_norm": 0.005836512427777052, "learning_rate": 0.2397895895997178, "loss": 0.2508, "num_input_tokens_seen": 10255424, "step": 11830 }, { "epoch": 5.57991513437058, "grad_norm": 0.003832130925729871, "learning_rate": 0.23974239692984714, "loss": 0.23, "num_input_tokens_seen": 10259696, "step": 11835 }, { "epoch": 5.582272512965583, "grad_norm": 0.002438921947032213, "learning_rate": 0.2396951904205711, "loss": 0.2204, "num_input_tokens_seen": 10263712, "step": 11840 }, { "epoch": 5.584629891560585, "grad_norm": 0.004780299961566925, "learning_rate": 0.23964797007916952, "loss": 0.2383, "num_input_tokens_seen": 10269376, "step": 11845 }, { "epoch": 5.586987270155587, "grad_norm": 0.005251883529126644, "learning_rate": 0.23960073591292436, "loss": 0.2254, "num_input_tokens_seen": 10273488, "step": 11850 }, { "epoch": 5.58934464875059, "grad_norm": 0.004347655922174454, "learning_rate": 0.2395534879291197, "loss": 0.2675, "num_input_tokens_seen": 10277248, "step": 11855 }, { "epoch": 5.591702027345592, "grad_norm": 0.006654328666627407, "learning_rate": 0.23950622613504186, "loss": 0.2357, "num_input_tokens_seen": 10282784, "step": 11860 }, { "epoch": 5.594059405940594, "grad_norm": 0.00681394012644887, "learning_rate": 0.2394589505379791, "loss": 0.2429, "num_input_tokens_seen": 10287856, "step": 11865 }, { "epoch": 5.596416784535596, "grad_norm": 0.003026197897270322, "learning_rate": 0.23941166114522197, "loss": 0.2344, "num_input_tokens_seen": 10292128, "step": 11870 }, { "epoch": 5.5987741631305985, "grad_norm": 0.0019126959377899766, "learning_rate": 0.23936435796406308, "loss": 0.2296, "num_input_tokens_seen": 10296912, "step": 11875 }, { "epoch": 5.601131541725601, "grad_norm": 0.0026859338395297527, "learning_rate": 0.23931704100179715, "loss": 0.1882, "num_input_tokens_seen": 10302112, "step": 11880 }, { "epoch": 5.603488920320603, "grad_norm": 0.0031750337220728397, "learning_rate": 0.2392697102657211, "loss": 0.2381, "num_input_tokens_seen": 10306352, "step": 11885 }, { "epoch": 5.605846298915606, "grad_norm": 0.004438304342329502, "learning_rate": 0.23922236576313388, "loss": 0.2782, "num_input_tokens_seen": 10310464, "step": 11890 }, { "epoch": 5.608203677510608, "grad_norm": 0.0038150011096149683, "learning_rate": 0.2391750075013366, "loss": 0.2299, "num_input_tokens_seen": 10315120, "step": 11895 }, { "epoch": 5.6105610561056105, "grad_norm": 0.006884389091283083, "learning_rate": 0.2391276354876326, "loss": 0.2324, "num_input_tokens_seen": 10319344, "step": 11900 }, { "epoch": 5.612918434700613, "grad_norm": 0.002862598979845643, "learning_rate": 0.23908024972932707, "loss": 0.2308, "num_input_tokens_seen": 10323776, "step": 11905 }, { "epoch": 5.615275813295615, "grad_norm": 0.0035583414137363434, "learning_rate": 0.2390328502337276, "loss": 0.2251, "num_input_tokens_seen": 10327552, "step": 11910 }, { "epoch": 5.617633191890618, "grad_norm": 0.0027046960312873125, "learning_rate": 0.23898543700814376, "loss": 0.2342, "num_input_tokens_seen": 10331648, "step": 11915 }, { "epoch": 5.61999057048562, "grad_norm": 0.0032091052271425724, "learning_rate": 0.2389380100598873, "loss": 0.229, "num_input_tokens_seen": 10336112, "step": 11920 }, { "epoch": 5.6223479490806225, "grad_norm": 0.005163058638572693, "learning_rate": 0.23889056939627207, "loss": 0.2332, "num_input_tokens_seen": 10340640, "step": 11925 }, { "epoch": 5.624705327675625, "grad_norm": 0.003621079493314028, "learning_rate": 0.23884311502461386, "loss": 0.2067, "num_input_tokens_seen": 10344896, "step": 11930 }, { "epoch": 5.627062706270627, "grad_norm": 0.002058083424344659, "learning_rate": 0.23879564695223088, "loss": 0.1926, "num_input_tokens_seen": 10349136, "step": 11935 }, { "epoch": 5.62942008486563, "grad_norm": 0.005798717495054007, "learning_rate": 0.23874816518644332, "loss": 0.2479, "num_input_tokens_seen": 10353360, "step": 11940 }, { "epoch": 5.631777463460632, "grad_norm": 0.005166816525161266, "learning_rate": 0.23870066973457335, "loss": 0.2594, "num_input_tokens_seen": 10358000, "step": 11945 }, { "epoch": 5.634134842055634, "grad_norm": 0.010093100368976593, "learning_rate": 0.23865316060394545, "loss": 0.2614, "num_input_tokens_seen": 10361872, "step": 11950 }, { "epoch": 5.636492220650636, "grad_norm": 0.010243802331387997, "learning_rate": 0.2386056378018861, "loss": 0.2383, "num_input_tokens_seen": 10366416, "step": 11955 }, { "epoch": 5.6388495992456384, "grad_norm": 0.004981286823749542, "learning_rate": 0.2385581013357239, "loss": 0.2217, "num_input_tokens_seen": 10370672, "step": 11960 }, { "epoch": 5.641206977840641, "grad_norm": 0.007122271228581667, "learning_rate": 0.23851055121278958, "loss": 0.2333, "num_input_tokens_seen": 10375088, "step": 11965 }, { "epoch": 5.643564356435643, "grad_norm": 0.00337813189253211, "learning_rate": 0.23846298744041594, "loss": 0.2159, "num_input_tokens_seen": 10379712, "step": 11970 }, { "epoch": 5.645921735030646, "grad_norm": 0.002766592428088188, "learning_rate": 0.23841541002593802, "loss": 0.2368, "num_input_tokens_seen": 10383392, "step": 11975 }, { "epoch": 5.648279113625648, "grad_norm": 0.002178257331252098, "learning_rate": 0.23836781897669276, "loss": 0.1975, "num_input_tokens_seen": 10387328, "step": 11980 }, { "epoch": 5.6506364922206505, "grad_norm": 0.002414044924080372, "learning_rate": 0.23832021430001926, "loss": 0.2219, "num_input_tokens_seen": 10391760, "step": 11985 }, { "epoch": 5.652993870815653, "grad_norm": 0.004736360628157854, "learning_rate": 0.2382725960032588, "loss": 0.2652, "num_input_tokens_seen": 10396480, "step": 11990 }, { "epoch": 5.655351249410655, "grad_norm": 0.0019061478087678552, "learning_rate": 0.23822496409375482, "loss": 0.2628, "num_input_tokens_seen": 10400448, "step": 11995 }, { "epoch": 5.657708628005658, "grad_norm": 0.002871745266020298, "learning_rate": 0.2381773185788526, "loss": 0.2415, "num_input_tokens_seen": 10404384, "step": 12000 }, { "epoch": 5.657708628005658, "eval_loss": 0.22231729328632355, "eval_runtime": 21.9168, "eval_samples_per_second": 43.026, "eval_steps_per_second": 21.536, "num_input_tokens_seen": 10404384, "step": 12000 }, { "epoch": 5.66006600660066, "grad_norm": 0.0013145023258402944, "learning_rate": 0.2381296594658998, "loss": 0.2141, "num_input_tokens_seen": 10408064, "step": 12005 }, { "epoch": 5.6624233851956625, "grad_norm": 0.002741508651524782, "learning_rate": 0.238081986762246, "loss": 0.2301, "num_input_tokens_seen": 10412800, "step": 12010 }, { "epoch": 5.664780763790665, "grad_norm": 0.002253303537145257, "learning_rate": 0.23803430047524293, "loss": 0.2489, "num_input_tokens_seen": 10416640, "step": 12015 }, { "epoch": 5.667138142385667, "grad_norm": 0.0020363039802759886, "learning_rate": 0.23798660061224441, "loss": 0.2351, "num_input_tokens_seen": 10420736, "step": 12020 }, { "epoch": 5.66949552098067, "grad_norm": 0.0017612709198147058, "learning_rate": 0.23793888718060632, "loss": 0.2304, "num_input_tokens_seen": 10424976, "step": 12025 }, { "epoch": 5.671852899575672, "grad_norm": 0.0014409400755539536, "learning_rate": 0.23789116018768675, "loss": 0.2134, "num_input_tokens_seen": 10428768, "step": 12030 }, { "epoch": 5.6742102781706745, "grad_norm": 0.0018772990442812443, "learning_rate": 0.2378434196408458, "loss": 0.2209, "num_input_tokens_seen": 10432928, "step": 12035 }, { "epoch": 5.676567656765677, "grad_norm": 0.0033909196499735117, "learning_rate": 0.23779566554744563, "loss": 0.251, "num_input_tokens_seen": 10437168, "step": 12040 }, { "epoch": 5.678925035360679, "grad_norm": 0.0013322409940883517, "learning_rate": 0.23774789791485051, "loss": 0.2322, "num_input_tokens_seen": 10441632, "step": 12045 }, { "epoch": 5.681282413955682, "grad_norm": 0.00291747716255486, "learning_rate": 0.2377001167504268, "loss": 0.2165, "num_input_tokens_seen": 10445216, "step": 12050 }, { "epoch": 5.683639792550684, "grad_norm": 0.0015680603682994843, "learning_rate": 0.23765232206154302, "loss": 0.2202, "num_input_tokens_seen": 10450032, "step": 12055 }, { "epoch": 5.6859971711456865, "grad_norm": 0.0031468949746340513, "learning_rate": 0.23760451385556966, "loss": 0.2188, "num_input_tokens_seen": 10453856, "step": 12060 }, { "epoch": 5.688354549740688, "grad_norm": 0.0019345019245520234, "learning_rate": 0.23755669213987932, "loss": 0.2391, "num_input_tokens_seen": 10458832, "step": 12065 }, { "epoch": 5.69071192833569, "grad_norm": 0.0028907679952681065, "learning_rate": 0.23750885692184676, "loss": 0.2027, "num_input_tokens_seen": 10462768, "step": 12070 }, { "epoch": 5.693069306930693, "grad_norm": 0.0021425168961286545, "learning_rate": 0.23746100820884875, "loss": 0.2507, "num_input_tokens_seen": 10467360, "step": 12075 }, { "epoch": 5.695426685525695, "grad_norm": 0.003289925865828991, "learning_rate": 0.23741314600826421, "loss": 0.2362, "num_input_tokens_seen": 10471920, "step": 12080 }, { "epoch": 5.697784064120698, "grad_norm": 0.002098443917930126, "learning_rate": 0.23736527032747406, "loss": 0.2016, "num_input_tokens_seen": 10476496, "step": 12085 }, { "epoch": 5.7001414427157, "grad_norm": 0.0027826421428471804, "learning_rate": 0.23731738117386128, "loss": 0.2337, "num_input_tokens_seen": 10480528, "step": 12090 }, { "epoch": 5.702498821310702, "grad_norm": 0.002512238686904311, "learning_rate": 0.237269478554811, "loss": 0.2327, "num_input_tokens_seen": 10485536, "step": 12095 }, { "epoch": 5.704856199905705, "grad_norm": 0.002275969833135605, "learning_rate": 0.23722156247771053, "loss": 0.2163, "num_input_tokens_seen": 10490032, "step": 12100 }, { "epoch": 5.707213578500707, "grad_norm": 0.0034169454593211412, "learning_rate": 0.23717363294994895, "loss": 0.2066, "num_input_tokens_seen": 10493920, "step": 12105 }, { "epoch": 5.70957095709571, "grad_norm": 0.004329087678343058, "learning_rate": 0.2371256899789177, "loss": 0.2581, "num_input_tokens_seen": 10497408, "step": 12110 }, { "epoch": 5.711928335690712, "grad_norm": 0.0017217560671269894, "learning_rate": 0.23707773357201017, "loss": 0.2446, "num_input_tokens_seen": 10501152, "step": 12115 }, { "epoch": 5.714285714285714, "grad_norm": 0.004449272062629461, "learning_rate": 0.2370297637366218, "loss": 0.2367, "num_input_tokens_seen": 10505360, "step": 12120 }, { "epoch": 5.716643092880717, "grad_norm": 0.002114554401487112, "learning_rate": 0.23698178048015026, "loss": 0.219, "num_input_tokens_seen": 10509600, "step": 12125 }, { "epoch": 5.719000471475719, "grad_norm": 0.0022752247750759125, "learning_rate": 0.236933783809995, "loss": 0.1937, "num_input_tokens_seen": 10513616, "step": 12130 }, { "epoch": 5.721357850070722, "grad_norm": 0.0014830430736765265, "learning_rate": 0.23688577373355785, "loss": 0.2348, "num_input_tokens_seen": 10517728, "step": 12135 }, { "epoch": 5.723715228665724, "grad_norm": 0.011997158639132977, "learning_rate": 0.23683775025824247, "loss": 0.2596, "num_input_tokens_seen": 10522352, "step": 12140 }, { "epoch": 5.726072607260726, "grad_norm": 0.002105731749907136, "learning_rate": 0.2367897133914548, "loss": 0.1941, "num_input_tokens_seen": 10526368, "step": 12145 }, { "epoch": 5.728429985855728, "grad_norm": 0.0030811415053904057, "learning_rate": 0.2367416631406026, "loss": 0.2477, "num_input_tokens_seen": 10531104, "step": 12150 }, { "epoch": 5.73078736445073, "grad_norm": 0.002786726225167513, "learning_rate": 0.23669359951309588, "loss": 0.2327, "num_input_tokens_seen": 10534800, "step": 12155 }, { "epoch": 5.733144743045733, "grad_norm": 0.003152052639052272, "learning_rate": 0.23664552251634666, "loss": 0.2348, "num_input_tokens_seen": 10539056, "step": 12160 }, { "epoch": 5.735502121640735, "grad_norm": 0.002401920733973384, "learning_rate": 0.23659743215776907, "loss": 0.2309, "num_input_tokens_seen": 10542704, "step": 12165 }, { "epoch": 5.7378595002357375, "grad_norm": 0.0027268915437161922, "learning_rate": 0.23654932844477908, "loss": 0.2289, "num_input_tokens_seen": 10546832, "step": 12170 }, { "epoch": 5.74021687883074, "grad_norm": 0.004867551848292351, "learning_rate": 0.23650121138479507, "loss": 0.2331, "num_input_tokens_seen": 10551328, "step": 12175 }, { "epoch": 5.742574257425742, "grad_norm": 0.0012184649240225554, "learning_rate": 0.23645308098523724, "loss": 0.2315, "num_input_tokens_seen": 10555056, "step": 12180 }, { "epoch": 5.744931636020745, "grad_norm": 0.002394026145339012, "learning_rate": 0.23640493725352785, "loss": 0.2078, "num_input_tokens_seen": 10559680, "step": 12185 }, { "epoch": 5.747289014615747, "grad_norm": 0.0033655078150331974, "learning_rate": 0.2363567801970913, "loss": 0.2248, "num_input_tokens_seen": 10564480, "step": 12190 }, { "epoch": 5.7496463932107496, "grad_norm": 0.0030171151738613844, "learning_rate": 0.236308609823354, "loss": 0.2416, "num_input_tokens_seen": 10568672, "step": 12195 }, { "epoch": 5.752003771805752, "grad_norm": 0.002261737361550331, "learning_rate": 0.23626042613974452, "loss": 0.2475, "num_input_tokens_seen": 10573872, "step": 12200 }, { "epoch": 5.752003771805752, "eval_loss": 0.22059963643550873, "eval_runtime": 21.9231, "eval_samples_per_second": 43.014, "eval_steps_per_second": 21.53, "num_input_tokens_seen": 10573872, "step": 12200 }, { "epoch": 5.754361150400754, "grad_norm": 0.001770464819855988, "learning_rate": 0.23621222915369325, "loss": 0.2076, "num_input_tokens_seen": 10578272, "step": 12205 }, { "epoch": 5.756718528995757, "grad_norm": 0.0026349718682467937, "learning_rate": 0.23616401887263283, "loss": 0.2184, "num_input_tokens_seen": 10582000, "step": 12210 }, { "epoch": 5.759075907590759, "grad_norm": 0.0030838397797197104, "learning_rate": 0.23611579530399793, "loss": 0.2469, "num_input_tokens_seen": 10586720, "step": 12215 }, { "epoch": 5.761433286185762, "grad_norm": 0.0024073028471320868, "learning_rate": 0.23606755845522517, "loss": 0.2418, "num_input_tokens_seen": 10590960, "step": 12220 }, { "epoch": 5.763790664780764, "grad_norm": 0.0019175256602466106, "learning_rate": 0.23601930833375329, "loss": 0.2355, "num_input_tokens_seen": 10595568, "step": 12225 }, { "epoch": 5.766148043375766, "grad_norm": 0.0023678780999034643, "learning_rate": 0.23597104494702312, "loss": 0.2143, "num_input_tokens_seen": 10599392, "step": 12230 }, { "epoch": 5.768505421970769, "grad_norm": 0.003325186436995864, "learning_rate": 0.23592276830247744, "loss": 0.2857, "num_input_tokens_seen": 10603824, "step": 12235 }, { "epoch": 5.770862800565771, "grad_norm": 0.0029391483403742313, "learning_rate": 0.2358744784075611, "loss": 0.2459, "num_input_tokens_seen": 10608064, "step": 12240 }, { "epoch": 5.773220179160774, "grad_norm": 0.0034665463026612997, "learning_rate": 0.235826175269721, "loss": 0.2294, "num_input_tokens_seen": 10613536, "step": 12245 }, { "epoch": 5.775577557755776, "grad_norm": 0.001762371277436614, "learning_rate": 0.23577785889640612, "loss": 0.2214, "num_input_tokens_seen": 10618176, "step": 12250 }, { "epoch": 5.777934936350778, "grad_norm": 0.0037857822608202696, "learning_rate": 0.23572952929506744, "loss": 0.2062, "num_input_tokens_seen": 10623536, "step": 12255 }, { "epoch": 5.780292314945781, "grad_norm": 0.001770454808138311, "learning_rate": 0.23568118647315803, "loss": 0.2368, "num_input_tokens_seen": 10628336, "step": 12260 }, { "epoch": 5.782649693540782, "grad_norm": 0.0031121803913265467, "learning_rate": 0.23563283043813296, "loss": 0.2299, "num_input_tokens_seen": 10632336, "step": 12265 }, { "epoch": 5.785007072135785, "grad_norm": 0.002708704676479101, "learning_rate": 0.23558446119744922, "loss": 0.1791, "num_input_tokens_seen": 10636960, "step": 12270 }, { "epoch": 5.787364450730787, "grad_norm": 0.0024953093379735947, "learning_rate": 0.23553607875856608, "loss": 0.2755, "num_input_tokens_seen": 10640960, "step": 12275 }, { "epoch": 5.7897218293257895, "grad_norm": 0.0020556319504976273, "learning_rate": 0.2354876831289447, "loss": 0.2446, "num_input_tokens_seen": 10644544, "step": 12280 }, { "epoch": 5.792079207920792, "grad_norm": 0.003997668623924255, "learning_rate": 0.23543927431604827, "loss": 0.2359, "num_input_tokens_seen": 10648272, "step": 12285 }, { "epoch": 5.794436586515794, "grad_norm": 0.0042749145068228245, "learning_rate": 0.23539085232734203, "loss": 0.2499, "num_input_tokens_seen": 10651920, "step": 12290 }, { "epoch": 5.796793965110797, "grad_norm": 0.0029182955622673035, "learning_rate": 0.2353424171702933, "loss": 0.2514, "num_input_tokens_seen": 10656288, "step": 12295 }, { "epoch": 5.799151343705799, "grad_norm": 0.0016234625363722444, "learning_rate": 0.23529396885237133, "loss": 0.2362, "num_input_tokens_seen": 10660304, "step": 12300 }, { "epoch": 5.8015087223008015, "grad_norm": 0.002593948505818844, "learning_rate": 0.2352455073810475, "loss": 0.2207, "num_input_tokens_seen": 10664608, "step": 12305 }, { "epoch": 5.803866100895804, "grad_norm": 0.002961423946544528, "learning_rate": 0.23519703276379517, "loss": 0.2047, "num_input_tokens_seen": 10670064, "step": 12310 }, { "epoch": 5.806223479490806, "grad_norm": 0.001962879905477166, "learning_rate": 0.2351485450080897, "loss": 0.2564, "num_input_tokens_seen": 10674928, "step": 12315 }, { "epoch": 5.808580858085809, "grad_norm": 0.0020464598201215267, "learning_rate": 0.2351000441214086, "loss": 0.228, "num_input_tokens_seen": 10679264, "step": 12320 }, { "epoch": 5.810938236680811, "grad_norm": 0.002720191376283765, "learning_rate": 0.23505153011123125, "loss": 0.2269, "num_input_tokens_seen": 10683376, "step": 12325 }, { "epoch": 5.8132956152758135, "grad_norm": 0.0031840079464018345, "learning_rate": 0.23500300298503912, "loss": 0.1967, "num_input_tokens_seen": 10686928, "step": 12330 }, { "epoch": 5.815652993870816, "grad_norm": 0.0030439544934779406, "learning_rate": 0.23495446275031576, "loss": 0.2141, "num_input_tokens_seen": 10691632, "step": 12335 }, { "epoch": 5.818010372465818, "grad_norm": 0.0018445487366989255, "learning_rate": 0.2349059094145466, "loss": 0.2451, "num_input_tokens_seen": 10696096, "step": 12340 }, { "epoch": 5.820367751060821, "grad_norm": 0.003526841988787055, "learning_rate": 0.2348573429852192, "loss": 0.2664, "num_input_tokens_seen": 10700544, "step": 12345 }, { "epoch": 5.822725129655822, "grad_norm": 0.001826312392950058, "learning_rate": 0.23480876346982313, "loss": 0.2402, "num_input_tokens_seen": 10704944, "step": 12350 }, { "epoch": 5.825082508250825, "grad_norm": 0.00144754140637815, "learning_rate": 0.23476017087585, "loss": 0.2293, "num_input_tokens_seen": 10709632, "step": 12355 }, { "epoch": 5.827439886845827, "grad_norm": 0.002339455299079418, "learning_rate": 0.23471156521079334, "loss": 0.226, "num_input_tokens_seen": 10714336, "step": 12360 }, { "epoch": 5.829797265440829, "grad_norm": 0.002207937417551875, "learning_rate": 0.23466294648214875, "loss": 0.1906, "num_input_tokens_seen": 10719056, "step": 12365 }, { "epoch": 5.832154644035832, "grad_norm": 0.0034571720752865076, "learning_rate": 0.2346143146974139, "loss": 0.2498, "num_input_tokens_seen": 10722736, "step": 12370 }, { "epoch": 5.834512022630834, "grad_norm": 0.0023815277963876724, "learning_rate": 0.23456566986408836, "loss": 0.2589, "num_input_tokens_seen": 10727104, "step": 12375 }, { "epoch": 5.836869401225837, "grad_norm": 0.001980734756216407, "learning_rate": 0.23451701198967384, "loss": 0.2139, "num_input_tokens_seen": 10731504, "step": 12380 }, { "epoch": 5.839226779820839, "grad_norm": 0.001704265596345067, "learning_rate": 0.23446834108167397, "loss": 0.2064, "num_input_tokens_seen": 10735312, "step": 12385 }, { "epoch": 5.841584158415841, "grad_norm": 0.002763498807325959, "learning_rate": 0.23441965714759438, "loss": 0.2384, "num_input_tokens_seen": 10738928, "step": 12390 }, { "epoch": 5.843941537010844, "grad_norm": 0.0023877169005572796, "learning_rate": 0.23437096019494277, "loss": 0.2521, "num_input_tokens_seen": 10743568, "step": 12395 }, { "epoch": 5.846298915605846, "grad_norm": 0.0027713722083717585, "learning_rate": 0.23432225023122885, "loss": 0.2396, "num_input_tokens_seen": 10748304, "step": 12400 }, { "epoch": 5.846298915605846, "eval_loss": 0.2352769374847412, "eval_runtime": 21.8727, "eval_samples_per_second": 43.113, "eval_steps_per_second": 21.579, "num_input_tokens_seen": 10748304, "step": 12400 }, { "epoch": 5.848656294200849, "grad_norm": 0.0020673151593655348, "learning_rate": 0.23427352726396428, "loss": 0.2414, "num_input_tokens_seen": 10752736, "step": 12405 }, { "epoch": 5.851013672795851, "grad_norm": 0.0024210019037127495, "learning_rate": 0.2342247913006628, "loss": 0.2222, "num_input_tokens_seen": 10757216, "step": 12410 }, { "epoch": 5.8533710513908535, "grad_norm": 0.001864079968072474, "learning_rate": 0.23417604234883999, "loss": 0.2299, "num_input_tokens_seen": 10761056, "step": 12415 }, { "epoch": 5.855728429985856, "grad_norm": 0.0025835155975073576, "learning_rate": 0.23412728041601363, "loss": 0.2279, "num_input_tokens_seen": 10764784, "step": 12420 }, { "epoch": 5.858085808580858, "grad_norm": 0.0016529257409274578, "learning_rate": 0.23407850550970347, "loss": 0.2741, "num_input_tokens_seen": 10769296, "step": 12425 }, { "epoch": 5.860443187175861, "grad_norm": 0.0013685290468856692, "learning_rate": 0.23402971763743116, "loss": 0.2394, "num_input_tokens_seen": 10773200, "step": 12430 }, { "epoch": 5.862800565770863, "grad_norm": 0.001540546421892941, "learning_rate": 0.23398091680672037, "loss": 0.2498, "num_input_tokens_seen": 10777568, "step": 12435 }, { "epoch": 5.8651579443658655, "grad_norm": 0.0034385027829557657, "learning_rate": 0.23393210302509687, "loss": 0.24, "num_input_tokens_seen": 10782112, "step": 12440 }, { "epoch": 5.867515322960868, "grad_norm": 0.0028305246960371733, "learning_rate": 0.23388327630008832, "loss": 0.2464, "num_input_tokens_seen": 10786176, "step": 12445 }, { "epoch": 5.86987270155587, "grad_norm": 0.0017376882024109364, "learning_rate": 0.23383443663922443, "loss": 0.234, "num_input_tokens_seen": 10790592, "step": 12450 }, { "epoch": 5.872230080150873, "grad_norm": 0.0033119251020252705, "learning_rate": 0.23378558405003685, "loss": 0.2365, "num_input_tokens_seen": 10794480, "step": 12455 }, { "epoch": 5.874587458745875, "grad_norm": 0.0028617975767701864, "learning_rate": 0.2337367185400593, "loss": 0.2187, "num_input_tokens_seen": 10799120, "step": 12460 }, { "epoch": 5.876944837340877, "grad_norm": 0.00145920820068568, "learning_rate": 0.23368784011682747, "loss": 0.224, "num_input_tokens_seen": 10803936, "step": 12465 }, { "epoch": 5.879302215935879, "grad_norm": 0.0030155812855809927, "learning_rate": 0.23363894878787902, "loss": 0.2395, "num_input_tokens_seen": 10807936, "step": 12470 }, { "epoch": 5.881659594530881, "grad_norm": 0.0026623005978763103, "learning_rate": 0.23359004456075352, "loss": 0.2415, "num_input_tokens_seen": 10812032, "step": 12475 }, { "epoch": 5.884016973125884, "grad_norm": 0.0015894895186647773, "learning_rate": 0.23354112744299277, "loss": 0.2272, "num_input_tokens_seen": 10816032, "step": 12480 }, { "epoch": 5.886374351720886, "grad_norm": 0.001163095817901194, "learning_rate": 0.2334921974421403, "loss": 0.2324, "num_input_tokens_seen": 10819840, "step": 12485 }, { "epoch": 5.888731730315889, "grad_norm": 0.0015502591850236058, "learning_rate": 0.23344325456574178, "loss": 0.2084, "num_input_tokens_seen": 10823920, "step": 12490 }, { "epoch": 5.891089108910891, "grad_norm": 0.0013511625584214926, "learning_rate": 0.23339429882134477, "loss": 0.1981, "num_input_tokens_seen": 10827776, "step": 12495 }, { "epoch": 5.893446487505893, "grad_norm": 0.0033754869364202023, "learning_rate": 0.23334533021649884, "loss": 0.2348, "num_input_tokens_seen": 10833952, "step": 12500 }, { "epoch": 5.895803866100896, "grad_norm": 0.0011947082821279764, "learning_rate": 0.23329634875875566, "loss": 0.2233, "num_input_tokens_seen": 10837664, "step": 12505 }, { "epoch": 5.898161244695898, "grad_norm": 0.0013754245592281222, "learning_rate": 0.23324735445566874, "loss": 0.2927, "num_input_tokens_seen": 10841616, "step": 12510 }, { "epoch": 5.900518623290901, "grad_norm": 0.0012503743637353182, "learning_rate": 0.2331983473147936, "loss": 0.2141, "num_input_tokens_seen": 10845760, "step": 12515 }, { "epoch": 5.902876001885903, "grad_norm": 0.0026672605890780687, "learning_rate": 0.23314932734368776, "loss": 0.2235, "num_input_tokens_seen": 10850128, "step": 12520 }, { "epoch": 5.905233380480905, "grad_norm": 0.003437110222876072, "learning_rate": 0.2331002945499107, "loss": 0.2335, "num_input_tokens_seen": 10854448, "step": 12525 }, { "epoch": 5.907590759075908, "grad_norm": 0.0038402690552175045, "learning_rate": 0.23305124894102397, "loss": 0.2118, "num_input_tokens_seen": 10858656, "step": 12530 }, { "epoch": 5.90994813767091, "grad_norm": 0.004162314813584089, "learning_rate": 0.23300219052459092, "loss": 0.2584, "num_input_tokens_seen": 10863216, "step": 12535 }, { "epoch": 5.912305516265913, "grad_norm": 0.0019859131425619125, "learning_rate": 0.23295311930817708, "loss": 0.2433, "num_input_tokens_seen": 10867168, "step": 12540 }, { "epoch": 5.914662894860915, "grad_norm": 0.0019307526526972651, "learning_rate": 0.23290403529934972, "loss": 0.2285, "num_input_tokens_seen": 10871312, "step": 12545 }, { "epoch": 5.9170202734559165, "grad_norm": 0.0045627751387655735, "learning_rate": 0.23285493850567832, "loss": 0.2366, "num_input_tokens_seen": 10876528, "step": 12550 }, { "epoch": 5.919377652050919, "grad_norm": 0.0032950923778116703, "learning_rate": 0.23280582893473414, "loss": 0.2286, "num_input_tokens_seen": 10881120, "step": 12555 }, { "epoch": 5.921735030645921, "grad_norm": 0.003994523081928492, "learning_rate": 0.2327567065940906, "loss": 0.2379, "num_input_tokens_seen": 10884832, "step": 12560 }, { "epoch": 5.924092409240924, "grad_norm": 0.005103159695863724, "learning_rate": 0.23270757149132285, "loss": 0.2384, "num_input_tokens_seen": 10888816, "step": 12565 }, { "epoch": 5.926449787835926, "grad_norm": 0.00353094725869596, "learning_rate": 0.23265842363400827, "loss": 0.1811, "num_input_tokens_seen": 10892448, "step": 12570 }, { "epoch": 5.9288071664309285, "grad_norm": 0.005643037613481283, "learning_rate": 0.23260926302972595, "loss": 0.2172, "num_input_tokens_seen": 10896688, "step": 12575 }, { "epoch": 5.931164545025931, "grad_norm": 0.003823287785053253, "learning_rate": 0.2325600896860572, "loss": 0.2654, "num_input_tokens_seen": 10900736, "step": 12580 }, { "epoch": 5.933521923620933, "grad_norm": 0.002154023153707385, "learning_rate": 0.23251090361058505, "loss": 0.2544, "num_input_tokens_seen": 10905344, "step": 12585 }, { "epoch": 5.935879302215936, "grad_norm": 0.0033004574943333864, "learning_rate": 0.23246170481089476, "loss": 0.2598, "num_input_tokens_seen": 10909616, "step": 12590 }, { "epoch": 5.938236680810938, "grad_norm": 0.001611672225408256, "learning_rate": 0.23241249329457317, "loss": 0.2256, "num_input_tokens_seen": 10913840, "step": 12595 }, { "epoch": 5.9405940594059405, "grad_norm": 0.00158850965090096, "learning_rate": 0.23236326906920957, "loss": 0.1792, "num_input_tokens_seen": 10917920, "step": 12600 }, { "epoch": 5.9405940594059405, "eval_loss": 0.24834607541561127, "eval_runtime": 21.88, "eval_samples_per_second": 43.099, "eval_steps_per_second": 21.572, "num_input_tokens_seen": 10917920, "step": 12600 }, { "epoch": 5.942951438000943, "grad_norm": 0.005649749655276537, "learning_rate": 0.2323140321423948, "loss": 0.3518, "num_input_tokens_seen": 10922128, "step": 12605 }, { "epoch": 5.945308816595945, "grad_norm": 0.0037630286533385515, "learning_rate": 0.23226478252172184, "loss": 0.2146, "num_input_tokens_seen": 10926528, "step": 12610 }, { "epoch": 5.947666195190948, "grad_norm": 0.00246005249209702, "learning_rate": 0.23221552021478561, "loss": 0.1904, "num_input_tokens_seen": 10930816, "step": 12615 }, { "epoch": 5.95002357378595, "grad_norm": 0.0024156710132956505, "learning_rate": 0.232166245229183, "loss": 0.2159, "num_input_tokens_seen": 10935488, "step": 12620 }, { "epoch": 5.9523809523809526, "grad_norm": 0.0016405563801527023, "learning_rate": 0.2321169575725128, "loss": 0.2412, "num_input_tokens_seen": 10939616, "step": 12625 }, { "epoch": 5.954738330975955, "grad_norm": 0.0015273899771273136, "learning_rate": 0.23206765725237577, "loss": 0.2173, "num_input_tokens_seen": 10943616, "step": 12630 }, { "epoch": 5.957095709570957, "grad_norm": 0.0014597258996218443, "learning_rate": 0.2320183442763747, "loss": 0.2216, "num_input_tokens_seen": 10947456, "step": 12635 }, { "epoch": 5.95945308816596, "grad_norm": 0.003078373847529292, "learning_rate": 0.23196901865211422, "loss": 0.2405, "num_input_tokens_seen": 10951744, "step": 12640 }, { "epoch": 5.961810466760962, "grad_norm": 0.002240431262180209, "learning_rate": 0.231919680387201, "loss": 0.1852, "num_input_tokens_seen": 10956416, "step": 12645 }, { "epoch": 5.964167845355965, "grad_norm": 0.0017702371114864945, "learning_rate": 0.23187032948924358, "loss": 0.2046, "num_input_tokens_seen": 10960672, "step": 12650 }, { "epoch": 5.966525223950967, "grad_norm": 0.001617515692487359, "learning_rate": 0.23182096596585247, "loss": 0.2364, "num_input_tokens_seen": 10964896, "step": 12655 }, { "epoch": 5.968882602545969, "grad_norm": 0.0013805010821670294, "learning_rate": 0.23177158982464025, "loss": 0.2665, "num_input_tokens_seen": 10969568, "step": 12660 }, { "epoch": 5.971239981140971, "grad_norm": 0.0028485977090895176, "learning_rate": 0.23172220107322122, "loss": 0.2178, "num_input_tokens_seen": 10973712, "step": 12665 }, { "epoch": 5.973597359735973, "grad_norm": 0.002199440263211727, "learning_rate": 0.23167279971921184, "loss": 0.2298, "num_input_tokens_seen": 10978464, "step": 12670 }, { "epoch": 5.975954738330976, "grad_norm": 0.005057504400610924, "learning_rate": 0.23162338577023034, "loss": 0.2162, "num_input_tokens_seen": 10983392, "step": 12675 }, { "epoch": 5.978312116925978, "grad_norm": 0.001169491559267044, "learning_rate": 0.23157395923389704, "loss": 0.1782, "num_input_tokens_seen": 10986928, "step": 12680 }, { "epoch": 5.9806694955209805, "grad_norm": 0.0016375831328332424, "learning_rate": 0.2315245201178341, "loss": 0.29, "num_input_tokens_seen": 10990448, "step": 12685 }, { "epoch": 5.983026874115983, "grad_norm": 0.0029446629341691732, "learning_rate": 0.23147506842966564, "loss": 0.2921, "num_input_tokens_seen": 10995056, "step": 12690 }, { "epoch": 5.985384252710985, "grad_norm": 0.0011914746137335896, "learning_rate": 0.23142560417701774, "loss": 0.1977, "num_input_tokens_seen": 10999568, "step": 12695 }, { "epoch": 5.987741631305988, "grad_norm": 0.0010897465981543064, "learning_rate": 0.23137612736751845, "loss": 0.2362, "num_input_tokens_seen": 11003648, "step": 12700 }, { "epoch": 5.99009900990099, "grad_norm": 0.002389785135164857, "learning_rate": 0.23132663800879766, "loss": 0.2203, "num_input_tokens_seen": 11009488, "step": 12705 }, { "epoch": 5.9924563884959925, "grad_norm": 0.0009399185655638576, "learning_rate": 0.2312771361084873, "loss": 0.2415, "num_input_tokens_seen": 11015024, "step": 12710 }, { "epoch": 5.994813767090995, "grad_norm": 0.0021879307460039854, "learning_rate": 0.23122762167422112, "loss": 0.219, "num_input_tokens_seen": 11018768, "step": 12715 }, { "epoch": 5.997171145685997, "grad_norm": 0.0021859491243958473, "learning_rate": 0.23117809471363493, "loss": 0.2844, "num_input_tokens_seen": 11023872, "step": 12720 }, { "epoch": 5.999528524281, "grad_norm": 0.0009013023227453232, "learning_rate": 0.23112855523436637, "loss": 0.2157, "num_input_tokens_seen": 11027488, "step": 12725 }, { "epoch": 6.001885902876002, "grad_norm": 0.002629445167258382, "learning_rate": 0.23107900324405511, "loss": 0.2291, "num_input_tokens_seen": 11031792, "step": 12730 }, { "epoch": 6.0042432814710045, "grad_norm": 0.0011355848982930183, "learning_rate": 0.2310294387503426, "loss": 0.2081, "num_input_tokens_seen": 11036432, "step": 12735 }, { "epoch": 6.006600660066007, "grad_norm": 0.0005956445238552988, "learning_rate": 0.23097986176087237, "loss": 0.1925, "num_input_tokens_seen": 11040832, "step": 12740 }, { "epoch": 6.008958038661009, "grad_norm": 0.0006888190982863307, "learning_rate": 0.23093027228328986, "loss": 0.2421, "num_input_tokens_seen": 11045216, "step": 12745 }, { "epoch": 6.011315417256012, "grad_norm": 0.0016853951383382082, "learning_rate": 0.23088067032524226, "loss": 0.2138, "num_input_tokens_seen": 11048560, "step": 12750 }, { "epoch": 6.013672795851014, "grad_norm": 0.0016277659451588988, "learning_rate": 0.23083105589437888, "loss": 0.2091, "num_input_tokens_seen": 11053232, "step": 12755 }, { "epoch": 6.016030174446016, "grad_norm": 0.0018908953061327338, "learning_rate": 0.23078142899835094, "loss": 0.2641, "num_input_tokens_seen": 11057840, "step": 12760 }, { "epoch": 6.018387553041018, "grad_norm": 0.0008780216448940337, "learning_rate": 0.23073178964481147, "loss": 0.2416, "num_input_tokens_seen": 11061616, "step": 12765 }, { "epoch": 6.02074493163602, "grad_norm": 0.001263982616364956, "learning_rate": 0.2306821378414155, "loss": 0.2523, "num_input_tokens_seen": 11065584, "step": 12770 }, { "epoch": 6.023102310231023, "grad_norm": 0.001900622621178627, "learning_rate": 0.2306324735958199, "loss": 0.2302, "num_input_tokens_seen": 11070272, "step": 12775 }, { "epoch": 6.025459688826025, "grad_norm": 0.0019365698099136353, "learning_rate": 0.23058279691568362, "loss": 0.236, "num_input_tokens_seen": 11073904, "step": 12780 }, { "epoch": 6.027817067421028, "grad_norm": 0.001042835647240281, "learning_rate": 0.23053310780866745, "loss": 0.259, "num_input_tokens_seen": 11078400, "step": 12785 }, { "epoch": 6.03017444601603, "grad_norm": 0.00531243858858943, "learning_rate": 0.23048340628243397, "loss": 0.2262, "num_input_tokens_seen": 11083488, "step": 12790 }, { "epoch": 6.032531824611032, "grad_norm": 0.0017946711741387844, "learning_rate": 0.23043369234464783, "loss": 0.2231, "num_input_tokens_seen": 11088656, "step": 12795 }, { "epoch": 6.034889203206035, "grad_norm": 0.001121607143431902, "learning_rate": 0.2303839660029755, "loss": 0.2638, "num_input_tokens_seen": 11092736, "step": 12800 }, { "epoch": 6.034889203206035, "eval_loss": 0.22787941992282867, "eval_runtime": 21.9517, "eval_samples_per_second": 42.958, "eval_steps_per_second": 21.502, "num_input_tokens_seen": 11092736, "step": 12800 }, { "epoch": 6.037246581801037, "grad_norm": 0.0012658574851229787, "learning_rate": 0.23033422726508548, "loss": 0.2116, "num_input_tokens_seen": 11097248, "step": 12805 }, { "epoch": 6.03960396039604, "grad_norm": 0.002086346037685871, "learning_rate": 0.23028447613864808, "loss": 0.2496, "num_input_tokens_seen": 11102208, "step": 12810 }, { "epoch": 6.041961338991042, "grad_norm": 0.001454114099033177, "learning_rate": 0.2302347126313355, "loss": 0.1992, "num_input_tokens_seen": 11106880, "step": 12815 }, { "epoch": 6.044318717586044, "grad_norm": 0.0008547223405912519, "learning_rate": 0.23018493675082197, "loss": 0.2158, "num_input_tokens_seen": 11110560, "step": 12820 }, { "epoch": 6.046676096181047, "grad_norm": 0.0010011035483330488, "learning_rate": 0.2301351485047835, "loss": 0.2184, "num_input_tokens_seen": 11114496, "step": 12825 }, { "epoch": 6.049033474776049, "grad_norm": 0.0009463587775826454, "learning_rate": 0.23008534790089813, "loss": 0.2302, "num_input_tokens_seen": 11118640, "step": 12830 }, { "epoch": 6.051390853371052, "grad_norm": 0.0009053719695657492, "learning_rate": 0.2300355349468457, "loss": 0.2139, "num_input_tokens_seen": 11122720, "step": 12835 }, { "epoch": 6.053748231966054, "grad_norm": 0.0011583255836740136, "learning_rate": 0.22998570965030793, "loss": 0.2329, "num_input_tokens_seen": 11127840, "step": 12840 }, { "epoch": 6.0561056105610565, "grad_norm": 0.0011554175289347768, "learning_rate": 0.22993587201896862, "loss": 0.1967, "num_input_tokens_seen": 11131712, "step": 12845 }, { "epoch": 6.058462989156059, "grad_norm": 0.0008630368392914534, "learning_rate": 0.2298860220605133, "loss": 0.2065, "num_input_tokens_seen": 11135808, "step": 12850 }, { "epoch": 6.060820367751061, "grad_norm": 0.0015693021705374122, "learning_rate": 0.22983615978262942, "loss": 0.2602, "num_input_tokens_seen": 11140400, "step": 12855 }, { "epoch": 6.063177746346063, "grad_norm": 0.002148232888430357, "learning_rate": 0.22978628519300648, "loss": 0.2469, "num_input_tokens_seen": 11144880, "step": 12860 }, { "epoch": 6.065535124941065, "grad_norm": 0.0012318813242018223, "learning_rate": 0.22973639829933568, "loss": 0.2139, "num_input_tokens_seen": 11149232, "step": 12865 }, { "epoch": 6.067892503536068, "grad_norm": 0.0017584586748853326, "learning_rate": 0.22968649910931027, "loss": 0.2125, "num_input_tokens_seen": 11153792, "step": 12870 }, { "epoch": 6.07024988213107, "grad_norm": 0.0011191613739356399, "learning_rate": 0.22963658763062528, "loss": 0.231, "num_input_tokens_seen": 11158320, "step": 12875 }, { "epoch": 6.072607260726072, "grad_norm": 0.002522860886529088, "learning_rate": 0.22958666387097765, "loss": 0.2199, "num_input_tokens_seen": 11163424, "step": 12880 }, { "epoch": 6.074964639321075, "grad_norm": 0.0012958089355379343, "learning_rate": 0.22953672783806633, "loss": 0.2461, "num_input_tokens_seen": 11168016, "step": 12885 }, { "epoch": 6.077322017916077, "grad_norm": 0.0021013442892581224, "learning_rate": 0.22948677953959207, "loss": 0.1998, "num_input_tokens_seen": 11172288, "step": 12890 }, { "epoch": 6.07967939651108, "grad_norm": 0.0009940849849954247, "learning_rate": 0.2294368189832575, "loss": 0.1915, "num_input_tokens_seen": 11176112, "step": 12895 }, { "epoch": 6.082036775106082, "grad_norm": 0.0015855350065976381, "learning_rate": 0.2293868461767672, "loss": 0.1963, "num_input_tokens_seen": 11180656, "step": 12900 }, { "epoch": 6.084394153701084, "grad_norm": 0.0027663009241223335, "learning_rate": 0.22933686112782758, "loss": 0.2354, "num_input_tokens_seen": 11184400, "step": 12905 }, { "epoch": 6.086751532296087, "grad_norm": 0.0029524292331188917, "learning_rate": 0.22928686384414698, "loss": 0.2505, "num_input_tokens_seen": 11188704, "step": 12910 }, { "epoch": 6.089108910891089, "grad_norm": 0.002262555994093418, "learning_rate": 0.22923685433343552, "loss": 0.2112, "num_input_tokens_seen": 11192848, "step": 12915 }, { "epoch": 6.091466289486092, "grad_norm": 0.0012576004955917597, "learning_rate": 0.22918683260340542, "loss": 0.2634, "num_input_tokens_seen": 11197536, "step": 12920 }, { "epoch": 6.093823668081094, "grad_norm": 0.0012853280641138554, "learning_rate": 0.2291367986617706, "loss": 0.234, "num_input_tokens_seen": 11201984, "step": 12925 }, { "epoch": 6.096181046676096, "grad_norm": 0.002993642818182707, "learning_rate": 0.22908675251624697, "loss": 0.2416, "num_input_tokens_seen": 11206736, "step": 12930 }, { "epoch": 6.098538425271099, "grad_norm": 0.0017695165006443858, "learning_rate": 0.22903669417455216, "loss": 0.2287, "num_input_tokens_seen": 11211072, "step": 12935 }, { "epoch": 6.100895803866101, "grad_norm": 0.0020101468544453382, "learning_rate": 0.22898662364440592, "loss": 0.2351, "num_input_tokens_seen": 11215392, "step": 12940 }, { "epoch": 6.103253182461104, "grad_norm": 0.002004379639402032, "learning_rate": 0.2289365409335297, "loss": 0.2045, "num_input_tokens_seen": 11220000, "step": 12945 }, { "epoch": 6.105610561056106, "grad_norm": 0.0008654922130517662, "learning_rate": 0.2288864460496469, "loss": 0.2127, "num_input_tokens_seen": 11224128, "step": 12950 }, { "epoch": 6.107967939651108, "grad_norm": 0.0011194651015102863, "learning_rate": 0.22883633900048272, "loss": 0.2251, "num_input_tokens_seen": 11228000, "step": 12955 }, { "epoch": 6.11032531824611, "grad_norm": 0.0014099939726293087, "learning_rate": 0.2287862197937644, "loss": 0.2536, "num_input_tokens_seen": 11232304, "step": 12960 }, { "epoch": 6.112682696841112, "grad_norm": 0.005159828811883926, "learning_rate": 0.2287360884372209, "loss": 0.2671, "num_input_tokens_seen": 11237584, "step": 12965 }, { "epoch": 6.115040075436115, "grad_norm": 0.0009257703786715865, "learning_rate": 0.22868594493858307, "loss": 0.237, "num_input_tokens_seen": 11240992, "step": 12970 }, { "epoch": 6.117397454031117, "grad_norm": 0.0016210287576541305, "learning_rate": 0.2286357893055837, "loss": 0.2427, "num_input_tokens_seen": 11245568, "step": 12975 }, { "epoch": 6.1197548326261195, "grad_norm": 0.002432390348985791, "learning_rate": 0.22858562154595746, "loss": 0.2318, "num_input_tokens_seen": 11249840, "step": 12980 }, { "epoch": 6.122112211221122, "grad_norm": 0.0015136764850467443, "learning_rate": 0.22853544166744078, "loss": 0.2383, "num_input_tokens_seen": 11255968, "step": 12985 }, { "epoch": 6.124469589816124, "grad_norm": 0.0009550959803164005, "learning_rate": 0.22848524967777206, "loss": 0.219, "num_input_tokens_seen": 11260752, "step": 12990 }, { "epoch": 6.126826968411127, "grad_norm": 0.00235031358897686, "learning_rate": 0.22843504558469152, "loss": 0.2636, "num_input_tokens_seen": 11264720, "step": 12995 }, { "epoch": 6.129184347006129, "grad_norm": 0.0013155497144907713, "learning_rate": 0.2283848293959413, "loss": 0.2217, "num_input_tokens_seen": 11269264, "step": 13000 }, { "epoch": 6.129184347006129, "eval_loss": 0.2193126231431961, "eval_runtime": 21.9105, "eval_samples_per_second": 43.039, "eval_steps_per_second": 21.542, "num_input_tokens_seen": 11269264, "step": 13000 }, { "epoch": 6.1315417256011315, "grad_norm": 0.0008669832022860646, "learning_rate": 0.22833460111926532, "loss": 0.2345, "num_input_tokens_seen": 11273904, "step": 13005 }, { "epoch": 6.133899104196134, "grad_norm": 0.0024079419672489166, "learning_rate": 0.22828436076240946, "loss": 0.2313, "num_input_tokens_seen": 11279424, "step": 13010 }, { "epoch": 6.136256482791136, "grad_norm": 0.0014385926770046353, "learning_rate": 0.22823410833312135, "loss": 0.2334, "num_input_tokens_seen": 11284192, "step": 13015 }, { "epoch": 6.138613861386139, "grad_norm": 0.00253147236071527, "learning_rate": 0.2281838438391506, "loss": 0.2328, "num_input_tokens_seen": 11288992, "step": 13020 }, { "epoch": 6.140971239981141, "grad_norm": 0.0008541835704818368, "learning_rate": 0.22813356728824863, "loss": 0.2278, "num_input_tokens_seen": 11293456, "step": 13025 }, { "epoch": 6.1433286185761435, "grad_norm": 0.001154087483882904, "learning_rate": 0.2280832786881687, "loss": 0.2291, "num_input_tokens_seen": 11297488, "step": 13030 }, { "epoch": 6.145685997171146, "grad_norm": 0.0010083775268867612, "learning_rate": 0.22803297804666592, "loss": 0.233, "num_input_tokens_seen": 11302096, "step": 13035 }, { "epoch": 6.148043375766148, "grad_norm": 0.002139996038749814, "learning_rate": 0.22798266537149728, "loss": 0.2196, "num_input_tokens_seen": 11305856, "step": 13040 }, { "epoch": 6.150400754361151, "grad_norm": 0.0009965640492737293, "learning_rate": 0.22793234067042167, "loss": 0.1907, "num_input_tokens_seen": 11310768, "step": 13045 }, { "epoch": 6.152758132956153, "grad_norm": 0.0031632347963750362, "learning_rate": 0.22788200395119979, "loss": 0.2647, "num_input_tokens_seen": 11315152, "step": 13050 }, { "epoch": 6.1551155115511555, "grad_norm": 0.0016818954609334469, "learning_rate": 0.2278316552215942, "loss": 0.2589, "num_input_tokens_seen": 11319648, "step": 13055 }, { "epoch": 6.157472890146157, "grad_norm": 0.001628947095014155, "learning_rate": 0.22778129448936918, "loss": 0.232, "num_input_tokens_seen": 11323664, "step": 13060 }, { "epoch": 6.1598302687411595, "grad_norm": 0.001700666849501431, "learning_rate": 0.22773092176229118, "loss": 0.2648, "num_input_tokens_seen": 11328384, "step": 13065 }, { "epoch": 6.162187647336162, "grad_norm": 0.0012237941846251488, "learning_rate": 0.22768053704812816, "loss": 0.2266, "num_input_tokens_seen": 11332608, "step": 13070 }, { "epoch": 6.164545025931164, "grad_norm": 0.0009233771124854684, "learning_rate": 0.22763014035465018, "loss": 0.2099, "num_input_tokens_seen": 11337088, "step": 13075 }, { "epoch": 6.166902404526167, "grad_norm": 0.0012322289403527975, "learning_rate": 0.22757973168962892, "loss": 0.2533, "num_input_tokens_seen": 11340864, "step": 13080 }, { "epoch": 6.169259783121169, "grad_norm": 0.003005904844030738, "learning_rate": 0.22752931106083818, "loss": 0.269, "num_input_tokens_seen": 11345712, "step": 13085 }, { "epoch": 6.1716171617161715, "grad_norm": 0.0021580446045845747, "learning_rate": 0.22747887847605341, "loss": 0.2255, "num_input_tokens_seen": 11349296, "step": 13090 }, { "epoch": 6.173974540311174, "grad_norm": 0.0011636470444500446, "learning_rate": 0.22742843394305184, "loss": 0.2373, "num_input_tokens_seen": 11353200, "step": 13095 }, { "epoch": 6.176331918906176, "grad_norm": 0.0015956066781654954, "learning_rate": 0.22737797746961272, "loss": 0.2311, "num_input_tokens_seen": 11357184, "step": 13100 }, { "epoch": 6.178689297501179, "grad_norm": 0.0019788972567766905, "learning_rate": 0.22732750906351712, "loss": 0.2529, "num_input_tokens_seen": 11360736, "step": 13105 }, { "epoch": 6.181046676096181, "grad_norm": 0.0042763277888298035, "learning_rate": 0.22727702873254785, "loss": 0.2388, "num_input_tokens_seen": 11365888, "step": 13110 }, { "epoch": 6.1834040546911835, "grad_norm": 0.0010787597857415676, "learning_rate": 0.22722653648448968, "loss": 0.2305, "num_input_tokens_seen": 11370256, "step": 13115 }, { "epoch": 6.185761433286186, "grad_norm": 0.0011703427881002426, "learning_rate": 0.22717603232712902, "loss": 0.2286, "num_input_tokens_seen": 11375248, "step": 13120 }, { "epoch": 6.188118811881188, "grad_norm": 0.002150709507986903, "learning_rate": 0.22712551626825436, "loss": 0.2132, "num_input_tokens_seen": 11380064, "step": 13125 }, { "epoch": 6.190476190476191, "grad_norm": 0.0018260151846334338, "learning_rate": 0.2270749883156559, "loss": 0.228, "num_input_tokens_seen": 11384784, "step": 13130 }, { "epoch": 6.192833569071193, "grad_norm": 0.0008629740332253277, "learning_rate": 0.22702444847712563, "loss": 0.2064, "num_input_tokens_seen": 11389104, "step": 13135 }, { "epoch": 6.1951909476661955, "grad_norm": 0.0013775562401860952, "learning_rate": 0.22697389676045743, "loss": 0.2048, "num_input_tokens_seen": 11393200, "step": 13140 }, { "epoch": 6.197548326261198, "grad_norm": 0.0007523690583184361, "learning_rate": 0.22692333317344704, "loss": 0.2466, "num_input_tokens_seen": 11397200, "step": 13145 }, { "epoch": 6.1999057048562, "grad_norm": 0.0022523042280226946, "learning_rate": 0.22687275772389198, "loss": 0.218, "num_input_tokens_seen": 11400816, "step": 13150 }, { "epoch": 6.202263083451202, "grad_norm": 0.0016035367734730244, "learning_rate": 0.22682217041959168, "loss": 0.1835, "num_input_tokens_seen": 11404432, "step": 13155 }, { "epoch": 6.204620462046204, "grad_norm": 0.0015332121402025223, "learning_rate": 0.2267715712683473, "loss": 0.1668, "num_input_tokens_seen": 11408528, "step": 13160 }, { "epoch": 6.206977840641207, "grad_norm": 0.0016260864213109016, "learning_rate": 0.22672096027796182, "loss": 0.2475, "num_input_tokens_seen": 11413424, "step": 13165 }, { "epoch": 6.209335219236209, "grad_norm": 0.002393875503912568, "learning_rate": 0.22667033745624016, "loss": 0.266, "num_input_tokens_seen": 11416944, "step": 13170 }, { "epoch": 6.211692597831211, "grad_norm": 0.0012190823908895254, "learning_rate": 0.22661970281098895, "loss": 0.2568, "num_input_tokens_seen": 11420848, "step": 13175 }, { "epoch": 6.214049976426214, "grad_norm": 0.0014554086374118924, "learning_rate": 0.22656905635001667, "loss": 0.2303, "num_input_tokens_seen": 11424976, "step": 13180 }, { "epoch": 6.216407355021216, "grad_norm": 0.0010306069161742926, "learning_rate": 0.2265183980811337, "loss": 0.2329, "num_input_tokens_seen": 11429008, "step": 13185 }, { "epoch": 6.218764733616219, "grad_norm": 0.0010611936450004578, "learning_rate": 0.22646772801215218, "loss": 0.2367, "num_input_tokens_seen": 11432928, "step": 13190 }, { "epoch": 6.221122112211221, "grad_norm": 0.0019025321817025542, "learning_rate": 0.22641704615088598, "loss": 0.2392, "num_input_tokens_seen": 11437120, "step": 13195 }, { "epoch": 6.223479490806223, "grad_norm": 0.002721330849453807, "learning_rate": 0.22636635250515103, "loss": 0.2475, "num_input_tokens_seen": 11441120, "step": 13200 }, { "epoch": 6.223479490806223, "eval_loss": 0.22213885188102722, "eval_runtime": 21.906, "eval_samples_per_second": 43.048, "eval_steps_per_second": 21.547, "num_input_tokens_seen": 11441120, "step": 13200 }, { "epoch": 6.225836869401226, "grad_norm": 0.00204481091350317, "learning_rate": 0.2263156470827648, "loss": 0.22, "num_input_tokens_seen": 11445408, "step": 13205 }, { "epoch": 6.228194247996228, "grad_norm": 0.0019294159719720483, "learning_rate": 0.22626492989154678, "loss": 0.2118, "num_input_tokens_seen": 11449024, "step": 13210 }, { "epoch": 6.230551626591231, "grad_norm": 0.001992699457332492, "learning_rate": 0.22621420093931813, "loss": 0.2376, "num_input_tokens_seen": 11453728, "step": 13215 }, { "epoch": 6.232909005186233, "grad_norm": 0.0011381824733689427, "learning_rate": 0.22616346023390194, "loss": 0.2272, "num_input_tokens_seen": 11457600, "step": 13220 }, { "epoch": 6.235266383781235, "grad_norm": 0.0010387592483311892, "learning_rate": 0.22611270778312306, "loss": 0.2055, "num_input_tokens_seen": 11461696, "step": 13225 }, { "epoch": 6.237623762376238, "grad_norm": 0.0011006842833012342, "learning_rate": 0.2260619435948081, "loss": 0.259, "num_input_tokens_seen": 11465520, "step": 13230 }, { "epoch": 6.23998114097124, "grad_norm": 0.001763340551406145, "learning_rate": 0.22601116767678567, "loss": 0.243, "num_input_tokens_seen": 11470688, "step": 13235 }, { "epoch": 6.242338519566243, "grad_norm": 0.0009628318948671222, "learning_rate": 0.2259603800368859, "loss": 0.2254, "num_input_tokens_seen": 11475664, "step": 13240 }, { "epoch": 6.244695898161245, "grad_norm": 0.001001010532490909, "learning_rate": 0.22590958068294098, "loss": 0.224, "num_input_tokens_seen": 11479984, "step": 13245 }, { "epoch": 6.247053276756247, "grad_norm": 0.0011504285503178835, "learning_rate": 0.22585876962278478, "loss": 0.2374, "num_input_tokens_seen": 11484928, "step": 13250 }, { "epoch": 6.24941065535125, "grad_norm": 0.001338228932581842, "learning_rate": 0.22580794686425298, "loss": 0.2343, "num_input_tokens_seen": 11489760, "step": 13255 }, { "epoch": 6.251768033946251, "grad_norm": 0.0008723827195353806, "learning_rate": 0.22575711241518312, "loss": 0.2368, "num_input_tokens_seen": 11493472, "step": 13260 }, { "epoch": 6.254125412541254, "grad_norm": 0.0009143627248704433, "learning_rate": 0.22570626628341453, "loss": 0.2278, "num_input_tokens_seen": 11497152, "step": 13265 }, { "epoch": 6.256482791136256, "grad_norm": 0.0012281552189961076, "learning_rate": 0.22565540847678828, "loss": 0.2107, "num_input_tokens_seen": 11501568, "step": 13270 }, { "epoch": 6.258840169731259, "grad_norm": 0.001585561316460371, "learning_rate": 0.2256045390031473, "loss": 0.2503, "num_input_tokens_seen": 11505456, "step": 13275 }, { "epoch": 6.261197548326261, "grad_norm": 0.0009572579292580485, "learning_rate": 0.22555365787033627, "loss": 0.2253, "num_input_tokens_seen": 11509680, "step": 13280 }, { "epoch": 6.263554926921263, "grad_norm": 0.0012867635814473033, "learning_rate": 0.22550276508620173, "loss": 0.2301, "num_input_tokens_seen": 11514208, "step": 13285 }, { "epoch": 6.265912305516266, "grad_norm": 0.0019787573255598545, "learning_rate": 0.22545186065859202, "loss": 0.2019, "num_input_tokens_seen": 11518128, "step": 13290 }, { "epoch": 6.268269684111268, "grad_norm": 0.0018884100718423724, "learning_rate": 0.2254009445953572, "loss": 0.2139, "num_input_tokens_seen": 11522384, "step": 13295 }, { "epoch": 6.270627062706271, "grad_norm": 0.003030792810022831, "learning_rate": 0.22535001690434917, "loss": 0.246, "num_input_tokens_seen": 11526848, "step": 13300 }, { "epoch": 6.272984441301273, "grad_norm": 0.0011117053218185902, "learning_rate": 0.22529907759342163, "loss": 0.2186, "num_input_tokens_seen": 11531360, "step": 13305 }, { "epoch": 6.275341819896275, "grad_norm": 0.0012730286689475179, "learning_rate": 0.22524812667043007, "loss": 0.2403, "num_input_tokens_seen": 11536464, "step": 13310 }, { "epoch": 6.277699198491278, "grad_norm": 0.002316867234185338, "learning_rate": 0.22519716414323177, "loss": 0.2148, "num_input_tokens_seen": 11540816, "step": 13315 }, { "epoch": 6.28005657708628, "grad_norm": 0.003442081855610013, "learning_rate": 0.22514619001968567, "loss": 0.221, "num_input_tokens_seen": 11547056, "step": 13320 }, { "epoch": 6.282413955681283, "grad_norm": 0.0009833906078711152, "learning_rate": 0.2250952043076528, "loss": 0.2232, "num_input_tokens_seen": 11550496, "step": 13325 }, { "epoch": 6.284771334276285, "grad_norm": 0.0012660115025937557, "learning_rate": 0.2250442070149957, "loss": 0.2265, "num_input_tokens_seen": 11554240, "step": 13330 }, { "epoch": 6.287128712871287, "grad_norm": 0.0010071959113702178, "learning_rate": 0.22499319814957885, "loss": 0.2139, "num_input_tokens_seen": 11558064, "step": 13335 }, { "epoch": 6.28948609146629, "grad_norm": 0.0008060761028900743, "learning_rate": 0.2249421777192684, "loss": 0.1877, "num_input_tokens_seen": 11562368, "step": 13340 }, { "epoch": 6.291843470061292, "grad_norm": 0.0008081582491286099, "learning_rate": 0.22489114573193236, "loss": 0.2333, "num_input_tokens_seen": 11566704, "step": 13345 }, { "epoch": 6.294200848656295, "grad_norm": 0.0007041542557999492, "learning_rate": 0.2248401021954405, "loss": 0.2567, "num_input_tokens_seen": 11571408, "step": 13350 }, { "epoch": 6.296558227251296, "grad_norm": 0.0010830189567059278, "learning_rate": 0.22478904711766443, "loss": 0.2572, "num_input_tokens_seen": 11575408, "step": 13355 }, { "epoch": 6.2989156058462985, "grad_norm": 0.0015778144588693976, "learning_rate": 0.22473798050647734, "loss": 0.2438, "num_input_tokens_seen": 11580160, "step": 13360 }, { "epoch": 6.301272984441301, "grad_norm": 0.001337361754849553, "learning_rate": 0.22468690236975453, "loss": 0.2291, "num_input_tokens_seen": 11584144, "step": 13365 }, { "epoch": 6.303630363036303, "grad_norm": 0.0007003672071732581, "learning_rate": 0.22463581271537272, "loss": 0.2409, "num_input_tokens_seen": 11588048, "step": 13370 }, { "epoch": 6.305987741631306, "grad_norm": 0.0019723675213754177, "learning_rate": 0.22458471155121076, "loss": 0.2189, "num_input_tokens_seen": 11592144, "step": 13375 }, { "epoch": 6.308345120226308, "grad_norm": 0.001178054721094668, "learning_rate": 0.2245335988851489, "loss": 0.2193, "num_input_tokens_seen": 11596080, "step": 13380 }, { "epoch": 6.3107024988213105, "grad_norm": 0.003723704721778631, "learning_rate": 0.2244824747250695, "loss": 0.2514, "num_input_tokens_seen": 11601232, "step": 13385 }, { "epoch": 6.313059877416313, "grad_norm": 0.0010304072638973594, "learning_rate": 0.22443133907885646, "loss": 0.2189, "num_input_tokens_seen": 11605472, "step": 13390 }, { "epoch": 6.315417256011315, "grad_norm": 0.0007270880159921944, "learning_rate": 0.22438019195439557, "loss": 0.2224, "num_input_tokens_seen": 11610256, "step": 13395 }, { "epoch": 6.317774634606318, "grad_norm": 0.0010840314207598567, "learning_rate": 0.22432903335957435, "loss": 0.2348, "num_input_tokens_seen": 11614176, "step": 13400 }, { "epoch": 6.317774634606318, "eval_loss": 0.21879000961780548, "eval_runtime": 21.9065, "eval_samples_per_second": 43.047, "eval_steps_per_second": 21.546, "num_input_tokens_seen": 11614176, "step": 13400 }, { "epoch": 6.32013201320132, "grad_norm": 0.0018415770027786493, "learning_rate": 0.22427786330228214, "loss": 0.2209, "num_input_tokens_seen": 11617808, "step": 13405 }, { "epoch": 6.3224893917963225, "grad_norm": 0.0014010494342073798, "learning_rate": 0.22422668179040997, "loss": 0.2352, "num_input_tokens_seen": 11622544, "step": 13410 }, { "epoch": 6.324846770391325, "grad_norm": 0.0009197145118378103, "learning_rate": 0.2241754888318507, "loss": 0.206, "num_input_tokens_seen": 11627168, "step": 13415 }, { "epoch": 6.327204148986327, "grad_norm": 0.0009297403157688677, "learning_rate": 0.22412428443449886, "loss": 0.2084, "num_input_tokens_seen": 11631296, "step": 13420 }, { "epoch": 6.32956152758133, "grad_norm": 0.0006599495536647737, "learning_rate": 0.22407306860625087, "loss": 0.1864, "num_input_tokens_seen": 11636192, "step": 13425 }, { "epoch": 6.331918906176332, "grad_norm": 0.0016978821950033307, "learning_rate": 0.22402184135500483, "loss": 0.207, "num_input_tokens_seen": 11640592, "step": 13430 }, { "epoch": 6.3342762847713345, "grad_norm": 0.0014252585824579, "learning_rate": 0.22397060268866067, "loss": 0.2803, "num_input_tokens_seen": 11645024, "step": 13435 }, { "epoch": 6.336633663366337, "grad_norm": 0.0016570795560255647, "learning_rate": 0.22391935261511994, "loss": 0.2446, "num_input_tokens_seen": 11649184, "step": 13440 }, { "epoch": 6.338991041961339, "grad_norm": 0.0012716721976175904, "learning_rate": 0.22386809114228615, "loss": 0.2211, "num_input_tokens_seen": 11653584, "step": 13445 }, { "epoch": 6.341348420556342, "grad_norm": 0.0011080008698627353, "learning_rate": 0.22381681827806446, "loss": 0.2187, "num_input_tokens_seen": 11658176, "step": 13450 }, { "epoch": 6.343705799151344, "grad_norm": 0.0018002965953201056, "learning_rate": 0.22376553403036173, "loss": 0.2306, "num_input_tokens_seen": 11663008, "step": 13455 }, { "epoch": 6.346063177746346, "grad_norm": 0.002703767968341708, "learning_rate": 0.22371423840708662, "loss": 0.2335, "num_input_tokens_seen": 11668144, "step": 13460 }, { "epoch": 6.348420556341348, "grad_norm": 0.0010058965999633074, "learning_rate": 0.22366293141614962, "loss": 0.2282, "num_input_tokens_seen": 11672048, "step": 13465 }, { "epoch": 6.3507779349363505, "grad_norm": 0.0017756945453584194, "learning_rate": 0.22361161306546287, "loss": 0.2002, "num_input_tokens_seen": 11676000, "step": 13470 }, { "epoch": 6.353135313531353, "grad_norm": 0.001078912173397839, "learning_rate": 0.22356028336294037, "loss": 0.2383, "num_input_tokens_seen": 11680224, "step": 13475 }, { "epoch": 6.355492692126355, "grad_norm": 0.0011349486885592341, "learning_rate": 0.2235089423164977, "loss": 0.2243, "num_input_tokens_seen": 11684240, "step": 13480 }, { "epoch": 6.357850070721358, "grad_norm": 0.003357166424393654, "learning_rate": 0.22345758993405243, "loss": 0.2018, "num_input_tokens_seen": 11689632, "step": 13485 }, { "epoch": 6.36020744931636, "grad_norm": 0.0011042780242860317, "learning_rate": 0.2234062262235236, "loss": 0.23, "num_input_tokens_seen": 11693328, "step": 13490 }, { "epoch": 6.3625648279113625, "grad_norm": 0.0011562854051589966, "learning_rate": 0.22335485119283222, "loss": 0.2074, "num_input_tokens_seen": 11697984, "step": 13495 }, { "epoch": 6.364922206506365, "grad_norm": 0.0009417433757334948, "learning_rate": 0.22330346484990093, "loss": 0.2326, "num_input_tokens_seen": 11702480, "step": 13500 }, { "epoch": 6.367279585101367, "grad_norm": 0.0018067003693431616, "learning_rate": 0.22325206720265425, "loss": 0.2316, "num_input_tokens_seen": 11706128, "step": 13505 }, { "epoch": 6.36963696369637, "grad_norm": 0.0008320966153405607, "learning_rate": 0.2232006582590182, "loss": 0.2264, "num_input_tokens_seen": 11709680, "step": 13510 }, { "epoch": 6.371994342291372, "grad_norm": 0.0012325753923505545, "learning_rate": 0.22314923802692077, "loss": 0.2043, "num_input_tokens_seen": 11714976, "step": 13515 }, { "epoch": 6.3743517208863745, "grad_norm": 0.0010789618827402592, "learning_rate": 0.22309780651429156, "loss": 0.198, "num_input_tokens_seen": 11718704, "step": 13520 }, { "epoch": 6.376709099481377, "grad_norm": 0.0006451330264098942, "learning_rate": 0.22304636372906203, "loss": 0.1883, "num_input_tokens_seen": 11722768, "step": 13525 }, { "epoch": 6.379066478076379, "grad_norm": 0.001135817845351994, "learning_rate": 0.22299490967916522, "loss": 0.2047, "num_input_tokens_seen": 11727136, "step": 13530 }, { "epoch": 6.381423856671382, "grad_norm": 0.0009520763996988535, "learning_rate": 0.22294344437253602, "loss": 0.2229, "num_input_tokens_seen": 11730400, "step": 13535 }, { "epoch": 6.383781235266384, "grad_norm": 0.0007598455995321274, "learning_rate": 0.22289196781711101, "loss": 0.1864, "num_input_tokens_seen": 11734736, "step": 13540 }, { "epoch": 6.3861386138613865, "grad_norm": 0.0027822055853903294, "learning_rate": 0.2228404800208286, "loss": 0.2665, "num_input_tokens_seen": 11739744, "step": 13545 }, { "epoch": 6.388495992456389, "grad_norm": 0.0009701235685497522, "learning_rate": 0.22278898099162875, "loss": 0.2207, "num_input_tokens_seen": 11743712, "step": 13550 }, { "epoch": 6.39085337105139, "grad_norm": 0.0015511472010985017, "learning_rate": 0.22273747073745337, "loss": 0.2183, "num_input_tokens_seen": 11748384, "step": 13555 }, { "epoch": 6.393210749646393, "grad_norm": 0.0012719410005956888, "learning_rate": 0.22268594926624588, "loss": 0.24, "num_input_tokens_seen": 11753120, "step": 13560 }, { "epoch": 6.395568128241395, "grad_norm": 0.0011915857903659344, "learning_rate": 0.22263441658595162, "loss": 0.223, "num_input_tokens_seen": 11757744, "step": 13565 }, { "epoch": 6.397925506836398, "grad_norm": 0.0010721537983044982, "learning_rate": 0.2225828727045175, "loss": 0.2127, "num_input_tokens_seen": 11761600, "step": 13570 }, { "epoch": 6.4002828854314, "grad_norm": 0.000847398245241493, "learning_rate": 0.22253131762989228, "loss": 0.2256, "num_input_tokens_seen": 11765472, "step": 13575 }, { "epoch": 6.402640264026402, "grad_norm": 0.0011851497692987323, "learning_rate": 0.2224797513700264, "loss": 0.2423, "num_input_tokens_seen": 11769824, "step": 13580 }, { "epoch": 6.404997642621405, "grad_norm": 0.0015840467531234026, "learning_rate": 0.22242817393287204, "loss": 0.1854, "num_input_tokens_seen": 11773648, "step": 13585 }, { "epoch": 6.407355021216407, "grad_norm": 0.0005339236231520772, "learning_rate": 0.22237658532638305, "loss": 0.1915, "num_input_tokens_seen": 11777456, "step": 13590 }, { "epoch": 6.40971239981141, "grad_norm": 0.001420942833647132, "learning_rate": 0.22232498555851513, "loss": 0.229, "num_input_tokens_seen": 11781792, "step": 13595 }, { "epoch": 6.412069778406412, "grad_norm": 0.0008033700869418681, "learning_rate": 0.22227337463722546, "loss": 0.1659, "num_input_tokens_seen": 11785424, "step": 13600 }, { "epoch": 6.412069778406412, "eval_loss": 0.22639155387878418, "eval_runtime": 21.9253, "eval_samples_per_second": 43.01, "eval_steps_per_second": 21.528, "num_input_tokens_seen": 11785424, "step": 13600 }, { "epoch": 6.414427157001414, "grad_norm": 0.0008431847090832889, "learning_rate": 0.2222217525704732, "loss": 0.2197, "num_input_tokens_seen": 11789584, "step": 13605 }, { "epoch": 6.416784535596417, "grad_norm": 0.0013008592650294304, "learning_rate": 0.22217011936621908, "loss": 0.1686, "num_input_tokens_seen": 11793728, "step": 13610 }, { "epoch": 6.419141914191419, "grad_norm": 0.0013016933808103204, "learning_rate": 0.22211847503242566, "loss": 0.2165, "num_input_tokens_seen": 11798288, "step": 13615 }, { "epoch": 6.421499292786422, "grad_norm": 0.001297068316489458, "learning_rate": 0.22206681957705704, "loss": 0.2326, "num_input_tokens_seen": 11802368, "step": 13620 }, { "epoch": 6.423856671381424, "grad_norm": 0.001070246915332973, "learning_rate": 0.2220151530080792, "loss": 0.256, "num_input_tokens_seen": 11806656, "step": 13625 }, { "epoch": 6.426214049976426, "grad_norm": 0.0008172846864908934, "learning_rate": 0.2219634753334598, "loss": 0.23, "num_input_tokens_seen": 11810912, "step": 13630 }, { "epoch": 6.428571428571429, "grad_norm": 0.0012299135560169816, "learning_rate": 0.22191178656116817, "loss": 0.2221, "num_input_tokens_seen": 11815056, "step": 13635 }, { "epoch": 6.430928807166431, "grad_norm": 0.0009284638217650354, "learning_rate": 0.2218600866991753, "loss": 0.2203, "num_input_tokens_seen": 11819824, "step": 13640 }, { "epoch": 6.433286185761434, "grad_norm": 0.0011982939904555678, "learning_rate": 0.221808375755454, "loss": 0.1999, "num_input_tokens_seen": 11823888, "step": 13645 }, { "epoch": 6.435643564356436, "grad_norm": 0.0013545859837904572, "learning_rate": 0.22175665373797881, "loss": 0.1694, "num_input_tokens_seen": 11828256, "step": 13650 }, { "epoch": 6.438000942951438, "grad_norm": 0.0008710703114047647, "learning_rate": 0.22170492065472583, "loss": 0.2263, "num_input_tokens_seen": 11832848, "step": 13655 }, { "epoch": 6.44035832154644, "grad_norm": 0.0010835877619683743, "learning_rate": 0.221653176513673, "loss": 0.2121, "num_input_tokens_seen": 11837616, "step": 13660 }, { "epoch": 6.442715700141442, "grad_norm": 0.000717986433301121, "learning_rate": 0.2216014213227999, "loss": 0.1887, "num_input_tokens_seen": 11842144, "step": 13665 }, { "epoch": 6.445073078736445, "grad_norm": 0.0017253223340958357, "learning_rate": 0.22154965509008784, "loss": 0.1897, "num_input_tokens_seen": 11846560, "step": 13670 }, { "epoch": 6.447430457331447, "grad_norm": 0.0029545503202825785, "learning_rate": 0.2214978778235198, "loss": 0.2797, "num_input_tokens_seen": 11850208, "step": 13675 }, { "epoch": 6.4497878359264496, "grad_norm": 0.000699088501278311, "learning_rate": 0.2214460895310805, "loss": 0.1575, "num_input_tokens_seen": 11854624, "step": 13680 }, { "epoch": 6.452145214521452, "grad_norm": 0.002054176526144147, "learning_rate": 0.22139429022075635, "loss": 0.2904, "num_input_tokens_seen": 11859680, "step": 13685 }, { "epoch": 6.454502593116454, "grad_norm": 0.0012557499576359987, "learning_rate": 0.22134247990053546, "loss": 0.1981, "num_input_tokens_seen": 11863888, "step": 13690 }, { "epoch": 6.456859971711457, "grad_norm": 0.0009751489269547164, "learning_rate": 0.2212906585784076, "loss": 0.2085, "num_input_tokens_seen": 11868640, "step": 13695 }, { "epoch": 6.459217350306459, "grad_norm": 0.0026942628901451826, "learning_rate": 0.22123882626236432, "loss": 0.2429, "num_input_tokens_seen": 11872816, "step": 13700 }, { "epoch": 6.461574728901462, "grad_norm": 0.0016532782465219498, "learning_rate": 0.2211869829603988, "loss": 0.218, "num_input_tokens_seen": 11876288, "step": 13705 }, { "epoch": 6.463932107496464, "grad_norm": 0.0011220305459573865, "learning_rate": 0.22113512868050592, "loss": 0.2131, "num_input_tokens_seen": 11880528, "step": 13710 }, { "epoch": 6.466289486091466, "grad_norm": 0.0019360133446753025, "learning_rate": 0.2210832634306822, "loss": 0.2156, "num_input_tokens_seen": 11884512, "step": 13715 }, { "epoch": 6.468646864686469, "grad_norm": 0.0020713992416858673, "learning_rate": 0.22103138721892598, "loss": 0.2303, "num_input_tokens_seen": 11889184, "step": 13720 }, { "epoch": 6.471004243281471, "grad_norm": 0.0010106242261826992, "learning_rate": 0.22097950005323724, "loss": 0.2152, "num_input_tokens_seen": 11894064, "step": 13725 }, { "epoch": 6.473361621876474, "grad_norm": 0.0021346251014620066, "learning_rate": 0.22092760194161762, "loss": 0.2397, "num_input_tokens_seen": 11898992, "step": 13730 }, { "epoch": 6.475719000471476, "grad_norm": 0.0012591255363076925, "learning_rate": 0.2208756928920704, "loss": 0.2453, "num_input_tokens_seen": 11903840, "step": 13735 }, { "epoch": 6.478076379066478, "grad_norm": 0.001246446743607521, "learning_rate": 0.22082377291260072, "loss": 0.2233, "num_input_tokens_seen": 11907792, "step": 13740 }, { "epoch": 6.480433757661481, "grad_norm": 0.001191683579236269, "learning_rate": 0.2207718420112152, "loss": 0.2179, "num_input_tokens_seen": 11911536, "step": 13745 }, { "epoch": 6.482791136256483, "grad_norm": 0.0011459790403023362, "learning_rate": 0.22071990019592228, "loss": 0.2098, "num_input_tokens_seen": 11915920, "step": 13750 }, { "epoch": 6.485148514851485, "grad_norm": 0.0010541322408244014, "learning_rate": 0.22066794747473198, "loss": 0.2049, "num_input_tokens_seen": 11920528, "step": 13755 }, { "epoch": 6.487505893446487, "grad_norm": 0.0008160487632267177, "learning_rate": 0.2206159838556562, "loss": 0.1813, "num_input_tokens_seen": 11924784, "step": 13760 }, { "epoch": 6.4898632720414895, "grad_norm": 0.0019650927279144526, "learning_rate": 0.2205640093467082, "loss": 0.1918, "num_input_tokens_seen": 11928976, "step": 13765 }, { "epoch": 6.492220650636492, "grad_norm": 0.001733272336423397, "learning_rate": 0.22051202395590322, "loss": 0.2427, "num_input_tokens_seen": 11933984, "step": 13770 }, { "epoch": 6.494578029231494, "grad_norm": 0.0015685093821957707, "learning_rate": 0.22046002769125808, "loss": 0.2583, "num_input_tokens_seen": 11938400, "step": 13775 }, { "epoch": 6.496935407826497, "grad_norm": 0.0021439415868371725, "learning_rate": 0.2204080205607912, "loss": 0.2374, "num_input_tokens_seen": 11942928, "step": 13780 }, { "epoch": 6.499292786421499, "grad_norm": 0.0013968285638839006, "learning_rate": 0.22035600257252272, "loss": 0.2286, "num_input_tokens_seen": 11947536, "step": 13785 }, { "epoch": 6.5016501650165015, "grad_norm": 0.0019134548492729664, "learning_rate": 0.2203039737344745, "loss": 0.2182, "num_input_tokens_seen": 11951392, "step": 13790 }, { "epoch": 6.504007543611504, "grad_norm": 0.001454550656490028, "learning_rate": 0.22025193405467003, "loss": 0.2238, "num_input_tokens_seen": 11955776, "step": 13795 }, { "epoch": 6.506364922206506, "grad_norm": 0.001808667671866715, "learning_rate": 0.2201998835411345, "loss": 0.1838, "num_input_tokens_seen": 11960752, "step": 13800 }, { "epoch": 6.506364922206506, "eval_loss": 0.21857322752475739, "eval_runtime": 21.8866, "eval_samples_per_second": 43.086, "eval_steps_per_second": 21.566, "num_input_tokens_seen": 11960752, "step": 13800 }, { "epoch": 6.508722300801509, "grad_norm": 0.0036930737551301718, "learning_rate": 0.22014782220189474, "loss": 0.2434, "num_input_tokens_seen": 11965200, "step": 13805 }, { "epoch": 6.511079679396511, "grad_norm": 0.002268847543746233, "learning_rate": 0.2200957500449793, "loss": 0.2343, "num_input_tokens_seen": 11969120, "step": 13810 }, { "epoch": 6.5134370579915135, "grad_norm": 0.002026001224294305, "learning_rate": 0.22004366707841827, "loss": 0.2153, "num_input_tokens_seen": 11972944, "step": 13815 }, { "epoch": 6.515794436586516, "grad_norm": 0.0013748891651630402, "learning_rate": 0.21999157331024358, "loss": 0.2174, "num_input_tokens_seen": 11976752, "step": 13820 }, { "epoch": 6.518151815181518, "grad_norm": 0.0009646362741477787, "learning_rate": 0.21993946874848871, "loss": 0.2029, "num_input_tokens_seen": 11980896, "step": 13825 }, { "epoch": 6.520509193776521, "grad_norm": 0.0010318633867427707, "learning_rate": 0.2198873534011888, "loss": 0.1945, "num_input_tokens_seen": 11984960, "step": 13830 }, { "epoch": 6.522866572371523, "grad_norm": 0.001288180355913937, "learning_rate": 0.2198352272763808, "loss": 0.2305, "num_input_tokens_seen": 11989552, "step": 13835 }, { "epoch": 6.5252239509665255, "grad_norm": 0.0008407334098592401, "learning_rate": 0.2197830903821031, "loss": 0.1662, "num_input_tokens_seen": 11993584, "step": 13840 }, { "epoch": 6.527581329561528, "grad_norm": 0.001323329284787178, "learning_rate": 0.21973094272639598, "loss": 0.1991, "num_input_tokens_seen": 11997936, "step": 13845 }, { "epoch": 6.52993870815653, "grad_norm": 0.0010704640299081802, "learning_rate": 0.21967878431730117, "loss": 0.163, "num_input_tokens_seen": 12002848, "step": 13850 }, { "epoch": 6.532296086751533, "grad_norm": 0.0009565091459080577, "learning_rate": 0.21962661516286217, "loss": 0.2335, "num_input_tokens_seen": 12006912, "step": 13855 }, { "epoch": 6.534653465346535, "grad_norm": 0.0009982381016016006, "learning_rate": 0.21957443527112414, "loss": 0.2455, "num_input_tokens_seen": 12010976, "step": 13860 }, { "epoch": 6.537010843941537, "grad_norm": 0.0016621286049485207, "learning_rate": 0.21952224465013384, "loss": 0.2518, "num_input_tokens_seen": 12016208, "step": 13865 }, { "epoch": 6.539368222536539, "grad_norm": 0.0015303674153983593, "learning_rate": 0.21947004330793976, "loss": 0.2285, "num_input_tokens_seen": 12019856, "step": 13870 }, { "epoch": 6.5417256011315414, "grad_norm": 0.001197399222292006, "learning_rate": 0.21941783125259198, "loss": 0.2203, "num_input_tokens_seen": 12024480, "step": 13875 }, { "epoch": 6.544082979726544, "grad_norm": 0.0017466588178649545, "learning_rate": 0.21936560849214226, "loss": 0.2075, "num_input_tokens_seen": 12029072, "step": 13880 }, { "epoch": 6.546440358321546, "grad_norm": 0.0017335258889943361, "learning_rate": 0.21931337503464404, "loss": 0.2232, "num_input_tokens_seen": 12032688, "step": 13885 }, { "epoch": 6.548797736916549, "grad_norm": 0.001155829755589366, "learning_rate": 0.21926113088815233, "loss": 0.2096, "num_input_tokens_seen": 12036752, "step": 13890 }, { "epoch": 6.551155115511551, "grad_norm": 0.0014500155812129378, "learning_rate": 0.2192088760607238, "loss": 0.1907, "num_input_tokens_seen": 12040816, "step": 13895 }, { "epoch": 6.5535124941065535, "grad_norm": 0.001542670768685639, "learning_rate": 0.2191566105604169, "loss": 0.2291, "num_input_tokens_seen": 12046176, "step": 13900 }, { "epoch": 6.555869872701556, "grad_norm": 0.0013711112551391125, "learning_rate": 0.21910433439529153, "loss": 0.2466, "num_input_tokens_seen": 12050112, "step": 13905 }, { "epoch": 6.558227251296558, "grad_norm": 0.0011703994823619723, "learning_rate": 0.2190520475734094, "loss": 0.2462, "num_input_tokens_seen": 12054656, "step": 13910 }, { "epoch": 6.560584629891561, "grad_norm": 0.0010362978791818023, "learning_rate": 0.2189997501028338, "loss": 0.2049, "num_input_tokens_seen": 12058640, "step": 13915 }, { "epoch": 6.562942008486563, "grad_norm": 0.0031943386420607567, "learning_rate": 0.2189474419916296, "loss": 0.2066, "num_input_tokens_seen": 12063888, "step": 13920 }, { "epoch": 6.5652993870815655, "grad_norm": 0.0010580772068351507, "learning_rate": 0.21889512324786342, "loss": 0.2357, "num_input_tokens_seen": 12068064, "step": 13925 }, { "epoch": 6.567656765676568, "grad_norm": 0.003131187055259943, "learning_rate": 0.21884279387960345, "loss": 0.2366, "num_input_tokens_seen": 12072704, "step": 13930 }, { "epoch": 6.57001414427157, "grad_norm": 0.0009805160807445645, "learning_rate": 0.2187904538949195, "loss": 0.2007, "num_input_tokens_seen": 12076976, "step": 13935 }, { "epoch": 6.572371522866573, "grad_norm": 0.0008368046255782247, "learning_rate": 0.2187381033018831, "loss": 0.1932, "num_input_tokens_seen": 12080960, "step": 13940 }, { "epoch": 6.574728901461575, "grad_norm": 0.0008214358240365982, "learning_rate": 0.2186857421085673, "loss": 0.2011, "num_input_tokens_seen": 12085664, "step": 13945 }, { "epoch": 6.5770862800565775, "grad_norm": 0.0014826244441792369, "learning_rate": 0.21863337032304697, "loss": 0.2242, "num_input_tokens_seen": 12090448, "step": 13950 }, { "epoch": 6.579443658651579, "grad_norm": 0.0007693592342548072, "learning_rate": 0.21858098795339845, "loss": 0.2367, "num_input_tokens_seen": 12094384, "step": 13955 }, { "epoch": 6.581801037246581, "grad_norm": 0.0011449640151113272, "learning_rate": 0.21852859500769975, "loss": 0.2009, "num_input_tokens_seen": 12098720, "step": 13960 }, { "epoch": 6.584158415841584, "grad_norm": 0.0011215161066502333, "learning_rate": 0.21847619149403044, "loss": 0.2424, "num_input_tokens_seen": 12103072, "step": 13965 }, { "epoch": 6.586515794436586, "grad_norm": 0.0010358479339629412, "learning_rate": 0.21842377742047195, "loss": 0.2258, "num_input_tokens_seen": 12107840, "step": 13970 }, { "epoch": 6.588873173031589, "grad_norm": 0.0015076696872711182, "learning_rate": 0.21837135279510705, "loss": 0.2318, "num_input_tokens_seen": 12111984, "step": 13975 }, { "epoch": 6.591230551626591, "grad_norm": 0.0018098392756655812, "learning_rate": 0.21831891762602038, "loss": 0.2324, "num_input_tokens_seen": 12116800, "step": 13980 }, { "epoch": 6.593587930221593, "grad_norm": 0.0009791397023946047, "learning_rate": 0.21826647192129806, "loss": 0.2481, "num_input_tokens_seen": 12120752, "step": 13985 }, { "epoch": 6.595945308816596, "grad_norm": 0.0014681629836559296, "learning_rate": 0.21821401568902787, "loss": 0.2392, "num_input_tokens_seen": 12124848, "step": 13990 }, { "epoch": 6.598302687411598, "grad_norm": 0.0008760616183280945, "learning_rate": 0.21816154893729925, "loss": 0.2329, "num_input_tokens_seen": 12128784, "step": 13995 }, { "epoch": 6.600660066006601, "grad_norm": 0.0011897670337930322, "learning_rate": 0.2181090716742032, "loss": 0.222, "num_input_tokens_seen": 12132672, "step": 14000 }, { "epoch": 6.600660066006601, "eval_loss": 0.21828874945640564, "eval_runtime": 21.8717, "eval_samples_per_second": 43.115, "eval_steps_per_second": 21.58, "num_input_tokens_seen": 12132672, "step": 14000 }, { "epoch": 6.603017444601603, "grad_norm": 0.0012794890208169818, "learning_rate": 0.21805658390783236, "loss": 0.1993, "num_input_tokens_seen": 12136768, "step": 14005 }, { "epoch": 6.605374823196605, "grad_norm": 0.000785662850830704, "learning_rate": 0.21800408564628107, "loss": 0.2398, "num_input_tokens_seen": 12141456, "step": 14010 }, { "epoch": 6.607732201791608, "grad_norm": 0.0013250084593892097, "learning_rate": 0.21795157689764516, "loss": 0.1657, "num_input_tokens_seen": 12145920, "step": 14015 }, { "epoch": 6.61008958038661, "grad_norm": 0.0008349485578946769, "learning_rate": 0.21789905767002216, "loss": 0.2691, "num_input_tokens_seen": 12149760, "step": 14020 }, { "epoch": 6.612446958981613, "grad_norm": 0.0014035304775461555, "learning_rate": 0.2178465279715112, "loss": 0.1522, "num_input_tokens_seen": 12153088, "step": 14025 }, { "epoch": 6.614804337576615, "grad_norm": 0.0023050843738019466, "learning_rate": 0.21779398781021303, "loss": 0.2429, "num_input_tokens_seen": 12158048, "step": 14030 }, { "epoch": 6.617161716171617, "grad_norm": 0.001078520668670535, "learning_rate": 0.21774143719422998, "loss": 0.2879, "num_input_tokens_seen": 12162864, "step": 14035 }, { "epoch": 6.61951909476662, "grad_norm": 0.001807692926377058, "learning_rate": 0.21768887613166601, "loss": 0.2009, "num_input_tokens_seen": 12168352, "step": 14040 }, { "epoch": 6.621876473361622, "grad_norm": 0.0025743923615664244, "learning_rate": 0.2176363046306267, "loss": 0.2327, "num_input_tokens_seen": 12172768, "step": 14045 }, { "epoch": 6.624233851956625, "grad_norm": 0.0013855824945494533, "learning_rate": 0.21758372269921925, "loss": 0.2286, "num_input_tokens_seen": 12177296, "step": 14050 }, { "epoch": 6.626591230551627, "grad_norm": 0.0009394611115567386, "learning_rate": 0.21753113034555244, "loss": 0.2383, "num_input_tokens_seen": 12180576, "step": 14055 }, { "epoch": 6.628948609146629, "grad_norm": 0.0020144490990787745, "learning_rate": 0.2174785275777367, "loss": 0.2314, "num_input_tokens_seen": 12185184, "step": 14060 }, { "epoch": 6.631305987741631, "grad_norm": 0.001557944342494011, "learning_rate": 0.21742591440388404, "loss": 0.2271, "num_input_tokens_seen": 12189376, "step": 14065 }, { "epoch": 6.633663366336633, "grad_norm": 0.002066993387416005, "learning_rate": 0.21737329083210802, "loss": 0.2295, "num_input_tokens_seen": 12192992, "step": 14070 }, { "epoch": 6.636020744931636, "grad_norm": 0.001033844193443656, "learning_rate": 0.2173206568705239, "loss": 0.2175, "num_input_tokens_seen": 12197520, "step": 14075 }, { "epoch": 6.638378123526638, "grad_norm": 0.0018377258675172925, "learning_rate": 0.2172680125272485, "loss": 0.2237, "num_input_tokens_seen": 12201728, "step": 14080 }, { "epoch": 6.6407355021216405, "grad_norm": 0.0008944787550717592, "learning_rate": 0.2172153578104002, "loss": 0.2191, "num_input_tokens_seen": 12205648, "step": 14085 }, { "epoch": 6.643092880716643, "grad_norm": 0.0010799543233588338, "learning_rate": 0.21716269272809902, "loss": 0.2033, "num_input_tokens_seen": 12209312, "step": 14090 }, { "epoch": 6.645450259311645, "grad_norm": 0.0018022455042228103, "learning_rate": 0.21711001728846666, "loss": 0.1952, "num_input_tokens_seen": 12213728, "step": 14095 }, { "epoch": 6.647807637906648, "grad_norm": 0.002291918033733964, "learning_rate": 0.21705733149962628, "loss": 0.2702, "num_input_tokens_seen": 12218208, "step": 14100 }, { "epoch": 6.65016501650165, "grad_norm": 0.0008098890539258718, "learning_rate": 0.21700463536970263, "loss": 0.1928, "num_input_tokens_seen": 12222608, "step": 14105 }, { "epoch": 6.6525223950966526, "grad_norm": 0.0014783412916585803, "learning_rate": 0.21695192890682222, "loss": 0.212, "num_input_tokens_seen": 12226224, "step": 14110 }, { "epoch": 6.654879773691655, "grad_norm": 0.001821559970267117, "learning_rate": 0.21689921211911298, "loss": 0.2149, "num_input_tokens_seen": 12230192, "step": 14115 }, { "epoch": 6.657237152286657, "grad_norm": 0.0020028608851134777, "learning_rate": 0.21684648501470452, "loss": 0.2207, "num_input_tokens_seen": 12234768, "step": 14120 }, { "epoch": 6.65959453088166, "grad_norm": 0.0007979890797287226, "learning_rate": 0.216793747601728, "loss": 0.2001, "num_input_tokens_seen": 12238960, "step": 14125 }, { "epoch": 6.661951909476662, "grad_norm": 0.0014030808815732598, "learning_rate": 0.21674099988831627, "loss": 0.2027, "num_input_tokens_seen": 12243616, "step": 14130 }, { "epoch": 6.664309288071665, "grad_norm": 0.0007040109485387802, "learning_rate": 0.21668824188260363, "loss": 0.2349, "num_input_tokens_seen": 12248496, "step": 14135 }, { "epoch": 6.666666666666667, "grad_norm": 0.0029200236313045025, "learning_rate": 0.21663547359272606, "loss": 0.2395, "num_input_tokens_seen": 12253184, "step": 14140 }, { "epoch": 6.669024045261669, "grad_norm": 0.0009673293679952621, "learning_rate": 0.216582695026821, "loss": 0.2233, "num_input_tokens_seen": 12256848, "step": 14145 }, { "epoch": 6.671381423856672, "grad_norm": 0.003125124843791127, "learning_rate": 0.21652990619302767, "loss": 0.2249, "num_input_tokens_seen": 12261184, "step": 14150 }, { "epoch": 6.673738802451673, "grad_norm": 0.0009346173610538244, "learning_rate": 0.21647710709948673, "loss": 0.1769, "num_input_tokens_seen": 12265504, "step": 14155 }, { "epoch": 6.676096181046676, "grad_norm": 0.0015139473835006356, "learning_rate": 0.2164242977543405, "loss": 0.2006, "num_input_tokens_seen": 12269248, "step": 14160 }, { "epoch": 6.678453559641678, "grad_norm": 0.0007926283869892359, "learning_rate": 0.21637147816573277, "loss": 0.2125, "num_input_tokens_seen": 12273072, "step": 14165 }, { "epoch": 6.6808109382366805, "grad_norm": 0.001193562988191843, "learning_rate": 0.21631864834180908, "loss": 0.226, "num_input_tokens_seen": 12278112, "step": 14170 }, { "epoch": 6.683168316831683, "grad_norm": 0.0011515958467498422, "learning_rate": 0.21626580829071637, "loss": 0.2133, "num_input_tokens_seen": 12282320, "step": 14175 }, { "epoch": 6.685525695426685, "grad_norm": 0.000817668333183974, "learning_rate": 0.21621295802060328, "loss": 0.2283, "num_input_tokens_seen": 12286320, "step": 14180 }, { "epoch": 6.687883074021688, "grad_norm": 0.002206181176006794, "learning_rate": 0.21616009753961996, "loss": 0.2097, "num_input_tokens_seen": 12290448, "step": 14185 }, { "epoch": 6.69024045261669, "grad_norm": 0.0017704832134768367, "learning_rate": 0.2161072268559182, "loss": 0.2088, "num_input_tokens_seen": 12294864, "step": 14190 }, { "epoch": 6.6925978312116925, "grad_norm": 0.0014467319706454873, "learning_rate": 0.21605434597765133, "loss": 0.2303, "num_input_tokens_seen": 12299360, "step": 14195 }, { "epoch": 6.694955209806695, "grad_norm": 0.0021095958072692156, "learning_rate": 0.21600145491297418, "loss": 0.2841, "num_input_tokens_seen": 12303424, "step": 14200 }, { "epoch": 6.694955209806695, "eval_loss": 0.2159731090068817, "eval_runtime": 21.9216, "eval_samples_per_second": 43.017, "eval_steps_per_second": 21.531, "num_input_tokens_seen": 12303424, "step": 14200 }, { "epoch": 6.697312588401697, "grad_norm": 0.0008643734036013484, "learning_rate": 0.21594855367004326, "loss": 0.1991, "num_input_tokens_seen": 12307952, "step": 14205 }, { "epoch": 6.6996699669967, "grad_norm": 0.00099911168217659, "learning_rate": 0.21589564225701663, "loss": 0.1988, "num_input_tokens_seen": 12311552, "step": 14210 }, { "epoch": 6.702027345591702, "grad_norm": 0.000859889667481184, "learning_rate": 0.21584272068205385, "loss": 0.1859, "num_input_tokens_seen": 12314848, "step": 14215 }, { "epoch": 6.7043847241867045, "grad_norm": 0.0019989567808806896, "learning_rate": 0.2157897889533161, "loss": 0.2642, "num_input_tokens_seen": 12318800, "step": 14220 }, { "epoch": 6.706742102781707, "grad_norm": 0.001182570238597691, "learning_rate": 0.21573684707896612, "loss": 0.2393, "num_input_tokens_seen": 12323520, "step": 14225 }, { "epoch": 6.709099481376709, "grad_norm": 0.0009859168203547597, "learning_rate": 0.21568389506716826, "loss": 0.2222, "num_input_tokens_seen": 12328048, "step": 14230 }, { "epoch": 6.711456859971712, "grad_norm": 0.0027129182126373053, "learning_rate": 0.21563093292608831, "loss": 0.2617, "num_input_tokens_seen": 12332224, "step": 14235 }, { "epoch": 6.713814238566714, "grad_norm": 0.0008775991154834628, "learning_rate": 0.21557796066389376, "loss": 0.2275, "num_input_tokens_seen": 12336368, "step": 14240 }, { "epoch": 6.7161716171617165, "grad_norm": 0.0010439966572448611, "learning_rate": 0.21552497828875353, "loss": 0.2241, "num_input_tokens_seen": 12340544, "step": 14245 }, { "epoch": 6.718528995756719, "grad_norm": 0.000740572577342391, "learning_rate": 0.21547198580883828, "loss": 0.224, "num_input_tokens_seen": 12345072, "step": 14250 }, { "epoch": 6.720886374351721, "grad_norm": 0.0015781527617946267, "learning_rate": 0.21541898323232, "loss": 0.1997, "num_input_tokens_seen": 12351008, "step": 14255 }, { "epoch": 6.723243752946724, "grad_norm": 0.0012488086940720677, "learning_rate": 0.2153659705673724, "loss": 0.2146, "num_input_tokens_seen": 12355584, "step": 14260 }, { "epoch": 6.725601131541725, "grad_norm": 0.0024416050873696804, "learning_rate": 0.2153129478221707, "loss": 0.2455, "num_input_tokens_seen": 12359680, "step": 14265 }, { "epoch": 6.727958510136728, "grad_norm": 0.0008311145356856287, "learning_rate": 0.21525991500489164, "loss": 0.2439, "num_input_tokens_seen": 12363184, "step": 14270 }, { "epoch": 6.73031588873173, "grad_norm": 0.0007930436404421926, "learning_rate": 0.21520687212371362, "loss": 0.241, "num_input_tokens_seen": 12367200, "step": 14275 }, { "epoch": 6.732673267326732, "grad_norm": 0.00171434145886451, "learning_rate": 0.21515381918681648, "loss": 0.2016, "num_input_tokens_seen": 12371808, "step": 14280 }, { "epoch": 6.735030645921735, "grad_norm": 0.0007788133807480335, "learning_rate": 0.21510075620238167, "loss": 0.2511, "num_input_tokens_seen": 12375872, "step": 14285 }, { "epoch": 6.737388024516737, "grad_norm": 0.0016605884302407503, "learning_rate": 0.21504768317859208, "loss": 0.1864, "num_input_tokens_seen": 12379664, "step": 14290 }, { "epoch": 6.73974540311174, "grad_norm": 0.0015451674116775393, "learning_rate": 0.2149946001236323, "loss": 0.2635, "num_input_tokens_seen": 12383792, "step": 14295 }, { "epoch": 6.742102781706742, "grad_norm": 0.0018625977681949735, "learning_rate": 0.21494150704568848, "loss": 0.2212, "num_input_tokens_seen": 12389120, "step": 14300 }, { "epoch": 6.7444601603017444, "grad_norm": 0.0005674086860381067, "learning_rate": 0.21488840395294811, "loss": 0.2385, "num_input_tokens_seen": 12392992, "step": 14305 }, { "epoch": 6.746817538896747, "grad_norm": 0.0023185175377875566, "learning_rate": 0.21483529085360042, "loss": 0.2444, "num_input_tokens_seen": 12397616, "step": 14310 }, { "epoch": 6.749174917491749, "grad_norm": 0.001108133583329618, "learning_rate": 0.2147821677558361, "loss": 0.2367, "num_input_tokens_seen": 12402096, "step": 14315 }, { "epoch": 6.751532296086752, "grad_norm": 0.0007715950487181544, "learning_rate": 0.2147290346678475, "loss": 0.2103, "num_input_tokens_seen": 12406528, "step": 14320 }, { "epoch": 6.753889674681754, "grad_norm": 0.0015502843307331204, "learning_rate": 0.21467589159782827, "loss": 0.2176, "num_input_tokens_seen": 12411648, "step": 14325 }, { "epoch": 6.7562470532767565, "grad_norm": 0.0017109279287979007, "learning_rate": 0.21462273855397374, "loss": 0.2424, "num_input_tokens_seen": 12416800, "step": 14330 }, { "epoch": 6.758604431871759, "grad_norm": 0.0011203422909602523, "learning_rate": 0.21456957554448083, "loss": 0.2519, "num_input_tokens_seen": 12420608, "step": 14335 }, { "epoch": 6.760961810466761, "grad_norm": 0.0007673196960240602, "learning_rate": 0.21451640257754795, "loss": 0.2212, "num_input_tokens_seen": 12424832, "step": 14340 }, { "epoch": 6.763319189061764, "grad_norm": 0.0007216534577310085, "learning_rate": 0.21446321966137508, "loss": 0.2131, "num_input_tokens_seen": 12429024, "step": 14345 }, { "epoch": 6.765676567656766, "grad_norm": 0.0020378034096211195, "learning_rate": 0.21441002680416354, "loss": 0.2425, "num_input_tokens_seen": 12433040, "step": 14350 }, { "epoch": 6.768033946251768, "grad_norm": 0.0010856781154870987, "learning_rate": 0.21435682401411654, "loss": 0.2335, "num_input_tokens_seen": 12436544, "step": 14355 }, { "epoch": 6.77039132484677, "grad_norm": 0.000835278129670769, "learning_rate": 0.2143036112994385, "loss": 0.2226, "num_input_tokens_seen": 12440912, "step": 14360 }, { "epoch": 6.772748703441772, "grad_norm": 0.0015102807665243745, "learning_rate": 0.21425038866833548, "loss": 0.233, "num_input_tokens_seen": 12444672, "step": 14365 }, { "epoch": 6.775106082036775, "grad_norm": 0.0010432589333504438, "learning_rate": 0.21419715612901508, "loss": 0.2343, "num_input_tokens_seen": 12449344, "step": 14370 }, { "epoch": 6.777463460631777, "grad_norm": 0.0008407129789702594, "learning_rate": 0.21414391368968652, "loss": 0.2192, "num_input_tokens_seen": 12453120, "step": 14375 }, { "epoch": 6.77982083922678, "grad_norm": 0.0009093019179999828, "learning_rate": 0.21409066135856034, "loss": 0.1922, "num_input_tokens_seen": 12457120, "step": 14380 }, { "epoch": 6.782178217821782, "grad_norm": 0.0008792545995675027, "learning_rate": 0.21403739914384878, "loss": 0.2313, "num_input_tokens_seen": 12461392, "step": 14385 }, { "epoch": 6.784535596416784, "grad_norm": 0.0007856666925363243, "learning_rate": 0.21398412705376554, "loss": 0.2415, "num_input_tokens_seen": 12465408, "step": 14390 }, { "epoch": 6.786892975011787, "grad_norm": 0.0007584239356219769, "learning_rate": 0.2139308450965258, "loss": 0.2373, "num_input_tokens_seen": 12470256, "step": 14395 }, { "epoch": 6.789250353606789, "grad_norm": 0.001454116660170257, "learning_rate": 0.21387755328034638, "loss": 0.2265, "num_input_tokens_seen": 12474592, "step": 14400 }, { "epoch": 6.789250353606789, "eval_loss": 0.22272275388240814, "eval_runtime": 21.946, "eval_samples_per_second": 42.969, "eval_steps_per_second": 21.507, "num_input_tokens_seen": 12474592, "step": 14400 }, { "epoch": 6.791607732201792, "grad_norm": 0.0012333409395068884, "learning_rate": 0.2138242516134455, "loss": 0.2356, "num_input_tokens_seen": 12479136, "step": 14405 }, { "epoch": 6.793965110796794, "grad_norm": 0.0014046004507690668, "learning_rate": 0.2137709401040429, "loss": 0.2303, "num_input_tokens_seen": 12484368, "step": 14410 }, { "epoch": 6.796322489391796, "grad_norm": 0.001160518848337233, "learning_rate": 0.21371761876036, "loss": 0.2253, "num_input_tokens_seen": 12488832, "step": 14415 }, { "epoch": 6.798679867986799, "grad_norm": 0.0019423667108640075, "learning_rate": 0.21366428759061956, "loss": 0.2159, "num_input_tokens_seen": 12493440, "step": 14420 }, { "epoch": 6.801037246581801, "grad_norm": 0.0010687402682378888, "learning_rate": 0.2136109466030459, "loss": 0.2383, "num_input_tokens_seen": 12497264, "step": 14425 }, { "epoch": 6.803394625176804, "grad_norm": 0.0008811026928015053, "learning_rate": 0.2135575958058649, "loss": 0.2155, "num_input_tokens_seen": 12501872, "step": 14430 }, { "epoch": 6.805752003771806, "grad_norm": 0.0009930875385180116, "learning_rate": 0.2135042352073039, "loss": 0.2547, "num_input_tokens_seen": 12506080, "step": 14435 }, { "epoch": 6.808109382366808, "grad_norm": 0.0017960426630452275, "learning_rate": 0.2134508648155918, "loss": 0.2227, "num_input_tokens_seen": 12509536, "step": 14440 }, { "epoch": 6.810466760961811, "grad_norm": 0.003220172831788659, "learning_rate": 0.213397484638959, "loss": 0.2275, "num_input_tokens_seen": 12514624, "step": 14445 }, { "epoch": 6.812824139556813, "grad_norm": 0.0011971027124673128, "learning_rate": 0.21334409468563728, "loss": 0.2352, "num_input_tokens_seen": 12518320, "step": 14450 }, { "epoch": 6.815181518151816, "grad_norm": 0.0009361574775539339, "learning_rate": 0.2132906949638602, "loss": 0.2128, "num_input_tokens_seen": 12522800, "step": 14455 }, { "epoch": 6.817538896746818, "grad_norm": 0.0012258393689990044, "learning_rate": 0.21323728548186255, "loss": 0.2171, "num_input_tokens_seen": 12526992, "step": 14460 }, { "epoch": 6.8198962753418195, "grad_norm": 0.0015397552633658051, "learning_rate": 0.21318386624788088, "loss": 0.209, "num_input_tokens_seen": 12530784, "step": 14465 }, { "epoch": 6.822253653936822, "grad_norm": 0.003101245267316699, "learning_rate": 0.21313043727015288, "loss": 0.2666, "num_input_tokens_seen": 12535248, "step": 14470 }, { "epoch": 6.824611032531824, "grad_norm": 0.001084741554223001, "learning_rate": 0.2130769985569182, "loss": 0.1733, "num_input_tokens_seen": 12540160, "step": 14475 }, { "epoch": 6.826968411126827, "grad_norm": 0.003937000408768654, "learning_rate": 0.21302355011641766, "loss": 0.1941, "num_input_tokens_seen": 12544608, "step": 14480 }, { "epoch": 6.829325789721829, "grad_norm": 0.000859442341607064, "learning_rate": 0.21297009195689365, "loss": 0.1886, "num_input_tokens_seen": 12549920, "step": 14485 }, { "epoch": 6.8316831683168315, "grad_norm": 0.0006839822162874043, "learning_rate": 0.21291662408659015, "loss": 0.2425, "num_input_tokens_seen": 12554320, "step": 14490 }, { "epoch": 6.834040546911834, "grad_norm": 0.0012058347929269075, "learning_rate": 0.21286314651375254, "loss": 0.2444, "num_input_tokens_seen": 12558304, "step": 14495 }, { "epoch": 6.836397925506836, "grad_norm": 0.0018578370800241828, "learning_rate": 0.2128096592466278, "loss": 0.2431, "num_input_tokens_seen": 12562336, "step": 14500 }, { "epoch": 6.838755304101839, "grad_norm": 0.0011263216147199273, "learning_rate": 0.21275616229346428, "loss": 0.2368, "num_input_tokens_seen": 12565728, "step": 14505 }, { "epoch": 6.841112682696841, "grad_norm": 0.001029518898576498, "learning_rate": 0.21270265566251184, "loss": 0.2378, "num_input_tokens_seen": 12569440, "step": 14510 }, { "epoch": 6.8434700612918435, "grad_norm": 0.0009583405335433781, "learning_rate": 0.21264913936202193, "loss": 0.2369, "num_input_tokens_seen": 12573920, "step": 14515 }, { "epoch": 6.845827439886846, "grad_norm": 0.002080637263134122, "learning_rate": 0.2125956134002475, "loss": 0.2312, "num_input_tokens_seen": 12578464, "step": 14520 }, { "epoch": 6.848184818481848, "grad_norm": 0.000705792976077646, "learning_rate": 0.2125420777854428, "loss": 0.2279, "num_input_tokens_seen": 12582736, "step": 14525 }, { "epoch": 6.850542197076851, "grad_norm": 0.0007312246016226709, "learning_rate": 0.21248853252586372, "loss": 0.2287, "num_input_tokens_seen": 12587152, "step": 14530 }, { "epoch": 6.852899575671853, "grad_norm": 0.0019782166928052902, "learning_rate": 0.21243497762976774, "loss": 0.2218, "num_input_tokens_seen": 12591760, "step": 14535 }, { "epoch": 6.8552569542668556, "grad_norm": 0.0017789436969906092, "learning_rate": 0.21238141310541356, "loss": 0.2322, "num_input_tokens_seen": 12596448, "step": 14540 }, { "epoch": 6.857614332861858, "grad_norm": 0.0009594902512617409, "learning_rate": 0.21232783896106153, "loss": 0.2196, "num_input_tokens_seen": 12600496, "step": 14545 }, { "epoch": 6.85997171145686, "grad_norm": 0.0013743522576987743, "learning_rate": 0.21227425520497345, "loss": 0.2114, "num_input_tokens_seen": 12604448, "step": 14550 }, { "epoch": 6.862329090051862, "grad_norm": 0.00253375805914402, "learning_rate": 0.2122206618454127, "loss": 0.2769, "num_input_tokens_seen": 12608496, "step": 14555 }, { "epoch": 6.864686468646864, "grad_norm": 0.0019436494912952185, "learning_rate": 0.2121670588906439, "loss": 0.2222, "num_input_tokens_seen": 12612960, "step": 14560 }, { "epoch": 6.867043847241867, "grad_norm": 0.0023845082614570856, "learning_rate": 0.21211344634893345, "loss": 0.2205, "num_input_tokens_seen": 12617456, "step": 14565 }, { "epoch": 6.869401225836869, "grad_norm": 0.0019055824959650636, "learning_rate": 0.21205982422854897, "loss": 0.1972, "num_input_tokens_seen": 12621584, "step": 14570 }, { "epoch": 6.8717586044318715, "grad_norm": 0.001649558311328292, "learning_rate": 0.21200619253775974, "loss": 0.2459, "num_input_tokens_seen": 12626752, "step": 14575 }, { "epoch": 6.874115983026874, "grad_norm": 0.0012768738670274615, "learning_rate": 0.21195255128483637, "loss": 0.1846, "num_input_tokens_seen": 12631136, "step": 14580 }, { "epoch": 6.876473361621876, "grad_norm": 0.0010032659629359841, "learning_rate": 0.21189890047805102, "loss": 0.1697, "num_input_tokens_seen": 12636224, "step": 14585 }, { "epoch": 6.878830740216879, "grad_norm": 0.0031002480536699295, "learning_rate": 0.21184524012567735, "loss": 0.2841, "num_input_tokens_seen": 12640592, "step": 14590 }, { "epoch": 6.881188118811881, "grad_norm": 0.0016019722679629922, "learning_rate": 0.2117915702359905, "loss": 0.2598, "num_input_tokens_seen": 12644912, "step": 14595 }, { "epoch": 6.8835454974068835, "grad_norm": 0.001955955056473613, "learning_rate": 0.211737890817267, "loss": 0.2287, "num_input_tokens_seen": 12649424, "step": 14600 }, { "epoch": 6.8835454974068835, "eval_loss": 0.23954415321350098, "eval_runtime": 21.8956, "eval_samples_per_second": 43.068, "eval_steps_per_second": 21.557, "num_input_tokens_seen": 12649424, "step": 14600 }, { "epoch": 6.885902876001886, "grad_norm": 0.002885231049731374, "learning_rate": 0.21168420187778483, "loss": 0.2187, "num_input_tokens_seen": 12653056, "step": 14605 }, { "epoch": 6.888260254596888, "grad_norm": 0.0018564697820693254, "learning_rate": 0.21163050342582362, "loss": 0.2605, "num_input_tokens_seen": 12657536, "step": 14610 }, { "epoch": 6.890617633191891, "grad_norm": 0.001031252439133823, "learning_rate": 0.21157679546966426, "loss": 0.2262, "num_input_tokens_seen": 12661760, "step": 14615 }, { "epoch": 6.892975011786893, "grad_norm": 0.0008444297127425671, "learning_rate": 0.2115230780175892, "loss": 0.208, "num_input_tokens_seen": 12665888, "step": 14620 }, { "epoch": 6.8953323903818955, "grad_norm": 0.0014745905064046383, "learning_rate": 0.21146935107788237, "loss": 0.2147, "num_input_tokens_seen": 12669776, "step": 14625 }, { "epoch": 6.897689768976898, "grad_norm": 0.0013950886204838753, "learning_rate": 0.21141561465882916, "loss": 0.2675, "num_input_tokens_seen": 12674576, "step": 14630 }, { "epoch": 6.9000471475719, "grad_norm": 0.001465690671466291, "learning_rate": 0.21136186876871635, "loss": 0.2296, "num_input_tokens_seen": 12678592, "step": 14635 }, { "epoch": 6.902404526166903, "grad_norm": 0.0007786622154526412, "learning_rate": 0.21130811341583225, "loss": 0.2495, "num_input_tokens_seen": 12683296, "step": 14640 }, { "epoch": 6.904761904761905, "grad_norm": 0.0015234191669151187, "learning_rate": 0.21125434860846667, "loss": 0.2576, "num_input_tokens_seen": 12687728, "step": 14645 }, { "epoch": 6.9071192833569075, "grad_norm": 0.007685125805437565, "learning_rate": 0.2112005743549107, "loss": 0.2278, "num_input_tokens_seen": 12691616, "step": 14650 }, { "epoch": 6.90947666195191, "grad_norm": 0.0021657331380993128, "learning_rate": 0.21114679066345707, "loss": 0.2113, "num_input_tokens_seen": 12695456, "step": 14655 }, { "epoch": 6.911834040546912, "grad_norm": 0.0011278849560767412, "learning_rate": 0.21109299754239993, "loss": 0.1852, "num_input_tokens_seen": 12699312, "step": 14660 }, { "epoch": 6.914191419141914, "grad_norm": 0.00229400722309947, "learning_rate": 0.21103919500003482, "loss": 0.3198, "num_input_tokens_seen": 12704096, "step": 14665 }, { "epoch": 6.916548797736916, "grad_norm": 0.0010657842503860593, "learning_rate": 0.21098538304465872, "loss": 0.2192, "num_input_tokens_seen": 12708160, "step": 14670 }, { "epoch": 6.918906176331919, "grad_norm": 0.0022248895838856697, "learning_rate": 0.2109315616845702, "loss": 0.2231, "num_input_tokens_seen": 12712096, "step": 14675 }, { "epoch": 6.921263554926921, "grad_norm": 0.0015682694502174854, "learning_rate": 0.21087773092806925, "loss": 0.2298, "num_input_tokens_seen": 12716304, "step": 14680 }, { "epoch": 6.923620933521923, "grad_norm": 0.0032894080504775047, "learning_rate": 0.21082389078345704, "loss": 0.2376, "num_input_tokens_seen": 12719920, "step": 14685 }, { "epoch": 6.925978312116926, "grad_norm": 0.0013068506959825754, "learning_rate": 0.2107700412590365, "loss": 0.2202, "num_input_tokens_seen": 12724480, "step": 14690 }, { "epoch": 6.928335690711928, "grad_norm": 0.001223259954713285, "learning_rate": 0.210716182363112, "loss": 0.2388, "num_input_tokens_seen": 12728992, "step": 14695 }, { "epoch": 6.930693069306931, "grad_norm": 0.0009543129126541317, "learning_rate": 0.2106623141039891, "loss": 0.1979, "num_input_tokens_seen": 12733904, "step": 14700 }, { "epoch": 6.933050447901933, "grad_norm": 0.0010462512727826834, "learning_rate": 0.21060843648997507, "loss": 0.2761, "num_input_tokens_seen": 12738368, "step": 14705 }, { "epoch": 6.935407826496935, "grad_norm": 0.000838327337987721, "learning_rate": 0.21055454952937844, "loss": 0.21, "num_input_tokens_seen": 12742016, "step": 14710 }, { "epoch": 6.937765205091938, "grad_norm": 0.005613154731690884, "learning_rate": 0.21050065323050937, "loss": 0.2231, "num_input_tokens_seen": 12746576, "step": 14715 }, { "epoch": 6.94012258368694, "grad_norm": 0.001659718924202025, "learning_rate": 0.21044674760167928, "loss": 0.2214, "num_input_tokens_seen": 12750432, "step": 14720 }, { "epoch": 6.942479962281943, "grad_norm": 0.0026571345515549183, "learning_rate": 0.210392832651201, "loss": 0.246, "num_input_tokens_seen": 12755312, "step": 14725 }, { "epoch": 6.944837340876945, "grad_norm": 0.0043987394310534, "learning_rate": 0.210338908387389, "loss": 0.2334, "num_input_tokens_seen": 12759312, "step": 14730 }, { "epoch": 6.947194719471947, "grad_norm": 0.014044170267879963, "learning_rate": 0.21028497481855912, "loss": 0.2466, "num_input_tokens_seen": 12763600, "step": 14735 }, { "epoch": 6.94955209806695, "grad_norm": 0.0014765411615371704, "learning_rate": 0.21023103195302847, "loss": 0.211, "num_input_tokens_seen": 12768080, "step": 14740 }, { "epoch": 6.951909476661952, "grad_norm": 0.0023165519814938307, "learning_rate": 0.21017707979911582, "loss": 0.2077, "num_input_tokens_seen": 12772352, "step": 14745 }, { "epoch": 6.954266855256955, "grad_norm": 0.0016585185658186674, "learning_rate": 0.21012311836514122, "loss": 0.1738, "num_input_tokens_seen": 12776560, "step": 14750 }, { "epoch": 6.956624233851956, "grad_norm": 0.0016081148060038686, "learning_rate": 0.21006914765942622, "loss": 0.1522, "num_input_tokens_seen": 12780880, "step": 14755 }, { "epoch": 6.958981612446959, "grad_norm": 0.0034324100706726313, "learning_rate": 0.2100151676902938, "loss": 0.325, "num_input_tokens_seen": 12785216, "step": 14760 }, { "epoch": 6.961338991041961, "grad_norm": 0.013912621885538101, "learning_rate": 0.2099611784660683, "loss": 0.2483, "num_input_tokens_seen": 12789984, "step": 14765 }, { "epoch": 6.963696369636963, "grad_norm": 0.005421780981123447, "learning_rate": 0.20990717999507552, "loss": 0.2252, "num_input_tokens_seen": 12794688, "step": 14770 }, { "epoch": 6.966053748231966, "grad_norm": 0.005960391368716955, "learning_rate": 0.20985317228564276, "loss": 0.2282, "num_input_tokens_seen": 12799440, "step": 14775 }, { "epoch": 6.968411126826968, "grad_norm": 0.0022668817546218634, "learning_rate": 0.20979915534609872, "loss": 0.2377, "num_input_tokens_seen": 12803856, "step": 14780 }, { "epoch": 6.970768505421971, "grad_norm": 0.0021234306041151285, "learning_rate": 0.20974512918477342, "loss": 0.2199, "num_input_tokens_seen": 12808256, "step": 14785 }, { "epoch": 6.973125884016973, "grad_norm": 0.0023528514429926872, "learning_rate": 0.2096910938099984, "loss": 0.2269, "num_input_tokens_seen": 12812272, "step": 14790 }, { "epoch": 6.975483262611975, "grad_norm": 0.006343354471027851, "learning_rate": 0.2096370492301066, "loss": 0.2338, "num_input_tokens_seen": 12816640, "step": 14795 }, { "epoch": 6.977840641206978, "grad_norm": 0.00608060834929347, "learning_rate": 0.2095829954534323, "loss": 0.2388, "num_input_tokens_seen": 12821280, "step": 14800 }, { "epoch": 6.977840641206978, "eval_loss": 0.23311565816402435, "eval_runtime": 21.9006, "eval_samples_per_second": 43.058, "eval_steps_per_second": 21.552, "num_input_tokens_seen": 12821280, "step": 14800 }, { "epoch": 6.98019801980198, "grad_norm": 0.00808115117251873, "learning_rate": 0.2095289324883114, "loss": 0.2834, "num_input_tokens_seen": 12826720, "step": 14805 }, { "epoch": 6.982555398396983, "grad_norm": 0.0036127918865531683, "learning_rate": 0.20947486034308097, "loss": 0.2245, "num_input_tokens_seen": 12831536, "step": 14810 }, { "epoch": 6.984912776991985, "grad_norm": 0.0026020666118711233, "learning_rate": 0.2094207790260797, "loss": 0.2303, "num_input_tokens_seen": 12835536, "step": 14815 }, { "epoch": 6.987270155586987, "grad_norm": 0.005497944075614214, "learning_rate": 0.20936668854564758, "loss": 0.2206, "num_input_tokens_seen": 12839744, "step": 14820 }, { "epoch": 6.98962753418199, "grad_norm": 0.004506970755755901, "learning_rate": 0.20931258891012602, "loss": 0.2436, "num_input_tokens_seen": 12843856, "step": 14825 }, { "epoch": 6.991984912776992, "grad_norm": 0.005029045511037111, "learning_rate": 0.20925848012785792, "loss": 0.2386, "num_input_tokens_seen": 12848464, "step": 14830 }, { "epoch": 6.994342291371995, "grad_norm": 0.0025118188932538033, "learning_rate": 0.20920436220718747, "loss": 0.2251, "num_input_tokens_seen": 12852928, "step": 14835 }, { "epoch": 6.996699669966997, "grad_norm": 0.0025161406956613064, "learning_rate": 0.20915023515646033, "loss": 0.165, "num_input_tokens_seen": 12858096, "step": 14840 }, { "epoch": 6.999057048561999, "grad_norm": 0.0015647062100470066, "learning_rate": 0.20909609898402368, "loss": 0.123, "num_input_tokens_seen": 12861712, "step": 14845 }, { "epoch": 7.001414427157002, "grad_norm": 0.002299858955666423, "learning_rate": 0.2090419536982258, "loss": 0.2385, "num_input_tokens_seen": 12865632, "step": 14850 }, { "epoch": 7.003771805752004, "grad_norm": 0.00412387540563941, "learning_rate": 0.2089877993074168, "loss": 0.2773, "num_input_tokens_seen": 12870192, "step": 14855 }, { "epoch": 7.006129184347006, "grad_norm": 0.002483497140929103, "learning_rate": 0.20893363581994784, "loss": 0.2562, "num_input_tokens_seen": 12875152, "step": 14860 }, { "epoch": 7.008486562942008, "grad_norm": 0.0029971140902489424, "learning_rate": 0.2088794632441716, "loss": 0.2142, "num_input_tokens_seen": 12879376, "step": 14865 }, { "epoch": 7.0108439415370105, "grad_norm": 0.0053238715045154095, "learning_rate": 0.20882528158844219, "loss": 0.2217, "num_input_tokens_seen": 12884176, "step": 14870 }, { "epoch": 7.013201320132013, "grad_norm": 0.0021073271054774523, "learning_rate": 0.20877109086111514, "loss": 0.1911, "num_input_tokens_seen": 12889568, "step": 14875 }, { "epoch": 7.015558698727015, "grad_norm": 0.0026726997457444668, "learning_rate": 0.2087168910705473, "loss": 0.2268, "num_input_tokens_seen": 12893808, "step": 14880 }, { "epoch": 7.017916077322018, "grad_norm": 0.0016719794366508722, "learning_rate": 0.208662682225097, "loss": 0.2516, "num_input_tokens_seen": 12899184, "step": 14885 }, { "epoch": 7.02027345591702, "grad_norm": 0.0018244393868371844, "learning_rate": 0.2086084643331239, "loss": 0.2525, "num_input_tokens_seen": 12903568, "step": 14890 }, { "epoch": 7.0226308345120225, "grad_norm": 0.0029970731120556593, "learning_rate": 0.20855423740298906, "loss": 0.2298, "num_input_tokens_seen": 12907696, "step": 14895 }, { "epoch": 7.024988213107025, "grad_norm": 0.002714364090934396, "learning_rate": 0.208500001443055, "loss": 0.4703, "num_input_tokens_seen": 12912304, "step": 14900 }, { "epoch": 7.027345591702027, "grad_norm": 0.0015578607562929392, "learning_rate": 0.20844575646168553, "loss": 0.2211, "num_input_tokens_seen": 12916256, "step": 14905 }, { "epoch": 7.02970297029703, "grad_norm": 0.002424622653052211, "learning_rate": 0.20839150246724594, "loss": 0.2732, "num_input_tokens_seen": 12920992, "step": 14910 }, { "epoch": 7.032060348892032, "grad_norm": 0.0039414758794009686, "learning_rate": 0.20833723946810287, "loss": 0.2453, "num_input_tokens_seen": 12925024, "step": 14915 }, { "epoch": 7.0344177274870345, "grad_norm": 0.007846972905099392, "learning_rate": 0.20828296747262437, "loss": 0.236, "num_input_tokens_seen": 12929776, "step": 14920 }, { "epoch": 7.036775106082037, "grad_norm": 0.004184902645647526, "learning_rate": 0.20822868648917986, "loss": 0.2526, "num_input_tokens_seen": 12934096, "step": 14925 }, { "epoch": 7.039132484677039, "grad_norm": 0.0015606291126459837, "learning_rate": 0.20817439652614017, "loss": 0.2079, "num_input_tokens_seen": 12938080, "step": 14930 }, { "epoch": 7.041489863272042, "grad_norm": 0.002824156079441309, "learning_rate": 0.20812009759187744, "loss": 0.2248, "num_input_tokens_seen": 12941616, "step": 14935 }, { "epoch": 7.043847241867044, "grad_norm": 0.001625856733880937, "learning_rate": 0.2080657896947653, "loss": 0.2499, "num_input_tokens_seen": 12945440, "step": 14940 }, { "epoch": 7.0462046204620465, "grad_norm": 0.0014425426488742232, "learning_rate": 0.2080114728431787, "loss": 0.2102, "num_input_tokens_seen": 12949104, "step": 14945 }, { "epoch": 7.048561999057049, "grad_norm": 0.0017395182512700558, "learning_rate": 0.20795714704549392, "loss": 0.2073, "num_input_tokens_seen": 12953056, "step": 14950 }, { "epoch": 7.050919377652051, "grad_norm": 0.00194422691129148, "learning_rate": 0.20790281231008875, "loss": 0.2192, "num_input_tokens_seen": 12957632, "step": 14955 }, { "epoch": 7.053276756247053, "grad_norm": 0.0013639620738103986, "learning_rate": 0.20784846864534226, "loss": 0.2307, "num_input_tokens_seen": 12961408, "step": 14960 }, { "epoch": 7.055634134842055, "grad_norm": 0.00409960001707077, "learning_rate": 0.20779411605963496, "loss": 0.2073, "num_input_tokens_seen": 12966352, "step": 14965 }, { "epoch": 7.057991513437058, "grad_norm": 0.0013747570337727666, "learning_rate": 0.2077397545613487, "loss": 0.2359, "num_input_tokens_seen": 12970688, "step": 14970 }, { "epoch": 7.06034889203206, "grad_norm": 0.0012778989039361477, "learning_rate": 0.20768538415886661, "loss": 0.2199, "num_input_tokens_seen": 12974736, "step": 14975 }, { "epoch": 7.0627062706270625, "grad_norm": 0.0011710308026522398, "learning_rate": 0.20763100486057343, "loss": 0.2265, "num_input_tokens_seen": 12979712, "step": 14980 }, { "epoch": 7.065063649222065, "grad_norm": 0.0013039769837632775, "learning_rate": 0.20757661667485502, "loss": 0.2273, "num_input_tokens_seen": 12983488, "step": 14985 }, { "epoch": 7.067421027817067, "grad_norm": 0.0015102979959920049, "learning_rate": 0.2075222196100988, "loss": 0.2378, "num_input_tokens_seen": 12987936, "step": 14990 }, { "epoch": 7.06977840641207, "grad_norm": 0.0020068527664989233, "learning_rate": 0.20746781367469344, "loss": 0.2424, "num_input_tokens_seen": 12992608, "step": 14995 }, { "epoch": 7.072135785007072, "grad_norm": 0.002226723125204444, "learning_rate": 0.207413398877029, "loss": 0.2421, "num_input_tokens_seen": 12996208, "step": 15000 }, { "epoch": 7.072135785007072, "eval_loss": 0.2265898436307907, "eval_runtime": 21.9566, "eval_samples_per_second": 42.948, "eval_steps_per_second": 21.497, "num_input_tokens_seen": 12996208, "step": 15000 }, { "epoch": 7.0744931636020745, "grad_norm": 0.002201343420892954, "learning_rate": 0.20735897522549698, "loss": 0.2196, "num_input_tokens_seen": 13000144, "step": 15005 }, { "epoch": 7.076850542197077, "grad_norm": 0.002531293546780944, "learning_rate": 0.2073045427284902, "loss": 0.2313, "num_input_tokens_seen": 13004544, "step": 15010 }, { "epoch": 7.079207920792079, "grad_norm": 0.0018848689505830407, "learning_rate": 0.2072501013944027, "loss": 0.196, "num_input_tokens_seen": 13009456, "step": 15015 }, { "epoch": 7.081565299387082, "grad_norm": 0.0008991016657091677, "learning_rate": 0.20719565123163017, "loss": 0.1895, "num_input_tokens_seen": 13014432, "step": 15020 }, { "epoch": 7.083922677982084, "grad_norm": 0.003975799772888422, "learning_rate": 0.20714119224856944, "loss": 0.276, "num_input_tokens_seen": 13018912, "step": 15025 }, { "epoch": 7.0862800565770865, "grad_norm": 0.0010439081816002727, "learning_rate": 0.2070867244536188, "loss": 0.2686, "num_input_tokens_seen": 13024064, "step": 15030 }, { "epoch": 7.088637435172089, "grad_norm": 0.0022074102889746428, "learning_rate": 0.20703224785517785, "loss": 0.229, "num_input_tokens_seen": 13028016, "step": 15035 }, { "epoch": 7.090994813767091, "grad_norm": 0.0019474757136777043, "learning_rate": 0.20697776246164754, "loss": 0.2308, "num_input_tokens_seen": 13032672, "step": 15040 }, { "epoch": 7.093352192362094, "grad_norm": 0.0017951992340385914, "learning_rate": 0.2069232682814303, "loss": 0.2393, "num_input_tokens_seen": 13036544, "step": 15045 }, { "epoch": 7.095709570957096, "grad_norm": 0.0015423892764374614, "learning_rate": 0.20686876532292972, "loss": 0.227, "num_input_tokens_seen": 13040464, "step": 15050 }, { "epoch": 7.0980669495520985, "grad_norm": 0.001461912994273007, "learning_rate": 0.20681425359455083, "loss": 0.2193, "num_input_tokens_seen": 13044368, "step": 15055 }, { "epoch": 7.1004243281471, "grad_norm": 0.0014030665624886751, "learning_rate": 0.20675973310470008, "loss": 0.258, "num_input_tokens_seen": 13048928, "step": 15060 }, { "epoch": 7.102781706742102, "grad_norm": 0.0014632578240707517, "learning_rate": 0.2067052038617852, "loss": 0.2541, "num_input_tokens_seen": 13053024, "step": 15065 }, { "epoch": 7.105139085337105, "grad_norm": 0.0026318735908716917, "learning_rate": 0.2066506658742153, "loss": 0.237, "num_input_tokens_seen": 13057888, "step": 15070 }, { "epoch": 7.107496463932107, "grad_norm": 0.002468305639922619, "learning_rate": 0.20659611915040077, "loss": 0.2297, "num_input_tokens_seen": 13061984, "step": 15075 }, { "epoch": 7.10985384252711, "grad_norm": 0.002756178379058838, "learning_rate": 0.20654156369875348, "loss": 0.2328, "num_input_tokens_seen": 13066496, "step": 15080 }, { "epoch": 7.112211221122112, "grad_norm": 0.0028739015106111765, "learning_rate": 0.20648699952768648, "loss": 0.2524, "num_input_tokens_seen": 13070288, "step": 15085 }, { "epoch": 7.114568599717114, "grad_norm": 0.0043462240137159824, "learning_rate": 0.20643242664561437, "loss": 0.2334, "num_input_tokens_seen": 13074800, "step": 15090 }, { "epoch": 7.116925978312117, "grad_norm": 0.001739034429192543, "learning_rate": 0.20637784506095277, "loss": 0.2069, "num_input_tokens_seen": 13079808, "step": 15095 }, { "epoch": 7.119283356907119, "grad_norm": 0.001652404898777604, "learning_rate": 0.20632325478211908, "loss": 0.2215, "num_input_tokens_seen": 13083824, "step": 15100 }, { "epoch": 7.121640735502122, "grad_norm": 0.0013313082745298743, "learning_rate": 0.20626865581753165, "loss": 0.2037, "num_input_tokens_seen": 13088368, "step": 15105 }, { "epoch": 7.123998114097124, "grad_norm": 0.0029345087241381407, "learning_rate": 0.2062140481756104, "loss": 0.2222, "num_input_tokens_seen": 13092624, "step": 15110 }, { "epoch": 7.126355492692126, "grad_norm": 0.0025416959542781115, "learning_rate": 0.20615943186477648, "loss": 0.254, "num_input_tokens_seen": 13096400, "step": 15115 }, { "epoch": 7.128712871287129, "grad_norm": 0.003997765481472015, "learning_rate": 0.20610480689345242, "loss": 0.2425, "num_input_tokens_seen": 13100448, "step": 15120 }, { "epoch": 7.131070249882131, "grad_norm": 0.002323502441868186, "learning_rate": 0.2060501732700621, "loss": 0.2367, "num_input_tokens_seen": 13105232, "step": 15125 }, { "epoch": 7.133427628477134, "grad_norm": 0.0020868442952632904, "learning_rate": 0.20599553100303067, "loss": 0.2295, "num_input_tokens_seen": 13110208, "step": 15130 }, { "epoch": 7.135785007072136, "grad_norm": 0.001666162395849824, "learning_rate": 0.20594088010078465, "loss": 0.2222, "num_input_tokens_seen": 13114800, "step": 15135 }, { "epoch": 7.138142385667138, "grad_norm": 0.00163518195040524, "learning_rate": 0.20588622057175196, "loss": 0.1929, "num_input_tokens_seen": 13119792, "step": 15140 }, { "epoch": 7.140499764262141, "grad_norm": 0.0010642927372828126, "learning_rate": 0.20583155242436177, "loss": 0.2236, "num_input_tokens_seen": 13123840, "step": 15145 }, { "epoch": 7.142857142857143, "grad_norm": 0.0034376857802271843, "learning_rate": 0.20577687566704453, "loss": 0.2836, "num_input_tokens_seen": 13127760, "step": 15150 }, { "epoch": 7.145214521452146, "grad_norm": 0.0010964744724333286, "learning_rate": 0.20572219030823213, "loss": 0.2193, "num_input_tokens_seen": 13133360, "step": 15155 }, { "epoch": 7.147571900047147, "grad_norm": 0.0014615522231906652, "learning_rate": 0.20566749635635775, "loss": 0.2151, "num_input_tokens_seen": 13137392, "step": 15160 }, { "epoch": 7.1499292786421496, "grad_norm": 0.001081032445654273, "learning_rate": 0.20561279381985587, "loss": 0.2387, "num_input_tokens_seen": 13142160, "step": 15165 }, { "epoch": 7.152286657237152, "grad_norm": 0.0021379422396421432, "learning_rate": 0.2055580827071623, "loss": 0.2357, "num_input_tokens_seen": 13148080, "step": 15170 }, { "epoch": 7.154644035832154, "grad_norm": 0.0010404668282717466, "learning_rate": 0.20550336302671418, "loss": 0.2334, "num_input_tokens_seen": 13151632, "step": 15175 }, { "epoch": 7.157001414427157, "grad_norm": 0.0011815376346930861, "learning_rate": 0.20544863478695, "loss": 0.2374, "num_input_tokens_seen": 13155424, "step": 15180 }, { "epoch": 7.159358793022159, "grad_norm": 0.0014085306320339441, "learning_rate": 0.20539389799630953, "loss": 0.2236, "num_input_tokens_seen": 13160064, "step": 15185 }, { "epoch": 7.161716171617162, "grad_norm": 0.0038801319897174835, "learning_rate": 0.20533915266323388, "loss": 0.2464, "num_input_tokens_seen": 13164576, "step": 15190 }, { "epoch": 7.164073550212164, "grad_norm": 0.0031731300987303257, "learning_rate": 0.20528439879616542, "loss": 0.2606, "num_input_tokens_seen": 13168512, "step": 15195 }, { "epoch": 7.166430928807166, "grad_norm": 0.0011300339829176664, "learning_rate": 0.20522963640354794, "loss": 0.2458, "num_input_tokens_seen": 13172592, "step": 15200 }, { "epoch": 7.166430928807166, "eval_loss": 0.24147500097751617, "eval_runtime": 21.9191, "eval_samples_per_second": 43.022, "eval_steps_per_second": 21.534, "num_input_tokens_seen": 13172592, "step": 15200 }, { "epoch": 7.168788307402169, "grad_norm": 0.0028294268995523453, "learning_rate": 0.20517486549382644, "loss": 0.2612, "num_input_tokens_seen": 13177312, "step": 15205 }, { "epoch": 7.171145685997171, "grad_norm": 0.0022586060222238302, "learning_rate": 0.20512008607544735, "loss": 0.232, "num_input_tokens_seen": 13181936, "step": 15210 }, { "epoch": 7.173503064592174, "grad_norm": 0.0018090792000293732, "learning_rate": 0.20506529815685826, "loss": 0.2198, "num_input_tokens_seen": 13186176, "step": 15215 }, { "epoch": 7.175860443187176, "grad_norm": 0.0023372580762952566, "learning_rate": 0.2050105017465082, "loss": 0.2493, "num_input_tokens_seen": 13189872, "step": 15220 }, { "epoch": 7.178217821782178, "grad_norm": 0.0014842526288703084, "learning_rate": 0.20495569685284754, "loss": 0.2412, "num_input_tokens_seen": 13194272, "step": 15225 }, { "epoch": 7.180575200377181, "grad_norm": 0.0014635034603998065, "learning_rate": 0.20490088348432778, "loss": 0.2303, "num_input_tokens_seen": 13198128, "step": 15230 }, { "epoch": 7.182932578972183, "grad_norm": 0.0013976584887132049, "learning_rate": 0.2048460616494018, "loss": 0.2173, "num_input_tokens_seen": 13203216, "step": 15235 }, { "epoch": 7.185289957567186, "grad_norm": 0.002899093320593238, "learning_rate": 0.2047912313565239, "loss": 0.2193, "num_input_tokens_seen": 13207744, "step": 15240 }, { "epoch": 7.187647336162188, "grad_norm": 0.0010404656641185284, "learning_rate": 0.20473639261414958, "loss": 0.2126, "num_input_tokens_seen": 13211904, "step": 15245 }, { "epoch": 7.19000471475719, "grad_norm": 0.0026291217654943466, "learning_rate": 0.2046815454307357, "loss": 0.2498, "num_input_tokens_seen": 13216208, "step": 15250 }, { "epoch": 7.192362093352193, "grad_norm": 0.001881828997284174, "learning_rate": 0.20462668981474028, "loss": 0.2451, "num_input_tokens_seen": 13220704, "step": 15255 }, { "epoch": 7.194719471947194, "grad_norm": 0.0011119935661554337, "learning_rate": 0.20457182577462288, "loss": 0.2155, "num_input_tokens_seen": 13225136, "step": 15260 }, { "epoch": 7.197076850542197, "grad_norm": 0.001778302830643952, "learning_rate": 0.2045169533188441, "loss": 0.2327, "num_input_tokens_seen": 13229264, "step": 15265 }, { "epoch": 7.199434229137199, "grad_norm": 0.0009732965263538063, "learning_rate": 0.20446207245586603, "loss": 0.237, "num_input_tokens_seen": 13233040, "step": 15270 }, { "epoch": 7.2017916077322015, "grad_norm": 0.0016485468950122595, "learning_rate": 0.20440718319415196, "loss": 0.233, "num_input_tokens_seen": 13237344, "step": 15275 }, { "epoch": 7.204148986327204, "grad_norm": 0.0018658199114724994, "learning_rate": 0.20435228554216653, "loss": 0.2385, "num_input_tokens_seen": 13242640, "step": 15280 }, { "epoch": 7.206506364922206, "grad_norm": 0.0020858540665358305, "learning_rate": 0.20429737950837565, "loss": 0.2291, "num_input_tokens_seen": 13247008, "step": 15285 }, { "epoch": 7.208863743517209, "grad_norm": 0.0015889480710029602, "learning_rate": 0.20424246510124647, "loss": 0.1571, "num_input_tokens_seen": 13251072, "step": 15290 }, { "epoch": 7.211221122112211, "grad_norm": 0.0006653518066741526, "learning_rate": 0.20418754232924755, "loss": 0.24, "num_input_tokens_seen": 13254960, "step": 15295 }, { "epoch": 7.2135785007072135, "grad_norm": 0.001337501802481711, "learning_rate": 0.20413261120084863, "loss": 0.2148, "num_input_tokens_seen": 13258992, "step": 15300 }, { "epoch": 7.215935879302216, "grad_norm": 0.0014349065022543073, "learning_rate": 0.2040776717245208, "loss": 0.2111, "num_input_tokens_seen": 13262880, "step": 15305 }, { "epoch": 7.218293257897218, "grad_norm": 0.000719567877240479, "learning_rate": 0.2040227239087364, "loss": 0.266, "num_input_tokens_seen": 13267456, "step": 15310 }, { "epoch": 7.220650636492221, "grad_norm": 0.0014087497256696224, "learning_rate": 0.20396776776196904, "loss": 0.2092, "num_input_tokens_seen": 13271776, "step": 15315 }, { "epoch": 7.223008015087223, "grad_norm": 0.0010972082382068038, "learning_rate": 0.20391280329269373, "loss": 0.2005, "num_input_tokens_seen": 13275856, "step": 15320 }, { "epoch": 7.2253653936822255, "grad_norm": 0.0009513216791674495, "learning_rate": 0.20385783050938663, "loss": 0.2003, "num_input_tokens_seen": 13279872, "step": 15325 }, { "epoch": 7.227722772277228, "grad_norm": 0.0010587811702862382, "learning_rate": 0.20380284942052526, "loss": 0.2362, "num_input_tokens_seen": 13283968, "step": 15330 }, { "epoch": 7.23008015087223, "grad_norm": 0.0017593813827261329, "learning_rate": 0.2037478600345884, "loss": 0.2135, "num_input_tokens_seen": 13288160, "step": 15335 }, { "epoch": 7.232437529467233, "grad_norm": 0.0010429348330944777, "learning_rate": 0.20369286236005604, "loss": 0.2352, "num_input_tokens_seen": 13292944, "step": 15340 }, { "epoch": 7.234794908062235, "grad_norm": 0.0012294008629396558, "learning_rate": 0.20363785640540957, "loss": 0.2075, "num_input_tokens_seen": 13296864, "step": 15345 }, { "epoch": 7.2371522866572375, "grad_norm": 0.0008983141160570085, "learning_rate": 0.2035828421791316, "loss": 0.2271, "num_input_tokens_seen": 13301744, "step": 15350 }, { "epoch": 7.23950966525224, "grad_norm": 0.0015812971396371722, "learning_rate": 0.20352781968970599, "loss": 0.1815, "num_input_tokens_seen": 13305488, "step": 15355 }, { "epoch": 7.2418670438472414, "grad_norm": 0.0013623128179460764, "learning_rate": 0.2034727889456179, "loss": 0.1957, "num_input_tokens_seen": 13310352, "step": 15360 }, { "epoch": 7.244224422442244, "grad_norm": 0.00042585001210682094, "learning_rate": 0.2034177499553538, "loss": 0.1887, "num_input_tokens_seen": 13314096, "step": 15365 }, { "epoch": 7.246581801037246, "grad_norm": 0.0015833373181521893, "learning_rate": 0.2033627027274014, "loss": 0.1882, "num_input_tokens_seen": 13318064, "step": 15370 }, { "epoch": 7.248939179632249, "grad_norm": 0.001885665813460946, "learning_rate": 0.20330764727024955, "loss": 0.2861, "num_input_tokens_seen": 13322304, "step": 15375 }, { "epoch": 7.251296558227251, "grad_norm": 0.0012640377972275019, "learning_rate": 0.20325258359238868, "loss": 0.2839, "num_input_tokens_seen": 13326688, "step": 15380 }, { "epoch": 7.2536539368222535, "grad_norm": 0.001171878888271749, "learning_rate": 0.20319751170231018, "loss": 0.2491, "num_input_tokens_seen": 13330816, "step": 15385 }, { "epoch": 7.256011315417256, "grad_norm": 0.0014559588162228465, "learning_rate": 0.2031424316085068, "loss": 0.2344, "num_input_tokens_seen": 13335232, "step": 15390 }, { "epoch": 7.258368694012258, "grad_norm": 0.001987034222111106, "learning_rate": 0.20308734331947265, "loss": 0.2842, "num_input_tokens_seen": 13339120, "step": 15395 }, { "epoch": 7.260726072607261, "grad_norm": 0.001524219405837357, "learning_rate": 0.20303224684370305, "loss": 0.2424, "num_input_tokens_seen": 13342864, "step": 15400 }, { "epoch": 7.260726072607261, "eval_loss": 0.23568521440029144, "eval_runtime": 21.8976, "eval_samples_per_second": 43.064, "eval_steps_per_second": 21.555, "num_input_tokens_seen": 13342864, "step": 15400 }, { "epoch": 7.263083451202263, "grad_norm": 0.0008001226815395057, "learning_rate": 0.20297714218969456, "loss": 0.2263, "num_input_tokens_seen": 13347184, "step": 15405 }, { "epoch": 7.2654408297972655, "grad_norm": 0.001295391470193863, "learning_rate": 0.20292202936594497, "loss": 0.2194, "num_input_tokens_seen": 13352176, "step": 15410 }, { "epoch": 7.267798208392268, "grad_norm": 0.00221432582475245, "learning_rate": 0.2028669083809534, "loss": 0.2495, "num_input_tokens_seen": 13356480, "step": 15415 }, { "epoch": 7.27015558698727, "grad_norm": 0.0011023107217624784, "learning_rate": 0.20281177924322016, "loss": 0.2179, "num_input_tokens_seen": 13360720, "step": 15420 }, { "epoch": 7.272512965582273, "grad_norm": 0.0017586336471140385, "learning_rate": 0.2027566419612469, "loss": 0.3001, "num_input_tokens_seen": 13365552, "step": 15425 }, { "epoch": 7.274870344177275, "grad_norm": 0.0010329211363568902, "learning_rate": 0.20270149654353647, "loss": 0.2358, "num_input_tokens_seen": 13369600, "step": 15430 }, { "epoch": 7.2772277227722775, "grad_norm": 0.002331542782485485, "learning_rate": 0.202646342998593, "loss": 0.2265, "num_input_tokens_seen": 13374576, "step": 15435 }, { "epoch": 7.27958510136728, "grad_norm": 0.0010810852982103825, "learning_rate": 0.20259118133492185, "loss": 0.2259, "num_input_tokens_seen": 13379056, "step": 15440 }, { "epoch": 7.281942479962282, "grad_norm": 0.0010518874041736126, "learning_rate": 0.20253601156102966, "loss": 0.232, "num_input_tokens_seen": 13384272, "step": 15445 }, { "epoch": 7.284299858557285, "grad_norm": 0.0010362756438553333, "learning_rate": 0.20248083368542422, "loss": 0.2115, "num_input_tokens_seen": 13387632, "step": 15450 }, { "epoch": 7.286657237152287, "grad_norm": 0.001184705994091928, "learning_rate": 0.2024256477166147, "loss": 0.2454, "num_input_tokens_seen": 13391408, "step": 15455 }, { "epoch": 7.2890146157472895, "grad_norm": 0.0011438924120739102, "learning_rate": 0.2023704536631115, "loss": 0.2011, "num_input_tokens_seen": 13396240, "step": 15460 }, { "epoch": 7.291371994342291, "grad_norm": 0.0011968269245699048, "learning_rate": 0.20231525153342625, "loss": 0.2455, "num_input_tokens_seen": 13400336, "step": 15465 }, { "epoch": 7.293729372937293, "grad_norm": 0.0008323107031174004, "learning_rate": 0.20226004133607173, "loss": 0.2113, "num_input_tokens_seen": 13404384, "step": 15470 }, { "epoch": 7.296086751532296, "grad_norm": 0.0011118457186967134, "learning_rate": 0.20220482307956214, "loss": 0.2453, "num_input_tokens_seen": 13408640, "step": 15475 }, { "epoch": 7.298444130127298, "grad_norm": 0.0008762665092945099, "learning_rate": 0.20214959677241276, "loss": 0.2246, "num_input_tokens_seen": 13412528, "step": 15480 }, { "epoch": 7.300801508722301, "grad_norm": 0.0017892989562824368, "learning_rate": 0.20209436242314022, "loss": 0.2393, "num_input_tokens_seen": 13416960, "step": 15485 }, { "epoch": 7.303158887317303, "grad_norm": 0.0017204215982928872, "learning_rate": 0.2020391200402623, "loss": 0.2256, "num_input_tokens_seen": 13421824, "step": 15490 }, { "epoch": 7.305516265912305, "grad_norm": 0.0021427995525300503, "learning_rate": 0.2019838696322981, "loss": 0.2207, "num_input_tokens_seen": 13426240, "step": 15495 }, { "epoch": 7.307873644507308, "grad_norm": 0.0010970188304781914, "learning_rate": 0.20192861120776798, "loss": 0.2153, "num_input_tokens_seen": 13430144, "step": 15500 }, { "epoch": 7.31023102310231, "grad_norm": 0.001152031240053475, "learning_rate": 0.20187334477519345, "loss": 0.2277, "num_input_tokens_seen": 13434208, "step": 15505 }, { "epoch": 7.312588401697313, "grad_norm": 0.0013564972905442119, "learning_rate": 0.20181807034309726, "loss": 0.2107, "num_input_tokens_seen": 13438896, "step": 15510 }, { "epoch": 7.314945780292315, "grad_norm": 0.0020762740168720484, "learning_rate": 0.2017627879200034, "loss": 0.2171, "num_input_tokens_seen": 13443152, "step": 15515 }, { "epoch": 7.317303158887317, "grad_norm": 0.0009688197751529515, "learning_rate": 0.2017074975144372, "loss": 0.2026, "num_input_tokens_seen": 13447216, "step": 15520 }, { "epoch": 7.31966053748232, "grad_norm": 0.0009530129027552903, "learning_rate": 0.20165219913492508, "loss": 0.2567, "num_input_tokens_seen": 13451360, "step": 15525 }, { "epoch": 7.322017916077322, "grad_norm": 0.0011105960002169013, "learning_rate": 0.20159689278999468, "loss": 0.2694, "num_input_tokens_seen": 13455584, "step": 15530 }, { "epoch": 7.324375294672325, "grad_norm": 0.0020682020112872124, "learning_rate": 0.20154157848817508, "loss": 0.2153, "num_input_tokens_seen": 13460160, "step": 15535 }, { "epoch": 7.326732673267327, "grad_norm": 0.00088716228492558, "learning_rate": 0.20148625623799632, "loss": 0.2431, "num_input_tokens_seen": 13463728, "step": 15540 }, { "epoch": 7.329090051862329, "grad_norm": 0.0007348372600972652, "learning_rate": 0.20143092604798984, "loss": 0.2407, "num_input_tokens_seen": 13468480, "step": 15545 }, { "epoch": 7.331447430457332, "grad_norm": 0.0008155219256877899, "learning_rate": 0.2013755879266883, "loss": 0.231, "num_input_tokens_seen": 13472608, "step": 15550 }, { "epoch": 7.333804809052333, "grad_norm": 0.0011456632055342197, "learning_rate": 0.20132024188262543, "loss": 0.2046, "num_input_tokens_seen": 13476944, "step": 15555 }, { "epoch": 7.336162187647336, "grad_norm": 0.0012141769984737039, "learning_rate": 0.2012648879243363, "loss": 0.2225, "num_input_tokens_seen": 13482192, "step": 15560 }, { "epoch": 7.338519566242338, "grad_norm": 0.0013456086162477732, "learning_rate": 0.20120952606035725, "loss": 0.2131, "num_input_tokens_seen": 13486128, "step": 15565 }, { "epoch": 7.3408769448373405, "grad_norm": 0.005772563628852367, "learning_rate": 0.20115415629922576, "loss": 0.1939, "num_input_tokens_seen": 13491056, "step": 15570 }, { "epoch": 7.343234323432343, "grad_norm": 0.001241033896803856, "learning_rate": 0.20109877864948048, "loss": 0.1871, "num_input_tokens_seen": 13494736, "step": 15575 }, { "epoch": 7.345591702027345, "grad_norm": 0.0014223781181499362, "learning_rate": 0.20104339311966138, "loss": 0.2944, "num_input_tokens_seen": 13498848, "step": 15580 }, { "epoch": 7.347949080622348, "grad_norm": 0.0012310738675296307, "learning_rate": 0.2009879997183097, "loss": 0.1879, "num_input_tokens_seen": 13502752, "step": 15585 }, { "epoch": 7.35030645921735, "grad_norm": 0.0038612382486462593, "learning_rate": 0.20093259845396763, "loss": 0.22, "num_input_tokens_seen": 13507296, "step": 15590 }, { "epoch": 7.3526638378123526, "grad_norm": 0.0010143116815015674, "learning_rate": 0.20087718933517884, "loss": 0.1975, "num_input_tokens_seen": 13511584, "step": 15595 }, { "epoch": 7.355021216407355, "grad_norm": 0.003364418400451541, "learning_rate": 0.20082177237048807, "loss": 0.2588, "num_input_tokens_seen": 13515600, "step": 15600 }, { "epoch": 7.355021216407355, "eval_loss": 0.2185654193162918, "eval_runtime": 21.9802, "eval_samples_per_second": 42.902, "eval_steps_per_second": 21.474, "num_input_tokens_seen": 13515600, "step": 15600 }, { "epoch": 7.357378595002357, "grad_norm": 0.0025536210741847754, "learning_rate": 0.20076634756844133, "loss": 0.207, "num_input_tokens_seen": 13520000, "step": 15605 }, { "epoch": 7.35973597359736, "grad_norm": 0.002411969704553485, "learning_rate": 0.20071091493758586, "loss": 0.2388, "num_input_tokens_seen": 13523840, "step": 15610 }, { "epoch": 7.362093352192362, "grad_norm": 0.00144630775321275, "learning_rate": 0.20065547448647003, "loss": 0.2317, "num_input_tokens_seen": 13528448, "step": 15615 }, { "epoch": 7.364450730787365, "grad_norm": 0.0011975698871538043, "learning_rate": 0.20060002622364348, "loss": 0.2207, "num_input_tokens_seen": 13533104, "step": 15620 }, { "epoch": 7.366808109382367, "grad_norm": 0.0012333836639299989, "learning_rate": 0.20054457015765695, "loss": 0.2094, "num_input_tokens_seen": 13537808, "step": 15625 }, { "epoch": 7.369165487977369, "grad_norm": 0.0016573378816246986, "learning_rate": 0.20048910629706254, "loss": 0.2351, "num_input_tokens_seen": 13541952, "step": 15630 }, { "epoch": 7.371522866572372, "grad_norm": 0.0017191787483170629, "learning_rate": 0.20043363465041347, "loss": 0.2182, "num_input_tokens_seen": 13546208, "step": 15635 }, { "epoch": 7.373880245167374, "grad_norm": 0.0035955270286649466, "learning_rate": 0.2003781552262641, "loss": 0.2804, "num_input_tokens_seen": 13550480, "step": 15640 }, { "epoch": 7.376237623762377, "grad_norm": 0.0011233547702431679, "learning_rate": 0.20032266803317014, "loss": 0.1973, "num_input_tokens_seen": 13554752, "step": 15645 }, { "epoch": 7.378595002357379, "grad_norm": 0.001515724929049611, "learning_rate": 0.2002671730796884, "loss": 0.2504, "num_input_tokens_seen": 13559696, "step": 15650 }, { "epoch": 7.380952380952381, "grad_norm": 0.0017262190813198686, "learning_rate": 0.20021167037437684, "loss": 0.2337, "num_input_tokens_seen": 13563856, "step": 15655 }, { "epoch": 7.383309759547384, "grad_norm": 0.0013095950707793236, "learning_rate": 0.20015615992579472, "loss": 0.2322, "num_input_tokens_seen": 13567760, "step": 15660 }, { "epoch": 7.385667138142385, "grad_norm": 0.00241126143373549, "learning_rate": 0.20010064174250244, "loss": 0.2137, "num_input_tokens_seen": 13572624, "step": 15665 }, { "epoch": 7.388024516737388, "grad_norm": 0.0012708373833447695, "learning_rate": 0.2000451158330616, "loss": 0.2077, "num_input_tokens_seen": 13577088, "step": 15670 }, { "epoch": 7.39038189533239, "grad_norm": 0.0013766729971393943, "learning_rate": 0.199989582206035, "loss": 0.2414, "num_input_tokens_seen": 13580704, "step": 15675 }, { "epoch": 7.3927392739273925, "grad_norm": 0.0015739204827696085, "learning_rate": 0.1999340408699866, "loss": 0.1947, "num_input_tokens_seen": 13584752, "step": 15680 }, { "epoch": 7.395096652522395, "grad_norm": 0.0011760651832446456, "learning_rate": 0.19987849183348155, "loss": 0.2395, "num_input_tokens_seen": 13589504, "step": 15685 }, { "epoch": 7.397454031117397, "grad_norm": 0.0019183579133823514, "learning_rate": 0.19982293510508628, "loss": 0.2495, "num_input_tokens_seen": 13593312, "step": 15690 }, { "epoch": 7.3998114097124, "grad_norm": 0.001145180081948638, "learning_rate": 0.19976737069336833, "loss": 0.2, "num_input_tokens_seen": 13597968, "step": 15695 }, { "epoch": 7.402168788307402, "grad_norm": 0.001067962497472763, "learning_rate": 0.1997117986068964, "loss": 0.2107, "num_input_tokens_seen": 13602448, "step": 15700 }, { "epoch": 7.4045261669024045, "grad_norm": 0.0013458176981657743, "learning_rate": 0.19965621885424037, "loss": 0.2008, "num_input_tokens_seen": 13606912, "step": 15705 }, { "epoch": 7.406883545497407, "grad_norm": 0.0009260313236154616, "learning_rate": 0.19960063144397142, "loss": 0.2474, "num_input_tokens_seen": 13610944, "step": 15710 }, { "epoch": 7.409240924092409, "grad_norm": 0.0012564401840791106, "learning_rate": 0.19954503638466176, "loss": 0.2496, "num_input_tokens_seen": 13615184, "step": 15715 }, { "epoch": 7.411598302687412, "grad_norm": 0.0012379857944324613, "learning_rate": 0.1994894336848848, "loss": 0.2025, "num_input_tokens_seen": 13619440, "step": 15720 }, { "epoch": 7.413955681282414, "grad_norm": 0.0009774168720468879, "learning_rate": 0.1994338233532153, "loss": 0.2172, "num_input_tokens_seen": 13623152, "step": 15725 }, { "epoch": 7.4163130598774165, "grad_norm": 0.0011851703748106956, "learning_rate": 0.19937820539822904, "loss": 0.2465, "num_input_tokens_seen": 13627936, "step": 15730 }, { "epoch": 7.418670438472419, "grad_norm": 0.0018155736615881324, "learning_rate": 0.199322579828503, "loss": 0.2547, "num_input_tokens_seen": 13631968, "step": 15735 }, { "epoch": 7.421027817067421, "grad_norm": 0.0025571875739842653, "learning_rate": 0.19926694665261527, "loss": 0.2201, "num_input_tokens_seen": 13636576, "step": 15740 }, { "epoch": 7.423385195662424, "grad_norm": 0.0005860735545866191, "learning_rate": 0.19921130587914526, "loss": 0.2325, "num_input_tokens_seen": 13640688, "step": 15745 }, { "epoch": 7.425742574257426, "grad_norm": 0.0009464659378863871, "learning_rate": 0.19915565751667344, "loss": 0.2276, "num_input_tokens_seen": 13644768, "step": 15750 }, { "epoch": 7.428099952852428, "grad_norm": 0.0015194168081507087, "learning_rate": 0.19910000157378152, "loss": 0.1962, "num_input_tokens_seen": 13649040, "step": 15755 }, { "epoch": 7.43045733144743, "grad_norm": 0.0005551839130930603, "learning_rate": 0.1990443380590523, "loss": 0.1946, "num_input_tokens_seen": 13653344, "step": 15760 }, { "epoch": 7.432814710042432, "grad_norm": 0.0019931152928620577, "learning_rate": 0.19898866698106984, "loss": 0.2267, "num_input_tokens_seen": 13657696, "step": 15765 }, { "epoch": 7.435172088637435, "grad_norm": 0.000997345196083188, "learning_rate": 0.19893298834841933, "loss": 0.1653, "num_input_tokens_seen": 13661696, "step": 15770 }, { "epoch": 7.437529467232437, "grad_norm": 0.0004948952118866146, "learning_rate": 0.19887730216968705, "loss": 0.2821, "num_input_tokens_seen": 13666240, "step": 15775 }, { "epoch": 7.43988684582744, "grad_norm": 0.0011417437344789505, "learning_rate": 0.19882160845346053, "loss": 0.1867, "num_input_tokens_seen": 13670944, "step": 15780 }, { "epoch": 7.442244224422442, "grad_norm": 0.0009144588257186115, "learning_rate": 0.1987659072083285, "loss": 0.176, "num_input_tokens_seen": 13676320, "step": 15785 }, { "epoch": 7.4446016030174444, "grad_norm": 0.0012812716886401176, "learning_rate": 0.1987101984428807, "loss": 0.2066, "num_input_tokens_seen": 13680272, "step": 15790 }, { "epoch": 7.446958981612447, "grad_norm": 0.0007047458202578127, "learning_rate": 0.19865448216570822, "loss": 0.1903, "num_input_tokens_seen": 13684880, "step": 15795 }, { "epoch": 7.449316360207449, "grad_norm": 0.003035178640857339, "learning_rate": 0.19859875838540317, "loss": 0.2608, "num_input_tokens_seen": 13688640, "step": 15800 }, { "epoch": 7.449316360207449, "eval_loss": 0.2198134958744049, "eval_runtime": 21.921, "eval_samples_per_second": 43.018, "eval_steps_per_second": 21.532, "num_input_tokens_seen": 13688640, "step": 15800 }, { "epoch": 7.451673738802452, "grad_norm": 0.0019790607038885355, "learning_rate": 0.1985430271105588, "loss": 0.2624, "num_input_tokens_seen": 13693616, "step": 15805 }, { "epoch": 7.454031117397454, "grad_norm": 0.0008906322764232755, "learning_rate": 0.19848728834976961, "loss": 0.211, "num_input_tokens_seen": 13697392, "step": 15810 }, { "epoch": 7.4563884959924565, "grad_norm": 0.0006739745731465518, "learning_rate": 0.19843154211163128, "loss": 0.2187, "num_input_tokens_seen": 13702080, "step": 15815 }, { "epoch": 7.458745874587459, "grad_norm": 0.0010772119276225567, "learning_rate": 0.1983757884047405, "loss": 0.2297, "num_input_tokens_seen": 13706192, "step": 15820 }, { "epoch": 7.461103253182461, "grad_norm": 0.0006713903858326375, "learning_rate": 0.1983200272376952, "loss": 0.2225, "num_input_tokens_seen": 13710528, "step": 15825 }, { "epoch": 7.463460631777464, "grad_norm": 0.001854804577305913, "learning_rate": 0.1982642586190945, "loss": 0.2074, "num_input_tokens_seen": 13714736, "step": 15830 }, { "epoch": 7.465818010372466, "grad_norm": 0.0017706318758428097, "learning_rate": 0.1982084825575386, "loss": 0.1985, "num_input_tokens_seen": 13719360, "step": 15835 }, { "epoch": 7.4681753889674685, "grad_norm": 0.0009043480968102813, "learning_rate": 0.19815269906162883, "loss": 0.1822, "num_input_tokens_seen": 13722864, "step": 15840 }, { "epoch": 7.470532767562471, "grad_norm": 0.0007522740634158254, "learning_rate": 0.19809690813996775, "loss": 0.2615, "num_input_tokens_seen": 13727424, "step": 15845 }, { "epoch": 7.472890146157473, "grad_norm": 0.0025366076733917, "learning_rate": 0.19804110980115905, "loss": 0.2044, "num_input_tokens_seen": 13730912, "step": 15850 }, { "epoch": 7.475247524752476, "grad_norm": 0.00226388918235898, "learning_rate": 0.19798530405380746, "loss": 0.2744, "num_input_tokens_seen": 13734720, "step": 15855 }, { "epoch": 7.477604903347478, "grad_norm": 0.0008240643655881286, "learning_rate": 0.19792949090651893, "loss": 0.2225, "num_input_tokens_seen": 13739520, "step": 15860 }, { "epoch": 7.47996228194248, "grad_norm": 0.001074794097803533, "learning_rate": 0.19787367036790066, "loss": 0.2317, "num_input_tokens_seen": 13743504, "step": 15865 }, { "epoch": 7.482319660537482, "grad_norm": 0.001268785330466926, "learning_rate": 0.19781784244656075, "loss": 0.2216, "num_input_tokens_seen": 13747216, "step": 15870 }, { "epoch": 7.484677039132484, "grad_norm": 0.0009768782183527946, "learning_rate": 0.19776200715110864, "loss": 0.2474, "num_input_tokens_seen": 13751872, "step": 15875 }, { "epoch": 7.487034417727487, "grad_norm": 0.001726972870528698, "learning_rate": 0.1977061644901548, "loss": 0.214, "num_input_tokens_seen": 13756848, "step": 15880 }, { "epoch": 7.489391796322489, "grad_norm": 0.0008257489535026252, "learning_rate": 0.1976503144723109, "loss": 0.2281, "num_input_tokens_seen": 13761408, "step": 15885 }, { "epoch": 7.491749174917492, "grad_norm": 0.0006839131820015609, "learning_rate": 0.19759445710618967, "loss": 0.2039, "num_input_tokens_seen": 13766096, "step": 15890 }, { "epoch": 7.494106553512494, "grad_norm": 0.0008757683099247515, "learning_rate": 0.19753859240040508, "loss": 0.2019, "num_input_tokens_seen": 13770896, "step": 15895 }, { "epoch": 7.496463932107496, "grad_norm": 0.002707946812734008, "learning_rate": 0.1974827203635721, "loss": 0.2262, "num_input_tokens_seen": 13776048, "step": 15900 }, { "epoch": 7.498821310702499, "grad_norm": 0.0009542276966385543, "learning_rate": 0.19742684100430694, "loss": 0.2519, "num_input_tokens_seen": 13780752, "step": 15905 }, { "epoch": 7.501178689297501, "grad_norm": 0.0009286348940804601, "learning_rate": 0.19737095433122692, "loss": 0.224, "num_input_tokens_seen": 13784832, "step": 15910 }, { "epoch": 7.503536067892504, "grad_norm": 0.001171562122181058, "learning_rate": 0.19731506035295046, "loss": 0.2312, "num_input_tokens_seen": 13789328, "step": 15915 }, { "epoch": 7.505893446487506, "grad_norm": 0.002538565546274185, "learning_rate": 0.19725915907809702, "loss": 0.2013, "num_input_tokens_seen": 13793504, "step": 15920 }, { "epoch": 7.508250825082508, "grad_norm": 0.0006675312179140747, "learning_rate": 0.1972032505152874, "loss": 0.2489, "num_input_tokens_seen": 13797968, "step": 15925 }, { "epoch": 7.510608203677511, "grad_norm": 0.0006826460012234747, "learning_rate": 0.19714733467314338, "loss": 0.1986, "num_input_tokens_seen": 13801472, "step": 15930 }, { "epoch": 7.512965582272513, "grad_norm": 0.0006211880245245993, "learning_rate": 0.19709141156028784, "loss": 0.2176, "num_input_tokens_seen": 13806832, "step": 15935 }, { "epoch": 7.515322960867516, "grad_norm": 0.0012805040460079908, "learning_rate": 0.1970354811853448, "loss": 0.2419, "num_input_tokens_seen": 13810896, "step": 15940 }, { "epoch": 7.517680339462518, "grad_norm": 0.0010723763843998313, "learning_rate": 0.19697954355693953, "loss": 0.2325, "num_input_tokens_seen": 13815856, "step": 15945 }, { "epoch": 7.52003771805752, "grad_norm": 0.0007440208573825657, "learning_rate": 0.19692359868369827, "loss": 0.2164, "num_input_tokens_seen": 13819680, "step": 15950 }, { "epoch": 7.522395096652522, "grad_norm": 0.0011183223687112331, "learning_rate": 0.1968676465742484, "loss": 0.2108, "num_input_tokens_seen": 13823984, "step": 15955 }, { "epoch": 7.524752475247524, "grad_norm": 0.0007324377074837685, "learning_rate": 0.19681168723721845, "loss": 0.2269, "num_input_tokens_seen": 13828256, "step": 15960 }, { "epoch": 7.527109853842527, "grad_norm": 0.001567443017847836, "learning_rate": 0.19675572068123803, "loss": 0.2077, "num_input_tokens_seen": 13832112, "step": 15965 }, { "epoch": 7.529467232437529, "grad_norm": 0.0010483977384865284, "learning_rate": 0.19669974691493794, "loss": 0.2402, "num_input_tokens_seen": 13836928, "step": 15970 }, { "epoch": 7.5318246110325315, "grad_norm": 0.0009805227164179087, "learning_rate": 0.19664376594695002, "loss": 0.2074, "num_input_tokens_seen": 13841232, "step": 15975 }, { "epoch": 7.534181989627534, "grad_norm": 0.001306479680351913, "learning_rate": 0.19658777778590722, "loss": 0.2383, "num_input_tokens_seen": 13845056, "step": 15980 }, { "epoch": 7.536539368222536, "grad_norm": 0.0009496709099039435, "learning_rate": 0.19653178244044364, "loss": 0.2252, "num_input_tokens_seen": 13849648, "step": 15985 }, { "epoch": 7.538896746817539, "grad_norm": 0.001598716713488102, "learning_rate": 0.19647577991919443, "loss": 0.1804, "num_input_tokens_seen": 13853632, "step": 15990 }, { "epoch": 7.541254125412541, "grad_norm": 0.0010123283136636019, "learning_rate": 0.1964197702307959, "loss": 0.2027, "num_input_tokens_seen": 13858288, "step": 15995 }, { "epoch": 7.5436115040075435, "grad_norm": 0.0007701338035985827, "learning_rate": 0.19636375338388545, "loss": 0.2053, "num_input_tokens_seen": 13863312, "step": 16000 }, { "epoch": 7.5436115040075435, "eval_loss": 0.2214398831129074, "eval_runtime": 21.9276, "eval_samples_per_second": 43.005, "eval_steps_per_second": 21.525, "num_input_tokens_seen": 13863312, "step": 16000 }, { "epoch": 7.545968882602546, "grad_norm": 0.0023055970668792725, "learning_rate": 0.1963077293871016, "loss": 0.2474, "num_input_tokens_seen": 13867280, "step": 16005 }, { "epoch": 7.548326261197548, "grad_norm": 0.004393108654767275, "learning_rate": 0.19625169824908395, "loss": 0.238, "num_input_tokens_seen": 13871360, "step": 16010 }, { "epoch": 7.550683639792551, "grad_norm": 0.0009342250414192677, "learning_rate": 0.19619565997847319, "loss": 0.1977, "num_input_tokens_seen": 13876032, "step": 16015 }, { "epoch": 7.553041018387553, "grad_norm": 0.0018847247119992971, "learning_rate": 0.19613961458391113, "loss": 0.2091, "num_input_tokens_seen": 13880784, "step": 16020 }, { "epoch": 7.5553983969825556, "grad_norm": 0.0009684693068265915, "learning_rate": 0.19608356207404065, "loss": 0.2366, "num_input_tokens_seen": 13885280, "step": 16025 }, { "epoch": 7.557755775577558, "grad_norm": 0.0016907795798033476, "learning_rate": 0.1960275024575058, "loss": 0.2043, "num_input_tokens_seen": 13889328, "step": 16030 }, { "epoch": 7.56011315417256, "grad_norm": 0.000994437257759273, "learning_rate": 0.19597143574295164, "loss": 0.224, "num_input_tokens_seen": 13894256, "step": 16035 }, { "epoch": 7.562470532767563, "grad_norm": 0.0011292974231764674, "learning_rate": 0.1959153619390244, "loss": 0.2052, "num_input_tokens_seen": 13898560, "step": 16040 }, { "epoch": 7.564827911362565, "grad_norm": 0.0009519473533146083, "learning_rate": 0.1958592810543713, "loss": 0.2854, "num_input_tokens_seen": 13902592, "step": 16045 }, { "epoch": 7.567185289957568, "grad_norm": 0.000994957285001874, "learning_rate": 0.19580319309764077, "loss": 0.2145, "num_input_tokens_seen": 13906464, "step": 16050 }, { "epoch": 7.56954266855257, "grad_norm": 0.001000052085146308, "learning_rate": 0.1957470980774823, "loss": 0.2248, "num_input_tokens_seen": 13910752, "step": 16055 }, { "epoch": 7.571900047147572, "grad_norm": 0.000700485659763217, "learning_rate": 0.19569099600254639, "loss": 0.2326, "num_input_tokens_seen": 13915088, "step": 16060 }, { "epoch": 7.574257425742574, "grad_norm": 0.0007162366528064013, "learning_rate": 0.1956348868814847, "loss": 0.2381, "num_input_tokens_seen": 13919104, "step": 16065 }, { "epoch": 7.576614804337576, "grad_norm": 0.0011094091460108757, "learning_rate": 0.19557877072295, "loss": 0.2274, "num_input_tokens_seen": 13923392, "step": 16070 }, { "epoch": 7.578972182932579, "grad_norm": 0.0016657671658322215, "learning_rate": 0.19552264753559603, "loss": 0.2037, "num_input_tokens_seen": 13928240, "step": 16075 }, { "epoch": 7.581329561527581, "grad_norm": 0.0008856115746311843, "learning_rate": 0.19546651732807774, "loss": 0.2425, "num_input_tokens_seen": 13932464, "step": 16080 }, { "epoch": 7.5836869401225835, "grad_norm": 0.0009673985769040883, "learning_rate": 0.19541038010905112, "loss": 0.1742, "num_input_tokens_seen": 13936272, "step": 16085 }, { "epoch": 7.586044318717586, "grad_norm": 0.0012668371200561523, "learning_rate": 0.19535423588717324, "loss": 0.1836, "num_input_tokens_seen": 13940352, "step": 16090 }, { "epoch": 7.588401697312588, "grad_norm": 0.0004977318458259106, "learning_rate": 0.19529808467110224, "loss": 0.231, "num_input_tokens_seen": 13945376, "step": 16095 }, { "epoch": 7.590759075907591, "grad_norm": 0.0012938473373651505, "learning_rate": 0.19524192646949734, "loss": 0.2881, "num_input_tokens_seen": 13949488, "step": 16100 }, { "epoch": 7.593116454502593, "grad_norm": 0.0020714097190648317, "learning_rate": 0.19518576129101878, "loss": 0.1997, "num_input_tokens_seen": 13954256, "step": 16105 }, { "epoch": 7.5954738330975955, "grad_norm": 0.0017766216769814491, "learning_rate": 0.19512958914432804, "loss": 0.2246, "num_input_tokens_seen": 13958832, "step": 16110 }, { "epoch": 7.597831211692598, "grad_norm": 0.0012715080520138144, "learning_rate": 0.1950734100380875, "loss": 0.2115, "num_input_tokens_seen": 13963008, "step": 16115 }, { "epoch": 7.6001885902876, "grad_norm": 0.0006806178134866059, "learning_rate": 0.19501722398096066, "loss": 0.231, "num_input_tokens_seen": 13966944, "step": 16120 }, { "epoch": 7.602545968882603, "grad_norm": 0.00136559980455786, "learning_rate": 0.1949610309816122, "loss": 0.2319, "num_input_tokens_seen": 13971520, "step": 16125 }, { "epoch": 7.604903347477605, "grad_norm": 0.0012694124598056078, "learning_rate": 0.1949048310487078, "loss": 0.2285, "num_input_tokens_seen": 13975888, "step": 16130 }, { "epoch": 7.6072607260726075, "grad_norm": 0.0020746146328747272, "learning_rate": 0.19484862419091406, "loss": 0.2305, "num_input_tokens_seen": 13980304, "step": 16135 }, { "epoch": 7.60961810466761, "grad_norm": 0.0007621843833476305, "learning_rate": 0.19479241041689893, "loss": 0.2349, "num_input_tokens_seen": 13984016, "step": 16140 }, { "epoch": 7.611975483262612, "grad_norm": 0.0008407479617744684, "learning_rate": 0.19473618973533116, "loss": 0.2098, "num_input_tokens_seen": 13988464, "step": 16145 }, { "epoch": 7.614332861857615, "grad_norm": 0.000796615902800113, "learning_rate": 0.19467996215488076, "loss": 0.238, "num_input_tokens_seen": 13992208, "step": 16150 }, { "epoch": 7.616690240452616, "grad_norm": 0.0007784464978612959, "learning_rate": 0.1946237276842187, "loss": 0.2682, "num_input_tokens_seen": 13996688, "step": 16155 }, { "epoch": 7.619047619047619, "grad_norm": 0.0009017623378895223, "learning_rate": 0.19456748633201712, "loss": 0.2144, "num_input_tokens_seen": 14001056, "step": 16160 }, { "epoch": 7.621404997642621, "grad_norm": 0.0015838656108826399, "learning_rate": 0.194511238106949, "loss": 0.2081, "num_input_tokens_seen": 14004368, "step": 16165 }, { "epoch": 7.623762376237623, "grad_norm": 0.000657330674584955, "learning_rate": 0.19445498301768863, "loss": 0.2182, "num_input_tokens_seen": 14007936, "step": 16170 }, { "epoch": 7.626119754832626, "grad_norm": 0.0010585157433524728, "learning_rate": 0.19439872107291126, "loss": 0.2192, "num_input_tokens_seen": 14012112, "step": 16175 }, { "epoch": 7.628477133427628, "grad_norm": 0.0013791456585749984, "learning_rate": 0.1943424522812931, "loss": 0.244, "num_input_tokens_seen": 14016240, "step": 16180 }, { "epoch": 7.630834512022631, "grad_norm": 0.0025927203241735697, "learning_rate": 0.19428617665151157, "loss": 0.2397, "num_input_tokens_seen": 14020448, "step": 16185 }, { "epoch": 7.633191890617633, "grad_norm": 0.00101269013248384, "learning_rate": 0.19422989419224507, "loss": 0.2037, "num_input_tokens_seen": 14024864, "step": 16190 }, { "epoch": 7.635549269212635, "grad_norm": 0.0005833413451910019, "learning_rate": 0.19417360491217303, "loss": 0.2104, "num_input_tokens_seen": 14029184, "step": 16195 }, { "epoch": 7.637906647807638, "grad_norm": 0.0019800260197371244, "learning_rate": 0.19411730881997605, "loss": 0.231, "num_input_tokens_seen": 14032992, "step": 16200 }, { "epoch": 7.637906647807638, "eval_loss": 0.21739938855171204, "eval_runtime": 21.9077, "eval_samples_per_second": 43.044, "eval_steps_per_second": 21.545, "num_input_tokens_seen": 14032992, "step": 16200 }, { "epoch": 7.64026402640264, "grad_norm": 0.0008662063046358526, "learning_rate": 0.1940610059243356, "loss": 0.2459, "num_input_tokens_seen": 14037728, "step": 16205 }, { "epoch": 7.642621404997643, "grad_norm": 0.0009490397642366588, "learning_rate": 0.19400469623393435, "loss": 0.2189, "num_input_tokens_seen": 14042192, "step": 16210 }, { "epoch": 7.644978783592645, "grad_norm": 0.0008216191199608147, "learning_rate": 0.1939483797574559, "loss": 0.2072, "num_input_tokens_seen": 14046144, "step": 16215 }, { "epoch": 7.6473361621876474, "grad_norm": 0.000831150624435395, "learning_rate": 0.19389205650358504, "loss": 0.2304, "num_input_tokens_seen": 14050208, "step": 16220 }, { "epoch": 7.64969354078265, "grad_norm": 0.000904123648069799, "learning_rate": 0.19383572648100747, "loss": 0.2043, "num_input_tokens_seen": 14054880, "step": 16225 }, { "epoch": 7.652050919377652, "grad_norm": 0.0007262252038344741, "learning_rate": 0.19377938969841, "loss": 0.2359, "num_input_tokens_seen": 14059600, "step": 16230 }, { "epoch": 7.654408297972655, "grad_norm": 0.0005963325966149569, "learning_rate": 0.1937230461644805, "loss": 0.272, "num_input_tokens_seen": 14062976, "step": 16235 }, { "epoch": 7.656765676567657, "grad_norm": 0.0012865022290498018, "learning_rate": 0.19366669588790777, "loss": 0.2124, "num_input_tokens_seen": 14067120, "step": 16240 }, { "epoch": 7.6591230551626595, "grad_norm": 0.0008523368742316961, "learning_rate": 0.19361033887738185, "loss": 0.1985, "num_input_tokens_seen": 14071696, "step": 16245 }, { "epoch": 7.661480433757662, "grad_norm": 0.0017915060743689537, "learning_rate": 0.19355397514159361, "loss": 0.1894, "num_input_tokens_seen": 14075760, "step": 16250 }, { "epoch": 7.663837812352664, "grad_norm": 0.0009128367528319359, "learning_rate": 0.19349760468923508, "loss": 0.2233, "num_input_tokens_seen": 14080016, "step": 16255 }, { "epoch": 7.666195190947667, "grad_norm": 0.0020780207123607397, "learning_rate": 0.19344122752899925, "loss": 0.2391, "num_input_tokens_seen": 14085360, "step": 16260 }, { "epoch": 7.668552569542668, "grad_norm": 0.0012987582013010979, "learning_rate": 0.1933848436695802, "loss": 0.2128, "num_input_tokens_seen": 14089280, "step": 16265 }, { "epoch": 7.670909948137671, "grad_norm": 0.0007208376773633063, "learning_rate": 0.1933284531196731, "loss": 0.269, "num_input_tokens_seen": 14093728, "step": 16270 }, { "epoch": 7.673267326732673, "grad_norm": 0.0008002311224117875, "learning_rate": 0.19327205588797403, "loss": 0.2114, "num_input_tokens_seen": 14098384, "step": 16275 }, { "epoch": 7.675624705327675, "grad_norm": 0.0007173403864726424, "learning_rate": 0.19321565198318014, "loss": 0.2356, "num_input_tokens_seen": 14102880, "step": 16280 }, { "epoch": 7.677982083922678, "grad_norm": 0.0017524423310533166, "learning_rate": 0.1931592414139896, "loss": 0.2199, "num_input_tokens_seen": 14107184, "step": 16285 }, { "epoch": 7.68033946251768, "grad_norm": 0.0006862645386718214, "learning_rate": 0.19310282418910169, "loss": 0.1944, "num_input_tokens_seen": 14111456, "step": 16290 }, { "epoch": 7.682696841112683, "grad_norm": 0.0005590326036326587, "learning_rate": 0.1930464003172166, "loss": 0.1985, "num_input_tokens_seen": 14116192, "step": 16295 }, { "epoch": 7.685054219707685, "grad_norm": 0.001132928067818284, "learning_rate": 0.19298996980703567, "loss": 0.1948, "num_input_tokens_seen": 14120944, "step": 16300 }, { "epoch": 7.687411598302687, "grad_norm": 0.0011681076139211655, "learning_rate": 0.19293353266726113, "loss": 0.2757, "num_input_tokens_seen": 14125120, "step": 16305 }, { "epoch": 7.68976897689769, "grad_norm": 0.0005028427694924176, "learning_rate": 0.19287708890659633, "loss": 0.2377, "num_input_tokens_seen": 14129616, "step": 16310 }, { "epoch": 7.692126355492692, "grad_norm": 0.001385749434120953, "learning_rate": 0.19282063853374556, "loss": 0.1887, "num_input_tokens_seen": 14133696, "step": 16315 }, { "epoch": 7.694483734087695, "grad_norm": 0.0014273737324401736, "learning_rate": 0.19276418155741423, "loss": 0.1974, "num_input_tokens_seen": 14138128, "step": 16320 }, { "epoch": 7.696841112682697, "grad_norm": 0.0024340790696442127, "learning_rate": 0.19270771798630867, "loss": 0.2512, "num_input_tokens_seen": 14142608, "step": 16325 }, { "epoch": 7.699198491277699, "grad_norm": 0.0007513146847486496, "learning_rate": 0.1926512478291363, "loss": 0.2376, "num_input_tokens_seen": 14147120, "step": 16330 }, { "epoch": 7.701555869872702, "grad_norm": 0.0013525652466341853, "learning_rate": 0.19259477109460557, "loss": 0.2363, "num_input_tokens_seen": 14151648, "step": 16335 }, { "epoch": 7.703913248467704, "grad_norm": 0.000694717513397336, "learning_rate": 0.19253828779142584, "loss": 0.2298, "num_input_tokens_seen": 14156512, "step": 16340 }, { "epoch": 7.706270627062707, "grad_norm": 0.001004462013952434, "learning_rate": 0.19248179792830755, "loss": 0.2282, "num_input_tokens_seen": 14160640, "step": 16345 }, { "epoch": 7.708628005657709, "grad_norm": 0.001472437521442771, "learning_rate": 0.19242530151396217, "loss": 0.2369, "num_input_tokens_seen": 14165088, "step": 16350 }, { "epoch": 7.7109853842527105, "grad_norm": 0.0012310630409047008, "learning_rate": 0.19236879855710215, "loss": 0.2401, "num_input_tokens_seen": 14169376, "step": 16355 }, { "epoch": 7.713342762847713, "grad_norm": 0.002175379078835249, "learning_rate": 0.19231228906644096, "loss": 0.2224, "num_input_tokens_seen": 14173600, "step": 16360 }, { "epoch": 7.715700141442715, "grad_norm": 0.0014057999942451715, "learning_rate": 0.19225577305069302, "loss": 0.2175, "num_input_tokens_seen": 14177472, "step": 16365 }, { "epoch": 7.718057520037718, "grad_norm": 0.0009695614571683109, "learning_rate": 0.1921992505185739, "loss": 0.2387, "num_input_tokens_seen": 14181040, "step": 16370 }, { "epoch": 7.72041489863272, "grad_norm": 0.0018180604092776775, "learning_rate": 0.19214272147880004, "loss": 0.2625, "num_input_tokens_seen": 14184848, "step": 16375 }, { "epoch": 7.7227722772277225, "grad_norm": 0.0015521554742008448, "learning_rate": 0.19208618594008892, "loss": 0.2, "num_input_tokens_seen": 14189408, "step": 16380 }, { "epoch": 7.725129655822725, "grad_norm": 0.0015190079575404525, "learning_rate": 0.19202964391115904, "loss": 0.227, "num_input_tokens_seen": 14193536, "step": 16385 }, { "epoch": 7.727487034417727, "grad_norm": 0.0006034130346961319, "learning_rate": 0.1919730954007299, "loss": 0.2408, "num_input_tokens_seen": 14198064, "step": 16390 }, { "epoch": 7.72984441301273, "grad_norm": 0.0013140290975570679, "learning_rate": 0.19191654041752199, "loss": 0.2136, "num_input_tokens_seen": 14202336, "step": 16395 }, { "epoch": 7.732201791607732, "grad_norm": 0.0014657570281997323, "learning_rate": 0.19185997897025678, "loss": 0.2223, "num_input_tokens_seen": 14205936, "step": 16400 }, { "epoch": 7.732201791607732, "eval_loss": 0.21696949005126953, "eval_runtime": 21.877, "eval_samples_per_second": 43.105, "eval_steps_per_second": 21.575, "num_input_tokens_seen": 14205936, "step": 16400 }, { "epoch": 7.7345591702027345, "grad_norm": 0.0007751883822493255, "learning_rate": 0.19180341106765672, "loss": 0.2249, "num_input_tokens_seen": 14210128, "step": 16405 }, { "epoch": 7.736916548797737, "grad_norm": 0.0006194652523845434, "learning_rate": 0.19174683671844536, "loss": 0.1619, "num_input_tokens_seen": 14214608, "step": 16410 }, { "epoch": 7.739273927392739, "grad_norm": 0.0010008709505200386, "learning_rate": 0.19169025593134717, "loss": 0.2523, "num_input_tokens_seen": 14219392, "step": 16415 }, { "epoch": 7.741631305987742, "grad_norm": 0.001185420318506658, "learning_rate": 0.19163366871508764, "loss": 0.2002, "num_input_tokens_seen": 14223968, "step": 16420 }, { "epoch": 7.743988684582744, "grad_norm": 0.00063501694239676, "learning_rate": 0.19157707507839317, "loss": 0.2233, "num_input_tokens_seen": 14228160, "step": 16425 }, { "epoch": 7.7463460631777465, "grad_norm": 0.0007994241314008832, "learning_rate": 0.19152047502999123, "loss": 0.2074, "num_input_tokens_seen": 14233168, "step": 16430 }, { "epoch": 7.748703441772749, "grad_norm": 0.000623456435278058, "learning_rate": 0.19146386857861025, "loss": 0.221, "num_input_tokens_seen": 14237200, "step": 16435 }, { "epoch": 7.751060820367751, "grad_norm": 0.00451489957049489, "learning_rate": 0.19140725573297968, "loss": 0.2311, "num_input_tokens_seen": 14241712, "step": 16440 }, { "epoch": 7.753418198962754, "grad_norm": 0.0007984450785443187, "learning_rate": 0.19135063650182987, "loss": 0.245, "num_input_tokens_seen": 14246112, "step": 16445 }, { "epoch": 7.755775577557756, "grad_norm": 0.0007842029444873333, "learning_rate": 0.19129401089389234, "loss": 0.2117, "num_input_tokens_seen": 14249776, "step": 16450 }, { "epoch": 7.7581329561527586, "grad_norm": 0.0014097105013206601, "learning_rate": 0.19123737891789938, "loss": 0.2204, "num_input_tokens_seen": 14253232, "step": 16455 }, { "epoch": 7.760490334747761, "grad_norm": 0.0007513435557484627, "learning_rate": 0.19118074058258439, "loss": 0.206, "num_input_tokens_seen": 14257152, "step": 16460 }, { "epoch": 7.7628477133427625, "grad_norm": 0.0008209226070903242, "learning_rate": 0.1911240958966816, "loss": 0.2054, "num_input_tokens_seen": 14261152, "step": 16465 }, { "epoch": 7.765205091937765, "grad_norm": 0.0010022311471402645, "learning_rate": 0.19106744486892652, "loss": 0.2203, "num_input_tokens_seen": 14265632, "step": 16470 }, { "epoch": 7.767562470532767, "grad_norm": 0.0019513637525960803, "learning_rate": 0.1910107875080553, "loss": 0.2276, "num_input_tokens_seen": 14269760, "step": 16475 }, { "epoch": 7.76991984912777, "grad_norm": 0.001403791131451726, "learning_rate": 0.19095412382280533, "loss": 0.2693, "num_input_tokens_seen": 14273440, "step": 16480 }, { "epoch": 7.772277227722772, "grad_norm": 0.0009749647579155862, "learning_rate": 0.19089745382191473, "loss": 0.219, "num_input_tokens_seen": 14278048, "step": 16485 }, { "epoch": 7.7746346063177745, "grad_norm": 0.0010335799306631088, "learning_rate": 0.19084077751412284, "loss": 0.237, "num_input_tokens_seen": 14282496, "step": 16490 }, { "epoch": 7.776991984912777, "grad_norm": 0.00043918329174630344, "learning_rate": 0.19078409490816986, "loss": 0.2354, "num_input_tokens_seen": 14286608, "step": 16495 }, { "epoch": 7.779349363507779, "grad_norm": 0.000610384508036077, "learning_rate": 0.19072740601279686, "loss": 0.2423, "num_input_tokens_seen": 14290560, "step": 16500 }, { "epoch": 7.781706742102782, "grad_norm": 0.000394374190364033, "learning_rate": 0.19067071083674605, "loss": 0.2254, "num_input_tokens_seen": 14294512, "step": 16505 }, { "epoch": 7.784064120697784, "grad_norm": 0.0007553209434263408, "learning_rate": 0.19061400938876052, "loss": 0.2267, "num_input_tokens_seen": 14298384, "step": 16510 }, { "epoch": 7.7864214992927865, "grad_norm": 0.0005539768026210368, "learning_rate": 0.1905573016775844, "loss": 0.2147, "num_input_tokens_seen": 14302704, "step": 16515 }, { "epoch": 7.788778877887789, "grad_norm": 0.0013677573297172785, "learning_rate": 0.19050058771196263, "loss": 0.2283, "num_input_tokens_seen": 14307552, "step": 16520 }, { "epoch": 7.791136256482791, "grad_norm": 0.0007618629024364054, "learning_rate": 0.19044386750064132, "loss": 0.2032, "num_input_tokens_seen": 14313040, "step": 16525 }, { "epoch": 7.793493635077794, "grad_norm": 0.0007178982486948371, "learning_rate": 0.19038714105236737, "loss": 0.2698, "num_input_tokens_seen": 14318384, "step": 16530 }, { "epoch": 7.795851013672796, "grad_norm": 0.0006554012652486563, "learning_rate": 0.19033040837588874, "loss": 0.2312, "num_input_tokens_seen": 14322672, "step": 16535 }, { "epoch": 7.7982083922677985, "grad_norm": 0.0006028180359862745, "learning_rate": 0.1902736694799543, "loss": 0.2347, "num_input_tokens_seen": 14326944, "step": 16540 }, { "epoch": 7.800565770862801, "grad_norm": 0.0009679672075435519, "learning_rate": 0.19021692437331392, "loss": 0.2303, "num_input_tokens_seen": 14331360, "step": 16545 }, { "epoch": 7.802923149457803, "grad_norm": 0.0005771837313659489, "learning_rate": 0.1901601730647184, "loss": 0.2413, "num_input_tokens_seen": 14335888, "step": 16550 }, { "epoch": 7.805280528052805, "grad_norm": 0.0005262447521090508, "learning_rate": 0.19010341556291954, "loss": 0.2136, "num_input_tokens_seen": 14339856, "step": 16555 }, { "epoch": 7.807637906647807, "grad_norm": 0.0008040823740884662, "learning_rate": 0.19004665187667, "loss": 0.2234, "num_input_tokens_seen": 14344160, "step": 16560 }, { "epoch": 7.80999528524281, "grad_norm": 0.0006279649096541107, "learning_rate": 0.1899898820147235, "loss": 0.2258, "num_input_tokens_seen": 14348880, "step": 16565 }, { "epoch": 7.812352663837812, "grad_norm": 0.0006407352047972381, "learning_rate": 0.18993310598583465, "loss": 0.2557, "num_input_tokens_seen": 14353664, "step": 16570 }, { "epoch": 7.814710042432814, "grad_norm": 0.0008111881907097995, "learning_rate": 0.18987632379875904, "loss": 0.2001, "num_input_tokens_seen": 14357184, "step": 16575 }, { "epoch": 7.817067421027817, "grad_norm": 0.001462264102883637, "learning_rate": 0.18981953546225314, "loss": 0.2185, "num_input_tokens_seen": 14361408, "step": 16580 }, { "epoch": 7.819424799622819, "grad_norm": 0.0008462051046080887, "learning_rate": 0.18976274098507445, "loss": 0.221, "num_input_tokens_seen": 14365456, "step": 16585 }, { "epoch": 7.821782178217822, "grad_norm": 0.0006756051443517208, "learning_rate": 0.18970594037598146, "loss": 0.2054, "num_input_tokens_seen": 14369680, "step": 16590 }, { "epoch": 7.824139556812824, "grad_norm": 0.0007660582778044045, "learning_rate": 0.1896491336437335, "loss": 0.2301, "num_input_tokens_seen": 14374016, "step": 16595 }, { "epoch": 7.826496935407826, "grad_norm": 0.0004699956043623388, "learning_rate": 0.18959232079709085, "loss": 0.214, "num_input_tokens_seen": 14378336, "step": 16600 }, { "epoch": 7.826496935407826, "eval_loss": 0.2169797569513321, "eval_runtime": 21.8937, "eval_samples_per_second": 43.072, "eval_steps_per_second": 21.559, "num_input_tokens_seen": 14378336, "step": 16600 }, { "epoch": 7.828854314002829, "grad_norm": 0.0005402026581577957, "learning_rate": 0.18953550184481477, "loss": 0.2102, "num_input_tokens_seen": 14383152, "step": 16605 }, { "epoch": 7.831211692597831, "grad_norm": 0.0014377469196915627, "learning_rate": 0.18947867679566752, "loss": 0.2281, "num_input_tokens_seen": 14387344, "step": 16610 }, { "epoch": 7.833569071192834, "grad_norm": 0.0005228299996815622, "learning_rate": 0.18942184565841216, "loss": 0.2087, "num_input_tokens_seen": 14391088, "step": 16615 }, { "epoch": 7.835926449787836, "grad_norm": 0.0008317032479681075, "learning_rate": 0.18936500844181278, "loss": 0.233, "num_input_tokens_seen": 14396032, "step": 16620 }, { "epoch": 7.838283828382838, "grad_norm": 0.0006651821895502508, "learning_rate": 0.18930816515463436, "loss": 0.1767, "num_input_tokens_seen": 14400944, "step": 16625 }, { "epoch": 7.840641206977841, "grad_norm": 0.0007315398543141782, "learning_rate": 0.18925131580564297, "loss": 0.2229, "num_input_tokens_seen": 14405856, "step": 16630 }, { "epoch": 7.842998585572843, "grad_norm": 0.000786019314546138, "learning_rate": 0.1891944604036054, "loss": 0.2161, "num_input_tokens_seen": 14410528, "step": 16635 }, { "epoch": 7.845355964167846, "grad_norm": 0.0016952683217823505, "learning_rate": 0.1891375989572895, "loss": 0.1911, "num_input_tokens_seen": 14415520, "step": 16640 }, { "epoch": 7.847713342762848, "grad_norm": 0.0017846195260062814, "learning_rate": 0.18908073147546398, "loss": 0.2077, "num_input_tokens_seen": 14420144, "step": 16645 }, { "epoch": 7.8500707213578504, "grad_norm": 0.0008527045138180256, "learning_rate": 0.18902385796689858, "loss": 0.2757, "num_input_tokens_seen": 14424752, "step": 16650 }, { "epoch": 7.852428099952853, "grad_norm": 0.0015322606777772307, "learning_rate": 0.18896697844036384, "loss": 0.268, "num_input_tokens_seen": 14428800, "step": 16655 }, { "epoch": 7.854785478547855, "grad_norm": 0.0006297137006185949, "learning_rate": 0.18891009290463137, "loss": 0.2357, "num_input_tokens_seen": 14433488, "step": 16660 }, { "epoch": 7.857142857142857, "grad_norm": 0.0009909336222335696, "learning_rate": 0.18885320136847353, "loss": 0.2386, "num_input_tokens_seen": 14437232, "step": 16665 }, { "epoch": 7.859500235737859, "grad_norm": 0.0009163377690128982, "learning_rate": 0.1887963038406639, "loss": 0.2306, "num_input_tokens_seen": 14442208, "step": 16670 }, { "epoch": 7.861857614332862, "grad_norm": 0.0014180743601173162, "learning_rate": 0.18873940032997658, "loss": 0.2206, "num_input_tokens_seen": 14446208, "step": 16675 }, { "epoch": 7.864214992927864, "grad_norm": 0.0006967016961425543, "learning_rate": 0.18868249084518693, "loss": 0.2058, "num_input_tokens_seen": 14450464, "step": 16680 }, { "epoch": 7.866572371522866, "grad_norm": 0.0006140832556411624, "learning_rate": 0.18862557539507102, "loss": 0.2342, "num_input_tokens_seen": 14454256, "step": 16685 }, { "epoch": 7.868929750117869, "grad_norm": 0.0015372233465313911, "learning_rate": 0.18856865398840605, "loss": 0.188, "num_input_tokens_seen": 14459088, "step": 16690 }, { "epoch": 7.871287128712871, "grad_norm": 0.001364298164844513, "learning_rate": 0.18851172663396995, "loss": 0.2002, "num_input_tokens_seen": 14462928, "step": 16695 }, { "epoch": 7.873644507307874, "grad_norm": 0.0008025983115658164, "learning_rate": 0.1884547933405416, "loss": 0.2087, "num_input_tokens_seen": 14467824, "step": 16700 }, { "epoch": 7.876001885902876, "grad_norm": 0.0010142745450139046, "learning_rate": 0.1883978541169009, "loss": 0.16, "num_input_tokens_seen": 14472000, "step": 16705 }, { "epoch": 7.878359264497878, "grad_norm": 0.00030641487683169544, "learning_rate": 0.18834090897182854, "loss": 0.1899, "num_input_tokens_seen": 14475632, "step": 16710 }, { "epoch": 7.880716643092881, "grad_norm": 0.0021025799214839935, "learning_rate": 0.1882839579141062, "loss": 0.2581, "num_input_tokens_seen": 14479696, "step": 16715 }, { "epoch": 7.883074021687883, "grad_norm": 0.0010305347386747599, "learning_rate": 0.18822700095251646, "loss": 0.2346, "num_input_tokens_seen": 14483776, "step": 16720 }, { "epoch": 7.885431400282886, "grad_norm": 0.0007564348052255809, "learning_rate": 0.18817003809584273, "loss": 0.241, "num_input_tokens_seen": 14487472, "step": 16725 }, { "epoch": 7.887788778877888, "grad_norm": 0.0013136966153979301, "learning_rate": 0.1881130693528695, "loss": 0.2353, "num_input_tokens_seen": 14491568, "step": 16730 }, { "epoch": 7.89014615747289, "grad_norm": 0.00042561019654385746, "learning_rate": 0.18805609473238197, "loss": 0.2217, "num_input_tokens_seen": 14496336, "step": 16735 }, { "epoch": 7.892503536067893, "grad_norm": 0.003990917932242155, "learning_rate": 0.18799911424316643, "loss": 0.2249, "num_input_tokens_seen": 14501200, "step": 16740 }, { "epoch": 7.894860914662895, "grad_norm": 0.0007849353132769465, "learning_rate": 0.18794212789400994, "loss": 0.2174, "num_input_tokens_seen": 14505296, "step": 16745 }, { "epoch": 7.897218293257898, "grad_norm": 0.0006329694297164679, "learning_rate": 0.18788513569370052, "loss": 0.191, "num_input_tokens_seen": 14509536, "step": 16750 }, { "epoch": 7.899575671852899, "grad_norm": 0.001568491687066853, "learning_rate": 0.1878281376510271, "loss": 0.2624, "num_input_tokens_seen": 14513920, "step": 16755 }, { "epoch": 7.9019330504479015, "grad_norm": 0.0016136080957949162, "learning_rate": 0.18777113377477941, "loss": 0.2289, "num_input_tokens_seen": 14518016, "step": 16760 }, { "epoch": 7.904290429042904, "grad_norm": 0.0003511714457999915, "learning_rate": 0.1877141240737483, "loss": 0.2048, "num_input_tokens_seen": 14521296, "step": 16765 }, { "epoch": 7.906647807637906, "grad_norm": 0.0008250278769992292, "learning_rate": 0.18765710855672527, "loss": 0.2035, "num_input_tokens_seen": 14525600, "step": 16770 }, { "epoch": 7.909005186232909, "grad_norm": 0.0006678131176158786, "learning_rate": 0.18760008723250288, "loss": 0.2242, "num_input_tokens_seen": 14529600, "step": 16775 }, { "epoch": 7.911362564827911, "grad_norm": 0.0010437999153509736, "learning_rate": 0.18754306010987457, "loss": 0.2127, "num_input_tokens_seen": 14533744, "step": 16780 }, { "epoch": 7.9137199434229135, "grad_norm": 0.002104994608089328, "learning_rate": 0.18748602719763457, "loss": 0.243, "num_input_tokens_seen": 14538672, "step": 16785 }, { "epoch": 7.916077322017916, "grad_norm": 0.000894346390850842, "learning_rate": 0.18742898850457804, "loss": 0.2255, "num_input_tokens_seen": 14542560, "step": 16790 }, { "epoch": 7.918434700612918, "grad_norm": 0.0015995741123333573, "learning_rate": 0.1873719440395012, "loss": 0.1968, "num_input_tokens_seen": 14547376, "step": 16795 }, { "epoch": 7.920792079207921, "grad_norm": 0.0008060087566263974, "learning_rate": 0.1873148938112009, "loss": 0.23, "num_input_tokens_seen": 14551456, "step": 16800 }, { "epoch": 7.920792079207921, "eval_loss": 0.21796095371246338, "eval_runtime": 21.8888, "eval_samples_per_second": 43.081, "eval_steps_per_second": 21.564, "num_input_tokens_seen": 14551456, "step": 16800 }, { "epoch": 7.923149457802923, "grad_norm": 0.0007948409183882177, "learning_rate": 0.18725783782847508, "loss": 0.197, "num_input_tokens_seen": 14555536, "step": 16805 }, { "epoch": 7.9255068363979255, "grad_norm": 0.0007202514680102468, "learning_rate": 0.1872007761001224, "loss": 0.2052, "num_input_tokens_seen": 14559824, "step": 16810 }, { "epoch": 7.927864214992928, "grad_norm": 0.0004802125331480056, "learning_rate": 0.1871437086349426, "loss": 0.2233, "num_input_tokens_seen": 14563536, "step": 16815 }, { "epoch": 7.93022159358793, "grad_norm": 0.0004227221943438053, "learning_rate": 0.18708663544173615, "loss": 0.2098, "num_input_tokens_seen": 14567600, "step": 16820 }, { "epoch": 7.932578972182933, "grad_norm": 0.0012340486282482743, "learning_rate": 0.18702955652930442, "loss": 0.198, "num_input_tokens_seen": 14571920, "step": 16825 }, { "epoch": 7.934936350777935, "grad_norm": 0.0005463844863697886, "learning_rate": 0.18697247190644972, "loss": 0.1406, "num_input_tokens_seen": 14576016, "step": 16830 }, { "epoch": 7.9372937293729375, "grad_norm": 0.0006519047892652452, "learning_rate": 0.18691538158197527, "loss": 0.2501, "num_input_tokens_seen": 14581152, "step": 16835 }, { "epoch": 7.93965110796794, "grad_norm": 0.0010418759193271399, "learning_rate": 0.1868582855646851, "loss": 0.2309, "num_input_tokens_seen": 14586000, "step": 16840 }, { "epoch": 7.942008486562942, "grad_norm": 0.0009885280160233378, "learning_rate": 0.18680118386338404, "loss": 0.1987, "num_input_tokens_seen": 14589984, "step": 16845 }, { "epoch": 7.944365865157945, "grad_norm": 0.0009283453109674156, "learning_rate": 0.18674407648687794, "loss": 0.2376, "num_input_tokens_seen": 14594240, "step": 16850 }, { "epoch": 7.946723243752947, "grad_norm": 0.0006845822790637612, "learning_rate": 0.1866869634439736, "loss": 0.2156, "num_input_tokens_seen": 14598880, "step": 16855 }, { "epoch": 7.9490806223479495, "grad_norm": 0.0007146000280044973, "learning_rate": 0.18662984474347838, "loss": 0.2344, "num_input_tokens_seen": 14603888, "step": 16860 }, { "epoch": 7.951438000942951, "grad_norm": 0.001735373050905764, "learning_rate": 0.1865727203942008, "loss": 0.2156, "num_input_tokens_seen": 14608448, "step": 16865 }, { "epoch": 7.9537953795379535, "grad_norm": 0.0004984252736903727, "learning_rate": 0.1865155904049501, "loss": 0.2489, "num_input_tokens_seen": 14612624, "step": 16870 }, { "epoch": 7.956152758132956, "grad_norm": 0.0009617778705433011, "learning_rate": 0.1864584547845365, "loss": 0.2142, "num_input_tokens_seen": 14617136, "step": 16875 }, { "epoch": 7.958510136727958, "grad_norm": 0.0011309472611173987, "learning_rate": 0.186401313541771, "loss": 0.2444, "num_input_tokens_seen": 14621376, "step": 16880 }, { "epoch": 7.960867515322961, "grad_norm": 0.001117267063818872, "learning_rate": 0.18634416668546552, "loss": 0.2288, "num_input_tokens_seen": 14625424, "step": 16885 }, { "epoch": 7.963224893917963, "grad_norm": 0.0008237235015258193, "learning_rate": 0.1862870142244328, "loss": 0.2258, "num_input_tokens_seen": 14629488, "step": 16890 }, { "epoch": 7.9655822725129655, "grad_norm": 0.0006275955238379538, "learning_rate": 0.1862298561674865, "loss": 0.2082, "num_input_tokens_seen": 14633920, "step": 16895 }, { "epoch": 7.967939651107968, "grad_norm": 0.0008190554799512029, "learning_rate": 0.18617269252344104, "loss": 0.2567, "num_input_tokens_seen": 14638144, "step": 16900 }, { "epoch": 7.97029702970297, "grad_norm": 0.0008122554863803089, "learning_rate": 0.18611552330111186, "loss": 0.2576, "num_input_tokens_seen": 14642976, "step": 16905 }, { "epoch": 7.972654408297973, "grad_norm": 0.000676579016726464, "learning_rate": 0.18605834850931507, "loss": 0.2435, "num_input_tokens_seen": 14647136, "step": 16910 }, { "epoch": 7.975011786892975, "grad_norm": 0.000672378926537931, "learning_rate": 0.18600116815686787, "loss": 0.2135, "num_input_tokens_seen": 14652672, "step": 16915 }, { "epoch": 7.9773691654879775, "grad_norm": 0.0010219658724963665, "learning_rate": 0.1859439822525881, "loss": 0.2069, "num_input_tokens_seen": 14657952, "step": 16920 }, { "epoch": 7.97972654408298, "grad_norm": 0.0005619163857772946, "learning_rate": 0.18588679080529455, "loss": 0.244, "num_input_tokens_seen": 14662768, "step": 16925 }, { "epoch": 7.982083922677982, "grad_norm": 0.0007368511287495494, "learning_rate": 0.1858295938238069, "loss": 0.1957, "num_input_tokens_seen": 14667488, "step": 16930 }, { "epoch": 7.984441301272985, "grad_norm": 0.0005778747727163136, "learning_rate": 0.18577239131694562, "loss": 0.2265, "num_input_tokens_seen": 14672656, "step": 16935 }, { "epoch": 7.986798679867987, "grad_norm": 0.0005841944948770106, "learning_rate": 0.18571518329353204, "loss": 0.19, "num_input_tokens_seen": 14677168, "step": 16940 }, { "epoch": 7.9891560584629895, "grad_norm": 0.0004305484180804342, "learning_rate": 0.18565796976238838, "loss": 0.1969, "num_input_tokens_seen": 14681904, "step": 16945 }, { "epoch": 7.991513437057992, "grad_norm": 0.0009605127270333469, "learning_rate": 0.18560075073233764, "loss": 0.2415, "num_input_tokens_seen": 14685856, "step": 16950 }, { "epoch": 7.993870815652993, "grad_norm": 0.00220647850073874, "learning_rate": 0.18554352621220377, "loss": 0.2376, "num_input_tokens_seen": 14691056, "step": 16955 }, { "epoch": 7.996228194247996, "grad_norm": 0.001500558340921998, "learning_rate": 0.18548629621081153, "loss": 0.23, "num_input_tokens_seen": 14695488, "step": 16960 }, { "epoch": 7.998585572842998, "grad_norm": 0.0006912950775586069, "learning_rate": 0.18542906073698645, "loss": 0.2156, "num_input_tokens_seen": 14699536, "step": 16965 }, { "epoch": 8.000942951438, "grad_norm": 0.0007868616376072168, "learning_rate": 0.18537181979955494, "loss": 0.2194, "num_input_tokens_seen": 14704272, "step": 16970 }, { "epoch": 8.003300330033003, "grad_norm": 0.0014794172020629048, "learning_rate": 0.18531457340734434, "loss": 0.1774, "num_input_tokens_seen": 14708352, "step": 16975 }, { "epoch": 8.005657708628005, "grad_norm": 0.000944069295655936, "learning_rate": 0.1852573215691827, "loss": 0.1913, "num_input_tokens_seen": 14712880, "step": 16980 }, { "epoch": 8.008015087223008, "grad_norm": 0.0010439527686685324, "learning_rate": 0.18520006429389904, "loss": 0.2206, "num_input_tokens_seen": 14717344, "step": 16985 }, { "epoch": 8.01037246581801, "grad_norm": 0.0021208319813013077, "learning_rate": 0.1851428015903231, "loss": 0.2307, "num_input_tokens_seen": 14721632, "step": 16990 }, { "epoch": 8.012729844413013, "grad_norm": 0.0018792771734297276, "learning_rate": 0.1850855334672855, "loss": 0.2634, "num_input_tokens_seen": 14726080, "step": 16995 }, { "epoch": 8.015087223008015, "grad_norm": 0.000571501615922898, "learning_rate": 0.1850282599336178, "loss": 0.1974, "num_input_tokens_seen": 14730672, "step": 17000 }, { "epoch": 8.015087223008015, "eval_loss": 0.2183476835489273, "eval_runtime": 21.9196, "eval_samples_per_second": 43.021, "eval_steps_per_second": 21.533, "num_input_tokens_seen": 14730672, "step": 17000 }, { "epoch": 8.017444601603017, "grad_norm": 0.0005012344918213785, "learning_rate": 0.18497098099815215, "loss": 0.1977, "num_input_tokens_seen": 14734560, "step": 17005 }, { "epoch": 8.01980198019802, "grad_norm": 0.0007977562490850687, "learning_rate": 0.18491369666972174, "loss": 0.2173, "num_input_tokens_seen": 14738896, "step": 17010 }, { "epoch": 8.022159358793022, "grad_norm": 0.0005976989632472396, "learning_rate": 0.1848564069571606, "loss": 0.2212, "num_input_tokens_seen": 14743120, "step": 17015 }, { "epoch": 8.024516737388025, "grad_norm": 0.0005675868014805019, "learning_rate": 0.18479911186930348, "loss": 0.2483, "num_input_tokens_seen": 14747504, "step": 17020 }, { "epoch": 8.026874115983027, "grad_norm": 0.0004155739734414965, "learning_rate": 0.18474181141498597, "loss": 0.2145, "num_input_tokens_seen": 14751856, "step": 17025 }, { "epoch": 8.02923149457803, "grad_norm": 0.0011215610429644585, "learning_rate": 0.18468450560304453, "loss": 0.2355, "num_input_tokens_seen": 14756288, "step": 17030 }, { "epoch": 8.031588873173032, "grad_norm": 0.0009645810932852328, "learning_rate": 0.1846271944423165, "loss": 0.2386, "num_input_tokens_seen": 14760416, "step": 17035 }, { "epoch": 8.033946251768034, "grad_norm": 0.0005107417237013578, "learning_rate": 0.18456987794163993, "loss": 0.2353, "num_input_tokens_seen": 14765184, "step": 17040 }, { "epoch": 8.036303630363037, "grad_norm": 0.0022268968168646097, "learning_rate": 0.18451255610985373, "loss": 0.2409, "num_input_tokens_seen": 14769792, "step": 17045 }, { "epoch": 8.038661008958039, "grad_norm": 0.001414511469192803, "learning_rate": 0.18445522895579766, "loss": 0.2403, "num_input_tokens_seen": 14774560, "step": 17050 }, { "epoch": 8.041018387553041, "grad_norm": 0.0006551123224198818, "learning_rate": 0.1843978964883123, "loss": 0.2176, "num_input_tokens_seen": 14778400, "step": 17055 }, { "epoch": 8.043375766148044, "grad_norm": 0.0007221862324513495, "learning_rate": 0.18434055871623906, "loss": 0.2217, "num_input_tokens_seen": 14782704, "step": 17060 }, { "epoch": 8.045733144743046, "grad_norm": 0.0006119785830378532, "learning_rate": 0.18428321564842007, "loss": 0.2187, "num_input_tokens_seen": 14786688, "step": 17065 }, { "epoch": 8.048090523338049, "grad_norm": 0.0012603659415617585, "learning_rate": 0.18422586729369841, "loss": 0.2177, "num_input_tokens_seen": 14791104, "step": 17070 }, { "epoch": 8.050447901933051, "grad_norm": 0.0007500843494199216, "learning_rate": 0.1841685136609179, "loss": 0.2429, "num_input_tokens_seen": 14795872, "step": 17075 }, { "epoch": 8.052805280528053, "grad_norm": 0.0006081779138185084, "learning_rate": 0.18411115475892326, "loss": 0.2257, "num_input_tokens_seen": 14800368, "step": 17080 }, { "epoch": 8.055162659123056, "grad_norm": 0.0007733661332167685, "learning_rate": 0.18405379059655982, "loss": 0.2381, "num_input_tokens_seen": 14805408, "step": 17085 }, { "epoch": 8.057520037718058, "grad_norm": 0.000798562599811703, "learning_rate": 0.1839964211826739, "loss": 0.2055, "num_input_tokens_seen": 14809728, "step": 17090 }, { "epoch": 8.05987741631306, "grad_norm": 0.0006754283094778657, "learning_rate": 0.18393904652611265, "loss": 0.2168, "num_input_tokens_seen": 14814016, "step": 17095 }, { "epoch": 8.062234794908063, "grad_norm": 0.0006707613938488066, "learning_rate": 0.18388166663572392, "loss": 0.2172, "num_input_tokens_seen": 14818784, "step": 17100 }, { "epoch": 8.064592173503065, "grad_norm": 0.0014229442458599806, "learning_rate": 0.18382428152035643, "loss": 0.2461, "num_input_tokens_seen": 14824128, "step": 17105 }, { "epoch": 8.066949552098066, "grad_norm": 0.0005571647197939456, "learning_rate": 0.1837668911888596, "loss": 0.2169, "num_input_tokens_seen": 14828256, "step": 17110 }, { "epoch": 8.069306930693068, "grad_norm": 0.001506650703959167, "learning_rate": 0.18370949565008388, "loss": 0.2071, "num_input_tokens_seen": 14832656, "step": 17115 }, { "epoch": 8.07166430928807, "grad_norm": 0.0005854166811332107, "learning_rate": 0.1836520949128803, "loss": 0.2178, "num_input_tokens_seen": 14836544, "step": 17120 }, { "epoch": 8.074021687883073, "grad_norm": 0.0007889504777267575, "learning_rate": 0.18359468898610076, "loss": 0.2163, "num_input_tokens_seen": 14841632, "step": 17125 }, { "epoch": 8.076379066478076, "grad_norm": 0.0016347586642950773, "learning_rate": 0.18353727787859797, "loss": 0.2242, "num_input_tokens_seen": 14845632, "step": 17130 }, { "epoch": 8.078736445073078, "grad_norm": 0.0004919386119581759, "learning_rate": 0.18347986159922552, "loss": 0.1917, "num_input_tokens_seen": 14849824, "step": 17135 }, { "epoch": 8.08109382366808, "grad_norm": 0.0008397671044804156, "learning_rate": 0.1834224401568377, "loss": 0.2483, "num_input_tokens_seen": 14854416, "step": 17140 }, { "epoch": 8.083451202263083, "grad_norm": 0.001358521985821426, "learning_rate": 0.1833650135602896, "loss": 0.2407, "num_input_tokens_seen": 14857728, "step": 17145 }, { "epoch": 8.085808580858085, "grad_norm": 0.0013789423974230886, "learning_rate": 0.18330758181843707, "loss": 0.1937, "num_input_tokens_seen": 14861856, "step": 17150 }, { "epoch": 8.088165959453088, "grad_norm": 0.0007847712840884924, "learning_rate": 0.18325014494013686, "loss": 0.2233, "num_input_tokens_seen": 14865280, "step": 17155 }, { "epoch": 8.09052333804809, "grad_norm": 0.0005809231079183519, "learning_rate": 0.18319270293424647, "loss": 0.2076, "num_input_tokens_seen": 14869936, "step": 17160 }, { "epoch": 8.092880716643092, "grad_norm": 0.001483680447563529, "learning_rate": 0.18313525580962417, "loss": 0.207, "num_input_tokens_seen": 14873936, "step": 17165 }, { "epoch": 8.095238095238095, "grad_norm": 0.0007495217141695321, "learning_rate": 0.18307780357512896, "loss": 0.2297, "num_input_tokens_seen": 14878208, "step": 17170 }, { "epoch": 8.097595473833097, "grad_norm": 0.0007839567260816693, "learning_rate": 0.1830203462396208, "loss": 0.2125, "num_input_tokens_seen": 14882672, "step": 17175 }, { "epoch": 8.0999528524281, "grad_norm": 0.0007993195904418826, "learning_rate": 0.18296288381196033, "loss": 0.2295, "num_input_tokens_seen": 14887632, "step": 17180 }, { "epoch": 8.102310231023102, "grad_norm": 0.0014070695033296943, "learning_rate": 0.1829054163010089, "loss": 0.1975, "num_input_tokens_seen": 14891664, "step": 17185 }, { "epoch": 8.104667609618105, "grad_norm": 0.0004898497718386352, "learning_rate": 0.18284794371562874, "loss": 0.2214, "num_input_tokens_seen": 14896256, "step": 17190 }, { "epoch": 8.107024988213107, "grad_norm": 0.0005600343574769795, "learning_rate": 0.18279046606468288, "loss": 0.2247, "num_input_tokens_seen": 14900352, "step": 17195 }, { "epoch": 8.10938236680811, "grad_norm": 0.0006317143561318517, "learning_rate": 0.1827329833570351, "loss": 0.2445, "num_input_tokens_seen": 14904544, "step": 17200 }, { "epoch": 8.10938236680811, "eval_loss": 0.21596461534500122, "eval_runtime": 21.9243, "eval_samples_per_second": 43.012, "eval_steps_per_second": 21.529, "num_input_tokens_seen": 14904544, "step": 17200 }, { "epoch": 8.111739745403112, "grad_norm": 0.0006996337906457484, "learning_rate": 0.18267549560154991, "loss": 0.2483, "num_input_tokens_seen": 14908528, "step": 17205 }, { "epoch": 8.114097123998114, "grad_norm": 0.0017482263501733541, "learning_rate": 0.18261800280709267, "loss": 0.2201, "num_input_tokens_seen": 14912864, "step": 17210 }, { "epoch": 8.116454502593117, "grad_norm": 0.0017100503901019692, "learning_rate": 0.18256050498252957, "loss": 0.2187, "num_input_tokens_seen": 14917984, "step": 17215 }, { "epoch": 8.118811881188119, "grad_norm": 0.001442396780475974, "learning_rate": 0.18250300213672735, "loss": 0.2402, "num_input_tokens_seen": 14921760, "step": 17220 }, { "epoch": 8.121169259783121, "grad_norm": 0.0012662067310884595, "learning_rate": 0.18244549427855378, "loss": 0.2659, "num_input_tokens_seen": 14926160, "step": 17225 }, { "epoch": 8.123526638378124, "grad_norm": 0.0018833104986697435, "learning_rate": 0.1823879814168772, "loss": 0.2332, "num_input_tokens_seen": 14930368, "step": 17230 }, { "epoch": 8.125884016973126, "grad_norm": 0.0005143648595549166, "learning_rate": 0.18233046356056692, "loss": 0.2383, "num_input_tokens_seen": 14934672, "step": 17235 }, { "epoch": 8.128241395568129, "grad_norm": 0.0005064297583885491, "learning_rate": 0.18227294071849284, "loss": 0.2256, "num_input_tokens_seen": 14939696, "step": 17240 }, { "epoch": 8.130598774163131, "grad_norm": 0.0012462330050766468, "learning_rate": 0.18221541289952578, "loss": 0.1988, "num_input_tokens_seen": 14943568, "step": 17245 }, { "epoch": 8.132956152758133, "grad_norm": 0.000742485688533634, "learning_rate": 0.18215788011253717, "loss": 0.2265, "num_input_tokens_seen": 14948672, "step": 17250 }, { "epoch": 8.135313531353136, "grad_norm": 0.0010609545279294252, "learning_rate": 0.18210034236639935, "loss": 0.2026, "num_input_tokens_seen": 14953824, "step": 17255 }, { "epoch": 8.137670909948138, "grad_norm": 0.0010143255349248648, "learning_rate": 0.1820427996699853, "loss": 0.1671, "num_input_tokens_seen": 14957936, "step": 17260 }, { "epoch": 8.14002828854314, "grad_norm": 0.00041120799141936004, "learning_rate": 0.1819852520321689, "loss": 0.2356, "num_input_tokens_seen": 14962224, "step": 17265 }, { "epoch": 8.142385667138143, "grad_norm": 0.0008708821260370314, "learning_rate": 0.18192769946182466, "loss": 0.2237, "num_input_tokens_seen": 14966288, "step": 17270 }, { "epoch": 8.144743045733145, "grad_norm": 0.0010673468932509422, "learning_rate": 0.18187014196782794, "loss": 0.1769, "num_input_tokens_seen": 14971184, "step": 17275 }, { "epoch": 8.147100424328148, "grad_norm": 0.0003312168410047889, "learning_rate": 0.18181257955905486, "loss": 0.1729, "num_input_tokens_seen": 14976112, "step": 17280 }, { "epoch": 8.14945780292315, "grad_norm": 0.001018760958686471, "learning_rate": 0.18175501224438217, "loss": 0.2412, "num_input_tokens_seen": 14980448, "step": 17285 }, { "epoch": 8.151815181518153, "grad_norm": 0.001600234187208116, "learning_rate": 0.18169744003268756, "loss": 0.2634, "num_input_tokens_seen": 14985024, "step": 17290 }, { "epoch": 8.154172560113155, "grad_norm": 0.0006341019761748612, "learning_rate": 0.18163986293284937, "loss": 0.2502, "num_input_tokens_seen": 14989680, "step": 17295 }, { "epoch": 8.156529938708157, "grad_norm": 0.0015918717253953218, "learning_rate": 0.18158228095374673, "loss": 0.2165, "num_input_tokens_seen": 14994000, "step": 17300 }, { "epoch": 8.15888731730316, "grad_norm": 0.0008630157099105418, "learning_rate": 0.18152469410425945, "loss": 0.2274, "num_input_tokens_seen": 14998416, "step": 17305 }, { "epoch": 8.16124469589816, "grad_norm": 0.0012875405373051763, "learning_rate": 0.18146710239326813, "loss": 0.2354, "num_input_tokens_seen": 15003776, "step": 17310 }, { "epoch": 8.163602074493163, "grad_norm": 0.0007028420804999769, "learning_rate": 0.18140950582965423, "loss": 0.2218, "num_input_tokens_seen": 15008224, "step": 17315 }, { "epoch": 8.165959453088165, "grad_norm": 0.0019235782092437148, "learning_rate": 0.1813519044222998, "loss": 0.2278, "num_input_tokens_seen": 15012528, "step": 17320 }, { "epoch": 8.168316831683168, "grad_norm": 0.0010205862345173955, "learning_rate": 0.18129429818008772, "loss": 0.235, "num_input_tokens_seen": 15016656, "step": 17325 }, { "epoch": 8.17067421027817, "grad_norm": 0.0007929943967610598, "learning_rate": 0.18123668711190163, "loss": 0.2159, "num_input_tokens_seen": 15020752, "step": 17330 }, { "epoch": 8.173031588873172, "grad_norm": 0.0006862014997750521, "learning_rate": 0.18117907122662583, "loss": 0.2191, "num_input_tokens_seen": 15025216, "step": 17335 }, { "epoch": 8.175388967468175, "grad_norm": 0.0012741342652589083, "learning_rate": 0.1811214505331454, "loss": 0.1819, "num_input_tokens_seen": 15029184, "step": 17340 }, { "epoch": 8.177746346063177, "grad_norm": 0.0004548447031993419, "learning_rate": 0.1810638250403462, "loss": 0.2161, "num_input_tokens_seen": 15033008, "step": 17345 }, { "epoch": 8.18010372465818, "grad_norm": 0.0004260679997969419, "learning_rate": 0.1810061947571148, "loss": 0.2246, "num_input_tokens_seen": 15036560, "step": 17350 }, { "epoch": 8.182461103253182, "grad_norm": 0.0003665993281174451, "learning_rate": 0.1809485596923385, "loss": 0.1691, "num_input_tokens_seen": 15040336, "step": 17355 }, { "epoch": 8.184818481848184, "grad_norm": 0.0004515019536484033, "learning_rate": 0.18089091985490546, "loss": 0.2465, "num_input_tokens_seen": 15044256, "step": 17360 }, { "epoch": 8.187175860443187, "grad_norm": 0.001216707518324256, "learning_rate": 0.18083327525370432, "loss": 0.2349, "num_input_tokens_seen": 15047744, "step": 17365 }, { "epoch": 8.18953323903819, "grad_norm": 0.000654072966426611, "learning_rate": 0.18077562589762464, "loss": 0.2375, "num_input_tokens_seen": 15051616, "step": 17370 }, { "epoch": 8.191890617633192, "grad_norm": 0.0006570627447217703, "learning_rate": 0.1807179717955567, "loss": 0.2366, "num_input_tokens_seen": 15055632, "step": 17375 }, { "epoch": 8.194247996228194, "grad_norm": 0.0010528704151511192, "learning_rate": 0.1806603129563915, "loss": 0.2, "num_input_tokens_seen": 15059536, "step": 17380 }, { "epoch": 8.196605374823196, "grad_norm": 0.00138662604149431, "learning_rate": 0.1806026493890208, "loss": 0.2018, "num_input_tokens_seen": 15064336, "step": 17385 }, { "epoch": 8.198962753418199, "grad_norm": 0.0006904808105900884, "learning_rate": 0.18054498110233688, "loss": 0.2144, "num_input_tokens_seen": 15069456, "step": 17390 }, { "epoch": 8.201320132013201, "grad_norm": 0.0006260880618356168, "learning_rate": 0.1804873081052331, "loss": 0.1757, "num_input_tokens_seen": 15074416, "step": 17395 }, { "epoch": 8.203677510608204, "grad_norm": 0.001168601680546999, "learning_rate": 0.18042963040660326, "loss": 0.1646, "num_input_tokens_seen": 15078832, "step": 17400 }, { "epoch": 8.203677510608204, "eval_loss": 0.2226901650428772, "eval_runtime": 21.9877, "eval_samples_per_second": 42.888, "eval_steps_per_second": 21.467, "num_input_tokens_seen": 15078832, "step": 17400 }, { "epoch": 8.206034889203206, "grad_norm": 0.00063800293719396, "learning_rate": 0.180371948015342, "loss": 0.1843, "num_input_tokens_seen": 15083120, "step": 17405 }, { "epoch": 8.208392267798208, "grad_norm": 0.0005953225190751255, "learning_rate": 0.18031426094034472, "loss": 0.2623, "num_input_tokens_seen": 15088272, "step": 17410 }, { "epoch": 8.21074964639321, "grad_norm": 0.0011070126201957464, "learning_rate": 0.18025656919050737, "loss": 0.1672, "num_input_tokens_seen": 15092080, "step": 17415 }, { "epoch": 8.213107024988213, "grad_norm": 0.0012170132249593735, "learning_rate": 0.18019887277472688, "loss": 0.2125, "num_input_tokens_seen": 15095952, "step": 17420 }, { "epoch": 8.215464403583216, "grad_norm": 0.002036644145846367, "learning_rate": 0.18014117170190067, "loss": 0.2789, "num_input_tokens_seen": 15100128, "step": 17425 }, { "epoch": 8.217821782178218, "grad_norm": 0.0014281088951975107, "learning_rate": 0.18008346598092703, "loss": 0.2381, "num_input_tokens_seen": 15104416, "step": 17430 }, { "epoch": 8.22017916077322, "grad_norm": 0.0008759621996432543, "learning_rate": 0.18002575562070489, "loss": 0.2311, "num_input_tokens_seen": 15108592, "step": 17435 }, { "epoch": 8.222536539368223, "grad_norm": 0.000924751628190279, "learning_rate": 0.1799680406301339, "loss": 0.2133, "num_input_tokens_seen": 15112608, "step": 17440 }, { "epoch": 8.224893917963225, "grad_norm": 0.0012491259258240461, "learning_rate": 0.17991032101811447, "loss": 0.2237, "num_input_tokens_seen": 15116608, "step": 17445 }, { "epoch": 8.227251296558228, "grad_norm": 0.0004877631727140397, "learning_rate": 0.1798525967935476, "loss": 0.2203, "num_input_tokens_seen": 15120064, "step": 17450 }, { "epoch": 8.22960867515323, "grad_norm": 0.0006171252462081611, "learning_rate": 0.17979486796533517, "loss": 0.2401, "num_input_tokens_seen": 15124608, "step": 17455 }, { "epoch": 8.231966053748232, "grad_norm": 0.00073317188071087, "learning_rate": 0.1797371345423797, "loss": 0.1908, "num_input_tokens_seen": 15129376, "step": 17460 }, { "epoch": 8.234323432343235, "grad_norm": 0.0014318376779556274, "learning_rate": 0.17967939653358436, "loss": 0.2625, "num_input_tokens_seen": 15133936, "step": 17465 }, { "epoch": 8.236680810938237, "grad_norm": 0.0006916575948707759, "learning_rate": 0.17962165394785315, "loss": 0.2042, "num_input_tokens_seen": 15138272, "step": 17470 }, { "epoch": 8.23903818953324, "grad_norm": 0.0007186068105511367, "learning_rate": 0.17956390679409057, "loss": 0.2386, "num_input_tokens_seen": 15143056, "step": 17475 }, { "epoch": 8.241395568128242, "grad_norm": 0.0008136983960866928, "learning_rate": 0.1795061550812021, "loss": 0.2273, "num_input_tokens_seen": 15147168, "step": 17480 }, { "epoch": 8.243752946723244, "grad_norm": 0.000652920629363507, "learning_rate": 0.1794483988180937, "loss": 0.223, "num_input_tokens_seen": 15151392, "step": 17485 }, { "epoch": 8.246110325318247, "grad_norm": 0.0007319622673094273, "learning_rate": 0.17939063801367214, "loss": 0.217, "num_input_tokens_seen": 15156160, "step": 17490 }, { "epoch": 8.24846770391325, "grad_norm": 0.0014420924708247185, "learning_rate": 0.17933287267684483, "loss": 0.2457, "num_input_tokens_seen": 15159856, "step": 17495 }, { "epoch": 8.250825082508252, "grad_norm": 0.001195083255879581, "learning_rate": 0.17927510281651995, "loss": 0.2212, "num_input_tokens_seen": 15163392, "step": 17500 }, { "epoch": 8.253182461103254, "grad_norm": 0.002028409857302904, "learning_rate": 0.17921732844160634, "loss": 0.2395, "num_input_tokens_seen": 15167472, "step": 17505 }, { "epoch": 8.255539839698255, "grad_norm": 0.0003986610972788185, "learning_rate": 0.17915954956101351, "loss": 0.2278, "num_input_tokens_seen": 15171424, "step": 17510 }, { "epoch": 8.257897218293257, "grad_norm": 0.001574523514136672, "learning_rate": 0.17910176618365165, "loss": 0.2366, "num_input_tokens_seen": 15177968, "step": 17515 }, { "epoch": 8.26025459688826, "grad_norm": 0.0010000461479648948, "learning_rate": 0.17904397831843177, "loss": 0.2242, "num_input_tokens_seen": 15182432, "step": 17520 }, { "epoch": 8.262611975483262, "grad_norm": 0.0005931671475991607, "learning_rate": 0.17898618597426547, "loss": 0.2129, "num_input_tokens_seen": 15186352, "step": 17525 }, { "epoch": 8.264969354078264, "grad_norm": 0.001251478330232203, "learning_rate": 0.17892838916006495, "loss": 0.1803, "num_input_tokens_seen": 15191744, "step": 17530 }, { "epoch": 8.267326732673267, "grad_norm": 0.0009137343149632215, "learning_rate": 0.17887058788474333, "loss": 0.1468, "num_input_tokens_seen": 15196224, "step": 17535 }, { "epoch": 8.269684111268269, "grad_norm": 0.0007736209081485868, "learning_rate": 0.17881278215721427, "loss": 0.1954, "num_input_tokens_seen": 15201008, "step": 17540 }, { "epoch": 8.272041489863271, "grad_norm": 0.0014450362650677562, "learning_rate": 0.1787549719863921, "loss": 0.3063, "num_input_tokens_seen": 15205424, "step": 17545 }, { "epoch": 8.274398868458274, "grad_norm": 0.0004421088087838143, "learning_rate": 0.17869715738119188, "loss": 0.2827, "num_input_tokens_seen": 15209504, "step": 17550 }, { "epoch": 8.276756247053276, "grad_norm": 0.002065566834062338, "learning_rate": 0.17863933835052936, "loss": 0.2187, "num_input_tokens_seen": 15214256, "step": 17555 }, { "epoch": 8.279113625648279, "grad_norm": 0.002030690899118781, "learning_rate": 0.17858151490332097, "loss": 0.2252, "num_input_tokens_seen": 15218080, "step": 17560 }, { "epoch": 8.281471004243281, "grad_norm": 0.0010913773439824581, "learning_rate": 0.17852368704848381, "loss": 0.2253, "num_input_tokens_seen": 15221856, "step": 17565 }, { "epoch": 8.283828382838283, "grad_norm": 0.000587529968470335, "learning_rate": 0.17846585479493565, "loss": 0.2282, "num_input_tokens_seen": 15226192, "step": 17570 }, { "epoch": 8.286185761433286, "grad_norm": 0.0006090219249017537, "learning_rate": 0.178408018151595, "loss": 0.2343, "num_input_tokens_seen": 15230912, "step": 17575 }, { "epoch": 8.288543140028288, "grad_norm": 0.0041658394038677216, "learning_rate": 0.17835017712738085, "loss": 0.2207, "num_input_tokens_seen": 15236560, "step": 17580 }, { "epoch": 8.29090051862329, "grad_norm": 0.0014563678996637464, "learning_rate": 0.17829233173121323, "loss": 0.2296, "num_input_tokens_seen": 15241456, "step": 17585 }, { "epoch": 8.293257897218293, "grad_norm": 0.0011527508031576872, "learning_rate": 0.17823448197201244, "loss": 0.23, "num_input_tokens_seen": 15245744, "step": 17590 }, { "epoch": 8.295615275813295, "grad_norm": 0.000673772010486573, "learning_rate": 0.1781766278586997, "loss": 0.2308, "num_input_tokens_seen": 15249824, "step": 17595 }, { "epoch": 8.297972654408298, "grad_norm": 0.0013623657869175076, "learning_rate": 0.1781187694001969, "loss": 0.2175, "num_input_tokens_seen": 15254544, "step": 17600 }, { "epoch": 8.297972654408298, "eval_loss": 0.21614058315753937, "eval_runtime": 21.9041, "eval_samples_per_second": 43.051, "eval_steps_per_second": 21.548, "num_input_tokens_seen": 15254544, "step": 17600 }, { "epoch": 8.3003300330033, "grad_norm": 0.0015921415761113167, "learning_rate": 0.1780609066054265, "loss": 0.1982, "num_input_tokens_seen": 15258352, "step": 17605 }, { "epoch": 8.302687411598303, "grad_norm": 0.0007842971244826913, "learning_rate": 0.17800303948331164, "loss": 0.241, "num_input_tokens_seen": 15261856, "step": 17610 }, { "epoch": 8.305044790193305, "grad_norm": 0.0012833571527153254, "learning_rate": 0.1779451680427762, "loss": 0.2418, "num_input_tokens_seen": 15266240, "step": 17615 }, { "epoch": 8.307402168788308, "grad_norm": 0.0007401234470307827, "learning_rate": 0.17788729229274464, "loss": 0.2064, "num_input_tokens_seen": 15270000, "step": 17620 }, { "epoch": 8.30975954738331, "grad_norm": 0.000766481040045619, "learning_rate": 0.17782941224214222, "loss": 0.2025, "num_input_tokens_seen": 15274704, "step": 17625 }, { "epoch": 8.312116925978312, "grad_norm": 0.000734795059543103, "learning_rate": 0.17777152789989464, "loss": 0.2405, "num_input_tokens_seen": 15278448, "step": 17630 }, { "epoch": 8.314474304573315, "grad_norm": 0.001417364226654172, "learning_rate": 0.17771363927492845, "loss": 0.2083, "num_input_tokens_seen": 15282384, "step": 17635 }, { "epoch": 8.316831683168317, "grad_norm": 0.0008811577572487295, "learning_rate": 0.17765574637617085, "loss": 0.2216, "num_input_tokens_seen": 15286544, "step": 17640 }, { "epoch": 8.31918906176332, "grad_norm": 0.0009883709717541933, "learning_rate": 0.17759784921254962, "loss": 0.2032, "num_input_tokens_seen": 15290128, "step": 17645 }, { "epoch": 8.321546440358322, "grad_norm": 0.0017199733993038535, "learning_rate": 0.1775399477929932, "loss": 0.231, "num_input_tokens_seen": 15295424, "step": 17650 }, { "epoch": 8.323903818953324, "grad_norm": 0.002116607967764139, "learning_rate": 0.17748204212643076, "loss": 0.2032, "num_input_tokens_seen": 15299344, "step": 17655 }, { "epoch": 8.326261197548327, "grad_norm": 0.0005680570611730218, "learning_rate": 0.17742413222179204, "loss": 0.1751, "num_input_tokens_seen": 15303872, "step": 17660 }, { "epoch": 8.32861857614333, "grad_norm": 0.0005705593503080308, "learning_rate": 0.17736621808800754, "loss": 0.2437, "num_input_tokens_seen": 15307824, "step": 17665 }, { "epoch": 8.330975954738332, "grad_norm": 0.0007248824113048613, "learning_rate": 0.17730829973400827, "loss": 0.2552, "num_input_tokens_seen": 15311776, "step": 17670 }, { "epoch": 8.333333333333334, "grad_norm": 0.0012016367400065064, "learning_rate": 0.17725037716872602, "loss": 0.2527, "num_input_tokens_seen": 15316752, "step": 17675 }, { "epoch": 8.335690711928336, "grad_norm": 0.0016305889002978802, "learning_rate": 0.17719245040109313, "loss": 0.2053, "num_input_tokens_seen": 15320512, "step": 17680 }, { "epoch": 8.338048090523339, "grad_norm": 0.0008650401141494513, "learning_rate": 0.17713451944004271, "loss": 0.2318, "num_input_tokens_seen": 15324512, "step": 17685 }, { "epoch": 8.340405469118341, "grad_norm": 0.0008470658212900162, "learning_rate": 0.17707658429450843, "loss": 0.2008, "num_input_tokens_seen": 15329552, "step": 17690 }, { "epoch": 8.342762847713344, "grad_norm": 0.0005339845665730536, "learning_rate": 0.1770186449734245, "loss": 0.2078, "num_input_tokens_seen": 15333744, "step": 17695 }, { "epoch": 8.345120226308346, "grad_norm": 0.0012775103095918894, "learning_rate": 0.17696070148572599, "loss": 0.1954, "num_input_tokens_seen": 15338000, "step": 17700 }, { "epoch": 8.347477604903348, "grad_norm": 0.0009918904397636652, "learning_rate": 0.17690275384034856, "loss": 0.206, "num_input_tokens_seen": 15341776, "step": 17705 }, { "epoch": 8.34983498349835, "grad_norm": 0.002046305686235428, "learning_rate": 0.17684480204622835, "loss": 0.2724, "num_input_tokens_seen": 15346016, "step": 17710 }, { "epoch": 8.352192362093351, "grad_norm": 0.0005304171936586499, "learning_rate": 0.1767868461123023, "loss": 0.2623, "num_input_tokens_seen": 15350016, "step": 17715 }, { "epoch": 8.354549740688354, "grad_norm": 0.0006565371295437217, "learning_rate": 0.176728886047508, "loss": 0.2344, "num_input_tokens_seen": 15353536, "step": 17720 }, { "epoch": 8.356907119283356, "grad_norm": 0.0006002521840855479, "learning_rate": 0.17667092186078362, "loss": 0.2449, "num_input_tokens_seen": 15357696, "step": 17725 }, { "epoch": 8.359264497878359, "grad_norm": 0.0008706214721314609, "learning_rate": 0.17661295356106785, "loss": 0.2312, "num_input_tokens_seen": 15361936, "step": 17730 }, { "epoch": 8.361621876473361, "grad_norm": 0.0012711867457255721, "learning_rate": 0.1765549811573002, "loss": 0.2289, "num_input_tokens_seen": 15366000, "step": 17735 }, { "epoch": 8.363979255068363, "grad_norm": 0.0008479977841489017, "learning_rate": 0.17649700465842078, "loss": 0.2317, "num_input_tokens_seen": 15369792, "step": 17740 }, { "epoch": 8.366336633663366, "grad_norm": 0.0006554234423674643, "learning_rate": 0.17643902407337023, "loss": 0.2221, "num_input_tokens_seen": 15374800, "step": 17745 }, { "epoch": 8.368694012258368, "grad_norm": 0.0006481492309831083, "learning_rate": 0.17638103941108993, "loss": 0.224, "num_input_tokens_seen": 15378976, "step": 17750 }, { "epoch": 8.37105139085337, "grad_norm": 0.0013533122837543488, "learning_rate": 0.1763230506805218, "loss": 0.217, "num_input_tokens_seen": 15383968, "step": 17755 }, { "epoch": 8.373408769448373, "grad_norm": 0.0005582069279626012, "learning_rate": 0.1762650578906085, "loss": 0.2361, "num_input_tokens_seen": 15387776, "step": 17760 }, { "epoch": 8.375766148043375, "grad_norm": 0.0005073976935818791, "learning_rate": 0.1762070610502932, "loss": 0.2298, "num_input_tokens_seen": 15392624, "step": 17765 }, { "epoch": 8.378123526638378, "grad_norm": 0.00040189785067923367, "learning_rate": 0.17614906016851975, "loss": 0.2253, "num_input_tokens_seen": 15396768, "step": 17770 }, { "epoch": 8.38048090523338, "grad_norm": 0.0006163145881146193, "learning_rate": 0.17609105525423258, "loss": 0.2276, "num_input_tokens_seen": 15400592, "step": 17775 }, { "epoch": 8.382838283828383, "grad_norm": 0.0034978787880390882, "learning_rate": 0.1760330463163768, "loss": 0.2302, "num_input_tokens_seen": 15405136, "step": 17780 }, { "epoch": 8.385195662423385, "grad_norm": 0.0015213626902550459, "learning_rate": 0.17597503336389816, "loss": 0.2323, "num_input_tokens_seen": 15409408, "step": 17785 }, { "epoch": 8.387553041018387, "grad_norm": 0.0006921447929926217, "learning_rate": 0.17591701640574298, "loss": 0.2125, "num_input_tokens_seen": 15414112, "step": 17790 }, { "epoch": 8.38991041961339, "grad_norm": 0.0012722890824079514, "learning_rate": 0.17585899545085815, "loss": 0.2256, "num_input_tokens_seen": 15418160, "step": 17795 }, { "epoch": 8.392267798208392, "grad_norm": 0.0013010416878387332, "learning_rate": 0.17580097050819124, "loss": 0.2334, "num_input_tokens_seen": 15422256, "step": 17800 }, { "epoch": 8.392267798208392, "eval_loss": 0.21769794821739197, "eval_runtime": 21.9375, "eval_samples_per_second": 42.986, "eval_steps_per_second": 21.516, "num_input_tokens_seen": 15422256, "step": 17800 }, { "epoch": 8.394625176803395, "grad_norm": 0.0007857599412091076, "learning_rate": 0.17574294158669046, "loss": 0.2206, "num_input_tokens_seen": 15426528, "step": 17805 }, { "epoch": 8.396982555398397, "grad_norm": 0.00071547063998878, "learning_rate": 0.17568490869530456, "loss": 0.2268, "num_input_tokens_seen": 15430704, "step": 17810 }, { "epoch": 8.3993399339934, "grad_norm": 0.0007274668896570802, "learning_rate": 0.17562687184298295, "loss": 0.1982, "num_input_tokens_seen": 15434880, "step": 17815 }, { "epoch": 8.401697312588402, "grad_norm": 0.0005184274050407112, "learning_rate": 0.1755688310386757, "loss": 0.2271, "num_input_tokens_seen": 15438992, "step": 17820 }, { "epoch": 8.404054691183404, "grad_norm": 0.0014656211715191603, "learning_rate": 0.17551078629133335, "loss": 0.2391, "num_input_tokens_seen": 15443600, "step": 17825 }, { "epoch": 8.406412069778407, "grad_norm": 0.0006041778833605349, "learning_rate": 0.17545273760990718, "loss": 0.2435, "num_input_tokens_seen": 15448688, "step": 17830 }, { "epoch": 8.408769448373409, "grad_norm": 0.0013833378907293081, "learning_rate": 0.17539468500334904, "loss": 0.2207, "num_input_tokens_seen": 15452880, "step": 17835 }, { "epoch": 8.411126826968411, "grad_norm": 0.0007196586229838431, "learning_rate": 0.17533662848061132, "loss": 0.2291, "num_input_tokens_seen": 15457488, "step": 17840 }, { "epoch": 8.413484205563414, "grad_norm": 0.0005932339117862284, "learning_rate": 0.1752785680506471, "loss": 0.2166, "num_input_tokens_seen": 15461520, "step": 17845 }, { "epoch": 8.415841584158416, "grad_norm": 0.0006514521664939821, "learning_rate": 0.17522050372241, "loss": 0.215, "num_input_tokens_seen": 15465824, "step": 17850 }, { "epoch": 8.418198962753419, "grad_norm": 0.0013751299120485783, "learning_rate": 0.17516243550485425, "loss": 0.2075, "num_input_tokens_seen": 15470368, "step": 17855 }, { "epoch": 8.420556341348421, "grad_norm": 0.0006146053201518953, "learning_rate": 0.17510436340693478, "loss": 0.255, "num_input_tokens_seen": 15475168, "step": 17860 }, { "epoch": 8.422913719943423, "grad_norm": 0.0008043135749176145, "learning_rate": 0.175046287437607, "loss": 0.2348, "num_input_tokens_seen": 15479056, "step": 17865 }, { "epoch": 8.425271098538426, "grad_norm": 0.0006233906606212258, "learning_rate": 0.17498820760582695, "loss": 0.2273, "num_input_tokens_seen": 15484176, "step": 17870 }, { "epoch": 8.427628477133428, "grad_norm": 0.0007953964523039758, "learning_rate": 0.1749301239205512, "loss": 0.2182, "num_input_tokens_seen": 15487952, "step": 17875 }, { "epoch": 8.42998585572843, "grad_norm": 0.0007979943184182048, "learning_rate": 0.1748720363907371, "loss": 0.2276, "num_input_tokens_seen": 15492432, "step": 17880 }, { "epoch": 8.432343234323433, "grad_norm": 0.0018163628410547972, "learning_rate": 0.17481394502534242, "loss": 0.233, "num_input_tokens_seen": 15497120, "step": 17885 }, { "epoch": 8.434700612918435, "grad_norm": 0.0005012198816984892, "learning_rate": 0.17475584983332562, "loss": 0.2155, "num_input_tokens_seen": 15502160, "step": 17890 }, { "epoch": 8.437057991513438, "grad_norm": 0.0007532468298450112, "learning_rate": 0.17469775082364558, "loss": 0.2328, "num_input_tokens_seen": 15505920, "step": 17895 }, { "epoch": 8.43941537010844, "grad_norm": 0.0006603209185414016, "learning_rate": 0.17463964800526205, "loss": 0.2119, "num_input_tokens_seen": 15510432, "step": 17900 }, { "epoch": 8.441772748703443, "grad_norm": 0.0006522470503114164, "learning_rate": 0.17458154138713522, "loss": 0.1916, "num_input_tokens_seen": 15514688, "step": 17905 }, { "epoch": 8.444130127298443, "grad_norm": 0.0005579644348472357, "learning_rate": 0.17452343097822576, "loss": 0.2034, "num_input_tokens_seen": 15519424, "step": 17910 }, { "epoch": 8.446487505893446, "grad_norm": 0.0011432182509452105, "learning_rate": 0.17446531678749497, "loss": 0.2051, "num_input_tokens_seen": 15522912, "step": 17915 }, { "epoch": 8.448844884488448, "grad_norm": 0.001259755459614098, "learning_rate": 0.17440719882390496, "loss": 0.2582, "num_input_tokens_seen": 15526928, "step": 17920 }, { "epoch": 8.45120226308345, "grad_norm": 0.0010582468239590526, "learning_rate": 0.17434907709641814, "loss": 0.1299, "num_input_tokens_seen": 15530944, "step": 17925 }, { "epoch": 8.453559641678453, "grad_norm": 0.002214054809883237, "learning_rate": 0.17429095161399769, "loss": 0.2595, "num_input_tokens_seen": 15535408, "step": 17930 }, { "epoch": 8.455917020273455, "grad_norm": 0.0005403182585723698, "learning_rate": 0.1742328223856072, "loss": 0.1764, "num_input_tokens_seen": 15540544, "step": 17935 }, { "epoch": 8.458274398868458, "grad_norm": 0.0005449664895422757, "learning_rate": 0.174174689420211, "loss": 0.2507, "num_input_tokens_seen": 15544112, "step": 17940 }, { "epoch": 8.46063177746346, "grad_norm": 0.0004789628437720239, "learning_rate": 0.1741165527267739, "loss": 0.1652, "num_input_tokens_seen": 15548560, "step": 17945 }, { "epoch": 8.462989156058462, "grad_norm": 0.0010395179269835353, "learning_rate": 0.17405841231426125, "loss": 0.1884, "num_input_tokens_seen": 15553920, "step": 17950 }, { "epoch": 8.465346534653465, "grad_norm": 0.0008846030104905367, "learning_rate": 0.1740002681916391, "loss": 0.2387, "num_input_tokens_seen": 15557712, "step": 17955 }, { "epoch": 8.467703913248467, "grad_norm": 0.0008814088650979102, "learning_rate": 0.17394212036787401, "loss": 0.2507, "num_input_tokens_seen": 15562544, "step": 17960 }, { "epoch": 8.47006129184347, "grad_norm": 0.0015101421158760786, "learning_rate": 0.1738839688519331, "loss": 0.2085, "num_input_tokens_seen": 15566416, "step": 17965 }, { "epoch": 8.472418670438472, "grad_norm": 0.0006444223108701408, "learning_rate": 0.17382581365278402, "loss": 0.2332, "num_input_tokens_seen": 15571792, "step": 17970 }, { "epoch": 8.474776049033474, "grad_norm": 0.00211532530374825, "learning_rate": 0.17376765477939507, "loss": 0.2356, "num_input_tokens_seen": 15575968, "step": 17975 }, { "epoch": 8.477133427628477, "grad_norm": 0.001247377716936171, "learning_rate": 0.1737094922407351, "loss": 0.2461, "num_input_tokens_seen": 15579296, "step": 17980 }, { "epoch": 8.47949080622348, "grad_norm": 0.001665474264882505, "learning_rate": 0.1736513260457734, "loss": 0.2235, "num_input_tokens_seen": 15583408, "step": 17985 }, { "epoch": 8.481848184818482, "grad_norm": 0.0011555610690265894, "learning_rate": 0.17359315620348006, "loss": 0.2272, "num_input_tokens_seen": 15587680, "step": 17990 }, { "epoch": 8.484205563413484, "grad_norm": 0.0011111653875559568, "learning_rate": 0.17353498272282547, "loss": 0.2365, "num_input_tokens_seen": 15591344, "step": 17995 }, { "epoch": 8.486562942008486, "grad_norm": 0.0018261030782014132, "learning_rate": 0.17347680561278087, "loss": 0.2287, "num_input_tokens_seen": 15595776, "step": 18000 }, { "epoch": 8.486562942008486, "eval_loss": 0.2266455441713333, "eval_runtime": 21.9511, "eval_samples_per_second": 42.959, "eval_steps_per_second": 21.502, "num_input_tokens_seen": 15595776, "step": 18000 }, { "epoch": 8.488920320603489, "grad_norm": 0.0012799183605238795, "learning_rate": 0.1734186248823178, "loss": 0.2289, "num_input_tokens_seen": 15600048, "step": 18005 }, { "epoch": 8.491277699198491, "grad_norm": 0.0013307837070897222, "learning_rate": 0.17336044054040844, "loss": 0.2267, "num_input_tokens_seen": 15604464, "step": 18010 }, { "epoch": 8.493635077793494, "grad_norm": 0.0006515239365398884, "learning_rate": 0.1733022525960256, "loss": 0.215, "num_input_tokens_seen": 15608880, "step": 18015 }, { "epoch": 8.495992456388496, "grad_norm": 0.0006984287756495178, "learning_rate": 0.1732440610581426, "loss": 0.2066, "num_input_tokens_seen": 15613184, "step": 18020 }, { "epoch": 8.498349834983498, "grad_norm": 0.0007471186690963805, "learning_rate": 0.17318586593573326, "loss": 0.2164, "num_input_tokens_seen": 15616768, "step": 18025 }, { "epoch": 8.500707213578501, "grad_norm": 0.0005731939454562962, "learning_rate": 0.17312766723777204, "loss": 0.1775, "num_input_tokens_seen": 15620656, "step": 18030 }, { "epoch": 8.503064592173503, "grad_norm": 0.0011987403267994523, "learning_rate": 0.1730694649732339, "loss": 0.2684, "num_input_tokens_seen": 15624624, "step": 18035 }, { "epoch": 8.505421970768506, "grad_norm": 0.0010582548566162586, "learning_rate": 0.17301125915109428, "loss": 0.1788, "num_input_tokens_seen": 15629008, "step": 18040 }, { "epoch": 8.507779349363508, "grad_norm": 0.001136177801527083, "learning_rate": 0.17295304978032938, "loss": 0.2469, "num_input_tokens_seen": 15633984, "step": 18045 }, { "epoch": 8.51013672795851, "grad_norm": 0.0007597638759762049, "learning_rate": 0.17289483686991577, "loss": 0.2483, "num_input_tokens_seen": 15638064, "step": 18050 }, { "epoch": 8.512494106553513, "grad_norm": 0.0009094870765693486, "learning_rate": 0.1728366204288306, "loss": 0.218, "num_input_tokens_seen": 15642080, "step": 18055 }, { "epoch": 8.514851485148515, "grad_norm": 0.0015877906698733568, "learning_rate": 0.17277840046605153, "loss": 0.2355, "num_input_tokens_seen": 15647120, "step": 18060 }, { "epoch": 8.517208863743518, "grad_norm": 0.0012023497838526964, "learning_rate": 0.17272017699055686, "loss": 0.2225, "num_input_tokens_seen": 15651296, "step": 18065 }, { "epoch": 8.51956624233852, "grad_norm": 0.0007763237808831036, "learning_rate": 0.17266195001132542, "loss": 0.2111, "num_input_tokens_seen": 15655088, "step": 18070 }, { "epoch": 8.521923620933523, "grad_norm": 0.0007487460970878601, "learning_rate": 0.17260371953733647, "loss": 0.2101, "num_input_tokens_seen": 15659600, "step": 18075 }, { "epoch": 8.524280999528525, "grad_norm": 0.0006525563658215106, "learning_rate": 0.1725454855775699, "loss": 0.2143, "num_input_tokens_seen": 15663648, "step": 18080 }, { "epoch": 8.526638378123527, "grad_norm": 0.0005706198862753808, "learning_rate": 0.17248724814100616, "loss": 0.1839, "num_input_tokens_seen": 15668224, "step": 18085 }, { "epoch": 8.52899575671853, "grad_norm": 0.0009220067877322435, "learning_rate": 0.17242900723662619, "loss": 0.2287, "num_input_tokens_seen": 15672128, "step": 18090 }, { "epoch": 8.531353135313532, "grad_norm": 0.000576293095946312, "learning_rate": 0.1723707628734114, "loss": 0.2357, "num_input_tokens_seen": 15675712, "step": 18095 }, { "epoch": 8.533710513908535, "grad_norm": 0.0009397325338795781, "learning_rate": 0.1723125150603438, "loss": 0.232, "num_input_tokens_seen": 15680560, "step": 18100 }, { "epoch": 8.536067892503535, "grad_norm": 0.014482175931334496, "learning_rate": 0.1722542638064061, "loss": 0.2429, "num_input_tokens_seen": 15685008, "step": 18105 }, { "epoch": 8.53842527109854, "grad_norm": 0.0011561346473172307, "learning_rate": 0.17219600912058117, "loss": 0.2263, "num_input_tokens_seen": 15689344, "step": 18110 }, { "epoch": 8.54078264969354, "grad_norm": 0.0009198483894579113, "learning_rate": 0.17213775101185272, "loss": 0.2238, "num_input_tokens_seen": 15693296, "step": 18115 }, { "epoch": 8.543140028288542, "grad_norm": 0.00163687311578542, "learning_rate": 0.17207948948920485, "loss": 0.2042, "num_input_tokens_seen": 15697264, "step": 18120 }, { "epoch": 8.545497406883545, "grad_norm": 0.0005884267156943679, "learning_rate": 0.17202122456162228, "loss": 0.1856, "num_input_tokens_seen": 15702704, "step": 18125 }, { "epoch": 8.547854785478547, "grad_norm": 0.0009090225212275982, "learning_rate": 0.17196295623809013, "loss": 0.3012, "num_input_tokens_seen": 15707376, "step": 18130 }, { "epoch": 8.55021216407355, "grad_norm": 0.001534249517135322, "learning_rate": 0.1719046845275941, "loss": 0.2534, "num_input_tokens_seen": 15711408, "step": 18135 }, { "epoch": 8.552569542668552, "grad_norm": 0.00697338255122304, "learning_rate": 0.17184640943912044, "loss": 0.2148, "num_input_tokens_seen": 15716096, "step": 18140 }, { "epoch": 8.554926921263554, "grad_norm": 0.0008019887609407306, "learning_rate": 0.1717881309816559, "loss": 0.2391, "num_input_tokens_seen": 15720656, "step": 18145 }, { "epoch": 8.557284299858557, "grad_norm": 0.0009679835056886077, "learning_rate": 0.1717298491641878, "loss": 0.2368, "num_input_tokens_seen": 15724176, "step": 18150 }, { "epoch": 8.55964167845356, "grad_norm": 0.001741203828714788, "learning_rate": 0.17167156399570385, "loss": 0.2105, "num_input_tokens_seen": 15728144, "step": 18155 }, { "epoch": 8.561999057048562, "grad_norm": 0.0017620599828660488, "learning_rate": 0.17161327548519242, "loss": 0.2011, "num_input_tokens_seen": 15733328, "step": 18160 }, { "epoch": 8.564356435643564, "grad_norm": 0.0012654566671699286, "learning_rate": 0.1715549836416423, "loss": 0.2087, "num_input_tokens_seen": 15737712, "step": 18165 }, { "epoch": 8.566713814238566, "grad_norm": 0.0012035195250064135, "learning_rate": 0.17149668847404279, "loss": 0.29, "num_input_tokens_seen": 15741952, "step": 18170 }, { "epoch": 8.569071192833569, "grad_norm": 0.0030324305407702923, "learning_rate": 0.1714383899913838, "loss": 0.2007, "num_input_tokens_seen": 15746464, "step": 18175 }, { "epoch": 8.571428571428571, "grad_norm": 0.00086741684935987, "learning_rate": 0.17138008820265563, "loss": 0.2471, "num_input_tokens_seen": 15750480, "step": 18180 }, { "epoch": 8.573785950023574, "grad_norm": 0.0009010234498418868, "learning_rate": 0.17132178311684917, "loss": 0.2101, "num_input_tokens_seen": 15754736, "step": 18185 }, { "epoch": 8.576143328618576, "grad_norm": 0.0008850029553286731, "learning_rate": 0.1712634747429559, "loss": 0.2393, "num_input_tokens_seen": 15759280, "step": 18190 }, { "epoch": 8.578500707213578, "grad_norm": 0.0007456456078216434, "learning_rate": 0.17120516308996753, "loss": 0.2251, "num_input_tokens_seen": 15763312, "step": 18195 }, { "epoch": 8.58085808580858, "grad_norm": 0.0010238420218229294, "learning_rate": 0.17114684816687653, "loss": 0.2202, "num_input_tokens_seen": 15768288, "step": 18200 }, { "epoch": 8.58085808580858, "eval_loss": 0.219070166349411, "eval_runtime": 21.9087, "eval_samples_per_second": 43.042, "eval_steps_per_second": 21.544, "num_input_tokens_seen": 15768288, "step": 18200 }, { "epoch": 8.583215464403583, "grad_norm": 0.0007150620222091675, "learning_rate": 0.17108852998267585, "loss": 0.2247, "num_input_tokens_seen": 15772000, "step": 18205 }, { "epoch": 8.585572842998586, "grad_norm": 0.0005563875311054289, "learning_rate": 0.17103020854635878, "loss": 0.2364, "num_input_tokens_seen": 15775280, "step": 18210 }, { "epoch": 8.587930221593588, "grad_norm": 0.001316106179729104, "learning_rate": 0.1709718838669193, "loss": 0.2391, "num_input_tokens_seen": 15780464, "step": 18215 }, { "epoch": 8.59028760018859, "grad_norm": 0.0009191678836941719, "learning_rate": 0.17091355595335173, "loss": 0.221, "num_input_tokens_seen": 15785040, "step": 18220 }, { "epoch": 8.592644978783593, "grad_norm": 0.0007271976210176945, "learning_rate": 0.17085522481465107, "loss": 0.2217, "num_input_tokens_seen": 15789072, "step": 18225 }, { "epoch": 8.595002357378595, "grad_norm": 0.0008339803316630423, "learning_rate": 0.17079689045981264, "loss": 0.2226, "num_input_tokens_seen": 15793744, "step": 18230 }, { "epoch": 8.597359735973598, "grad_norm": 0.0007401821203529835, "learning_rate": 0.17073855289783238, "loss": 0.2519, "num_input_tokens_seen": 15798080, "step": 18235 }, { "epoch": 8.5997171145686, "grad_norm": 0.0005845606210641563, "learning_rate": 0.1706802121377066, "loss": 0.2165, "num_input_tokens_seen": 15802192, "step": 18240 }, { "epoch": 8.602074493163602, "grad_norm": 0.0015291826566681266, "learning_rate": 0.17062186818843225, "loss": 0.23, "num_input_tokens_seen": 15806464, "step": 18245 }, { "epoch": 8.604431871758605, "grad_norm": 0.001709967153146863, "learning_rate": 0.17056352105900668, "loss": 0.2268, "num_input_tokens_seen": 15811232, "step": 18250 }, { "epoch": 8.606789250353607, "grad_norm": 0.0005686606746166945, "learning_rate": 0.17050517075842772, "loss": 0.2246, "num_input_tokens_seen": 15815600, "step": 18255 }, { "epoch": 8.60914662894861, "grad_norm": 0.0006128829554654658, "learning_rate": 0.17044681729569375, "loss": 0.2431, "num_input_tokens_seen": 15819456, "step": 18260 }, { "epoch": 8.611504007543612, "grad_norm": 0.0005904178251512349, "learning_rate": 0.17038846067980365, "loss": 0.2208, "num_input_tokens_seen": 15823200, "step": 18265 }, { "epoch": 8.613861386138614, "grad_norm": 0.0011399361537769437, "learning_rate": 0.17033010091975664, "loss": 0.2323, "num_input_tokens_seen": 15828576, "step": 18270 }, { "epoch": 8.616218764733617, "grad_norm": 0.0016350378282368183, "learning_rate": 0.17027173802455262, "loss": 0.2061, "num_input_tokens_seen": 15832720, "step": 18275 }, { "epoch": 8.61857614332862, "grad_norm": 0.0006240732036530972, "learning_rate": 0.1702133720031918, "loss": 0.2173, "num_input_tokens_seen": 15837472, "step": 18280 }, { "epoch": 8.620933521923622, "grad_norm": 0.0007576360949315131, "learning_rate": 0.17015500286467503, "loss": 0.2129, "num_input_tokens_seen": 15841792, "step": 18285 }, { "epoch": 8.623290900518624, "grad_norm": 0.00043002175516448915, "learning_rate": 0.17009663061800354, "loss": 0.2077, "num_input_tokens_seen": 15846256, "step": 18290 }, { "epoch": 8.625648279113626, "grad_norm": 0.0008850198937579989, "learning_rate": 0.17003825527217903, "loss": 0.2193, "num_input_tokens_seen": 15850416, "step": 18295 }, { "epoch": 8.628005657708629, "grad_norm": 0.0007569605368189514, "learning_rate": 0.16997987683620377, "loss": 0.2623, "num_input_tokens_seen": 15854912, "step": 18300 }, { "epoch": 8.630363036303631, "grad_norm": 0.0006765298312529922, "learning_rate": 0.16992149531908043, "loss": 0.2293, "num_input_tokens_seen": 15859648, "step": 18305 }, { "epoch": 8.632720414898632, "grad_norm": 0.0009301642421633005, "learning_rate": 0.16986311072981214, "loss": 0.2356, "num_input_tokens_seen": 15863312, "step": 18310 }, { "epoch": 8.635077793493634, "grad_norm": 0.0005708853714168072, "learning_rate": 0.16980472307740255, "loss": 0.2704, "num_input_tokens_seen": 15868656, "step": 18315 }, { "epoch": 8.637435172088637, "grad_norm": 0.0004610781907103956, "learning_rate": 0.1697463323708558, "loss": 0.2358, "num_input_tokens_seen": 15872688, "step": 18320 }, { "epoch": 8.639792550683639, "grad_norm": 0.0005200142622925341, "learning_rate": 0.16968793861917641, "loss": 0.2278, "num_input_tokens_seen": 15876880, "step": 18325 }, { "epoch": 8.642149929278641, "grad_norm": 0.0006966299843043089, "learning_rate": 0.16962954183136952, "loss": 0.2247, "num_input_tokens_seen": 15881072, "step": 18330 }, { "epoch": 8.644507307873644, "grad_norm": 0.0014031528262421489, "learning_rate": 0.16957114201644058, "loss": 0.24, "num_input_tokens_seen": 15885120, "step": 18335 }, { "epoch": 8.646864686468646, "grad_norm": 0.0013513099402189255, "learning_rate": 0.16951273918339563, "loss": 0.209, "num_input_tokens_seen": 15889584, "step": 18340 }, { "epoch": 8.649222065063649, "grad_norm": 0.0007043613004498184, "learning_rate": 0.16945433334124105, "loss": 0.2267, "num_input_tokens_seen": 15893568, "step": 18345 }, { "epoch": 8.651579443658651, "grad_norm": 0.0013094445457682014, "learning_rate": 0.1693959244989838, "loss": 0.2176, "num_input_tokens_seen": 15897952, "step": 18350 }, { "epoch": 8.653936822253653, "grad_norm": 0.0007430961122736335, "learning_rate": 0.16933751266563127, "loss": 0.1973, "num_input_tokens_seen": 15901712, "step": 18355 }, { "epoch": 8.656294200848656, "grad_norm": 0.0008211465319618583, "learning_rate": 0.16927909785019118, "loss": 0.2055, "num_input_tokens_seen": 15906160, "step": 18360 }, { "epoch": 8.658651579443658, "grad_norm": 0.00038109449087642133, "learning_rate": 0.169220680061672, "loss": 0.1889, "num_input_tokens_seen": 15910256, "step": 18365 }, { "epoch": 8.66100895803866, "grad_norm": 0.0016713551012799144, "learning_rate": 0.16916225930908244, "loss": 0.2383, "num_input_tokens_seen": 15914592, "step": 18370 }, { "epoch": 8.663366336633663, "grad_norm": 0.0005188613431528211, "learning_rate": 0.16910383560143163, "loss": 0.1625, "num_input_tokens_seen": 15920224, "step": 18375 }, { "epoch": 8.665723715228665, "grad_norm": 0.001049801241606474, "learning_rate": 0.16904540894772935, "loss": 0.2608, "num_input_tokens_seen": 15924944, "step": 18380 }, { "epoch": 8.668081093823668, "grad_norm": 0.0015595387667417526, "learning_rate": 0.16898697935698562, "loss": 0.2254, "num_input_tokens_seen": 15929264, "step": 18385 }, { "epoch": 8.67043847241867, "grad_norm": 0.0006288376753218472, "learning_rate": 0.1689285468382111, "loss": 0.2103, "num_input_tokens_seen": 15933264, "step": 18390 }, { "epoch": 8.672795851013673, "grad_norm": 0.0012521749595180154, "learning_rate": 0.16887011140041677, "loss": 0.2058, "num_input_tokens_seen": 15937680, "step": 18395 }, { "epoch": 8.675153229608675, "grad_norm": 0.0004965679836459458, "learning_rate": 0.1688116730526141, "loss": 0.2279, "num_input_tokens_seen": 15941776, "step": 18400 }, { "epoch": 8.675153229608675, "eval_loss": 0.21795068681240082, "eval_runtime": 21.8939, "eval_samples_per_second": 43.071, "eval_steps_per_second": 21.559, "num_input_tokens_seen": 15941776, "step": 18400 }, { "epoch": 8.677510608203677, "grad_norm": 0.0014082754496484995, "learning_rate": 0.1687532318038151, "loss": 0.2047, "num_input_tokens_seen": 15945232, "step": 18405 }, { "epoch": 8.67986798679868, "grad_norm": 0.0014392426237463951, "learning_rate": 0.16869478766303206, "loss": 0.2363, "num_input_tokens_seen": 15948944, "step": 18410 }, { "epoch": 8.682225365393682, "grad_norm": 0.0015883008018136024, "learning_rate": 0.16863634063927788, "loss": 0.2113, "num_input_tokens_seen": 15952848, "step": 18415 }, { "epoch": 8.684582743988685, "grad_norm": 0.0004736390837933868, "learning_rate": 0.16857789074156568, "loss": 0.1829, "num_input_tokens_seen": 15957744, "step": 18420 }, { "epoch": 8.686940122583687, "grad_norm": 0.0005550820496864617, "learning_rate": 0.16851943797890928, "loss": 0.2316, "num_input_tokens_seen": 15962352, "step": 18425 }, { "epoch": 8.68929750117869, "grad_norm": 0.0008728143875487149, "learning_rate": 0.16846098236032284, "loss": 0.2552, "num_input_tokens_seen": 15967456, "step": 18430 }, { "epoch": 8.691654879773692, "grad_norm": 0.0004981233505532146, "learning_rate": 0.16840252389482097, "loss": 0.2198, "num_input_tokens_seen": 15971824, "step": 18435 }, { "epoch": 8.694012258368694, "grad_norm": 0.0006245926488190889, "learning_rate": 0.16834406259141857, "loss": 0.2227, "num_input_tokens_seen": 15976576, "step": 18440 }, { "epoch": 8.696369636963697, "grad_norm": 0.0006486743804998696, "learning_rate": 0.16828559845913124, "loss": 0.2398, "num_input_tokens_seen": 15980272, "step": 18445 }, { "epoch": 8.698727015558699, "grad_norm": 0.0006653268937952816, "learning_rate": 0.16822713150697488, "loss": 0.2014, "num_input_tokens_seen": 15985248, "step": 18450 }, { "epoch": 8.701084394153701, "grad_norm": 0.000704884878359735, "learning_rate": 0.16816866174396575, "loss": 0.2015, "num_input_tokens_seen": 15989104, "step": 18455 }, { "epoch": 8.703441772748704, "grad_norm": 0.0004543059039860964, "learning_rate": 0.16811018917912057, "loss": 0.2135, "num_input_tokens_seen": 15992736, "step": 18460 }, { "epoch": 8.705799151343706, "grad_norm": 0.0004836838343180716, "learning_rate": 0.16805171382145673, "loss": 0.2062, "num_input_tokens_seen": 15996912, "step": 18465 }, { "epoch": 8.708156529938709, "grad_norm": 0.0004295568505767733, "learning_rate": 0.16799323567999175, "loss": 0.2179, "num_input_tokens_seen": 16001568, "step": 18470 }, { "epoch": 8.710513908533711, "grad_norm": 0.0017301524057984352, "learning_rate": 0.16793475476374367, "loss": 0.2384, "num_input_tokens_seen": 16006288, "step": 18475 }, { "epoch": 8.712871287128714, "grad_norm": 0.0006898589199408889, "learning_rate": 0.1678762710817311, "loss": 0.2147, "num_input_tokens_seen": 16010976, "step": 18480 }, { "epoch": 8.715228665723716, "grad_norm": 0.0007041610078886151, "learning_rate": 0.1678177846429728, "loss": 0.2056, "num_input_tokens_seen": 16015840, "step": 18485 }, { "epoch": 8.717586044318718, "grad_norm": 0.0008175145485438406, "learning_rate": 0.16775929545648827, "loss": 0.174, "num_input_tokens_seen": 16019888, "step": 18490 }, { "epoch": 8.71994342291372, "grad_norm": 0.0011335917515680194, "learning_rate": 0.16770080353129715, "loss": 0.2524, "num_input_tokens_seen": 16023936, "step": 18495 }, { "epoch": 8.722300801508723, "grad_norm": 0.0008496858063153923, "learning_rate": 0.16764230887641968, "loss": 0.239, "num_input_tokens_seen": 16028208, "step": 18500 }, { "epoch": 8.724658180103724, "grad_norm": 0.001566729275509715, "learning_rate": 0.1675838115008765, "loss": 0.2237, "num_input_tokens_seen": 16032464, "step": 18505 }, { "epoch": 8.727015558698728, "grad_norm": 0.0008703620405867696, "learning_rate": 0.1675253114136886, "loss": 0.1987, "num_input_tokens_seen": 16036960, "step": 18510 }, { "epoch": 8.729372937293729, "grad_norm": 0.0011228607036173344, "learning_rate": 0.16746680862387747, "loss": 0.1883, "num_input_tokens_seen": 16042112, "step": 18515 }, { "epoch": 8.731730315888731, "grad_norm": 0.00037527523818425834, "learning_rate": 0.16740830314046493, "loss": 0.2017, "num_input_tokens_seen": 16045904, "step": 18520 }, { "epoch": 8.734087694483733, "grad_norm": 0.0003812809882219881, "learning_rate": 0.1673497949724733, "loss": 0.2248, "num_input_tokens_seen": 16049824, "step": 18525 }, { "epoch": 8.736445073078736, "grad_norm": 0.0007668108446523547, "learning_rate": 0.16729128412892522, "loss": 0.2269, "num_input_tokens_seen": 16053728, "step": 18530 }, { "epoch": 8.738802451673738, "grad_norm": 0.0011326927924528718, "learning_rate": 0.16723277061884384, "loss": 0.2094, "num_input_tokens_seen": 16058032, "step": 18535 }, { "epoch": 8.74115983026874, "grad_norm": 0.00044401141349226236, "learning_rate": 0.16717425445125267, "loss": 0.2029, "num_input_tokens_seen": 16062384, "step": 18540 }, { "epoch": 8.743517208863743, "grad_norm": 0.0004710571956820786, "learning_rate": 0.16711573563517565, "loss": 0.2414, "num_input_tokens_seen": 16067312, "step": 18545 }, { "epoch": 8.745874587458745, "grad_norm": 0.0015167276142165065, "learning_rate": 0.1670572141796371, "loss": 0.2186, "num_input_tokens_seen": 16071904, "step": 18550 }, { "epoch": 8.748231966053748, "grad_norm": 0.000627753499429673, "learning_rate": 0.16699869009366175, "loss": 0.2046, "num_input_tokens_seen": 16076880, "step": 18555 }, { "epoch": 8.75058934464875, "grad_norm": 0.0004848121025133878, "learning_rate": 0.1669401633862748, "loss": 0.223, "num_input_tokens_seen": 16080704, "step": 18560 }, { "epoch": 8.752946723243753, "grad_norm": 0.0007873569848015904, "learning_rate": 0.16688163406650178, "loss": 0.2463, "num_input_tokens_seen": 16084992, "step": 18565 }, { "epoch": 8.755304101838755, "grad_norm": 0.0006370774353854358, "learning_rate": 0.1668231021433686, "loss": 0.2359, "num_input_tokens_seen": 16090080, "step": 18570 }, { "epoch": 8.757661480433757, "grad_norm": 0.0005954314838163555, "learning_rate": 0.1667645676259017, "loss": 0.2413, "num_input_tokens_seen": 16094240, "step": 18575 }, { "epoch": 8.76001885902876, "grad_norm": 0.0004563817346934229, "learning_rate": 0.1667060305231277, "loss": 0.213, "num_input_tokens_seen": 16099184, "step": 18580 }, { "epoch": 8.762376237623762, "grad_norm": 0.0005711234407499433, "learning_rate": 0.16664749084407396, "loss": 0.2184, "num_input_tokens_seen": 16103680, "step": 18585 }, { "epoch": 8.764733616218765, "grad_norm": 0.0005423026741482317, "learning_rate": 0.16658894859776788, "loss": 0.2304, "num_input_tokens_seen": 16107536, "step": 18590 }, { "epoch": 8.767090994813767, "grad_norm": 0.0015658442862331867, "learning_rate": 0.16653040379323752, "loss": 0.2158, "num_input_tokens_seen": 16111104, "step": 18595 }, { "epoch": 8.76944837340877, "grad_norm": 0.0005919443792663515, "learning_rate": 0.16647185643951107, "loss": 0.2137, "num_input_tokens_seen": 16115152, "step": 18600 }, { "epoch": 8.76944837340877, "eval_loss": 0.217378169298172, "eval_runtime": 21.8988, "eval_samples_per_second": 43.062, "eval_steps_per_second": 21.554, "num_input_tokens_seen": 16115152, "step": 18600 }, { "epoch": 8.771805752003772, "grad_norm": 0.0004206119629088789, "learning_rate": 0.1664133065456174, "loss": 0.2107, "num_input_tokens_seen": 16119024, "step": 18605 }, { "epoch": 8.774163130598774, "grad_norm": 0.001129704061895609, "learning_rate": 0.1663547541205856, "loss": 0.2279, "num_input_tokens_seen": 16123632, "step": 18610 }, { "epoch": 8.776520509193777, "grad_norm": 0.0008168525528162718, "learning_rate": 0.16629619917344518, "loss": 0.2286, "num_input_tokens_seen": 16127584, "step": 18615 }, { "epoch": 8.778877887788779, "grad_norm": 0.0014294319553300738, "learning_rate": 0.16623764171322605, "loss": 0.2053, "num_input_tokens_seen": 16131536, "step": 18620 }, { "epoch": 8.781235266383781, "grad_norm": 0.0014796837931498885, "learning_rate": 0.1661790817489585, "loss": 0.2725, "num_input_tokens_seen": 16136960, "step": 18625 }, { "epoch": 8.783592644978784, "grad_norm": 0.0008465485298074782, "learning_rate": 0.16612051928967328, "loss": 0.2209, "num_input_tokens_seen": 16140784, "step": 18630 }, { "epoch": 8.785950023573786, "grad_norm": 0.0007864964427426457, "learning_rate": 0.16606195434440138, "loss": 0.2044, "num_input_tokens_seen": 16144464, "step": 18635 }, { "epoch": 8.788307402168789, "grad_norm": 0.0008262985502369702, "learning_rate": 0.16600338692217426, "loss": 0.2175, "num_input_tokens_seen": 16148048, "step": 18640 }, { "epoch": 8.790664780763791, "grad_norm": 0.000739801733288914, "learning_rate": 0.16594481703202374, "loss": 0.2154, "num_input_tokens_seen": 16152016, "step": 18645 }, { "epoch": 8.793022159358793, "grad_norm": 0.0005678630550391972, "learning_rate": 0.1658862446829821, "loss": 0.2131, "num_input_tokens_seen": 16156848, "step": 18650 }, { "epoch": 8.795379537953796, "grad_norm": 0.0007014997536316514, "learning_rate": 0.16582766988408187, "loss": 0.2233, "num_input_tokens_seen": 16160960, "step": 18655 }, { "epoch": 8.797736916548798, "grad_norm": 0.0005888852756470442, "learning_rate": 0.16576909264435608, "loss": 0.2107, "num_input_tokens_seen": 16164960, "step": 18660 }, { "epoch": 8.8000942951438, "grad_norm": 0.001299911760725081, "learning_rate": 0.16571051297283798, "loss": 0.2076, "num_input_tokens_seen": 16169504, "step": 18665 }, { "epoch": 8.802451673738803, "grad_norm": 0.0006208215490914881, "learning_rate": 0.16565193087856137, "loss": 0.2086, "num_input_tokens_seen": 16173168, "step": 18670 }, { "epoch": 8.804809052333805, "grad_norm": 0.0011324415681883693, "learning_rate": 0.16559334637056033, "loss": 0.2036, "num_input_tokens_seen": 16177696, "step": 18675 }, { "epoch": 8.807166430928808, "grad_norm": 0.0009565473301336169, "learning_rate": 0.16553475945786933, "loss": 0.2283, "num_input_tokens_seen": 16182000, "step": 18680 }, { "epoch": 8.80952380952381, "grad_norm": 0.0007318815332837403, "learning_rate": 0.16547617014952318, "loss": 0.2125, "num_input_tokens_seen": 16186544, "step": 18685 }, { "epoch": 8.811881188118813, "grad_norm": 0.0008638911531306803, "learning_rate": 0.1654175784545571, "loss": 0.1975, "num_input_tokens_seen": 16191216, "step": 18690 }, { "epoch": 8.814238566713815, "grad_norm": 0.0016324676107615232, "learning_rate": 0.1653589843820067, "loss": 0.2472, "num_input_tokens_seen": 16196160, "step": 18695 }, { "epoch": 8.816595945308817, "grad_norm": 0.0007655143272131681, "learning_rate": 0.1653003879409079, "loss": 0.2305, "num_input_tokens_seen": 16200160, "step": 18700 }, { "epoch": 8.81895332390382, "grad_norm": 0.0015668487176299095, "learning_rate": 0.165241789140297, "loss": 0.2207, "num_input_tokens_seen": 16203376, "step": 18705 }, { "epoch": 8.82131070249882, "grad_norm": 0.00101180886849761, "learning_rate": 0.16518318798921064, "loss": 0.2099, "num_input_tokens_seen": 16207472, "step": 18710 }, { "epoch": 8.823668081093825, "grad_norm": 0.0016081476351246238, "learning_rate": 0.16512458449668593, "loss": 0.2296, "num_input_tokens_seen": 16212144, "step": 18715 }, { "epoch": 8.826025459688825, "grad_norm": 0.0006952566327527165, "learning_rate": 0.1650659786717602, "loss": 0.214, "num_input_tokens_seen": 16216432, "step": 18720 }, { "epoch": 8.828382838283828, "grad_norm": 0.004767382517457008, "learning_rate": 0.1650073705234712, "loss": 0.2681, "num_input_tokens_seen": 16220848, "step": 18725 }, { "epoch": 8.83074021687883, "grad_norm": 0.0005493024364113808, "learning_rate": 0.16494876006085712, "loss": 0.2304, "num_input_tokens_seen": 16224768, "step": 18730 }, { "epoch": 8.833097595473832, "grad_norm": 0.0005917272064834833, "learning_rate": 0.16489014729295634, "loss": 0.2137, "num_input_tokens_seen": 16228896, "step": 18735 }, { "epoch": 8.835454974068835, "grad_norm": 0.0008148672641254961, "learning_rate": 0.16483153222880775, "loss": 0.2355, "num_input_tokens_seen": 16233088, "step": 18740 }, { "epoch": 8.837812352663837, "grad_norm": 0.0009700177470222116, "learning_rate": 0.16477291487745052, "loss": 0.2166, "num_input_tokens_seen": 16237648, "step": 18745 }, { "epoch": 8.84016973125884, "grad_norm": 0.001340088085271418, "learning_rate": 0.16471429524792416, "loss": 0.2088, "num_input_tokens_seen": 16241824, "step": 18750 }, { "epoch": 8.842527109853842, "grad_norm": 0.0008712430135346949, "learning_rate": 0.16465567334926856, "loss": 0.1874, "num_input_tokens_seen": 16245856, "step": 18755 }, { "epoch": 8.844884488448844, "grad_norm": 0.001651065074838698, "learning_rate": 0.16459704919052395, "loss": 0.2256, "num_input_tokens_seen": 16250480, "step": 18760 }, { "epoch": 8.847241867043847, "grad_norm": 0.0011034810449928045, "learning_rate": 0.16453842278073086, "loss": 0.2442, "num_input_tokens_seen": 16254864, "step": 18765 }, { "epoch": 8.84959924563885, "grad_norm": 0.0005329998675733805, "learning_rate": 0.16447979412893038, "loss": 0.2389, "num_input_tokens_seen": 16258704, "step": 18770 }, { "epoch": 8.851956624233852, "grad_norm": 0.0006073230179026723, "learning_rate": 0.16442116324416367, "loss": 0.2346, "num_input_tokens_seen": 16262864, "step": 18775 }, { "epoch": 8.854314002828854, "grad_norm": 0.0010504636447876692, "learning_rate": 0.1643625301354723, "loss": 0.226, "num_input_tokens_seen": 16267632, "step": 18780 }, { "epoch": 8.856671381423856, "grad_norm": 0.000914181349799037, "learning_rate": 0.16430389481189828, "loss": 0.2193, "num_input_tokens_seen": 16271552, "step": 18785 }, { "epoch": 8.859028760018859, "grad_norm": 0.0009132477571256459, "learning_rate": 0.164245257282484, "loss": 0.212, "num_input_tokens_seen": 16275680, "step": 18790 }, { "epoch": 8.861386138613861, "grad_norm": 0.0006554905558004975, "learning_rate": 0.16418661755627195, "loss": 0.2112, "num_input_tokens_seen": 16279744, "step": 18795 }, { "epoch": 8.863743517208864, "grad_norm": 0.00128890888299793, "learning_rate": 0.16412797564230527, "loss": 0.2182, "num_input_tokens_seen": 16284384, "step": 18800 }, { "epoch": 8.863743517208864, "eval_loss": 0.2159968912601471, "eval_runtime": 21.9784, "eval_samples_per_second": 42.906, "eval_steps_per_second": 21.476, "num_input_tokens_seen": 16284384, "step": 18800 }, { "epoch": 8.866100895803866, "grad_norm": 0.0006238169153220952, "learning_rate": 0.16406933154962713, "loss": 0.2689, "num_input_tokens_seen": 16288832, "step": 18805 }, { "epoch": 8.868458274398868, "grad_norm": 0.0007709947531111538, "learning_rate": 0.16401068528728133, "loss": 0.2171, "num_input_tokens_seen": 16293152, "step": 18810 }, { "epoch": 8.87081565299387, "grad_norm": 0.0015059354482218623, "learning_rate": 0.16395203686431173, "loss": 0.2079, "num_input_tokens_seen": 16297616, "step": 18815 }, { "epoch": 8.873173031588873, "grad_norm": 0.0006565727526322007, "learning_rate": 0.16389338628976277, "loss": 0.2117, "num_input_tokens_seen": 16302224, "step": 18820 }, { "epoch": 8.875530410183876, "grad_norm": 0.002393738366663456, "learning_rate": 0.163834733572679, "loss": 0.1912, "num_input_tokens_seen": 16307392, "step": 18825 }, { "epoch": 8.877887788778878, "grad_norm": 0.009231410920619965, "learning_rate": 0.16377607872210545, "loss": 0.2494, "num_input_tokens_seen": 16312528, "step": 18830 }, { "epoch": 8.88024516737388, "grad_norm": 0.001489865593612194, "learning_rate": 0.16371742174708748, "loss": 0.2103, "num_input_tokens_seen": 16316736, "step": 18835 }, { "epoch": 8.882602545968883, "grad_norm": 0.0008405112312175333, "learning_rate": 0.16365876265667065, "loss": 0.1659, "num_input_tokens_seen": 16321152, "step": 18840 }, { "epoch": 8.884959924563885, "grad_norm": 0.0011108461767435074, "learning_rate": 0.163600101459901, "loss": 0.1761, "num_input_tokens_seen": 16325344, "step": 18845 }, { "epoch": 8.887317303158888, "grad_norm": 0.0005577962729148567, "learning_rate": 0.16354143816582484, "loss": 0.1763, "num_input_tokens_seen": 16328928, "step": 18850 }, { "epoch": 8.88967468175389, "grad_norm": 0.0005054153152741492, "learning_rate": 0.1634827727834887, "loss": 0.3091, "num_input_tokens_seen": 16333024, "step": 18855 }, { "epoch": 8.892032060348892, "grad_norm": 0.0007885924424044788, "learning_rate": 0.16342410532193954, "loss": 0.2695, "num_input_tokens_seen": 16337136, "step": 18860 }, { "epoch": 8.894389438943895, "grad_norm": 0.0006701434031128883, "learning_rate": 0.16336543579022464, "loss": 0.216, "num_input_tokens_seen": 16341472, "step": 18865 }, { "epoch": 8.896746817538897, "grad_norm": 0.0011918238596990705, "learning_rate": 0.16330676419739157, "loss": 0.2275, "num_input_tokens_seen": 16346480, "step": 18870 }, { "epoch": 8.8991041961339, "grad_norm": 0.0005918386159464717, "learning_rate": 0.1632480905524883, "loss": 0.2124, "num_input_tokens_seen": 16350272, "step": 18875 }, { "epoch": 8.901461574728902, "grad_norm": 0.0007370043313130736, "learning_rate": 0.16318941486456293, "loss": 0.1946, "num_input_tokens_seen": 16354160, "step": 18880 }, { "epoch": 8.903818953323904, "grad_norm": 0.0010830272221937776, "learning_rate": 0.16313073714266405, "loss": 0.2151, "num_input_tokens_seen": 16359248, "step": 18885 }, { "epoch": 8.906176331918907, "grad_norm": 0.002053155330941081, "learning_rate": 0.16307205739584052, "loss": 0.2693, "num_input_tokens_seen": 16363664, "step": 18890 }, { "epoch": 8.90853371051391, "grad_norm": 0.001284484169445932, "learning_rate": 0.16301337563314144, "loss": 0.226, "num_input_tokens_seen": 16367632, "step": 18895 }, { "epoch": 8.910891089108912, "grad_norm": 0.0005017881048843265, "learning_rate": 0.1629546918636163, "loss": 0.159, "num_input_tokens_seen": 16371536, "step": 18900 }, { "epoch": 8.913248467703912, "grad_norm": 0.0009290958405472338, "learning_rate": 0.16289600609631485, "loss": 0.2383, "num_input_tokens_seen": 16375888, "step": 18905 }, { "epoch": 8.915605846298917, "grad_norm": 0.0008688514353707433, "learning_rate": 0.16283731834028722, "loss": 0.1865, "num_input_tokens_seen": 16380816, "step": 18910 }, { "epoch": 8.917963224893917, "grad_norm": 0.0005103500443510711, "learning_rate": 0.16277862860458378, "loss": 0.2238, "num_input_tokens_seen": 16384944, "step": 18915 }, { "epoch": 8.92032060348892, "grad_norm": 0.0010723761515691876, "learning_rate": 0.16271993689825526, "loss": 0.2085, "num_input_tokens_seen": 16389296, "step": 18920 }, { "epoch": 8.922677982083922, "grad_norm": 0.0013141264207661152, "learning_rate": 0.1626612432303526, "loss": 0.254, "num_input_tokens_seen": 16393440, "step": 18925 }, { "epoch": 8.925035360678924, "grad_norm": 0.0005712908459827304, "learning_rate": 0.1626025476099271, "loss": 0.2255, "num_input_tokens_seen": 16397872, "step": 18930 }, { "epoch": 8.927392739273927, "grad_norm": 0.0007861306075938046, "learning_rate": 0.1625438500460304, "loss": 0.2286, "num_input_tokens_seen": 16401968, "step": 18935 }, { "epoch": 8.92975011786893, "grad_norm": 0.001409479882568121, "learning_rate": 0.16248515054771442, "loss": 0.2219, "num_input_tokens_seen": 16406000, "step": 18940 }, { "epoch": 8.932107496463932, "grad_norm": 0.0013923911610618234, "learning_rate": 0.16242644912403123, "loss": 0.2193, "num_input_tokens_seen": 16410336, "step": 18945 }, { "epoch": 8.934464875058934, "grad_norm": 0.0005249543464742601, "learning_rate": 0.1623677457840335, "loss": 0.2195, "num_input_tokens_seen": 16414896, "step": 18950 }, { "epoch": 8.936822253653936, "grad_norm": 0.0006817495450377464, "learning_rate": 0.16230904053677397, "loss": 0.2182, "num_input_tokens_seen": 16419056, "step": 18955 }, { "epoch": 8.939179632248939, "grad_norm": 0.002415242837741971, "learning_rate": 0.16225033339130568, "loss": 0.2357, "num_input_tokens_seen": 16423088, "step": 18960 }, { "epoch": 8.941537010843941, "grad_norm": 0.0007602553232572973, "learning_rate": 0.16219162435668197, "loss": 0.213, "num_input_tokens_seen": 16427152, "step": 18965 }, { "epoch": 8.943894389438944, "grad_norm": 0.0003763705608434975, "learning_rate": 0.16213291344195666, "loss": 0.1726, "num_input_tokens_seen": 16430688, "step": 18970 }, { "epoch": 8.946251768033946, "grad_norm": 0.0018044696189463139, "learning_rate": 0.16207420065618358, "loss": 0.2407, "num_input_tokens_seen": 16435040, "step": 18975 }, { "epoch": 8.948609146628948, "grad_norm": 0.0014765002997592092, "learning_rate": 0.16201548600841706, "loss": 0.2262, "num_input_tokens_seen": 16438960, "step": 18980 }, { "epoch": 8.95096652522395, "grad_norm": 0.001295358524657786, "learning_rate": 0.16195676950771154, "loss": 0.1891, "num_input_tokens_seen": 16442656, "step": 18985 }, { "epoch": 8.953323903818953, "grad_norm": 0.0005954341613687575, "learning_rate": 0.16189805116312198, "loss": 0.2915, "num_input_tokens_seen": 16446864, "step": 18990 }, { "epoch": 8.955681282413956, "grad_norm": 0.0020496181678026915, "learning_rate": 0.16183933098370337, "loss": 0.2403, "num_input_tokens_seen": 16453104, "step": 18995 }, { "epoch": 8.958038661008958, "grad_norm": 0.0007852452108636498, "learning_rate": 0.16178060897851115, "loss": 0.2249, "num_input_tokens_seen": 16457552, "step": 19000 }, { "epoch": 8.958038661008958, "eval_loss": 0.22157442569732666, "eval_runtime": 21.8927, "eval_samples_per_second": 43.074, "eval_steps_per_second": 21.56, "num_input_tokens_seen": 16457552, "step": 19000 }, { "epoch": 8.96039603960396, "grad_norm": 0.0015846642199903727, "learning_rate": 0.16172188515660096, "loss": 0.2346, "num_input_tokens_seen": 16461392, "step": 19005 }, { "epoch": 8.962753418198963, "grad_norm": 0.0014008423313498497, "learning_rate": 0.16166315952702878, "loss": 0.1988, "num_input_tokens_seen": 16466320, "step": 19010 }, { "epoch": 8.965110796793965, "grad_norm": 0.0008225043420679867, "learning_rate": 0.16160443209885084, "loss": 0.1992, "num_input_tokens_seen": 16470816, "step": 19015 }, { "epoch": 8.967468175388968, "grad_norm": 0.0008845989359542727, "learning_rate": 0.16154570288112363, "loss": 0.2425, "num_input_tokens_seen": 16474976, "step": 19020 }, { "epoch": 8.96982555398397, "grad_norm": 0.0009841021383181214, "learning_rate": 0.16148697188290395, "loss": 0.2282, "num_input_tokens_seen": 16479216, "step": 19025 }, { "epoch": 8.972182932578972, "grad_norm": 0.0007237554527819157, "learning_rate": 0.16142823911324888, "loss": 0.2296, "num_input_tokens_seen": 16483536, "step": 19030 }, { "epoch": 8.974540311173975, "grad_norm": 0.0012564309872686863, "learning_rate": 0.16136950458121568, "loss": 0.2529, "num_input_tokens_seen": 16488304, "step": 19035 }, { "epoch": 8.976897689768977, "grad_norm": 0.001177856232970953, "learning_rate": 0.16131076829586205, "loss": 0.222, "num_input_tokens_seen": 16492688, "step": 19040 }, { "epoch": 8.97925506836398, "grad_norm": 0.0008911918848752975, "learning_rate": 0.1612520302662457, "loss": 0.2295, "num_input_tokens_seen": 16496576, "step": 19045 }, { "epoch": 8.981612446958982, "grad_norm": 0.0008841329836286604, "learning_rate": 0.16119329050142497, "loss": 0.2411, "num_input_tokens_seen": 16501120, "step": 19050 }, { "epoch": 8.983969825553984, "grad_norm": 0.001054307329468429, "learning_rate": 0.16113454901045818, "loss": 0.2322, "num_input_tokens_seen": 16505520, "step": 19055 }, { "epoch": 8.986327204148987, "grad_norm": 0.0005575273535214365, "learning_rate": 0.16107580580240397, "loss": 0.2309, "num_input_tokens_seen": 16510032, "step": 19060 }, { "epoch": 8.98868458274399, "grad_norm": 0.0005366908735595644, "learning_rate": 0.16101706088632134, "loss": 0.2223, "num_input_tokens_seen": 16514416, "step": 19065 }, { "epoch": 8.991041961338992, "grad_norm": 0.0003956820582970977, "learning_rate": 0.16095831427126947, "loss": 0.2232, "num_input_tokens_seen": 16518256, "step": 19070 }, { "epoch": 8.993399339933994, "grad_norm": 0.00056469562696293, "learning_rate": 0.16089956596630783, "loss": 0.2416, "num_input_tokens_seen": 16522784, "step": 19075 }, { "epoch": 8.995756718528996, "grad_norm": 0.0011187553172931075, "learning_rate": 0.16084081598049618, "loss": 0.2248, "num_input_tokens_seen": 16527056, "step": 19080 }, { "epoch": 8.998114097123999, "grad_norm": 0.0004213513748254627, "learning_rate": 0.1607820643228944, "loss": 0.2224, "num_input_tokens_seen": 16531328, "step": 19085 }, { "epoch": 9.000471475719001, "grad_norm": 0.0010986063862219453, "learning_rate": 0.16072331100256285, "loss": 0.2314, "num_input_tokens_seen": 16535808, "step": 19090 }, { "epoch": 9.002828854314004, "grad_norm": 0.0004284385940991342, "learning_rate": 0.16066455602856197, "loss": 0.2117, "num_input_tokens_seen": 16541664, "step": 19095 }, { "epoch": 9.005186232909006, "grad_norm": 0.000663784914650023, "learning_rate": 0.16060579940995257, "loss": 0.2371, "num_input_tokens_seen": 16545808, "step": 19100 }, { "epoch": 9.007543611504008, "grad_norm": 0.0006084454944357276, "learning_rate": 0.16054704115579557, "loss": 0.2126, "num_input_tokens_seen": 16550400, "step": 19105 }, { "epoch": 9.009900990099009, "grad_norm": 0.0014110076008364558, "learning_rate": 0.1604882812751523, "loss": 0.2234, "num_input_tokens_seen": 16554240, "step": 19110 }, { "epoch": 9.012258368694011, "grad_norm": 0.0005746796377934515, "learning_rate": 0.16042951977708425, "loss": 0.2196, "num_input_tokens_seen": 16558672, "step": 19115 }, { "epoch": 9.014615747289014, "grad_norm": 0.0006133854622021317, "learning_rate": 0.16037075667065318, "loss": 0.2038, "num_input_tokens_seen": 16563168, "step": 19120 }, { "epoch": 9.016973125884016, "grad_norm": 0.0007921397918835282, "learning_rate": 0.1603119919649211, "loss": 0.207, "num_input_tokens_seen": 16567152, "step": 19125 }, { "epoch": 9.019330504479019, "grad_norm": 0.0004568667500279844, "learning_rate": 0.16025322566895028, "loss": 0.2207, "num_input_tokens_seen": 16571408, "step": 19130 }, { "epoch": 9.021687883074021, "grad_norm": 0.0005554032977670431, "learning_rate": 0.16019445779180322, "loss": 0.2327, "num_input_tokens_seen": 16575664, "step": 19135 }, { "epoch": 9.024045261669023, "grad_norm": 0.001470726216211915, "learning_rate": 0.16013568834254271, "loss": 0.211, "num_input_tokens_seen": 16579744, "step": 19140 }, { "epoch": 9.026402640264026, "grad_norm": 0.0005826438427902758, "learning_rate": 0.1600769173302316, "loss": 0.2262, "num_input_tokens_seen": 16583936, "step": 19145 }, { "epoch": 9.028760018859028, "grad_norm": 0.0007293298840522766, "learning_rate": 0.16001814476393322, "loss": 0.2165, "num_input_tokens_seen": 16588272, "step": 19150 }, { "epoch": 9.03111739745403, "grad_norm": 0.0006621292559430003, "learning_rate": 0.15995937065271104, "loss": 0.2155, "num_input_tokens_seen": 16591712, "step": 19155 }, { "epoch": 9.033474776049033, "grad_norm": 0.0016877889866009355, "learning_rate": 0.15990059500562873, "loss": 0.2061, "num_input_tokens_seen": 16595824, "step": 19160 }, { "epoch": 9.035832154644035, "grad_norm": 0.0007410135585814714, "learning_rate": 0.15984181783175025, "loss": 0.2428, "num_input_tokens_seen": 16600304, "step": 19165 }, { "epoch": 9.038189533239038, "grad_norm": 0.001243826001882553, "learning_rate": 0.1597830391401398, "loss": 0.2182, "num_input_tokens_seen": 16604912, "step": 19170 }, { "epoch": 9.04054691183404, "grad_norm": 0.0013863013591617346, "learning_rate": 0.15972425893986178, "loss": 0.2332, "num_input_tokens_seen": 16609376, "step": 19175 }, { "epoch": 9.042904290429043, "grad_norm": 0.0007335345726460218, "learning_rate": 0.15966547723998084, "loss": 0.2405, "num_input_tokens_seen": 16613296, "step": 19180 }, { "epoch": 9.045261669024045, "grad_norm": 0.001419559121131897, "learning_rate": 0.15960669404956176, "loss": 0.2328, "num_input_tokens_seen": 16617824, "step": 19185 }, { "epoch": 9.047619047619047, "grad_norm": 0.0015403294237330556, "learning_rate": 0.1595479093776698, "loss": 0.1817, "num_input_tokens_seen": 16622768, "step": 19190 }, { "epoch": 9.04997642621405, "grad_norm": 0.001643997966311872, "learning_rate": 0.15948912323337022, "loss": 0.2317, "num_input_tokens_seen": 16628464, "step": 19195 }, { "epoch": 9.052333804809052, "grad_norm": 0.0017574172234162688, "learning_rate": 0.1594303356257286, "loss": 0.2489, "num_input_tokens_seen": 16632272, "step": 19200 }, { "epoch": 9.052333804809052, "eval_loss": 0.2172006219625473, "eval_runtime": 21.8936, "eval_samples_per_second": 43.072, "eval_steps_per_second": 21.559, "num_input_tokens_seen": 16632272, "step": 19200 }, { "epoch": 9.054691183404055, "grad_norm": 0.0013368241488933563, "learning_rate": 0.15937154656381072, "loss": 0.2237, "num_input_tokens_seen": 16635680, "step": 19205 }, { "epoch": 9.057048561999057, "grad_norm": 0.0006881268927827477, "learning_rate": 0.15931275605668258, "loss": 0.1929, "num_input_tokens_seen": 16640208, "step": 19210 }, { "epoch": 9.05940594059406, "grad_norm": 0.0016955672763288021, "learning_rate": 0.1592539641134104, "loss": 0.2559, "num_input_tokens_seen": 16644784, "step": 19215 }, { "epoch": 9.061763319189062, "grad_norm": 0.0010982337407767773, "learning_rate": 0.1591951707430607, "loss": 0.239, "num_input_tokens_seen": 16648912, "step": 19220 }, { "epoch": 9.064120697784064, "grad_norm": 0.001591330743394792, "learning_rate": 0.15913637595470007, "loss": 0.2055, "num_input_tokens_seen": 16653216, "step": 19225 }, { "epoch": 9.066478076379067, "grad_norm": 0.0021961117163300514, "learning_rate": 0.15907757975739548, "loss": 0.2158, "num_input_tokens_seen": 16657296, "step": 19230 }, { "epoch": 9.068835454974069, "grad_norm": 0.000696578121278435, "learning_rate": 0.159018782160214, "loss": 0.2125, "num_input_tokens_seen": 16661856, "step": 19235 }, { "epoch": 9.071192833569071, "grad_norm": 0.0006848767516203225, "learning_rate": 0.158959983172223, "loss": 0.2019, "num_input_tokens_seen": 16665792, "step": 19240 }, { "epoch": 9.073550212164074, "grad_norm": 0.0007311970693990588, "learning_rate": 0.15890118280249, "loss": 0.2066, "num_input_tokens_seen": 16670336, "step": 19245 }, { "epoch": 9.075907590759076, "grad_norm": 0.0010550773004069924, "learning_rate": 0.15884238106008275, "loss": 0.2288, "num_input_tokens_seen": 16674832, "step": 19250 }, { "epoch": 9.078264969354079, "grad_norm": 0.0008420665981248021, "learning_rate": 0.15878357795406922, "loss": 0.2487, "num_input_tokens_seen": 16679696, "step": 19255 }, { "epoch": 9.080622347949081, "grad_norm": 0.0013266054447740316, "learning_rate": 0.15872477349351757, "loss": 0.22, "num_input_tokens_seen": 16683344, "step": 19260 }, { "epoch": 9.082979726544083, "grad_norm": 0.0015388745814561844, "learning_rate": 0.15866596768749622, "loss": 0.2216, "num_input_tokens_seen": 16687920, "step": 19265 }, { "epoch": 9.085337105139086, "grad_norm": 0.0007284139865078032, "learning_rate": 0.15860716054507373, "loss": 0.2519, "num_input_tokens_seen": 16692288, "step": 19270 }, { "epoch": 9.087694483734088, "grad_norm": 0.0011999985435977578, "learning_rate": 0.1585483520753189, "loss": 0.2283, "num_input_tokens_seen": 16696288, "step": 19275 }, { "epoch": 9.09005186232909, "grad_norm": 0.0006251749582588673, "learning_rate": 0.1584895422873008, "loss": 0.2246, "num_input_tokens_seen": 16700800, "step": 19280 }, { "epoch": 9.092409240924093, "grad_norm": 0.0006919074221514165, "learning_rate": 0.1584307311900886, "loss": 0.215, "num_input_tokens_seen": 16705424, "step": 19285 }, { "epoch": 9.094766619519095, "grad_norm": 0.0013530576834455132, "learning_rate": 0.1583719187927517, "loss": 0.2067, "num_input_tokens_seen": 16709648, "step": 19290 }, { "epoch": 9.097123998114098, "grad_norm": 0.0005306766252033412, "learning_rate": 0.15831310510435967, "loss": 0.1886, "num_input_tokens_seen": 16713552, "step": 19295 }, { "epoch": 9.0994813767091, "grad_norm": 0.000619783706497401, "learning_rate": 0.15825429013398243, "loss": 0.2857, "num_input_tokens_seen": 16717808, "step": 19300 }, { "epoch": 9.101838755304103, "grad_norm": 0.0005635828129015863, "learning_rate": 0.15819547389068986, "loss": 0.2173, "num_input_tokens_seen": 16721680, "step": 19305 }, { "epoch": 9.104196133899103, "grad_norm": 0.0006941886967979372, "learning_rate": 0.1581366563835522, "loss": 0.1976, "num_input_tokens_seen": 16725728, "step": 19310 }, { "epoch": 9.106553512494106, "grad_norm": 0.0006347576272673905, "learning_rate": 0.15807783762163993, "loss": 0.2243, "num_input_tokens_seen": 16730224, "step": 19315 }, { "epoch": 9.108910891089108, "grad_norm": 0.0012645877432078123, "learning_rate": 0.15801901761402365, "loss": 0.1793, "num_input_tokens_seen": 16734128, "step": 19320 }, { "epoch": 9.11126826968411, "grad_norm": 0.0011194159742444754, "learning_rate": 0.157960196369774, "loss": 0.1707, "num_input_tokens_seen": 16739024, "step": 19325 }, { "epoch": 9.113625648279113, "grad_norm": 0.0008897940861061215, "learning_rate": 0.157901373897962, "loss": 0.2312, "num_input_tokens_seen": 16742800, "step": 19330 }, { "epoch": 9.115983026874115, "grad_norm": 0.0010471489513292909, "learning_rate": 0.15784255020765892, "loss": 0.2578, "num_input_tokens_seen": 16748096, "step": 19335 }, { "epoch": 9.118340405469118, "grad_norm": 0.001466831425204873, "learning_rate": 0.157783725307936, "loss": 0.2035, "num_input_tokens_seen": 16752224, "step": 19340 }, { "epoch": 9.12069778406412, "grad_norm": 0.000609258480835706, "learning_rate": 0.15772489920786484, "loss": 0.2365, "num_input_tokens_seen": 16757056, "step": 19345 }, { "epoch": 9.123055162659123, "grad_norm": 0.0004949611029587686, "learning_rate": 0.15766607191651713, "loss": 0.225, "num_input_tokens_seen": 16760768, "step": 19350 }, { "epoch": 9.125412541254125, "grad_norm": 0.0013947695260867476, "learning_rate": 0.1576072434429648, "loss": 0.2067, "num_input_tokens_seen": 16765152, "step": 19355 }, { "epoch": 9.127769919849127, "grad_norm": 0.0006468924693763256, "learning_rate": 0.15754841379627998, "loss": 0.2333, "num_input_tokens_seen": 16769664, "step": 19360 }, { "epoch": 9.13012729844413, "grad_norm": 0.0010903155198320746, "learning_rate": 0.15748958298553484, "loss": 0.1976, "num_input_tokens_seen": 16774624, "step": 19365 }, { "epoch": 9.132484677039132, "grad_norm": 0.0005933353095315397, "learning_rate": 0.1574307510198019, "loss": 0.1987, "num_input_tokens_seen": 16778752, "step": 19370 }, { "epoch": 9.134842055634135, "grad_norm": 0.000687772873789072, "learning_rate": 0.15737191790815375, "loss": 0.2285, "num_input_tokens_seen": 16783648, "step": 19375 }, { "epoch": 9.137199434229137, "grad_norm": 0.0008050749311223626, "learning_rate": 0.15731308365966323, "loss": 0.2418, "num_input_tokens_seen": 16788640, "step": 19380 }, { "epoch": 9.13955681282414, "grad_norm": 0.0009536448051221669, "learning_rate": 0.15725424828340331, "loss": 0.2098, "num_input_tokens_seen": 16792560, "step": 19385 }, { "epoch": 9.141914191419142, "grad_norm": 0.0007546758279204369, "learning_rate": 0.15719541178844715, "loss": 0.2184, "num_input_tokens_seen": 16796624, "step": 19390 }, { "epoch": 9.144271570014144, "grad_norm": 0.000612596224527806, "learning_rate": 0.15713657418386806, "loss": 0.226, "num_input_tokens_seen": 16801712, "step": 19395 }, { "epoch": 9.146628948609147, "grad_norm": 0.0008675246499478817, "learning_rate": 0.15707773547873957, "loss": 0.2361, "num_input_tokens_seen": 16806304, "step": 19400 }, { "epoch": 9.146628948609147, "eval_loss": 0.21671287715435028, "eval_runtime": 21.9288, "eval_samples_per_second": 43.003, "eval_steps_per_second": 21.524, "num_input_tokens_seen": 16806304, "step": 19400 }, { "epoch": 9.148986327204149, "grad_norm": 0.0007517562480643392, "learning_rate": 0.1570188956821353, "loss": 0.2182, "num_input_tokens_seen": 16811440, "step": 19405 }, { "epoch": 9.151343705799151, "grad_norm": 0.0006993401912041008, "learning_rate": 0.1569600548031291, "loss": 0.2049, "num_input_tokens_seen": 16814816, "step": 19410 }, { "epoch": 9.153701084394154, "grad_norm": 0.0006031584343872964, "learning_rate": 0.156901212850795, "loss": 0.2054, "num_input_tokens_seen": 16818672, "step": 19415 }, { "epoch": 9.156058462989156, "grad_norm": 0.001139874686487019, "learning_rate": 0.15684236983420716, "loss": 0.2253, "num_input_tokens_seen": 16822880, "step": 19420 }, { "epoch": 9.158415841584159, "grad_norm": 0.0005851318128407001, "learning_rate": 0.1567835257624399, "loss": 0.2293, "num_input_tokens_seen": 16826544, "step": 19425 }, { "epoch": 9.160773220179161, "grad_norm": 0.0012328233569860458, "learning_rate": 0.1567246806445677, "loss": 0.2253, "num_input_tokens_seen": 16831024, "step": 19430 }, { "epoch": 9.163130598774163, "grad_norm": 0.0007991245947778225, "learning_rate": 0.15666583448966526, "loss": 0.1871, "num_input_tokens_seen": 16835632, "step": 19435 }, { "epoch": 9.165487977369166, "grad_norm": 0.0005156891420483589, "learning_rate": 0.1566069873068074, "loss": 0.2206, "num_input_tokens_seen": 16839744, "step": 19440 }, { "epoch": 9.167845355964168, "grad_norm": 0.0014800290809944272, "learning_rate": 0.156548139105069, "loss": 0.2401, "num_input_tokens_seen": 16844256, "step": 19445 }, { "epoch": 9.17020273455917, "grad_norm": 0.0005292773712426424, "learning_rate": 0.15648928989352529, "loss": 0.1986, "num_input_tokens_seen": 16848400, "step": 19450 }, { "epoch": 9.172560113154173, "grad_norm": 0.0005309787811711431, "learning_rate": 0.15643043968125156, "loss": 0.2066, "num_input_tokens_seen": 16852064, "step": 19455 }, { "epoch": 9.174917491749175, "grad_norm": 0.0007944013923406601, "learning_rate": 0.15637158847732316, "loss": 0.2195, "num_input_tokens_seen": 16856528, "step": 19460 }, { "epoch": 9.177274870344178, "grad_norm": 0.0006142548518255353, "learning_rate": 0.15631273629081582, "loss": 0.2073, "num_input_tokens_seen": 16860736, "step": 19465 }, { "epoch": 9.17963224893918, "grad_norm": 0.0006446227780543268, "learning_rate": 0.15625388313080518, "loss": 0.2295, "num_input_tokens_seen": 16864704, "step": 19470 }, { "epoch": 9.181989627534183, "grad_norm": 0.001815654686652124, "learning_rate": 0.15619502900636714, "loss": 0.2303, "num_input_tokens_seen": 16869536, "step": 19475 }, { "epoch": 9.184347006129185, "grad_norm": 0.0005904544377699494, "learning_rate": 0.15613617392657783, "loss": 0.2348, "num_input_tokens_seen": 16874416, "step": 19480 }, { "epoch": 9.186704384724187, "grad_norm": 0.0010089308489114046, "learning_rate": 0.15607731790051335, "loss": 0.2227, "num_input_tokens_seen": 16878800, "step": 19485 }, { "epoch": 9.18906176331919, "grad_norm": 0.0020343728829175234, "learning_rate": 0.15601846093725008, "loss": 0.2327, "num_input_tokens_seen": 16883856, "step": 19490 }, { "epoch": 9.191419141914192, "grad_norm": 0.0007493772427551448, "learning_rate": 0.1559596030458645, "loss": 0.1971, "num_input_tokens_seen": 16887808, "step": 19495 }, { "epoch": 9.193776520509195, "grad_norm": 0.001080323476344347, "learning_rate": 0.1559007442354333, "loss": 0.1862, "num_input_tokens_seen": 16891536, "step": 19500 }, { "epoch": 9.196133899104197, "grad_norm": 0.0008865422569215298, "learning_rate": 0.15584188451503314, "loss": 0.2151, "num_input_tokens_seen": 16896192, "step": 19505 }, { "epoch": 9.198491277699198, "grad_norm": 0.0008593936800025403, "learning_rate": 0.15578302389374094, "loss": 0.2016, "num_input_tokens_seen": 16900880, "step": 19510 }, { "epoch": 9.2008486562942, "grad_norm": 0.0010523648234084249, "learning_rate": 0.1557241623806338, "loss": 0.1814, "num_input_tokens_seen": 16904832, "step": 19515 }, { "epoch": 9.203206034889202, "grad_norm": 0.0010727095650509, "learning_rate": 0.15566529998478887, "loss": 0.2307, "num_input_tokens_seen": 16909552, "step": 19520 }, { "epoch": 9.205563413484205, "grad_norm": 0.00045209444942884147, "learning_rate": 0.15560643671528354, "loss": 0.2243, "num_input_tokens_seen": 16913872, "step": 19525 }, { "epoch": 9.207920792079207, "grad_norm": 0.000814487284515053, "learning_rate": 0.15554757258119514, "loss": 0.202, "num_input_tokens_seen": 16918176, "step": 19530 }, { "epoch": 9.21027817067421, "grad_norm": 0.0005932566709816456, "learning_rate": 0.1554887075916014, "loss": 0.2179, "num_input_tokens_seen": 16922144, "step": 19535 }, { "epoch": 9.212635549269212, "grad_norm": 0.0006946565699763596, "learning_rate": 0.15542984175558, "loss": 0.2359, "num_input_tokens_seen": 16926752, "step": 19540 }, { "epoch": 9.214992927864214, "grad_norm": 0.0008088828180916607, "learning_rate": 0.1553709750822087, "loss": 0.2183, "num_input_tokens_seen": 16931168, "step": 19545 }, { "epoch": 9.217350306459217, "grad_norm": 0.0009399834671057761, "learning_rate": 0.15531210758056554, "loss": 0.2314, "num_input_tokens_seen": 16935440, "step": 19550 }, { "epoch": 9.21970768505422, "grad_norm": 0.0013915252638980746, "learning_rate": 0.15525323925972867, "loss": 0.2504, "num_input_tokens_seen": 16940512, "step": 19555 }, { "epoch": 9.222065063649222, "grad_norm": 0.0008073599310591817, "learning_rate": 0.15519437012877627, "loss": 0.2318, "num_input_tokens_seen": 16944688, "step": 19560 }, { "epoch": 9.224422442244224, "grad_norm": 0.001296561909839511, "learning_rate": 0.15513550019678676, "loss": 0.2278, "num_input_tokens_seen": 16948416, "step": 19565 }, { "epoch": 9.226779820839226, "grad_norm": 0.0017550145275890827, "learning_rate": 0.15507662947283854, "loss": 0.2206, "num_input_tokens_seen": 16952800, "step": 19570 }, { "epoch": 9.229137199434229, "grad_norm": 0.0007011658162809908, "learning_rate": 0.15501775796601028, "loss": 0.2, "num_input_tokens_seen": 16956032, "step": 19575 }, { "epoch": 9.231494578029231, "grad_norm": 0.0007046512328088284, "learning_rate": 0.15495888568538066, "loss": 0.1873, "num_input_tokens_seen": 16961152, "step": 19580 }, { "epoch": 9.233851956624234, "grad_norm": 0.0005122357397340238, "learning_rate": 0.1549000126400286, "loss": 0.1993, "num_input_tokens_seen": 16965792, "step": 19585 }, { "epoch": 9.236209335219236, "grad_norm": 0.0021124614868313074, "learning_rate": 0.15484113883903294, "loss": 0.2007, "num_input_tokens_seen": 16971008, "step": 19590 }, { "epoch": 9.238566713814238, "grad_norm": 0.0018380408873781562, "learning_rate": 0.15478226429147288, "loss": 0.2849, "num_input_tokens_seen": 16974304, "step": 19595 }, { "epoch": 9.24092409240924, "grad_norm": 0.0014694529818370938, "learning_rate": 0.15472338900642757, "loss": 0.2656, "num_input_tokens_seen": 16979072, "step": 19600 }, { "epoch": 9.24092409240924, "eval_loss": 0.21500974893569946, "eval_runtime": 21.9022, "eval_samples_per_second": 43.055, "eval_steps_per_second": 21.55, "num_input_tokens_seen": 16979072, "step": 19600 }, { "epoch": 9.243281471004243, "grad_norm": 0.0014381809160113335, "learning_rate": 0.15466451299297632, "loss": 0.2309, "num_input_tokens_seen": 16983264, "step": 19605 }, { "epoch": 9.245638849599246, "grad_norm": 0.0008513167849741876, "learning_rate": 0.15460563626019852, "loss": 0.2223, "num_input_tokens_seen": 16987328, "step": 19610 }, { "epoch": 9.247996228194248, "grad_norm": 0.0010599633678793907, "learning_rate": 0.15454675881717375, "loss": 0.2346, "num_input_tokens_seen": 16991472, "step": 19615 }, { "epoch": 9.25035360678925, "grad_norm": 0.0006905505433678627, "learning_rate": 0.1544878806729816, "loss": 0.2295, "num_input_tokens_seen": 16995536, "step": 19620 }, { "epoch": 9.252710985384253, "grad_norm": 0.0009247318957932293, "learning_rate": 0.1544290018367019, "loss": 0.2254, "num_input_tokens_seen": 16999216, "step": 19625 }, { "epoch": 9.255068363979255, "grad_norm": 0.0006859771674498916, "learning_rate": 0.15437012231741445, "loss": 0.2309, "num_input_tokens_seen": 17003168, "step": 19630 }, { "epoch": 9.257425742574258, "grad_norm": 0.0008022209513001144, "learning_rate": 0.1543112421241992, "loss": 0.2115, "num_input_tokens_seen": 17007056, "step": 19635 }, { "epoch": 9.25978312116926, "grad_norm": 0.001319844275712967, "learning_rate": 0.15425236126613626, "loss": 0.2342, "num_input_tokens_seen": 17011008, "step": 19640 }, { "epoch": 9.262140499764262, "grad_norm": 0.0006238723872229457, "learning_rate": 0.15419347975230577, "loss": 0.2237, "num_input_tokens_seen": 17014976, "step": 19645 }, { "epoch": 9.264497878359265, "grad_norm": 0.0007251785136759281, "learning_rate": 0.154134597591788, "loss": 0.2206, "num_input_tokens_seen": 17019920, "step": 19650 }, { "epoch": 9.266855256954267, "grad_norm": 0.000666293315589428, "learning_rate": 0.1540757147936633, "loss": 0.2171, "num_input_tokens_seen": 17024112, "step": 19655 }, { "epoch": 9.26921263554927, "grad_norm": 0.0023159752599895, "learning_rate": 0.1540168313670122, "loss": 0.232, "num_input_tokens_seen": 17028208, "step": 19660 }, { "epoch": 9.271570014144272, "grad_norm": 0.0013567954301834106, "learning_rate": 0.1539579473209152, "loss": 0.2176, "num_input_tokens_seen": 17032288, "step": 19665 }, { "epoch": 9.273927392739274, "grad_norm": 0.0009036398841999471, "learning_rate": 0.15389906266445294, "loss": 0.225, "num_input_tokens_seen": 17036864, "step": 19670 }, { "epoch": 9.276284771334277, "grad_norm": 0.0015848031034693122, "learning_rate": 0.15384017740670627, "loss": 0.1994, "num_input_tokens_seen": 17040512, "step": 19675 }, { "epoch": 9.27864214992928, "grad_norm": 0.0007027833489701152, "learning_rate": 0.15378129155675602, "loss": 0.2263, "num_input_tokens_seen": 17044544, "step": 19680 }, { "epoch": 9.280999528524282, "grad_norm": 0.0009053319226950407, "learning_rate": 0.15372240512368307, "loss": 0.1584, "num_input_tokens_seen": 17048800, "step": 19685 }, { "epoch": 9.283356907119284, "grad_norm": 0.0019408568041399121, "learning_rate": 0.1536635181165684, "loss": 0.1935, "num_input_tokens_seen": 17053232, "step": 19690 }, { "epoch": 9.285714285714286, "grad_norm": 0.0012799431569874287, "learning_rate": 0.15360463054449328, "loss": 0.1992, "num_input_tokens_seen": 17057328, "step": 19695 }, { "epoch": 9.288071664309289, "grad_norm": 0.0005950471386313438, "learning_rate": 0.1535457424165388, "loss": 0.2248, "num_input_tokens_seen": 17062208, "step": 19700 }, { "epoch": 9.290429042904291, "grad_norm": 0.0013661581324413419, "learning_rate": 0.15348685374178628, "loss": 0.2365, "num_input_tokens_seen": 17067200, "step": 19705 }, { "epoch": 9.292786421499294, "grad_norm": 0.0007819689344614744, "learning_rate": 0.1534279645293171, "loss": 0.2245, "num_input_tokens_seen": 17071200, "step": 19710 }, { "epoch": 9.295143800094294, "grad_norm": 0.0009435624233447015, "learning_rate": 0.1533690747882127, "loss": 0.2283, "num_input_tokens_seen": 17075408, "step": 19715 }, { "epoch": 9.297501178689297, "grad_norm": 0.0007729819626547396, "learning_rate": 0.15331018452755465, "loss": 0.2309, "num_input_tokens_seen": 17079328, "step": 19720 }, { "epoch": 9.299858557284299, "grad_norm": 0.0006764448480680585, "learning_rate": 0.15325129375642457, "loss": 0.2177, "num_input_tokens_seen": 17083808, "step": 19725 }, { "epoch": 9.302215935879302, "grad_norm": 0.004101473838090897, "learning_rate": 0.15319240248390406, "loss": 0.254, "num_input_tokens_seen": 17088944, "step": 19730 }, { "epoch": 9.304573314474304, "grad_norm": 0.002013708231970668, "learning_rate": 0.153133510719075, "loss": 0.2328, "num_input_tokens_seen": 17093200, "step": 19735 }, { "epoch": 9.306930693069306, "grad_norm": 0.00077823520405218, "learning_rate": 0.15307461847101922, "loss": 0.2117, "num_input_tokens_seen": 17097712, "step": 19740 }, { "epoch": 9.309288071664309, "grad_norm": 0.0007947483099997044, "learning_rate": 0.15301572574881864, "loss": 0.2377, "num_input_tokens_seen": 17102736, "step": 19745 }, { "epoch": 9.311645450259311, "grad_norm": 0.0006836146931163967, "learning_rate": 0.15295683256155523, "loss": 0.234, "num_input_tokens_seen": 17106240, "step": 19750 }, { "epoch": 9.314002828854314, "grad_norm": 0.0017669089138507843, "learning_rate": 0.15289793891831113, "loss": 0.2445, "num_input_tokens_seen": 17110816, "step": 19755 }, { "epoch": 9.316360207449316, "grad_norm": 0.0008257796871475875, "learning_rate": 0.15283904482816837, "loss": 0.2449, "num_input_tokens_seen": 17114848, "step": 19760 }, { "epoch": 9.318717586044318, "grad_norm": 0.0006342238048091531, "learning_rate": 0.15278015030020928, "loss": 0.2192, "num_input_tokens_seen": 17119488, "step": 19765 }, { "epoch": 9.32107496463932, "grad_norm": 0.000719713163562119, "learning_rate": 0.152721255343516, "loss": 0.2454, "num_input_tokens_seen": 17123888, "step": 19770 }, { "epoch": 9.323432343234323, "grad_norm": 0.001504553947597742, "learning_rate": 0.15266235996717098, "loss": 0.2145, "num_input_tokens_seen": 17128144, "step": 19775 }, { "epoch": 9.325789721829326, "grad_norm": 0.0007254997617565095, "learning_rate": 0.15260346418025664, "loss": 0.2266, "num_input_tokens_seen": 17131776, "step": 19780 }, { "epoch": 9.328147100424328, "grad_norm": 0.0007850494002923369, "learning_rate": 0.15254456799185537, "loss": 0.2145, "num_input_tokens_seen": 17135792, "step": 19785 }, { "epoch": 9.33050447901933, "grad_norm": 0.0006241676746867597, "learning_rate": 0.15248567141104974, "loss": 0.1923, "num_input_tokens_seen": 17140272, "step": 19790 }, { "epoch": 9.332861857614333, "grad_norm": 0.0005241729086264968, "learning_rate": 0.15242677444692232, "loss": 0.1863, "num_input_tokens_seen": 17145216, "step": 19795 }, { "epoch": 9.335219236209335, "grad_norm": 0.0010514442110434175, "learning_rate": 0.15236787710855584, "loss": 0.2317, "num_input_tokens_seen": 17150160, "step": 19800 }, { "epoch": 9.335219236209335, "eval_loss": 0.2187410593032837, "eval_runtime": 21.9395, "eval_samples_per_second": 42.982, "eval_steps_per_second": 21.514, "num_input_tokens_seen": 17150160, "step": 19800 }, { "epoch": 9.337576614804338, "grad_norm": 0.00046312736230902374, "learning_rate": 0.1523089794050329, "loss": 0.1852, "num_input_tokens_seen": 17154256, "step": 19805 }, { "epoch": 9.33993399339934, "grad_norm": 0.0006244800169952214, "learning_rate": 0.15225008134543633, "loss": 0.2253, "num_input_tokens_seen": 17157808, "step": 19810 }, { "epoch": 9.342291371994342, "grad_norm": 0.0006691525923088193, "learning_rate": 0.15219118293884895, "loss": 0.2275, "num_input_tokens_seen": 17162048, "step": 19815 }, { "epoch": 9.344648750589345, "grad_norm": 0.0008228733786381781, "learning_rate": 0.15213228419435362, "loss": 0.21, "num_input_tokens_seen": 17166272, "step": 19820 }, { "epoch": 9.347006129184347, "grad_norm": 0.0007401698967441916, "learning_rate": 0.15207338512103327, "loss": 0.1788, "num_input_tokens_seen": 17170272, "step": 19825 }, { "epoch": 9.34936350777935, "grad_norm": 0.001659247325733304, "learning_rate": 0.1520144857279709, "loss": 0.2671, "num_input_tokens_seen": 17174800, "step": 19830 }, { "epoch": 9.351720886374352, "grad_norm": 0.0008296079467982054, "learning_rate": 0.1519555860242495, "loss": 0.2028, "num_input_tokens_seen": 17178688, "step": 19835 }, { "epoch": 9.354078264969354, "grad_norm": 0.0007022588397376239, "learning_rate": 0.15189668601895218, "loss": 0.2364, "num_input_tokens_seen": 17182720, "step": 19840 }, { "epoch": 9.356435643564357, "grad_norm": 0.0005818885983899236, "learning_rate": 0.151837785721162, "loss": 0.2196, "num_input_tokens_seen": 17186480, "step": 19845 }, { "epoch": 9.35879302215936, "grad_norm": 0.0005630734376609325, "learning_rate": 0.15177888513996218, "loss": 0.245, "num_input_tokens_seen": 17190896, "step": 19850 }, { "epoch": 9.361150400754362, "grad_norm": 0.0005624854238703847, "learning_rate": 0.15171998428443592, "loss": 0.2181, "num_input_tokens_seen": 17195200, "step": 19855 }, { "epoch": 9.363507779349364, "grad_norm": 0.0006372167845256627, "learning_rate": 0.1516610831636665, "loss": 0.2334, "num_input_tokens_seen": 17198880, "step": 19860 }, { "epoch": 9.365865157944366, "grad_norm": 0.000984302954748273, "learning_rate": 0.15160218178673715, "loss": 0.228, "num_input_tokens_seen": 17203632, "step": 19865 }, { "epoch": 9.368222536539369, "grad_norm": 0.0017497262451797724, "learning_rate": 0.15154328016273122, "loss": 0.2379, "num_input_tokens_seen": 17208496, "step": 19870 }, { "epoch": 9.370579915134371, "grad_norm": 0.0014233578694984317, "learning_rate": 0.1514843783007321, "loss": 0.2394, "num_input_tokens_seen": 17212576, "step": 19875 }, { "epoch": 9.372937293729374, "grad_norm": 0.0006300628301687539, "learning_rate": 0.15142547620982322, "loss": 0.2096, "num_input_tokens_seen": 17216288, "step": 19880 }, { "epoch": 9.375294672324376, "grad_norm": 0.0019232876366004348, "learning_rate": 0.15136657389908797, "loss": 0.2546, "num_input_tokens_seen": 17220560, "step": 19885 }, { "epoch": 9.377652050919378, "grad_norm": 0.0006782234995625913, "learning_rate": 0.15130767137760986, "loss": 0.2421, "num_input_tokens_seen": 17224368, "step": 19890 }, { "epoch": 9.38000942951438, "grad_norm": 0.0014399238862097263, "learning_rate": 0.15124876865447243, "loss": 0.23, "num_input_tokens_seen": 17228464, "step": 19895 }, { "epoch": 9.382366808109383, "grad_norm": 0.0007698136032558978, "learning_rate": 0.15118986573875912, "loss": 0.197, "num_input_tokens_seen": 17233552, "step": 19900 }, { "epoch": 9.384724186704386, "grad_norm": 0.0006928215734660625, "learning_rate": 0.15113096263955358, "loss": 0.2636, "num_input_tokens_seen": 17237536, "step": 19905 }, { "epoch": 9.387081565299386, "grad_norm": 0.0007053091540001333, "learning_rate": 0.1510720593659394, "loss": 0.2299, "num_input_tokens_seen": 17242048, "step": 19910 }, { "epoch": 9.389438943894389, "grad_norm": 0.0008625455666333437, "learning_rate": 0.15101315592700015, "loss": 0.2183, "num_input_tokens_seen": 17246000, "step": 19915 }, { "epoch": 9.391796322489391, "grad_norm": 0.001172647695057094, "learning_rate": 0.15095425233181956, "loss": 0.2186, "num_input_tokens_seen": 17250096, "step": 19920 }, { "epoch": 9.394153701084393, "grad_norm": 0.0006847079494036734, "learning_rate": 0.15089534858948128, "loss": 0.2301, "num_input_tokens_seen": 17255152, "step": 19925 }, { "epoch": 9.396511079679396, "grad_norm": 0.0006967468070797622, "learning_rate": 0.15083644470906898, "loss": 0.2259, "num_input_tokens_seen": 17260128, "step": 19930 }, { "epoch": 9.398868458274398, "grad_norm": 0.0014693060657009482, "learning_rate": 0.1507775406996664, "loss": 0.2126, "num_input_tokens_seen": 17264064, "step": 19935 }, { "epoch": 9.4012258368694, "grad_norm": 0.0007725813193246722, "learning_rate": 0.15071863657035725, "loss": 0.2332, "num_input_tokens_seen": 17268896, "step": 19940 }, { "epoch": 9.403583215464403, "grad_norm": 0.0006865648902021348, "learning_rate": 0.15065973233022534, "loss": 0.2045, "num_input_tokens_seen": 17273504, "step": 19945 }, { "epoch": 9.405940594059405, "grad_norm": 0.0013231628108769655, "learning_rate": 0.15060082798835442, "loss": 0.2179, "num_input_tokens_seen": 17278608, "step": 19950 }, { "epoch": 9.408297972654408, "grad_norm": 0.0006564148352481425, "learning_rate": 0.15054192355382823, "loss": 0.2085, "num_input_tokens_seen": 17282800, "step": 19955 }, { "epoch": 9.41065535124941, "grad_norm": 0.0015824259025976062, "learning_rate": 0.15048301903573066, "loss": 0.2227, "num_input_tokens_seen": 17286608, "step": 19960 }, { "epoch": 9.413012729844413, "grad_norm": 0.0009594332077540457, "learning_rate": 0.15042411444314546, "loss": 0.2341, "num_input_tokens_seen": 17290912, "step": 19965 }, { "epoch": 9.415370108439415, "grad_norm": 0.0011147784534841776, "learning_rate": 0.1503652097851565, "loss": 0.1642, "num_input_tokens_seen": 17294976, "step": 19970 }, { "epoch": 9.417727487034417, "grad_norm": 0.0006778439856134355, "learning_rate": 0.15030630507084758, "loss": 0.1856, "num_input_tokens_seen": 17299744, "step": 19975 }, { "epoch": 9.42008486562942, "grad_norm": 0.0009395316010341048, "learning_rate": 0.1502474003093026, "loss": 0.2392, "num_input_tokens_seen": 17303296, "step": 19980 }, { "epoch": 9.422442244224422, "grad_norm": 0.000885099871084094, "learning_rate": 0.15018849550960536, "loss": 0.1811, "num_input_tokens_seen": 17307472, "step": 19985 }, { "epoch": 9.424799622819425, "grad_norm": 0.0012235849862918258, "learning_rate": 0.15012959068083975, "loss": 0.2407, "num_input_tokens_seen": 17312656, "step": 19990 }, { "epoch": 9.427157001414427, "grad_norm": 0.0007197518134489655, "learning_rate": 0.1500706858320896, "loss": 0.1996, "num_input_tokens_seen": 17316400, "step": 19995 }, { "epoch": 9.42951438000943, "grad_norm": 0.0009691521408967674, "learning_rate": 0.15001178097243886, "loss": 0.2482, "num_input_tokens_seen": 17321280, "step": 20000 }, { "epoch": 9.42951438000943, "eval_loss": 0.21536685526371002, "eval_runtime": 21.909, "eval_samples_per_second": 43.042, "eval_steps_per_second": 21.544, "num_input_tokens_seen": 17321280, "step": 20000 }, { "epoch": 9.431871758604432, "grad_norm": 0.0015848366310819983, "learning_rate": 0.1499528761109713, "loss": 0.2042, "num_input_tokens_seen": 17325056, "step": 20005 }, { "epoch": 9.434229137199434, "grad_norm": 0.0013230458134785295, "learning_rate": 0.14989397125677087, "loss": 0.1813, "num_input_tokens_seen": 17329088, "step": 20010 }, { "epoch": 9.436586515794437, "grad_norm": 0.0006012945086695254, "learning_rate": 0.14983506641892141, "loss": 0.2424, "num_input_tokens_seen": 17333536, "step": 20015 }, { "epoch": 9.438943894389439, "grad_norm": 0.0008526949095539749, "learning_rate": 0.14977616160650672, "loss": 0.2236, "num_input_tokens_seen": 17338048, "step": 20020 }, { "epoch": 9.441301272984441, "grad_norm": 0.0006431685760617256, "learning_rate": 0.14971725682861076, "loss": 0.2106, "num_input_tokens_seen": 17341888, "step": 20025 }, { "epoch": 9.443658651579444, "grad_norm": 0.0013497823383659124, "learning_rate": 0.14965835209431738, "loss": 0.22, "num_input_tokens_seen": 17346544, "step": 20030 }, { "epoch": 9.446016030174446, "grad_norm": 0.0009231448639184237, "learning_rate": 0.14959944741271036, "loss": 0.1884, "num_input_tokens_seen": 17351072, "step": 20035 }, { "epoch": 9.448373408769449, "grad_norm": 0.0010897951433435082, "learning_rate": 0.14954054279287363, "loss": 0.2162, "num_input_tokens_seen": 17354960, "step": 20040 }, { "epoch": 9.450730787364451, "grad_norm": 0.000543992267921567, "learning_rate": 0.14948163824389094, "loss": 0.183, "num_input_tokens_seen": 17359376, "step": 20045 }, { "epoch": 9.453088165959453, "grad_norm": 0.0009608584805391729, "learning_rate": 0.14942273377484613, "loss": 0.2006, "num_input_tokens_seen": 17364336, "step": 20050 }, { "epoch": 9.455445544554456, "grad_norm": 0.0006766465958207846, "learning_rate": 0.1493638293948231, "loss": 0.1832, "num_input_tokens_seen": 17368992, "step": 20055 }, { "epoch": 9.457802923149458, "grad_norm": 0.0009858724661171436, "learning_rate": 0.14930492511290547, "loss": 0.2258, "num_input_tokens_seen": 17373712, "step": 20060 }, { "epoch": 9.46016030174446, "grad_norm": 0.0005642150645144284, "learning_rate": 0.14924602093817715, "loss": 0.2628, "num_input_tokens_seen": 17378304, "step": 20065 }, { "epoch": 9.462517680339463, "grad_norm": 0.0012104737106710672, "learning_rate": 0.14918711687972194, "loss": 0.1878, "num_input_tokens_seen": 17382928, "step": 20070 }, { "epoch": 9.464875058934465, "grad_norm": 0.000554158235900104, "learning_rate": 0.14912821294662346, "loss": 0.1932, "num_input_tokens_seen": 17387024, "step": 20075 }, { "epoch": 9.467232437529468, "grad_norm": 0.0010312390513718128, "learning_rate": 0.14906930914796554, "loss": 0.232, "num_input_tokens_seen": 17391056, "step": 20080 }, { "epoch": 9.46958981612447, "grad_norm": 0.0011156884720548987, "learning_rate": 0.14901040549283182, "loss": 0.1705, "num_input_tokens_seen": 17394944, "step": 20085 }, { "epoch": 9.471947194719473, "grad_norm": 0.0004572556645143777, "learning_rate": 0.148951501990306, "loss": 0.2236, "num_input_tokens_seen": 17398608, "step": 20090 }, { "epoch": 9.474304573314475, "grad_norm": 0.0005564548191614449, "learning_rate": 0.14889259864947177, "loss": 0.2522, "num_input_tokens_seen": 17402736, "step": 20095 }, { "epoch": 9.476661951909477, "grad_norm": 0.0013159830123186111, "learning_rate": 0.14883369547941272, "loss": 0.2486, "num_input_tokens_seen": 17406784, "step": 20100 }, { "epoch": 9.47901933050448, "grad_norm": 0.0006625318783335388, "learning_rate": 0.14877479248921247, "loss": 0.1941, "num_input_tokens_seen": 17411296, "step": 20105 }, { "epoch": 9.481376709099482, "grad_norm": 0.0005537057877518237, "learning_rate": 0.14871588968795468, "loss": 0.2208, "num_input_tokens_seen": 17415616, "step": 20110 }, { "epoch": 9.483734087694483, "grad_norm": 0.002039598533883691, "learning_rate": 0.1486569870847228, "loss": 0.249, "num_input_tokens_seen": 17420560, "step": 20115 }, { "epoch": 9.486091466289485, "grad_norm": 0.0006275964551605284, "learning_rate": 0.1485980846886004, "loss": 0.2037, "num_input_tokens_seen": 17424816, "step": 20120 }, { "epoch": 9.488448844884488, "grad_norm": 0.0006537820445373654, "learning_rate": 0.14853918250867096, "loss": 0.1896, "num_input_tokens_seen": 17429344, "step": 20125 }, { "epoch": 9.49080622347949, "grad_norm": 0.0008272115956060588, "learning_rate": 0.1484802805540179, "loss": 0.2213, "num_input_tokens_seen": 17434160, "step": 20130 }, { "epoch": 9.493163602074493, "grad_norm": 0.0013498219195753336, "learning_rate": 0.14842137883372472, "loss": 0.2823, "num_input_tokens_seen": 17438656, "step": 20135 }, { "epoch": 9.495520980669495, "grad_norm": 0.0005481025436893106, "learning_rate": 0.14836247735687474, "loss": 0.2403, "num_input_tokens_seen": 17443888, "step": 20140 }, { "epoch": 9.497878359264497, "grad_norm": 0.000894532073289156, "learning_rate": 0.14830357613255132, "loss": 0.2199, "num_input_tokens_seen": 17448336, "step": 20145 }, { "epoch": 9.5002357378595, "grad_norm": 0.0008404235704801977, "learning_rate": 0.1482446751698378, "loss": 0.1944, "num_input_tokens_seen": 17452928, "step": 20150 }, { "epoch": 9.502593116454502, "grad_norm": 0.0005628942744806409, "learning_rate": 0.14818577447781744, "loss": 0.2252, "num_input_tokens_seen": 17456848, "step": 20155 }, { "epoch": 9.504950495049505, "grad_norm": 0.0015349660534411669, "learning_rate": 0.14812687406557346, "loss": 0.2287, "num_input_tokens_seen": 17460656, "step": 20160 }, { "epoch": 9.507307873644507, "grad_norm": 0.0012589588295668364, "learning_rate": 0.14806797394218899, "loss": 0.2047, "num_input_tokens_seen": 17464528, "step": 20165 }, { "epoch": 9.50966525223951, "grad_norm": 0.0008098451653495431, "learning_rate": 0.1480090741167472, "loss": 0.2446, "num_input_tokens_seen": 17468544, "step": 20170 }, { "epoch": 9.512022630834512, "grad_norm": 0.0006556081352755427, "learning_rate": 0.1479501745983313, "loss": 0.208, "num_input_tokens_seen": 17472576, "step": 20175 }, { "epoch": 9.514380009429514, "grad_norm": 0.0006366188172250986, "learning_rate": 0.14789127539602415, "loss": 0.2349, "num_input_tokens_seen": 17476688, "step": 20180 }, { "epoch": 9.516737388024517, "grad_norm": 0.0008537110988982022, "learning_rate": 0.14783237651890885, "loss": 0.2177, "num_input_tokens_seen": 17481872, "step": 20185 }, { "epoch": 9.519094766619519, "grad_norm": 0.0016984982648864388, "learning_rate": 0.14777347797606838, "loss": 0.2476, "num_input_tokens_seen": 17486224, "step": 20190 }, { "epoch": 9.521452145214521, "grad_norm": 0.0008916830993257463, "learning_rate": 0.14771457977658553, "loss": 0.2133, "num_input_tokens_seen": 17491120, "step": 20195 }, { "epoch": 9.523809523809524, "grad_norm": 0.0019968922715634108, "learning_rate": 0.14765568192954326, "loss": 0.2261, "num_input_tokens_seen": 17495488, "step": 20200 }, { "epoch": 9.523809523809524, "eval_loss": 0.21515996754169464, "eval_runtime": 21.9539, "eval_samples_per_second": 42.954, "eval_steps_per_second": 21.5, "num_input_tokens_seen": 17495488, "step": 20200 }, { "epoch": 9.526166902404526, "grad_norm": 0.0008885944844223559, "learning_rate": 0.14759678444402421, "loss": 0.2301, "num_input_tokens_seen": 17499936, "step": 20205 }, { "epoch": 9.528524280999529, "grad_norm": 0.002289389492943883, "learning_rate": 0.14753788732911122, "loss": 0.2337, "num_input_tokens_seen": 17504512, "step": 20210 }, { "epoch": 9.530881659594531, "grad_norm": 0.0009423093288205564, "learning_rate": 0.147478990593887, "loss": 0.2233, "num_input_tokens_seen": 17508800, "step": 20215 }, { "epoch": 9.533239038189533, "grad_norm": 0.0011014282936230302, "learning_rate": 0.14742009424743405, "loss": 0.2212, "num_input_tokens_seen": 17512608, "step": 20220 }, { "epoch": 9.535596416784536, "grad_norm": 0.0014416513731703162, "learning_rate": 0.14736119829883504, "loss": 0.192, "num_input_tokens_seen": 17517376, "step": 20225 }, { "epoch": 9.537953795379538, "grad_norm": 0.0018658344633877277, "learning_rate": 0.14730230275717243, "loss": 0.2104, "num_input_tokens_seen": 17521360, "step": 20230 }, { "epoch": 9.54031117397454, "grad_norm": 0.000669886649120599, "learning_rate": 0.14724340763152854, "loss": 0.2502, "num_input_tokens_seen": 17525440, "step": 20235 }, { "epoch": 9.542668552569543, "grad_norm": 0.0015540856402367353, "learning_rate": 0.14718451293098594, "loss": 0.2608, "num_input_tokens_seen": 17530208, "step": 20240 }, { "epoch": 9.545025931164545, "grad_norm": 0.0008324977243319154, "learning_rate": 0.14712561866462676, "loss": 0.2026, "num_input_tokens_seen": 17534224, "step": 20245 }, { "epoch": 9.547383309759548, "grad_norm": 0.001377228181809187, "learning_rate": 0.1470667248415333, "loss": 0.223, "num_input_tokens_seen": 17538768, "step": 20250 }, { "epoch": 9.54974068835455, "grad_norm": 0.0006202780641615391, "learning_rate": 0.1470078314707878, "loss": 0.1932, "num_input_tokens_seen": 17543088, "step": 20255 }, { "epoch": 9.552098066949553, "grad_norm": 0.0014777590986341238, "learning_rate": 0.14694893856147223, "loss": 0.2044, "num_input_tokens_seen": 17547984, "step": 20260 }, { "epoch": 9.554455445544555, "grad_norm": 0.0009235324687324464, "learning_rate": 0.14689004612266868, "loss": 0.2171, "num_input_tokens_seen": 17551808, "step": 20265 }, { "epoch": 9.556812824139557, "grad_norm": 0.0007294192910194397, "learning_rate": 0.14683115416345913, "loss": 0.2175, "num_input_tokens_seen": 17555456, "step": 20270 }, { "epoch": 9.55917020273456, "grad_norm": 0.0005667241639457643, "learning_rate": 0.1467722626929254, "loss": 0.1927, "num_input_tokens_seen": 17559840, "step": 20275 }, { "epoch": 9.561527581329562, "grad_norm": 0.000680093711707741, "learning_rate": 0.14671337172014937, "loss": 0.2132, "num_input_tokens_seen": 17564016, "step": 20280 }, { "epoch": 9.563884959924565, "grad_norm": 0.0006578586180694401, "learning_rate": 0.14665448125421265, "loss": 0.2001, "num_input_tokens_seen": 17568320, "step": 20285 }, { "epoch": 9.566242338519567, "grad_norm": 0.0010942034423351288, "learning_rate": 0.146595591304197, "loss": 0.2135, "num_input_tokens_seen": 17572544, "step": 20290 }, { "epoch": 9.56859971711457, "grad_norm": 0.0013192775659263134, "learning_rate": 0.14653670187918397, "loss": 0.2201, "num_input_tokens_seen": 17576704, "step": 20295 }, { "epoch": 9.570957095709572, "grad_norm": 0.0010448156390339136, "learning_rate": 0.14647781298825502, "loss": 0.2155, "num_input_tokens_seen": 17581056, "step": 20300 }, { "epoch": 9.573314474304574, "grad_norm": 0.0011846715351566672, "learning_rate": 0.14641892464049153, "loss": 0.2374, "num_input_tokens_seen": 17585168, "step": 20305 }, { "epoch": 9.575671852899575, "grad_norm": 0.0007550598820671439, "learning_rate": 0.14636003684497495, "loss": 0.2525, "num_input_tokens_seen": 17589472, "step": 20310 }, { "epoch": 9.578029231494579, "grad_norm": 0.0009879018180072308, "learning_rate": 0.14630114961078636, "loss": 0.2191, "num_input_tokens_seen": 17593456, "step": 20315 }, { "epoch": 9.58038661008958, "grad_norm": 0.0016299979761242867, "learning_rate": 0.14624226294700704, "loss": 0.2025, "num_input_tokens_seen": 17598576, "step": 20320 }, { "epoch": 9.582743988684582, "grad_norm": 0.0014353878796100616, "learning_rate": 0.14618337686271793, "loss": 0.1863, "num_input_tokens_seen": 17602704, "step": 20325 }, { "epoch": 9.585101367279584, "grad_norm": 0.0018011172069236636, "learning_rate": 0.1461244913670001, "loss": 0.2806, "num_input_tokens_seen": 17607328, "step": 20330 }, { "epoch": 9.587458745874587, "grad_norm": 0.0008793168235570192, "learning_rate": 0.1460656064689344, "loss": 0.2207, "num_input_tokens_seen": 17611696, "step": 20335 }, { "epoch": 9.58981612446959, "grad_norm": 0.0010298278648406267, "learning_rate": 0.14600672217760163, "loss": 0.2078, "num_input_tokens_seen": 17616928, "step": 20340 }, { "epoch": 9.592173503064592, "grad_norm": 0.0009212171426042914, "learning_rate": 0.14594783850208248, "loss": 0.2157, "num_input_tokens_seen": 17621360, "step": 20345 }, { "epoch": 9.594530881659594, "grad_norm": 0.0017712191911414266, "learning_rate": 0.14588895545145758, "loss": 0.2021, "num_input_tokens_seen": 17625952, "step": 20350 }, { "epoch": 9.596888260254596, "grad_norm": 0.0013275428209453821, "learning_rate": 0.14583007303480738, "loss": 0.2012, "num_input_tokens_seen": 17630480, "step": 20355 }, { "epoch": 9.599245638849599, "grad_norm": 0.0010387555230408907, "learning_rate": 0.14577119126121235, "loss": 0.2301, "num_input_tokens_seen": 17634944, "step": 20360 }, { "epoch": 9.601603017444601, "grad_norm": 0.0009452144149690866, "learning_rate": 0.14571231013975272, "loss": 0.2229, "num_input_tokens_seen": 17638992, "step": 20365 }, { "epoch": 9.603960396039604, "grad_norm": 0.0016455514123663306, "learning_rate": 0.1456534296795088, "loss": 0.2078, "num_input_tokens_seen": 17643328, "step": 20370 }, { "epoch": 9.606317774634606, "grad_norm": 0.0008877373184077442, "learning_rate": 0.14559454988956066, "loss": 0.2094, "num_input_tokens_seen": 17648112, "step": 20375 }, { "epoch": 9.608675153229608, "grad_norm": 0.0016427877126261592, "learning_rate": 0.1455356707789882, "loss": 0.2501, "num_input_tokens_seen": 17653600, "step": 20380 }, { "epoch": 9.61103253182461, "grad_norm": 0.0008372567826882005, "learning_rate": 0.14547679235687147, "loss": 0.1835, "num_input_tokens_seen": 17658096, "step": 20385 }, { "epoch": 9.613389910419613, "grad_norm": 0.001427405164577067, "learning_rate": 0.14541791463229023, "loss": 0.2384, "num_input_tokens_seen": 17661776, "step": 20390 }, { "epoch": 9.615747289014616, "grad_norm": 0.0012673120945692062, "learning_rate": 0.14535903761432406, "loss": 0.1968, "num_input_tokens_seen": 17665664, "step": 20395 }, { "epoch": 9.618104667609618, "grad_norm": 0.001242350204847753, "learning_rate": 0.1453001613120527, "loss": 0.2319, "num_input_tokens_seen": 17670576, "step": 20400 }, { "epoch": 9.618104667609618, "eval_loss": 0.2168968915939331, "eval_runtime": 21.9028, "eval_samples_per_second": 43.054, "eval_steps_per_second": 21.55, "num_input_tokens_seen": 17670576, "step": 20400 }, { "epoch": 9.62046204620462, "grad_norm": 0.0018380923429504037, "learning_rate": 0.14524128573455547, "loss": 0.2208, "num_input_tokens_seen": 17674976, "step": 20405 }, { "epoch": 9.622819424799623, "grad_norm": 0.0005885528516955674, "learning_rate": 0.14518241089091177, "loss": 0.2693, "num_input_tokens_seen": 17679840, "step": 20410 }, { "epoch": 9.625176803394625, "grad_norm": 0.000979065545834601, "learning_rate": 0.1451235367902009, "loss": 0.216, "num_input_tokens_seen": 17684336, "step": 20415 }, { "epoch": 9.627534181989628, "grad_norm": 0.0008213087567128241, "learning_rate": 0.1450646634415019, "loss": 0.2207, "num_input_tokens_seen": 17688432, "step": 20420 }, { "epoch": 9.62989156058463, "grad_norm": 0.0007264121668413281, "learning_rate": 0.1450057908538938, "loss": 0.2226, "num_input_tokens_seen": 17692896, "step": 20425 }, { "epoch": 9.632248939179632, "grad_norm": 0.001003630575723946, "learning_rate": 0.14494691903645557, "loss": 0.2221, "num_input_tokens_seen": 17696560, "step": 20430 }, { "epoch": 9.634606317774635, "grad_norm": 0.0007365955971181393, "learning_rate": 0.14488804799826588, "loss": 0.2095, "num_input_tokens_seen": 17700432, "step": 20435 }, { "epoch": 9.636963696369637, "grad_norm": 0.001128949224948883, "learning_rate": 0.14482917774840348, "loss": 0.2217, "num_input_tokens_seen": 17704800, "step": 20440 }, { "epoch": 9.63932107496464, "grad_norm": 0.001799715799279511, "learning_rate": 0.14477030829594684, "loss": 0.2214, "num_input_tokens_seen": 17708368, "step": 20445 }, { "epoch": 9.641678453559642, "grad_norm": 0.0006580776534974575, "learning_rate": 0.14471143964997432, "loss": 0.2032, "num_input_tokens_seen": 17712976, "step": 20450 }, { "epoch": 9.644035832154644, "grad_norm": 0.0008678776794113219, "learning_rate": 0.14465257181956434, "loss": 0.2067, "num_input_tokens_seen": 17716992, "step": 20455 }, { "epoch": 9.646393210749647, "grad_norm": 0.0012524538906291127, "learning_rate": 0.1445937048137949, "loss": 0.2276, "num_input_tokens_seen": 17721200, "step": 20460 }, { "epoch": 9.64875058934465, "grad_norm": 0.0017552762292325497, "learning_rate": 0.14453483864174416, "loss": 0.2188, "num_input_tokens_seen": 17724432, "step": 20465 }, { "epoch": 9.651107967939652, "grad_norm": 0.000949060486163944, "learning_rate": 0.14447597331249, "loss": 0.2078, "num_input_tokens_seen": 17728928, "step": 20470 }, { "epoch": 9.653465346534654, "grad_norm": 0.0008475234499201179, "learning_rate": 0.1444171088351102, "loss": 0.2316, "num_input_tokens_seen": 17732768, "step": 20475 }, { "epoch": 9.655822725129656, "grad_norm": 0.0011733185965567827, "learning_rate": 0.14435824521868235, "loss": 0.2249, "num_input_tokens_seen": 17737584, "step": 20480 }, { "epoch": 9.658180103724659, "grad_norm": 0.0015648695407435298, "learning_rate": 0.14429938247228397, "loss": 0.1942, "num_input_tokens_seen": 17742608, "step": 20485 }, { "epoch": 9.660537482319661, "grad_norm": 0.0010379502782598138, "learning_rate": 0.14424052060499243, "loss": 0.2574, "num_input_tokens_seen": 17747696, "step": 20490 }, { "epoch": 9.662894860914664, "grad_norm": 0.0015979685122147202, "learning_rate": 0.14418165962588506, "loss": 0.2239, "num_input_tokens_seen": 17752144, "step": 20495 }, { "epoch": 9.665252239509666, "grad_norm": 0.001337651745416224, "learning_rate": 0.1441227995440388, "loss": 0.1875, "num_input_tokens_seen": 17756320, "step": 20500 }, { "epoch": 9.667609618104667, "grad_norm": 0.0015231596771627665, "learning_rate": 0.14406394036853082, "loss": 0.2081, "num_input_tokens_seen": 17760304, "step": 20505 }, { "epoch": 9.66996699669967, "grad_norm": 0.0007395526627078652, "learning_rate": 0.14400508210843774, "loss": 0.1752, "num_input_tokens_seen": 17765152, "step": 20510 }, { "epoch": 9.672324375294671, "grad_norm": 0.000899964477866888, "learning_rate": 0.1439462247728364, "loss": 0.2538, "num_input_tokens_seen": 17769280, "step": 20515 }, { "epoch": 9.674681753889674, "grad_norm": 0.0006994422874413431, "learning_rate": 0.14388736837080326, "loss": 0.2329, "num_input_tokens_seen": 17774432, "step": 20520 }, { "epoch": 9.677039132484676, "grad_norm": 0.0015399340773001313, "learning_rate": 0.14382851291141469, "loss": 0.2315, "num_input_tokens_seen": 17778400, "step": 20525 }, { "epoch": 9.679396511079679, "grad_norm": 0.0029570970218628645, "learning_rate": 0.14376965840374697, "loss": 0.2202, "num_input_tokens_seen": 17782720, "step": 20530 }, { "epoch": 9.681753889674681, "grad_norm": 0.0016727304318919778, "learning_rate": 0.14371080485687632, "loss": 0.2367, "num_input_tokens_seen": 17786832, "step": 20535 }, { "epoch": 9.684111268269683, "grad_norm": 0.0007829573005437851, "learning_rate": 0.1436519522798785, "loss": 0.22, "num_input_tokens_seen": 17790912, "step": 20540 }, { "epoch": 9.686468646864686, "grad_norm": 0.0008653285913169384, "learning_rate": 0.14359310068182948, "loss": 0.225, "num_input_tokens_seen": 17795424, "step": 20545 }, { "epoch": 9.688826025459688, "grad_norm": 0.0015246745897457004, "learning_rate": 0.14353425007180484, "loss": 0.1786, "num_input_tokens_seen": 17799744, "step": 20550 }, { "epoch": 9.69118340405469, "grad_norm": 0.0006649324204772711, "learning_rate": 0.14347540045888005, "loss": 0.2196, "num_input_tokens_seen": 17803872, "step": 20555 }, { "epoch": 9.693540782649693, "grad_norm": 0.0006599709740839899, "learning_rate": 0.14341655185213056, "loss": 0.2309, "num_input_tokens_seen": 17808880, "step": 20560 }, { "epoch": 9.695898161244696, "grad_norm": 0.0017684876220300794, "learning_rate": 0.14335770426063144, "loss": 0.2448, "num_input_tokens_seen": 17813328, "step": 20565 }, { "epoch": 9.698255539839698, "grad_norm": 0.0007855689036659896, "learning_rate": 0.1432988576934578, "loss": 0.1782, "num_input_tokens_seen": 17817760, "step": 20570 }, { "epoch": 9.7006129184347, "grad_norm": 0.000948451750446111, "learning_rate": 0.14324001215968457, "loss": 0.177, "num_input_tokens_seen": 17822080, "step": 20575 }, { "epoch": 9.702970297029703, "grad_norm": 0.0006067260983400047, "learning_rate": 0.14318116766838637, "loss": 0.2121, "num_input_tokens_seen": 17825984, "step": 20580 }, { "epoch": 9.705327675624705, "grad_norm": 0.0026487938594073057, "learning_rate": 0.14312232422863788, "loss": 0.2503, "num_input_tokens_seen": 17830240, "step": 20585 }, { "epoch": 9.707685054219708, "grad_norm": 0.0006089310627430677, "learning_rate": 0.14306348184951334, "loss": 0.2143, "num_input_tokens_seen": 17834912, "step": 20590 }, { "epoch": 9.71004243281471, "grad_norm": 0.000669643864966929, "learning_rate": 0.1430046405400871, "loss": 0.2116, "num_input_tokens_seen": 17838576, "step": 20595 }, { "epoch": 9.712399811409712, "grad_norm": 0.001280271215364337, "learning_rate": 0.14294580030943324, "loss": 0.205, "num_input_tokens_seen": 17843440, "step": 20600 }, { "epoch": 9.712399811409712, "eval_loss": 0.215559184551239, "eval_runtime": 21.9587, "eval_samples_per_second": 42.944, "eval_steps_per_second": 21.495, "num_input_tokens_seen": 17843440, "step": 20600 }, { "epoch": 9.714757190004715, "grad_norm": 0.000786292483098805, "learning_rate": 0.14288696116662553, "loss": 0.2386, "num_input_tokens_seen": 17848352, "step": 20605 }, { "epoch": 9.717114568599717, "grad_norm": 0.0011515662772580981, "learning_rate": 0.1428281231207378, "loss": 0.2102, "num_input_tokens_seen": 17852848, "step": 20610 }, { "epoch": 9.71947194719472, "grad_norm": 0.0011113951914012432, "learning_rate": 0.1427692861808437, "loss": 0.2242, "num_input_tokens_seen": 17856944, "step": 20615 }, { "epoch": 9.721829325789722, "grad_norm": 0.0015225758543238044, "learning_rate": 0.1427104503560165, "loss": 0.2231, "num_input_tokens_seen": 17861616, "step": 20620 }, { "epoch": 9.724186704384724, "grad_norm": 0.0008812613668851554, "learning_rate": 0.14265161565532947, "loss": 0.2053, "num_input_tokens_seen": 17865472, "step": 20625 }, { "epoch": 9.726544082979727, "grad_norm": 0.0010320247383788228, "learning_rate": 0.14259278208785564, "loss": 0.2165, "num_input_tokens_seen": 17870048, "step": 20630 }, { "epoch": 9.72890146157473, "grad_norm": 0.0008569251513108611, "learning_rate": 0.14253394966266789, "loss": 0.2451, "num_input_tokens_seen": 17874512, "step": 20635 }, { "epoch": 9.731258840169732, "grad_norm": 0.0013477386673912406, "learning_rate": 0.14247511838883894, "loss": 0.199, "num_input_tokens_seen": 17879136, "step": 20640 }, { "epoch": 9.733616218764734, "grad_norm": 0.0008317263564094901, "learning_rate": 0.14241628827544126, "loss": 0.2421, "num_input_tokens_seen": 17882528, "step": 20645 }, { "epoch": 9.735973597359736, "grad_norm": 0.0007979859365150332, "learning_rate": 0.14235745933154723, "loss": 0.184, "num_input_tokens_seen": 17886176, "step": 20650 }, { "epoch": 9.738330975954739, "grad_norm": 0.0023083535488694906, "learning_rate": 0.14229863156622907, "loss": 0.2672, "num_input_tokens_seen": 17890720, "step": 20655 }, { "epoch": 9.740688354549741, "grad_norm": 0.0011859270744025707, "learning_rate": 0.14223980498855868, "loss": 0.2582, "num_input_tokens_seen": 17894688, "step": 20660 }, { "epoch": 9.743045733144744, "grad_norm": 0.001322311582043767, "learning_rate": 0.14218097960760792, "loss": 0.2282, "num_input_tokens_seen": 17898480, "step": 20665 }, { "epoch": 9.745403111739746, "grad_norm": 0.001723092282190919, "learning_rate": 0.1421221554324483, "loss": 0.2162, "num_input_tokens_seen": 17902384, "step": 20670 }, { "epoch": 9.747760490334748, "grad_norm": 0.0013819780433550477, "learning_rate": 0.1420633324721513, "loss": 0.2319, "num_input_tokens_seen": 17906288, "step": 20675 }, { "epoch": 9.75011786892975, "grad_norm": 0.0023326657246798277, "learning_rate": 0.14200451073578824, "loss": 0.2244, "num_input_tokens_seen": 17910800, "step": 20680 }, { "epoch": 9.752475247524753, "grad_norm": 0.002872682409361005, "learning_rate": 0.14194569023243003, "loss": 0.2134, "num_input_tokens_seen": 17915488, "step": 20685 }, { "epoch": 9.754832626119756, "grad_norm": 0.0032045929692685604, "learning_rate": 0.14188687097114766, "loss": 0.2345, "num_input_tokens_seen": 17919136, "step": 20690 }, { "epoch": 9.757190004714758, "grad_norm": 0.0019054177682846785, "learning_rate": 0.14182805296101172, "loss": 0.2294, "num_input_tokens_seen": 17922928, "step": 20695 }, { "epoch": 9.75954738330976, "grad_norm": 0.003585758153349161, "learning_rate": 0.14176923621109272, "loss": 0.2103, "num_input_tokens_seen": 17927984, "step": 20700 }, { "epoch": 9.761904761904763, "grad_norm": 0.00202233181335032, "learning_rate": 0.14171042073046097, "loss": 0.2179, "num_input_tokens_seen": 17932144, "step": 20705 }, { "epoch": 9.764262140499763, "grad_norm": 0.0027521983720362186, "learning_rate": 0.14165160652818642, "loss": 0.2382, "num_input_tokens_seen": 17936032, "step": 20710 }, { "epoch": 9.766619519094768, "grad_norm": 0.0029339382890611887, "learning_rate": 0.14159279361333907, "loss": 0.2268, "num_input_tokens_seen": 17940464, "step": 20715 }, { "epoch": 9.768976897689768, "grad_norm": 0.0013346857158467174, "learning_rate": 0.14153398199498868, "loss": 0.2217, "num_input_tokens_seen": 17944352, "step": 20720 }, { "epoch": 9.77133427628477, "grad_norm": 0.0030369276646524668, "learning_rate": 0.14147517168220458, "loss": 0.201, "num_input_tokens_seen": 17948528, "step": 20725 }, { "epoch": 9.773691654879773, "grad_norm": 0.002730417065322399, "learning_rate": 0.14141636268405616, "loss": 0.2044, "num_input_tokens_seen": 17952496, "step": 20730 }, { "epoch": 9.776049033474775, "grad_norm": 0.0032366071827709675, "learning_rate": 0.14135755500961253, "loss": 0.2912, "num_input_tokens_seen": 17956880, "step": 20735 }, { "epoch": 9.778406412069778, "grad_norm": 0.004075288772583008, "learning_rate": 0.14129874866794245, "loss": 0.2262, "num_input_tokens_seen": 17961392, "step": 20740 }, { "epoch": 9.78076379066478, "grad_norm": 0.0016800656449049711, "learning_rate": 0.14123994366811476, "loss": 0.2186, "num_input_tokens_seen": 17965568, "step": 20745 }, { "epoch": 9.783121169259783, "grad_norm": 0.0035225781612098217, "learning_rate": 0.14118114001919774, "loss": 0.2123, "num_input_tokens_seen": 17969648, "step": 20750 }, { "epoch": 9.785478547854785, "grad_norm": 0.0032214259263128042, "learning_rate": 0.14112233773025978, "loss": 0.1869, "num_input_tokens_seen": 17974256, "step": 20755 }, { "epoch": 9.787835926449787, "grad_norm": 0.003996030893176794, "learning_rate": 0.14106353681036896, "loss": 0.2589, "num_input_tokens_seen": 17978192, "step": 20760 }, { "epoch": 9.79019330504479, "grad_norm": 0.003911331295967102, "learning_rate": 0.14100473726859303, "loss": 0.2196, "num_input_tokens_seen": 17981904, "step": 20765 }, { "epoch": 9.792550683639792, "grad_norm": 0.010588770732283592, "learning_rate": 0.14094593911399964, "loss": 0.2317, "num_input_tokens_seen": 17986512, "step": 20770 }, { "epoch": 9.794908062234795, "grad_norm": 0.008972296491265297, "learning_rate": 0.14088714235565625, "loss": 0.2462, "num_input_tokens_seen": 17990464, "step": 20775 }, { "epoch": 9.797265440829797, "grad_norm": 0.009860249236226082, "learning_rate": 0.14082834700263, "loss": 0.2429, "num_input_tokens_seen": 17995040, "step": 20780 }, { "epoch": 9.7996228194248, "grad_norm": 0.00562833808362484, "learning_rate": 0.14076955306398795, "loss": 0.233, "num_input_tokens_seen": 17999104, "step": 20785 }, { "epoch": 9.801980198019802, "grad_norm": 0.004770326893776655, "learning_rate": 0.14071076054879675, "loss": 0.2429, "num_input_tokens_seen": 18003776, "step": 20790 }, { "epoch": 9.804337576614804, "grad_norm": 0.010428115725517273, "learning_rate": 0.14065196946612302, "loss": 0.2426, "num_input_tokens_seen": 18008064, "step": 20795 }, { "epoch": 9.806694955209807, "grad_norm": 0.0032120042014867067, "learning_rate": 0.1405931798250331, "loss": 0.2642, "num_input_tokens_seen": 18012496, "step": 20800 }, { "epoch": 9.806694955209807, "eval_loss": 0.2300536334514618, "eval_runtime": 21.8914, "eval_samples_per_second": 43.076, "eval_steps_per_second": 21.561, "num_input_tokens_seen": 18012496, "step": 20800 }, { "epoch": 9.809052333804809, "grad_norm": 0.0030970994848757982, "learning_rate": 0.14053439163459308, "loss": 0.2422, "num_input_tokens_seen": 18016016, "step": 20805 }, { "epoch": 9.811409712399811, "grad_norm": 0.005022671073675156, "learning_rate": 0.14047560490386876, "loss": 0.2307, "num_input_tokens_seen": 18019792, "step": 20810 }, { "epoch": 9.813767090994814, "grad_norm": 0.0026529813185334206, "learning_rate": 0.14041681964192593, "loss": 0.1726, "num_input_tokens_seen": 18023792, "step": 20815 }, { "epoch": 9.816124469589816, "grad_norm": 0.003863097634166479, "learning_rate": 0.14035803585782988, "loss": 0.3141, "num_input_tokens_seen": 18027696, "step": 20820 }, { "epoch": 9.818481848184819, "grad_norm": 0.0015404460718855262, "learning_rate": 0.14029925356064593, "loss": 0.2269, "num_input_tokens_seen": 18031760, "step": 20825 }, { "epoch": 9.820839226779821, "grad_norm": 0.003424615366384387, "learning_rate": 0.1402404727594389, "loss": 0.2129, "num_input_tokens_seen": 18036416, "step": 20830 }, { "epoch": 9.823196605374823, "grad_norm": 0.002035647165030241, "learning_rate": 0.1401816934632737, "loss": 0.2355, "num_input_tokens_seen": 18041248, "step": 20835 }, { "epoch": 9.825553983969826, "grad_norm": 0.002866423688828945, "learning_rate": 0.1401229156812147, "loss": 0.2393, "num_input_tokens_seen": 18045680, "step": 20840 }, { "epoch": 9.827911362564828, "grad_norm": 0.0016310347709804773, "learning_rate": 0.14006413942232626, "loss": 0.2337, "num_input_tokens_seen": 18049680, "step": 20845 }, { "epoch": 9.83026874115983, "grad_norm": 0.002456455025821924, "learning_rate": 0.14000536469567235, "loss": 0.2399, "num_input_tokens_seen": 18054688, "step": 20850 }, { "epoch": 9.832626119754833, "grad_norm": 0.0029478874057531357, "learning_rate": 0.13994659151031685, "loss": 0.242, "num_input_tokens_seen": 18059600, "step": 20855 }, { "epoch": 9.834983498349835, "grad_norm": 0.001765622990205884, "learning_rate": 0.13988781987532323, "loss": 0.2043, "num_input_tokens_seen": 18064272, "step": 20860 }, { "epoch": 9.837340876944838, "grad_norm": 0.0019855352584272623, "learning_rate": 0.1398290497997549, "loss": 0.2481, "num_input_tokens_seen": 18068448, "step": 20865 }, { "epoch": 9.83969825553984, "grad_norm": 0.0021878390107303858, "learning_rate": 0.13977028129267488, "loss": 0.2512, "num_input_tokens_seen": 18072768, "step": 20870 }, { "epoch": 9.842055634134843, "grad_norm": 0.0013581026578322053, "learning_rate": 0.13971151436314605, "loss": 0.2349, "num_input_tokens_seen": 18077264, "step": 20875 }, { "epoch": 9.844413012729845, "grad_norm": 0.0028387701604515314, "learning_rate": 0.13965274902023103, "loss": 0.2098, "num_input_tokens_seen": 18082560, "step": 20880 }, { "epoch": 9.846770391324847, "grad_norm": 0.0010935210157185793, "learning_rate": 0.13959398527299208, "loss": 0.2224, "num_input_tokens_seen": 18086432, "step": 20885 }, { "epoch": 9.84912776991985, "grad_norm": 0.00331748160533607, "learning_rate": 0.13953522313049138, "loss": 0.2837, "num_input_tokens_seen": 18090832, "step": 20890 }, { "epoch": 9.851485148514852, "grad_norm": 0.002927246503531933, "learning_rate": 0.13947646260179083, "loss": 0.2372, "num_input_tokens_seen": 18094240, "step": 20895 }, { "epoch": 9.853842527109855, "grad_norm": 0.002717055846005678, "learning_rate": 0.13941770369595194, "loss": 0.217, "num_input_tokens_seen": 18099104, "step": 20900 }, { "epoch": 9.856199905704855, "grad_norm": 0.0019298747647553682, "learning_rate": 0.1393589464220362, "loss": 0.2513, "num_input_tokens_seen": 18103296, "step": 20905 }, { "epoch": 9.85855728429986, "grad_norm": 0.0016360790468752384, "learning_rate": 0.13930019078910455, "loss": 0.2239, "num_input_tokens_seen": 18107856, "step": 20910 }, { "epoch": 9.86091466289486, "grad_norm": 0.0031283670105040073, "learning_rate": 0.139241436806218, "loss": 0.2309, "num_input_tokens_seen": 18112064, "step": 20915 }, { "epoch": 9.863272041489862, "grad_norm": 0.0018226697575300932, "learning_rate": 0.13918268448243712, "loss": 0.1835, "num_input_tokens_seen": 18116464, "step": 20920 }, { "epoch": 9.865629420084865, "grad_norm": 0.008400446735322475, "learning_rate": 0.13912393382682217, "loss": 0.2324, "num_input_tokens_seen": 18120896, "step": 20925 }, { "epoch": 9.867986798679867, "grad_norm": 0.0018560690805315971, "learning_rate": 0.1390651848484333, "loss": 0.2568, "num_input_tokens_seen": 18125712, "step": 20930 }, { "epoch": 9.87034417727487, "grad_norm": 0.004047660622745752, "learning_rate": 0.1390064375563304, "loss": 0.2559, "num_input_tokens_seen": 18130592, "step": 20935 }, { "epoch": 9.872701555869872, "grad_norm": 0.0024155036080628633, "learning_rate": 0.13894769195957293, "loss": 0.2097, "num_input_tokens_seen": 18135248, "step": 20940 }, { "epoch": 9.875058934464874, "grad_norm": 0.004974984098225832, "learning_rate": 0.13888894806722032, "loss": 0.2157, "num_input_tokens_seen": 18139216, "step": 20945 }, { "epoch": 9.877416313059877, "grad_norm": 0.001171640818938613, "learning_rate": 0.1388302058883315, "loss": 0.237, "num_input_tokens_seen": 18144016, "step": 20950 }, { "epoch": 9.87977369165488, "grad_norm": 0.0011932163033634424, "learning_rate": 0.13877146543196528, "loss": 0.2176, "num_input_tokens_seen": 18147984, "step": 20955 }, { "epoch": 9.882131070249882, "grad_norm": 0.0011462057009339333, "learning_rate": 0.13871272670718027, "loss": 0.1857, "num_input_tokens_seen": 18152080, "step": 20960 }, { "epoch": 9.884488448844884, "grad_norm": 0.00197031581774354, "learning_rate": 0.13865398972303455, "loss": 0.2425, "num_input_tokens_seen": 18156128, "step": 20965 }, { "epoch": 9.886845827439886, "grad_norm": 0.0017002859385684133, "learning_rate": 0.13859525448858623, "loss": 0.2347, "num_input_tokens_seen": 18160224, "step": 20970 }, { "epoch": 9.889203206034889, "grad_norm": 0.001827147090807557, "learning_rate": 0.13853652101289304, "loss": 0.1967, "num_input_tokens_seen": 18164128, "step": 20975 }, { "epoch": 9.891560584629891, "grad_norm": 0.0016634489875286818, "learning_rate": 0.13847778930501234, "loss": 0.1633, "num_input_tokens_seen": 18168800, "step": 20980 }, { "epoch": 9.893917963224894, "grad_norm": 0.0012185194063931704, "learning_rate": 0.1384190593740013, "loss": 0.2056, "num_input_tokens_seen": 18173200, "step": 20985 }, { "epoch": 9.896275341819896, "grad_norm": 0.00219575478695333, "learning_rate": 0.13836033122891686, "loss": 0.264, "num_input_tokens_seen": 18177344, "step": 20990 }, { "epoch": 9.898632720414899, "grad_norm": 0.00421916414052248, "learning_rate": 0.1383016048788156, "loss": 0.2442, "num_input_tokens_seen": 18181968, "step": 20995 }, { "epoch": 9.900990099009901, "grad_norm": 0.0024581674952059984, "learning_rate": 0.13824288033275392, "loss": 0.2083, "num_input_tokens_seen": 18186480, "step": 21000 }, { "epoch": 9.900990099009901, "eval_loss": 0.22014720737934113, "eval_runtime": 21.9322, "eval_samples_per_second": 42.996, "eval_steps_per_second": 21.521, "num_input_tokens_seen": 18186480, "step": 21000 }, { "epoch": 9.903347477604903, "grad_norm": 0.0018667903495952487, "learning_rate": 0.1381841575997878, "loss": 0.1631, "num_input_tokens_seen": 18189808, "step": 21005 }, { "epoch": 9.905704856199906, "grad_norm": 0.0020262333564460278, "learning_rate": 0.13812543668897306, "loss": 0.2335, "num_input_tokens_seen": 18195152, "step": 21010 }, { "epoch": 9.908062234794908, "grad_norm": 0.0013739721616730094, "learning_rate": 0.13806671760936526, "loss": 0.2201, "num_input_tokens_seen": 18199424, "step": 21015 }, { "epoch": 9.91041961338991, "grad_norm": 0.0012936403509229422, "learning_rate": 0.13800800037001956, "loss": 0.1722, "num_input_tokens_seen": 18204592, "step": 21020 }, { "epoch": 9.912776991984913, "grad_norm": 0.002181220566853881, "learning_rate": 0.13794928497999087, "loss": 0.225, "num_input_tokens_seen": 18209456, "step": 21025 }, { "epoch": 9.915134370579915, "grad_norm": 0.0009585630614310503, "learning_rate": 0.1378905714483339, "loss": 0.2322, "num_input_tokens_seen": 18213952, "step": 21030 }, { "epoch": 9.917491749174918, "grad_norm": 0.0009372510248795152, "learning_rate": 0.13783185978410295, "loss": 0.2135, "num_input_tokens_seen": 18217600, "step": 21035 }, { "epoch": 9.91984912776992, "grad_norm": 0.00174662412609905, "learning_rate": 0.13777314999635218, "loss": 0.206, "num_input_tokens_seen": 18221904, "step": 21040 }, { "epoch": 9.922206506364923, "grad_norm": 0.0018752949545159936, "learning_rate": 0.1377144420941353, "loss": 0.2124, "num_input_tokens_seen": 18226464, "step": 21045 }, { "epoch": 9.924563884959925, "grad_norm": 0.0009554074495099485, "learning_rate": 0.13765573608650586, "loss": 0.2166, "num_input_tokens_seen": 18230192, "step": 21050 }, { "epoch": 9.926921263554927, "grad_norm": 0.001096214517019689, "learning_rate": 0.13759703198251702, "loss": 0.2553, "num_input_tokens_seen": 18234624, "step": 21055 }, { "epoch": 9.92927864214993, "grad_norm": 0.0020234626717865467, "learning_rate": 0.13753832979122174, "loss": 0.2229, "num_input_tokens_seen": 18238992, "step": 21060 }, { "epoch": 9.931636020744932, "grad_norm": 0.0009081608150154352, "learning_rate": 0.13747962952167264, "loss": 0.2205, "num_input_tokens_seen": 18242864, "step": 21065 }, { "epoch": 9.933993399339935, "grad_norm": 0.0016321418806910515, "learning_rate": 0.13742093118292192, "loss": 0.2402, "num_input_tokens_seen": 18247504, "step": 21070 }, { "epoch": 9.936350777934937, "grad_norm": 0.001026210025884211, "learning_rate": 0.13736223478402174, "loss": 0.2217, "num_input_tokens_seen": 18251024, "step": 21075 }, { "epoch": 9.93870815652994, "grad_norm": 0.002431320259347558, "learning_rate": 0.1373035403340238, "loss": 0.2029, "num_input_tokens_seen": 18256160, "step": 21080 }, { "epoch": 9.941065535124942, "grad_norm": 0.0019542360678315163, "learning_rate": 0.13724484784197943, "loss": 0.2308, "num_input_tokens_seen": 18260336, "step": 21085 }, { "epoch": 9.943422913719944, "grad_norm": 0.002780736656859517, "learning_rate": 0.13718615731693987, "loss": 0.2161, "num_input_tokens_seen": 18265536, "step": 21090 }, { "epoch": 9.945780292314947, "grad_norm": 0.0017699002055451274, "learning_rate": 0.13712746876795587, "loss": 0.219, "num_input_tokens_seen": 18269280, "step": 21095 }, { "epoch": 9.948137670909949, "grad_norm": 0.0017739393515512347, "learning_rate": 0.13706878220407792, "loss": 0.2349, "num_input_tokens_seen": 18273552, "step": 21100 }, { "epoch": 9.950495049504951, "grad_norm": 0.0027136639691889286, "learning_rate": 0.13701009763435631, "loss": 0.2109, "num_input_tokens_seen": 18277632, "step": 21105 }, { "epoch": 9.952852428099952, "grad_norm": 0.002867842325940728, "learning_rate": 0.13695141506784084, "loss": 0.2384, "num_input_tokens_seen": 18282576, "step": 21110 }, { "epoch": 9.955209806694956, "grad_norm": 0.0011049900203943253, "learning_rate": 0.13689273451358114, "loss": 0.2365, "num_input_tokens_seen": 18286064, "step": 21115 }, { "epoch": 9.957567185289957, "grad_norm": 0.002707589417695999, "learning_rate": 0.13683405598062653, "loss": 0.236, "num_input_tokens_seen": 18291392, "step": 21120 }, { "epoch": 9.95992456388496, "grad_norm": 0.0031974036246538162, "learning_rate": 0.1367753794780259, "loss": 0.2399, "num_input_tokens_seen": 18295888, "step": 21125 }, { "epoch": 9.962281942479962, "grad_norm": 0.0021392125636339188, "learning_rate": 0.13671670501482802, "loss": 0.2267, "num_input_tokens_seen": 18299856, "step": 21130 }, { "epoch": 9.964639321074964, "grad_norm": 0.0012960751773789525, "learning_rate": 0.1366580326000811, "loss": 0.2074, "num_input_tokens_seen": 18303584, "step": 21135 }, { "epoch": 9.966996699669966, "grad_norm": 0.0014119481202214956, "learning_rate": 0.1365993622428332, "loss": 0.1963, "num_input_tokens_seen": 18307792, "step": 21140 }, { "epoch": 9.969354078264969, "grad_norm": 0.001165733439847827, "learning_rate": 0.13654069395213211, "loss": 0.2008, "num_input_tokens_seen": 18312416, "step": 21145 }, { "epoch": 9.971711456859971, "grad_norm": 0.001463819993659854, "learning_rate": 0.13648202773702509, "loss": 0.288, "num_input_tokens_seen": 18316560, "step": 21150 }, { "epoch": 9.974068835454974, "grad_norm": 0.0026052154134958982, "learning_rate": 0.13642336360655927, "loss": 0.2013, "num_input_tokens_seen": 18321568, "step": 21155 }, { "epoch": 9.976426214049976, "grad_norm": 0.0010159129742532969, "learning_rate": 0.13636470156978145, "loss": 0.2346, "num_input_tokens_seen": 18326016, "step": 21160 }, { "epoch": 9.978783592644978, "grad_norm": 0.001300600590184331, "learning_rate": 0.13630604163573798, "loss": 0.1881, "num_input_tokens_seen": 18330560, "step": 21165 }, { "epoch": 9.98114097123998, "grad_norm": 0.001627613208256662, "learning_rate": 0.13624738381347495, "loss": 0.2026, "num_input_tokens_seen": 18335040, "step": 21170 }, { "epoch": 9.983498349834983, "grad_norm": 0.0023862365633249283, "learning_rate": 0.1361887281120382, "loss": 0.2305, "num_input_tokens_seen": 18339024, "step": 21175 }, { "epoch": 9.985855728429986, "grad_norm": 0.0027408418245613575, "learning_rate": 0.13613007454047307, "loss": 0.221, "num_input_tokens_seen": 18343520, "step": 21180 }, { "epoch": 9.988213107024988, "grad_norm": 0.001805687672458589, "learning_rate": 0.13607142310782486, "loss": 0.2216, "num_input_tokens_seen": 18348304, "step": 21185 }, { "epoch": 9.99057048561999, "grad_norm": 0.0013148213038221002, "learning_rate": 0.13601277382313814, "loss": 0.2183, "num_input_tokens_seen": 18352272, "step": 21190 }, { "epoch": 9.992927864214993, "grad_norm": 0.001472087693400681, "learning_rate": 0.1359541266954575, "loss": 0.212, "num_input_tokens_seen": 18356640, "step": 21195 }, { "epoch": 9.995285242809995, "grad_norm": 0.0012460732832551003, "learning_rate": 0.13589548173382707, "loss": 0.2329, "num_input_tokens_seen": 18360368, "step": 21200 }, { "epoch": 9.995285242809995, "eval_loss": 0.21801726520061493, "eval_runtime": 21.9151, "eval_samples_per_second": 43.03, "eval_steps_per_second": 21.538, "num_input_tokens_seen": 18360368, "step": 21200 }, { "epoch": 9.997642621404998, "grad_norm": 0.001578094088472426, "learning_rate": 0.1358368389472906, "loss": 0.2541, "num_input_tokens_seen": 18364368, "step": 21205 }, { "epoch": 10.0, "grad_norm": 0.0013486967654898763, "learning_rate": 0.13577819834489155, "loss": 0.2144, "num_input_tokens_seen": 18368192, "step": 21210 }, { "epoch": 10.002357378595002, "grad_norm": 0.002214340493083, "learning_rate": 0.135719559935673, "loss": 0.201, "num_input_tokens_seen": 18372992, "step": 21215 }, { "epoch": 10.004714757190005, "grad_norm": 0.0021097101271152496, "learning_rate": 0.13566092372867775, "loss": 0.2184, "num_input_tokens_seen": 18377920, "step": 21220 }, { "epoch": 10.007072135785007, "grad_norm": 0.002251502126455307, "learning_rate": 0.13560228973294833, "loss": 0.2173, "num_input_tokens_seen": 18383072, "step": 21225 }, { "epoch": 10.00942951438001, "grad_norm": 0.001967727905139327, "learning_rate": 0.13554365795752668, "loss": 0.2283, "num_input_tokens_seen": 18387824, "step": 21230 }, { "epoch": 10.011786892975012, "grad_norm": 0.0018676373874768615, "learning_rate": 0.1354850284114547, "loss": 0.2291, "num_input_tokens_seen": 18391760, "step": 21235 }, { "epoch": 10.014144271570014, "grad_norm": 0.0013665041187778115, "learning_rate": 0.13542640110377374, "loss": 0.2586, "num_input_tokens_seen": 18396720, "step": 21240 }, { "epoch": 10.016501650165017, "grad_norm": 0.0016202329425141215, "learning_rate": 0.13536777604352487, "loss": 0.2196, "num_input_tokens_seen": 18400704, "step": 21245 }, { "epoch": 10.01885902876002, "grad_norm": 0.0039044334553182125, "learning_rate": 0.13530915323974887, "loss": 0.2246, "num_input_tokens_seen": 18404624, "step": 21250 }, { "epoch": 10.021216407355022, "grad_norm": 0.001436038757674396, "learning_rate": 0.13525053270148596, "loss": 0.2223, "num_input_tokens_seen": 18409104, "step": 21255 }, { "epoch": 10.023573785950024, "grad_norm": 0.0016989236464723945, "learning_rate": 0.13519191443777628, "loss": 0.2223, "num_input_tokens_seen": 18413168, "step": 21260 }, { "epoch": 10.025931164545026, "grad_norm": 0.0012037827400490642, "learning_rate": 0.13513329845765953, "loss": 0.2197, "num_input_tokens_seen": 18417456, "step": 21265 }, { "epoch": 10.028288543140029, "grad_norm": 0.001256100251339376, "learning_rate": 0.13507468477017495, "loss": 0.2262, "num_input_tokens_seen": 18422256, "step": 21270 }, { "epoch": 10.030645921735031, "grad_norm": 0.0017386305844411254, "learning_rate": 0.13501607338436153, "loss": 0.1869, "num_input_tokens_seen": 18428192, "step": 21275 }, { "epoch": 10.033003300330034, "grad_norm": 0.001125015551224351, "learning_rate": 0.13495746430925798, "loss": 0.2079, "num_input_tokens_seen": 18432384, "step": 21280 }, { "epoch": 10.035360678925036, "grad_norm": 0.0031460796017199755, "learning_rate": 0.13489885755390238, "loss": 0.2575, "num_input_tokens_seen": 18436784, "step": 21285 }, { "epoch": 10.037718057520038, "grad_norm": 0.0011611274676397443, "learning_rate": 0.13484025312733275, "loss": 0.2817, "num_input_tokens_seen": 18441312, "step": 21290 }, { "epoch": 10.04007543611504, "grad_norm": 0.0016808726359158754, "learning_rate": 0.13478165103858658, "loss": 0.1929, "num_input_tokens_seen": 18445728, "step": 21295 }, { "epoch": 10.042432814710043, "grad_norm": 0.001612075138837099, "learning_rate": 0.13472305129670106, "loss": 0.1939, "num_input_tokens_seen": 18450096, "step": 21300 }, { "epoch": 10.044790193305046, "grad_norm": 0.002591305412352085, "learning_rate": 0.13466445391071305, "loss": 0.2557, "num_input_tokens_seen": 18454560, "step": 21305 }, { "epoch": 10.047147571900048, "grad_norm": 0.001299537718296051, "learning_rate": 0.13460585888965895, "loss": 0.2096, "num_input_tokens_seen": 18458656, "step": 21310 }, { "epoch": 10.049504950495049, "grad_norm": 0.0017594894161447883, "learning_rate": 0.13454726624257482, "loss": 0.254, "num_input_tokens_seen": 18463072, "step": 21315 }, { "epoch": 10.051862329090051, "grad_norm": 0.0026462057139724493, "learning_rate": 0.1344886759784965, "loss": 0.2314, "num_input_tokens_seen": 18467424, "step": 21320 }, { "epoch": 10.054219707685053, "grad_norm": 0.0018532495014369488, "learning_rate": 0.13443008810645923, "loss": 0.2125, "num_input_tokens_seen": 18471952, "step": 21325 }, { "epoch": 10.056577086280056, "grad_norm": 0.0010463576763868332, "learning_rate": 0.13437150263549807, "loss": 0.2408, "num_input_tokens_seen": 18476416, "step": 21330 }, { "epoch": 10.058934464875058, "grad_norm": 0.0013170493766665459, "learning_rate": 0.13431291957464755, "loss": 0.224, "num_input_tokens_seen": 18480768, "step": 21335 }, { "epoch": 10.06129184347006, "grad_norm": 0.0013173696352168918, "learning_rate": 0.13425433893294197, "loss": 0.215, "num_input_tokens_seen": 18485472, "step": 21340 }, { "epoch": 10.063649222065063, "grad_norm": 0.0019737286493182182, "learning_rate": 0.13419576071941525, "loss": 0.2372, "num_input_tokens_seen": 18490336, "step": 21345 }, { "epoch": 10.066006600660065, "grad_norm": 0.0014504195423796773, "learning_rate": 0.1341371849431008, "loss": 0.1848, "num_input_tokens_seen": 18494864, "step": 21350 }, { "epoch": 10.068363979255068, "grad_norm": 0.0016785485204309225, "learning_rate": 0.13407861161303178, "loss": 0.25, "num_input_tokens_seen": 18498960, "step": 21355 }, { "epoch": 10.07072135785007, "grad_norm": 0.001222933758981526, "learning_rate": 0.13402004073824098, "loss": 0.2358, "num_input_tokens_seen": 18503136, "step": 21360 }, { "epoch": 10.073078736445073, "grad_norm": 0.0011035986244678497, "learning_rate": 0.13396147232776062, "loss": 0.223, "num_input_tokens_seen": 18507808, "step": 21365 }, { "epoch": 10.075436115040075, "grad_norm": 0.0011290839174762368, "learning_rate": 0.13390290639062288, "loss": 0.2029, "num_input_tokens_seen": 18512496, "step": 21370 }, { "epoch": 10.077793493635077, "grad_norm": 0.0017865909030660987, "learning_rate": 0.13384434293585917, "loss": 0.2003, "num_input_tokens_seen": 18516976, "step": 21375 }, { "epoch": 10.08015087223008, "grad_norm": 0.0018875985406339169, "learning_rate": 0.13378578197250088, "loss": 0.2374, "num_input_tokens_seen": 18521216, "step": 21380 }, { "epoch": 10.082508250825082, "grad_norm": 0.0018964592600241303, "learning_rate": 0.13372722350957872, "loss": 0.2264, "num_input_tokens_seen": 18526768, "step": 21385 }, { "epoch": 10.084865629420085, "grad_norm": 0.0014962619170546532, "learning_rate": 0.13366866755612322, "loss": 0.2432, "num_input_tokens_seen": 18530624, "step": 21390 }, { "epoch": 10.087223008015087, "grad_norm": 0.0019652624614536762, "learning_rate": 0.13361011412116436, "loss": 0.229, "num_input_tokens_seen": 18535696, "step": 21395 }, { "epoch": 10.08958038661009, "grad_norm": 0.0009250770090147853, "learning_rate": 0.13355156321373196, "loss": 0.2151, "num_input_tokens_seen": 18539664, "step": 21400 }, { "epoch": 10.08958038661009, "eval_loss": 0.21888743340969086, "eval_runtime": 21.9452, "eval_samples_per_second": 42.971, "eval_steps_per_second": 21.508, "num_input_tokens_seen": 18539664, "step": 21400 }, { "epoch": 10.091937765205092, "grad_norm": 0.002055337419733405, "learning_rate": 0.13349301484285514, "loss": 0.2287, "num_input_tokens_seen": 18544992, "step": 21405 }, { "epoch": 10.094295143800094, "grad_norm": 0.0011586814653128386, "learning_rate": 0.13343446901756295, "loss": 0.2051, "num_input_tokens_seen": 18549200, "step": 21410 }, { "epoch": 10.096652522395097, "grad_norm": 0.0016689538024365902, "learning_rate": 0.13337592574688376, "loss": 0.2206, "num_input_tokens_seen": 18553632, "step": 21415 }, { "epoch": 10.099009900990099, "grad_norm": 0.0018155189463868737, "learning_rate": 0.13331738503984572, "loss": 0.2285, "num_input_tokens_seen": 18557632, "step": 21420 }, { "epoch": 10.101367279585101, "grad_norm": 0.001697544939815998, "learning_rate": 0.1332588469054766, "loss": 0.2359, "num_input_tokens_seen": 18562704, "step": 21425 }, { "epoch": 10.103724658180104, "grad_norm": 0.0009774485370144248, "learning_rate": 0.1332003113528036, "loss": 0.2105, "num_input_tokens_seen": 18567216, "step": 21430 }, { "epoch": 10.106082036775106, "grad_norm": 0.001356321619823575, "learning_rate": 0.13314177839085373, "loss": 0.2309, "num_input_tokens_seen": 18571856, "step": 21435 }, { "epoch": 10.108439415370109, "grad_norm": 0.0016018390888348222, "learning_rate": 0.13308324802865354, "loss": 0.217, "num_input_tokens_seen": 18575936, "step": 21440 }, { "epoch": 10.110796793965111, "grad_norm": 0.0014719140017405152, "learning_rate": 0.13302472027522905, "loss": 0.2335, "num_input_tokens_seen": 18580064, "step": 21445 }, { "epoch": 10.113154172560114, "grad_norm": 0.0016623333794996142, "learning_rate": 0.13296619513960606, "loss": 0.2308, "num_input_tokens_seen": 18584336, "step": 21450 }, { "epoch": 10.115511551155116, "grad_norm": 0.0014535210793837905, "learning_rate": 0.1329076726308098, "loss": 0.2247, "num_input_tokens_seen": 18588656, "step": 21455 }, { "epoch": 10.117868929750118, "grad_norm": 0.0013843660708516836, "learning_rate": 0.13284915275786519, "loss": 0.2248, "num_input_tokens_seen": 18592096, "step": 21460 }, { "epoch": 10.12022630834512, "grad_norm": 0.0017334980657324195, "learning_rate": 0.1327906355297968, "loss": 0.2198, "num_input_tokens_seen": 18596944, "step": 21465 }, { "epoch": 10.122583686940123, "grad_norm": 0.0021600709296762943, "learning_rate": 0.13273212095562867, "loss": 0.2063, "num_input_tokens_seen": 18601856, "step": 21470 }, { "epoch": 10.124941065535126, "grad_norm": 0.0016057930188253522, "learning_rate": 0.13267360904438444, "loss": 0.2525, "num_input_tokens_seen": 18607088, "step": 21475 }, { "epoch": 10.127298444130128, "grad_norm": 0.001730194315314293, "learning_rate": 0.1326150998050875, "loss": 0.2278, "num_input_tokens_seen": 18611888, "step": 21480 }, { "epoch": 10.12965582272513, "grad_norm": 0.0009589291294105351, "learning_rate": 0.1325565932467606, "loss": 0.1965, "num_input_tokens_seen": 18615904, "step": 21485 }, { "epoch": 10.132013201320133, "grad_norm": 0.001745629939250648, "learning_rate": 0.13249808937842628, "loss": 0.2192, "num_input_tokens_seen": 18620224, "step": 21490 }, { "epoch": 10.134370579915135, "grad_norm": 0.00217812554910779, "learning_rate": 0.1324395882091065, "loss": 0.249, "num_input_tokens_seen": 18624160, "step": 21495 }, { "epoch": 10.136727958510138, "grad_norm": 0.0013365367194637656, "learning_rate": 0.13238108974782284, "loss": 0.2014, "num_input_tokens_seen": 18628496, "step": 21500 }, { "epoch": 10.13908533710514, "grad_norm": 0.0018434942467138171, "learning_rate": 0.13232259400359664, "loss": 0.2006, "num_input_tokens_seen": 18632304, "step": 21505 }, { "epoch": 10.14144271570014, "grad_norm": 0.0016380795277655125, "learning_rate": 0.13226410098544852, "loss": 0.1927, "num_input_tokens_seen": 18637008, "step": 21510 }, { "epoch": 10.143800094295143, "grad_norm": 0.0020819443743675947, "learning_rate": 0.13220561070239892, "loss": 0.2414, "num_input_tokens_seen": 18641392, "step": 21515 }, { "epoch": 10.146157472890145, "grad_norm": 0.0030648489482700825, "learning_rate": 0.13214712316346783, "loss": 0.184, "num_input_tokens_seen": 18646368, "step": 21520 }, { "epoch": 10.148514851485148, "grad_norm": 0.001453018863685429, "learning_rate": 0.13208863837767465, "loss": 0.2525, "num_input_tokens_seen": 18650240, "step": 21525 }, { "epoch": 10.15087223008015, "grad_norm": 0.0017490035388618708, "learning_rate": 0.13203015635403856, "loss": 0.2153, "num_input_tokens_seen": 18655136, "step": 21530 }, { "epoch": 10.153229608675153, "grad_norm": 0.001426578382961452, "learning_rate": 0.13197167710157817, "loss": 0.1791, "num_input_tokens_seen": 18659632, "step": 21535 }, { "epoch": 10.155586987270155, "grad_norm": 0.0015461939619854093, "learning_rate": 0.13191320062931167, "loss": 0.2193, "num_input_tokens_seen": 18664080, "step": 21540 }, { "epoch": 10.157944365865157, "grad_norm": 0.0012613626895472407, "learning_rate": 0.13185472694625702, "loss": 0.1665, "num_input_tokens_seen": 18668272, "step": 21545 }, { "epoch": 10.16030174446016, "grad_norm": 0.0011577388504520059, "learning_rate": 0.13179625606143142, "loss": 0.2317, "num_input_tokens_seen": 18672352, "step": 21550 }, { "epoch": 10.162659123055162, "grad_norm": 0.002230259357020259, "learning_rate": 0.13173778798385188, "loss": 0.3123, "num_input_tokens_seen": 18678400, "step": 21555 }, { "epoch": 10.165016501650165, "grad_norm": 0.0012496886774897575, "learning_rate": 0.13167932272253505, "loss": 0.2203, "num_input_tokens_seen": 18682608, "step": 21560 }, { "epoch": 10.167373880245167, "grad_norm": 0.0010703924344852567, "learning_rate": 0.1316208602864968, "loss": 0.252, "num_input_tokens_seen": 18686832, "step": 21565 }, { "epoch": 10.16973125884017, "grad_norm": 0.0014066774165257812, "learning_rate": 0.13156240068475292, "loss": 0.2309, "num_input_tokens_seen": 18691808, "step": 21570 }, { "epoch": 10.172088637435172, "grad_norm": 0.0010762856109067798, "learning_rate": 0.1315039439263185, "loss": 0.2189, "num_input_tokens_seen": 18697088, "step": 21575 }, { "epoch": 10.174446016030174, "grad_norm": 0.0017218475695699453, "learning_rate": 0.13144549002020833, "loss": 0.2181, "num_input_tokens_seen": 18701440, "step": 21580 }, { "epoch": 10.176803394625177, "grad_norm": 0.0010263760341331363, "learning_rate": 0.13138703897543688, "loss": 0.2491, "num_input_tokens_seen": 18705472, "step": 21585 }, { "epoch": 10.179160773220179, "grad_norm": 0.0015922652091830969, "learning_rate": 0.1313285908010178, "loss": 0.1831, "num_input_tokens_seen": 18709616, "step": 21590 }, { "epoch": 10.181518151815181, "grad_norm": 0.0009499292937107384, "learning_rate": 0.13127014550596475, "loss": 0.1716, "num_input_tokens_seen": 18714176, "step": 21595 }, { "epoch": 10.183875530410184, "grad_norm": 0.0011007396969944239, "learning_rate": 0.1312117030992906, "loss": 0.2113, "num_input_tokens_seen": 18718016, "step": 21600 }, { "epoch": 10.183875530410184, "eval_loss": 0.22318710386753082, "eval_runtime": 21.9081, "eval_samples_per_second": 43.043, "eval_steps_per_second": 21.544, "num_input_tokens_seen": 18718016, "step": 21600 }, { "epoch": 10.186232909005186, "grad_norm": 0.0015942996833473444, "learning_rate": 0.13115326359000795, "loss": 0.2556, "num_input_tokens_seen": 18723008, "step": 21605 }, { "epoch": 10.188590287600189, "grad_norm": 0.0012011429062113166, "learning_rate": 0.13109482698712896, "loss": 0.2379, "num_input_tokens_seen": 18727184, "step": 21610 }, { "epoch": 10.190947666195191, "grad_norm": 0.0008231604588218033, "learning_rate": 0.1310363932996651, "loss": 0.2403, "num_input_tokens_seen": 18731072, "step": 21615 }, { "epoch": 10.193305044790193, "grad_norm": 0.0011203772155568004, "learning_rate": 0.13097796253662775, "loss": 0.2397, "num_input_tokens_seen": 18735424, "step": 21620 }, { "epoch": 10.195662423385196, "grad_norm": 0.002201799303293228, "learning_rate": 0.1309195347070277, "loss": 0.2135, "num_input_tokens_seen": 18739856, "step": 21625 }, { "epoch": 10.198019801980198, "grad_norm": 0.001779241138137877, "learning_rate": 0.13086110981987506, "loss": 0.2139, "num_input_tokens_seen": 18744720, "step": 21630 }, { "epoch": 10.2003771805752, "grad_norm": 0.0012009958736598492, "learning_rate": 0.13080268788417987, "loss": 0.2103, "num_input_tokens_seen": 18748656, "step": 21635 }, { "epoch": 10.202734559170203, "grad_norm": 0.001297451090067625, "learning_rate": 0.1307442689089515, "loss": 0.2371, "num_input_tokens_seen": 18751968, "step": 21640 }, { "epoch": 10.205091937765205, "grad_norm": 0.0014946481678634882, "learning_rate": 0.13068585290319873, "loss": 0.1917, "num_input_tokens_seen": 18756032, "step": 21645 }, { "epoch": 10.207449316360208, "grad_norm": 0.0010517353657633066, "learning_rate": 0.13062743987593026, "loss": 0.2133, "num_input_tokens_seen": 18759856, "step": 21650 }, { "epoch": 10.20980669495521, "grad_norm": 0.0008070350158959627, "learning_rate": 0.13056902983615395, "loss": 0.2321, "num_input_tokens_seen": 18764064, "step": 21655 }, { "epoch": 10.212164073550213, "grad_norm": 0.0022804324980825186, "learning_rate": 0.13051062279287742, "loss": 0.2576, "num_input_tokens_seen": 18768512, "step": 21660 }, { "epoch": 10.214521452145215, "grad_norm": 0.0014580650022253394, "learning_rate": 0.13045221875510782, "loss": 0.2102, "num_input_tokens_seen": 18772544, "step": 21665 }, { "epoch": 10.216878830740217, "grad_norm": 0.0029510511085391045, "learning_rate": 0.13039381773185174, "loss": 0.228, "num_input_tokens_seen": 18776656, "step": 21670 }, { "epoch": 10.21923620933522, "grad_norm": 0.0015310606686398387, "learning_rate": 0.1303354197321153, "loss": 0.2261, "num_input_tokens_seen": 18781200, "step": 21675 }, { "epoch": 10.221593587930222, "grad_norm": 0.0018566305516287684, "learning_rate": 0.13027702476490433, "loss": 0.2261, "num_input_tokens_seen": 18785024, "step": 21680 }, { "epoch": 10.223950966525225, "grad_norm": 0.001569773769006133, "learning_rate": 0.1302186328392239, "loss": 0.2163, "num_input_tokens_seen": 18789616, "step": 21685 }, { "epoch": 10.226308345120227, "grad_norm": 0.0009505623602308333, "learning_rate": 0.130160243964079, "loss": 0.2079, "num_input_tokens_seen": 18793872, "step": 21690 }, { "epoch": 10.22866572371523, "grad_norm": 0.0015276246704161167, "learning_rate": 0.13010185814847372, "loss": 0.2091, "num_input_tokens_seen": 18797920, "step": 21695 }, { "epoch": 10.231023102310232, "grad_norm": 0.001353686093352735, "learning_rate": 0.13004347540141192, "loss": 0.185, "num_input_tokens_seen": 18802784, "step": 21700 }, { "epoch": 10.233380480905234, "grad_norm": 0.0010096713667735457, "learning_rate": 0.12998509573189712, "loss": 0.2349, "num_input_tokens_seen": 18807200, "step": 21705 }, { "epoch": 10.235737859500237, "grad_norm": 0.0016245967708528042, "learning_rate": 0.12992671914893203, "loss": 0.208, "num_input_tokens_seen": 18812544, "step": 21710 }, { "epoch": 10.238095238095237, "grad_norm": 0.0017846112605184317, "learning_rate": 0.12986834566151909, "loss": 0.262, "num_input_tokens_seen": 18816752, "step": 21715 }, { "epoch": 10.24045261669024, "grad_norm": 0.002208509249612689, "learning_rate": 0.12980997527866028, "loss": 0.2166, "num_input_tokens_seen": 18821456, "step": 21720 }, { "epoch": 10.242809995285242, "grad_norm": 0.0019410925451666117, "learning_rate": 0.12975160800935692, "loss": 0.2495, "num_input_tokens_seen": 18826032, "step": 21725 }, { "epoch": 10.245167373880244, "grad_norm": 0.0011525312438607216, "learning_rate": 0.12969324386261016, "loss": 0.1902, "num_input_tokens_seen": 18830304, "step": 21730 }, { "epoch": 10.247524752475247, "grad_norm": 0.0012868059566244483, "learning_rate": 0.12963488284742034, "loss": 0.2204, "num_input_tokens_seen": 18834512, "step": 21735 }, { "epoch": 10.24988213107025, "grad_norm": 0.0017551748314872384, "learning_rate": 0.12957652497278752, "loss": 0.2513, "num_input_tokens_seen": 18838416, "step": 21740 }, { "epoch": 10.252239509665252, "grad_norm": 0.0009911122033372521, "learning_rate": 0.12951817024771117, "loss": 0.1872, "num_input_tokens_seen": 18842912, "step": 21745 }, { "epoch": 10.254596888260254, "grad_norm": 0.0016291238134726882, "learning_rate": 0.12945981868119041, "loss": 0.2241, "num_input_tokens_seen": 18846944, "step": 21750 }, { "epoch": 10.256954266855256, "grad_norm": 0.0016594991320744157, "learning_rate": 0.12940147028222376, "loss": 0.2299, "num_input_tokens_seen": 18851040, "step": 21755 }, { "epoch": 10.259311645450259, "grad_norm": 0.0010734983952715993, "learning_rate": 0.12934312505980916, "loss": 0.2028, "num_input_tokens_seen": 18855200, "step": 21760 }, { "epoch": 10.261669024045261, "grad_norm": 0.0016369061777368188, "learning_rate": 0.1292847830229443, "loss": 0.1998, "num_input_tokens_seen": 18859408, "step": 21765 }, { "epoch": 10.264026402640264, "grad_norm": 0.0010639375541359186, "learning_rate": 0.12922644418062626, "loss": 0.2318, "num_input_tokens_seen": 18863216, "step": 21770 }, { "epoch": 10.266383781235266, "grad_norm": 0.0014716271543875337, "learning_rate": 0.1291681085418515, "loss": 0.228, "num_input_tokens_seen": 18867184, "step": 21775 }, { "epoch": 10.268741159830268, "grad_norm": 0.002565921051427722, "learning_rate": 0.12910977611561628, "loss": 0.2596, "num_input_tokens_seen": 18871856, "step": 21780 }, { "epoch": 10.27109853842527, "grad_norm": 0.0009796597296372056, "learning_rate": 0.1290514469109161, "loss": 0.2189, "num_input_tokens_seen": 18876384, "step": 21785 }, { "epoch": 10.273455917020273, "grad_norm": 0.0014451759634539485, "learning_rate": 0.128993120936746, "loss": 0.2018, "num_input_tokens_seen": 18880672, "step": 21790 }, { "epoch": 10.275813295615276, "grad_norm": 0.0010475526796653867, "learning_rate": 0.12893479820210071, "loss": 0.2241, "num_input_tokens_seen": 18884240, "step": 21795 }, { "epoch": 10.278170674210278, "grad_norm": 0.0010318325366824865, "learning_rate": 0.1288764787159742, "loss": 0.2365, "num_input_tokens_seen": 18888560, "step": 21800 }, { "epoch": 10.278170674210278, "eval_loss": 0.21806401014328003, "eval_runtime": 21.94, "eval_samples_per_second": 42.981, "eval_steps_per_second": 21.513, "num_input_tokens_seen": 18888560, "step": 21800 }, { "epoch": 10.28052805280528, "grad_norm": 0.0012473678216338158, "learning_rate": 0.1288181624873601, "loss": 0.205, "num_input_tokens_seen": 18893248, "step": 21805 }, { "epoch": 10.282885431400283, "grad_norm": 0.0009067459614016116, "learning_rate": 0.12875984952525163, "loss": 0.1993, "num_input_tokens_seen": 18898320, "step": 21810 }, { "epoch": 10.285242809995285, "grad_norm": 0.0009492195094935596, "learning_rate": 0.12870153983864122, "loss": 0.2185, "num_input_tokens_seen": 18902720, "step": 21815 }, { "epoch": 10.287600188590288, "grad_norm": 0.0007980326190590858, "learning_rate": 0.12864323343652104, "loss": 0.1979, "num_input_tokens_seen": 18906912, "step": 21820 }, { "epoch": 10.28995756718529, "grad_norm": 0.0014669899828732014, "learning_rate": 0.12858493032788268, "loss": 0.2947, "num_input_tokens_seen": 18911728, "step": 21825 }, { "epoch": 10.292314945780292, "grad_norm": 0.001951332320459187, "learning_rate": 0.12852663052171714, "loss": 0.1875, "num_input_tokens_seen": 18916000, "step": 21830 }, { "epoch": 10.294672324375295, "grad_norm": 0.0026865974068641663, "learning_rate": 0.12846833402701507, "loss": 0.2132, "num_input_tokens_seen": 18920320, "step": 21835 }, { "epoch": 10.297029702970297, "grad_norm": 0.0012008611811324954, "learning_rate": 0.12841004085276642, "loss": 0.2429, "num_input_tokens_seen": 18924272, "step": 21840 }, { "epoch": 10.2993870815653, "grad_norm": 0.0012169939000159502, "learning_rate": 0.12835175100796076, "loss": 0.2582, "num_input_tokens_seen": 18929360, "step": 21845 }, { "epoch": 10.301744460160302, "grad_norm": 0.00099290837533772, "learning_rate": 0.12829346450158724, "loss": 0.2283, "num_input_tokens_seen": 18934096, "step": 21850 }, { "epoch": 10.304101838755304, "grad_norm": 0.0013641746481880546, "learning_rate": 0.12823518134263423, "loss": 0.2261, "num_input_tokens_seen": 18938336, "step": 21855 }, { "epoch": 10.306459217350307, "grad_norm": 0.0021834061481058598, "learning_rate": 0.12817690154008973, "loss": 0.2287, "num_input_tokens_seen": 18942704, "step": 21860 }, { "epoch": 10.30881659594531, "grad_norm": 0.0013836021535098553, "learning_rate": 0.12811862510294134, "loss": 0.2121, "num_input_tokens_seen": 18946672, "step": 21865 }, { "epoch": 10.311173974540312, "grad_norm": 0.0007522696978412569, "learning_rate": 0.12806035204017585, "loss": 0.212, "num_input_tokens_seen": 18950800, "step": 21870 }, { "epoch": 10.313531353135314, "grad_norm": 0.0018207726534456015, "learning_rate": 0.12800208236077987, "loss": 0.272, "num_input_tokens_seen": 18955888, "step": 21875 }, { "epoch": 10.315888731730317, "grad_norm": 0.001096277846954763, "learning_rate": 0.12794381607373917, "loss": 0.2357, "num_input_tokens_seen": 18959792, "step": 21880 }, { "epoch": 10.318246110325319, "grad_norm": 0.00134360883384943, "learning_rate": 0.12788555318803924, "loss": 0.2199, "num_input_tokens_seen": 18964224, "step": 21885 }, { "epoch": 10.320603488920321, "grad_norm": 0.0019122592639178038, "learning_rate": 0.1278272937126649, "loss": 0.2062, "num_input_tokens_seen": 18968368, "step": 21890 }, { "epoch": 10.322960867515324, "grad_norm": 0.002998204668983817, "learning_rate": 0.1277690376566005, "loss": 0.2548, "num_input_tokens_seen": 18973152, "step": 21895 }, { "epoch": 10.325318246110326, "grad_norm": 0.002046105219051242, "learning_rate": 0.12771078502882985, "loss": 0.2357, "num_input_tokens_seen": 18976816, "step": 21900 }, { "epoch": 10.327675624705329, "grad_norm": 0.0011666463688015938, "learning_rate": 0.12765253583833633, "loss": 0.2324, "num_input_tokens_seen": 18980352, "step": 21905 }, { "epoch": 10.33003300330033, "grad_norm": 0.0013692479114979506, "learning_rate": 0.12759429009410256, "loss": 0.2182, "num_input_tokens_seen": 18984288, "step": 21910 }, { "epoch": 10.332390381895332, "grad_norm": 0.0012697133934125304, "learning_rate": 0.12753604780511085, "loss": 0.2178, "num_input_tokens_seen": 18988544, "step": 21915 }, { "epoch": 10.334747760490334, "grad_norm": 0.0011422073002904654, "learning_rate": 0.12747780898034283, "loss": 0.2119, "num_input_tokens_seen": 18992448, "step": 21920 }, { "epoch": 10.337105139085336, "grad_norm": 0.0009961493778973818, "learning_rate": 0.12741957362877973, "loss": 0.2043, "num_input_tokens_seen": 18997696, "step": 21925 }, { "epoch": 10.339462517680339, "grad_norm": 0.0008563697338104248, "learning_rate": 0.12736134175940214, "loss": 0.1927, "num_input_tokens_seen": 19001856, "step": 21930 }, { "epoch": 10.341819896275341, "grad_norm": 0.0036909363698214293, "learning_rate": 0.12730311338119016, "loss": 0.292, "num_input_tokens_seen": 19006672, "step": 21935 }, { "epoch": 10.344177274870344, "grad_norm": 0.0011630273656919599, "learning_rate": 0.12724488850312327, "loss": 0.2306, "num_input_tokens_seen": 19010432, "step": 21940 }, { "epoch": 10.346534653465346, "grad_norm": 0.0011693460401147604, "learning_rate": 0.1271866671341806, "loss": 0.2169, "num_input_tokens_seen": 19014720, "step": 21945 }, { "epoch": 10.348892032060348, "grad_norm": 0.001077774097211659, "learning_rate": 0.12712844928334047, "loss": 0.2276, "num_input_tokens_seen": 19018624, "step": 21950 }, { "epoch": 10.35124941065535, "grad_norm": 0.002058951184153557, "learning_rate": 0.12707023495958095, "loss": 0.2351, "num_input_tokens_seen": 19022880, "step": 21955 }, { "epoch": 10.353606789250353, "grad_norm": 0.002122128615155816, "learning_rate": 0.12701202417187932, "loss": 0.2265, "num_input_tokens_seen": 19026288, "step": 21960 }, { "epoch": 10.355964167845356, "grad_norm": 0.0019919213373214006, "learning_rate": 0.12695381692921243, "loss": 0.2278, "num_input_tokens_seen": 19030720, "step": 21965 }, { "epoch": 10.358321546440358, "grad_norm": 0.0008595167309977114, "learning_rate": 0.12689561324055665, "loss": 0.231, "num_input_tokens_seen": 19035856, "step": 21970 }, { "epoch": 10.36067892503536, "grad_norm": 0.0017992517678067088, "learning_rate": 0.12683741311488758, "loss": 0.2458, "num_input_tokens_seen": 19040464, "step": 21975 }, { "epoch": 10.363036303630363, "grad_norm": 0.001202175160869956, "learning_rate": 0.1267792165611805, "loss": 0.2181, "num_input_tokens_seen": 19044592, "step": 21980 }, { "epoch": 10.365393682225365, "grad_norm": 0.0018862357828766108, "learning_rate": 0.1267210235884101, "loss": 0.1967, "num_input_tokens_seen": 19048640, "step": 21985 }, { "epoch": 10.367751060820368, "grad_norm": 0.001172007410787046, "learning_rate": 0.12666283420555033, "loss": 0.2086, "num_input_tokens_seen": 19053008, "step": 21990 }, { "epoch": 10.37010843941537, "grad_norm": 0.005234818905591965, "learning_rate": 0.12660464842157487, "loss": 0.2236, "num_input_tokens_seen": 19057808, "step": 21995 }, { "epoch": 10.372465818010372, "grad_norm": 0.001749674091115594, "learning_rate": 0.1265464662454566, "loss": 0.2137, "num_input_tokens_seen": 19061328, "step": 22000 }, { "epoch": 10.372465818010372, "eval_loss": 0.21637344360351562, "eval_runtime": 21.9709, "eval_samples_per_second": 42.92, "eval_steps_per_second": 21.483, "num_input_tokens_seen": 19061328, "step": 22000 }, { "epoch": 10.374823196605375, "grad_norm": 0.001183142652735114, "learning_rate": 0.12648828768616793, "loss": 0.2039, "num_input_tokens_seen": 19065376, "step": 22005 }, { "epoch": 10.377180575200377, "grad_norm": 0.0014305043732747436, "learning_rate": 0.12643011275268085, "loss": 0.2252, "num_input_tokens_seen": 19069360, "step": 22010 }, { "epoch": 10.37953795379538, "grad_norm": 0.0009327513398602605, "learning_rate": 0.1263719414539665, "loss": 0.1765, "num_input_tokens_seen": 19073296, "step": 22015 }, { "epoch": 10.381895332390382, "grad_norm": 0.0016643195413053036, "learning_rate": 0.1263137737989957, "loss": 0.1986, "num_input_tokens_seen": 19077008, "step": 22020 }, { "epoch": 10.384252710985384, "grad_norm": 0.0025107041001319885, "learning_rate": 0.1262556097967387, "loss": 0.2359, "num_input_tokens_seen": 19081856, "step": 22025 }, { "epoch": 10.386610089580387, "grad_norm": 0.001126704621128738, "learning_rate": 0.126197449456165, "loss": 0.2059, "num_input_tokens_seen": 19086576, "step": 22030 }, { "epoch": 10.38896746817539, "grad_norm": 0.0016975468024611473, "learning_rate": 0.12613929278624378, "loss": 0.2447, "num_input_tokens_seen": 19091408, "step": 22035 }, { "epoch": 10.391324846770392, "grad_norm": 0.0032961175311356783, "learning_rate": 0.12608113979594343, "loss": 0.2263, "num_input_tokens_seen": 19096608, "step": 22040 }, { "epoch": 10.393682225365394, "grad_norm": 0.001177679980173707, "learning_rate": 0.1260229904942319, "loss": 0.2087, "num_input_tokens_seen": 19100768, "step": 22045 }, { "epoch": 10.396039603960396, "grad_norm": 0.002918731886893511, "learning_rate": 0.12596484489007662, "loss": 0.2051, "num_input_tokens_seen": 19105440, "step": 22050 }, { "epoch": 10.398396982555399, "grad_norm": 0.0013701600255444646, "learning_rate": 0.1259067029924442, "loss": 0.2047, "num_input_tokens_seen": 19109488, "step": 22055 }, { "epoch": 10.400754361150401, "grad_norm": 0.002191714243963361, "learning_rate": 0.12584856481030096, "loss": 0.2083, "num_input_tokens_seen": 19113808, "step": 22060 }, { "epoch": 10.403111739745404, "grad_norm": 0.0009776485385373235, "learning_rate": 0.12579043035261261, "loss": 0.2069, "num_input_tokens_seen": 19117488, "step": 22065 }, { "epoch": 10.405469118340406, "grad_norm": 0.0017057891236618161, "learning_rate": 0.1257322996283441, "loss": 0.2275, "num_input_tokens_seen": 19122448, "step": 22070 }, { "epoch": 10.407826496935408, "grad_norm": 0.0008475471404381096, "learning_rate": 0.12567417264645994, "loss": 0.2028, "num_input_tokens_seen": 19126384, "step": 22075 }, { "epoch": 10.41018387553041, "grad_norm": 0.0010021085618063807, "learning_rate": 0.12561604941592408, "loss": 0.2502, "num_input_tokens_seen": 19130944, "step": 22080 }, { "epoch": 10.412541254125413, "grad_norm": 0.0011967484606429935, "learning_rate": 0.12555792994569978, "loss": 0.2131, "num_input_tokens_seen": 19135232, "step": 22085 }, { "epoch": 10.414898632720416, "grad_norm": 0.0009032103698700666, "learning_rate": 0.1254998142447499, "loss": 0.2237, "num_input_tokens_seen": 19139664, "step": 22090 }, { "epoch": 10.417256011315418, "grad_norm": 0.0019213058985769749, "learning_rate": 0.1254417023220365, "loss": 0.2302, "num_input_tokens_seen": 19144336, "step": 22095 }, { "epoch": 10.41961338991042, "grad_norm": 0.001135032158344984, "learning_rate": 0.12538359418652126, "loss": 0.1976, "num_input_tokens_seen": 19150320, "step": 22100 }, { "epoch": 10.421970768505423, "grad_norm": 0.0012120010796934366, "learning_rate": 0.12532548984716513, "loss": 0.2029, "num_input_tokens_seen": 19154768, "step": 22105 }, { "epoch": 10.424328147100425, "grad_norm": 0.001986401155591011, "learning_rate": 0.12526738931292855, "loss": 0.2255, "num_input_tokens_seen": 19158976, "step": 22110 }, { "epoch": 10.426685525695426, "grad_norm": 0.0014710555551573634, "learning_rate": 0.1252092925927714, "loss": 0.2616, "num_input_tokens_seen": 19162960, "step": 22115 }, { "epoch": 10.429042904290428, "grad_norm": 0.001634966116398573, "learning_rate": 0.12515119969565278, "loss": 0.2467, "num_input_tokens_seen": 19167296, "step": 22120 }, { "epoch": 10.43140028288543, "grad_norm": 0.0015396481612697244, "learning_rate": 0.12509311063053144, "loss": 0.2358, "num_input_tokens_seen": 19171104, "step": 22125 }, { "epoch": 10.433757661480433, "grad_norm": 0.0010717500699684024, "learning_rate": 0.1250350254063655, "loss": 0.2317, "num_input_tokens_seen": 19174880, "step": 22130 }, { "epoch": 10.436115040075435, "grad_norm": 0.001125298091210425, "learning_rate": 0.1249769440321123, "loss": 0.2419, "num_input_tokens_seen": 19179296, "step": 22135 }, { "epoch": 10.438472418670438, "grad_norm": 0.001966638257727027, "learning_rate": 0.12491886651672884, "loss": 0.2202, "num_input_tokens_seen": 19183504, "step": 22140 }, { "epoch": 10.44082979726544, "grad_norm": 0.0013706288300454617, "learning_rate": 0.12486079286917139, "loss": 0.2318, "num_input_tokens_seen": 19188096, "step": 22145 }, { "epoch": 10.443187175860443, "grad_norm": 0.0011789592681452632, "learning_rate": 0.12480272309839553, "loss": 0.2247, "num_input_tokens_seen": 19192464, "step": 22150 }, { "epoch": 10.445544554455445, "grad_norm": 0.0007968675927259028, "learning_rate": 0.12474465721335648, "loss": 0.2241, "num_input_tokens_seen": 19196576, "step": 22155 }, { "epoch": 10.447901933050447, "grad_norm": 0.0009436819818802178, "learning_rate": 0.12468659522300861, "loss": 0.2351, "num_input_tokens_seen": 19200784, "step": 22160 }, { "epoch": 10.45025931164545, "grad_norm": 0.0018445131136104465, "learning_rate": 0.12462853713630584, "loss": 0.2343, "num_input_tokens_seen": 19204864, "step": 22165 }, { "epoch": 10.452616690240452, "grad_norm": 0.0012664683163166046, "learning_rate": 0.12457048296220156, "loss": 0.232, "num_input_tokens_seen": 19209232, "step": 22170 }, { "epoch": 10.454974068835455, "grad_norm": 0.002282599685713649, "learning_rate": 0.12451243270964832, "loss": 0.2263, "num_input_tokens_seen": 19213552, "step": 22175 }, { "epoch": 10.457331447430457, "grad_norm": 0.001316384761594236, "learning_rate": 0.12445438638759827, "loss": 0.2257, "num_input_tokens_seen": 19218224, "step": 22180 }, { "epoch": 10.45968882602546, "grad_norm": 0.0010058984626084566, "learning_rate": 0.1243963440050029, "loss": 0.2061, "num_input_tokens_seen": 19222880, "step": 22185 }, { "epoch": 10.462046204620462, "grad_norm": 0.002403038088232279, "learning_rate": 0.12433830557081298, "loss": 0.2533, "num_input_tokens_seen": 19227040, "step": 22190 }, { "epoch": 10.464403583215464, "grad_norm": 0.004003562498837709, "learning_rate": 0.12428027109397889, "loss": 0.2293, "num_input_tokens_seen": 19231008, "step": 22195 }, { "epoch": 10.466760961810467, "grad_norm": 0.0011721281334757805, "learning_rate": 0.12422224058345015, "loss": 0.2118, "num_input_tokens_seen": 19236176, "step": 22200 }, { "epoch": 10.466760961810467, "eval_loss": 0.2165602296590805, "eval_runtime": 21.8687, "eval_samples_per_second": 43.121, "eval_steps_per_second": 21.583, "num_input_tokens_seen": 19236176, "step": 22200 }, { "epoch": 10.469118340405469, "grad_norm": 0.002238075016066432, "learning_rate": 0.12416421404817583, "loss": 0.2135, "num_input_tokens_seen": 19239840, "step": 22205 }, { "epoch": 10.471475719000471, "grad_norm": 0.001323640695773065, "learning_rate": 0.12410619149710447, "loss": 0.2421, "num_input_tokens_seen": 19244080, "step": 22210 }, { "epoch": 10.473833097595474, "grad_norm": 0.000849499658215791, "learning_rate": 0.12404817293918374, "loss": 0.2487, "num_input_tokens_seen": 19247856, "step": 22215 }, { "epoch": 10.476190476190476, "grad_norm": 0.0012140254257246852, "learning_rate": 0.12399015838336086, "loss": 0.2171, "num_input_tokens_seen": 19251760, "step": 22220 }, { "epoch": 10.478547854785479, "grad_norm": 0.0019237084779888391, "learning_rate": 0.12393214783858246, "loss": 0.2062, "num_input_tokens_seen": 19256480, "step": 22225 }, { "epoch": 10.480905233380481, "grad_norm": 0.0017789190169423819, "learning_rate": 0.1238741413137944, "loss": 0.2512, "num_input_tokens_seen": 19260720, "step": 22230 }, { "epoch": 10.483262611975483, "grad_norm": 0.0009467643685638905, "learning_rate": 0.12381613881794212, "loss": 0.2171, "num_input_tokens_seen": 19264672, "step": 22235 }, { "epoch": 10.485619990570486, "grad_norm": 0.002289701486006379, "learning_rate": 0.12375814035997022, "loss": 0.2002, "num_input_tokens_seen": 19269312, "step": 22240 }, { "epoch": 10.487977369165488, "grad_norm": 0.001704637543298304, "learning_rate": 0.12370014594882285, "loss": 0.2272, "num_input_tokens_seen": 19273920, "step": 22245 }, { "epoch": 10.49033474776049, "grad_norm": 0.0008350195712409914, "learning_rate": 0.12364215559344356, "loss": 0.2048, "num_input_tokens_seen": 19278112, "step": 22250 }, { "epoch": 10.492692126355493, "grad_norm": 0.0017205007607117295, "learning_rate": 0.12358416930277506, "loss": 0.2258, "num_input_tokens_seen": 19282784, "step": 22255 }, { "epoch": 10.495049504950495, "grad_norm": 0.0012135091237723827, "learning_rate": 0.1235261870857596, "loss": 0.2063, "num_input_tokens_seen": 19287840, "step": 22260 }, { "epoch": 10.497406883545498, "grad_norm": 0.0016083796508610249, "learning_rate": 0.12346820895133884, "loss": 0.2142, "num_input_tokens_seen": 19291664, "step": 22265 }, { "epoch": 10.4997642621405, "grad_norm": 0.0015308193396776915, "learning_rate": 0.12341023490845361, "loss": 0.2064, "num_input_tokens_seen": 19295872, "step": 22270 }, { "epoch": 10.502121640735503, "grad_norm": 0.0011116726091131568, "learning_rate": 0.12335226496604437, "loss": 0.2, "num_input_tokens_seen": 19300320, "step": 22275 }, { "epoch": 10.504479019330505, "grad_norm": 0.0010352479293942451, "learning_rate": 0.12329429913305069, "loss": 0.2634, "num_input_tokens_seen": 19303872, "step": 22280 }, { "epoch": 10.506836397925507, "grad_norm": 0.0016570861916989088, "learning_rate": 0.12323633741841171, "loss": 0.2394, "num_input_tokens_seen": 19308656, "step": 22285 }, { "epoch": 10.50919377652051, "grad_norm": 0.002036277437582612, "learning_rate": 0.12317837983106583, "loss": 0.219, "num_input_tokens_seen": 19312928, "step": 22290 }, { "epoch": 10.511551155115512, "grad_norm": 0.0009706165874376893, "learning_rate": 0.12312042637995087, "loss": 0.21, "num_input_tokens_seen": 19316912, "step": 22295 }, { "epoch": 10.513908533710515, "grad_norm": 0.001657850225456059, "learning_rate": 0.12306247707400389, "loss": 0.2245, "num_input_tokens_seen": 19321040, "step": 22300 }, { "epoch": 10.516265912305517, "grad_norm": 0.0011152183869853616, "learning_rate": 0.12300453192216154, "loss": 0.2439, "num_input_tokens_seen": 19324960, "step": 22305 }, { "epoch": 10.518623290900518, "grad_norm": 0.001008142833597958, "learning_rate": 0.12294659093335956, "loss": 0.2269, "num_input_tokens_seen": 19329648, "step": 22310 }, { "epoch": 10.520980669495522, "grad_norm": 0.0024475669488310814, "learning_rate": 0.12288865411653327, "loss": 0.2352, "num_input_tokens_seen": 19333296, "step": 22315 }, { "epoch": 10.523338048090523, "grad_norm": 0.0009934682166203856, "learning_rate": 0.12283072148061717, "loss": 0.2375, "num_input_tokens_seen": 19337952, "step": 22320 }, { "epoch": 10.525695426685525, "grad_norm": 0.0014270105166360736, "learning_rate": 0.12277279303454529, "loss": 0.2279, "num_input_tokens_seen": 19341616, "step": 22325 }, { "epoch": 10.528052805280527, "grad_norm": 0.0014033850748091936, "learning_rate": 0.12271486878725091, "loss": 0.2267, "num_input_tokens_seen": 19345568, "step": 22330 }, { "epoch": 10.53041018387553, "grad_norm": 0.0014108485775068402, "learning_rate": 0.12265694874766658, "loss": 0.2254, "num_input_tokens_seen": 19349568, "step": 22335 }, { "epoch": 10.532767562470532, "grad_norm": 0.0009394697262905538, "learning_rate": 0.12259903292472435, "loss": 0.2194, "num_input_tokens_seen": 19352816, "step": 22340 }, { "epoch": 10.535124941065535, "grad_norm": 0.0010639302199706435, "learning_rate": 0.12254112132735567, "loss": 0.2169, "num_input_tokens_seen": 19357392, "step": 22345 }, { "epoch": 10.537482319660537, "grad_norm": 0.0010175523348152637, "learning_rate": 0.12248321396449108, "loss": 0.2341, "num_input_tokens_seen": 19361376, "step": 22350 }, { "epoch": 10.53983969825554, "grad_norm": 0.001321848132647574, "learning_rate": 0.12242531084506075, "loss": 0.2332, "num_input_tokens_seen": 19365680, "step": 22355 }, { "epoch": 10.542197076850542, "grad_norm": 0.0010497123003005981, "learning_rate": 0.122367411977994, "loss": 0.2318, "num_input_tokens_seen": 19370256, "step": 22360 }, { "epoch": 10.544554455445544, "grad_norm": 0.00101200130302459, "learning_rate": 0.12230951737221954, "loss": 0.2122, "num_input_tokens_seen": 19373936, "step": 22365 }, { "epoch": 10.546911834040547, "grad_norm": 0.0010317484848201275, "learning_rate": 0.12225162703666555, "loss": 0.2304, "num_input_tokens_seen": 19378096, "step": 22370 }, { "epoch": 10.549269212635549, "grad_norm": 0.002733059925958514, "learning_rate": 0.1221937409802593, "loss": 0.2341, "num_input_tokens_seen": 19383152, "step": 22375 }, { "epoch": 10.551626591230551, "grad_norm": 0.0022361630108207464, "learning_rate": 0.12213585921192768, "loss": 0.2058, "num_input_tokens_seen": 19388176, "step": 22380 }, { "epoch": 10.553983969825554, "grad_norm": 0.0015509643126279116, "learning_rate": 0.1220779817405967, "loss": 0.2464, "num_input_tokens_seen": 19392720, "step": 22385 }, { "epoch": 10.556341348420556, "grad_norm": 0.00231257826089859, "learning_rate": 0.12202010857519181, "loss": 0.2198, "num_input_tokens_seen": 19396944, "step": 22390 }, { "epoch": 10.558698727015559, "grad_norm": 0.0010198347736150026, "learning_rate": 0.12196223972463785, "loss": 0.2367, "num_input_tokens_seen": 19400576, "step": 22395 }, { "epoch": 10.561056105610561, "grad_norm": 0.000954817864112556, "learning_rate": 0.12190437519785885, "loss": 0.208, "num_input_tokens_seen": 19404288, "step": 22400 }, { "epoch": 10.561056105610561, "eval_loss": 0.21777023375034332, "eval_runtime": 21.9132, "eval_samples_per_second": 43.034, "eval_steps_per_second": 21.54, "num_input_tokens_seen": 19404288, "step": 22400 }, { "epoch": 10.563413484205563, "grad_norm": 0.0006534851272590458, "learning_rate": 0.12184651500377823, "loss": 0.2141, "num_input_tokens_seen": 19407504, "step": 22405 }, { "epoch": 10.565770862800566, "grad_norm": 0.000744357705116272, "learning_rate": 0.12178865915131885, "loss": 0.226, "num_input_tokens_seen": 19411504, "step": 22410 }, { "epoch": 10.568128241395568, "grad_norm": 0.001136588747613132, "learning_rate": 0.1217308076494027, "loss": 0.2159, "num_input_tokens_seen": 19415856, "step": 22415 }, { "epoch": 10.57048561999057, "grad_norm": 0.0008811786537989974, "learning_rate": 0.12167296050695134, "loss": 0.2348, "num_input_tokens_seen": 19420304, "step": 22420 }, { "epoch": 10.572842998585573, "grad_norm": 0.0009200049098581076, "learning_rate": 0.12161511773288536, "loss": 0.2361, "num_input_tokens_seen": 19424640, "step": 22425 }, { "epoch": 10.575200377180575, "grad_norm": 0.0010602250695228577, "learning_rate": 0.121557279336125, "loss": 0.2243, "num_input_tokens_seen": 19429072, "step": 22430 }, { "epoch": 10.577557755775578, "grad_norm": 0.0009873583912849426, "learning_rate": 0.12149944532558957, "loss": 0.2196, "num_input_tokens_seen": 19433232, "step": 22435 }, { "epoch": 10.57991513437058, "grad_norm": 0.0016008374514058232, "learning_rate": 0.12144161571019785, "loss": 0.2128, "num_input_tokens_seen": 19438240, "step": 22440 }, { "epoch": 10.582272512965583, "grad_norm": 0.0009181021014228463, "learning_rate": 0.12138379049886781, "loss": 0.2155, "num_input_tokens_seen": 19442832, "step": 22445 }, { "epoch": 10.584629891560585, "grad_norm": 0.0011158337583765388, "learning_rate": 0.12132596970051697, "loss": 0.2444, "num_input_tokens_seen": 19447152, "step": 22450 }, { "epoch": 10.586987270155587, "grad_norm": 0.002858510008081794, "learning_rate": 0.12126815332406189, "loss": 0.2096, "num_input_tokens_seen": 19451872, "step": 22455 }, { "epoch": 10.58934464875059, "grad_norm": 0.0010608130833134055, "learning_rate": 0.12121034137841868, "loss": 0.2306, "num_input_tokens_seen": 19456448, "step": 22460 }, { "epoch": 10.591702027345592, "grad_norm": 0.0011453211773186922, "learning_rate": 0.12115253387250258, "loss": 0.1844, "num_input_tokens_seen": 19460896, "step": 22465 }, { "epoch": 10.594059405940595, "grad_norm": 0.001133226789534092, "learning_rate": 0.12109473081522831, "loss": 0.1907, "num_input_tokens_seen": 19465088, "step": 22470 }, { "epoch": 10.596416784535597, "grad_norm": 0.0020264792256057262, "learning_rate": 0.12103693221550982, "loss": 0.1822, "num_input_tokens_seen": 19469232, "step": 22475 }, { "epoch": 10.5987741631306, "grad_norm": 0.002018054248765111, "learning_rate": 0.12097913808226027, "loss": 0.2356, "num_input_tokens_seen": 19473424, "step": 22480 }, { "epoch": 10.601131541725602, "grad_norm": 0.0011062873527407646, "learning_rate": 0.12092134842439234, "loss": 0.246, "num_input_tokens_seen": 19477728, "step": 22485 }, { "epoch": 10.603488920320604, "grad_norm": 0.0013154511107131839, "learning_rate": 0.12086356325081798, "loss": 0.262, "num_input_tokens_seen": 19481888, "step": 22490 }, { "epoch": 10.605846298915607, "grad_norm": 0.0033291231375187635, "learning_rate": 0.12080578257044824, "loss": 0.2015, "num_input_tokens_seen": 19485872, "step": 22495 }, { "epoch": 10.608203677510609, "grad_norm": 0.0007461765198968351, "learning_rate": 0.12074800639219378, "loss": 0.2107, "num_input_tokens_seen": 19489952, "step": 22500 }, { "epoch": 10.61056105610561, "grad_norm": 0.0017839580541476607, "learning_rate": 0.12069023472496428, "loss": 0.2118, "num_input_tokens_seen": 19493568, "step": 22505 }, { "epoch": 10.612918434700614, "grad_norm": 0.0012284445110708475, "learning_rate": 0.12063246757766893, "loss": 0.2465, "num_input_tokens_seen": 19497216, "step": 22510 }, { "epoch": 10.615275813295614, "grad_norm": 0.001055700471624732, "learning_rate": 0.12057470495921618, "loss": 0.213, "num_input_tokens_seen": 19501984, "step": 22515 }, { "epoch": 10.617633191890617, "grad_norm": 0.0009513085824437439, "learning_rate": 0.12051694687851364, "loss": 0.2249, "num_input_tokens_seen": 19506528, "step": 22520 }, { "epoch": 10.61999057048562, "grad_norm": 0.0009714479092508554, "learning_rate": 0.12045919334446839, "loss": 0.209, "num_input_tokens_seen": 19510784, "step": 22525 }, { "epoch": 10.622347949080622, "grad_norm": 0.001339233247563243, "learning_rate": 0.12040144436598683, "loss": 0.2142, "num_input_tokens_seen": 19515392, "step": 22530 }, { "epoch": 10.624705327675624, "grad_norm": 0.0016931908903643489, "learning_rate": 0.12034369995197444, "loss": 0.224, "num_input_tokens_seen": 19519424, "step": 22535 }, { "epoch": 10.627062706270626, "grad_norm": 0.0008039309759624302, "learning_rate": 0.12028596011133627, "loss": 0.2201, "num_input_tokens_seen": 19523424, "step": 22540 }, { "epoch": 10.629420084865629, "grad_norm": 0.0011120367562398314, "learning_rate": 0.12022822485297643, "loss": 0.2333, "num_input_tokens_seen": 19527904, "step": 22545 }, { "epoch": 10.631777463460631, "grad_norm": 0.0018162421183660626, "learning_rate": 0.12017049418579843, "loss": 0.2011, "num_input_tokens_seen": 19532192, "step": 22550 }, { "epoch": 10.634134842055634, "grad_norm": 0.0018988415831699967, "learning_rate": 0.12011276811870514, "loss": 0.2266, "num_input_tokens_seen": 19536992, "step": 22555 }, { "epoch": 10.636492220650636, "grad_norm": 0.002750081242993474, "learning_rate": 0.12005504666059852, "loss": 0.1877, "num_input_tokens_seen": 19541168, "step": 22560 }, { "epoch": 10.638849599245638, "grad_norm": 0.0010525877587497234, "learning_rate": 0.11999732982038003, "loss": 0.2223, "num_input_tokens_seen": 19545024, "step": 22565 }, { "epoch": 10.64120697784064, "grad_norm": 0.0027161044999957085, "learning_rate": 0.11993961760695038, "loss": 0.2283, "num_input_tokens_seen": 19549120, "step": 22570 }, { "epoch": 10.643564356435643, "grad_norm": 0.0010227242019027472, "learning_rate": 0.11988191002920942, "loss": 0.2197, "num_input_tokens_seen": 19553520, "step": 22575 }, { "epoch": 10.645921735030646, "grad_norm": 0.0016675328370183706, "learning_rate": 0.11982420709605641, "loss": 0.2349, "num_input_tokens_seen": 19558096, "step": 22580 }, { "epoch": 10.648279113625648, "grad_norm": 0.0018678001360967755, "learning_rate": 0.11976650881638991, "loss": 0.2082, "num_input_tokens_seen": 19562224, "step": 22585 }, { "epoch": 10.65063649222065, "grad_norm": 0.0014401058433577418, "learning_rate": 0.11970881519910764, "loss": 0.2128, "num_input_tokens_seen": 19566000, "step": 22590 }, { "epoch": 10.652993870815653, "grad_norm": 0.0012255280744284391, "learning_rate": 0.1196511262531068, "loss": 0.2505, "num_input_tokens_seen": 19569680, "step": 22595 }, { "epoch": 10.655351249410655, "grad_norm": 0.001619109301827848, "learning_rate": 0.11959344198728361, "loss": 0.2201, "num_input_tokens_seen": 19574224, "step": 22600 }, { "epoch": 10.655351249410655, "eval_loss": 0.2144869565963745, "eval_runtime": 21.9154, "eval_samples_per_second": 43.029, "eval_steps_per_second": 21.537, "num_input_tokens_seen": 19574224, "step": 22600 }, { "epoch": 10.657708628005658, "grad_norm": 0.0012130734976381063, "learning_rate": 0.11953576241053378, "loss": 0.2371, "num_input_tokens_seen": 19578000, "step": 22605 }, { "epoch": 10.66006600660066, "grad_norm": 0.0009198078187182546, "learning_rate": 0.11947808753175228, "loss": 0.2005, "num_input_tokens_seen": 19581552, "step": 22610 }, { "epoch": 10.662423385195662, "grad_norm": 0.0015829779440537095, "learning_rate": 0.1194204173598332, "loss": 0.1999, "num_input_tokens_seen": 19585184, "step": 22615 }, { "epoch": 10.664780763790665, "grad_norm": 0.0023783536162227392, "learning_rate": 0.11936275190367007, "loss": 0.2211, "num_input_tokens_seen": 19589840, "step": 22620 }, { "epoch": 10.667138142385667, "grad_norm": 0.0006507565267384052, "learning_rate": 0.11930509117215563, "loss": 0.2127, "num_input_tokens_seen": 19593744, "step": 22625 }, { "epoch": 10.66949552098067, "grad_norm": 0.0011839231010526419, "learning_rate": 0.11924743517418179, "loss": 0.1765, "num_input_tokens_seen": 19598112, "step": 22630 }, { "epoch": 10.671852899575672, "grad_norm": 0.0015322677791118622, "learning_rate": 0.11918978391864, "loss": 0.2543, "num_input_tokens_seen": 19602320, "step": 22635 }, { "epoch": 10.674210278170674, "grad_norm": 0.0008136096294037998, "learning_rate": 0.11913213741442065, "loss": 0.2067, "num_input_tokens_seen": 19606416, "step": 22640 }, { "epoch": 10.676567656765677, "grad_norm": 0.0014673310797661543, "learning_rate": 0.11907449567041364, "loss": 0.2607, "num_input_tokens_seen": 19610896, "step": 22645 }, { "epoch": 10.67892503536068, "grad_norm": 0.00149513257201761, "learning_rate": 0.11901685869550803, "loss": 0.2148, "num_input_tokens_seen": 19615904, "step": 22650 }, { "epoch": 10.681282413955682, "grad_norm": 0.0008098832331597805, "learning_rate": 0.1189592264985922, "loss": 0.2228, "num_input_tokens_seen": 19620096, "step": 22655 }, { "epoch": 10.683639792550684, "grad_norm": 0.0010021724738180637, "learning_rate": 0.11890159908855373, "loss": 0.2287, "num_input_tokens_seen": 19623712, "step": 22660 }, { "epoch": 10.685997171145686, "grad_norm": 0.0018505444750189781, "learning_rate": 0.11884397647427941, "loss": 0.2046, "num_input_tokens_seen": 19628432, "step": 22665 }, { "epoch": 10.688354549740689, "grad_norm": 0.0008550783968530595, "learning_rate": 0.11878635866465546, "loss": 0.2025, "num_input_tokens_seen": 19631952, "step": 22670 }, { "epoch": 10.690711928335691, "grad_norm": 0.0009578528697602451, "learning_rate": 0.11872874566856734, "loss": 0.2203, "num_input_tokens_seen": 19636432, "step": 22675 }, { "epoch": 10.693069306930694, "grad_norm": 0.0013436555163934827, "learning_rate": 0.11867113749489955, "loss": 0.2039, "num_input_tokens_seen": 19641120, "step": 22680 }, { "epoch": 10.695426685525696, "grad_norm": 0.0012823735596612096, "learning_rate": 0.11861353415253607, "loss": 0.2605, "num_input_tokens_seen": 19645216, "step": 22685 }, { "epoch": 10.697784064120698, "grad_norm": 0.0011053390335291624, "learning_rate": 0.11855593565036011, "loss": 0.2253, "num_input_tokens_seen": 19649232, "step": 22690 }, { "epoch": 10.700141442715701, "grad_norm": 0.0015341602265834808, "learning_rate": 0.11849834199725394, "loss": 0.2464, "num_input_tokens_seen": 19653344, "step": 22695 }, { "epoch": 10.702498821310703, "grad_norm": 0.0008902903646230698, "learning_rate": 0.1184407532020994, "loss": 0.2098, "num_input_tokens_seen": 19657904, "step": 22700 }, { "epoch": 10.704856199905706, "grad_norm": 0.0007464297232218087, "learning_rate": 0.11838316927377723, "loss": 0.2294, "num_input_tokens_seen": 19662368, "step": 22705 }, { "epoch": 10.707213578500706, "grad_norm": 0.0011085174046456814, "learning_rate": 0.11832559022116766, "loss": 0.2135, "num_input_tokens_seen": 19666608, "step": 22710 }, { "epoch": 10.70957095709571, "grad_norm": 0.0010259922128170729, "learning_rate": 0.11826801605315022, "loss": 0.1963, "num_input_tokens_seen": 19671088, "step": 22715 }, { "epoch": 10.711928335690711, "grad_norm": 0.0010548167629167438, "learning_rate": 0.1182104467786034, "loss": 0.213, "num_input_tokens_seen": 19675568, "step": 22720 }, { "epoch": 10.714285714285714, "grad_norm": 0.0016320622526109219, "learning_rate": 0.1181528824064052, "loss": 0.1927, "num_input_tokens_seen": 19680336, "step": 22725 }, { "epoch": 10.716643092880716, "grad_norm": 0.0008820195798762143, "learning_rate": 0.11809532294543279, "loss": 0.2039, "num_input_tokens_seen": 19683920, "step": 22730 }, { "epoch": 10.719000471475718, "grad_norm": 0.0011103474535048008, "learning_rate": 0.11803776840456245, "loss": 0.1979, "num_input_tokens_seen": 19689360, "step": 22735 }, { "epoch": 10.72135785007072, "grad_norm": 0.001783317537046969, "learning_rate": 0.11798021879266997, "loss": 0.2301, "num_input_tokens_seen": 19693200, "step": 22740 }, { "epoch": 10.723715228665723, "grad_norm": 0.0015118020819500089, "learning_rate": 0.11792267411863006, "loss": 0.1766, "num_input_tokens_seen": 19697216, "step": 22745 }, { "epoch": 10.726072607260726, "grad_norm": 0.0018448153277859092, "learning_rate": 0.1178651343913169, "loss": 0.2207, "num_input_tokens_seen": 19701952, "step": 22750 }, { "epoch": 10.728429985855728, "grad_norm": 0.0021835092920809984, "learning_rate": 0.11780759961960392, "loss": 0.2509, "num_input_tokens_seen": 19706176, "step": 22755 }, { "epoch": 10.73078736445073, "grad_norm": 0.0009327690349891782, "learning_rate": 0.1177500698123636, "loss": 0.2119, "num_input_tokens_seen": 19710400, "step": 22760 }, { "epoch": 10.733144743045733, "grad_norm": 0.001328403246589005, "learning_rate": 0.11769254497846778, "loss": 0.2537, "num_input_tokens_seen": 19714832, "step": 22765 }, { "epoch": 10.735502121640735, "grad_norm": 0.0008197848801501095, "learning_rate": 0.11763502512678758, "loss": 0.2328, "num_input_tokens_seen": 19718592, "step": 22770 }, { "epoch": 10.737859500235738, "grad_norm": 0.0009917898569256067, "learning_rate": 0.11757751026619315, "loss": 0.2197, "num_input_tokens_seen": 19722464, "step": 22775 }, { "epoch": 10.74021687883074, "grad_norm": 0.0021469490602612495, "learning_rate": 0.11752000040555416, "loss": 0.2285, "num_input_tokens_seen": 19727200, "step": 22780 }, { "epoch": 10.742574257425742, "grad_norm": 0.0010109434369951487, "learning_rate": 0.11746249555373921, "loss": 0.2037, "num_input_tokens_seen": 19732000, "step": 22785 }, { "epoch": 10.744931636020745, "grad_norm": 0.0012190317502245307, "learning_rate": 0.11740499571961638, "loss": 0.1956, "num_input_tokens_seen": 19735808, "step": 22790 }, { "epoch": 10.747289014615747, "grad_norm": 0.0009148928220383823, "learning_rate": 0.11734750091205279, "loss": 0.2184, "num_input_tokens_seen": 19740624, "step": 22795 }, { "epoch": 10.74964639321075, "grad_norm": 0.0012424859451130033, "learning_rate": 0.11729001113991493, "loss": 0.2331, "num_input_tokens_seen": 19744496, "step": 22800 }, { "epoch": 10.74964639321075, "eval_loss": 0.21467708051204681, "eval_runtime": 21.9515, "eval_samples_per_second": 42.958, "eval_steps_per_second": 21.502, "num_input_tokens_seen": 19744496, "step": 22800 }, { "epoch": 10.752003771805752, "grad_norm": 0.0007923890952952206, "learning_rate": 0.11723252641206837, "loss": 0.2158, "num_input_tokens_seen": 19748960, "step": 22805 }, { "epoch": 10.754361150400754, "grad_norm": 0.0008607939817011356, "learning_rate": 0.11717504673737808, "loss": 0.2293, "num_input_tokens_seen": 19752928, "step": 22810 }, { "epoch": 10.756718528995757, "grad_norm": 0.0013919376069679856, "learning_rate": 0.11711757212470802, "loss": 0.2137, "num_input_tokens_seen": 19756784, "step": 22815 }, { "epoch": 10.75907590759076, "grad_norm": 0.0011429657461121678, "learning_rate": 0.11706010258292165, "loss": 0.2, "num_input_tokens_seen": 19761248, "step": 22820 }, { "epoch": 10.761433286185762, "grad_norm": 0.000871662050485611, "learning_rate": 0.11700263812088131, "loss": 0.1905, "num_input_tokens_seen": 19764736, "step": 22825 }, { "epoch": 10.763790664780764, "grad_norm": 0.0010070183780044317, "learning_rate": 0.11694517874744892, "loss": 0.1878, "num_input_tokens_seen": 19768704, "step": 22830 }, { "epoch": 10.766148043375766, "grad_norm": 0.002095734467729926, "learning_rate": 0.11688772447148532, "loss": 0.2114, "num_input_tokens_seen": 19773488, "step": 22835 }, { "epoch": 10.768505421970769, "grad_norm": 0.0010858295718207955, "learning_rate": 0.11683027530185074, "loss": 0.1957, "num_input_tokens_seen": 19777360, "step": 22840 }, { "epoch": 10.770862800565771, "grad_norm": 0.0021071031223982573, "learning_rate": 0.11677283124740451, "loss": 0.2217, "num_input_tokens_seen": 19781904, "step": 22845 }, { "epoch": 10.773220179160774, "grad_norm": 0.0010490230051800609, "learning_rate": 0.11671539231700531, "loss": 0.2093, "num_input_tokens_seen": 19785472, "step": 22850 }, { "epoch": 10.775577557755776, "grad_norm": 0.0016758970450609922, "learning_rate": 0.11665795851951084, "loss": 0.1718, "num_input_tokens_seen": 19790160, "step": 22855 }, { "epoch": 10.777934936350778, "grad_norm": 0.0012555245775729418, "learning_rate": 0.11660052986377825, "loss": 0.1764, "num_input_tokens_seen": 19794448, "step": 22860 }, { "epoch": 10.78029231494578, "grad_norm": 0.0008648642105981708, "learning_rate": 0.1165431063586636, "loss": 0.2643, "num_input_tokens_seen": 19798560, "step": 22865 }, { "epoch": 10.782649693540783, "grad_norm": 0.002352221170440316, "learning_rate": 0.11648568801302245, "loss": 0.2543, "num_input_tokens_seen": 19803056, "step": 22870 }, { "epoch": 10.785007072135786, "grad_norm": 0.001754462136887014, "learning_rate": 0.11642827483570937, "loss": 0.1859, "num_input_tokens_seen": 19807056, "step": 22875 }, { "epoch": 10.787364450730788, "grad_norm": 0.0010804458288475871, "learning_rate": 0.11637086683557815, "loss": 0.2188, "num_input_tokens_seen": 19811808, "step": 22880 }, { "epoch": 10.78972182932579, "grad_norm": 0.0011585332686081529, "learning_rate": 0.11631346402148188, "loss": 0.2301, "num_input_tokens_seen": 19816032, "step": 22885 }, { "epoch": 10.792079207920793, "grad_norm": 0.001051040948368609, "learning_rate": 0.11625606640227285, "loss": 0.2388, "num_input_tokens_seen": 19820112, "step": 22890 }, { "epoch": 10.794436586515795, "grad_norm": 0.0015964859630912542, "learning_rate": 0.11619867398680238, "loss": 0.2238, "num_input_tokens_seen": 19824656, "step": 22895 }, { "epoch": 10.796793965110798, "grad_norm": 0.001063556526787579, "learning_rate": 0.11614128678392119, "loss": 0.2235, "num_input_tokens_seen": 19828832, "step": 22900 }, { "epoch": 10.799151343705798, "grad_norm": 0.0009501239401288331, "learning_rate": 0.11608390480247906, "loss": 0.2216, "num_input_tokens_seen": 19833072, "step": 22905 }, { "epoch": 10.801508722300802, "grad_norm": 0.0031957945320755243, "learning_rate": 0.11602652805132499, "loss": 0.2272, "num_input_tokens_seen": 19836976, "step": 22910 }, { "epoch": 10.803866100895803, "grad_norm": 0.0017655331175774336, "learning_rate": 0.11596915653930731, "loss": 0.2241, "num_input_tokens_seen": 19841248, "step": 22915 }, { "epoch": 10.806223479490805, "grad_norm": 0.0008380957879126072, "learning_rate": 0.11591179027527328, "loss": 0.2009, "num_input_tokens_seen": 19845728, "step": 22920 }, { "epoch": 10.808580858085808, "grad_norm": 0.0007886758539825678, "learning_rate": 0.11585442926806956, "loss": 0.2118, "num_input_tokens_seen": 19850640, "step": 22925 }, { "epoch": 10.81093823668081, "grad_norm": 0.002840352011844516, "learning_rate": 0.11579707352654202, "loss": 0.2117, "num_input_tokens_seen": 19857152, "step": 22930 }, { "epoch": 10.813295615275813, "grad_norm": 0.0007142882095649838, "learning_rate": 0.11573972305953548, "loss": 0.2228, "num_input_tokens_seen": 19860704, "step": 22935 }, { "epoch": 10.815652993870815, "grad_norm": 0.0013685787562280893, "learning_rate": 0.11568237787589426, "loss": 0.1931, "num_input_tokens_seen": 19865360, "step": 22940 }, { "epoch": 10.818010372465817, "grad_norm": 0.0014829769497737288, "learning_rate": 0.11562503798446161, "loss": 0.2534, "num_input_tokens_seen": 19870048, "step": 22945 }, { "epoch": 10.82036775106082, "grad_norm": 0.0007740053115412593, "learning_rate": 0.11556770339408005, "loss": 0.1968, "num_input_tokens_seen": 19874336, "step": 22950 }, { "epoch": 10.822725129655822, "grad_norm": 0.0013344844337552786, "learning_rate": 0.1155103741135914, "loss": 0.2129, "num_input_tokens_seen": 19877984, "step": 22955 }, { "epoch": 10.825082508250825, "grad_norm": 0.0037793894298374653, "learning_rate": 0.1154530501518364, "loss": 0.2715, "num_input_tokens_seen": 19881744, "step": 22960 }, { "epoch": 10.827439886845827, "grad_norm": 0.0007798086735419929, "learning_rate": 0.11539573151765523, "loss": 0.21, "num_input_tokens_seen": 19885920, "step": 22965 }, { "epoch": 10.82979726544083, "grad_norm": 0.0008358924533240497, "learning_rate": 0.11533841821988719, "loss": 0.1901, "num_input_tokens_seen": 19889456, "step": 22970 }, { "epoch": 10.832154644035832, "grad_norm": 0.0007854973082430661, "learning_rate": 0.11528111026737059, "loss": 0.1904, "num_input_tokens_seen": 19893696, "step": 22975 }, { "epoch": 10.834512022630834, "grad_norm": 0.0007651916239410639, "learning_rate": 0.11522380766894312, "loss": 0.2183, "num_input_tokens_seen": 19898064, "step": 22980 }, { "epoch": 10.836869401225837, "grad_norm": 0.0009095256100408733, "learning_rate": 0.11516651043344152, "loss": 0.2205, "num_input_tokens_seen": 19902480, "step": 22985 }, { "epoch": 10.839226779820839, "grad_norm": 0.0009304385748691857, "learning_rate": 0.11510921856970172, "loss": 0.2044, "num_input_tokens_seen": 19907008, "step": 22990 }, { "epoch": 10.841584158415841, "grad_norm": 0.0012064108159393072, "learning_rate": 0.11505193208655895, "loss": 0.2069, "num_input_tokens_seen": 19911696, "step": 22995 }, { "epoch": 10.843941537010844, "grad_norm": 0.0007563559338450432, "learning_rate": 0.11499465099284738, "loss": 0.2107, "num_input_tokens_seen": 19915984, "step": 23000 }, { "epoch": 10.843941537010844, "eval_loss": 0.2145698070526123, "eval_runtime": 21.8986, "eval_samples_per_second": 43.062, "eval_steps_per_second": 21.554, "num_input_tokens_seen": 19915984, "step": 23000 }, { "epoch": 10.846298915605846, "grad_norm": 0.0013370446395128965, "learning_rate": 0.1149373752974006, "loss": 0.1957, "num_input_tokens_seen": 19920032, "step": 23005 }, { "epoch": 10.848656294200849, "grad_norm": 0.0019378806464374065, "learning_rate": 0.11488010500905109, "loss": 0.2459, "num_input_tokens_seen": 19925024, "step": 23010 }, { "epoch": 10.851013672795851, "grad_norm": 0.0009166030213236809, "learning_rate": 0.11482284013663077, "loss": 0.2605, "num_input_tokens_seen": 19929424, "step": 23015 }, { "epoch": 10.853371051390853, "grad_norm": 0.001316463341936469, "learning_rate": 0.11476558068897061, "loss": 0.2484, "num_input_tokens_seen": 19934048, "step": 23020 }, { "epoch": 10.855728429985856, "grad_norm": 0.0007123580435290933, "learning_rate": 0.11470832667490061, "loss": 0.2279, "num_input_tokens_seen": 19937968, "step": 23025 }, { "epoch": 10.858085808580858, "grad_norm": 0.001273282221518457, "learning_rate": 0.11465107810325013, "loss": 0.2317, "num_input_tokens_seen": 19942416, "step": 23030 }, { "epoch": 10.86044318717586, "grad_norm": 0.0008236832218244672, "learning_rate": 0.11459383498284771, "loss": 0.2384, "num_input_tokens_seen": 19946688, "step": 23035 }, { "epoch": 10.862800565770863, "grad_norm": 0.0007184587884694338, "learning_rate": 0.11453659732252082, "loss": 0.2166, "num_input_tokens_seen": 19950592, "step": 23040 }, { "epoch": 10.865157944365865, "grad_norm": 0.0021585498470813036, "learning_rate": 0.11447936513109633, "loss": 0.2505, "num_input_tokens_seen": 19955296, "step": 23045 }, { "epoch": 10.867515322960868, "grad_norm": 0.0012633205624297261, "learning_rate": 0.11442213841740011, "loss": 0.2297, "num_input_tokens_seen": 19959328, "step": 23050 }, { "epoch": 10.86987270155587, "grad_norm": 0.000999503885395825, "learning_rate": 0.1143649171902572, "loss": 0.2343, "num_input_tokens_seen": 19963296, "step": 23055 }, { "epoch": 10.872230080150873, "grad_norm": 0.0007588480366393924, "learning_rate": 0.11430770145849194, "loss": 0.2019, "num_input_tokens_seen": 19967520, "step": 23060 }, { "epoch": 10.874587458745875, "grad_norm": 0.0006340722902677953, "learning_rate": 0.11425049123092756, "loss": 0.195, "num_input_tokens_seen": 19972416, "step": 23065 }, { "epoch": 10.876944837340877, "grad_norm": 0.0006020409637130797, "learning_rate": 0.11419328651638674, "loss": 0.2181, "num_input_tokens_seen": 19977296, "step": 23070 }, { "epoch": 10.87930221593588, "grad_norm": 0.0011248351074755192, "learning_rate": 0.11413608732369115, "loss": 0.1815, "num_input_tokens_seen": 19981968, "step": 23075 }, { "epoch": 10.881659594530882, "grad_norm": 0.0012005064636468887, "learning_rate": 0.11407889366166153, "loss": 0.2181, "num_input_tokens_seen": 19986864, "step": 23080 }, { "epoch": 10.884016973125885, "grad_norm": 0.001299140159972012, "learning_rate": 0.11402170553911797, "loss": 0.2117, "num_input_tokens_seen": 19990992, "step": 23085 }, { "epoch": 10.886374351720887, "grad_norm": 0.0006875817780382931, "learning_rate": 0.11396452296487955, "loss": 0.2289, "num_input_tokens_seen": 19995280, "step": 23090 }, { "epoch": 10.88873173031589, "grad_norm": 0.0007676462410017848, "learning_rate": 0.11390734594776449, "loss": 0.2241, "num_input_tokens_seen": 19999376, "step": 23095 }, { "epoch": 10.891089108910892, "grad_norm": 0.0010512993903830647, "learning_rate": 0.11385017449659031, "loss": 0.2047, "num_input_tokens_seen": 20003072, "step": 23100 }, { "epoch": 10.893446487505894, "grad_norm": 0.0011791749857366085, "learning_rate": 0.11379300862017344, "loss": 0.23, "num_input_tokens_seen": 20007936, "step": 23105 }, { "epoch": 10.895803866100895, "grad_norm": 0.0011103141587227583, "learning_rate": 0.11373584832732966, "loss": 0.2031, "num_input_tokens_seen": 20012464, "step": 23110 }, { "epoch": 10.898161244695899, "grad_norm": 0.0008360130595974624, "learning_rate": 0.11367869362687386, "loss": 0.2238, "num_input_tokens_seen": 20016064, "step": 23115 }, { "epoch": 10.9005186232909, "grad_norm": 0.0010881806956604123, "learning_rate": 0.11362154452761988, "loss": 0.161, "num_input_tokens_seen": 20020288, "step": 23120 }, { "epoch": 10.902876001885902, "grad_norm": 0.0016385485650971532, "learning_rate": 0.11356440103838095, "loss": 0.2146, "num_input_tokens_seen": 20024512, "step": 23125 }, { "epoch": 10.905233380480905, "grad_norm": 0.0028529844712466, "learning_rate": 0.11350726316796922, "loss": 0.2454, "num_input_tokens_seen": 20028768, "step": 23130 }, { "epoch": 10.907590759075907, "grad_norm": 0.0012602547649294138, "learning_rate": 0.11345013092519607, "loss": 0.1702, "num_input_tokens_seen": 20033248, "step": 23135 }, { "epoch": 10.90994813767091, "grad_norm": 0.0017458338988944888, "learning_rate": 0.11339300431887213, "loss": 0.2451, "num_input_tokens_seen": 20037920, "step": 23140 }, { "epoch": 10.912305516265912, "grad_norm": 0.0006550296675413847, "learning_rate": 0.11333588335780687, "loss": 0.2476, "num_input_tokens_seen": 20044048, "step": 23145 }, { "epoch": 10.914662894860914, "grad_norm": 0.0010822601616382599, "learning_rate": 0.11327876805080916, "loss": 0.1903, "num_input_tokens_seen": 20048096, "step": 23150 }, { "epoch": 10.917020273455917, "grad_norm": 0.0012574298307299614, "learning_rate": 0.11322165840668696, "loss": 0.2167, "num_input_tokens_seen": 20052576, "step": 23155 }, { "epoch": 10.919377652050919, "grad_norm": 0.001156458049081266, "learning_rate": 0.11316455443424717, "loss": 0.2, "num_input_tokens_seen": 20056496, "step": 23160 }, { "epoch": 10.921735030645921, "grad_norm": 0.0017996854148805141, "learning_rate": 0.11310745614229603, "loss": 0.2538, "num_input_tokens_seen": 20060992, "step": 23165 }, { "epoch": 10.924092409240924, "grad_norm": 0.0007214352954179049, "learning_rate": 0.1130503635396387, "loss": 0.2464, "num_input_tokens_seen": 20065104, "step": 23170 }, { "epoch": 10.926449787835926, "grad_norm": 0.0012655975297093391, "learning_rate": 0.11299327663507966, "loss": 0.1832, "num_input_tokens_seen": 20069200, "step": 23175 }, { "epoch": 10.928807166430929, "grad_norm": 0.0008490518084727228, "learning_rate": 0.11293619543742246, "loss": 0.223, "num_input_tokens_seen": 20073232, "step": 23180 }, { "epoch": 10.931164545025931, "grad_norm": 0.0014907806180417538, "learning_rate": 0.11287911995546965, "loss": 0.2093, "num_input_tokens_seen": 20077392, "step": 23185 }, { "epoch": 10.933521923620933, "grad_norm": 0.0016722565051168203, "learning_rate": 0.11282205019802308, "loss": 0.2216, "num_input_tokens_seen": 20082448, "step": 23190 }, { "epoch": 10.935879302215936, "grad_norm": 0.0009421691065654159, "learning_rate": 0.11276498617388354, "loss": 0.1866, "num_input_tokens_seen": 20086544, "step": 23195 }, { "epoch": 10.938236680810938, "grad_norm": 0.00103525270242244, "learning_rate": 0.11270792789185109, "loss": 0.2288, "num_input_tokens_seen": 20090944, "step": 23200 }, { "epoch": 10.938236680810938, "eval_loss": 0.21472252905368805, "eval_runtime": 21.9349, "eval_samples_per_second": 42.991, "eval_steps_per_second": 21.518, "num_input_tokens_seen": 20090944, "step": 23200 }, { "epoch": 10.94059405940594, "grad_norm": 0.0008417396456934512, "learning_rate": 0.11265087536072482, "loss": 0.1847, "num_input_tokens_seen": 20095760, "step": 23205 }, { "epoch": 10.942951438000943, "grad_norm": 0.000897797173820436, "learning_rate": 0.11259382858930288, "loss": 0.2104, "num_input_tokens_seen": 20100384, "step": 23210 }, { "epoch": 10.945308816595945, "grad_norm": 0.0014295177534222603, "learning_rate": 0.11253678758638262, "loss": 0.2312, "num_input_tokens_seen": 20104880, "step": 23215 }, { "epoch": 10.947666195190948, "grad_norm": 0.0010773816611617804, "learning_rate": 0.11247975236076059, "loss": 0.1856, "num_input_tokens_seen": 20109648, "step": 23220 }, { "epoch": 10.95002357378595, "grad_norm": 0.0008797547779977322, "learning_rate": 0.11242272292123218, "loss": 0.2675, "num_input_tokens_seen": 20113456, "step": 23225 }, { "epoch": 10.952380952380953, "grad_norm": 0.0007080081850290298, "learning_rate": 0.11236569927659217, "loss": 0.2226, "num_input_tokens_seen": 20118160, "step": 23230 }, { "epoch": 10.954738330975955, "grad_norm": 0.0018796961521729827, "learning_rate": 0.11230868143563429, "loss": 0.2277, "num_input_tokens_seen": 20122384, "step": 23235 }, { "epoch": 10.957095709570957, "grad_norm": 0.0016922520007938147, "learning_rate": 0.11225166940715131, "loss": 0.2239, "num_input_tokens_seen": 20125760, "step": 23240 }, { "epoch": 10.95945308816596, "grad_norm": 0.0015863478183746338, "learning_rate": 0.11219466319993537, "loss": 0.2124, "num_input_tokens_seen": 20130048, "step": 23245 }, { "epoch": 10.961810466760962, "grad_norm": 0.0010020764311775565, "learning_rate": 0.11213766282277739, "loss": 0.2152, "num_input_tokens_seen": 20134480, "step": 23250 }, { "epoch": 10.964167845355965, "grad_norm": 0.001910592196509242, "learning_rate": 0.11208066828446761, "loss": 0.2261, "num_input_tokens_seen": 20139264, "step": 23255 }, { "epoch": 10.966525223950967, "grad_norm": 0.0014729124959558249, "learning_rate": 0.11202367959379537, "loss": 0.2272, "num_input_tokens_seen": 20143312, "step": 23260 }, { "epoch": 10.96888260254597, "grad_norm": 0.0012148782843723893, "learning_rate": 0.11196669675954894, "loss": 0.2116, "num_input_tokens_seen": 20147840, "step": 23265 }, { "epoch": 10.971239981140972, "grad_norm": 0.0009844157611951232, "learning_rate": 0.1119097197905158, "loss": 0.1816, "num_input_tokens_seen": 20151856, "step": 23270 }, { "epoch": 10.973597359735974, "grad_norm": 0.0010323496535420418, "learning_rate": 0.11185274869548259, "loss": 0.2437, "num_input_tokens_seen": 20155424, "step": 23275 }, { "epoch": 10.975954738330977, "grad_norm": 0.0008316069142892957, "learning_rate": 0.11179578348323486, "loss": 0.186, "num_input_tokens_seen": 20159472, "step": 23280 }, { "epoch": 10.978312116925979, "grad_norm": 0.0007583939586766064, "learning_rate": 0.1117388241625575, "loss": 0.2146, "num_input_tokens_seen": 20163488, "step": 23285 }, { "epoch": 10.980669495520981, "grad_norm": 0.0019978827331215143, "learning_rate": 0.11168187074223421, "loss": 0.2566, "num_input_tokens_seen": 20168320, "step": 23290 }, { "epoch": 10.983026874115984, "grad_norm": 0.001236087060533464, "learning_rate": 0.11162492323104796, "loss": 0.1884, "num_input_tokens_seen": 20172992, "step": 23295 }, { "epoch": 10.985384252710986, "grad_norm": 0.0016348578501492739, "learning_rate": 0.11156798163778091, "loss": 0.2398, "num_input_tokens_seen": 20177136, "step": 23300 }, { "epoch": 10.987741631305987, "grad_norm": 0.0020291681867092848, "learning_rate": 0.11151104597121399, "loss": 0.1902, "num_input_tokens_seen": 20181984, "step": 23305 }, { "epoch": 10.990099009900991, "grad_norm": 0.0017043852712959051, "learning_rate": 0.11145411624012742, "loss": 0.2123, "num_input_tokens_seen": 20185840, "step": 23310 }, { "epoch": 10.992456388495992, "grad_norm": 0.0011680348543450236, "learning_rate": 0.11139719245330063, "loss": 0.1776, "num_input_tokens_seen": 20191200, "step": 23315 }, { "epoch": 10.994813767090994, "grad_norm": 0.001845953636802733, "learning_rate": 0.11134027461951179, "loss": 0.2404, "num_input_tokens_seen": 20195264, "step": 23320 }, { "epoch": 10.997171145685996, "grad_norm": 0.0021170955151319504, "learning_rate": 0.11128336274753849, "loss": 0.2096, "num_input_tokens_seen": 20199808, "step": 23325 }, { "epoch": 10.999528524280999, "grad_norm": 0.00138994085136801, "learning_rate": 0.11122645684615715, "loss": 0.2267, "num_input_tokens_seen": 20203920, "step": 23330 }, { "epoch": 11.001885902876001, "grad_norm": 0.000994442729279399, "learning_rate": 0.11116955692414345, "loss": 0.1809, "num_input_tokens_seen": 20207728, "step": 23335 }, { "epoch": 11.004243281471004, "grad_norm": 0.0008911281474865973, "learning_rate": 0.11111266299027203, "loss": 0.2033, "num_input_tokens_seen": 20212592, "step": 23340 }, { "epoch": 11.006600660066006, "grad_norm": 0.0010320983128622174, "learning_rate": 0.11105577505331668, "loss": 0.2248, "num_input_tokens_seen": 20217056, "step": 23345 }, { "epoch": 11.008958038661008, "grad_norm": 0.0019610875751823187, "learning_rate": 0.11099889312205018, "loss": 0.2319, "num_input_tokens_seen": 20220848, "step": 23350 }, { "epoch": 11.01131541725601, "grad_norm": 0.0010735843097791076, "learning_rate": 0.11094201720524455, "loss": 0.211, "num_input_tokens_seen": 20225360, "step": 23355 }, { "epoch": 11.013672795851013, "grad_norm": 0.0024423038121312857, "learning_rate": 0.11088514731167064, "loss": 0.2235, "num_input_tokens_seen": 20230480, "step": 23360 }, { "epoch": 11.016030174446016, "grad_norm": 0.00119048822671175, "learning_rate": 0.11082828345009862, "loss": 0.222, "num_input_tokens_seen": 20234112, "step": 23365 }, { "epoch": 11.018387553041018, "grad_norm": 0.003125028917565942, "learning_rate": 0.11077142562929748, "loss": 0.2144, "num_input_tokens_seen": 20238320, "step": 23370 }, { "epoch": 11.02074493163602, "grad_norm": 0.0030460748821496964, "learning_rate": 0.11071457385803554, "loss": 0.2161, "num_input_tokens_seen": 20242592, "step": 23375 }, { "epoch": 11.023102310231023, "grad_norm": 0.002021255437284708, "learning_rate": 0.11065772814508001, "loss": 0.2318, "num_input_tokens_seen": 20246912, "step": 23380 }, { "epoch": 11.025459688826025, "grad_norm": 0.0014620937872678041, "learning_rate": 0.11060088849919715, "loss": 0.2086, "num_input_tokens_seen": 20251216, "step": 23385 }, { "epoch": 11.027817067421028, "grad_norm": 0.0012292565079405904, "learning_rate": 0.11054405492915244, "loss": 0.1968, "num_input_tokens_seen": 20256000, "step": 23390 }, { "epoch": 11.03017444601603, "grad_norm": 0.002381534082815051, "learning_rate": 0.11048722744371031, "loss": 0.226, "num_input_tokens_seen": 20259760, "step": 23395 }, { "epoch": 11.032531824611032, "grad_norm": 0.0018527020001783967, "learning_rate": 0.1104304060516342, "loss": 0.2446, "num_input_tokens_seen": 20264992, "step": 23400 }, { "epoch": 11.032531824611032, "eval_loss": 0.21772626042366028, "eval_runtime": 21.9538, "eval_samples_per_second": 42.954, "eval_steps_per_second": 21.5, "num_input_tokens_seen": 20264992, "step": 23400 }, { "epoch": 11.034889203206035, "grad_norm": 0.0011606266489252448, "learning_rate": 0.11037359076168682, "loss": 0.2531, "num_input_tokens_seen": 20268720, "step": 23405 }, { "epoch": 11.037246581801037, "grad_norm": 0.0011548015754669905, "learning_rate": 0.11031678158262966, "loss": 0.1999, "num_input_tokens_seen": 20273072, "step": 23410 }, { "epoch": 11.03960396039604, "grad_norm": 0.000966964231338352, "learning_rate": 0.11025997852322349, "loss": 0.2065, "num_input_tokens_seen": 20276864, "step": 23415 }, { "epoch": 11.041961338991042, "grad_norm": 0.0014709875686094165, "learning_rate": 0.11020318159222807, "loss": 0.2297, "num_input_tokens_seen": 20281168, "step": 23420 }, { "epoch": 11.044318717586044, "grad_norm": 0.0031000811140984297, "learning_rate": 0.1101463907984021, "loss": 0.1868, "num_input_tokens_seen": 20285664, "step": 23425 }, { "epoch": 11.046676096181047, "grad_norm": 0.0011572479270398617, "learning_rate": 0.11008960615050352, "loss": 0.2329, "num_input_tokens_seen": 20289680, "step": 23430 }, { "epoch": 11.04903347477605, "grad_norm": 0.00136491097509861, "learning_rate": 0.11003282765728925, "loss": 0.2089, "num_input_tokens_seen": 20293824, "step": 23435 }, { "epoch": 11.051390853371052, "grad_norm": 0.0012604534858837724, "learning_rate": 0.10997605532751518, "loss": 0.1942, "num_input_tokens_seen": 20298160, "step": 23440 }, { "epoch": 11.053748231966054, "grad_norm": 0.000972271547652781, "learning_rate": 0.1099192891699364, "loss": 0.2104, "num_input_tokens_seen": 20302096, "step": 23445 }, { "epoch": 11.056105610561056, "grad_norm": 0.0013990378938615322, "learning_rate": 0.10986252919330687, "loss": 0.2185, "num_input_tokens_seen": 20306432, "step": 23450 }, { "epoch": 11.058462989156059, "grad_norm": 0.0010516188340261579, "learning_rate": 0.10980577540637973, "loss": 0.1974, "num_input_tokens_seen": 20310736, "step": 23455 }, { "epoch": 11.060820367751061, "grad_norm": 0.00156517478171736, "learning_rate": 0.10974902781790719, "loss": 0.2622, "num_input_tokens_seen": 20315904, "step": 23460 }, { "epoch": 11.063177746346064, "grad_norm": 0.0012299699010327458, "learning_rate": 0.10969228643664032, "loss": 0.2208, "num_input_tokens_seen": 20319680, "step": 23465 }, { "epoch": 11.065535124941066, "grad_norm": 0.0019358410499989986, "learning_rate": 0.10963555127132942, "loss": 0.2119, "num_input_tokens_seen": 20323760, "step": 23470 }, { "epoch": 11.067892503536068, "grad_norm": 0.002813896397128701, "learning_rate": 0.10957882233072382, "loss": 0.2255, "num_input_tokens_seen": 20328608, "step": 23475 }, { "epoch": 11.07024988213107, "grad_norm": 0.002009752206504345, "learning_rate": 0.10952209962357176, "loss": 0.223, "num_input_tokens_seen": 20333280, "step": 23480 }, { "epoch": 11.072607260726073, "grad_norm": 0.0022760089486837387, "learning_rate": 0.10946538315862062, "loss": 0.2267, "num_input_tokens_seen": 20337568, "step": 23485 }, { "epoch": 11.074964639321076, "grad_norm": 0.0013420458417385817, "learning_rate": 0.10940867294461679, "loss": 0.2153, "num_input_tokens_seen": 20341600, "step": 23490 }, { "epoch": 11.077322017916078, "grad_norm": 0.0016229223692789674, "learning_rate": 0.10935196899030565, "loss": 0.1974, "num_input_tokens_seen": 20345696, "step": 23495 }, { "epoch": 11.07967939651108, "grad_norm": 0.0014904115814715624, "learning_rate": 0.10929527130443177, "loss": 0.2541, "num_input_tokens_seen": 20350384, "step": 23500 }, { "epoch": 11.082036775106083, "grad_norm": 0.0012175634037703276, "learning_rate": 0.1092385798957385, "loss": 0.2389, "num_input_tokens_seen": 20354224, "step": 23505 }, { "epoch": 11.084394153701085, "grad_norm": 0.0015207205433398485, "learning_rate": 0.10918189477296848, "loss": 0.2279, "num_input_tokens_seen": 20358384, "step": 23510 }, { "epoch": 11.086751532296086, "grad_norm": 0.0023589455522596836, "learning_rate": 0.1091252159448633, "loss": 0.2215, "num_input_tokens_seen": 20362944, "step": 23515 }, { "epoch": 11.089108910891088, "grad_norm": 0.001307054772041738, "learning_rate": 0.10906854342016345, "loss": 0.2264, "num_input_tokens_seen": 20367584, "step": 23520 }, { "epoch": 11.09146628948609, "grad_norm": 0.0009953967528417706, "learning_rate": 0.10901187720760858, "loss": 0.2224, "num_input_tokens_seen": 20371488, "step": 23525 }, { "epoch": 11.093823668081093, "grad_norm": 0.002436491660773754, "learning_rate": 0.10895521731593734, "loss": 0.2217, "num_input_tokens_seen": 20375920, "step": 23530 }, { "epoch": 11.096181046676096, "grad_norm": 0.0015205296222120523, "learning_rate": 0.10889856375388733, "loss": 0.2392, "num_input_tokens_seen": 20380320, "step": 23535 }, { "epoch": 11.098538425271098, "grad_norm": 0.0014971033670008183, "learning_rate": 0.1088419165301954, "loss": 0.1931, "num_input_tokens_seen": 20384400, "step": 23540 }, { "epoch": 11.1008958038661, "grad_norm": 0.0008927164017222822, "learning_rate": 0.1087852756535971, "loss": 0.2307, "num_input_tokens_seen": 20389120, "step": 23545 }, { "epoch": 11.103253182461103, "grad_norm": 0.0015340876998379827, "learning_rate": 0.10872864113282725, "loss": 0.1929, "num_input_tokens_seen": 20393408, "step": 23550 }, { "epoch": 11.105610561056105, "grad_norm": 0.001431796234101057, "learning_rate": 0.10867201297661958, "loss": 0.2583, "num_input_tokens_seen": 20398784, "step": 23555 }, { "epoch": 11.107967939651108, "grad_norm": 0.001124314614571631, "learning_rate": 0.10861539119370689, "loss": 0.238, "num_input_tokens_seen": 20404048, "step": 23560 }, { "epoch": 11.11032531824611, "grad_norm": 0.0017419190844520926, "learning_rate": 0.10855877579282096, "loss": 0.187, "num_input_tokens_seen": 20408448, "step": 23565 }, { "epoch": 11.112682696841112, "grad_norm": 0.003219668287783861, "learning_rate": 0.10850216678269252, "loss": 0.2385, "num_input_tokens_seen": 20412864, "step": 23570 }, { "epoch": 11.115040075436115, "grad_norm": 0.0009226940455846488, "learning_rate": 0.10844556417205146, "loss": 0.1942, "num_input_tokens_seen": 20416800, "step": 23575 }, { "epoch": 11.117397454031117, "grad_norm": 0.0017599056009203196, "learning_rate": 0.10838896796962669, "loss": 0.2192, "num_input_tokens_seen": 20421296, "step": 23580 }, { "epoch": 11.11975483262612, "grad_norm": 0.000826413684990257, "learning_rate": 0.1083323781841459, "loss": 0.2163, "num_input_tokens_seen": 20424912, "step": 23585 }, { "epoch": 11.122112211221122, "grad_norm": 0.0011797323822975159, "learning_rate": 0.10827579482433607, "loss": 0.238, "num_input_tokens_seen": 20428992, "step": 23590 }, { "epoch": 11.124469589816124, "grad_norm": 0.0013507818803191185, "learning_rate": 0.10821921789892304, "loss": 0.2149, "num_input_tokens_seen": 20433168, "step": 23595 }, { "epoch": 11.126826968411127, "grad_norm": 0.00153036555275321, "learning_rate": 0.10816264741663158, "loss": 0.2159, "num_input_tokens_seen": 20437952, "step": 23600 }, { "epoch": 11.126826968411127, "eval_loss": 0.21580545604228973, "eval_runtime": 21.9296, "eval_samples_per_second": 43.001, "eval_steps_per_second": 21.523, "num_input_tokens_seen": 20437952, "step": 23600 }, { "epoch": 11.12918434700613, "grad_norm": 0.0009373281500302255, "learning_rate": 0.10810608338618573, "loss": 0.212, "num_input_tokens_seen": 20442672, "step": 23605 }, { "epoch": 11.131541725601132, "grad_norm": 0.0017372275469824672, "learning_rate": 0.10804952581630821, "loss": 0.2144, "num_input_tokens_seen": 20447072, "step": 23610 }, { "epoch": 11.133899104196134, "grad_norm": 0.00094787054695189, "learning_rate": 0.10799297471572102, "loss": 0.1753, "num_input_tokens_seen": 20451088, "step": 23615 }, { "epoch": 11.136256482791136, "grad_norm": 0.0007915576570667326, "learning_rate": 0.10793643009314507, "loss": 0.2063, "num_input_tokens_seen": 20455472, "step": 23620 }, { "epoch": 11.138613861386139, "grad_norm": 0.0008447933942079544, "learning_rate": 0.10787989195730015, "loss": 0.1925, "num_input_tokens_seen": 20459728, "step": 23625 }, { "epoch": 11.140971239981141, "grad_norm": 0.001458832179196179, "learning_rate": 0.10782336031690525, "loss": 0.2308, "num_input_tokens_seen": 20464112, "step": 23630 }, { "epoch": 11.143328618576144, "grad_norm": 0.0020741235930472612, "learning_rate": 0.10776683518067821, "loss": 0.2527, "num_input_tokens_seen": 20468112, "step": 23635 }, { "epoch": 11.145685997171146, "grad_norm": 0.0016224384307861328, "learning_rate": 0.10771031655733587, "loss": 0.2306, "num_input_tokens_seen": 20473696, "step": 23640 }, { "epoch": 11.148043375766148, "grad_norm": 0.0009556170552968979, "learning_rate": 0.10765380445559422, "loss": 0.2155, "num_input_tokens_seen": 20477616, "step": 23645 }, { "epoch": 11.15040075436115, "grad_norm": 0.0008031262550503016, "learning_rate": 0.10759729888416801, "loss": 0.2446, "num_input_tokens_seen": 20481472, "step": 23650 }, { "epoch": 11.152758132956153, "grad_norm": 0.0009678932838141918, "learning_rate": 0.10754079985177119, "loss": 0.2285, "num_input_tokens_seen": 20485984, "step": 23655 }, { "epoch": 11.155115511551156, "grad_norm": 0.0018371345940977335, "learning_rate": 0.10748430736711667, "loss": 0.2442, "num_input_tokens_seen": 20490512, "step": 23660 }, { "epoch": 11.157472890146158, "grad_norm": 0.001014583744108677, "learning_rate": 0.10742782143891623, "loss": 0.2431, "num_input_tokens_seen": 20494288, "step": 23665 }, { "epoch": 11.15983026874116, "grad_norm": 0.0009361589327454567, "learning_rate": 0.10737134207588069, "loss": 0.2055, "num_input_tokens_seen": 20498368, "step": 23670 }, { "epoch": 11.162187647336163, "grad_norm": 0.001036115805618465, "learning_rate": 0.10731486928671992, "loss": 0.2362, "num_input_tokens_seen": 20502176, "step": 23675 }, { "epoch": 11.164545025931165, "grad_norm": 0.001800219644792378, "learning_rate": 0.10725840308014269, "loss": 0.1781, "num_input_tokens_seen": 20506560, "step": 23680 }, { "epoch": 11.166902404526168, "grad_norm": 0.0014015953056514263, "learning_rate": 0.10720194346485688, "loss": 0.2065, "num_input_tokens_seen": 20510544, "step": 23685 }, { "epoch": 11.16925978312117, "grad_norm": 0.0006467823986895382, "learning_rate": 0.10714549044956918, "loss": 0.1786, "num_input_tokens_seen": 20515072, "step": 23690 }, { "epoch": 11.171617161716172, "grad_norm": 0.0012705764966085553, "learning_rate": 0.10708904404298542, "loss": 0.2245, "num_input_tokens_seen": 20519472, "step": 23695 }, { "epoch": 11.173974540311175, "grad_norm": 0.0012578489258885384, "learning_rate": 0.1070326042538103, "loss": 0.2208, "num_input_tokens_seen": 20523744, "step": 23700 }, { "epoch": 11.176331918906177, "grad_norm": 0.0012912634992972016, "learning_rate": 0.10697617109074758, "loss": 0.1975, "num_input_tokens_seen": 20527968, "step": 23705 }, { "epoch": 11.17868929750118, "grad_norm": 0.0010073906742036343, "learning_rate": 0.10691974456249999, "loss": 0.1878, "num_input_tokens_seen": 20531616, "step": 23710 }, { "epoch": 11.18104667609618, "grad_norm": 0.0018123212503269315, "learning_rate": 0.10686332467776909, "loss": 0.2307, "num_input_tokens_seen": 20535920, "step": 23715 }, { "epoch": 11.183404054691183, "grad_norm": 0.0014034094056114554, "learning_rate": 0.10680691144525563, "loss": 0.1892, "num_input_tokens_seen": 20541616, "step": 23720 }, { "epoch": 11.185761433286185, "grad_norm": 0.0009230842697434127, "learning_rate": 0.10675050487365928, "loss": 0.2116, "num_input_tokens_seen": 20545200, "step": 23725 }, { "epoch": 11.188118811881187, "grad_norm": 0.000952008122112602, "learning_rate": 0.10669410497167851, "loss": 0.1504, "num_input_tokens_seen": 20549392, "step": 23730 }, { "epoch": 11.19047619047619, "grad_norm": 0.000988367130048573, "learning_rate": 0.10663771174801102, "loss": 0.1922, "num_input_tokens_seen": 20553008, "step": 23735 }, { "epoch": 11.192833569071192, "grad_norm": 0.0018495024414733052, "learning_rate": 0.10658132521135329, "loss": 0.2258, "num_input_tokens_seen": 20557264, "step": 23740 }, { "epoch": 11.195190947666195, "grad_norm": 0.0008102197316475213, "learning_rate": 0.10652494537040084, "loss": 0.1537, "num_input_tokens_seen": 20562144, "step": 23745 }, { "epoch": 11.197548326261197, "grad_norm": 0.000914545264095068, "learning_rate": 0.1064685722338482, "loss": 0.1839, "num_input_tokens_seen": 20565952, "step": 23750 }, { "epoch": 11.1999057048562, "grad_norm": 0.0013692631619051099, "learning_rate": 0.10641220581038871, "loss": 0.2519, "num_input_tokens_seen": 20570544, "step": 23755 }, { "epoch": 11.202263083451202, "grad_norm": 0.0018690354190766811, "learning_rate": 0.10635584610871483, "loss": 0.2358, "num_input_tokens_seen": 20576096, "step": 23760 }, { "epoch": 11.204620462046204, "grad_norm": 0.0010012367274612188, "learning_rate": 0.10629949313751803, "loss": 0.268, "num_input_tokens_seen": 20579984, "step": 23765 }, { "epoch": 11.206977840641207, "grad_norm": 0.0016885208897292614, "learning_rate": 0.10624314690548849, "loss": 0.2114, "num_input_tokens_seen": 20584528, "step": 23770 }, { "epoch": 11.209335219236209, "grad_norm": 0.0011862764367833734, "learning_rate": 0.1061868074213156, "loss": 0.2242, "num_input_tokens_seen": 20588384, "step": 23775 }, { "epoch": 11.211692597831211, "grad_norm": 0.0019721442367881536, "learning_rate": 0.10613047469368765, "loss": 0.2195, "num_input_tokens_seen": 20592992, "step": 23780 }, { "epoch": 11.214049976426214, "grad_norm": 0.001509863301180303, "learning_rate": 0.10607414873129171, "loss": 0.2112, "num_input_tokens_seen": 20597712, "step": 23785 }, { "epoch": 11.216407355021216, "grad_norm": 0.0010134277399629354, "learning_rate": 0.10601782954281413, "loss": 0.2388, "num_input_tokens_seen": 20603216, "step": 23790 }, { "epoch": 11.218764733616219, "grad_norm": 0.0012597590684890747, "learning_rate": 0.1059615171369399, "loss": 0.216, "num_input_tokens_seen": 20607344, "step": 23795 }, { "epoch": 11.221122112211221, "grad_norm": 0.0010668057948350906, "learning_rate": 0.10590521152235312, "loss": 0.1985, "num_input_tokens_seen": 20611040, "step": 23800 }, { "epoch": 11.221122112211221, "eval_loss": 0.2159261852502823, "eval_runtime": 21.9157, "eval_samples_per_second": 43.029, "eval_steps_per_second": 21.537, "num_input_tokens_seen": 20611040, "step": 23800 }, { "epoch": 11.223479490806223, "grad_norm": 0.0008851926540955901, "learning_rate": 0.1058489127077369, "loss": 0.1808, "num_input_tokens_seen": 20614848, "step": 23805 }, { "epoch": 11.225836869401226, "grad_norm": 0.0010301230940967798, "learning_rate": 0.1057926207017732, "loss": 0.227, "num_input_tokens_seen": 20619584, "step": 23810 }, { "epoch": 11.228194247996228, "grad_norm": 0.001175128621980548, "learning_rate": 0.10573633551314285, "loss": 0.26, "num_input_tokens_seen": 20623408, "step": 23815 }, { "epoch": 11.23055162659123, "grad_norm": 0.0010005505755543709, "learning_rate": 0.1056800571505259, "loss": 0.217, "num_input_tokens_seen": 20628128, "step": 23820 }, { "epoch": 11.232909005186233, "grad_norm": 0.001662829308770597, "learning_rate": 0.10562378562260105, "loss": 0.2361, "num_input_tokens_seen": 20632848, "step": 23825 }, { "epoch": 11.235266383781235, "grad_norm": 0.001914051827043295, "learning_rate": 0.10556752093804615, "loss": 0.2019, "num_input_tokens_seen": 20637168, "step": 23830 }, { "epoch": 11.237623762376238, "grad_norm": 0.0011869823792949319, "learning_rate": 0.10551126310553786, "loss": 0.2238, "num_input_tokens_seen": 20641744, "step": 23835 }, { "epoch": 11.23998114097124, "grad_norm": 0.0008630105294287205, "learning_rate": 0.10545501213375187, "loss": 0.2218, "num_input_tokens_seen": 20646256, "step": 23840 }, { "epoch": 11.242338519566243, "grad_norm": 0.0015701164957135916, "learning_rate": 0.10539876803136287, "loss": 0.2107, "num_input_tokens_seen": 20650688, "step": 23845 }, { "epoch": 11.244695898161245, "grad_norm": 0.0009891436202451587, "learning_rate": 0.10534253080704428, "loss": 0.2365, "num_input_tokens_seen": 20655152, "step": 23850 }, { "epoch": 11.247053276756247, "grad_norm": 0.0011949061881750822, "learning_rate": 0.10528630046946862, "loss": 0.2496, "num_input_tokens_seen": 20659712, "step": 23855 }, { "epoch": 11.24941065535125, "grad_norm": 0.001867667306214571, "learning_rate": 0.1052300770273074, "loss": 0.213, "num_input_tokens_seen": 20664048, "step": 23860 }, { "epoch": 11.251768033946252, "grad_norm": 0.0015228319680318236, "learning_rate": 0.10517386048923086, "loss": 0.2335, "num_input_tokens_seen": 20667888, "step": 23865 }, { "epoch": 11.254125412541255, "grad_norm": 0.0015251276781782508, "learning_rate": 0.10511765086390841, "loss": 0.2218, "num_input_tokens_seen": 20672256, "step": 23870 }, { "epoch": 11.256482791136257, "grad_norm": 0.0011013098992407322, "learning_rate": 0.10506144816000816, "loss": 0.2409, "num_input_tokens_seen": 20676400, "step": 23875 }, { "epoch": 11.25884016973126, "grad_norm": 0.0013770844088867307, "learning_rate": 0.10500525238619736, "loss": 0.2091, "num_input_tokens_seen": 20680352, "step": 23880 }, { "epoch": 11.261197548326262, "grad_norm": 0.0021768221631646156, "learning_rate": 0.10494906355114209, "loss": 0.2362, "num_input_tokens_seen": 20686368, "step": 23885 }, { "epoch": 11.263554926921264, "grad_norm": 0.0010891961865127087, "learning_rate": 0.10489288166350737, "loss": 0.2655, "num_input_tokens_seen": 20692064, "step": 23890 }, { "epoch": 11.265912305516267, "grad_norm": 0.0009515869314782321, "learning_rate": 0.10483670673195711, "loss": 0.2127, "num_input_tokens_seen": 20697072, "step": 23895 }, { "epoch": 11.268269684111269, "grad_norm": 0.0008498075767420232, "learning_rate": 0.10478053876515431, "loss": 0.2101, "num_input_tokens_seen": 20701232, "step": 23900 }, { "epoch": 11.270627062706271, "grad_norm": 0.0014994601951912045, "learning_rate": 0.10472437777176061, "loss": 0.2234, "num_input_tokens_seen": 20705104, "step": 23905 }, { "epoch": 11.272984441301272, "grad_norm": 0.001110375509597361, "learning_rate": 0.1046682237604369, "loss": 0.2407, "num_input_tokens_seen": 20709616, "step": 23910 }, { "epoch": 11.275341819896274, "grad_norm": 0.001589594641700387, "learning_rate": 0.1046120767398427, "loss": 0.1926, "num_input_tokens_seen": 20714368, "step": 23915 }, { "epoch": 11.277699198491277, "grad_norm": 0.0013552963500842452, "learning_rate": 0.10455593671863667, "loss": 0.2052, "num_input_tokens_seen": 20719072, "step": 23920 }, { "epoch": 11.28005657708628, "grad_norm": 0.0008104029111564159, "learning_rate": 0.1044998037054763, "loss": 0.2154, "num_input_tokens_seen": 20722896, "step": 23925 }, { "epoch": 11.282413955681282, "grad_norm": 0.003819710109382868, "learning_rate": 0.10444367770901794, "loss": 0.2412, "num_input_tokens_seen": 20727664, "step": 23930 }, { "epoch": 11.284771334276284, "grad_norm": 0.0014984929002821445, "learning_rate": 0.10438755873791698, "loss": 0.2096, "num_input_tokens_seen": 20731520, "step": 23935 }, { "epoch": 11.287128712871286, "grad_norm": 0.0009691211162135005, "learning_rate": 0.10433144680082775, "loss": 0.205, "num_input_tokens_seen": 20736208, "step": 23940 }, { "epoch": 11.289486091466289, "grad_norm": 0.001547016086988151, "learning_rate": 0.10427534190640322, "loss": 0.224, "num_input_tokens_seen": 20740448, "step": 23945 }, { "epoch": 11.291843470061291, "grad_norm": 0.0010443029459565878, "learning_rate": 0.10421924406329568, "loss": 0.2283, "num_input_tokens_seen": 20744640, "step": 23950 }, { "epoch": 11.294200848656294, "grad_norm": 0.0018526441417634487, "learning_rate": 0.10416315328015598, "loss": 0.2098, "num_input_tokens_seen": 20749296, "step": 23955 }, { "epoch": 11.296558227251296, "grad_norm": 0.0009104938362725079, "learning_rate": 0.10410706956563402, "loss": 0.2392, "num_input_tokens_seen": 20753712, "step": 23960 }, { "epoch": 11.298915605846299, "grad_norm": 0.0008192817331291735, "learning_rate": 0.10405099292837874, "loss": 0.2087, "num_input_tokens_seen": 20757968, "step": 23965 }, { "epoch": 11.301272984441301, "grad_norm": 0.0010730481008067727, "learning_rate": 0.10399492337703771, "loss": 0.2126, "num_input_tokens_seen": 20762432, "step": 23970 }, { "epoch": 11.303630363036303, "grad_norm": 0.0011436528293415904, "learning_rate": 0.10393886092025764, "loss": 0.2398, "num_input_tokens_seen": 20766320, "step": 23975 }, { "epoch": 11.305987741631306, "grad_norm": 0.002610074356198311, "learning_rate": 0.10388280556668412, "loss": 0.241, "num_input_tokens_seen": 20769872, "step": 23980 }, { "epoch": 11.308345120226308, "grad_norm": 0.0010855704313144088, "learning_rate": 0.10382675732496145, "loss": 0.2269, "num_input_tokens_seen": 20774912, "step": 23985 }, { "epoch": 11.31070249882131, "grad_norm": 0.0012254139874130487, "learning_rate": 0.10377071620373311, "loss": 0.1937, "num_input_tokens_seen": 20779072, "step": 23990 }, { "epoch": 11.313059877416313, "grad_norm": 0.001671142759732902, "learning_rate": 0.10371468221164128, "loss": 0.2208, "num_input_tokens_seen": 20783584, "step": 23995 }, { "epoch": 11.315417256011315, "grad_norm": 0.0024140847381204367, "learning_rate": 0.10365865535732706, "loss": 0.2262, "num_input_tokens_seen": 20787488, "step": 24000 }, { "epoch": 11.315417256011315, "eval_loss": 0.2140500694513321, "eval_runtime": 21.9108, "eval_samples_per_second": 43.038, "eval_steps_per_second": 21.542, "num_input_tokens_seen": 20787488, "step": 24000 }, { "epoch": 11.317774634606318, "grad_norm": 0.0011820073705166578, "learning_rate": 0.10360263564943062, "loss": 0.18, "num_input_tokens_seen": 20791376, "step": 24005 }, { "epoch": 11.32013201320132, "grad_norm": 0.0009163091890513897, "learning_rate": 0.10354662309659075, "loss": 0.1769, "num_input_tokens_seen": 20795712, "step": 24010 }, { "epoch": 11.322489391796323, "grad_norm": 0.0012044081231579185, "learning_rate": 0.10349061770744537, "loss": 0.2304, "num_input_tokens_seen": 20799520, "step": 24015 }, { "epoch": 11.324846770391325, "grad_norm": 0.0011261162580922246, "learning_rate": 0.10343461949063128, "loss": 0.2523, "num_input_tokens_seen": 20804240, "step": 24020 }, { "epoch": 11.327204148986327, "grad_norm": 0.0012648450210690498, "learning_rate": 0.103378628454784, "loss": 0.2145, "num_input_tokens_seen": 20807936, "step": 24025 }, { "epoch": 11.32956152758133, "grad_norm": 0.0010526706464588642, "learning_rate": 0.10332264460853811, "loss": 0.2259, "num_input_tokens_seen": 20812464, "step": 24030 }, { "epoch": 11.331918906176332, "grad_norm": 0.001100714085623622, "learning_rate": 0.10326666796052701, "loss": 0.2176, "num_input_tokens_seen": 20816640, "step": 24035 }, { "epoch": 11.334276284771335, "grad_norm": 0.001818842370994389, "learning_rate": 0.10321069851938296, "loss": 0.2146, "num_input_tokens_seen": 20820736, "step": 24040 }, { "epoch": 11.336633663366337, "grad_norm": 0.0011157109402120113, "learning_rate": 0.10315473629373724, "loss": 0.2065, "num_input_tokens_seen": 20824448, "step": 24045 }, { "epoch": 11.33899104196134, "grad_norm": 0.0014897137880325317, "learning_rate": 0.10309878129221982, "loss": 0.2164, "num_input_tokens_seen": 20830240, "step": 24050 }, { "epoch": 11.341348420556342, "grad_norm": 0.0010584710398688912, "learning_rate": 0.10304283352345973, "loss": 0.2214, "num_input_tokens_seen": 20833584, "step": 24055 }, { "epoch": 11.343705799151344, "grad_norm": 0.0017780077178031206, "learning_rate": 0.10298689299608486, "loss": 0.2359, "num_input_tokens_seen": 20837424, "step": 24060 }, { "epoch": 11.346063177746347, "grad_norm": 0.0011143834562972188, "learning_rate": 0.10293095971872188, "loss": 0.1859, "num_input_tokens_seen": 20842736, "step": 24065 }, { "epoch": 11.348420556341349, "grad_norm": 0.0010674743680283427, "learning_rate": 0.10287503369999645, "loss": 0.197, "num_input_tokens_seen": 20846624, "step": 24070 }, { "epoch": 11.350777934936351, "grad_norm": 0.0014067590236663818, "learning_rate": 0.10281911494853295, "loss": 0.2168, "num_input_tokens_seen": 20850816, "step": 24075 }, { "epoch": 11.353135313531354, "grad_norm": 0.0013903224607929587, "learning_rate": 0.10276320347295485, "loss": 0.1993, "num_input_tokens_seen": 20855344, "step": 24080 }, { "epoch": 11.355492692126356, "grad_norm": 0.0022055944427847862, "learning_rate": 0.10270729928188446, "loss": 0.2974, "num_input_tokens_seen": 20859968, "step": 24085 }, { "epoch": 11.357850070721359, "grad_norm": 0.0014435197226703167, "learning_rate": 0.10265140238394276, "loss": 0.2092, "num_input_tokens_seen": 20864912, "step": 24090 }, { "epoch": 11.360207449316361, "grad_norm": 0.0015210837591439486, "learning_rate": 0.10259551278774988, "loss": 0.2339, "num_input_tokens_seen": 20870352, "step": 24095 }, { "epoch": 11.362564827911363, "grad_norm": 0.0012145775835961103, "learning_rate": 0.10253963050192462, "loss": 0.2002, "num_input_tokens_seen": 20874976, "step": 24100 }, { "epoch": 11.364922206506366, "grad_norm": 0.0010406265500932932, "learning_rate": 0.10248375553508478, "loss": 0.1912, "num_input_tokens_seen": 20878848, "step": 24105 }, { "epoch": 11.367279585101368, "grad_norm": 0.0015806496376171708, "learning_rate": 0.102427887895847, "loss": 0.2172, "num_input_tokens_seen": 20883072, "step": 24110 }, { "epoch": 11.369636963696369, "grad_norm": 0.001586403464898467, "learning_rate": 0.10237202759282668, "loss": 0.2356, "num_input_tokens_seen": 20888032, "step": 24115 }, { "epoch": 11.371994342291371, "grad_norm": 0.0012704903492704034, "learning_rate": 0.10231617463463821, "loss": 0.2332, "num_input_tokens_seen": 20892432, "step": 24120 }, { "epoch": 11.374351720886374, "grad_norm": 0.001378027256578207, "learning_rate": 0.10226032902989492, "loss": 0.227, "num_input_tokens_seen": 20896432, "step": 24125 }, { "epoch": 11.376709099481376, "grad_norm": 0.0010164872510358691, "learning_rate": 0.10220449078720877, "loss": 0.1988, "num_input_tokens_seen": 20900656, "step": 24130 }, { "epoch": 11.379066478076378, "grad_norm": 0.0011842716485261917, "learning_rate": 0.1021486599151908, "loss": 0.2359, "num_input_tokens_seen": 20904128, "step": 24135 }, { "epoch": 11.38142385667138, "grad_norm": 0.0012928934302181005, "learning_rate": 0.10209283642245084, "loss": 0.2119, "num_input_tokens_seen": 20907728, "step": 24140 }, { "epoch": 11.383781235266383, "grad_norm": 0.001320133451372385, "learning_rate": 0.10203702031759748, "loss": 0.2194, "num_input_tokens_seen": 20912224, "step": 24145 }, { "epoch": 11.386138613861386, "grad_norm": 0.0010877851163968444, "learning_rate": 0.1019812116092384, "loss": 0.1726, "num_input_tokens_seen": 20916576, "step": 24150 }, { "epoch": 11.388495992456388, "grad_norm": 0.0012953471159562469, "learning_rate": 0.10192541030597986, "loss": 0.2286, "num_input_tokens_seen": 20920464, "step": 24155 }, { "epoch": 11.39085337105139, "grad_norm": 0.0017360274214297533, "learning_rate": 0.1018696164164272, "loss": 0.1996, "num_input_tokens_seen": 20925232, "step": 24160 }, { "epoch": 11.393210749646393, "grad_norm": 0.0010649935575202107, "learning_rate": 0.10181382994918459, "loss": 0.254, "num_input_tokens_seen": 20929056, "step": 24165 }, { "epoch": 11.395568128241395, "grad_norm": 0.0008467952720820904, "learning_rate": 0.10175805091285492, "loss": 0.2167, "num_input_tokens_seen": 20933264, "step": 24170 }, { "epoch": 11.397925506836398, "grad_norm": 0.0010305946925655007, "learning_rate": 0.10170227931603999, "loss": 0.2519, "num_input_tokens_seen": 20936944, "step": 24175 }, { "epoch": 11.4002828854314, "grad_norm": 0.0015197590691968799, "learning_rate": 0.10164651516734062, "loss": 0.2225, "num_input_tokens_seen": 20940768, "step": 24180 }, { "epoch": 11.402640264026402, "grad_norm": 0.0010058096377179027, "learning_rate": 0.1015907584753562, "loss": 0.2172, "num_input_tokens_seen": 20944736, "step": 24185 }, { "epoch": 11.404997642621405, "grad_norm": 0.0014620432630181313, "learning_rate": 0.10153500924868523, "loss": 0.2181, "num_input_tokens_seen": 20948960, "step": 24190 }, { "epoch": 11.407355021216407, "grad_norm": 0.0013659505639225245, "learning_rate": 0.10147926749592483, "loss": 0.2034, "num_input_tokens_seen": 20953440, "step": 24195 }, { "epoch": 11.40971239981141, "grad_norm": 0.0016850425163283944, "learning_rate": 0.10142353322567112, "loss": 0.2036, "num_input_tokens_seen": 20958240, "step": 24200 }, { "epoch": 11.40971239981141, "eval_loss": 0.21602044999599457, "eval_runtime": 21.9124, "eval_samples_per_second": 43.035, "eval_steps_per_second": 21.54, "num_input_tokens_seen": 20958240, "step": 24200 }, { "epoch": 11.412069778406412, "grad_norm": 0.0011241210158914328, "learning_rate": 0.1013678064465191, "loss": 0.1905, "num_input_tokens_seen": 20962608, "step": 24205 }, { "epoch": 11.414427157001414, "grad_norm": 0.0012215763563290238, "learning_rate": 0.10131208716706244, "loss": 0.1994, "num_input_tokens_seen": 20966848, "step": 24210 }, { "epoch": 11.416784535596417, "grad_norm": 0.0012189783155918121, "learning_rate": 0.10125637539589379, "loss": 0.2403, "num_input_tokens_seen": 20971184, "step": 24215 }, { "epoch": 11.41914191419142, "grad_norm": 0.0010245434241369367, "learning_rate": 0.10120067114160464, "loss": 0.2315, "num_input_tokens_seen": 20975568, "step": 24220 }, { "epoch": 11.421499292786422, "grad_norm": 0.0012795149814337492, "learning_rate": 0.10114497441278517, "loss": 0.1795, "num_input_tokens_seen": 20979264, "step": 24225 }, { "epoch": 11.423856671381424, "grad_norm": 0.0012076591374352574, "learning_rate": 0.10108928521802468, "loss": 0.2352, "num_input_tokens_seen": 20983312, "step": 24230 }, { "epoch": 11.426214049976426, "grad_norm": 0.001956550171598792, "learning_rate": 0.101033603565911, "loss": 0.2101, "num_input_tokens_seen": 20988480, "step": 24235 }, { "epoch": 11.428571428571429, "grad_norm": 0.0011335097951814532, "learning_rate": 0.10097792946503102, "loss": 0.226, "num_input_tokens_seen": 20994288, "step": 24240 }, { "epoch": 11.430928807166431, "grad_norm": 0.0012657798361033201, "learning_rate": 0.10092226292397039, "loss": 0.2108, "num_input_tokens_seen": 20999264, "step": 24245 }, { "epoch": 11.433286185761434, "grad_norm": 0.0017539538675919175, "learning_rate": 0.10086660395131354, "loss": 0.2201, "num_input_tokens_seen": 21003504, "step": 24250 }, { "epoch": 11.435643564356436, "grad_norm": 0.0028838827274739742, "learning_rate": 0.10081095255564385, "loss": 0.2327, "num_input_tokens_seen": 21007328, "step": 24255 }, { "epoch": 11.438000942951438, "grad_norm": 0.00103617156855762, "learning_rate": 0.10075530874554335, "loss": 0.2229, "num_input_tokens_seen": 21011200, "step": 24260 }, { "epoch": 11.44035832154644, "grad_norm": 0.0018556129653006792, "learning_rate": 0.10069967252959311, "loss": 0.2281, "num_input_tokens_seen": 21015712, "step": 24265 }, { "epoch": 11.442715700141443, "grad_norm": 0.0013250431511551142, "learning_rate": 0.10064404391637297, "loss": 0.2243, "num_input_tokens_seen": 21019792, "step": 24270 }, { "epoch": 11.445073078736446, "grad_norm": 0.0011378051713109016, "learning_rate": 0.10058842291446145, "loss": 0.2301, "num_input_tokens_seen": 21024128, "step": 24275 }, { "epoch": 11.447430457331448, "grad_norm": 0.0011550643248483539, "learning_rate": 0.10053280953243608, "loss": 0.2088, "num_input_tokens_seen": 21028816, "step": 24280 }, { "epoch": 11.44978783592645, "grad_norm": 0.002494734711945057, "learning_rate": 0.10047720377887315, "loss": 0.2408, "num_input_tokens_seen": 21033824, "step": 24285 }, { "epoch": 11.452145214521453, "grad_norm": 0.0014824791578575969, "learning_rate": 0.10042160566234767, "loss": 0.2254, "num_input_tokens_seen": 21037248, "step": 24290 }, { "epoch": 11.454502593116455, "grad_norm": 0.0015586329391226172, "learning_rate": 0.10036601519143372, "loss": 0.2101, "num_input_tokens_seen": 21041168, "step": 24295 }, { "epoch": 11.456859971711458, "grad_norm": 0.0010274745291098952, "learning_rate": 0.1003104323747039, "loss": 0.2102, "num_input_tokens_seen": 21044640, "step": 24300 }, { "epoch": 11.45921735030646, "grad_norm": 0.0010759857250377536, "learning_rate": 0.10025485722072984, "loss": 0.1999, "num_input_tokens_seen": 21048944, "step": 24305 }, { "epoch": 11.46157472890146, "grad_norm": 0.001420038752257824, "learning_rate": 0.10019928973808201, "loss": 0.2424, "num_input_tokens_seen": 21054048, "step": 24310 }, { "epoch": 11.463932107496463, "grad_norm": 0.0010925345122814178, "learning_rate": 0.10014372993532945, "loss": 0.1953, "num_input_tokens_seen": 21058400, "step": 24315 }, { "epoch": 11.466289486091465, "grad_norm": 0.002286000642925501, "learning_rate": 0.1000881778210403, "loss": 0.2678, "num_input_tokens_seen": 21063056, "step": 24320 }, { "epoch": 11.468646864686468, "grad_norm": 0.0015329233137890697, "learning_rate": 0.10003263340378142, "loss": 0.2154, "num_input_tokens_seen": 21067328, "step": 24325 }, { "epoch": 11.47100424328147, "grad_norm": 0.0012834753142669797, "learning_rate": 0.09997709669211834, "loss": 0.2321, "num_input_tokens_seen": 21072160, "step": 24330 }, { "epoch": 11.473361621876473, "grad_norm": 0.0021054602693766356, "learning_rate": 0.0999215676946156, "loss": 0.2166, "num_input_tokens_seen": 21076128, "step": 24335 }, { "epoch": 11.475719000471475, "grad_norm": 0.0012935794657096267, "learning_rate": 0.0998660464198364, "loss": 0.2219, "num_input_tokens_seen": 21080208, "step": 24340 }, { "epoch": 11.478076379066477, "grad_norm": 0.0017072336049750447, "learning_rate": 0.09981053287634288, "loss": 0.2017, "num_input_tokens_seen": 21085040, "step": 24345 }, { "epoch": 11.48043375766148, "grad_norm": 0.0012616710737347603, "learning_rate": 0.09975502707269596, "loss": 0.2226, "num_input_tokens_seen": 21089200, "step": 24350 }, { "epoch": 11.482791136256482, "grad_norm": 0.0012929923832416534, "learning_rate": 0.09969952901745524, "loss": 0.2039, "num_input_tokens_seen": 21092944, "step": 24355 }, { "epoch": 11.485148514851485, "grad_norm": 0.002151044551283121, "learning_rate": 0.09964403871917925, "loss": 0.2218, "num_input_tokens_seen": 21097136, "step": 24360 }, { "epoch": 11.487505893446487, "grad_norm": 0.0024337039794772863, "learning_rate": 0.09958855618642536, "loss": 0.2308, "num_input_tokens_seen": 21102496, "step": 24365 }, { "epoch": 11.48986327204149, "grad_norm": 0.0012786609586328268, "learning_rate": 0.09953308142774955, "loss": 0.2097, "num_input_tokens_seen": 21106624, "step": 24370 }, { "epoch": 11.492220650636492, "grad_norm": 0.0010939042549580336, "learning_rate": 0.09947761445170686, "loss": 0.1855, "num_input_tokens_seen": 21111632, "step": 24375 }, { "epoch": 11.494578029231494, "grad_norm": 0.005516489502042532, "learning_rate": 0.09942215526685086, "loss": 0.257, "num_input_tokens_seen": 21115680, "step": 24380 }, { "epoch": 11.496935407826497, "grad_norm": 0.001244401908479631, "learning_rate": 0.09936670388173414, "loss": 0.2255, "num_input_tokens_seen": 21120192, "step": 24385 }, { "epoch": 11.499292786421499, "grad_norm": 0.0016690681222826242, "learning_rate": 0.09931126030490799, "loss": 0.2138, "num_input_tokens_seen": 21124144, "step": 24390 }, { "epoch": 11.501650165016502, "grad_norm": 0.0011564086889848113, "learning_rate": 0.0992558245449225, "loss": 0.1822, "num_input_tokens_seen": 21129024, "step": 24395 }, { "epoch": 11.504007543611504, "grad_norm": 0.0018102757167071104, "learning_rate": 0.09920039661032651, "loss": 0.2398, "num_input_tokens_seen": 21133392, "step": 24400 }, { "epoch": 11.504007543611504, "eval_loss": 0.21459263563156128, "eval_runtime": 21.9296, "eval_samples_per_second": 43.001, "eval_steps_per_second": 21.523, "num_input_tokens_seen": 21133392, "step": 24400 }, { "epoch": 11.506364922206506, "grad_norm": 0.0013346398482099175, "learning_rate": 0.09914497650966782, "loss": 0.2277, "num_input_tokens_seen": 21137664, "step": 24405 }, { "epoch": 11.508722300801509, "grad_norm": 0.001345939002931118, "learning_rate": 0.09908956425149276, "loss": 0.196, "num_input_tokens_seen": 21141136, "step": 24410 }, { "epoch": 11.511079679396511, "grad_norm": 0.0013239822583273053, "learning_rate": 0.09903415984434677, "loss": 0.2132, "num_input_tokens_seen": 21146080, "step": 24415 }, { "epoch": 11.513437057991514, "grad_norm": 0.0013430117396637797, "learning_rate": 0.09897876329677373, "loss": 0.2228, "num_input_tokens_seen": 21149920, "step": 24420 }, { "epoch": 11.515794436586516, "grad_norm": 0.0011524935252964497, "learning_rate": 0.09892337461731658, "loss": 0.2044, "num_input_tokens_seen": 21153952, "step": 24425 }, { "epoch": 11.518151815181518, "grad_norm": 0.002316533587872982, "learning_rate": 0.09886799381451693, "loss": 0.2391, "num_input_tokens_seen": 21158768, "step": 24430 }, { "epoch": 11.52050919377652, "grad_norm": 0.001123458961956203, "learning_rate": 0.09881262089691521, "loss": 0.2415, "num_input_tokens_seen": 21162720, "step": 24435 }, { "epoch": 11.522866572371523, "grad_norm": 0.0015193960862234235, "learning_rate": 0.09875725587305059, "loss": 0.2133, "num_input_tokens_seen": 21166944, "step": 24440 }, { "epoch": 11.525223950966526, "grad_norm": 0.0013115541078150272, "learning_rate": 0.09870189875146111, "loss": 0.1907, "num_input_tokens_seen": 21171904, "step": 24445 }, { "epoch": 11.527581329561528, "grad_norm": 0.0010188404703512788, "learning_rate": 0.09864654954068346, "loss": 0.2154, "num_input_tokens_seen": 21175728, "step": 24450 }, { "epoch": 11.52993870815653, "grad_norm": 0.0016422872431576252, "learning_rate": 0.09859120824925326, "loss": 0.1823, "num_input_tokens_seen": 21180192, "step": 24455 }, { "epoch": 11.532296086751533, "grad_norm": 0.0015043941093608737, "learning_rate": 0.09853587488570474, "loss": 0.1908, "num_input_tokens_seen": 21184224, "step": 24460 }, { "epoch": 11.534653465346535, "grad_norm": 0.002071185503154993, "learning_rate": 0.09848054945857107, "loss": 0.2591, "num_input_tokens_seen": 21188656, "step": 24465 }, { "epoch": 11.537010843941538, "grad_norm": 0.004276470746845007, "learning_rate": 0.09842523197638416, "loss": 0.1978, "num_input_tokens_seen": 21192912, "step": 24470 }, { "epoch": 11.53936822253654, "grad_norm": 0.0013434748398140073, "learning_rate": 0.09836992244767452, "loss": 0.204, "num_input_tokens_seen": 21197680, "step": 24475 }, { "epoch": 11.541725601131542, "grad_norm": 0.0012395968660712242, "learning_rate": 0.09831462088097168, "loss": 0.2321, "num_input_tokens_seen": 21201408, "step": 24480 }, { "epoch": 11.544082979726545, "grad_norm": 0.0012872024672105908, "learning_rate": 0.09825932728480385, "loss": 0.2039, "num_input_tokens_seen": 21205392, "step": 24485 }, { "epoch": 11.546440358321547, "grad_norm": 0.0021323596592992544, "learning_rate": 0.09820404166769794, "loss": 0.2641, "num_input_tokens_seen": 21210048, "step": 24490 }, { "epoch": 11.54879773691655, "grad_norm": 0.0010779349831864238, "learning_rate": 0.09814876403817978, "loss": 0.1939, "num_input_tokens_seen": 21213840, "step": 24495 }, { "epoch": 11.551155115511552, "grad_norm": 0.0014021014794707298, "learning_rate": 0.09809349440477376, "loss": 0.2299, "num_input_tokens_seen": 21218160, "step": 24500 }, { "epoch": 11.553512494106554, "grad_norm": 0.0014081717235967517, "learning_rate": 0.09803823277600317, "loss": 0.211, "num_input_tokens_seen": 21222688, "step": 24505 }, { "epoch": 11.555869872701557, "grad_norm": 0.0012839860282838345, "learning_rate": 0.09798297916039014, "loss": 0.2287, "num_input_tokens_seen": 21226464, "step": 24510 }, { "epoch": 11.558227251296557, "grad_norm": 0.001742522232234478, "learning_rate": 0.09792773356645534, "loss": 0.2388, "num_input_tokens_seen": 21230544, "step": 24515 }, { "epoch": 11.56058462989156, "grad_norm": 0.002315906574949622, "learning_rate": 0.09787249600271843, "loss": 0.223, "num_input_tokens_seen": 21235088, "step": 24520 }, { "epoch": 11.562942008486562, "grad_norm": 0.001618875190615654, "learning_rate": 0.09781726647769776, "loss": 0.2319, "num_input_tokens_seen": 21239504, "step": 24525 }, { "epoch": 11.565299387081565, "grad_norm": 0.0014063516864553094, "learning_rate": 0.0977620449999103, "loss": 0.209, "num_input_tokens_seen": 21243392, "step": 24530 }, { "epoch": 11.567656765676567, "grad_norm": 0.002089003100991249, "learning_rate": 0.09770683157787204, "loss": 0.2008, "num_input_tokens_seen": 21246816, "step": 24535 }, { "epoch": 11.57001414427157, "grad_norm": 0.0020877420902252197, "learning_rate": 0.09765162622009745, "loss": 0.2045, "num_input_tokens_seen": 21251920, "step": 24540 }, { "epoch": 11.572371522866572, "grad_norm": 0.001878962037153542, "learning_rate": 0.09759642893509995, "loss": 0.2491, "num_input_tokens_seen": 21256640, "step": 24545 }, { "epoch": 11.574728901461574, "grad_norm": 0.0014620802830904722, "learning_rate": 0.09754123973139169, "loss": 0.2273, "num_input_tokens_seen": 21260784, "step": 24550 }, { "epoch": 11.577086280056577, "grad_norm": 0.001695121987722814, "learning_rate": 0.09748605861748345, "loss": 0.1955, "num_input_tokens_seen": 21265712, "step": 24555 }, { "epoch": 11.579443658651579, "grad_norm": 0.0012805418809875846, "learning_rate": 0.0974308856018849, "loss": 0.2137, "num_input_tokens_seen": 21269360, "step": 24560 }, { "epoch": 11.581801037246581, "grad_norm": 0.0013549226569011807, "learning_rate": 0.09737572069310449, "loss": 0.1893, "num_input_tokens_seen": 21273440, "step": 24565 }, { "epoch": 11.584158415841584, "grad_norm": 0.0036154750268906355, "learning_rate": 0.09732056389964922, "loss": 0.261, "num_input_tokens_seen": 21278064, "step": 24570 }, { "epoch": 11.586515794436586, "grad_norm": 0.0014415037585422397, "learning_rate": 0.097265415230025, "loss": 0.2366, "num_input_tokens_seen": 21282336, "step": 24575 }, { "epoch": 11.588873173031589, "grad_norm": 0.0010968868155032396, "learning_rate": 0.09721027469273648, "loss": 0.2373, "num_input_tokens_seen": 21286544, "step": 24580 }, { "epoch": 11.591230551626591, "grad_norm": 0.001220164936967194, "learning_rate": 0.09715514229628695, "loss": 0.1998, "num_input_tokens_seen": 21290352, "step": 24585 }, { "epoch": 11.593587930221593, "grad_norm": 0.0009680346702225506, "learning_rate": 0.09710001804917864, "loss": 0.2155, "num_input_tokens_seen": 21295072, "step": 24590 }, { "epoch": 11.595945308816596, "grad_norm": 0.0019337751436978579, "learning_rate": 0.09704490195991226, "loss": 0.2261, "num_input_tokens_seen": 21299440, "step": 24595 }, { "epoch": 11.598302687411598, "grad_norm": 0.0018808655440807343, "learning_rate": 0.09698979403698753, "loss": 0.236, "num_input_tokens_seen": 21303360, "step": 24600 }, { "epoch": 11.598302687411598, "eval_loss": 0.21430175006389618, "eval_runtime": 21.9486, "eval_samples_per_second": 42.964, "eval_steps_per_second": 21.505, "num_input_tokens_seen": 21303360, "step": 24600 }, { "epoch": 11.6006600660066, "grad_norm": 0.001367606921121478, "learning_rate": 0.0969346942889027, "loss": 0.2072, "num_input_tokens_seen": 21307776, "step": 24605 }, { "epoch": 11.603017444601603, "grad_norm": 0.001170686911791563, "learning_rate": 0.09687960272415487, "loss": 0.1893, "num_input_tokens_seen": 21311696, "step": 24610 }, { "epoch": 11.605374823196605, "grad_norm": 0.001034544431604445, "learning_rate": 0.0968245193512399, "loss": 0.2116, "num_input_tokens_seen": 21315808, "step": 24615 }, { "epoch": 11.607732201791608, "grad_norm": 0.0016862135380506516, "learning_rate": 0.09676944417865221, "loss": 0.2406, "num_input_tokens_seen": 21319968, "step": 24620 }, { "epoch": 11.61008958038661, "grad_norm": 0.0016530383145436645, "learning_rate": 0.09671437721488517, "loss": 0.2105, "num_input_tokens_seen": 21324416, "step": 24625 }, { "epoch": 11.612446958981613, "grad_norm": 0.0010541364317759871, "learning_rate": 0.09665931846843086, "loss": 0.2025, "num_input_tokens_seen": 21328784, "step": 24630 }, { "epoch": 11.614804337576615, "grad_norm": 0.00205571879632771, "learning_rate": 0.0966042679477799, "loss": 0.208, "num_input_tokens_seen": 21333744, "step": 24635 }, { "epoch": 11.617161716171617, "grad_norm": 0.0012913331156596541, "learning_rate": 0.09654922566142186, "loss": 0.2052, "num_input_tokens_seen": 21337696, "step": 24640 }, { "epoch": 11.61951909476662, "grad_norm": 0.0015795450890436769, "learning_rate": 0.09649419161784498, "loss": 0.2287, "num_input_tokens_seen": 21342272, "step": 24645 }, { "epoch": 11.621876473361622, "grad_norm": 0.0016384940827265382, "learning_rate": 0.09643916582553606, "loss": 0.2167, "num_input_tokens_seen": 21345936, "step": 24650 }, { "epoch": 11.624233851956625, "grad_norm": 0.0016610833117738366, "learning_rate": 0.09638414829298093, "loss": 0.1613, "num_input_tokens_seen": 21350672, "step": 24655 }, { "epoch": 11.626591230551627, "grad_norm": 0.0015123037155717611, "learning_rate": 0.09632913902866386, "loss": 0.2399, "num_input_tokens_seen": 21355232, "step": 24660 }, { "epoch": 11.62894860914663, "grad_norm": 0.0014813189627602696, "learning_rate": 0.096274138041068, "loss": 0.2107, "num_input_tokens_seen": 21359744, "step": 24665 }, { "epoch": 11.631305987741632, "grad_norm": 0.0021001307759433985, "learning_rate": 0.09621914533867527, "loss": 0.2346, "num_input_tokens_seen": 21363104, "step": 24670 }, { "epoch": 11.633663366336634, "grad_norm": 0.0019138817442581058, "learning_rate": 0.09616416092996616, "loss": 0.188, "num_input_tokens_seen": 21367232, "step": 24675 }, { "epoch": 11.636020744931637, "grad_norm": 0.0010424847714602947, "learning_rate": 0.09610918482342, "loss": 0.2028, "num_input_tokens_seen": 21371808, "step": 24680 }, { "epoch": 11.638378123526639, "grad_norm": 0.0010596808278933167, "learning_rate": 0.09605421702751478, "loss": 0.2033, "num_input_tokens_seen": 21375840, "step": 24685 }, { "epoch": 11.640735502121641, "grad_norm": 0.0012585166841745377, "learning_rate": 0.09599925755072718, "loss": 0.2248, "num_input_tokens_seen": 21380112, "step": 24690 }, { "epoch": 11.643092880716644, "grad_norm": 0.0010412329575046897, "learning_rate": 0.09594430640153273, "loss": 0.2337, "num_input_tokens_seen": 21384096, "step": 24695 }, { "epoch": 11.645450259311646, "grad_norm": 0.0023194735404103994, "learning_rate": 0.09588936358840547, "loss": 0.2443, "num_input_tokens_seen": 21387728, "step": 24700 }, { "epoch": 11.647807637906649, "grad_norm": 0.0021233169827610254, "learning_rate": 0.09583442911981836, "loss": 0.2184, "num_input_tokens_seen": 21391808, "step": 24705 }, { "epoch": 11.65016501650165, "grad_norm": 0.0023815142922103405, "learning_rate": 0.09577950300424302, "loss": 0.2107, "num_input_tokens_seen": 21396832, "step": 24710 }, { "epoch": 11.652522395096653, "grad_norm": 0.0021937149576842785, "learning_rate": 0.09572458525014967, "loss": 0.2175, "num_input_tokens_seen": 21401184, "step": 24715 }, { "epoch": 11.654879773691654, "grad_norm": 0.0019147262210026383, "learning_rate": 0.0956696758660073, "loss": 0.193, "num_input_tokens_seen": 21406176, "step": 24720 }, { "epoch": 11.657237152286656, "grad_norm": 0.0009050863445736468, "learning_rate": 0.09561477486028373, "loss": 0.2136, "num_input_tokens_seen": 21411312, "step": 24725 }, { "epoch": 11.659594530881659, "grad_norm": 0.001486301771365106, "learning_rate": 0.09555988224144528, "loss": 0.2033, "num_input_tokens_seen": 21415616, "step": 24730 }, { "epoch": 11.661951909476661, "grad_norm": 0.0014088016469031572, "learning_rate": 0.09550499801795717, "loss": 0.238, "num_input_tokens_seen": 21419904, "step": 24735 }, { "epoch": 11.664309288071664, "grad_norm": 0.0010885733645409346, "learning_rate": 0.09545012219828314, "loss": 0.2057, "num_input_tokens_seen": 21424704, "step": 24740 }, { "epoch": 11.666666666666666, "grad_norm": 0.0017249888041988015, "learning_rate": 0.09539525479088577, "loss": 0.2285, "num_input_tokens_seen": 21429472, "step": 24745 }, { "epoch": 11.669024045261668, "grad_norm": 0.0011772416764870286, "learning_rate": 0.0953403958042264, "loss": 0.2186, "num_input_tokens_seen": 21433952, "step": 24750 }, { "epoch": 11.67138142385667, "grad_norm": 0.0008628592477180064, "learning_rate": 0.09528554524676484, "loss": 0.1772, "num_input_tokens_seen": 21438128, "step": 24755 }, { "epoch": 11.673738802451673, "grad_norm": 0.00136595091316849, "learning_rate": 0.09523070312695978, "loss": 0.2253, "num_input_tokens_seen": 21441504, "step": 24760 }, { "epoch": 11.676096181046676, "grad_norm": 0.0008873876067809761, "learning_rate": 0.09517586945326863, "loss": 0.2628, "num_input_tokens_seen": 21445232, "step": 24765 }, { "epoch": 11.678453559641678, "grad_norm": 0.0008397348574362695, "learning_rate": 0.0951210442341473, "loss": 0.1787, "num_input_tokens_seen": 21449280, "step": 24770 }, { "epoch": 11.68081093823668, "grad_norm": 0.0010120074730366468, "learning_rate": 0.09506622747805066, "loss": 0.2093, "num_input_tokens_seen": 21453200, "step": 24775 }, { "epoch": 11.683168316831683, "grad_norm": 0.0016064882511273026, "learning_rate": 0.09501141919343203, "loss": 0.2644, "num_input_tokens_seen": 21457888, "step": 24780 }, { "epoch": 11.685525695426685, "grad_norm": 0.0015735324705019593, "learning_rate": 0.09495661938874361, "loss": 0.1943, "num_input_tokens_seen": 21462176, "step": 24785 }, { "epoch": 11.687883074021688, "grad_norm": 0.000923692889045924, "learning_rate": 0.0949018280724362, "loss": 0.2391, "num_input_tokens_seen": 21466336, "step": 24790 }, { "epoch": 11.69024045261669, "grad_norm": 0.0018216018797829747, "learning_rate": 0.09484704525295934, "loss": 0.2168, "num_input_tokens_seen": 21471296, "step": 24795 }, { "epoch": 11.692597831211692, "grad_norm": 0.0009637311450205743, "learning_rate": 0.09479227093876112, "loss": 0.2261, "num_input_tokens_seen": 21475184, "step": 24800 }, { "epoch": 11.692597831211692, "eval_loss": 0.2172553837299347, "eval_runtime": 21.9208, "eval_samples_per_second": 43.019, "eval_steps_per_second": 21.532, "num_input_tokens_seen": 21475184, "step": 24800 }, { "epoch": 11.694955209806695, "grad_norm": 0.0010780264856293797, "learning_rate": 0.0947375051382886, "loss": 0.243, "num_input_tokens_seen": 21480320, "step": 24805 }, { "epoch": 11.697312588401697, "grad_norm": 0.001218119403347373, "learning_rate": 0.09468274785998718, "loss": 0.2172, "num_input_tokens_seen": 21484800, "step": 24810 }, { "epoch": 11.6996699669967, "grad_norm": 0.002056143945083022, "learning_rate": 0.09462799911230127, "loss": 0.2335, "num_input_tokens_seen": 21489104, "step": 24815 }, { "epoch": 11.702027345591702, "grad_norm": 0.0011257024016231298, "learning_rate": 0.0945732589036737, "loss": 0.2094, "num_input_tokens_seen": 21492928, "step": 24820 }, { "epoch": 11.704384724186705, "grad_norm": 0.0016977416817098856, "learning_rate": 0.09451852724254614, "loss": 0.2103, "num_input_tokens_seen": 21497456, "step": 24825 }, { "epoch": 11.706742102781707, "grad_norm": 0.0009925810154527426, "learning_rate": 0.09446380413735894, "loss": 0.2033, "num_input_tokens_seen": 21501440, "step": 24830 }, { "epoch": 11.70909948137671, "grad_norm": 0.0012298488290980458, "learning_rate": 0.09440908959655099, "loss": 0.221, "num_input_tokens_seen": 21506640, "step": 24835 }, { "epoch": 11.711456859971712, "grad_norm": 0.0011750245466828346, "learning_rate": 0.09435438362856004, "loss": 0.1949, "num_input_tokens_seen": 21511536, "step": 24840 }, { "epoch": 11.713814238566714, "grad_norm": 0.001713107223622501, "learning_rate": 0.0942996862418225, "loss": 0.222, "num_input_tokens_seen": 21516192, "step": 24845 }, { "epoch": 11.716171617161717, "grad_norm": 0.002661296399310231, "learning_rate": 0.09424499744477322, "loss": 0.2352, "num_input_tokens_seen": 21520032, "step": 24850 }, { "epoch": 11.718528995756719, "grad_norm": 0.0015966345090419054, "learning_rate": 0.09419031724584608, "loss": 0.1575, "num_input_tokens_seen": 21524368, "step": 24855 }, { "epoch": 11.720886374351721, "grad_norm": 0.0010222073178738356, "learning_rate": 0.09413564565347331, "loss": 0.1602, "num_input_tokens_seen": 21528832, "step": 24860 }, { "epoch": 11.723243752946724, "grad_norm": 0.0017616093391552567, "learning_rate": 0.094080982676086, "loss": 0.1948, "num_input_tokens_seen": 21533328, "step": 24865 }, { "epoch": 11.725601131541726, "grad_norm": 0.0010742853628471494, "learning_rate": 0.09402632832211395, "loss": 0.1844, "num_input_tokens_seen": 21537072, "step": 24870 }, { "epoch": 11.727958510136729, "grad_norm": 0.0038774514105170965, "learning_rate": 0.09397168259998541, "loss": 0.2809, "num_input_tokens_seen": 21541584, "step": 24875 }, { "epoch": 11.730315888731731, "grad_norm": 0.0019322356674820185, "learning_rate": 0.09391704551812759, "loss": 0.2217, "num_input_tokens_seen": 21545728, "step": 24880 }, { "epoch": 11.732673267326733, "grad_norm": 0.001406824099831283, "learning_rate": 0.09386241708496605, "loss": 0.2386, "num_input_tokens_seen": 21549536, "step": 24885 }, { "epoch": 11.735030645921736, "grad_norm": 0.001991902943700552, "learning_rate": 0.09380779730892527, "loss": 0.1957, "num_input_tokens_seen": 21554736, "step": 24890 }, { "epoch": 11.737388024516738, "grad_norm": 0.001078232191503048, "learning_rate": 0.09375318619842836, "loss": 0.2059, "num_input_tokens_seen": 21558768, "step": 24895 }, { "epoch": 11.73974540311174, "grad_norm": 0.0013251790078356862, "learning_rate": 0.09369858376189696, "loss": 0.2173, "num_input_tokens_seen": 21563232, "step": 24900 }, { "epoch": 11.742102781706743, "grad_norm": 0.0011969280894845724, "learning_rate": 0.09364399000775143, "loss": 0.1949, "num_input_tokens_seen": 21567040, "step": 24905 }, { "epoch": 11.744460160301745, "grad_norm": 0.0009954662527889013, "learning_rate": 0.09358940494441093, "loss": 0.1923, "num_input_tokens_seen": 21571392, "step": 24910 }, { "epoch": 11.746817538896746, "grad_norm": 0.0011927669402211905, "learning_rate": 0.09353482858029301, "loss": 0.2354, "num_input_tokens_seen": 21576016, "step": 24915 }, { "epoch": 11.749174917491748, "grad_norm": 0.0015159152681007981, "learning_rate": 0.09348026092381419, "loss": 0.2365, "num_input_tokens_seen": 21579920, "step": 24920 }, { "epoch": 11.75153229608675, "grad_norm": 0.0022116571199148893, "learning_rate": 0.09342570198338931, "loss": 0.2025, "num_input_tokens_seen": 21584592, "step": 24925 }, { "epoch": 11.753889674681753, "grad_norm": 0.0014214979019016027, "learning_rate": 0.0933711517674322, "loss": 0.1741, "num_input_tokens_seen": 21588944, "step": 24930 }, { "epoch": 11.756247053276756, "grad_norm": 0.0016390391392633319, "learning_rate": 0.09331661028435513, "loss": 0.1837, "num_input_tokens_seen": 21593808, "step": 24935 }, { "epoch": 11.758604431871758, "grad_norm": 0.00120930268894881, "learning_rate": 0.09326207754256909, "loss": 0.2079, "num_input_tokens_seen": 21598560, "step": 24940 }, { "epoch": 11.76096181046676, "grad_norm": 0.0008626120397821069, "learning_rate": 0.09320755355048366, "loss": 0.2238, "num_input_tokens_seen": 21602944, "step": 24945 }, { "epoch": 11.763319189061763, "grad_norm": 0.0011741317575797439, "learning_rate": 0.09315303831650722, "loss": 0.2793, "num_input_tokens_seen": 21607392, "step": 24950 }, { "epoch": 11.765676567656765, "grad_norm": 0.0017197990091517568, "learning_rate": 0.09309853184904661, "loss": 0.1757, "num_input_tokens_seen": 21611680, "step": 24955 }, { "epoch": 11.768033946251768, "grad_norm": 0.0014889169251546264, "learning_rate": 0.09304403415650753, "loss": 0.2059, "num_input_tokens_seen": 21615632, "step": 24960 }, { "epoch": 11.77039132484677, "grad_norm": 0.0012549147941172123, "learning_rate": 0.09298954524729405, "loss": 0.2095, "num_input_tokens_seen": 21619216, "step": 24965 }, { "epoch": 11.772748703441772, "grad_norm": 0.0012689559953287244, "learning_rate": 0.09293506512980916, "loss": 0.2186, "num_input_tokens_seen": 21623808, "step": 24970 }, { "epoch": 11.775106082036775, "grad_norm": 0.001233382266946137, "learning_rate": 0.0928805938124544, "loss": 0.2194, "num_input_tokens_seen": 21627888, "step": 24975 }, { "epoch": 11.777463460631777, "grad_norm": 0.0013282728614285588, "learning_rate": 0.09282613130362982, "loss": 0.1779, "num_input_tokens_seen": 21632256, "step": 24980 }, { "epoch": 11.77982083922678, "grad_norm": 0.001700561260804534, "learning_rate": 0.09277167761173427, "loss": 0.2652, "num_input_tokens_seen": 21636576, "step": 24985 }, { "epoch": 11.782178217821782, "grad_norm": 0.0013899715850129724, "learning_rate": 0.0927172327451653, "loss": 0.2131, "num_input_tokens_seen": 21640560, "step": 24990 }, { "epoch": 11.784535596416784, "grad_norm": 0.0010246846359223127, "learning_rate": 0.09266279671231882, "loss": 0.1575, "num_input_tokens_seen": 21644864, "step": 24995 }, { "epoch": 11.786892975011787, "grad_norm": 0.0012227657716721296, "learning_rate": 0.09260836952158967, "loss": 0.1897, "num_input_tokens_seen": 21649744, "step": 25000 }, { "epoch": 11.786892975011787, "eval_loss": 0.2155287116765976, "eval_runtime": 21.8904, "eval_samples_per_second": 43.078, "eval_steps_per_second": 21.562, "num_input_tokens_seen": 21649744, "step": 25000 }, { "epoch": 11.78925035360679, "grad_norm": 0.002352917566895485, "learning_rate": 0.09255395118137114, "loss": 0.2355, "num_input_tokens_seen": 21653408, "step": 25005 }, { "epoch": 11.791607732201792, "grad_norm": 0.0028916732408106327, "learning_rate": 0.09249954170005527, "loss": 0.2129, "num_input_tokens_seen": 21657904, "step": 25010 }, { "epoch": 11.793965110796794, "grad_norm": 0.0015345962019637227, "learning_rate": 0.0924451410860327, "loss": 0.1601, "num_input_tokens_seen": 21662016, "step": 25015 }, { "epoch": 11.796322489391796, "grad_norm": 0.0010948312701657414, "learning_rate": 0.09239074934769258, "loss": 0.2399, "num_input_tokens_seen": 21665904, "step": 25020 }, { "epoch": 11.798679867986799, "grad_norm": 0.0016351558733731508, "learning_rate": 0.09233636649342288, "loss": 0.2073, "num_input_tokens_seen": 21670480, "step": 25025 }, { "epoch": 11.801037246581801, "grad_norm": 0.002235376974567771, "learning_rate": 0.09228199253161017, "loss": 0.188, "num_input_tokens_seen": 21674928, "step": 25030 }, { "epoch": 11.803394625176804, "grad_norm": 0.0016001559561118484, "learning_rate": 0.09222762747063949, "loss": 0.219, "num_input_tokens_seen": 21678784, "step": 25035 }, { "epoch": 11.805752003771806, "grad_norm": 0.0015034000389277935, "learning_rate": 0.09217327131889473, "loss": 0.2214, "num_input_tokens_seen": 21682832, "step": 25040 }, { "epoch": 11.808109382366808, "grad_norm": 0.0022226108703762293, "learning_rate": 0.09211892408475818, "loss": 0.1985, "num_input_tokens_seen": 21687184, "step": 25045 }, { "epoch": 11.81046676096181, "grad_norm": 0.0018172124400734901, "learning_rate": 0.09206458577661089, "loss": 0.2302, "num_input_tokens_seen": 21691424, "step": 25050 }, { "epoch": 11.812824139556813, "grad_norm": 0.001658797962591052, "learning_rate": 0.09201025640283263, "loss": 0.2051, "num_input_tokens_seen": 21695584, "step": 25055 }, { "epoch": 11.815181518151816, "grad_norm": 0.0014435937628149986, "learning_rate": 0.09195593597180148, "loss": 0.1922, "num_input_tokens_seen": 21699520, "step": 25060 }, { "epoch": 11.817538896746818, "grad_norm": 0.0020107757300138474, "learning_rate": 0.09190162449189444, "loss": 0.2924, "num_input_tokens_seen": 21703520, "step": 25065 }, { "epoch": 11.81989627534182, "grad_norm": 0.0017625651089474559, "learning_rate": 0.09184732197148705, "loss": 0.1979, "num_input_tokens_seen": 21708512, "step": 25070 }, { "epoch": 11.822253653936823, "grad_norm": 0.0014290951658040285, "learning_rate": 0.09179302841895343, "loss": 0.2386, "num_input_tokens_seen": 21712752, "step": 25075 }, { "epoch": 11.824611032531825, "grad_norm": 0.001372298807837069, "learning_rate": 0.09173874384266625, "loss": 0.1671, "num_input_tokens_seen": 21717680, "step": 25080 }, { "epoch": 11.826968411126828, "grad_norm": 0.0011316505260765553, "learning_rate": 0.09168446825099695, "loss": 0.2017, "num_input_tokens_seen": 21721776, "step": 25085 }, { "epoch": 11.82932578972183, "grad_norm": 0.0014601885341107845, "learning_rate": 0.09163020165231545, "loss": 0.2058, "num_input_tokens_seen": 21726112, "step": 25090 }, { "epoch": 11.831683168316832, "grad_norm": 0.0016520641511306167, "learning_rate": 0.09157594405499044, "loss": 0.1986, "num_input_tokens_seen": 21730416, "step": 25095 }, { "epoch": 11.834040546911835, "grad_norm": 0.0016342884628102183, "learning_rate": 0.09152169546738899, "loss": 0.2146, "num_input_tokens_seen": 21734608, "step": 25100 }, { "epoch": 11.836397925506837, "grad_norm": 0.0018079245928674936, "learning_rate": 0.09146745589787698, "loss": 0.2024, "num_input_tokens_seen": 21738752, "step": 25105 }, { "epoch": 11.838755304101838, "grad_norm": 0.0017751138657331467, "learning_rate": 0.09141322535481891, "loss": 0.2382, "num_input_tokens_seen": 21743168, "step": 25110 }, { "epoch": 11.841112682696842, "grad_norm": 0.001870118547230959, "learning_rate": 0.0913590038465777, "loss": 0.2084, "num_input_tokens_seen": 21747824, "step": 25115 }, { "epoch": 11.843470061291843, "grad_norm": 0.00155125861056149, "learning_rate": 0.09130479138151505, "loss": 0.1855, "num_input_tokens_seen": 21751952, "step": 25120 }, { "epoch": 11.845827439886845, "grad_norm": 0.001150398631580174, "learning_rate": 0.09125058796799114, "loss": 0.1948, "num_input_tokens_seen": 21756544, "step": 25125 }, { "epoch": 11.848184818481847, "grad_norm": 0.0018827456515282393, "learning_rate": 0.09119639361436485, "loss": 0.2076, "num_input_tokens_seen": 21760368, "step": 25130 }, { "epoch": 11.85054219707685, "grad_norm": 0.0027398993261158466, "learning_rate": 0.09114220832899368, "loss": 0.234, "num_input_tokens_seen": 21765008, "step": 25135 }, { "epoch": 11.852899575671852, "grad_norm": 0.003395430976524949, "learning_rate": 0.0910880321202336, "loss": 0.2388, "num_input_tokens_seen": 21769824, "step": 25140 }, { "epoch": 11.855256954266855, "grad_norm": 0.0017761178314685822, "learning_rate": 0.09103386499643933, "loss": 0.25, "num_input_tokens_seen": 21773728, "step": 25145 }, { "epoch": 11.857614332861857, "grad_norm": 0.0029723462648689747, "learning_rate": 0.09097970696596407, "loss": 0.2152, "num_input_tokens_seen": 21778016, "step": 25150 }, { "epoch": 11.85997171145686, "grad_norm": 0.0017005990957841277, "learning_rate": 0.09092555803715971, "loss": 0.2434, "num_input_tokens_seen": 21782000, "step": 25155 }, { "epoch": 11.862329090051862, "grad_norm": 0.001295386697165668, "learning_rate": 0.0908714182183767, "loss": 0.2056, "num_input_tokens_seen": 21785840, "step": 25160 }, { "epoch": 11.864686468646864, "grad_norm": 0.0013133526081219316, "learning_rate": 0.090817287517964, "loss": 0.2227, "num_input_tokens_seen": 21789680, "step": 25165 }, { "epoch": 11.867043847241867, "grad_norm": 0.001896976842544973, "learning_rate": 0.09076316594426931, "loss": 0.2228, "num_input_tokens_seen": 21794352, "step": 25170 }, { "epoch": 11.869401225836869, "grad_norm": 0.0010368068469688296, "learning_rate": 0.09070905350563888, "loss": 0.2121, "num_input_tokens_seen": 21798640, "step": 25175 }, { "epoch": 11.871758604431871, "grad_norm": 0.0020882452372461557, "learning_rate": 0.09065495021041745, "loss": 0.1843, "num_input_tokens_seen": 21803024, "step": 25180 }, { "epoch": 11.874115983026874, "grad_norm": 0.0016612487379461527, "learning_rate": 0.09060085606694851, "loss": 0.2107, "num_input_tokens_seen": 21806640, "step": 25185 }, { "epoch": 11.876473361621876, "grad_norm": 0.0016265788581222296, "learning_rate": 0.09054677108357405, "loss": 0.2155, "num_input_tokens_seen": 21811232, "step": 25190 }, { "epoch": 11.878830740216879, "grad_norm": 0.001999874599277973, "learning_rate": 0.09049269526863457, "loss": 0.2213, "num_input_tokens_seen": 21815152, "step": 25195 }, { "epoch": 11.881188118811881, "grad_norm": 0.0024321454111486673, "learning_rate": 0.09043862863046935, "loss": 0.208, "num_input_tokens_seen": 21819728, "step": 25200 }, { "epoch": 11.881188118811881, "eval_loss": 0.2112353891134262, "eval_runtime": 21.9807, "eval_samples_per_second": 42.901, "eval_steps_per_second": 21.473, "num_input_tokens_seen": 21819728, "step": 25200 }, { "epoch": 11.883545497406883, "grad_norm": 0.0018754929769784212, "learning_rate": 0.09038457117741602, "loss": 0.2478, "num_input_tokens_seen": 21823952, "step": 25205 }, { "epoch": 11.885902876001886, "grad_norm": 0.002216377994045615, "learning_rate": 0.09033052291781099, "loss": 0.2261, "num_input_tokens_seen": 21827984, "step": 25210 }, { "epoch": 11.888260254596888, "grad_norm": 0.0019501898204907775, "learning_rate": 0.09027648385998926, "loss": 0.1974, "num_input_tokens_seen": 21832000, "step": 25215 }, { "epoch": 11.89061763319189, "grad_norm": 0.0015178475296124816, "learning_rate": 0.09022245401228417, "loss": 0.1926, "num_input_tokens_seen": 21836176, "step": 25220 }, { "epoch": 11.892975011786893, "grad_norm": 0.0013542333617806435, "learning_rate": 0.09016843338302792, "loss": 0.236, "num_input_tokens_seen": 21840848, "step": 25225 }, { "epoch": 11.895332390381895, "grad_norm": 0.0013553169555962086, "learning_rate": 0.09011442198055115, "loss": 0.2098, "num_input_tokens_seen": 21845344, "step": 25230 }, { "epoch": 11.897689768976898, "grad_norm": 0.0011615008115768433, "learning_rate": 0.09006041981318305, "loss": 0.2171, "num_input_tokens_seen": 21850208, "step": 25235 }, { "epoch": 11.9000471475719, "grad_norm": 0.000910402974113822, "learning_rate": 0.09000642688925149, "loss": 0.1922, "num_input_tokens_seen": 21854240, "step": 25240 }, { "epoch": 11.902404526166903, "grad_norm": 0.004234171472489834, "learning_rate": 0.0899524432170828, "loss": 0.22, "num_input_tokens_seen": 21858960, "step": 25245 }, { "epoch": 11.904761904761905, "grad_norm": 0.0010156435891985893, "learning_rate": 0.08989846880500196, "loss": 0.2259, "num_input_tokens_seen": 21863472, "step": 25250 }, { "epoch": 11.907119283356908, "grad_norm": 0.0018457279074937105, "learning_rate": 0.08984450366133256, "loss": 0.1856, "num_input_tokens_seen": 21867632, "step": 25255 }, { "epoch": 11.90947666195191, "grad_norm": 0.0013895700685679913, "learning_rate": 0.08979054779439664, "loss": 0.2409, "num_input_tokens_seen": 21871760, "step": 25260 }, { "epoch": 11.911834040546912, "grad_norm": 0.0027131098322570324, "learning_rate": 0.08973660121251485, "loss": 0.2499, "num_input_tokens_seen": 21876096, "step": 25265 }, { "epoch": 11.914191419141915, "grad_norm": 0.0011141665745526552, "learning_rate": 0.08968266392400655, "loss": 0.2126, "num_input_tokens_seen": 21880048, "step": 25270 }, { "epoch": 11.916548797736917, "grad_norm": 0.0009513244731351733, "learning_rate": 0.0896287359371894, "loss": 0.2199, "num_input_tokens_seen": 21883696, "step": 25275 }, { "epoch": 11.91890617633192, "grad_norm": 0.0018973363330587745, "learning_rate": 0.08957481726037989, "loss": 0.1932, "num_input_tokens_seen": 21888368, "step": 25280 }, { "epoch": 11.921263554926922, "grad_norm": 0.0013521626824513078, "learning_rate": 0.08952090790189286, "loss": 0.1854, "num_input_tokens_seen": 21892976, "step": 25285 }, { "epoch": 11.923620933521924, "grad_norm": 0.0014583324082195759, "learning_rate": 0.08946700787004187, "loss": 0.2366, "num_input_tokens_seen": 21896976, "step": 25290 }, { "epoch": 11.925978312116927, "grad_norm": 0.0013630498433485627, "learning_rate": 0.08941311717313899, "loss": 0.1757, "num_input_tokens_seen": 21900672, "step": 25295 }, { "epoch": 11.92833569071193, "grad_norm": 0.0014233427355065942, "learning_rate": 0.08935923581949483, "loss": 0.1879, "num_input_tokens_seen": 21904576, "step": 25300 }, { "epoch": 11.930693069306932, "grad_norm": 0.0018050877843052149, "learning_rate": 0.0893053638174185, "loss": 0.2378, "num_input_tokens_seen": 21908784, "step": 25305 }, { "epoch": 11.933050447901934, "grad_norm": 0.0016010937979444861, "learning_rate": 0.0892515011752179, "loss": 0.2355, "num_input_tokens_seen": 21913808, "step": 25310 }, { "epoch": 11.935407826496935, "grad_norm": 0.002102271653711796, "learning_rate": 0.08919764790119918, "loss": 0.2385, "num_input_tokens_seen": 21917824, "step": 25315 }, { "epoch": 11.937765205091937, "grad_norm": 0.001730751246213913, "learning_rate": 0.08914380400366727, "loss": 0.2352, "num_input_tokens_seen": 21922464, "step": 25320 }, { "epoch": 11.94012258368694, "grad_norm": 0.0031019810121506453, "learning_rate": 0.08908996949092551, "loss": 0.2189, "num_input_tokens_seen": 21928352, "step": 25325 }, { "epoch": 11.942479962281942, "grad_norm": 0.001600078190676868, "learning_rate": 0.08903614437127592, "loss": 0.2136, "num_input_tokens_seen": 21932704, "step": 25330 }, { "epoch": 11.944837340876944, "grad_norm": 0.0011619935976341367, "learning_rate": 0.088982328653019, "loss": 0.2252, "num_input_tokens_seen": 21937104, "step": 25335 }, { "epoch": 11.947194719471947, "grad_norm": 0.0013469844125211239, "learning_rate": 0.0889285223444538, "loss": 0.2136, "num_input_tokens_seen": 21940704, "step": 25340 }, { "epoch": 11.949552098066949, "grad_norm": 0.0015705502592027187, "learning_rate": 0.08887472545387787, "loss": 0.1868, "num_input_tokens_seen": 21944544, "step": 25345 }, { "epoch": 11.951909476661951, "grad_norm": 0.0013550722505897284, "learning_rate": 0.08882093798958751, "loss": 0.2047, "num_input_tokens_seen": 21948720, "step": 25350 }, { "epoch": 11.954266855256954, "grad_norm": 0.0015780176036059856, "learning_rate": 0.08876715995987726, "loss": 0.2067, "num_input_tokens_seen": 21953712, "step": 25355 }, { "epoch": 11.956624233851956, "grad_norm": 0.0014395128237083554, "learning_rate": 0.08871339137304052, "loss": 0.2557, "num_input_tokens_seen": 21957552, "step": 25360 }, { "epoch": 11.958981612446959, "grad_norm": 0.001709404750727117, "learning_rate": 0.0886596322373689, "loss": 0.2524, "num_input_tokens_seen": 21962592, "step": 25365 }, { "epoch": 11.961338991041961, "grad_norm": 0.002401844598352909, "learning_rate": 0.08860588256115293, "loss": 0.2287, "num_input_tokens_seen": 21966752, "step": 25370 }, { "epoch": 11.963696369636963, "grad_norm": 0.0016225959407165647, "learning_rate": 0.0885521423526814, "loss": 0.2177, "num_input_tokens_seen": 21971136, "step": 25375 }, { "epoch": 11.966053748231966, "grad_norm": 0.0015869117341935635, "learning_rate": 0.08849841162024165, "loss": 0.1922, "num_input_tokens_seen": 21976112, "step": 25380 }, { "epoch": 11.968411126826968, "grad_norm": 0.0013724610907956958, "learning_rate": 0.08844469037211973, "loss": 0.2455, "num_input_tokens_seen": 21980672, "step": 25385 }, { "epoch": 11.97076850542197, "grad_norm": 0.0011946968734264374, "learning_rate": 0.08839097861660014, "loss": 0.1961, "num_input_tokens_seen": 21985824, "step": 25390 }, { "epoch": 11.973125884016973, "grad_norm": 0.0021944670006632805, "learning_rate": 0.08833727636196585, "loss": 0.2153, "num_input_tokens_seen": 21989232, "step": 25395 }, { "epoch": 11.975483262611975, "grad_norm": 0.002419522497802973, "learning_rate": 0.08828358361649848, "loss": 0.1995, "num_input_tokens_seen": 21993120, "step": 25400 }, { "epoch": 11.975483262611975, "eval_loss": 0.2142464965581894, "eval_runtime": 21.906, "eval_samples_per_second": 43.048, "eval_steps_per_second": 21.547, "num_input_tokens_seen": 21993120, "step": 25400 }, { "epoch": 11.977840641206978, "grad_norm": 0.001564336707815528, "learning_rate": 0.08822990038847807, "loss": 0.2166, "num_input_tokens_seen": 21997584, "step": 25405 }, { "epoch": 11.98019801980198, "grad_norm": 0.00197174702771008, "learning_rate": 0.08817622668618325, "loss": 0.1952, "num_input_tokens_seen": 22001488, "step": 25410 }, { "epoch": 11.982555398396983, "grad_norm": 0.0013819426530972123, "learning_rate": 0.08812256251789125, "loss": 0.1971, "num_input_tokens_seen": 22006192, "step": 25415 }, { "epoch": 11.984912776991985, "grad_norm": 0.0014173618983477354, "learning_rate": 0.08806890789187766, "loss": 0.2066, "num_input_tokens_seen": 22010176, "step": 25420 }, { "epoch": 11.987270155586987, "grad_norm": 0.001422629109583795, "learning_rate": 0.08801526281641672, "loss": 0.2519, "num_input_tokens_seen": 22013968, "step": 25425 }, { "epoch": 11.98962753418199, "grad_norm": 0.0019721915014088154, "learning_rate": 0.0879616272997813, "loss": 0.2267, "num_input_tokens_seen": 22017920, "step": 25430 }, { "epoch": 11.991984912776992, "grad_norm": 0.0023337099701166153, "learning_rate": 0.08790800135024247, "loss": 0.2685, "num_input_tokens_seen": 22021968, "step": 25435 }, { "epoch": 11.994342291371995, "grad_norm": 0.0019517289474606514, "learning_rate": 0.08785438497607023, "loss": 0.2032, "num_input_tokens_seen": 22026832, "step": 25440 }, { "epoch": 11.996699669966997, "grad_norm": 0.001907902187667787, "learning_rate": 0.08780077818553277, "loss": 0.1898, "num_input_tokens_seen": 22031008, "step": 25445 }, { "epoch": 11.999057048562, "grad_norm": 0.0009369938634335995, "learning_rate": 0.0877471809868969, "loss": 0.2171, "num_input_tokens_seen": 22034768, "step": 25450 }, { "epoch": 12.001414427157002, "grad_norm": 0.0015117436414584517, "learning_rate": 0.08769359338842811, "loss": 0.2495, "num_input_tokens_seen": 22039216, "step": 25455 }, { "epoch": 12.003771805752004, "grad_norm": 0.001222338411025703, "learning_rate": 0.08764001539839016, "loss": 0.1734, "num_input_tokens_seen": 22043216, "step": 25460 }, { "epoch": 12.006129184347007, "grad_norm": 0.0009650439023971558, "learning_rate": 0.08758644702504548, "loss": 0.181, "num_input_tokens_seen": 22047968, "step": 25465 }, { "epoch": 12.008486562942009, "grad_norm": 0.0012462990125641227, "learning_rate": 0.0875328882766551, "loss": 0.2136, "num_input_tokens_seen": 22052000, "step": 25470 }, { "epoch": 12.010843941537011, "grad_norm": 0.004697110503911972, "learning_rate": 0.08747933916147828, "loss": 0.2, "num_input_tokens_seen": 22055824, "step": 25475 }, { "epoch": 12.013201320132014, "grad_norm": 0.0018836923409253359, "learning_rate": 0.0874257996877731, "loss": 0.1673, "num_input_tokens_seen": 22060128, "step": 25480 }, { "epoch": 12.015558698727016, "grad_norm": 0.0012637325562536716, "learning_rate": 0.08737226986379593, "loss": 0.1933, "num_input_tokens_seen": 22065184, "step": 25485 }, { "epoch": 12.017916077322019, "grad_norm": 0.0012028979836031795, "learning_rate": 0.08731874969780173, "loss": 0.1829, "num_input_tokens_seen": 22068912, "step": 25490 }, { "epoch": 12.020273455917021, "grad_norm": 0.001387283205986023, "learning_rate": 0.08726523919804412, "loss": 0.2691, "num_input_tokens_seen": 22072992, "step": 25495 }, { "epoch": 12.022630834512023, "grad_norm": 0.0016071537975221872, "learning_rate": 0.08721173837277492, "loss": 0.198, "num_input_tokens_seen": 22077312, "step": 25500 }, { "epoch": 12.024988213107026, "grad_norm": 0.002463748212903738, "learning_rate": 0.08715824723024479, "loss": 0.2326, "num_input_tokens_seen": 22081488, "step": 25505 }, { "epoch": 12.027345591702028, "grad_norm": 0.001992699922993779, "learning_rate": 0.08710476577870258, "loss": 0.2123, "num_input_tokens_seen": 22085136, "step": 25510 }, { "epoch": 12.029702970297029, "grad_norm": 0.0016763792373239994, "learning_rate": 0.08705129402639587, "loss": 0.203, "num_input_tokens_seen": 22089520, "step": 25515 }, { "epoch": 12.032060348892031, "grad_norm": 0.0013211703626438975, "learning_rate": 0.08699783198157078, "loss": 0.2044, "num_input_tokens_seen": 22094560, "step": 25520 }, { "epoch": 12.034417727487034, "grad_norm": 0.001403424539603293, "learning_rate": 0.08694437965247163, "loss": 0.2387, "num_input_tokens_seen": 22098560, "step": 25525 }, { "epoch": 12.036775106082036, "grad_norm": 0.001501058111898601, "learning_rate": 0.08689093704734165, "loss": 0.2034, "num_input_tokens_seen": 22103328, "step": 25530 }, { "epoch": 12.039132484677038, "grad_norm": 0.0016912651481106877, "learning_rate": 0.08683750417442222, "loss": 0.235, "num_input_tokens_seen": 22106880, "step": 25535 }, { "epoch": 12.04148986327204, "grad_norm": 0.0010738568380475044, "learning_rate": 0.08678408104195334, "loss": 0.2216, "num_input_tokens_seen": 22111536, "step": 25540 }, { "epoch": 12.043847241867043, "grad_norm": 0.0014230493688955903, "learning_rate": 0.08673066765817365, "loss": 0.2117, "num_input_tokens_seen": 22115968, "step": 25545 }, { "epoch": 12.046204620462046, "grad_norm": 0.0017954371869564056, "learning_rate": 0.08667726403132005, "loss": 0.218, "num_input_tokens_seen": 22119488, "step": 25550 }, { "epoch": 12.048561999057048, "grad_norm": 0.0016354575054720044, "learning_rate": 0.0866238701696281, "loss": 0.1977, "num_input_tokens_seen": 22123088, "step": 25555 }, { "epoch": 12.05091937765205, "grad_norm": 0.0012881620787084103, "learning_rate": 0.08657048608133185, "loss": 0.2345, "num_input_tokens_seen": 22128080, "step": 25560 }, { "epoch": 12.053276756247053, "grad_norm": 0.0012136102886870503, "learning_rate": 0.08651711177466369, "loss": 0.1891, "num_input_tokens_seen": 22132560, "step": 25565 }, { "epoch": 12.055634134842055, "grad_norm": 0.0014696959406137466, "learning_rate": 0.08646374725785466, "loss": 0.1955, "num_input_tokens_seen": 22137648, "step": 25570 }, { "epoch": 12.057991513437058, "grad_norm": 0.0016685780137777328, "learning_rate": 0.08641039253913434, "loss": 0.2461, "num_input_tokens_seen": 22141504, "step": 25575 }, { "epoch": 12.06034889203206, "grad_norm": 0.008065101690590382, "learning_rate": 0.08635704762673052, "loss": 0.1963, "num_input_tokens_seen": 22145664, "step": 25580 }, { "epoch": 12.062706270627062, "grad_norm": 0.0026334684807807207, "learning_rate": 0.08630371252886981, "loss": 0.2235, "num_input_tokens_seen": 22150448, "step": 25585 }, { "epoch": 12.065063649222065, "grad_norm": 0.0020950455218553543, "learning_rate": 0.08625038725377704, "loss": 0.1953, "num_input_tokens_seen": 22156480, "step": 25590 }, { "epoch": 12.067421027817067, "grad_norm": 0.002943019149824977, "learning_rate": 0.08619707180967566, "loss": 0.2224, "num_input_tokens_seen": 22160016, "step": 25595 }, { "epoch": 12.06977840641207, "grad_norm": 0.0020576019305735826, "learning_rate": 0.08614376620478768, "loss": 0.2066, "num_input_tokens_seen": 22164624, "step": 25600 }, { "epoch": 12.06977840641207, "eval_loss": 0.21620330214500427, "eval_runtime": 21.8809, "eval_samples_per_second": 43.097, "eval_steps_per_second": 21.571, "num_input_tokens_seen": 22164624, "step": 25600 }, { "epoch": 12.072135785007072, "grad_norm": 0.0020870338194072247, "learning_rate": 0.08609047044733344, "loss": 0.2072, "num_input_tokens_seen": 22168592, "step": 25605 }, { "epoch": 12.074493163602074, "grad_norm": 0.0016808483051136136, "learning_rate": 0.08603718454553168, "loss": 0.1757, "num_input_tokens_seen": 22172672, "step": 25610 }, { "epoch": 12.076850542197077, "grad_norm": 0.002127154730260372, "learning_rate": 0.08598390850759997, "loss": 0.2198, "num_input_tokens_seen": 22177136, "step": 25615 }, { "epoch": 12.07920792079208, "grad_norm": 0.0019992238376289606, "learning_rate": 0.08593064234175397, "loss": 0.2231, "num_input_tokens_seen": 22181696, "step": 25620 }, { "epoch": 12.081565299387082, "grad_norm": 0.0017763111973181367, "learning_rate": 0.08587738605620815, "loss": 0.1796, "num_input_tokens_seen": 22186032, "step": 25625 }, { "epoch": 12.083922677982084, "grad_norm": 0.0019198667723685503, "learning_rate": 0.08582413965917512, "loss": 0.2118, "num_input_tokens_seen": 22190736, "step": 25630 }, { "epoch": 12.086280056577086, "grad_norm": 0.0022524837404489517, "learning_rate": 0.08577090315886628, "loss": 0.2253, "num_input_tokens_seen": 22195280, "step": 25635 }, { "epoch": 12.088637435172089, "grad_norm": 0.0020613379310816526, "learning_rate": 0.08571767656349136, "loss": 0.1935, "num_input_tokens_seen": 22198944, "step": 25640 }, { "epoch": 12.090994813767091, "grad_norm": 0.0033502918668091297, "learning_rate": 0.08566445988125847, "loss": 0.1846, "num_input_tokens_seen": 22203264, "step": 25645 }, { "epoch": 12.093352192362094, "grad_norm": 0.002366072963923216, "learning_rate": 0.08561125312037436, "loss": 0.1669, "num_input_tokens_seen": 22208016, "step": 25650 }, { "epoch": 12.095709570957096, "grad_norm": 0.0017855376936495304, "learning_rate": 0.08555805628904424, "loss": 0.1905, "num_input_tokens_seen": 22212304, "step": 25655 }, { "epoch": 12.098066949552098, "grad_norm": 0.003180636325851083, "learning_rate": 0.08550486939547161, "loss": 0.2064, "num_input_tokens_seen": 22216672, "step": 25660 }, { "epoch": 12.100424328147101, "grad_norm": 0.0029660738073289394, "learning_rate": 0.08545169244785869, "loss": 0.2071, "num_input_tokens_seen": 22221920, "step": 25665 }, { "epoch": 12.102781706742103, "grad_norm": 0.0028700476977974176, "learning_rate": 0.08539852545440589, "loss": 0.2167, "num_input_tokens_seen": 22226640, "step": 25670 }, { "epoch": 12.105139085337106, "grad_norm": 0.0026035821065306664, "learning_rate": 0.08534536842331235, "loss": 0.2176, "num_input_tokens_seen": 22230672, "step": 25675 }, { "epoch": 12.107496463932108, "grad_norm": 0.0015027793124318123, "learning_rate": 0.08529222136277545, "loss": 0.2118, "num_input_tokens_seen": 22235568, "step": 25680 }, { "epoch": 12.10985384252711, "grad_norm": 0.001726228860206902, "learning_rate": 0.08523908428099125, "loss": 0.2051, "num_input_tokens_seen": 22239888, "step": 25685 }, { "epoch": 12.112211221122113, "grad_norm": 0.0020738658495247364, "learning_rate": 0.08518595718615402, "loss": 0.1858, "num_input_tokens_seen": 22243696, "step": 25690 }, { "epoch": 12.114568599717115, "grad_norm": 0.0029829961713403463, "learning_rate": 0.08513284008645675, "loss": 0.1969, "num_input_tokens_seen": 22248128, "step": 25695 }, { "epoch": 12.116925978312118, "grad_norm": 0.0016095552127808332, "learning_rate": 0.08507973299009065, "loss": 0.1828, "num_input_tokens_seen": 22251664, "step": 25700 }, { "epoch": 12.11928335690712, "grad_norm": 0.002045788336545229, "learning_rate": 0.08502663590524563, "loss": 0.1888, "num_input_tokens_seen": 22256064, "step": 25705 }, { "epoch": 12.121640735502123, "grad_norm": 0.002793314168229699, "learning_rate": 0.08497354884010981, "loss": 0.1946, "num_input_tokens_seen": 22261136, "step": 25710 }, { "epoch": 12.123998114097123, "grad_norm": 0.0013766068732365966, "learning_rate": 0.0849204718028699, "loss": 0.1704, "num_input_tokens_seen": 22265840, "step": 25715 }, { "epoch": 12.126355492692126, "grad_norm": 0.003446029732003808, "learning_rate": 0.08486740480171118, "loss": 0.215, "num_input_tokens_seen": 22270192, "step": 25720 }, { "epoch": 12.128712871287128, "grad_norm": 0.0038862829096615314, "learning_rate": 0.08481434784481706, "loss": 0.2974, "num_input_tokens_seen": 22274272, "step": 25725 }, { "epoch": 12.13107024988213, "grad_norm": 0.001949790515936911, "learning_rate": 0.08476130094036968, "loss": 0.1723, "num_input_tokens_seen": 22278976, "step": 25730 }, { "epoch": 12.133427628477133, "grad_norm": 0.003534179413691163, "learning_rate": 0.08470826409654961, "loss": 0.2309, "num_input_tokens_seen": 22283728, "step": 25735 }, { "epoch": 12.135785007072135, "grad_norm": 0.0029591440688818693, "learning_rate": 0.08465523732153564, "loss": 0.2026, "num_input_tokens_seen": 22287760, "step": 25740 }, { "epoch": 12.138142385667138, "grad_norm": 0.00276637333445251, "learning_rate": 0.08460222062350532, "loss": 0.1971, "num_input_tokens_seen": 22291472, "step": 25745 }, { "epoch": 12.14049976426214, "grad_norm": 0.002962039550766349, "learning_rate": 0.08454921401063442, "loss": 0.2495, "num_input_tokens_seen": 22295792, "step": 25750 }, { "epoch": 12.142857142857142, "grad_norm": 0.0019281741697341204, "learning_rate": 0.08449621749109716, "loss": 0.2251, "num_input_tokens_seen": 22300256, "step": 25755 }, { "epoch": 12.145214521452145, "grad_norm": 0.001813719398342073, "learning_rate": 0.08444323107306641, "loss": 0.1921, "num_input_tokens_seen": 22304400, "step": 25760 }, { "epoch": 12.147571900047147, "grad_norm": 0.003000825410708785, "learning_rate": 0.0843902547647132, "loss": 0.2199, "num_input_tokens_seen": 22308864, "step": 25765 }, { "epoch": 12.14992927864215, "grad_norm": 0.004205869045108557, "learning_rate": 0.0843372885742072, "loss": 0.2364, "num_input_tokens_seen": 22313200, "step": 25770 }, { "epoch": 12.152286657237152, "grad_norm": 0.002508777193725109, "learning_rate": 0.08428433250971652, "loss": 0.1821, "num_input_tokens_seen": 22317536, "step": 25775 }, { "epoch": 12.154644035832154, "grad_norm": 0.002177955349907279, "learning_rate": 0.08423138657940757, "loss": 0.2363, "num_input_tokens_seen": 22322208, "step": 25780 }, { "epoch": 12.157001414427157, "grad_norm": 0.002514116931706667, "learning_rate": 0.08417845079144536, "loss": 0.1795, "num_input_tokens_seen": 22327072, "step": 25785 }, { "epoch": 12.15935879302216, "grad_norm": 0.002000371925532818, "learning_rate": 0.08412552515399314, "loss": 0.1957, "num_input_tokens_seen": 22331024, "step": 25790 }, { "epoch": 12.161716171617162, "grad_norm": 0.0019110015127807856, "learning_rate": 0.08407260967521278, "loss": 0.2822, "num_input_tokens_seen": 22335088, "step": 25795 }, { "epoch": 12.164073550212164, "grad_norm": 0.0019304907182231545, "learning_rate": 0.08401970436326454, "loss": 0.1747, "num_input_tokens_seen": 22340064, "step": 25800 }, { "epoch": 12.164073550212164, "eval_loss": 0.21300135552883148, "eval_runtime": 21.8819, "eval_samples_per_second": 43.095, "eval_steps_per_second": 21.57, "num_input_tokens_seen": 22340064, "step": 25800 }, { "epoch": 12.166430928807166, "grad_norm": 0.0016985925612971187, "learning_rate": 0.08396680922630702, "loss": 0.2168, "num_input_tokens_seen": 22344432, "step": 25805 }, { "epoch": 12.168788307402169, "grad_norm": 0.0024870571214705706, "learning_rate": 0.08391392427249732, "loss": 0.1956, "num_input_tokens_seen": 22349040, "step": 25810 }, { "epoch": 12.171145685997171, "grad_norm": 0.001961824717000127, "learning_rate": 0.08386104950999107, "loss": 0.161, "num_input_tokens_seen": 22352928, "step": 25815 }, { "epoch": 12.173503064592174, "grad_norm": 0.0022859713062644005, "learning_rate": 0.0838081849469421, "loss": 0.2377, "num_input_tokens_seen": 22357312, "step": 25820 }, { "epoch": 12.175860443187176, "grad_norm": 0.0018203194485977292, "learning_rate": 0.08375533059150281, "loss": 0.2073, "num_input_tokens_seen": 22361824, "step": 25825 }, { "epoch": 12.178217821782178, "grad_norm": 0.002292245626449585, "learning_rate": 0.08370248645182406, "loss": 0.1979, "num_input_tokens_seen": 22365696, "step": 25830 }, { "epoch": 12.18057520037718, "grad_norm": 0.002885006135329604, "learning_rate": 0.083649652536055, "loss": 0.1799, "num_input_tokens_seen": 22370816, "step": 25835 }, { "epoch": 12.182932578972183, "grad_norm": 0.002114993054419756, "learning_rate": 0.08359682885234339, "loss": 0.2518, "num_input_tokens_seen": 22375936, "step": 25840 }, { "epoch": 12.185289957567186, "grad_norm": 0.001957348082214594, "learning_rate": 0.08354401540883516, "loss": 0.1682, "num_input_tokens_seen": 22379936, "step": 25845 }, { "epoch": 12.187647336162188, "grad_norm": 0.0018400036497041583, "learning_rate": 0.0834912122136749, "loss": 0.1758, "num_input_tokens_seen": 22385072, "step": 25850 }, { "epoch": 12.19000471475719, "grad_norm": 0.002717594848945737, "learning_rate": 0.0834384192750056, "loss": 0.2124, "num_input_tokens_seen": 22389072, "step": 25855 }, { "epoch": 12.192362093352193, "grad_norm": 0.002345660235732794, "learning_rate": 0.08338563660096844, "loss": 0.1789, "num_input_tokens_seen": 22393424, "step": 25860 }, { "epoch": 12.194719471947195, "grad_norm": 0.0026977569796144962, "learning_rate": 0.08333286419970329, "loss": 0.2616, "num_input_tokens_seen": 22398112, "step": 25865 }, { "epoch": 12.197076850542198, "grad_norm": 0.0020180665887892246, "learning_rate": 0.08328010207934824, "loss": 0.1984, "num_input_tokens_seen": 22403456, "step": 25870 }, { "epoch": 12.1994342291372, "grad_norm": 0.002175070345401764, "learning_rate": 0.08322735024803989, "loss": 0.2221, "num_input_tokens_seen": 22406992, "step": 25875 }, { "epoch": 12.201791607732202, "grad_norm": 0.0022843487095087767, "learning_rate": 0.08317460871391331, "loss": 0.2064, "num_input_tokens_seen": 22411184, "step": 25880 }, { "epoch": 12.204148986327205, "grad_norm": 0.002314218319952488, "learning_rate": 0.08312187748510179, "loss": 0.2192, "num_input_tokens_seen": 22416688, "step": 25885 }, { "epoch": 12.206506364922207, "grad_norm": 0.0029838450718671083, "learning_rate": 0.08306915656973726, "loss": 0.2165, "num_input_tokens_seen": 22421536, "step": 25890 }, { "epoch": 12.20886374351721, "grad_norm": 0.0017975930823013186, "learning_rate": 0.08301644597594988, "loss": 0.1706, "num_input_tokens_seen": 22426624, "step": 25895 }, { "epoch": 12.211221122112212, "grad_norm": 0.002504409058019519, "learning_rate": 0.08296374571186826, "loss": 0.2064, "num_input_tokens_seen": 22430448, "step": 25900 }, { "epoch": 12.213578500707214, "grad_norm": 0.00192357343621552, "learning_rate": 0.08291105578561955, "loss": 0.1826, "num_input_tokens_seen": 22434592, "step": 25905 }, { "epoch": 12.215935879302217, "grad_norm": 0.001866841223090887, "learning_rate": 0.08285837620532904, "loss": 0.1882, "num_input_tokens_seen": 22438816, "step": 25910 }, { "epoch": 12.218293257897217, "grad_norm": 0.0022493931464850903, "learning_rate": 0.0828057069791207, "loss": 0.1731, "num_input_tokens_seen": 22442848, "step": 25915 }, { "epoch": 12.22065063649222, "grad_norm": 0.0029825069941580296, "learning_rate": 0.0827530481151168, "loss": 0.2079, "num_input_tokens_seen": 22446768, "step": 25920 }, { "epoch": 12.223008015087222, "grad_norm": 0.0026687353383749723, "learning_rate": 0.08270039962143792, "loss": 0.1992, "num_input_tokens_seen": 22451600, "step": 25925 }, { "epoch": 12.225365393682225, "grad_norm": 0.004171442240476608, "learning_rate": 0.08264776150620314, "loss": 0.2573, "num_input_tokens_seen": 22455504, "step": 25930 }, { "epoch": 12.227722772277227, "grad_norm": 0.00356071419082582, "learning_rate": 0.08259513377753, "loss": 0.2198, "num_input_tokens_seen": 22458944, "step": 25935 }, { "epoch": 12.23008015087223, "grad_norm": 0.002362266182899475, "learning_rate": 0.08254251644353423, "loss": 0.2312, "num_input_tokens_seen": 22462560, "step": 25940 }, { "epoch": 12.232437529467232, "grad_norm": 0.002474409295246005, "learning_rate": 0.08248990951233022, "loss": 0.2489, "num_input_tokens_seen": 22466976, "step": 25945 }, { "epoch": 12.234794908062234, "grad_norm": 0.0023583583533763885, "learning_rate": 0.08243731299203048, "loss": 0.219, "num_input_tokens_seen": 22471664, "step": 25950 }, { "epoch": 12.237152286657237, "grad_norm": 0.0027400508988648653, "learning_rate": 0.08238472689074612, "loss": 0.1868, "num_input_tokens_seen": 22476448, "step": 25955 }, { "epoch": 12.239509665252239, "grad_norm": 0.002432436216622591, "learning_rate": 0.08233215121658666, "loss": 0.2162, "num_input_tokens_seen": 22480960, "step": 25960 }, { "epoch": 12.241867043847241, "grad_norm": 0.0034653102047741413, "learning_rate": 0.08227958597765982, "loss": 0.2647, "num_input_tokens_seen": 22484784, "step": 25965 }, { "epoch": 12.244224422442244, "grad_norm": 0.0019362765597179532, "learning_rate": 0.08222703118207181, "loss": 0.1985, "num_input_tokens_seen": 22489088, "step": 25970 }, { "epoch": 12.246581801037246, "grad_norm": 0.0019056793535128236, "learning_rate": 0.08217448683792734, "loss": 0.193, "num_input_tokens_seen": 22493520, "step": 25975 }, { "epoch": 12.248939179632249, "grad_norm": 0.0020754325669258833, "learning_rate": 0.08212195295332926, "loss": 0.2236, "num_input_tokens_seen": 22497248, "step": 25980 }, { "epoch": 12.251296558227251, "grad_norm": 0.0020912587642669678, "learning_rate": 0.08206942953637915, "loss": 0.2384, "num_input_tokens_seen": 22502880, "step": 25985 }, { "epoch": 12.253653936822253, "grad_norm": 0.0033875734079629183, "learning_rate": 0.08201691659517658, "loss": 0.2415, "num_input_tokens_seen": 22507184, "step": 25990 }, { "epoch": 12.256011315417256, "grad_norm": 0.0015324442647397518, "learning_rate": 0.08196441413781981, "loss": 0.2317, "num_input_tokens_seen": 22511520, "step": 25995 }, { "epoch": 12.258368694012258, "grad_norm": 0.002376343123614788, "learning_rate": 0.08191192217240544, "loss": 0.1969, "num_input_tokens_seen": 22515088, "step": 26000 }, { "epoch": 12.258368694012258, "eval_loss": 0.21766319870948792, "eval_runtime": 21.9432, "eval_samples_per_second": 42.975, "eval_steps_per_second": 21.51, "num_input_tokens_seen": 22515088, "step": 26000 }, { "epoch": 12.26072607260726, "grad_norm": 0.001220780424773693, "learning_rate": 0.08185944070702823, "loss": 0.2045, "num_input_tokens_seen": 22519088, "step": 26005 }, { "epoch": 12.263083451202263, "grad_norm": 0.0016136636259034276, "learning_rate": 0.08180696974978159, "loss": 0.1784, "num_input_tokens_seen": 22524144, "step": 26010 }, { "epoch": 12.265440829797265, "grad_norm": 0.0027210155967622995, "learning_rate": 0.08175450930875724, "loss": 0.2077, "num_input_tokens_seen": 22528816, "step": 26015 }, { "epoch": 12.267798208392268, "grad_norm": 0.0032448158599436283, "learning_rate": 0.08170205939204513, "loss": 0.2679, "num_input_tokens_seen": 22532704, "step": 26020 }, { "epoch": 12.27015558698727, "grad_norm": 0.0018816920928657055, "learning_rate": 0.08164962000773379, "loss": 0.1815, "num_input_tokens_seen": 22537456, "step": 26025 }, { "epoch": 12.272512965582273, "grad_norm": 0.0013753611128777266, "learning_rate": 0.08159719116390995, "loss": 0.1908, "num_input_tokens_seen": 22542176, "step": 26030 }, { "epoch": 12.274870344177275, "grad_norm": 0.002307357033714652, "learning_rate": 0.08154477286865887, "loss": 0.2189, "num_input_tokens_seen": 22546560, "step": 26035 }, { "epoch": 12.277227722772277, "grad_norm": 0.0016140807420015335, "learning_rate": 0.08149236513006404, "loss": 0.1898, "num_input_tokens_seen": 22550848, "step": 26040 }, { "epoch": 12.27958510136728, "grad_norm": 0.002089468529447913, "learning_rate": 0.08143996795620746, "loss": 0.2309, "num_input_tokens_seen": 22555696, "step": 26045 }, { "epoch": 12.281942479962282, "grad_norm": 0.0026942442636936903, "learning_rate": 0.08138758135516938, "loss": 0.1887, "num_input_tokens_seen": 22559232, "step": 26050 }, { "epoch": 12.284299858557285, "grad_norm": 0.001832404755987227, "learning_rate": 0.08133520533502851, "loss": 0.1871, "num_input_tokens_seen": 22563728, "step": 26055 }, { "epoch": 12.286657237152287, "grad_norm": 0.002027918351814151, "learning_rate": 0.08128283990386184, "loss": 0.1827, "num_input_tokens_seen": 22569664, "step": 26060 }, { "epoch": 12.28901461574729, "grad_norm": 0.0012998961610719562, "learning_rate": 0.08123048506974488, "loss": 0.1849, "num_input_tokens_seen": 22574096, "step": 26065 }, { "epoch": 12.291371994342292, "grad_norm": 0.0022959953639656305, "learning_rate": 0.08117814084075124, "loss": 0.2053, "num_input_tokens_seen": 22578384, "step": 26070 }, { "epoch": 12.293729372937294, "grad_norm": 0.0034285038709640503, "learning_rate": 0.08112580722495318, "loss": 0.2432, "num_input_tokens_seen": 22582400, "step": 26075 }, { "epoch": 12.296086751532297, "grad_norm": 0.0022926349192857742, "learning_rate": 0.08107348423042122, "loss": 0.1634, "num_input_tokens_seen": 22587168, "step": 26080 }, { "epoch": 12.298444130127299, "grad_norm": 0.004621081054210663, "learning_rate": 0.08102117186522413, "loss": 0.2267, "num_input_tokens_seen": 22591952, "step": 26085 }, { "epoch": 12.300801508722301, "grad_norm": 0.0017184215830639005, "learning_rate": 0.08096887013742916, "loss": 0.2481, "num_input_tokens_seen": 22596368, "step": 26090 }, { "epoch": 12.303158887317304, "grad_norm": 0.0016717249527573586, "learning_rate": 0.08091657905510198, "loss": 0.2535, "num_input_tokens_seen": 22600192, "step": 26095 }, { "epoch": 12.305516265912306, "grad_norm": 0.0027228593826293945, "learning_rate": 0.08086429862630642, "loss": 0.2217, "num_input_tokens_seen": 22604768, "step": 26100 }, { "epoch": 12.307873644507309, "grad_norm": 0.002259837230667472, "learning_rate": 0.08081202885910488, "loss": 0.26, "num_input_tokens_seen": 22608992, "step": 26105 }, { "epoch": 12.310231023102311, "grad_norm": 0.002469619968906045, "learning_rate": 0.08075976976155795, "loss": 0.2288, "num_input_tokens_seen": 22614000, "step": 26110 }, { "epoch": 12.312588401697312, "grad_norm": 0.0030219489708542824, "learning_rate": 0.08070752134172461, "loss": 0.2268, "num_input_tokens_seen": 22618224, "step": 26115 }, { "epoch": 12.314945780292314, "grad_norm": 0.002238794695585966, "learning_rate": 0.08065528360766229, "loss": 0.2349, "num_input_tokens_seen": 22622768, "step": 26120 }, { "epoch": 12.317303158887317, "grad_norm": 0.002145440084859729, "learning_rate": 0.08060305656742664, "loss": 0.1975, "num_input_tokens_seen": 22627456, "step": 26125 }, { "epoch": 12.319660537482319, "grad_norm": 0.002198203466832638, "learning_rate": 0.08055084022907182, "loss": 0.2423, "num_input_tokens_seen": 22631616, "step": 26130 }, { "epoch": 12.322017916077321, "grad_norm": 0.0020793797448277473, "learning_rate": 0.08049863460065014, "loss": 0.2025, "num_input_tokens_seen": 22636048, "step": 26135 }, { "epoch": 12.324375294672324, "grad_norm": 0.0023819527123123407, "learning_rate": 0.0804464396902124, "loss": 0.2201, "num_input_tokens_seen": 22640640, "step": 26140 }, { "epoch": 12.326732673267326, "grad_norm": 0.002019654493778944, "learning_rate": 0.08039425550580777, "loss": 0.2008, "num_input_tokens_seen": 22645728, "step": 26145 }, { "epoch": 12.329090051862329, "grad_norm": 0.0018002190627157688, "learning_rate": 0.08034208205548363, "loss": 0.2229, "num_input_tokens_seen": 22649968, "step": 26150 }, { "epoch": 12.331447430457331, "grad_norm": 0.0020829036366194487, "learning_rate": 0.08028991934728581, "loss": 0.1993, "num_input_tokens_seen": 22654016, "step": 26155 }, { "epoch": 12.333804809052333, "grad_norm": 0.0031222905963659286, "learning_rate": 0.0802377673892585, "loss": 0.2329, "num_input_tokens_seen": 22657888, "step": 26160 }, { "epoch": 12.336162187647336, "grad_norm": 0.0019448851235210896, "learning_rate": 0.0801856261894441, "loss": 0.2415, "num_input_tokens_seen": 22661968, "step": 26165 }, { "epoch": 12.338519566242338, "grad_norm": 0.0019721277058124542, "learning_rate": 0.08013349575588354, "loss": 0.217, "num_input_tokens_seen": 22665648, "step": 26170 }, { "epoch": 12.34087694483734, "grad_norm": 0.002281284425407648, "learning_rate": 0.08008137609661586, "loss": 0.2269, "num_input_tokens_seen": 22669760, "step": 26175 }, { "epoch": 12.343234323432343, "grad_norm": 0.0017794506857171655, "learning_rate": 0.08002926721967872, "loss": 0.2276, "num_input_tokens_seen": 22674240, "step": 26180 }, { "epoch": 12.345591702027345, "grad_norm": 0.0020533092319965363, "learning_rate": 0.07997716913310782, "loss": 0.2143, "num_input_tokens_seen": 22678464, "step": 26185 }, { "epoch": 12.347949080622348, "grad_norm": 0.0024264021776616573, "learning_rate": 0.07992508184493745, "loss": 0.2251, "num_input_tokens_seen": 22683152, "step": 26190 }, { "epoch": 12.35030645921735, "grad_norm": 0.0021962476894259453, "learning_rate": 0.07987300536320001, "loss": 0.1794, "num_input_tokens_seen": 22687664, "step": 26195 }, { "epoch": 12.352663837812353, "grad_norm": 0.002134514506906271, "learning_rate": 0.07982093969592649, "loss": 0.2165, "num_input_tokens_seen": 22692240, "step": 26200 }, { "epoch": 12.352663837812353, "eval_loss": 0.2127642035484314, "eval_runtime": 21.9311, "eval_samples_per_second": 42.998, "eval_steps_per_second": 21.522, "num_input_tokens_seen": 22692240, "step": 26200 }, { "epoch": 12.355021216407355, "grad_norm": 0.0023708236403763294, "learning_rate": 0.07976888485114592, "loss": 0.1936, "num_input_tokens_seen": 22697136, "step": 26205 }, { "epoch": 12.357378595002357, "grad_norm": 0.0016226297011598945, "learning_rate": 0.07971684083688595, "loss": 0.2075, "num_input_tokens_seen": 22701184, "step": 26210 }, { "epoch": 12.35973597359736, "grad_norm": 0.001754903350956738, "learning_rate": 0.0796648076611723, "loss": 0.1827, "num_input_tokens_seen": 22704944, "step": 26215 }, { "epoch": 12.362093352192362, "grad_norm": 0.0017046554712578654, "learning_rate": 0.07961278533202922, "loss": 0.1707, "num_input_tokens_seen": 22709488, "step": 26220 }, { "epoch": 12.364450730787365, "grad_norm": 0.0022566879633814096, "learning_rate": 0.07956077385747919, "loss": 0.2063, "num_input_tokens_seen": 22713616, "step": 26225 }, { "epoch": 12.366808109382367, "grad_norm": 0.0013881293125450611, "learning_rate": 0.079508773245543, "loss": 0.1053, "num_input_tokens_seen": 22717856, "step": 26230 }, { "epoch": 12.36916548797737, "grad_norm": 0.002244348172098398, "learning_rate": 0.07945678350423982, "loss": 0.2518, "num_input_tokens_seen": 22721712, "step": 26235 }, { "epoch": 12.371522866572372, "grad_norm": 0.003388015553355217, "learning_rate": 0.07940480464158717, "loss": 0.2653, "num_input_tokens_seen": 22726736, "step": 26240 }, { "epoch": 12.373880245167374, "grad_norm": 0.002988613909110427, "learning_rate": 0.07935283666560076, "loss": 0.2296, "num_input_tokens_seen": 22730848, "step": 26245 }, { "epoch": 12.376237623762377, "grad_norm": 0.0033228073734790087, "learning_rate": 0.07930087958429478, "loss": 0.2456, "num_input_tokens_seen": 22735056, "step": 26250 }, { "epoch": 12.378595002357379, "grad_norm": 0.0013690374325960875, "learning_rate": 0.07924893340568159, "loss": 0.2361, "num_input_tokens_seen": 22739872, "step": 26255 }, { "epoch": 12.380952380952381, "grad_norm": 0.0015261591179296374, "learning_rate": 0.07919699813777205, "loss": 0.217, "num_input_tokens_seen": 22744224, "step": 26260 }, { "epoch": 12.383309759547384, "grad_norm": 0.0014579371782019734, "learning_rate": 0.07914507378857515, "loss": 0.1969, "num_input_tokens_seen": 22748720, "step": 26265 }, { "epoch": 12.385667138142386, "grad_norm": 0.0016698380932211876, "learning_rate": 0.07909316036609822, "loss": 0.2281, "num_input_tokens_seen": 22753152, "step": 26270 }, { "epoch": 12.388024516737389, "grad_norm": 0.0016808349173516035, "learning_rate": 0.07904125787834704, "loss": 0.2179, "num_input_tokens_seen": 22756848, "step": 26275 }, { "epoch": 12.390381895332391, "grad_norm": 0.0018952348036691546, "learning_rate": 0.07898936633332569, "loss": 0.2201, "num_input_tokens_seen": 22760256, "step": 26280 }, { "epoch": 12.392739273927393, "grad_norm": 0.0018000735435634851, "learning_rate": 0.07893748573903635, "loss": 0.2027, "num_input_tokens_seen": 22765280, "step": 26285 }, { "epoch": 12.395096652522396, "grad_norm": 0.0015228565316647291, "learning_rate": 0.0788856161034798, "loss": 0.2082, "num_input_tokens_seen": 22769536, "step": 26290 }, { "epoch": 12.397454031117398, "grad_norm": 0.0019077903125435114, "learning_rate": 0.07883375743465487, "loss": 0.2172, "num_input_tokens_seen": 22773792, "step": 26295 }, { "epoch": 12.3998114097124, "grad_norm": 0.00255806022323668, "learning_rate": 0.07878190974055888, "loss": 0.2349, "num_input_tokens_seen": 22778112, "step": 26300 }, { "epoch": 12.402168788307403, "grad_norm": 0.0014959365362301469, "learning_rate": 0.07873007302918746, "loss": 0.1928, "num_input_tokens_seen": 22782720, "step": 26305 }, { "epoch": 12.404526166902404, "grad_norm": 0.0020221550948917866, "learning_rate": 0.07867824730853433, "loss": 0.214, "num_input_tokens_seen": 22787296, "step": 26310 }, { "epoch": 12.406883545497408, "grad_norm": 0.001640526344999671, "learning_rate": 0.07862643258659176, "loss": 0.2034, "num_input_tokens_seen": 22791312, "step": 26315 }, { "epoch": 12.409240924092408, "grad_norm": 0.0015621319180354476, "learning_rate": 0.07857462887135026, "loss": 0.2259, "num_input_tokens_seen": 22795920, "step": 26320 }, { "epoch": 12.41159830268741, "grad_norm": 0.0016736218240112066, "learning_rate": 0.0785228361707986, "loss": 0.2118, "num_input_tokens_seen": 22800512, "step": 26325 }, { "epoch": 12.413955681282413, "grad_norm": 0.0012658763444051147, "learning_rate": 0.07847105449292378, "loss": 0.2206, "num_input_tokens_seen": 22804544, "step": 26330 }, { "epoch": 12.416313059877416, "grad_norm": 0.002013157354667783, "learning_rate": 0.0784192838457113, "loss": 0.1792, "num_input_tokens_seen": 22809456, "step": 26335 }, { "epoch": 12.418670438472418, "grad_norm": 0.001976506784558296, "learning_rate": 0.07836752423714473, "loss": 0.1913, "num_input_tokens_seen": 22813648, "step": 26340 }, { "epoch": 12.42102781706742, "grad_norm": 0.0015136905713006854, "learning_rate": 0.07831577567520616, "loss": 0.2368, "num_input_tokens_seen": 22817808, "step": 26345 }, { "epoch": 12.423385195662423, "grad_norm": 0.0022582600358873606, "learning_rate": 0.07826403816787579, "loss": 0.2639, "num_input_tokens_seen": 22821648, "step": 26350 }, { "epoch": 12.425742574257425, "grad_norm": 0.002037752652540803, "learning_rate": 0.0782123117231322, "loss": 0.248, "num_input_tokens_seen": 22826560, "step": 26355 }, { "epoch": 12.428099952852428, "grad_norm": 0.0016276385867968202, "learning_rate": 0.07816059634895237, "loss": 0.2233, "num_input_tokens_seen": 22830096, "step": 26360 }, { "epoch": 12.43045733144743, "grad_norm": 0.002240564441308379, "learning_rate": 0.0781088920533113, "loss": 0.209, "num_input_tokens_seen": 22834560, "step": 26365 }, { "epoch": 12.432814710042432, "grad_norm": 0.002288860036060214, "learning_rate": 0.07805719884418257, "loss": 0.2197, "num_input_tokens_seen": 22839264, "step": 26370 }, { "epoch": 12.435172088637435, "grad_norm": 0.001849495805799961, "learning_rate": 0.07800551672953779, "loss": 0.2251, "num_input_tokens_seen": 22843296, "step": 26375 }, { "epoch": 12.437529467232437, "grad_norm": 0.002244577743113041, "learning_rate": 0.07795384571734709, "loss": 0.2233, "num_input_tokens_seen": 22847712, "step": 26380 }, { "epoch": 12.43988684582744, "grad_norm": 0.0023002084344625473, "learning_rate": 0.07790218581557883, "loss": 0.2363, "num_input_tokens_seen": 22852080, "step": 26385 }, { "epoch": 12.442244224422442, "grad_norm": 0.0015494026010856032, "learning_rate": 0.07785053703219949, "loss": 0.188, "num_input_tokens_seen": 22856368, "step": 26390 }, { "epoch": 12.444601603017444, "grad_norm": 0.001809404231607914, "learning_rate": 0.07779889937517409, "loss": 0.2189, "num_input_tokens_seen": 22860624, "step": 26395 }, { "epoch": 12.446958981612447, "grad_norm": 0.0013917406322434545, "learning_rate": 0.0777472728524657, "loss": 0.1789, "num_input_tokens_seen": 22864512, "step": 26400 }, { "epoch": 12.446958981612447, "eval_loss": 0.21330220997333527, "eval_runtime": 21.8753, "eval_samples_per_second": 43.108, "eval_steps_per_second": 21.577, "num_input_tokens_seen": 22864512, "step": 26400 }, { "epoch": 12.44931636020745, "grad_norm": 0.001880201743915677, "learning_rate": 0.07769565747203584, "loss": 0.1968, "num_input_tokens_seen": 22869232, "step": 26405 }, { "epoch": 12.451673738802452, "grad_norm": 0.002829479519277811, "learning_rate": 0.07764405324184427, "loss": 0.2003, "num_input_tokens_seen": 22873360, "step": 26410 }, { "epoch": 12.454031117397454, "grad_norm": 0.0025351690128445625, "learning_rate": 0.07759246016984889, "loss": 0.2588, "num_input_tokens_seen": 22877184, "step": 26415 }, { "epoch": 12.456388495992456, "grad_norm": 0.0013408659724518657, "learning_rate": 0.07754087826400609, "loss": 0.1747, "num_input_tokens_seen": 22881600, "step": 26420 }, { "epoch": 12.458745874587459, "grad_norm": 0.0014373630983754992, "learning_rate": 0.0774893075322705, "loss": 0.1879, "num_input_tokens_seen": 22885904, "step": 26425 }, { "epoch": 12.461103253182461, "grad_norm": 0.0033324630931019783, "learning_rate": 0.07743774798259484, "loss": 0.2156, "num_input_tokens_seen": 22890672, "step": 26430 }, { "epoch": 12.463460631777464, "grad_norm": 0.0013285984750837088, "learning_rate": 0.07738619962293032, "loss": 0.1527, "num_input_tokens_seen": 22895216, "step": 26435 }, { "epoch": 12.465818010372466, "grad_norm": 0.0014367697294801474, "learning_rate": 0.0773346624612264, "loss": 0.2004, "num_input_tokens_seen": 22900000, "step": 26440 }, { "epoch": 12.468175388967468, "grad_norm": 0.001604254823178053, "learning_rate": 0.07728313650543066, "loss": 0.203, "num_input_tokens_seen": 22904336, "step": 26445 }, { "epoch": 12.47053276756247, "grad_norm": 0.0018329507438465953, "learning_rate": 0.07723162176348913, "loss": 0.1806, "num_input_tokens_seen": 22909472, "step": 26450 }, { "epoch": 12.472890146157473, "grad_norm": 0.0020310545805841684, "learning_rate": 0.07718011824334593, "loss": 0.2462, "num_input_tokens_seen": 22913632, "step": 26455 }, { "epoch": 12.475247524752476, "grad_norm": 0.002024115761741996, "learning_rate": 0.07712862595294363, "loss": 0.2249, "num_input_tokens_seen": 22918016, "step": 26460 }, { "epoch": 12.477604903347478, "grad_norm": 0.001963347429409623, "learning_rate": 0.07707714490022301, "loss": 0.2252, "num_input_tokens_seen": 22922192, "step": 26465 }, { "epoch": 12.47996228194248, "grad_norm": 0.0019649898167699575, "learning_rate": 0.07702567509312298, "loss": 0.1976, "num_input_tokens_seen": 22927072, "step": 26470 }, { "epoch": 12.482319660537483, "grad_norm": 0.0016838643932715058, "learning_rate": 0.07697421653958098, "loss": 0.2054, "num_input_tokens_seen": 22931376, "step": 26475 }, { "epoch": 12.484677039132485, "grad_norm": 0.001809090143069625, "learning_rate": 0.07692276924753247, "loss": 0.2241, "num_input_tokens_seen": 22935296, "step": 26480 }, { "epoch": 12.487034417727488, "grad_norm": 0.0018332254840061069, "learning_rate": 0.07687133322491124, "loss": 0.1798, "num_input_tokens_seen": 22939280, "step": 26485 }, { "epoch": 12.48939179632249, "grad_norm": 0.0018497697310522199, "learning_rate": 0.07681990847964948, "loss": 0.193, "num_input_tokens_seen": 22943680, "step": 26490 }, { "epoch": 12.491749174917492, "grad_norm": 0.001770719769410789, "learning_rate": 0.0767684950196774, "loss": 0.2231, "num_input_tokens_seen": 22948912, "step": 26495 }, { "epoch": 12.494106553512495, "grad_norm": 0.0024434123188257217, "learning_rate": 0.0767170928529237, "loss": 0.2347, "num_input_tokens_seen": 22953376, "step": 26500 }, { "epoch": 12.496463932107497, "grad_norm": 0.0018066201591864228, "learning_rate": 0.07666570198731526, "loss": 0.1956, "num_input_tokens_seen": 22957952, "step": 26505 }, { "epoch": 12.4988213107025, "grad_norm": 0.002617438090965152, "learning_rate": 0.07661432243077708, "loss": 0.1901, "num_input_tokens_seen": 22961632, "step": 26510 }, { "epoch": 12.5011786892975, "grad_norm": 0.001422450295649469, "learning_rate": 0.0765629541912326, "loss": 0.216, "num_input_tokens_seen": 22966080, "step": 26515 }, { "epoch": 12.503536067892503, "grad_norm": 0.0011538423132151365, "learning_rate": 0.07651159727660352, "loss": 0.2014, "num_input_tokens_seen": 22970000, "step": 26520 }, { "epoch": 12.505893446487505, "grad_norm": 0.00220974232070148, "learning_rate": 0.07646025169480959, "loss": 0.2557, "num_input_tokens_seen": 22974560, "step": 26525 }, { "epoch": 12.508250825082508, "grad_norm": 0.0011979369446635246, "learning_rate": 0.07640891745376908, "loss": 0.1656, "num_input_tokens_seen": 22978912, "step": 26530 }, { "epoch": 12.51060820367751, "grad_norm": 0.0016943954396992922, "learning_rate": 0.07635759456139822, "loss": 0.2151, "num_input_tokens_seen": 22982944, "step": 26535 }, { "epoch": 12.512965582272512, "grad_norm": 0.0015075486153364182, "learning_rate": 0.0763062830256118, "loss": 0.2039, "num_input_tokens_seen": 22987088, "step": 26540 }, { "epoch": 12.515322960867515, "grad_norm": 0.001513878465630114, "learning_rate": 0.07625498285432258, "loss": 0.1705, "num_input_tokens_seen": 22991168, "step": 26545 }, { "epoch": 12.517680339462517, "grad_norm": 0.002128720050677657, "learning_rate": 0.07620369405544176, "loss": 0.2421, "num_input_tokens_seen": 22995760, "step": 26550 }, { "epoch": 12.52003771805752, "grad_norm": 0.0013931647408753633, "learning_rate": 0.07615241663687868, "loss": 0.2422, "num_input_tokens_seen": 22999744, "step": 26555 }, { "epoch": 12.522395096652522, "grad_norm": 0.0020236228592693806, "learning_rate": 0.07610115060654106, "loss": 0.2069, "num_input_tokens_seen": 23004016, "step": 26560 }, { "epoch": 12.524752475247524, "grad_norm": 0.0019758623093366623, "learning_rate": 0.07604989597233458, "loss": 0.2194, "num_input_tokens_seen": 23008080, "step": 26565 }, { "epoch": 12.527109853842527, "grad_norm": 0.002313564997166395, "learning_rate": 0.07599865274216352, "loss": 0.2015, "num_input_tokens_seen": 23012832, "step": 26570 }, { "epoch": 12.52946723243753, "grad_norm": 0.0024272941518574953, "learning_rate": 0.07594742092393013, "loss": 0.2187, "num_input_tokens_seen": 23016672, "step": 26575 }, { "epoch": 12.531824611032532, "grad_norm": 0.002012553857639432, "learning_rate": 0.07589620052553503, "loss": 0.1738, "num_input_tokens_seen": 23020640, "step": 26580 }, { "epoch": 12.534181989627534, "grad_norm": 0.0015155673027038574, "learning_rate": 0.0758449915548771, "loss": 0.1883, "num_input_tokens_seen": 23025248, "step": 26585 }, { "epoch": 12.536539368222536, "grad_norm": 0.002881759312003851, "learning_rate": 0.07579379401985332, "loss": 0.1804, "num_input_tokens_seen": 23029424, "step": 26590 }, { "epoch": 12.538896746817539, "grad_norm": 0.0017582689179107547, "learning_rate": 0.07574260792835905, "loss": 0.2301, "num_input_tokens_seen": 23033120, "step": 26595 }, { "epoch": 12.541254125412541, "grad_norm": 0.003071241080760956, "learning_rate": 0.07569143328828784, "loss": 0.2362, "num_input_tokens_seen": 23037568, "step": 26600 }, { "epoch": 12.541254125412541, "eval_loss": 0.21578361093997955, "eval_runtime": 21.9259, "eval_samples_per_second": 43.009, "eval_steps_per_second": 21.527, "num_input_tokens_seen": 23037568, "step": 26600 }, { "epoch": 12.543611504007544, "grad_norm": 0.0013814913108944893, "learning_rate": 0.0756402701075314, "loss": 0.1902, "num_input_tokens_seen": 23041920, "step": 26605 }, { "epoch": 12.545968882602546, "grad_norm": 0.002518513472750783, "learning_rate": 0.07558911839397982, "loss": 0.2408, "num_input_tokens_seen": 23046816, "step": 26610 }, { "epoch": 12.548326261197548, "grad_norm": 0.001633631531149149, "learning_rate": 0.07553797815552123, "loss": 0.1819, "num_input_tokens_seen": 23051152, "step": 26615 }, { "epoch": 12.55068363979255, "grad_norm": 0.0013821866596117616, "learning_rate": 0.07548684940004222, "loss": 0.1794, "num_input_tokens_seen": 23055440, "step": 26620 }, { "epoch": 12.553041018387553, "grad_norm": 0.001595496665686369, "learning_rate": 0.07543573213542744, "loss": 0.2264, "num_input_tokens_seen": 23060224, "step": 26625 }, { "epoch": 12.555398396982556, "grad_norm": 0.0018699313513934612, "learning_rate": 0.0753846263695597, "loss": 0.234, "num_input_tokens_seen": 23064128, "step": 26630 }, { "epoch": 12.557755775577558, "grad_norm": 0.002152504865080118, "learning_rate": 0.07533353211032029, "loss": 0.2276, "num_input_tokens_seen": 23068336, "step": 26635 }, { "epoch": 12.56011315417256, "grad_norm": 0.0016267058672383428, "learning_rate": 0.07528244936558857, "loss": 0.2361, "num_input_tokens_seen": 23072048, "step": 26640 }, { "epoch": 12.562470532767563, "grad_norm": 0.0019499401096254587, "learning_rate": 0.07523137814324206, "loss": 0.1856, "num_input_tokens_seen": 23076064, "step": 26645 }, { "epoch": 12.564827911362565, "grad_norm": 0.0027052853256464005, "learning_rate": 0.07518031845115672, "loss": 0.2081, "num_input_tokens_seen": 23080496, "step": 26650 }, { "epoch": 12.567185289957568, "grad_norm": 0.0013792923418805003, "learning_rate": 0.07512927029720647, "loss": 0.1964, "num_input_tokens_seen": 23084560, "step": 26655 }, { "epoch": 12.56954266855257, "grad_norm": 0.002021533902734518, "learning_rate": 0.0750782336892636, "loss": 0.2307, "num_input_tokens_seen": 23089008, "step": 26660 }, { "epoch": 12.571900047147572, "grad_norm": 0.0021919459104537964, "learning_rate": 0.0750272086351987, "loss": 0.1996, "num_input_tokens_seen": 23093248, "step": 26665 }, { "epoch": 12.574257425742575, "grad_norm": 0.0017514448845759034, "learning_rate": 0.07497619514288031, "loss": 0.2021, "num_input_tokens_seen": 23097120, "step": 26670 }, { "epoch": 12.576614804337577, "grad_norm": 0.00236295023933053, "learning_rate": 0.07492519322017545, "loss": 0.2405, "num_input_tokens_seen": 23101600, "step": 26675 }, { "epoch": 12.57897218293258, "grad_norm": 0.00184104114305228, "learning_rate": 0.0748742028749493, "loss": 0.2031, "num_input_tokens_seen": 23105680, "step": 26680 }, { "epoch": 12.581329561527582, "grad_norm": 0.0016915567684918642, "learning_rate": 0.0748232241150651, "loss": 0.2356, "num_input_tokens_seen": 23110032, "step": 26685 }, { "epoch": 12.583686940122584, "grad_norm": 0.002853140700608492, "learning_rate": 0.07477225694838453, "loss": 0.2263, "num_input_tokens_seen": 23114640, "step": 26690 }, { "epoch": 12.586044318717587, "grad_norm": 0.002332896227017045, "learning_rate": 0.07472130138276731, "loss": 0.1981, "num_input_tokens_seen": 23118864, "step": 26695 }, { "epoch": 12.58840169731259, "grad_norm": 0.0020513543859124184, "learning_rate": 0.07467035742607138, "loss": 0.224, "num_input_tokens_seen": 23123488, "step": 26700 }, { "epoch": 12.590759075907592, "grad_norm": 0.001656990614719689, "learning_rate": 0.07461942508615303, "loss": 0.1947, "num_input_tokens_seen": 23129072, "step": 26705 }, { "epoch": 12.593116454502592, "grad_norm": 0.002102404832839966, "learning_rate": 0.07456850437086657, "loss": 0.2194, "num_input_tokens_seen": 23133040, "step": 26710 }, { "epoch": 12.595473833097596, "grad_norm": 0.0015924021136015654, "learning_rate": 0.07451759528806468, "loss": 0.1832, "num_input_tokens_seen": 23136720, "step": 26715 }, { "epoch": 12.597831211692597, "grad_norm": 0.001781273982487619, "learning_rate": 0.0744666978455982, "loss": 0.1898, "num_input_tokens_seen": 23141696, "step": 26720 }, { "epoch": 12.6001885902876, "grad_norm": 0.0023413351736962795, "learning_rate": 0.07441581205131609, "loss": 0.2027, "num_input_tokens_seen": 23146192, "step": 26725 }, { "epoch": 12.602545968882602, "grad_norm": 0.0028869984671473503, "learning_rate": 0.07436493791306566, "loss": 0.2211, "num_input_tokens_seen": 23150560, "step": 26730 }, { "epoch": 12.604903347477604, "grad_norm": 0.0022023487836122513, "learning_rate": 0.07431407543869223, "loss": 0.1944, "num_input_tokens_seen": 23154944, "step": 26735 }, { "epoch": 12.607260726072607, "grad_norm": 0.0018141076434403658, "learning_rate": 0.0742632246360395, "loss": 0.2063, "num_input_tokens_seen": 23159616, "step": 26740 }, { "epoch": 12.609618104667609, "grad_norm": 0.003723234636709094, "learning_rate": 0.07421238551294934, "loss": 0.1927, "num_input_tokens_seen": 23163760, "step": 26745 }, { "epoch": 12.611975483262611, "grad_norm": 0.0014296281151473522, "learning_rate": 0.07416155807726171, "loss": 0.1693, "num_input_tokens_seen": 23167824, "step": 26750 }, { "epoch": 12.614332861857614, "grad_norm": 0.0032542904373258352, "learning_rate": 0.07411074233681492, "loss": 0.2506, "num_input_tokens_seen": 23172384, "step": 26755 }, { "epoch": 12.616690240452616, "grad_norm": 0.002967148320749402, "learning_rate": 0.07405993829944528, "loss": 0.2282, "num_input_tokens_seen": 23177040, "step": 26760 }, { "epoch": 12.619047619047619, "grad_norm": 0.001895320601761341, "learning_rate": 0.07400914597298755, "loss": 0.1868, "num_input_tokens_seen": 23181424, "step": 26765 }, { "epoch": 12.621404997642621, "grad_norm": 0.0026605986058712006, "learning_rate": 0.07395836536527445, "loss": 0.2495, "num_input_tokens_seen": 23185456, "step": 26770 }, { "epoch": 12.623762376237623, "grad_norm": 0.0017383563099429011, "learning_rate": 0.07390759648413696, "loss": 0.226, "num_input_tokens_seen": 23188736, "step": 26775 }, { "epoch": 12.626119754832626, "grad_norm": 0.00409194128587842, "learning_rate": 0.07385683933740435, "loss": 0.214, "num_input_tokens_seen": 23193088, "step": 26780 }, { "epoch": 12.628477133427628, "grad_norm": 0.002773512387648225, "learning_rate": 0.07380609393290402, "loss": 0.1984, "num_input_tokens_seen": 23196800, "step": 26785 }, { "epoch": 12.63083451202263, "grad_norm": 0.0015155128203332424, "learning_rate": 0.07375536027846147, "loss": 0.1838, "num_input_tokens_seen": 23200784, "step": 26790 }, { "epoch": 12.633191890617633, "grad_norm": 0.002389263827353716, "learning_rate": 0.07370463838190057, "loss": 0.2299, "num_input_tokens_seen": 23204192, "step": 26795 }, { "epoch": 12.635549269212635, "grad_norm": 0.0018419284606352448, "learning_rate": 0.07365392825104317, "loss": 0.2457, "num_input_tokens_seen": 23207936, "step": 26800 }, { "epoch": 12.635549269212635, "eval_loss": 0.21454894542694092, "eval_runtime": 21.8902, "eval_samples_per_second": 43.079, "eval_steps_per_second": 21.562, "num_input_tokens_seen": 23207936, "step": 26800 }, { "epoch": 12.637906647807638, "grad_norm": 0.002074162010103464, "learning_rate": 0.07360322989370945, "loss": 0.1903, "num_input_tokens_seen": 23212432, "step": 26805 }, { "epoch": 12.64026402640264, "grad_norm": 0.0026755088474601507, "learning_rate": 0.07355254331771781, "loss": 0.1992, "num_input_tokens_seen": 23217584, "step": 26810 }, { "epoch": 12.642621404997643, "grad_norm": 0.0016151960007846355, "learning_rate": 0.07350186853088461, "loss": 0.224, "num_input_tokens_seen": 23221536, "step": 26815 }, { "epoch": 12.644978783592645, "grad_norm": 0.002171027008444071, "learning_rate": 0.07345120554102462, "loss": 0.2193, "num_input_tokens_seen": 23225360, "step": 26820 }, { "epoch": 12.647336162187647, "grad_norm": 0.001508059212937951, "learning_rate": 0.07340055435595079, "loss": 0.1834, "num_input_tokens_seen": 23230320, "step": 26825 }, { "epoch": 12.64969354078265, "grad_norm": 0.0021907805930823088, "learning_rate": 0.07334991498347401, "loss": 0.2095, "num_input_tokens_seen": 23235296, "step": 26830 }, { "epoch": 12.652050919377652, "grad_norm": 0.0019480556948110461, "learning_rate": 0.07329928743140365, "loss": 0.1687, "num_input_tokens_seen": 23240240, "step": 26835 }, { "epoch": 12.654408297972655, "grad_norm": 0.002430003834888339, "learning_rate": 0.07324867170754705, "loss": 0.2201, "num_input_tokens_seen": 23245088, "step": 26840 }, { "epoch": 12.656765676567657, "grad_norm": 0.0024915605317801237, "learning_rate": 0.07319806781970974, "loss": 0.2353, "num_input_tokens_seen": 23249744, "step": 26845 }, { "epoch": 12.65912305516266, "grad_norm": 0.001959258457645774, "learning_rate": 0.07314747577569555, "loss": 0.1696, "num_input_tokens_seen": 23253984, "step": 26850 }, { "epoch": 12.661480433757662, "grad_norm": 0.0032744049094617367, "learning_rate": 0.07309689558330636, "loss": 0.2231, "num_input_tokens_seen": 23258464, "step": 26855 }, { "epoch": 12.663837812352664, "grad_norm": 0.0025698316749185324, "learning_rate": 0.0730463272503423, "loss": 0.2689, "num_input_tokens_seen": 23263792, "step": 26860 }, { "epoch": 12.666195190947667, "grad_norm": 0.0014490815810859203, "learning_rate": 0.07299577078460168, "loss": 0.1932, "num_input_tokens_seen": 23267936, "step": 26865 }, { "epoch": 12.668552569542669, "grad_norm": 0.002938194666057825, "learning_rate": 0.07294522619388083, "loss": 0.1809, "num_input_tokens_seen": 23272624, "step": 26870 }, { "epoch": 12.670909948137671, "grad_norm": 0.0018101558089256287, "learning_rate": 0.07289469348597452, "loss": 0.2002, "num_input_tokens_seen": 23277280, "step": 26875 }, { "epoch": 12.673267326732674, "grad_norm": 0.0018491678638383746, "learning_rate": 0.07284417266867535, "loss": 0.2283, "num_input_tokens_seen": 23281984, "step": 26880 }, { "epoch": 12.675624705327676, "grad_norm": 0.0018341557588428259, "learning_rate": 0.07279366374977439, "loss": 0.1911, "num_input_tokens_seen": 23286160, "step": 26885 }, { "epoch": 12.677982083922679, "grad_norm": 0.002989309374243021, "learning_rate": 0.07274316673706074, "loss": 0.1943, "num_input_tokens_seen": 23290800, "step": 26890 }, { "epoch": 12.680339462517681, "grad_norm": 0.0018026836914941669, "learning_rate": 0.07269268163832161, "loss": 0.2088, "num_input_tokens_seen": 23295328, "step": 26895 }, { "epoch": 12.682696841112683, "grad_norm": 0.0016587793361395597, "learning_rate": 0.07264220846134248, "loss": 0.2021, "num_input_tokens_seen": 23299088, "step": 26900 }, { "epoch": 12.685054219707686, "grad_norm": 0.0017360051861032844, "learning_rate": 0.07259174721390699, "loss": 0.1907, "num_input_tokens_seen": 23302800, "step": 26905 }, { "epoch": 12.687411598302688, "grad_norm": 0.0017065188148990273, "learning_rate": 0.07254129790379686, "loss": 0.1765, "num_input_tokens_seen": 23306864, "step": 26910 }, { "epoch": 12.689768976897689, "grad_norm": 0.0022955574095249176, "learning_rate": 0.072490860538792, "loss": 0.2329, "num_input_tokens_seen": 23311168, "step": 26915 }, { "epoch": 12.692126355492691, "grad_norm": 0.002466501435264945, "learning_rate": 0.07244043512667042, "loss": 0.1832, "num_input_tokens_seen": 23315200, "step": 26920 }, { "epoch": 12.694483734087694, "grad_norm": 0.0017437050119042397, "learning_rate": 0.07239002167520843, "loss": 0.2237, "num_input_tokens_seen": 23320064, "step": 26925 }, { "epoch": 12.696841112682696, "grad_norm": 0.0024176998995244503, "learning_rate": 0.07233962019218045, "loss": 0.2412, "num_input_tokens_seen": 23324496, "step": 26930 }, { "epoch": 12.699198491277699, "grad_norm": 0.0020343330688774586, "learning_rate": 0.07228923068535892, "loss": 0.2154, "num_input_tokens_seen": 23328528, "step": 26935 }, { "epoch": 12.701555869872701, "grad_norm": 0.0023078371305018663, "learning_rate": 0.0722388531625146, "loss": 0.1613, "num_input_tokens_seen": 23333472, "step": 26940 }, { "epoch": 12.703913248467703, "grad_norm": 0.002338495571166277, "learning_rate": 0.07218848763141639, "loss": 0.217, "num_input_tokens_seen": 23337152, "step": 26945 }, { "epoch": 12.706270627062706, "grad_norm": 0.0021293286699801683, "learning_rate": 0.07213813409983118, "loss": 0.1737, "num_input_tokens_seen": 23340848, "step": 26950 }, { "epoch": 12.708628005657708, "grad_norm": 0.0024564340710639954, "learning_rate": 0.0720877925755242, "loss": 0.2302, "num_input_tokens_seen": 23344752, "step": 26955 }, { "epoch": 12.71098538425271, "grad_norm": 0.0028243325650691986, "learning_rate": 0.07203746306625866, "loss": 0.2097, "num_input_tokens_seen": 23348912, "step": 26960 }, { "epoch": 12.713342762847713, "grad_norm": 0.0031891961116343737, "learning_rate": 0.07198714557979606, "loss": 0.2204, "num_input_tokens_seen": 23353008, "step": 26965 }, { "epoch": 12.715700141442715, "grad_norm": 0.002105789491906762, "learning_rate": 0.07193684012389602, "loss": 0.2038, "num_input_tokens_seen": 23356736, "step": 26970 }, { "epoch": 12.718057520037718, "grad_norm": 0.002072356641292572, "learning_rate": 0.07188654670631621, "loss": 0.1635, "num_input_tokens_seen": 23360784, "step": 26975 }, { "epoch": 12.72041489863272, "grad_norm": 0.0025962605141103268, "learning_rate": 0.07183626533481258, "loss": 0.2559, "num_input_tokens_seen": 23364848, "step": 26980 }, { "epoch": 12.722772277227723, "grad_norm": 0.0018610811093822122, "learning_rate": 0.07178599601713909, "loss": 0.1958, "num_input_tokens_seen": 23368880, "step": 26985 }, { "epoch": 12.725129655822725, "grad_norm": 0.0016213153721764684, "learning_rate": 0.07173573876104786, "loss": 0.2169, "num_input_tokens_seen": 23372928, "step": 26990 }, { "epoch": 12.727487034417727, "grad_norm": 0.002885885536670685, "learning_rate": 0.0716854935742893, "loss": 0.1832, "num_input_tokens_seen": 23377088, "step": 26995 }, { "epoch": 12.72984441301273, "grad_norm": 0.002385099418461323, "learning_rate": 0.07163526046461174, "loss": 0.2113, "num_input_tokens_seen": 23381376, "step": 27000 }, { "epoch": 12.72984441301273, "eval_loss": 0.21353647112846375, "eval_runtime": 21.8983, "eval_samples_per_second": 43.063, "eval_steps_per_second": 21.554, "num_input_tokens_seen": 23381376, "step": 27000 }, { "epoch": 12.732201791607732, "grad_norm": 0.002014672150835395, "learning_rate": 0.07158503943976181, "loss": 0.2034, "num_input_tokens_seen": 23385232, "step": 27005 }, { "epoch": 12.734559170202735, "grad_norm": 0.0025827244389802217, "learning_rate": 0.07153483050748427, "loss": 0.2632, "num_input_tokens_seen": 23389280, "step": 27010 }, { "epoch": 12.736916548797737, "grad_norm": 0.002819756744429469, "learning_rate": 0.07148463367552188, "loss": 0.2314, "num_input_tokens_seen": 23393296, "step": 27015 }, { "epoch": 12.73927392739274, "grad_norm": 0.0018407930620014668, "learning_rate": 0.07143444895161565, "loss": 0.1972, "num_input_tokens_seen": 23397456, "step": 27020 }, { "epoch": 12.741631305987742, "grad_norm": 0.002018267521634698, "learning_rate": 0.07138427634350476, "loss": 0.2007, "num_input_tokens_seen": 23402000, "step": 27025 }, { "epoch": 12.743988684582744, "grad_norm": 0.001716488040983677, "learning_rate": 0.07133411585892636, "loss": 0.2098, "num_input_tokens_seen": 23406592, "step": 27030 }, { "epoch": 12.746346063177747, "grad_norm": 0.0017438913928344846, "learning_rate": 0.07128396750561593, "loss": 0.2056, "num_input_tokens_seen": 23410864, "step": 27035 }, { "epoch": 12.748703441772749, "grad_norm": 0.002013581804931164, "learning_rate": 0.07123383129130685, "loss": 0.2201, "num_input_tokens_seen": 23415120, "step": 27040 }, { "epoch": 12.751060820367751, "grad_norm": 0.0027766625862568617, "learning_rate": 0.07118370722373084, "loss": 0.1961, "num_input_tokens_seen": 23419696, "step": 27045 }, { "epoch": 12.753418198962754, "grad_norm": 0.001869815867394209, "learning_rate": 0.07113359531061769, "loss": 0.2415, "num_input_tokens_seen": 23423648, "step": 27050 }, { "epoch": 12.755775577557756, "grad_norm": 0.0015165242366492748, "learning_rate": 0.07108349555969525, "loss": 0.1995, "num_input_tokens_seen": 23428000, "step": 27055 }, { "epoch": 12.758132956152759, "grad_norm": 0.003131876001134515, "learning_rate": 0.07103340797868944, "loss": 0.2052, "num_input_tokens_seen": 23432368, "step": 27060 }, { "epoch": 12.760490334747761, "grad_norm": 0.002230751793831587, "learning_rate": 0.07098333257532453, "loss": 0.213, "num_input_tokens_seen": 23436848, "step": 27065 }, { "epoch": 12.762847713342763, "grad_norm": 0.003149432595819235, "learning_rate": 0.07093326935732269, "loss": 0.2126, "num_input_tokens_seen": 23441152, "step": 27070 }, { "epoch": 12.765205091937766, "grad_norm": 0.003091956954449415, "learning_rate": 0.0708832183324044, "loss": 0.1969, "num_input_tokens_seen": 23445696, "step": 27075 }, { "epoch": 12.767562470532768, "grad_norm": 0.0018901799339801073, "learning_rate": 0.07083317950828799, "loss": 0.2028, "num_input_tokens_seen": 23449888, "step": 27080 }, { "epoch": 12.76991984912777, "grad_norm": 0.0018480228027328849, "learning_rate": 0.0707831528926902, "loss": 0.1898, "num_input_tokens_seen": 23455232, "step": 27085 }, { "epoch": 12.772277227722773, "grad_norm": 0.0026183989830315113, "learning_rate": 0.07073313849332578, "loss": 0.1789, "num_input_tokens_seen": 23460448, "step": 27090 }, { "epoch": 12.774634606317775, "grad_norm": 0.003312087384983897, "learning_rate": 0.07068313631790749, "loss": 0.2408, "num_input_tokens_seen": 23465152, "step": 27095 }, { "epoch": 12.776991984912778, "grad_norm": 0.0024705049581825733, "learning_rate": 0.07063314637414632, "loss": 0.1859, "num_input_tokens_seen": 23470240, "step": 27100 }, { "epoch": 12.77934936350778, "grad_norm": 0.003400129731744528, "learning_rate": 0.07058316866975144, "loss": 0.2243, "num_input_tokens_seen": 23474368, "step": 27105 }, { "epoch": 12.78170674210278, "grad_norm": 0.0029374314472079277, "learning_rate": 0.0705332032124299, "loss": 0.2457, "num_input_tokens_seen": 23478880, "step": 27110 }, { "epoch": 12.784064120697785, "grad_norm": 0.0025456161238253117, "learning_rate": 0.0704832500098871, "loss": 0.1994, "num_input_tokens_seen": 23483744, "step": 27115 }, { "epoch": 12.786421499292786, "grad_norm": 0.0023624091409146786, "learning_rate": 0.07043330906982641, "loss": 0.1926, "num_input_tokens_seen": 23487248, "step": 27120 }, { "epoch": 12.788778877887788, "grad_norm": 0.0023349819239228964, "learning_rate": 0.07038338039994936, "loss": 0.205, "num_input_tokens_seen": 23490976, "step": 27125 }, { "epoch": 12.79113625648279, "grad_norm": 0.0022836208809167147, "learning_rate": 0.07033346400795562, "loss": 0.2208, "num_input_tokens_seen": 23495472, "step": 27130 }, { "epoch": 12.793493635077793, "grad_norm": 0.002792704850435257, "learning_rate": 0.07028355990154282, "loss": 0.2486, "num_input_tokens_seen": 23498832, "step": 27135 }, { "epoch": 12.795851013672795, "grad_norm": 0.002568983007222414, "learning_rate": 0.07023366808840685, "loss": 0.2386, "num_input_tokens_seen": 23503216, "step": 27140 }, { "epoch": 12.798208392267798, "grad_norm": 0.0036334767937660217, "learning_rate": 0.07018378857624172, "loss": 0.2123, "num_input_tokens_seen": 23507504, "step": 27145 }, { "epoch": 12.8005657708628, "grad_norm": 0.0019087977707386017, "learning_rate": 0.0701339213727394, "loss": 0.2175, "num_input_tokens_seen": 23511968, "step": 27150 }, { "epoch": 12.802923149457802, "grad_norm": 0.0031531427521258593, "learning_rate": 0.07008406648559008, "loss": 0.1895, "num_input_tokens_seen": 23516080, "step": 27155 }, { "epoch": 12.805280528052805, "grad_norm": 0.002929659327492118, "learning_rate": 0.07003422392248196, "loss": 0.2235, "num_input_tokens_seen": 23520768, "step": 27160 }, { "epoch": 12.807637906647807, "grad_norm": 0.0028042495250701904, "learning_rate": 0.06998439369110142, "loss": 0.2215, "num_input_tokens_seen": 23524736, "step": 27165 }, { "epoch": 12.80999528524281, "grad_norm": 0.002739723538979888, "learning_rate": 0.06993457579913295, "loss": 0.2181, "num_input_tokens_seen": 23529264, "step": 27170 }, { "epoch": 12.812352663837812, "grad_norm": 0.00191792706027627, "learning_rate": 0.06988477025425903, "loss": 0.178, "num_input_tokens_seen": 23533248, "step": 27175 }, { "epoch": 12.814710042432814, "grad_norm": 0.002396202879026532, "learning_rate": 0.06983497706416032, "loss": 0.1985, "num_input_tokens_seen": 23537536, "step": 27180 }, { "epoch": 12.817067421027817, "grad_norm": 0.001986737595871091, "learning_rate": 0.0697851962365156, "loss": 0.1983, "num_input_tokens_seen": 23541824, "step": 27185 }, { "epoch": 12.81942479962282, "grad_norm": 0.0017289062961935997, "learning_rate": 0.06973542777900163, "loss": 0.1755, "num_input_tokens_seen": 23545600, "step": 27190 }, { "epoch": 12.821782178217822, "grad_norm": 0.002296967664733529, "learning_rate": 0.06968567169929342, "loss": 0.2391, "num_input_tokens_seen": 23549280, "step": 27195 }, { "epoch": 12.824139556812824, "grad_norm": 0.0018754933262243867, "learning_rate": 0.06963592800506392, "loss": 0.2103, "num_input_tokens_seen": 23553008, "step": 27200 }, { "epoch": 12.824139556812824, "eval_loss": 0.21196311712265015, "eval_runtime": 21.8781, "eval_samples_per_second": 43.102, "eval_steps_per_second": 21.574, "num_input_tokens_seen": 23553008, "step": 27200 }, { "epoch": 12.826496935407826, "grad_norm": 0.0032809290569275618, "learning_rate": 0.06958619670398417, "loss": 0.1927, "num_input_tokens_seen": 23556704, "step": 27205 }, { "epoch": 12.828854314002829, "grad_norm": 0.0019201061222702265, "learning_rate": 0.0695364778037235, "loss": 0.2059, "num_input_tokens_seen": 23561360, "step": 27210 }, { "epoch": 12.831211692597831, "grad_norm": 0.0026406196411699057, "learning_rate": 0.06948677131194907, "loss": 0.233, "num_input_tokens_seen": 23565888, "step": 27215 }, { "epoch": 12.833569071192834, "grad_norm": 0.0021884581074118614, "learning_rate": 0.06943707723632629, "loss": 0.2519, "num_input_tokens_seen": 23570400, "step": 27220 }, { "epoch": 12.835926449787836, "grad_norm": 0.0023663865868002176, "learning_rate": 0.06938739558451867, "loss": 0.198, "num_input_tokens_seen": 23574432, "step": 27225 }, { "epoch": 12.838283828382838, "grad_norm": 0.002473373431712389, "learning_rate": 0.06933772636418763, "loss": 0.1936, "num_input_tokens_seen": 23578592, "step": 27230 }, { "epoch": 12.84064120697784, "grad_norm": 0.0026748350355774164, "learning_rate": 0.06928806958299293, "loss": 0.2172, "num_input_tokens_seen": 23582784, "step": 27235 }, { "epoch": 12.842998585572843, "grad_norm": 0.0023759088944643736, "learning_rate": 0.06923842524859211, "loss": 0.192, "num_input_tokens_seen": 23586944, "step": 27240 }, { "epoch": 12.845355964167846, "grad_norm": 0.0029762990307062864, "learning_rate": 0.06918879336864105, "loss": 0.2277, "num_input_tokens_seen": 23591328, "step": 27245 }, { "epoch": 12.847713342762848, "grad_norm": 0.0017382703954353929, "learning_rate": 0.06913917395079362, "loss": 0.1775, "num_input_tokens_seen": 23595264, "step": 27250 }, { "epoch": 12.85007072135785, "grad_norm": 0.0016341244336217642, "learning_rate": 0.0690895670027017, "loss": 0.197, "num_input_tokens_seen": 23598960, "step": 27255 }, { "epoch": 12.852428099952853, "grad_norm": 0.0019064786611124873, "learning_rate": 0.06903997253201531, "loss": 0.2717, "num_input_tokens_seen": 23603184, "step": 27260 }, { "epoch": 12.854785478547855, "grad_norm": 0.0023457540664821863, "learning_rate": 0.06899039054638263, "loss": 0.2053, "num_input_tokens_seen": 23606944, "step": 27265 }, { "epoch": 12.857142857142858, "grad_norm": 0.00204528053291142, "learning_rate": 0.06894082105344976, "loss": 0.2006, "num_input_tokens_seen": 23611808, "step": 27270 }, { "epoch": 12.85950023573786, "grad_norm": 0.0022474732249975204, "learning_rate": 0.06889126406086087, "loss": 0.2229, "num_input_tokens_seen": 23616368, "step": 27275 }, { "epoch": 12.861857614332862, "grad_norm": 0.0023429959546774626, "learning_rate": 0.0688417195762584, "loss": 0.2112, "num_input_tokens_seen": 23620800, "step": 27280 }, { "epoch": 12.864214992927865, "grad_norm": 0.002810097299516201, "learning_rate": 0.06879218760728262, "loss": 0.1987, "num_input_tokens_seen": 23625136, "step": 27285 }, { "epoch": 12.866572371522867, "grad_norm": 0.0024655533488839865, "learning_rate": 0.06874266816157207, "loss": 0.2319, "num_input_tokens_seen": 23628720, "step": 27290 }, { "epoch": 12.86892975011787, "grad_norm": 0.0019841566681861877, "learning_rate": 0.06869316124676321, "loss": 0.2009, "num_input_tokens_seen": 23632992, "step": 27295 }, { "epoch": 12.871287128712872, "grad_norm": 0.0019208498997613788, "learning_rate": 0.06864366687049062, "loss": 0.1955, "num_input_tokens_seen": 23636832, "step": 27300 }, { "epoch": 12.873644507307874, "grad_norm": 0.0022023788187652826, "learning_rate": 0.06859418504038704, "loss": 0.1895, "num_input_tokens_seen": 23640752, "step": 27305 }, { "epoch": 12.876001885902877, "grad_norm": 0.0022295555099844933, "learning_rate": 0.06854471576408311, "loss": 0.2238, "num_input_tokens_seen": 23645184, "step": 27310 }, { "epoch": 12.878359264497877, "grad_norm": 0.0022809377405792475, "learning_rate": 0.06849525904920767, "loss": 0.2113, "num_input_tokens_seen": 23649392, "step": 27315 }, { "epoch": 12.88071664309288, "grad_norm": 0.002749457722529769, "learning_rate": 0.06844581490338748, "loss": 0.257, "num_input_tokens_seen": 23653520, "step": 27320 }, { "epoch": 12.883074021687882, "grad_norm": 0.0019319417187944055, "learning_rate": 0.06839638333424752, "loss": 0.2058, "num_input_tokens_seen": 23657696, "step": 27325 }, { "epoch": 12.885431400282885, "grad_norm": 0.0022181631065905094, "learning_rate": 0.06834696434941082, "loss": 0.2195, "num_input_tokens_seen": 23661856, "step": 27330 }, { "epoch": 12.887788778877887, "grad_norm": 0.001990054501220584, "learning_rate": 0.06829755795649824, "loss": 0.1871, "num_input_tokens_seen": 23665904, "step": 27335 }, { "epoch": 12.89014615747289, "grad_norm": 0.0023155089002102613, "learning_rate": 0.06824816416312904, "loss": 0.2242, "num_input_tokens_seen": 23670528, "step": 27340 }, { "epoch": 12.892503536067892, "grad_norm": 0.002733101835474372, "learning_rate": 0.06819878297692027, "loss": 0.2156, "num_input_tokens_seen": 23674544, "step": 27345 }, { "epoch": 12.894860914662894, "grad_norm": 0.0035316413268446922, "learning_rate": 0.0681494144054871, "loss": 0.22, "num_input_tokens_seen": 23679680, "step": 27350 }, { "epoch": 12.897218293257897, "grad_norm": 0.0023094178177416325, "learning_rate": 0.06810005845644286, "loss": 0.1782, "num_input_tokens_seen": 23683472, "step": 27355 }, { "epoch": 12.899575671852899, "grad_norm": 0.0029660502914339304, "learning_rate": 0.06805071513739878, "loss": 0.2245, "num_input_tokens_seen": 23687216, "step": 27360 }, { "epoch": 12.901933050447902, "grad_norm": 0.001864682068116963, "learning_rate": 0.06800138445596428, "loss": 0.1536, "num_input_tokens_seen": 23691392, "step": 27365 }, { "epoch": 12.904290429042904, "grad_norm": 0.002606457332149148, "learning_rate": 0.06795206641974678, "loss": 0.223, "num_input_tokens_seen": 23695776, "step": 27370 }, { "epoch": 12.906647807637906, "grad_norm": 0.0019833019468933344, "learning_rate": 0.06790276103635169, "loss": 0.2224, "num_input_tokens_seen": 23700640, "step": 27375 }, { "epoch": 12.909005186232909, "grad_norm": 0.003156360937282443, "learning_rate": 0.0678534683133826, "loss": 0.1868, "num_input_tokens_seen": 23705168, "step": 27380 }, { "epoch": 12.911362564827911, "grad_norm": 0.002458041999489069, "learning_rate": 0.06780418825844095, "loss": 0.1863, "num_input_tokens_seen": 23709104, "step": 27385 }, { "epoch": 12.913719943422914, "grad_norm": 0.0029289769008755684, "learning_rate": 0.0677549208791264, "loss": 0.2004, "num_input_tokens_seen": 23713520, "step": 27390 }, { "epoch": 12.916077322017916, "grad_norm": 0.002219975693151355, "learning_rate": 0.06770566618303668, "loss": 0.1759, "num_input_tokens_seen": 23718160, "step": 27395 }, { "epoch": 12.918434700612918, "grad_norm": 0.0023491783067584038, "learning_rate": 0.06765642417776736, "loss": 0.2114, "num_input_tokens_seen": 23722608, "step": 27400 }, { "epoch": 12.918434700612918, "eval_loss": 0.2126309722661972, "eval_runtime": 21.9486, "eval_samples_per_second": 42.964, "eval_steps_per_second": 21.505, "num_input_tokens_seen": 23722608, "step": 27400 }, { "epoch": 12.92079207920792, "grad_norm": 0.002085567219182849, "learning_rate": 0.0676071948709122, "loss": 0.165, "num_input_tokens_seen": 23727392, "step": 27405 }, { "epoch": 12.923149457802923, "grad_norm": 0.0024900645948946476, "learning_rate": 0.06755797827006307, "loss": 0.1768, "num_input_tokens_seen": 23733056, "step": 27410 }, { "epoch": 12.925506836397926, "grad_norm": 0.0029378936160355806, "learning_rate": 0.06750877438280974, "loss": 0.1891, "num_input_tokens_seen": 23737488, "step": 27415 }, { "epoch": 12.927864214992928, "grad_norm": 0.0023847802076488733, "learning_rate": 0.06745958321673998, "loss": 0.185, "num_input_tokens_seen": 23741792, "step": 27420 }, { "epoch": 12.93022159358793, "grad_norm": 0.003579799784347415, "learning_rate": 0.0674104047794398, "loss": 0.1517, "num_input_tokens_seen": 23746512, "step": 27425 }, { "epoch": 12.932578972182933, "grad_norm": 0.003797441255301237, "learning_rate": 0.06736123907849303, "loss": 0.2027, "num_input_tokens_seen": 23751040, "step": 27430 }, { "epoch": 12.934936350777935, "grad_norm": 0.0036071850918233395, "learning_rate": 0.06731208612148178, "loss": 0.2427, "num_input_tokens_seen": 23755440, "step": 27435 }, { "epoch": 12.937293729372938, "grad_norm": 0.0027280524373054504, "learning_rate": 0.0672629459159859, "loss": 0.1759, "num_input_tokens_seen": 23759200, "step": 27440 }, { "epoch": 12.93965110796794, "grad_norm": 0.0033215878065675497, "learning_rate": 0.0672138184695835, "loss": 0.2033, "num_input_tokens_seen": 23763392, "step": 27445 }, { "epoch": 12.942008486562942, "grad_norm": 0.0025842224713414907, "learning_rate": 0.0671647037898507, "loss": 0.1926, "num_input_tokens_seen": 23767696, "step": 27450 }, { "epoch": 12.944365865157945, "grad_norm": 0.0027955754194408655, "learning_rate": 0.0671156018843615, "loss": 0.2323, "num_input_tokens_seen": 23772096, "step": 27455 }, { "epoch": 12.946723243752947, "grad_norm": 0.002187287900596857, "learning_rate": 0.06706651276068812, "loss": 0.1715, "num_input_tokens_seen": 23775344, "step": 27460 }, { "epoch": 12.94908062234795, "grad_norm": 0.0036013885401189327, "learning_rate": 0.06701743642640064, "loss": 0.2236, "num_input_tokens_seen": 23779792, "step": 27465 }, { "epoch": 12.951438000942952, "grad_norm": 0.0035697331186383963, "learning_rate": 0.06696837288906729, "loss": 0.2243, "num_input_tokens_seen": 23783744, "step": 27470 }, { "epoch": 12.953795379537954, "grad_norm": 0.004157577641308308, "learning_rate": 0.06691932215625432, "loss": 0.2654, "num_input_tokens_seen": 23788208, "step": 27475 }, { "epoch": 12.956152758132957, "grad_norm": 0.004324604757130146, "learning_rate": 0.06687028423552589, "loss": 0.1986, "num_input_tokens_seen": 23792496, "step": 27480 }, { "epoch": 12.95851013672796, "grad_norm": 0.002359729493036866, "learning_rate": 0.06682125913444435, "loss": 0.2119, "num_input_tokens_seen": 23797072, "step": 27485 }, { "epoch": 12.960867515322962, "grad_norm": 0.00412893807515502, "learning_rate": 0.0667722468605699, "loss": 0.1946, "num_input_tokens_seen": 23800976, "step": 27490 }, { "epoch": 12.963224893917964, "grad_norm": 0.0035797799937427044, "learning_rate": 0.06672324742146094, "loss": 0.2202, "num_input_tokens_seen": 23805072, "step": 27495 }, { "epoch": 12.965582272512966, "grad_norm": 0.0029910383746027946, "learning_rate": 0.06667426082467373, "loss": 0.2304, "num_input_tokens_seen": 23809424, "step": 27500 }, { "epoch": 12.967939651107969, "grad_norm": 0.0034758036490529776, "learning_rate": 0.0666252870777626, "loss": 0.1952, "num_input_tokens_seen": 23813184, "step": 27505 }, { "epoch": 12.97029702970297, "grad_norm": 0.003667723387479782, "learning_rate": 0.06657632618827995, "loss": 0.2135, "num_input_tokens_seen": 23817008, "step": 27510 }, { "epoch": 12.972654408297974, "grad_norm": 0.002123629441484809, "learning_rate": 0.06652737816377623, "loss": 0.2019, "num_input_tokens_seen": 23821856, "step": 27515 }, { "epoch": 12.975011786892974, "grad_norm": 0.002284109126776457, "learning_rate": 0.06647844301179971, "loss": 0.2108, "num_input_tokens_seen": 23826368, "step": 27520 }, { "epoch": 12.977369165487977, "grad_norm": 0.003243680577725172, "learning_rate": 0.06642952073989689, "loss": 0.1777, "num_input_tokens_seen": 23830928, "step": 27525 }, { "epoch": 12.979726544082979, "grad_norm": 0.003789369948208332, "learning_rate": 0.06638061135561223, "loss": 0.2729, "num_input_tokens_seen": 23835248, "step": 27530 }, { "epoch": 12.982083922677981, "grad_norm": 0.002655148971825838, "learning_rate": 0.06633171486648808, "loss": 0.2047, "num_input_tokens_seen": 23838992, "step": 27535 }, { "epoch": 12.984441301272984, "grad_norm": 0.0018445730675011873, "learning_rate": 0.06628283128006499, "loss": 0.1943, "num_input_tokens_seen": 23843072, "step": 27540 }, { "epoch": 12.986798679867986, "grad_norm": 0.0027047304902225733, "learning_rate": 0.0662339606038813, "loss": 0.2306, "num_input_tokens_seen": 23847296, "step": 27545 }, { "epoch": 12.989156058462989, "grad_norm": 0.0015737038338556886, "learning_rate": 0.06618510284547358, "loss": 0.2255, "num_input_tokens_seen": 23851488, "step": 27550 }, { "epoch": 12.991513437057991, "grad_norm": 0.0033369131851941347, "learning_rate": 0.06613625801237633, "loss": 0.2074, "num_input_tokens_seen": 23855248, "step": 27555 }, { "epoch": 12.993870815652993, "grad_norm": 0.002806410426273942, "learning_rate": 0.066087426112122, "loss": 0.1795, "num_input_tokens_seen": 23860064, "step": 27560 }, { "epoch": 12.996228194247996, "grad_norm": 0.002588457427918911, "learning_rate": 0.06603860715224101, "loss": 0.2237, "num_input_tokens_seen": 23864064, "step": 27565 }, { "epoch": 12.998585572842998, "grad_norm": 0.0031592040322721004, "learning_rate": 0.06598980114026198, "loss": 0.1716, "num_input_tokens_seen": 23867856, "step": 27570 }, { "epoch": 13.000942951438, "grad_norm": 0.0021865188609808683, "learning_rate": 0.06594100808371128, "loss": 0.1977, "num_input_tokens_seen": 23872560, "step": 27575 }, { "epoch": 13.003300330033003, "grad_norm": 0.0021558599546551704, "learning_rate": 0.06589222799011357, "loss": 0.1991, "num_input_tokens_seen": 23877344, "step": 27580 }, { "epoch": 13.005657708628005, "grad_norm": 0.0025004108902066946, "learning_rate": 0.0658434608669912, "loss": 0.2588, "num_input_tokens_seen": 23881312, "step": 27585 }, { "epoch": 13.008015087223008, "grad_norm": 0.001766430796124041, "learning_rate": 0.06579470672186473, "loss": 0.2005, "num_input_tokens_seen": 23885248, "step": 27590 }, { "epoch": 13.01037246581801, "grad_norm": 0.003001140197739005, "learning_rate": 0.06574596556225275, "loss": 0.2548, "num_input_tokens_seen": 23889312, "step": 27595 }, { "epoch": 13.012729844413013, "grad_norm": 0.0022847128566354513, "learning_rate": 0.06569723739567161, "loss": 0.1868, "num_input_tokens_seen": 23892928, "step": 27600 }, { "epoch": 13.012729844413013, "eval_loss": 0.21349774301052094, "eval_runtime": 21.9029, "eval_samples_per_second": 43.054, "eval_steps_per_second": 21.55, "num_input_tokens_seen": 23892928, "step": 27600 }, { "epoch": 13.015087223008015, "grad_norm": 0.003682458773255348, "learning_rate": 0.06564852222963588, "loss": 0.2169, "num_input_tokens_seen": 23897472, "step": 27605 }, { "epoch": 13.017444601603017, "grad_norm": 0.0031703754793852568, "learning_rate": 0.06559982007165813, "loss": 0.1611, "num_input_tokens_seen": 23901392, "step": 27610 }, { "epoch": 13.01980198019802, "grad_norm": 0.0021520324517041445, "learning_rate": 0.06555113092924868, "loss": 0.1584, "num_input_tokens_seen": 23905616, "step": 27615 }, { "epoch": 13.022159358793022, "grad_norm": 0.003042110474780202, "learning_rate": 0.06550245480991615, "loss": 0.2312, "num_input_tokens_seen": 23909008, "step": 27620 }, { "epoch": 13.024516737388025, "grad_norm": 0.0022913780994713306, "learning_rate": 0.0654537917211669, "loss": 0.2369, "num_input_tokens_seen": 23912496, "step": 27625 }, { "epoch": 13.026874115983027, "grad_norm": 0.004723566118627787, "learning_rate": 0.0654051416705055, "loss": 0.2197, "num_input_tokens_seen": 23916320, "step": 27630 }, { "epoch": 13.02923149457803, "grad_norm": 0.0023805885575711727, "learning_rate": 0.06535650466543427, "loss": 0.1884, "num_input_tokens_seen": 23920992, "step": 27635 }, { "epoch": 13.031588873173032, "grad_norm": 0.002575993537902832, "learning_rate": 0.0653078807134538, "loss": 0.1624, "num_input_tokens_seen": 23925792, "step": 27640 }, { "epoch": 13.033946251768034, "grad_norm": 0.002734814304858446, "learning_rate": 0.06525926982206236, "loss": 0.1832, "num_input_tokens_seen": 23930672, "step": 27645 }, { "epoch": 13.036303630363037, "grad_norm": 0.002183645498007536, "learning_rate": 0.06521067199875648, "loss": 0.2052, "num_input_tokens_seen": 23934848, "step": 27650 }, { "epoch": 13.038661008958039, "grad_norm": 0.0024088930804282427, "learning_rate": 0.06516208725103047, "loss": 0.192, "num_input_tokens_seen": 23939488, "step": 27655 }, { "epoch": 13.041018387553041, "grad_norm": 0.003235182724893093, "learning_rate": 0.06511351558637678, "loss": 0.1942, "num_input_tokens_seen": 23943120, "step": 27660 }, { "epoch": 13.043375766148044, "grad_norm": 0.0020766339730471373, "learning_rate": 0.06506495701228569, "loss": 0.1974, "num_input_tokens_seen": 23947728, "step": 27665 }, { "epoch": 13.045733144743046, "grad_norm": 0.0026147235184907913, "learning_rate": 0.06501641153624559, "loss": 0.2065, "num_input_tokens_seen": 23952336, "step": 27670 }, { "epoch": 13.048090523338049, "grad_norm": 0.004071047995239496, "learning_rate": 0.06496787916574286, "loss": 0.2149, "num_input_tokens_seen": 23956560, "step": 27675 }, { "epoch": 13.050447901933051, "grad_norm": 0.002962120808660984, "learning_rate": 0.06491935990826168, "loss": 0.2225, "num_input_tokens_seen": 23961408, "step": 27680 }, { "epoch": 13.052805280528053, "grad_norm": 0.0036243193317204714, "learning_rate": 0.0648708537712844, "loss": 0.2179, "num_input_tokens_seen": 23965504, "step": 27685 }, { "epoch": 13.055162659123056, "grad_norm": 0.0031917677260935307, "learning_rate": 0.06482236076229132, "loss": 0.1602, "num_input_tokens_seen": 23969472, "step": 27690 }, { "epoch": 13.057520037718058, "grad_norm": 0.002462289994582534, "learning_rate": 0.06477388088876056, "loss": 0.183, "num_input_tokens_seen": 23973680, "step": 27695 }, { "epoch": 13.05987741631306, "grad_norm": 0.004937973339110613, "learning_rate": 0.06472541415816846, "loss": 0.1957, "num_input_tokens_seen": 23978592, "step": 27700 }, { "epoch": 13.062234794908063, "grad_norm": 0.0024522114545106888, "learning_rate": 0.06467696057798909, "loss": 0.1769, "num_input_tokens_seen": 23983520, "step": 27705 }, { "epoch": 13.064592173503065, "grad_norm": 0.004625607747584581, "learning_rate": 0.0646285201556946, "loss": 0.2528, "num_input_tokens_seen": 23987232, "step": 27710 }, { "epoch": 13.066949552098066, "grad_norm": 0.004562231712043285, "learning_rate": 0.06458009289875521, "loss": 0.249, "num_input_tokens_seen": 23991632, "step": 27715 }, { "epoch": 13.069306930693068, "grad_norm": 0.0035593973007053137, "learning_rate": 0.0645316788146389, "loss": 0.1765, "num_input_tokens_seen": 23996000, "step": 27720 }, { "epoch": 13.07166430928807, "grad_norm": 0.0035469927825033665, "learning_rate": 0.06448327791081175, "loss": 0.2435, "num_input_tokens_seen": 23999632, "step": 27725 }, { "epoch": 13.074021687883073, "grad_norm": 0.0033889133483171463, "learning_rate": 0.0644348901947379, "loss": 0.1905, "num_input_tokens_seen": 24004368, "step": 27730 }, { "epoch": 13.076379066478076, "grad_norm": 0.0035366276279091835, "learning_rate": 0.06438651567387917, "loss": 0.1634, "num_input_tokens_seen": 24008608, "step": 27735 }, { "epoch": 13.078736445073078, "grad_norm": 0.002637267578393221, "learning_rate": 0.0643381543556957, "loss": 0.1627, "num_input_tokens_seen": 24013280, "step": 27740 }, { "epoch": 13.08109382366808, "grad_norm": 0.003385243471711874, "learning_rate": 0.06428980624764526, "loss": 0.1905, "num_input_tokens_seen": 24017664, "step": 27745 }, { "epoch": 13.083451202263083, "grad_norm": 0.002465470228344202, "learning_rate": 0.06424147135718378, "loss": 0.2304, "num_input_tokens_seen": 24022112, "step": 27750 }, { "epoch": 13.085808580858085, "grad_norm": 0.0031016876455396414, "learning_rate": 0.06419314969176519, "loss": 0.2024, "num_input_tokens_seen": 24026032, "step": 27755 }, { "epoch": 13.088165959453088, "grad_norm": 0.002490459708496928, "learning_rate": 0.06414484125884118, "loss": 0.2081, "num_input_tokens_seen": 24030272, "step": 27760 }, { "epoch": 13.09052333804809, "grad_norm": 0.0033876849338412285, "learning_rate": 0.06409654606586157, "loss": 0.1862, "num_input_tokens_seen": 24034352, "step": 27765 }, { "epoch": 13.092880716643092, "grad_norm": 0.002968772314488888, "learning_rate": 0.06404826412027415, "loss": 0.1723, "num_input_tokens_seen": 24038336, "step": 27770 }, { "epoch": 13.095238095238095, "grad_norm": 0.0035668055061250925, "learning_rate": 0.06399999542952453, "loss": 0.2262, "num_input_tokens_seen": 24042240, "step": 27775 }, { "epoch": 13.097595473833097, "grad_norm": 0.0032752917613834143, "learning_rate": 0.0639517400010563, "loss": 0.1841, "num_input_tokens_seen": 24046496, "step": 27780 }, { "epoch": 13.0999528524281, "grad_norm": 0.004176370333880186, "learning_rate": 0.06390349784231118, "loss": 0.1936, "num_input_tokens_seen": 24050800, "step": 27785 }, { "epoch": 13.102310231023102, "grad_norm": 0.0024933184031397104, "learning_rate": 0.06385526896072859, "loss": 0.1646, "num_input_tokens_seen": 24055104, "step": 27790 }, { "epoch": 13.104667609618105, "grad_norm": 0.0037771931383758783, "learning_rate": 0.06380705336374613, "loss": 0.2437, "num_input_tokens_seen": 24059456, "step": 27795 }, { "epoch": 13.107024988213107, "grad_norm": 0.004191447515040636, "learning_rate": 0.06375885105879918, "loss": 0.225, "num_input_tokens_seen": 24063632, "step": 27800 }, { "epoch": 13.107024988213107, "eval_loss": 0.21396879851818085, "eval_runtime": 21.9007, "eval_samples_per_second": 43.058, "eval_steps_per_second": 21.552, "num_input_tokens_seen": 24063632, "step": 27800 }, { "epoch": 13.10938236680811, "grad_norm": 0.0027595381252467632, "learning_rate": 0.06371066205332115, "loss": 0.1885, "num_input_tokens_seen": 24068320, "step": 27805 }, { "epoch": 13.111739745403112, "grad_norm": 0.004130181856453419, "learning_rate": 0.06366248635474347, "loss": 0.186, "num_input_tokens_seen": 24073200, "step": 27810 }, { "epoch": 13.114097123998114, "grad_norm": 0.0022752874065190554, "learning_rate": 0.06361432397049532, "loss": 0.1609, "num_input_tokens_seen": 24077280, "step": 27815 }, { "epoch": 13.116454502593117, "grad_norm": 0.00259313709102571, "learning_rate": 0.06356617490800408, "loss": 0.1879, "num_input_tokens_seen": 24080848, "step": 27820 }, { "epoch": 13.118811881188119, "grad_norm": 0.0036827493458986282, "learning_rate": 0.06351803917469478, "loss": 0.2411, "num_input_tokens_seen": 24085520, "step": 27825 }, { "epoch": 13.121169259783121, "grad_norm": 0.004294184502214193, "learning_rate": 0.06346991677799067, "loss": 0.2086, "num_input_tokens_seen": 24090064, "step": 27830 }, { "epoch": 13.123526638378124, "grad_norm": 0.004798646084964275, "learning_rate": 0.06342180772531283, "loss": 0.2032, "num_input_tokens_seen": 24093952, "step": 27835 }, { "epoch": 13.125884016973126, "grad_norm": 0.002713978523388505, "learning_rate": 0.06337371202408021, "loss": 0.1442, "num_input_tokens_seen": 24098176, "step": 27840 }, { "epoch": 13.128241395568129, "grad_norm": 0.0034760653506964445, "learning_rate": 0.06332562968170984, "loss": 0.2039, "num_input_tokens_seen": 24102176, "step": 27845 }, { "epoch": 13.130598774163131, "grad_norm": 0.004568598233163357, "learning_rate": 0.06327756070561656, "loss": 0.2418, "num_input_tokens_seen": 24106960, "step": 27850 }, { "epoch": 13.132956152758133, "grad_norm": 0.004888992756605148, "learning_rate": 0.06322950510321329, "loss": 0.182, "num_input_tokens_seen": 24110720, "step": 27855 }, { "epoch": 13.135313531353136, "grad_norm": 0.0038136441726237535, "learning_rate": 0.06318146288191076, "loss": 0.2349, "num_input_tokens_seen": 24115232, "step": 27860 }, { "epoch": 13.137670909948138, "grad_norm": 0.005578142590820789, "learning_rate": 0.06313343404911763, "loss": 0.2506, "num_input_tokens_seen": 24119712, "step": 27865 }, { "epoch": 13.14002828854314, "grad_norm": 0.003042817348614335, "learning_rate": 0.0630854186122406, "loss": 0.1904, "num_input_tokens_seen": 24123504, "step": 27870 }, { "epoch": 13.142385667138143, "grad_norm": 0.003032529028132558, "learning_rate": 0.06303741657868431, "loss": 0.1865, "num_input_tokens_seen": 24127856, "step": 27875 }, { "epoch": 13.144743045733145, "grad_norm": 0.003658212022855878, "learning_rate": 0.06298942795585115, "loss": 0.1937, "num_input_tokens_seen": 24132704, "step": 27880 }, { "epoch": 13.147100424328148, "grad_norm": 0.003972828853875399, "learning_rate": 0.06294145275114167, "loss": 0.2712, "num_input_tokens_seen": 24137584, "step": 27885 }, { "epoch": 13.14945780292315, "grad_norm": 0.002773654880002141, "learning_rate": 0.06289349097195428, "loss": 0.2402, "num_input_tokens_seen": 24141840, "step": 27890 }, { "epoch": 13.151815181518153, "grad_norm": 0.003078120294958353, "learning_rate": 0.06284554262568516, "loss": 0.211, "num_input_tokens_seen": 24146512, "step": 27895 }, { "epoch": 13.154172560113155, "grad_norm": 0.0034743689466267824, "learning_rate": 0.06279760771972868, "loss": 0.1935, "num_input_tokens_seen": 24150976, "step": 27900 }, { "epoch": 13.156529938708157, "grad_norm": 0.0023883068934082985, "learning_rate": 0.06274968626147688, "loss": 0.1976, "num_input_tokens_seen": 24156128, "step": 27905 }, { "epoch": 13.15888731730316, "grad_norm": 0.004443046171218157, "learning_rate": 0.06270177825831993, "loss": 0.235, "num_input_tokens_seen": 24160096, "step": 27910 }, { "epoch": 13.16124469589816, "grad_norm": 0.0033913280349224806, "learning_rate": 0.06265388371764587, "loss": 0.2305, "num_input_tokens_seen": 24164624, "step": 27915 }, { "epoch": 13.163602074493163, "grad_norm": 0.0029678100254386663, "learning_rate": 0.0626060026468406, "loss": 0.1693, "num_input_tokens_seen": 24168416, "step": 27920 }, { "epoch": 13.165959453088165, "grad_norm": 0.003149575786665082, "learning_rate": 0.06255813505328794, "loss": 0.1726, "num_input_tokens_seen": 24173216, "step": 27925 }, { "epoch": 13.168316831683168, "grad_norm": 0.0029644242022186518, "learning_rate": 0.06251028094436978, "loss": 0.2217, "num_input_tokens_seen": 24177040, "step": 27930 }, { "epoch": 13.17067421027817, "grad_norm": 0.003028053091838956, "learning_rate": 0.06246244032746568, "loss": 0.1933, "num_input_tokens_seen": 24180880, "step": 27935 }, { "epoch": 13.173031588873172, "grad_norm": 0.004019772633910179, "learning_rate": 0.06241461320995342, "loss": 0.1773, "num_input_tokens_seen": 24185136, "step": 27940 }, { "epoch": 13.175388967468175, "grad_norm": 0.002202889183536172, "learning_rate": 0.062366799599208426, "loss": 0.1963, "num_input_tokens_seen": 24188800, "step": 27945 }, { "epoch": 13.177746346063177, "grad_norm": 0.0024415520019829273, "learning_rate": 0.06231899950260418, "loss": 0.2057, "num_input_tokens_seen": 24192832, "step": 27950 }, { "epoch": 13.18010372465818, "grad_norm": 0.00513939606025815, "learning_rate": 0.06227121292751214, "loss": 0.198, "num_input_tokens_seen": 24197440, "step": 27955 }, { "epoch": 13.182461103253182, "grad_norm": 0.004126903135329485, "learning_rate": 0.062223439881301496, "loss": 0.1869, "num_input_tokens_seen": 24202592, "step": 27960 }, { "epoch": 13.184818481848184, "grad_norm": 0.0019482758361846209, "learning_rate": 0.06217568037133948, "loss": 0.1939, "num_input_tokens_seen": 24207568, "step": 27965 }, { "epoch": 13.187175860443187, "grad_norm": 0.0032388712279498577, "learning_rate": 0.06212793440499126, "loss": 0.2126, "num_input_tokens_seen": 24211664, "step": 27970 }, { "epoch": 13.18953323903819, "grad_norm": 0.006384497974067926, "learning_rate": 0.062080201989619783, "loss": 0.1952, "num_input_tokens_seen": 24215904, "step": 27975 }, { "epoch": 13.191890617633192, "grad_norm": 0.006659139413386583, "learning_rate": 0.062032483132586094, "loss": 0.188, "num_input_tokens_seen": 24220016, "step": 27980 }, { "epoch": 13.194247996228194, "grad_norm": 0.004265317227691412, "learning_rate": 0.0619847778412489, "loss": 0.2083, "num_input_tokens_seen": 24224016, "step": 27985 }, { "epoch": 13.196605374823196, "grad_norm": 0.004381841514259577, "learning_rate": 0.06193708612296509, "loss": 0.1962, "num_input_tokens_seen": 24228736, "step": 27990 }, { "epoch": 13.198962753418199, "grad_norm": 0.0052251312881708145, "learning_rate": 0.06188940798508923, "loss": 0.1922, "num_input_tokens_seen": 24232400, "step": 27995 }, { "epoch": 13.201320132013201, "grad_norm": 0.0033367096912115812, "learning_rate": 0.06184174343497397, "loss": 0.2023, "num_input_tokens_seen": 24237248, "step": 28000 }, { "epoch": 13.201320132013201, "eval_loss": 0.2140951305627823, "eval_runtime": 21.9522, "eval_samples_per_second": 42.957, "eval_steps_per_second": 21.501, "num_input_tokens_seen": 24237248, "step": 28000 }, { "epoch": 13.203677510608204, "grad_norm": 0.005400930531322956, "learning_rate": 0.061794092479969726, "loss": 0.1785, "num_input_tokens_seen": 24241456, "step": 28005 }, { "epoch": 13.206034889203206, "grad_norm": 0.003697616746649146, "learning_rate": 0.06174645512742485, "loss": 0.2231, "num_input_tokens_seen": 24245712, "step": 28010 }, { "epoch": 13.208392267798208, "grad_norm": 0.005141960922628641, "learning_rate": 0.06169883138468565, "loss": 0.2152, "num_input_tokens_seen": 24249904, "step": 28015 }, { "epoch": 13.21074964639321, "grad_norm": 0.004695049021393061, "learning_rate": 0.06165122125909637, "loss": 0.1604, "num_input_tokens_seen": 24253808, "step": 28020 }, { "epoch": 13.213107024988213, "grad_norm": 0.004301515407860279, "learning_rate": 0.061603624757998965, "loss": 0.2452, "num_input_tokens_seen": 24257472, "step": 28025 }, { "epoch": 13.215464403583216, "grad_norm": 0.0022376300767064095, "learning_rate": 0.0615560418887335, "loss": 0.1725, "num_input_tokens_seen": 24261040, "step": 28030 }, { "epoch": 13.217821782178218, "grad_norm": 0.005454407539218664, "learning_rate": 0.06150847265863787, "loss": 0.2209, "num_input_tokens_seen": 24264976, "step": 28035 }, { "epoch": 13.22017916077322, "grad_norm": 0.0018482495797798038, "learning_rate": 0.061460917075047757, "loss": 0.1314, "num_input_tokens_seen": 24269920, "step": 28040 }, { "epoch": 13.222536539368223, "grad_norm": 0.004151421133428812, "learning_rate": 0.06141337514529694, "loss": 0.2004, "num_input_tokens_seen": 24276144, "step": 28045 }, { "epoch": 13.224893917963225, "grad_norm": 0.004850315395742655, "learning_rate": 0.06136584687671687, "loss": 0.1713, "num_input_tokens_seen": 24280688, "step": 28050 }, { "epoch": 13.227251296558228, "grad_norm": 0.004380761180073023, "learning_rate": 0.061318332276637064, "loss": 0.2078, "num_input_tokens_seen": 24285424, "step": 28055 }, { "epoch": 13.22960867515323, "grad_norm": 0.0027994848787784576, "learning_rate": 0.06127083135238491, "loss": 0.1871, "num_input_tokens_seen": 24289136, "step": 28060 }, { "epoch": 13.231966053748232, "grad_norm": 0.005169219803065062, "learning_rate": 0.06122334411128555, "loss": 0.2711, "num_input_tokens_seen": 24293072, "step": 28065 }, { "epoch": 13.234323432343235, "grad_norm": 0.00351754785515368, "learning_rate": 0.06117587056066223, "loss": 0.2171, "num_input_tokens_seen": 24296752, "step": 28070 }, { "epoch": 13.236680810938237, "grad_norm": 0.004001711029559374, "learning_rate": 0.06112841070783589, "loss": 0.2184, "num_input_tokens_seen": 24301920, "step": 28075 }, { "epoch": 13.23903818953324, "grad_norm": 0.005521354731172323, "learning_rate": 0.061080964560125406, "loss": 0.2301, "num_input_tokens_seen": 24305952, "step": 28080 }, { "epoch": 13.241395568128242, "grad_norm": 0.002503706607967615, "learning_rate": 0.06103353212484766, "loss": 0.2148, "num_input_tokens_seen": 24310016, "step": 28085 }, { "epoch": 13.243752946723244, "grad_norm": 0.0036319224163889885, "learning_rate": 0.06098611340931722, "loss": 0.1896, "num_input_tokens_seen": 24314544, "step": 28090 }, { "epoch": 13.246110325318247, "grad_norm": 0.0036453919019550085, "learning_rate": 0.06093870842084672, "loss": 0.1524, "num_input_tokens_seen": 24319616, "step": 28095 }, { "epoch": 13.24846770391325, "grad_norm": 0.0028188067954033613, "learning_rate": 0.06089131716674666, "loss": 0.1547, "num_input_tokens_seen": 24323568, "step": 28100 }, { "epoch": 13.250825082508252, "grad_norm": 0.002170430961996317, "learning_rate": 0.060843939654325226, "loss": 0.1521, "num_input_tokens_seen": 24328016, "step": 28105 }, { "epoch": 13.253182461103254, "grad_norm": 0.0026750110555440187, "learning_rate": 0.06079657589088873, "loss": 0.1555, "num_input_tokens_seen": 24331904, "step": 28110 }, { "epoch": 13.255539839698255, "grad_norm": 0.0031522426288574934, "learning_rate": 0.06074922588374126, "loss": 0.2194, "num_input_tokens_seen": 24336368, "step": 28115 }, { "epoch": 13.257897218293257, "grad_norm": 0.0028848464135080576, "learning_rate": 0.06070188964018472, "loss": 0.2181, "num_input_tokens_seen": 24341824, "step": 28120 }, { "epoch": 13.26025459688826, "grad_norm": 0.0029969143215566874, "learning_rate": 0.06065456716751902, "loss": 0.2, "num_input_tokens_seen": 24346336, "step": 28125 }, { "epoch": 13.262611975483262, "grad_norm": 0.00850124005228281, "learning_rate": 0.06060725847304182, "loss": 0.1883, "num_input_tokens_seen": 24350960, "step": 28130 }, { "epoch": 13.264969354078264, "grad_norm": 0.0036414163187146187, "learning_rate": 0.06055996356404877, "loss": 0.2027, "num_input_tokens_seen": 24355344, "step": 28135 }, { "epoch": 13.267326732673267, "grad_norm": 0.0027886638417840004, "learning_rate": 0.06051268244783327, "loss": 0.206, "num_input_tokens_seen": 24359360, "step": 28140 }, { "epoch": 13.269684111268269, "grad_norm": 0.0029075411148369312, "learning_rate": 0.06046541513168676, "loss": 0.1943, "num_input_tokens_seen": 24363360, "step": 28145 }, { "epoch": 13.272041489863271, "grad_norm": 0.005886119324713945, "learning_rate": 0.060418161622898356, "loss": 0.2288, "num_input_tokens_seen": 24367792, "step": 28150 }, { "epoch": 13.274398868458274, "grad_norm": 0.006161311641335487, "learning_rate": 0.06037092192875521, "loss": 0.2044, "num_input_tokens_seen": 24372528, "step": 28155 }, { "epoch": 13.276756247053276, "grad_norm": 0.006770299281924963, "learning_rate": 0.060323696056542225, "loss": 0.2341, "num_input_tokens_seen": 24376768, "step": 28160 }, { "epoch": 13.279113625648279, "grad_norm": 0.0034961046185344458, "learning_rate": 0.06027648401354229, "loss": 0.1839, "num_input_tokens_seen": 24380896, "step": 28165 }, { "epoch": 13.281471004243281, "grad_norm": 0.0040716324001550674, "learning_rate": 0.06022928580703601, "loss": 0.1922, "num_input_tokens_seen": 24384912, "step": 28170 }, { "epoch": 13.283828382838283, "grad_norm": 0.002871529432013631, "learning_rate": 0.060182101444301986, "loss": 0.2, "num_input_tokens_seen": 24389264, "step": 28175 }, { "epoch": 13.286185761433286, "grad_norm": 0.004829874727874994, "learning_rate": 0.06013493093261669, "loss": 0.1876, "num_input_tokens_seen": 24393584, "step": 28180 }, { "epoch": 13.288543140028288, "grad_norm": 0.002820091089233756, "learning_rate": 0.06008777427925432, "loss": 0.1997, "num_input_tokens_seen": 24398896, "step": 28185 }, { "epoch": 13.29090051862329, "grad_norm": 0.0027086748741567135, "learning_rate": 0.06004063149148705, "loss": 0.1499, "num_input_tokens_seen": 24403440, "step": 28190 }, { "epoch": 13.293257897218293, "grad_norm": 0.006282461807131767, "learning_rate": 0.05999350257658497, "loss": 0.2398, "num_input_tokens_seen": 24408000, "step": 28195 }, { "epoch": 13.295615275813295, "grad_norm": 0.0027803899720311165, "learning_rate": 0.05994638754181582, "loss": 0.2117, "num_input_tokens_seen": 24411712, "step": 28200 }, { "epoch": 13.295615275813295, "eval_loss": 0.21384276449680328, "eval_runtime": 21.9345, "eval_samples_per_second": 42.992, "eval_steps_per_second": 21.519, "num_input_tokens_seen": 24411712, "step": 28200 }, { "epoch": 13.297972654408298, "grad_norm": 0.005892493762075901, "learning_rate": 0.059899286394445445, "loss": 0.2265, "num_input_tokens_seen": 24415824, "step": 28205 }, { "epoch": 13.3003300330033, "grad_norm": 0.0029511824250221252, "learning_rate": 0.059852199141737346, "loss": 0.1965, "num_input_tokens_seen": 24420096, "step": 28210 }, { "epoch": 13.302687411598303, "grad_norm": 0.003923149313777685, "learning_rate": 0.05980512579095304, "loss": 0.2149, "num_input_tokens_seen": 24424336, "step": 28215 }, { "epoch": 13.305044790193305, "grad_norm": 0.00433417409658432, "learning_rate": 0.05975806634935181, "loss": 0.1748, "num_input_tokens_seen": 24427904, "step": 28220 }, { "epoch": 13.307402168788308, "grad_norm": 0.003342958865687251, "learning_rate": 0.05971102082419076, "loss": 0.2433, "num_input_tokens_seen": 24432480, "step": 28225 }, { "epoch": 13.30975954738331, "grad_norm": 0.0038674804382026196, "learning_rate": 0.05966398922272492, "loss": 0.1798, "num_input_tokens_seen": 24436688, "step": 28230 }, { "epoch": 13.312116925978312, "grad_norm": 0.003663122421130538, "learning_rate": 0.059616971552207236, "loss": 0.1997, "num_input_tokens_seen": 24440032, "step": 28235 }, { "epoch": 13.314474304573315, "grad_norm": 0.003171068150550127, "learning_rate": 0.059569967819888305, "loss": 0.1746, "num_input_tokens_seen": 24444480, "step": 28240 }, { "epoch": 13.316831683168317, "grad_norm": 0.0039473711512982845, "learning_rate": 0.05952297803301681, "loss": 0.1768, "num_input_tokens_seen": 24449104, "step": 28245 }, { "epoch": 13.31918906176332, "grad_norm": 0.004195316694676876, "learning_rate": 0.059476002198839056, "loss": 0.1942, "num_input_tokens_seen": 24452800, "step": 28250 }, { "epoch": 13.321546440358322, "grad_norm": 0.0033729146234691143, "learning_rate": 0.05942904032459935, "loss": 0.1543, "num_input_tokens_seen": 24457200, "step": 28255 }, { "epoch": 13.323903818953324, "grad_norm": 0.004285888280719519, "learning_rate": 0.05938209241753987, "loss": 0.2051, "num_input_tokens_seen": 24461776, "step": 28260 }, { "epoch": 13.326261197548327, "grad_norm": 0.006336894351989031, "learning_rate": 0.05933515848490046, "loss": 0.2457, "num_input_tokens_seen": 24465504, "step": 28265 }, { "epoch": 13.32861857614333, "grad_norm": 0.004565963987261057, "learning_rate": 0.059288238533918985, "loss": 0.1661, "num_input_tokens_seen": 24470240, "step": 28270 }, { "epoch": 13.330975954738332, "grad_norm": 0.00478568347170949, "learning_rate": 0.05924133257183113, "loss": 0.1917, "num_input_tokens_seen": 24474624, "step": 28275 }, { "epoch": 13.333333333333334, "grad_norm": 0.002604943234473467, "learning_rate": 0.059194440605870285, "loss": 0.2095, "num_input_tokens_seen": 24479024, "step": 28280 }, { "epoch": 13.335690711928336, "grad_norm": 0.006089653354138136, "learning_rate": 0.059147562643267884, "loss": 0.2378, "num_input_tokens_seen": 24483584, "step": 28285 }, { "epoch": 13.338048090523339, "grad_norm": 0.0036345927510410547, "learning_rate": 0.059100698691253055, "loss": 0.1829, "num_input_tokens_seen": 24488208, "step": 28290 }, { "epoch": 13.340405469118341, "grad_norm": 0.0033221275079995394, "learning_rate": 0.05905384875705273, "loss": 0.2075, "num_input_tokens_seen": 24492240, "step": 28295 }, { "epoch": 13.342762847713344, "grad_norm": 0.0029119213577359915, "learning_rate": 0.05900701284789189, "loss": 0.192, "num_input_tokens_seen": 24496848, "step": 28300 }, { "epoch": 13.345120226308346, "grad_norm": 0.008414680138230324, "learning_rate": 0.058960190970993115, "loss": 0.1876, "num_input_tokens_seen": 24502480, "step": 28305 }, { "epoch": 13.347477604903348, "grad_norm": 0.0036090167704969645, "learning_rate": 0.058913383133576955, "loss": 0.2627, "num_input_tokens_seen": 24506736, "step": 28310 }, { "epoch": 13.34983498349835, "grad_norm": 0.0033711292780935764, "learning_rate": 0.05886658934286185, "loss": 0.2141, "num_input_tokens_seen": 24511040, "step": 28315 }, { "epoch": 13.352192362093351, "grad_norm": 0.0029844765085726976, "learning_rate": 0.058819809606063846, "loss": 0.1817, "num_input_tokens_seen": 24514608, "step": 28320 }, { "epoch": 13.354549740688354, "grad_norm": 0.0028412824030965567, "learning_rate": 0.05877304393039711, "loss": 0.1681, "num_input_tokens_seen": 24518368, "step": 28325 }, { "epoch": 13.356907119283356, "grad_norm": 0.00374275678768754, "learning_rate": 0.05872629232307338, "loss": 0.179, "num_input_tokens_seen": 24522752, "step": 28330 }, { "epoch": 13.359264497878359, "grad_norm": 0.003863735357299447, "learning_rate": 0.05867955479130239, "loss": 0.1837, "num_input_tokens_seen": 24526688, "step": 28335 }, { "epoch": 13.361621876473361, "grad_norm": 0.0031909197568893433, "learning_rate": 0.058632831342291705, "loss": 0.1433, "num_input_tokens_seen": 24531984, "step": 28340 }, { "epoch": 13.363979255068363, "grad_norm": 0.0034158031921833754, "learning_rate": 0.05858612198324655, "loss": 0.2103, "num_input_tokens_seen": 24536320, "step": 28345 }, { "epoch": 13.366336633663366, "grad_norm": 0.008751826360821724, "learning_rate": 0.05853942672137025, "loss": 0.2085, "num_input_tokens_seen": 24541600, "step": 28350 }, { "epoch": 13.368694012258368, "grad_norm": 0.004063539672642946, "learning_rate": 0.05849274556386363, "loss": 0.2129, "num_input_tokens_seen": 24545600, "step": 28355 }, { "epoch": 13.37105139085337, "grad_norm": 0.005061716306954622, "learning_rate": 0.05844607851792567, "loss": 0.1496, "num_input_tokens_seen": 24549952, "step": 28360 }, { "epoch": 13.373408769448373, "grad_norm": 0.003114063758403063, "learning_rate": 0.058399425590752924, "loss": 0.1658, "num_input_tokens_seen": 24554560, "step": 28365 }, { "epoch": 13.375766148043375, "grad_norm": 0.003976886626332998, "learning_rate": 0.05835278678953985, "loss": 0.1747, "num_input_tokens_seen": 24559040, "step": 28370 }, { "epoch": 13.378123526638378, "grad_norm": 0.004157899878919125, "learning_rate": 0.05830616212147874, "loss": 0.1832, "num_input_tokens_seen": 24563408, "step": 28375 }, { "epoch": 13.38048090523338, "grad_norm": 0.004519881214946508, "learning_rate": 0.058259551593759784, "loss": 0.2118, "num_input_tokens_seen": 24567072, "step": 28380 }, { "epoch": 13.382838283828383, "grad_norm": 0.003807040164247155, "learning_rate": 0.058212955213570804, "loss": 0.1562, "num_input_tokens_seen": 24571824, "step": 28385 }, { "epoch": 13.385195662423385, "grad_norm": 0.004244002979248762, "learning_rate": 0.0581663729880976, "loss": 0.233, "num_input_tokens_seen": 24576272, "step": 28390 }, { "epoch": 13.387553041018387, "grad_norm": 0.002989956410601735, "learning_rate": 0.05811980492452379, "loss": 0.2049, "num_input_tokens_seen": 24580304, "step": 28395 }, { "epoch": 13.38991041961339, "grad_norm": 0.00439246604219079, "learning_rate": 0.058073251030030644, "loss": 0.1836, "num_input_tokens_seen": 24584800, "step": 28400 }, { "epoch": 13.38991041961339, "eval_loss": 0.21217310428619385, "eval_runtime": 21.927, "eval_samples_per_second": 43.006, "eval_steps_per_second": 21.526, "num_input_tokens_seen": 24584800, "step": 28400 }, { "epoch": 13.392267798208392, "grad_norm": 0.00468235369771719, "learning_rate": 0.05802671131179747, "loss": 0.1912, "num_input_tokens_seen": 24589376, "step": 28405 }, { "epoch": 13.394625176803395, "grad_norm": 0.0034818497952073812, "learning_rate": 0.057980185777001154, "loss": 0.2414, "num_input_tokens_seen": 24593840, "step": 28410 }, { "epoch": 13.396982555398397, "grad_norm": 0.0028383713215589523, "learning_rate": 0.057933674432816606, "loss": 0.2027, "num_input_tokens_seen": 24598752, "step": 28415 }, { "epoch": 13.3993399339934, "grad_norm": 0.005863791797310114, "learning_rate": 0.05788717728641648, "loss": 0.2596, "num_input_tokens_seen": 24602368, "step": 28420 }, { "epoch": 13.401697312588402, "grad_norm": 0.005047095473855734, "learning_rate": 0.057840694344971126, "loss": 0.1757, "num_input_tokens_seen": 24606480, "step": 28425 }, { "epoch": 13.404054691183404, "grad_norm": 0.0047446079552173615, "learning_rate": 0.0577942256156489, "loss": 0.1959, "num_input_tokens_seen": 24610544, "step": 28430 }, { "epoch": 13.406412069778407, "grad_norm": 0.003061014460399747, "learning_rate": 0.057747771105615804, "loss": 0.1846, "num_input_tokens_seen": 24615680, "step": 28435 }, { "epoch": 13.408769448373409, "grad_norm": 0.004367829766124487, "learning_rate": 0.05770133082203568, "loss": 0.2115, "num_input_tokens_seen": 24620128, "step": 28440 }, { "epoch": 13.411126826968411, "grad_norm": 0.004928905516862869, "learning_rate": 0.0576549047720703, "loss": 0.1973, "num_input_tokens_seen": 24625072, "step": 28445 }, { "epoch": 13.413484205563414, "grad_norm": 0.0037302598357200623, "learning_rate": 0.05760849296287902, "loss": 0.1898, "num_input_tokens_seen": 24628704, "step": 28450 }, { "epoch": 13.415841584158416, "grad_norm": 0.004397667478770018, "learning_rate": 0.05756209540161919, "loss": 0.2185, "num_input_tokens_seen": 24633424, "step": 28455 }, { "epoch": 13.418198962753419, "grad_norm": 0.004364490509033203, "learning_rate": 0.05751571209544595, "loss": 0.1882, "num_input_tokens_seen": 24637136, "step": 28460 }, { "epoch": 13.420556341348421, "grad_norm": 0.0018602063646540046, "learning_rate": 0.057469343051512085, "loss": 0.188, "num_input_tokens_seen": 24641072, "step": 28465 }, { "epoch": 13.422913719943423, "grad_norm": 0.0025981825310736895, "learning_rate": 0.057422988276968324, "loss": 0.1357, "num_input_tokens_seen": 24644912, "step": 28470 }, { "epoch": 13.425271098538426, "grad_norm": 0.00603348296135664, "learning_rate": 0.05737664777896323, "loss": 0.1793, "num_input_tokens_seen": 24649344, "step": 28475 }, { "epoch": 13.427628477133428, "grad_norm": 0.0021637571044266224, "learning_rate": 0.057330321564642975, "loss": 0.2287, "num_input_tokens_seen": 24653808, "step": 28480 }, { "epoch": 13.42998585572843, "grad_norm": 0.0034166877157986164, "learning_rate": 0.05728400964115174, "loss": 0.168, "num_input_tokens_seen": 24658528, "step": 28485 }, { "epoch": 13.432343234323433, "grad_norm": 0.003960008267313242, "learning_rate": 0.057237712015631305, "loss": 0.1896, "num_input_tokens_seen": 24662848, "step": 28490 }, { "epoch": 13.434700612918435, "grad_norm": 0.005399961955845356, "learning_rate": 0.057191428695221425, "loss": 0.1934, "num_input_tokens_seen": 24667056, "step": 28495 }, { "epoch": 13.437057991513438, "grad_norm": 0.00431847246363759, "learning_rate": 0.05714515968705958, "loss": 0.224, "num_input_tokens_seen": 24671360, "step": 28500 }, { "epoch": 13.43941537010844, "grad_norm": 0.003903711447492242, "learning_rate": 0.05709890499828099, "loss": 0.174, "num_input_tokens_seen": 24676480, "step": 28505 }, { "epoch": 13.441772748703443, "grad_norm": 0.0037968994583934546, "learning_rate": 0.05705266463601868, "loss": 0.1788, "num_input_tokens_seen": 24680864, "step": 28510 }, { "epoch": 13.444130127298443, "grad_norm": 0.0038604261353611946, "learning_rate": 0.057006438607403565, "loss": 0.1397, "num_input_tokens_seen": 24684944, "step": 28515 }, { "epoch": 13.446487505893446, "grad_norm": 0.004196444526314735, "learning_rate": 0.056960226919564205, "loss": 0.1942, "num_input_tokens_seen": 24688864, "step": 28520 }, { "epoch": 13.448844884488448, "grad_norm": 0.005792573094367981, "learning_rate": 0.05691402957962713, "loss": 0.2285, "num_input_tokens_seen": 24693632, "step": 28525 }, { "epoch": 13.45120226308345, "grad_norm": 0.002516391221433878, "learning_rate": 0.05686784659471642, "loss": 0.2288, "num_input_tokens_seen": 24697456, "step": 28530 }, { "epoch": 13.453559641678453, "grad_norm": 0.003014878137037158, "learning_rate": 0.056821677971954136, "loss": 0.1935, "num_input_tokens_seen": 24701920, "step": 28535 }, { "epoch": 13.455917020273455, "grad_norm": 0.002779040252789855, "learning_rate": 0.05677552371846012, "loss": 0.1753, "num_input_tokens_seen": 24706080, "step": 28540 }, { "epoch": 13.458274398868458, "grad_norm": 0.0035162148997187614, "learning_rate": 0.05672938384135182, "loss": 0.2106, "num_input_tokens_seen": 24710160, "step": 28545 }, { "epoch": 13.46063177746346, "grad_norm": 0.004334775265306234, "learning_rate": 0.05668325834774465, "loss": 0.1738, "num_input_tokens_seen": 24714256, "step": 28550 }, { "epoch": 13.462989156058462, "grad_norm": 0.0028555768076330423, "learning_rate": 0.05663714724475177, "loss": 0.1302, "num_input_tokens_seen": 24717872, "step": 28555 }, { "epoch": 13.465346534653465, "grad_norm": 0.004129506181925535, "learning_rate": 0.05659105053948403, "loss": 0.1898, "num_input_tokens_seen": 24723472, "step": 28560 }, { "epoch": 13.467703913248467, "grad_norm": 0.0075521161779761314, "learning_rate": 0.056544968239050176, "loss": 0.2837, "num_input_tokens_seen": 24729120, "step": 28565 }, { "epoch": 13.47006129184347, "grad_norm": 0.005326953250914812, "learning_rate": 0.056498900350556616, "loss": 0.2394, "num_input_tokens_seen": 24733552, "step": 28570 }, { "epoch": 13.472418670438472, "grad_norm": 0.002482346259057522, "learning_rate": 0.05645284688110766, "loss": 0.1607, "num_input_tokens_seen": 24738496, "step": 28575 }, { "epoch": 13.474776049033474, "grad_norm": 0.0029092859476804733, "learning_rate": 0.05640680783780532, "loss": 0.1684, "num_input_tokens_seen": 24742256, "step": 28580 }, { "epoch": 13.477133427628477, "grad_norm": 0.0027815192006528378, "learning_rate": 0.056360783227749324, "loss": 0.2098, "num_input_tokens_seen": 24746768, "step": 28585 }, { "epoch": 13.47949080622348, "grad_norm": 0.003038255264982581, "learning_rate": 0.05631477305803728, "loss": 0.1545, "num_input_tokens_seen": 24750960, "step": 28590 }, { "epoch": 13.481848184818482, "grad_norm": 0.006580726243555546, "learning_rate": 0.05626877733576462, "loss": 0.1363, "num_input_tokens_seen": 24755120, "step": 28595 }, { "epoch": 13.484205563413484, "grad_norm": 0.0019359199795871973, "learning_rate": 0.05622279606802435, "loss": 0.1836, "num_input_tokens_seen": 24759888, "step": 28600 }, { "epoch": 13.484205563413484, "eval_loss": 0.22224083542823792, "eval_runtime": 21.9158, "eval_samples_per_second": 43.028, "eval_steps_per_second": 21.537, "num_input_tokens_seen": 24759888, "step": 28600 }, { "epoch": 13.486562942008486, "grad_norm": 0.003124724142253399, "learning_rate": 0.05617682926190744, "loss": 0.1894, "num_input_tokens_seen": 24764560, "step": 28605 }, { "epoch": 13.488920320603489, "grad_norm": 0.003626782912760973, "learning_rate": 0.05613087692450248, "loss": 0.214, "num_input_tokens_seen": 24769184, "step": 28610 }, { "epoch": 13.491277699198491, "grad_norm": 0.003989465069025755, "learning_rate": 0.05608493906289592, "loss": 0.2224, "num_input_tokens_seen": 24773088, "step": 28615 }, { "epoch": 13.493635077793494, "grad_norm": 0.003706319723278284, "learning_rate": 0.05603901568417201, "loss": 0.1828, "num_input_tokens_seen": 24777520, "step": 28620 }, { "epoch": 13.495992456388496, "grad_norm": 0.004123826045542955, "learning_rate": 0.055993106795412625, "loss": 0.2338, "num_input_tokens_seen": 24781360, "step": 28625 }, { "epoch": 13.498349834983498, "grad_norm": 0.0038644177839159966, "learning_rate": 0.05594721240369759, "loss": 0.1592, "num_input_tokens_seen": 24785648, "step": 28630 }, { "epoch": 13.500707213578501, "grad_norm": 0.003213512944057584, "learning_rate": 0.055901332516104296, "loss": 0.1908, "num_input_tokens_seen": 24790304, "step": 28635 }, { "epoch": 13.503064592173503, "grad_norm": 0.005704285576939583, "learning_rate": 0.05585546713970804, "loss": 0.2105, "num_input_tokens_seen": 24794048, "step": 28640 }, { "epoch": 13.505421970768506, "grad_norm": 0.003139129839837551, "learning_rate": 0.05580961628158189, "loss": 0.225, "num_input_tokens_seen": 24799040, "step": 28645 }, { "epoch": 13.507779349363508, "grad_norm": 0.003374840598553419, "learning_rate": 0.05576377994879659, "loss": 0.2037, "num_input_tokens_seen": 24802992, "step": 28650 }, { "epoch": 13.51013672795851, "grad_norm": 0.0038126667495816946, "learning_rate": 0.05571795814842063, "loss": 0.184, "num_input_tokens_seen": 24807248, "step": 28655 }, { "epoch": 13.512494106553513, "grad_norm": 0.002653178060427308, "learning_rate": 0.05567215088752037, "loss": 0.2166, "num_input_tokens_seen": 24811696, "step": 28660 }, { "epoch": 13.514851485148515, "grad_norm": 0.0051523963920772076, "learning_rate": 0.05562635817315981, "loss": 0.2072, "num_input_tokens_seen": 24815376, "step": 28665 }, { "epoch": 13.517208863743518, "grad_norm": 0.002797238063067198, "learning_rate": 0.05558058001240083, "loss": 0.1601, "num_input_tokens_seen": 24819712, "step": 28670 }, { "epoch": 13.51956624233852, "grad_norm": 0.005402186419814825, "learning_rate": 0.055534816412302915, "loss": 0.133, "num_input_tokens_seen": 24824768, "step": 28675 }, { "epoch": 13.521923620933523, "grad_norm": 0.004436859395354986, "learning_rate": 0.055489067379923436, "loss": 0.2048, "num_input_tokens_seen": 24829104, "step": 28680 }, { "epoch": 13.524280999528525, "grad_norm": 0.004116388037800789, "learning_rate": 0.055443332922317505, "loss": 0.1965, "num_input_tokens_seen": 24832864, "step": 28685 }, { "epoch": 13.526638378123527, "grad_norm": 0.0035979272797703743, "learning_rate": 0.055397613046537876, "loss": 0.1729, "num_input_tokens_seen": 24836656, "step": 28690 }, { "epoch": 13.52899575671853, "grad_norm": 0.0059866104274988174, "learning_rate": 0.055351907759635145, "loss": 0.2924, "num_input_tokens_seen": 24841120, "step": 28695 }, { "epoch": 13.531353135313532, "grad_norm": 0.004189211875200272, "learning_rate": 0.05530621706865772, "loss": 0.1932, "num_input_tokens_seen": 24845792, "step": 28700 }, { "epoch": 13.533710513908535, "grad_norm": 0.004581711255013943, "learning_rate": 0.055260540980651564, "loss": 0.183, "num_input_tokens_seen": 24850704, "step": 28705 }, { "epoch": 13.536067892503535, "grad_norm": 0.005896758288145065, "learning_rate": 0.05521487950266062, "loss": 0.1509, "num_input_tokens_seen": 24855200, "step": 28710 }, { "epoch": 13.53842527109854, "grad_norm": 0.003892758162692189, "learning_rate": 0.055169232641726344, "loss": 0.1931, "num_input_tokens_seen": 24859696, "step": 28715 }, { "epoch": 13.54078264969354, "grad_norm": 0.0030240598134696484, "learning_rate": 0.055123600404888166, "loss": 0.1884, "num_input_tokens_seen": 24864000, "step": 28720 }, { "epoch": 13.543140028288542, "grad_norm": 0.004869920667260885, "learning_rate": 0.05507798279918309, "loss": 0.1765, "num_input_tokens_seen": 24868720, "step": 28725 }, { "epoch": 13.545497406883545, "grad_norm": 0.004318545572459698, "learning_rate": 0.0550323798316459, "loss": 0.1821, "num_input_tokens_seen": 24873056, "step": 28730 }, { "epoch": 13.547854785478547, "grad_norm": 0.00289716525003314, "learning_rate": 0.05498679150930916, "loss": 0.2107, "num_input_tokens_seen": 24877200, "step": 28735 }, { "epoch": 13.55021216407355, "grad_norm": 0.00298098917119205, "learning_rate": 0.05494121783920323, "loss": 0.2495, "num_input_tokens_seen": 24881584, "step": 28740 }, { "epoch": 13.552569542668552, "grad_norm": 0.0060999151319265366, "learning_rate": 0.05489565882835605, "loss": 0.1711, "num_input_tokens_seen": 24886544, "step": 28745 }, { "epoch": 13.554926921263554, "grad_norm": 0.003732851706445217, "learning_rate": 0.05485011448379348, "loss": 0.1503, "num_input_tokens_seen": 24891664, "step": 28750 }, { "epoch": 13.557284299858557, "grad_norm": 0.007610046770423651, "learning_rate": 0.05480458481253893, "loss": 0.1851, "num_input_tokens_seen": 24895920, "step": 28755 }, { "epoch": 13.55964167845356, "grad_norm": 0.003554454306140542, "learning_rate": 0.054759069821613715, "loss": 0.1549, "num_input_tokens_seen": 24900304, "step": 28760 }, { "epoch": 13.561999057048562, "grad_norm": 0.002551827346906066, "learning_rate": 0.05471356951803683, "loss": 0.2156, "num_input_tokens_seen": 24904432, "step": 28765 }, { "epoch": 13.564356435643564, "grad_norm": 0.0034580405335873365, "learning_rate": 0.054668083908824945, "loss": 0.2575, "num_input_tokens_seen": 24909568, "step": 28770 }, { "epoch": 13.566713814238566, "grad_norm": 0.004853108432143927, "learning_rate": 0.054622613000992526, "loss": 0.25, "num_input_tokens_seen": 24913616, "step": 28775 }, { "epoch": 13.569071192833569, "grad_norm": 0.005195214878767729, "learning_rate": 0.05457715680155182, "loss": 0.1675, "num_input_tokens_seen": 24918048, "step": 28780 }, { "epoch": 13.571428571428571, "grad_norm": 0.005279366858303547, "learning_rate": 0.05453171531751265, "loss": 0.1853, "num_input_tokens_seen": 24922464, "step": 28785 }, { "epoch": 13.573785950023574, "grad_norm": 0.00409263651818037, "learning_rate": 0.05448628855588276, "loss": 0.2076, "num_input_tokens_seen": 24926800, "step": 28790 }, { "epoch": 13.576143328618576, "grad_norm": 0.0037708377931267023, "learning_rate": 0.05444087652366746, "loss": 0.2264, "num_input_tokens_seen": 24932192, "step": 28795 }, { "epoch": 13.578500707213578, "grad_norm": 0.0039263032376766205, "learning_rate": 0.05439547922786984, "loss": 0.2203, "num_input_tokens_seen": 24936720, "step": 28800 }, { "epoch": 13.578500707213578, "eval_loss": 0.22089065611362457, "eval_runtime": 21.9223, "eval_samples_per_second": 43.016, "eval_steps_per_second": 21.531, "num_input_tokens_seen": 24936720, "step": 28800 }, { "epoch": 13.58085808580858, "grad_norm": 0.006409460678696632, "learning_rate": 0.0543500966754908, "loss": 0.1963, "num_input_tokens_seen": 24941280, "step": 28805 }, { "epoch": 13.583215464403583, "grad_norm": 0.003566692117601633, "learning_rate": 0.05430472887352882, "loss": 0.1872, "num_input_tokens_seen": 24945088, "step": 28810 }, { "epoch": 13.585572842998586, "grad_norm": 0.0038699000142514706, "learning_rate": 0.05425937582898023, "loss": 0.2181, "num_input_tokens_seen": 24949456, "step": 28815 }, { "epoch": 13.587930221593588, "grad_norm": 0.0036918758414685726, "learning_rate": 0.054214037548839085, "loss": 0.1601, "num_input_tokens_seen": 24954336, "step": 28820 }, { "epoch": 13.59028760018859, "grad_norm": 0.004864132963120937, "learning_rate": 0.05416871404009703, "loss": 0.215, "num_input_tokens_seen": 24958752, "step": 28825 }, { "epoch": 13.592644978783593, "grad_norm": 0.0038606938906013966, "learning_rate": 0.054123405309743605, "loss": 0.1631, "num_input_tokens_seen": 24963296, "step": 28830 }, { "epoch": 13.595002357378595, "grad_norm": 0.004901327192783356, "learning_rate": 0.0540781113647659, "loss": 0.2024, "num_input_tokens_seen": 24967728, "step": 28835 }, { "epoch": 13.597359735973598, "grad_norm": 0.005510315764695406, "learning_rate": 0.054032832212148836, "loss": 0.2424, "num_input_tokens_seen": 24972544, "step": 28840 }, { "epoch": 13.5997171145686, "grad_norm": 0.004320807289332151, "learning_rate": 0.0539875678588751, "loss": 0.2456, "num_input_tokens_seen": 24977168, "step": 28845 }, { "epoch": 13.602074493163602, "grad_norm": 0.00375325046479702, "learning_rate": 0.05394231831192492, "loss": 0.1934, "num_input_tokens_seen": 24981632, "step": 28850 }, { "epoch": 13.604431871758605, "grad_norm": 0.007146845106035471, "learning_rate": 0.05389708357827639, "loss": 0.2199, "num_input_tokens_seen": 24985552, "step": 28855 }, { "epoch": 13.606789250353607, "grad_norm": 0.004711825400590897, "learning_rate": 0.05385186366490533, "loss": 0.1986, "num_input_tokens_seen": 24989712, "step": 28860 }, { "epoch": 13.60914662894861, "grad_norm": 0.005951453931629658, "learning_rate": 0.053806658578785166, "loss": 0.2152, "num_input_tokens_seen": 24994224, "step": 28865 }, { "epoch": 13.611504007543612, "grad_norm": 0.0059767975471913815, "learning_rate": 0.05376146832688705, "loss": 0.1974, "num_input_tokens_seen": 24998928, "step": 28870 }, { "epoch": 13.613861386138614, "grad_norm": 0.0045145186595618725, "learning_rate": 0.053716292916179964, "loss": 0.2441, "num_input_tokens_seen": 25003504, "step": 28875 }, { "epoch": 13.616218764733617, "grad_norm": 0.0036954889073967934, "learning_rate": 0.05367113235363045, "loss": 0.1986, "num_input_tokens_seen": 25008064, "step": 28880 }, { "epoch": 13.61857614332862, "grad_norm": 0.00467325933277607, "learning_rate": 0.05362598664620289, "loss": 0.164, "num_input_tokens_seen": 25011840, "step": 28885 }, { "epoch": 13.620933521923622, "grad_norm": 0.0039536841213703156, "learning_rate": 0.053580855800859285, "loss": 0.1471, "num_input_tokens_seen": 25015680, "step": 28890 }, { "epoch": 13.623290900518624, "grad_norm": 0.0029755316209048033, "learning_rate": 0.05353573982455938, "loss": 0.1855, "num_input_tokens_seen": 25020464, "step": 28895 }, { "epoch": 13.625648279113626, "grad_norm": 0.006905414164066315, "learning_rate": 0.053490638724260686, "loss": 0.2005, "num_input_tokens_seen": 25025296, "step": 28900 }, { "epoch": 13.628005657708629, "grad_norm": 0.004159688483923674, "learning_rate": 0.05344555250691827, "loss": 0.2131, "num_input_tokens_seen": 25029696, "step": 28905 }, { "epoch": 13.630363036303631, "grad_norm": 0.004368811380118132, "learning_rate": 0.053400481179485086, "loss": 0.1855, "num_input_tokens_seen": 25034592, "step": 28910 }, { "epoch": 13.632720414898632, "grad_norm": 0.004641393665224314, "learning_rate": 0.05335542474891159, "loss": 0.1797, "num_input_tokens_seen": 25039120, "step": 28915 }, { "epoch": 13.635077793493634, "grad_norm": 0.004671767819672823, "learning_rate": 0.053310383222146124, "loss": 0.2283, "num_input_tokens_seen": 25043648, "step": 28920 }, { "epoch": 13.637435172088637, "grad_norm": 0.0031480840407311916, "learning_rate": 0.053265356606134684, "loss": 0.2592, "num_input_tokens_seen": 25047728, "step": 28925 }, { "epoch": 13.639792550683639, "grad_norm": 0.002788742072880268, "learning_rate": 0.053220344907820856, "loss": 0.1512, "num_input_tokens_seen": 25052112, "step": 28930 }, { "epoch": 13.642149929278641, "grad_norm": 0.0038656184915453196, "learning_rate": 0.05317534813414608, "loss": 0.1938, "num_input_tokens_seen": 25056416, "step": 28935 }, { "epoch": 13.644507307873644, "grad_norm": 0.004766746889799833, "learning_rate": 0.05313036629204942, "loss": 0.1909, "num_input_tokens_seen": 25060096, "step": 28940 }, { "epoch": 13.646864686468646, "grad_norm": 0.004360992927104235, "learning_rate": 0.05308539938846756, "loss": 0.1666, "num_input_tokens_seen": 25064640, "step": 28945 }, { "epoch": 13.649222065063649, "grad_norm": 0.006198831368237734, "learning_rate": 0.05304044743033507, "loss": 0.2402, "num_input_tokens_seen": 25068736, "step": 28950 }, { "epoch": 13.651579443658651, "grad_norm": 0.0019321818836033344, "learning_rate": 0.05299551042458401, "loss": 0.151, "num_input_tokens_seen": 25072736, "step": 28955 }, { "epoch": 13.653936822253653, "grad_norm": 0.006511553190648556, "learning_rate": 0.052950588378144266, "loss": 0.218, "num_input_tokens_seen": 25077712, "step": 28960 }, { "epoch": 13.656294200848656, "grad_norm": 0.003966438118368387, "learning_rate": 0.052905681297943465, "loss": 0.186, "num_input_tokens_seen": 25082208, "step": 28965 }, { "epoch": 13.658651579443658, "grad_norm": 0.004760412033647299, "learning_rate": 0.0528607891909067, "loss": 0.2952, "num_input_tokens_seen": 25086864, "step": 28970 }, { "epoch": 13.66100895803866, "grad_norm": 0.004523079376667738, "learning_rate": 0.05281591206395697, "loss": 0.2032, "num_input_tokens_seen": 25091376, "step": 28975 }, { "epoch": 13.663366336633663, "grad_norm": 0.0040833428502082825, "learning_rate": 0.05277104992401496, "loss": 0.1861, "num_input_tokens_seen": 25095424, "step": 28980 }, { "epoch": 13.665723715228665, "grad_norm": 0.005317801609635353, "learning_rate": 0.05272620277799884, "loss": 0.2445, "num_input_tokens_seen": 25099360, "step": 28985 }, { "epoch": 13.668081093823668, "grad_norm": 0.0036685646045953035, "learning_rate": 0.05268137063282473, "loss": 0.1817, "num_input_tokens_seen": 25103376, "step": 28990 }, { "epoch": 13.67043847241867, "grad_norm": 0.0037000251468271017, "learning_rate": 0.0526365534954062, "loss": 0.2213, "num_input_tokens_seen": 25107440, "step": 28995 }, { "epoch": 13.672795851013673, "grad_norm": 0.004412584938108921, "learning_rate": 0.052591751372654656, "loss": 0.1911, "num_input_tokens_seen": 25110864, "step": 29000 }, { "epoch": 13.672795851013673, "eval_loss": 0.21386408805847168, "eval_runtime": 21.9346, "eval_samples_per_second": 42.991, "eval_steps_per_second": 21.519, "num_input_tokens_seen": 25110864, "step": 29000 }, { "epoch": 13.675153229608675, "grad_norm": 0.006533700041472912, "learning_rate": 0.05254696427147921, "loss": 0.2067, "num_input_tokens_seen": 25115472, "step": 29005 }, { "epoch": 13.677510608203677, "grad_norm": 0.0022053380962461233, "learning_rate": 0.052502192198786546, "loss": 0.1909, "num_input_tokens_seen": 25119696, "step": 29010 }, { "epoch": 13.67986798679868, "grad_norm": 0.003370769554749131, "learning_rate": 0.05245743516148103, "loss": 0.1861, "num_input_tokens_seen": 25123824, "step": 29015 }, { "epoch": 13.682225365393682, "grad_norm": 0.002879176288843155, "learning_rate": 0.05241269316646486, "loss": 0.1729, "num_input_tokens_seen": 25127536, "step": 29020 }, { "epoch": 13.684582743988685, "grad_norm": 0.0028984853997826576, "learning_rate": 0.052367966220637725, "loss": 0.1849, "num_input_tokens_seen": 25131536, "step": 29025 }, { "epoch": 13.686940122583687, "grad_norm": 0.004233913496136665, "learning_rate": 0.05232325433089716, "loss": 0.1804, "num_input_tokens_seen": 25136128, "step": 29030 }, { "epoch": 13.68929750117869, "grad_norm": 0.00387250492349267, "learning_rate": 0.052278557504138214, "loss": 0.192, "num_input_tokens_seen": 25140768, "step": 29035 }, { "epoch": 13.691654879773692, "grad_norm": 0.002160158473998308, "learning_rate": 0.05223387574725372, "loss": 0.1724, "num_input_tokens_seen": 25144608, "step": 29040 }, { "epoch": 13.694012258368694, "grad_norm": 0.004393074195832014, "learning_rate": 0.05218920906713428, "loss": 0.1912, "num_input_tokens_seen": 25148688, "step": 29045 }, { "epoch": 13.696369636963697, "grad_norm": 0.005205902736634016, "learning_rate": 0.05214455747066789, "loss": 0.221, "num_input_tokens_seen": 25154336, "step": 29050 }, { "epoch": 13.698727015558699, "grad_norm": 0.004779312759637833, "learning_rate": 0.05209992096474048, "loss": 0.2348, "num_input_tokens_seen": 25159040, "step": 29055 }, { "epoch": 13.701084394153701, "grad_norm": 0.003442067187279463, "learning_rate": 0.05205529955623559, "loss": 0.21, "num_input_tokens_seen": 25162768, "step": 29060 }, { "epoch": 13.703441772748704, "grad_norm": 0.003546150168403983, "learning_rate": 0.052010693252034314, "loss": 0.2063, "num_input_tokens_seen": 25166560, "step": 29065 }, { "epoch": 13.705799151343706, "grad_norm": 0.0046943132765591145, "learning_rate": 0.0519661020590156, "loss": 0.1803, "num_input_tokens_seen": 25170272, "step": 29070 }, { "epoch": 13.708156529938709, "grad_norm": 0.004767566453665495, "learning_rate": 0.05192152598405586, "loss": 0.1957, "num_input_tokens_seen": 25174272, "step": 29075 }, { "epoch": 13.710513908533711, "grad_norm": 0.003599602961912751, "learning_rate": 0.05187696503402941, "loss": 0.171, "num_input_tokens_seen": 25178688, "step": 29080 }, { "epoch": 13.712871287128714, "grad_norm": 0.004343706648796797, "learning_rate": 0.05183241921580798, "loss": 0.2229, "num_input_tokens_seen": 25182752, "step": 29085 }, { "epoch": 13.715228665723716, "grad_norm": 0.003426008392125368, "learning_rate": 0.051787888536261206, "loss": 0.2249, "num_input_tokens_seen": 25187344, "step": 29090 }, { "epoch": 13.717586044318718, "grad_norm": 0.004933008924126625, "learning_rate": 0.051743373002256184, "loss": 0.2249, "num_input_tokens_seen": 25191920, "step": 29095 }, { "epoch": 13.71994342291372, "grad_norm": 0.004095638170838356, "learning_rate": 0.05169887262065787, "loss": 0.1591, "num_input_tokens_seen": 25196368, "step": 29100 }, { "epoch": 13.722300801508723, "grad_norm": 0.0018408370669931173, "learning_rate": 0.051654387398328665, "loss": 0.1712, "num_input_tokens_seen": 25201264, "step": 29105 }, { "epoch": 13.724658180103724, "grad_norm": 0.003776341676712036, "learning_rate": 0.05160991734212888, "loss": 0.1595, "num_input_tokens_seen": 25205952, "step": 29110 }, { "epoch": 13.727015558698728, "grad_norm": 0.005363414995372295, "learning_rate": 0.051565462458916224, "loss": 0.1743, "num_input_tokens_seen": 25210208, "step": 29115 }, { "epoch": 13.729372937293729, "grad_norm": 0.003551613073796034, "learning_rate": 0.05152102275554627, "loss": 0.1999, "num_input_tokens_seen": 25214720, "step": 29120 }, { "epoch": 13.731730315888731, "grad_norm": 0.003412707010284066, "learning_rate": 0.05147659823887222, "loss": 0.2016, "num_input_tokens_seen": 25219040, "step": 29125 }, { "epoch": 13.734087694483733, "grad_norm": 0.004067674279212952, "learning_rate": 0.05143218891574479, "loss": 0.2051, "num_input_tokens_seen": 25222992, "step": 29130 }, { "epoch": 13.736445073078736, "grad_norm": 0.004270528443157673, "learning_rate": 0.0513877947930125, "loss": 0.2326, "num_input_tokens_seen": 25227472, "step": 29135 }, { "epoch": 13.738802451673738, "grad_norm": 0.003574499860405922, "learning_rate": 0.051343415877521566, "loss": 0.1979, "num_input_tokens_seen": 25232400, "step": 29140 }, { "epoch": 13.74115983026874, "grad_norm": 0.0026290391106158495, "learning_rate": 0.051299052176115634, "loss": 0.1689, "num_input_tokens_seen": 25236192, "step": 29145 }, { "epoch": 13.743517208863743, "grad_norm": 0.004557745531201363, "learning_rate": 0.051254703695636256, "loss": 0.2433, "num_input_tokens_seen": 25240640, "step": 29150 }, { "epoch": 13.745874587458745, "grad_norm": 0.005136494059115648, "learning_rate": 0.05121037044292249, "loss": 0.2148, "num_input_tokens_seen": 25244752, "step": 29155 }, { "epoch": 13.748231966053748, "grad_norm": 0.0033568604849278927, "learning_rate": 0.05116605242481101, "loss": 0.2303, "num_input_tokens_seen": 25249632, "step": 29160 }, { "epoch": 13.75058934464875, "grad_norm": 0.004388198256492615, "learning_rate": 0.05112174964813634, "loss": 0.224, "num_input_tokens_seen": 25254384, "step": 29165 }, { "epoch": 13.752946723243753, "grad_norm": 0.0033751139417290688, "learning_rate": 0.05107746211973038, "loss": 0.17, "num_input_tokens_seen": 25258208, "step": 29170 }, { "epoch": 13.755304101838755, "grad_norm": 0.0032822161447256804, "learning_rate": 0.05103318984642291, "loss": 0.1786, "num_input_tokens_seen": 25261840, "step": 29175 }, { "epoch": 13.757661480433757, "grad_norm": 0.004384009633213282, "learning_rate": 0.05098893283504131, "loss": 0.2024, "num_input_tokens_seen": 25266096, "step": 29180 }, { "epoch": 13.76001885902876, "grad_norm": 0.0037242916878312826, "learning_rate": 0.050944691092410475, "loss": 0.2024, "num_input_tokens_seen": 25270720, "step": 29185 }, { "epoch": 13.762376237623762, "grad_norm": 0.003847974119707942, "learning_rate": 0.05090046462535313, "loss": 0.218, "num_input_tokens_seen": 25275344, "step": 29190 }, { "epoch": 13.764733616218765, "grad_norm": 0.005201333202421665, "learning_rate": 0.050856253440689454, "loss": 0.2303, "num_input_tokens_seen": 25279920, "step": 29195 }, { "epoch": 13.767090994813767, "grad_norm": 0.004798797890543938, "learning_rate": 0.050812057545237405, "loss": 0.2123, "num_input_tokens_seen": 25284944, "step": 29200 }, { "epoch": 13.767090994813767, "eval_loss": 0.2117646038532257, "eval_runtime": 21.9302, "eval_samples_per_second": 43.0, "eval_steps_per_second": 21.523, "num_input_tokens_seen": 25284944, "step": 29200 }, { "epoch": 13.76944837340877, "grad_norm": 0.0035623477306216955, "learning_rate": 0.0507678769458126, "loss": 0.1987, "num_input_tokens_seen": 25288480, "step": 29205 }, { "epoch": 13.771805752003772, "grad_norm": 0.003694303799420595, "learning_rate": 0.050723711649228155, "loss": 0.1778, "num_input_tokens_seen": 25292736, "step": 29210 }, { "epoch": 13.774163130598774, "grad_norm": 0.004534947220236063, "learning_rate": 0.05067956166229496, "loss": 0.2103, "num_input_tokens_seen": 25297280, "step": 29215 }, { "epoch": 13.776520509193777, "grad_norm": 0.002619493054226041, "learning_rate": 0.05063542699182155, "loss": 0.2182, "num_input_tokens_seen": 25302320, "step": 29220 }, { "epoch": 13.778877887788779, "grad_norm": 0.0028981624636799097, "learning_rate": 0.050591307644613996, "loss": 0.1687, "num_input_tokens_seen": 25306416, "step": 29225 }, { "epoch": 13.781235266383781, "grad_norm": 0.004888403695076704, "learning_rate": 0.05054720362747599, "loss": 0.2358, "num_input_tokens_seen": 25310768, "step": 29230 }, { "epoch": 13.783592644978784, "grad_norm": 0.004227655474096537, "learning_rate": 0.050503114947209035, "loss": 0.2017, "num_input_tokens_seen": 25315056, "step": 29235 }, { "epoch": 13.785950023573786, "grad_norm": 0.0024586431682109833, "learning_rate": 0.05045904161061207, "loss": 0.1586, "num_input_tokens_seen": 25319584, "step": 29240 }, { "epoch": 13.788307402168789, "grad_norm": 0.004200305789709091, "learning_rate": 0.05041498362448185, "loss": 0.2146, "num_input_tokens_seen": 25323376, "step": 29245 }, { "epoch": 13.790664780763791, "grad_norm": 0.004687074571847916, "learning_rate": 0.05037094099561256, "loss": 0.1756, "num_input_tokens_seen": 25328160, "step": 29250 }, { "epoch": 13.793022159358793, "grad_norm": 0.004202248528599739, "learning_rate": 0.05032691373079624, "loss": 0.243, "num_input_tokens_seen": 25332464, "step": 29255 }, { "epoch": 13.795379537953796, "grad_norm": 0.004029267933219671, "learning_rate": 0.05028290183682234, "loss": 0.1876, "num_input_tokens_seen": 25336896, "step": 29260 }, { "epoch": 13.797736916548798, "grad_norm": 0.004172188229858875, "learning_rate": 0.050238905320478096, "loss": 0.1879, "num_input_tokens_seen": 25341312, "step": 29265 }, { "epoch": 13.8000942951438, "grad_norm": 0.002621263498440385, "learning_rate": 0.05019492418854838, "loss": 0.1602, "num_input_tokens_seen": 25345968, "step": 29270 }, { "epoch": 13.802451673738803, "grad_norm": 0.013429324142634869, "learning_rate": 0.05015095844781554, "loss": 0.2404, "num_input_tokens_seen": 25349728, "step": 29275 }, { "epoch": 13.804809052333805, "grad_norm": 0.0051779006607830524, "learning_rate": 0.05010700810505968, "loss": 0.2411, "num_input_tokens_seen": 25354032, "step": 29280 }, { "epoch": 13.807166430928808, "grad_norm": 0.006728032138198614, "learning_rate": 0.05006307316705856, "loss": 0.2325, "num_input_tokens_seen": 25357792, "step": 29285 }, { "epoch": 13.80952380952381, "grad_norm": 0.004422380588948727, "learning_rate": 0.0500191536405874, "loss": 0.2063, "num_input_tokens_seen": 25362128, "step": 29290 }, { "epoch": 13.811881188118813, "grad_norm": 0.0037285154685378075, "learning_rate": 0.04997524953241922, "loss": 0.2111, "num_input_tokens_seen": 25365600, "step": 29295 }, { "epoch": 13.814238566713815, "grad_norm": 0.003240319201722741, "learning_rate": 0.049931360849324556, "loss": 0.2122, "num_input_tokens_seen": 25369952, "step": 29300 }, { "epoch": 13.816595945308817, "grad_norm": 0.00333136273548007, "learning_rate": 0.04988748759807155, "loss": 0.1483, "num_input_tokens_seen": 25374816, "step": 29305 }, { "epoch": 13.81895332390382, "grad_norm": 0.003069463651627302, "learning_rate": 0.0498436297854261, "loss": 0.1511, "num_input_tokens_seen": 25380176, "step": 29310 }, { "epoch": 13.82131070249882, "grad_norm": 0.006015172693878412, "learning_rate": 0.04979978741815152, "loss": 0.1996, "num_input_tokens_seen": 25383856, "step": 29315 }, { "epoch": 13.823668081093825, "grad_norm": 0.00577114662155509, "learning_rate": 0.04975596050300891, "loss": 0.2117, "num_input_tokens_seen": 25387312, "step": 29320 }, { "epoch": 13.826025459688825, "grad_norm": 0.005388208199292421, "learning_rate": 0.049712149046757005, "loss": 0.1892, "num_input_tokens_seen": 25392192, "step": 29325 }, { "epoch": 13.828382838283828, "grad_norm": 0.004310495685786009, "learning_rate": 0.04966835305615194, "loss": 0.2302, "num_input_tokens_seen": 25396240, "step": 29330 }, { "epoch": 13.83074021687883, "grad_norm": 0.0033561259042471647, "learning_rate": 0.049624572537947755, "loss": 0.1944, "num_input_tokens_seen": 25400768, "step": 29335 }, { "epoch": 13.833097595473832, "grad_norm": 0.004013275261968374, "learning_rate": 0.04958080749889582, "loss": 0.2007, "num_input_tokens_seen": 25404960, "step": 29340 }, { "epoch": 13.835454974068835, "grad_norm": 0.005120350979268551, "learning_rate": 0.049537057945745304, "loss": 0.2266, "num_input_tokens_seen": 25408768, "step": 29345 }, { "epoch": 13.837812352663837, "grad_norm": 0.005392777267843485, "learning_rate": 0.049493323885243, "loss": 0.2128, "num_input_tokens_seen": 25413712, "step": 29350 }, { "epoch": 13.84016973125884, "grad_norm": 0.004702049307525158, "learning_rate": 0.04944960532413318, "loss": 0.1986, "num_input_tokens_seen": 25417888, "step": 29355 }, { "epoch": 13.842527109853842, "grad_norm": 0.0027801289688795805, "learning_rate": 0.049405902269157774, "loss": 0.1904, "num_input_tokens_seen": 25422176, "step": 29360 }, { "epoch": 13.844884488448844, "grad_norm": 0.0043770126067101955, "learning_rate": 0.04936221472705646, "loss": 0.2521, "num_input_tokens_seen": 25427216, "step": 29365 }, { "epoch": 13.847241867043847, "grad_norm": 0.00483175553381443, "learning_rate": 0.04931854270456632, "loss": 0.1828, "num_input_tokens_seen": 25431376, "step": 29370 }, { "epoch": 13.84959924563885, "grad_norm": 0.004217939916998148, "learning_rate": 0.049274886208422075, "loss": 0.2154, "num_input_tokens_seen": 25435680, "step": 29375 }, { "epoch": 13.851956624233852, "grad_norm": 0.0030798183288425207, "learning_rate": 0.049231245245356235, "loss": 0.2105, "num_input_tokens_seen": 25439936, "step": 29380 }, { "epoch": 13.854314002828854, "grad_norm": 0.004276847466826439, "learning_rate": 0.049187619822098655, "loss": 0.1837, "num_input_tokens_seen": 25444720, "step": 29385 }, { "epoch": 13.856671381423856, "grad_norm": 0.0035279253497719765, "learning_rate": 0.04914400994537705, "loss": 0.2108, "num_input_tokens_seen": 25448480, "step": 29390 }, { "epoch": 13.859028760018859, "grad_norm": 0.003942518029361963, "learning_rate": 0.049100415621916485, "loss": 0.1876, "num_input_tokens_seen": 25452464, "step": 29395 }, { "epoch": 13.861386138613861, "grad_norm": 0.002274898113682866, "learning_rate": 0.04905683685843981, "loss": 0.159, "num_input_tokens_seen": 25456816, "step": 29400 }, { "epoch": 13.861386138613861, "eval_loss": 0.2113555669784546, "eval_runtime": 21.9227, "eval_samples_per_second": 43.015, "eval_steps_per_second": 21.53, "num_input_tokens_seen": 25456816, "step": 29400 }, { "epoch": 13.863743517208864, "grad_norm": 0.003278976771980524, "learning_rate": 0.049013273661667495, "loss": 0.1819, "num_input_tokens_seen": 25461280, "step": 29405 }, { "epoch": 13.866100895803866, "grad_norm": 0.006098782643675804, "learning_rate": 0.048969726038317396, "loss": 0.2239, "num_input_tokens_seen": 25465664, "step": 29410 }, { "epoch": 13.868458274398868, "grad_norm": 0.005837530363351107, "learning_rate": 0.048926193995105206, "loss": 0.2407, "num_input_tokens_seen": 25469504, "step": 29415 }, { "epoch": 13.87081565299387, "grad_norm": 0.003387365723028779, "learning_rate": 0.048882677538744035, "loss": 0.1793, "num_input_tokens_seen": 25473072, "step": 29420 }, { "epoch": 13.873173031588873, "grad_norm": 0.0037284009158611298, "learning_rate": 0.048839176675944715, "loss": 0.2205, "num_input_tokens_seen": 25476624, "step": 29425 }, { "epoch": 13.875530410183876, "grad_norm": 0.004622723441570997, "learning_rate": 0.04879569141341566, "loss": 0.2055, "num_input_tokens_seen": 25482160, "step": 29430 }, { "epoch": 13.877887788778878, "grad_norm": 0.003325487021356821, "learning_rate": 0.04875222175786274, "loss": 0.1792, "num_input_tokens_seen": 25487296, "step": 29435 }, { "epoch": 13.88024516737388, "grad_norm": 0.0018232128350064158, "learning_rate": 0.04870876771598966, "loss": 0.2057, "num_input_tokens_seen": 25491056, "step": 29440 }, { "epoch": 13.882602545968883, "grad_norm": 0.002652060240507126, "learning_rate": 0.04866532929449744, "loss": 0.1739, "num_input_tokens_seen": 25495520, "step": 29445 }, { "epoch": 13.884959924563885, "grad_norm": 0.0031212049070745707, "learning_rate": 0.048621906500084945, "loss": 0.216, "num_input_tokens_seen": 25500352, "step": 29450 }, { "epoch": 13.887317303158888, "grad_norm": 0.0034292039927095175, "learning_rate": 0.04857849933944845, "loss": 0.1613, "num_input_tokens_seen": 25505200, "step": 29455 }, { "epoch": 13.88967468175389, "grad_norm": 0.005079376976937056, "learning_rate": 0.048535107819281866, "loss": 0.2591, "num_input_tokens_seen": 25509552, "step": 29460 }, { "epoch": 13.892032060348892, "grad_norm": 0.003466268302872777, "learning_rate": 0.04849173194627675, "loss": 0.1917, "num_input_tokens_seen": 25513840, "step": 29465 }, { "epoch": 13.894389438943895, "grad_norm": 0.002527260221540928, "learning_rate": 0.04844837172712223, "loss": 0.187, "num_input_tokens_seen": 25518016, "step": 29470 }, { "epoch": 13.896746817538897, "grad_norm": 0.0036825696006417274, "learning_rate": 0.04840502716850494, "loss": 0.181, "num_input_tokens_seen": 25522688, "step": 29475 }, { "epoch": 13.8991041961339, "grad_norm": 0.003721988992765546, "learning_rate": 0.04836169827710916, "loss": 0.1906, "num_input_tokens_seen": 25528400, "step": 29480 }, { "epoch": 13.901461574728902, "grad_norm": 0.004758345428854227, "learning_rate": 0.04831838505961684, "loss": 0.192, "num_input_tokens_seen": 25532496, "step": 29485 }, { "epoch": 13.903818953323904, "grad_norm": 0.003015707479789853, "learning_rate": 0.048275087522707295, "loss": 0.1698, "num_input_tokens_seen": 25536736, "step": 29490 }, { "epoch": 13.906176331918907, "grad_norm": 0.003071174258366227, "learning_rate": 0.04823180567305766, "loss": 0.1944, "num_input_tokens_seen": 25540912, "step": 29495 }, { "epoch": 13.90853371051391, "grad_norm": 0.004068693146109581, "learning_rate": 0.04818853951734244, "loss": 0.1948, "num_input_tokens_seen": 25544768, "step": 29500 }, { "epoch": 13.910891089108912, "grad_norm": 0.0025439232122153044, "learning_rate": 0.04814528906223387, "loss": 0.201, "num_input_tokens_seen": 25549200, "step": 29505 }, { "epoch": 13.913248467703912, "grad_norm": 0.0048871999606490135, "learning_rate": 0.04810205431440177, "loss": 0.1717, "num_input_tokens_seen": 25552816, "step": 29510 }, { "epoch": 13.915605846298917, "grad_norm": 0.0030690126586705446, "learning_rate": 0.04805883528051341, "loss": 0.1776, "num_input_tokens_seen": 25557744, "step": 29515 }, { "epoch": 13.917963224893917, "grad_norm": 0.004626222420483828, "learning_rate": 0.048015631967233685, "loss": 0.1628, "num_input_tokens_seen": 25561344, "step": 29520 }, { "epoch": 13.92032060348892, "grad_norm": 0.002297256840392947, "learning_rate": 0.04797244438122517, "loss": 0.1723, "num_input_tokens_seen": 25566496, "step": 29525 }, { "epoch": 13.922677982083922, "grad_norm": 0.004911480471491814, "learning_rate": 0.04792927252914784, "loss": 0.1994, "num_input_tokens_seen": 25570800, "step": 29530 }, { "epoch": 13.925035360678924, "grad_norm": 0.00404330063611269, "learning_rate": 0.04788611641765944, "loss": 0.2078, "num_input_tokens_seen": 25574816, "step": 29535 }, { "epoch": 13.927392739273927, "grad_norm": 0.0038285688497126102, "learning_rate": 0.04784297605341508, "loss": 0.1808, "num_input_tokens_seen": 25579936, "step": 29540 }, { "epoch": 13.92975011786893, "grad_norm": 0.005920145660638809, "learning_rate": 0.04779985144306761, "loss": 0.2246, "num_input_tokens_seen": 25584032, "step": 29545 }, { "epoch": 13.932107496463932, "grad_norm": 0.0023077167570590973, "learning_rate": 0.047756742593267405, "loss": 0.1561, "num_input_tokens_seen": 25588160, "step": 29550 }, { "epoch": 13.934464875058934, "grad_norm": 0.004504472482949495, "learning_rate": 0.047713649510662315, "loss": 0.2124, "num_input_tokens_seen": 25592272, "step": 29555 }, { "epoch": 13.936822253653936, "grad_norm": 0.004626700188964605, "learning_rate": 0.04767057220189789, "loss": 0.1976, "num_input_tokens_seen": 25596544, "step": 29560 }, { "epoch": 13.939179632248939, "grad_norm": 0.004087931476533413, "learning_rate": 0.04762751067361722, "loss": 0.1797, "num_input_tokens_seen": 25600512, "step": 29565 }, { "epoch": 13.941537010843941, "grad_norm": 0.0039518242701888084, "learning_rate": 0.04758446493246086, "loss": 0.1348, "num_input_tokens_seen": 25604240, "step": 29570 }, { "epoch": 13.943894389438944, "grad_norm": 0.0029246718622744083, "learning_rate": 0.047541434985067084, "loss": 0.1359, "num_input_tokens_seen": 25609344, "step": 29575 }, { "epoch": 13.946251768033946, "grad_norm": 0.005112757440656424, "learning_rate": 0.047498420838071556, "loss": 0.1648, "num_input_tokens_seen": 25613728, "step": 29580 }, { "epoch": 13.948609146628948, "grad_norm": 0.00202812347561121, "learning_rate": 0.04745542249810772, "loss": 0.1811, "num_input_tokens_seen": 25618240, "step": 29585 }, { "epoch": 13.95096652522395, "grad_norm": 0.003631796222180128, "learning_rate": 0.047412439971806324, "loss": 0.1674, "num_input_tokens_seen": 25622240, "step": 29590 }, { "epoch": 13.953323903818953, "grad_norm": 0.003187691094353795, "learning_rate": 0.04736947326579592, "loss": 0.192, "num_input_tokens_seen": 25627232, "step": 29595 }, { "epoch": 13.955681282413956, "grad_norm": 0.006024655885994434, "learning_rate": 0.04732652238670245, "loss": 0.1791, "num_input_tokens_seen": 25631728, "step": 29600 }, { "epoch": 13.955681282413956, "eval_loss": 0.2154432088136673, "eval_runtime": 21.8926, "eval_samples_per_second": 43.074, "eval_steps_per_second": 21.56, "num_input_tokens_seen": 25631728, "step": 29600 }, { "epoch": 13.958038661008958, "grad_norm": 0.004916062578558922, "learning_rate": 0.04728358734114952, "loss": 0.179, "num_input_tokens_seen": 25635200, "step": 29605 }, { "epoch": 13.96039603960396, "grad_norm": 0.004516012966632843, "learning_rate": 0.04724066813575821, "loss": 0.1609, "num_input_tokens_seen": 25639456, "step": 29610 }, { "epoch": 13.962753418198963, "grad_norm": 0.0034733896609395742, "learning_rate": 0.04719776477714729, "loss": 0.2103, "num_input_tokens_seen": 25643824, "step": 29615 }, { "epoch": 13.965110796793965, "grad_norm": 0.004846430383622646, "learning_rate": 0.047154877271932856, "loss": 0.2005, "num_input_tokens_seen": 25648688, "step": 29620 }, { "epoch": 13.967468175388968, "grad_norm": 0.005264287814497948, "learning_rate": 0.0471120056267288, "loss": 0.1913, "num_input_tokens_seen": 25652464, "step": 29625 }, { "epoch": 13.96982555398397, "grad_norm": 0.005964715965092182, "learning_rate": 0.047069149848146495, "loss": 0.2207, "num_input_tokens_seen": 25657248, "step": 29630 }, { "epoch": 13.972182932578972, "grad_norm": 0.006545036565512419, "learning_rate": 0.04702630994279473, "loss": 0.1589, "num_input_tokens_seen": 25661472, "step": 29635 }, { "epoch": 13.974540311173975, "grad_norm": 0.004161347169429064, "learning_rate": 0.046983485917280035, "loss": 0.2047, "num_input_tokens_seen": 25665584, "step": 29640 }, { "epoch": 13.976897689768977, "grad_norm": 0.005011442583054304, "learning_rate": 0.04694067777820644, "loss": 0.1542, "num_input_tokens_seen": 25669536, "step": 29645 }, { "epoch": 13.97925506836398, "grad_norm": 0.0025283421855419874, "learning_rate": 0.046897885532175415, "loss": 0.1339, "num_input_tokens_seen": 25673584, "step": 29650 }, { "epoch": 13.981612446958982, "grad_norm": 0.004765721503645182, "learning_rate": 0.04685510918578613, "loss": 0.1873, "num_input_tokens_seen": 25678592, "step": 29655 }, { "epoch": 13.983969825553984, "grad_norm": 0.005070932675153017, "learning_rate": 0.04681234874563519, "loss": 0.1632, "num_input_tokens_seen": 25682640, "step": 29660 }, { "epoch": 13.986327204148987, "grad_norm": 0.003561283927410841, "learning_rate": 0.046769604218316836, "loss": 0.1579, "num_input_tokens_seen": 25686704, "step": 29665 }, { "epoch": 13.98868458274399, "grad_norm": 0.0030871948692947626, "learning_rate": 0.04672687561042279, "loss": 0.1538, "num_input_tokens_seen": 25690832, "step": 29670 }, { "epoch": 13.991041961338992, "grad_norm": 0.004767861682921648, "learning_rate": 0.046684162928542286, "loss": 0.2175, "num_input_tokens_seen": 25694864, "step": 29675 }, { "epoch": 13.993399339933994, "grad_norm": 0.0059538334608078, "learning_rate": 0.04664146617926222, "loss": 0.2235, "num_input_tokens_seen": 25699120, "step": 29680 }, { "epoch": 13.995756718528996, "grad_norm": 0.004485993180423975, "learning_rate": 0.046598785369167, "loss": 0.2127, "num_input_tokens_seen": 25704080, "step": 29685 }, { "epoch": 13.998114097123999, "grad_norm": 0.0056234849616885185, "learning_rate": 0.046556120504838434, "loss": 0.1866, "num_input_tokens_seen": 25708384, "step": 29690 }, { "epoch": 14.000471475719001, "grad_norm": 0.0055489493533968925, "learning_rate": 0.04651347159285609, "loss": 0.183, "num_input_tokens_seen": 25712288, "step": 29695 }, { "epoch": 14.002828854314004, "grad_norm": 0.005752101074904203, "learning_rate": 0.04647083863979688, "loss": 0.2187, "num_input_tokens_seen": 25716256, "step": 29700 }, { "epoch": 14.005186232909006, "grad_norm": 0.005097091663628817, "learning_rate": 0.04642822165223538, "loss": 0.1802, "num_input_tokens_seen": 25720176, "step": 29705 }, { "epoch": 14.007543611504008, "grad_norm": 0.003930123522877693, "learning_rate": 0.046385620636743716, "loss": 0.1534, "num_input_tokens_seen": 25724432, "step": 29710 }, { "epoch": 14.009900990099009, "grad_norm": 0.0047265589237213135, "learning_rate": 0.04634303559989141, "loss": 0.1617, "num_input_tokens_seen": 25729664, "step": 29715 }, { "epoch": 14.012258368694011, "grad_norm": 0.004092021379619837, "learning_rate": 0.046300466548245635, "loss": 0.1692, "num_input_tokens_seen": 25734256, "step": 29720 }, { "epoch": 14.014615747289014, "grad_norm": 0.005378470290452242, "learning_rate": 0.04625791348837114, "loss": 0.1481, "num_input_tokens_seen": 25738176, "step": 29725 }, { "epoch": 14.016973125884016, "grad_norm": 0.003937562461942434, "learning_rate": 0.046215376426830095, "loss": 0.1804, "num_input_tokens_seen": 25741680, "step": 29730 }, { "epoch": 14.019330504479019, "grad_norm": 0.007620553951710463, "learning_rate": 0.04617285537018219, "loss": 0.1974, "num_input_tokens_seen": 25745440, "step": 29735 }, { "epoch": 14.021687883074021, "grad_norm": 0.00842153187841177, "learning_rate": 0.046130350324984803, "loss": 0.2265, "num_input_tokens_seen": 25750128, "step": 29740 }, { "epoch": 14.024045261669023, "grad_norm": 0.0057876682840287685, "learning_rate": 0.046087861297792666, "loss": 0.1647, "num_input_tokens_seen": 25755088, "step": 29745 }, { "epoch": 14.026402640264026, "grad_norm": 0.004547099117189646, "learning_rate": 0.0460453882951582, "loss": 0.1448, "num_input_tokens_seen": 25759072, "step": 29750 }, { "epoch": 14.028760018859028, "grad_norm": 0.004431622568517923, "learning_rate": 0.04600293132363119, "loss": 0.1455, "num_input_tokens_seen": 25763312, "step": 29755 }, { "epoch": 14.03111739745403, "grad_norm": 0.0062558613717556, "learning_rate": 0.045960490389759086, "loss": 0.1928, "num_input_tokens_seen": 25767984, "step": 29760 }, { "epoch": 14.033474776049033, "grad_norm": 0.0032056611962616444, "learning_rate": 0.04591806550008685, "loss": 0.1589, "num_input_tokens_seen": 25771824, "step": 29765 }, { "epoch": 14.035832154644035, "grad_norm": 0.004509766586124897, "learning_rate": 0.045875656661156825, "loss": 0.2244, "num_input_tokens_seen": 25776048, "step": 29770 }, { "epoch": 14.038189533239038, "grad_norm": 0.006259105633944273, "learning_rate": 0.04583326387950911, "loss": 0.1994, "num_input_tokens_seen": 25780816, "step": 29775 }, { "epoch": 14.04054691183404, "grad_norm": 0.004545127507299185, "learning_rate": 0.0457908871616811, "loss": 0.1499, "num_input_tokens_seen": 25784992, "step": 29780 }, { "epoch": 14.042904290429043, "grad_norm": 0.005558746866881847, "learning_rate": 0.04574852651420786, "loss": 0.2006, "num_input_tokens_seen": 25788352, "step": 29785 }, { "epoch": 14.045261669024045, "grad_norm": 0.007731474470347166, "learning_rate": 0.045706181943621985, "loss": 0.1529, "num_input_tokens_seen": 25792400, "step": 29790 }, { "epoch": 14.047619047619047, "grad_norm": 0.007570208050310612, "learning_rate": 0.04566385345645344, "loss": 0.17, "num_input_tokens_seen": 25796800, "step": 29795 }, { "epoch": 14.04997642621405, "grad_norm": 0.004104857798665762, "learning_rate": 0.04562154105922993, "loss": 0.1437, "num_input_tokens_seen": 25801056, "step": 29800 }, { "epoch": 14.04997642621405, "eval_loss": 0.21508099138736725, "eval_runtime": 21.9567, "eval_samples_per_second": 42.948, "eval_steps_per_second": 21.497, "num_input_tokens_seen": 25801056, "step": 29800 }, { "epoch": 14.052333804809052, "grad_norm": 0.0056895907036960125, "learning_rate": 0.04557924475847642, "loss": 0.129, "num_input_tokens_seen": 25805520, "step": 29805 }, { "epoch": 14.054691183404055, "grad_norm": 0.004744086880236864, "learning_rate": 0.04553696456071567, "loss": 0.2101, "num_input_tokens_seen": 25810032, "step": 29810 }, { "epoch": 14.057048561999057, "grad_norm": 0.007415502332150936, "learning_rate": 0.045494700472467724, "loss": 0.1935, "num_input_tokens_seen": 25814080, "step": 29815 }, { "epoch": 14.05940594059406, "grad_norm": 0.008217088878154755, "learning_rate": 0.04545245250025024, "loss": 0.192, "num_input_tokens_seen": 25818144, "step": 29820 }, { "epoch": 14.061763319189062, "grad_norm": 0.004311414435505867, "learning_rate": 0.045410220650578384, "loss": 0.1486, "num_input_tokens_seen": 25822496, "step": 29825 }, { "epoch": 14.064120697784064, "grad_norm": 0.0026819631457328796, "learning_rate": 0.04536800492996492, "loss": 0.1126, "num_input_tokens_seen": 25826768, "step": 29830 }, { "epoch": 14.066478076379067, "grad_norm": 0.005270141176879406, "learning_rate": 0.04532580534491994, "loss": 0.1503, "num_input_tokens_seen": 25831408, "step": 29835 }, { "epoch": 14.068835454974069, "grad_norm": 0.00592161575332284, "learning_rate": 0.045283621901951183, "loss": 0.1774, "num_input_tokens_seen": 25835744, "step": 29840 }, { "epoch": 14.071192833569071, "grad_norm": 0.005093283019959927, "learning_rate": 0.04524145460756393, "loss": 0.1582, "num_input_tokens_seen": 25840144, "step": 29845 }, { "epoch": 14.073550212164074, "grad_norm": 0.005124999210238457, "learning_rate": 0.045199303468260794, "loss": 0.1714, "num_input_tokens_seen": 25844736, "step": 29850 }, { "epoch": 14.075907590759076, "grad_norm": 0.00470344303175807, "learning_rate": 0.04515716849054214, "loss": 0.1079, "num_input_tokens_seen": 25849056, "step": 29855 }, { "epoch": 14.078264969354079, "grad_norm": 0.006090222857892513, "learning_rate": 0.04511504968090558, "loss": 0.1282, "num_input_tokens_seen": 25852992, "step": 29860 }, { "epoch": 14.080622347949081, "grad_norm": 0.005547447130084038, "learning_rate": 0.04507294704584644, "loss": 0.1802, "num_input_tokens_seen": 25856880, "step": 29865 }, { "epoch": 14.082979726544083, "grad_norm": 0.00720406835898757, "learning_rate": 0.04503086059185749, "loss": 0.2297, "num_input_tokens_seen": 25860864, "step": 29870 }, { "epoch": 14.085337105139086, "grad_norm": 0.009011568501591682, "learning_rate": 0.04498879032542893, "loss": 0.1842, "num_input_tokens_seen": 25865616, "step": 29875 }, { "epoch": 14.087694483734088, "grad_norm": 0.0037466883659362793, "learning_rate": 0.0449467362530486, "loss": 0.1521, "num_input_tokens_seen": 25869520, "step": 29880 }, { "epoch": 14.09005186232909, "grad_norm": 0.007506763096898794, "learning_rate": 0.04490469838120171, "loss": 0.2256, "num_input_tokens_seen": 25873776, "step": 29885 }, { "epoch": 14.092409240924093, "grad_norm": 0.004818316549062729, "learning_rate": 0.04486267671637101, "loss": 0.1382, "num_input_tokens_seen": 25878528, "step": 29890 }, { "epoch": 14.094766619519095, "grad_norm": 0.00964922271668911, "learning_rate": 0.04482067126503683, "loss": 0.1883, "num_input_tokens_seen": 25883376, "step": 29895 }, { "epoch": 14.097123998114098, "grad_norm": 0.00368486437946558, "learning_rate": 0.04477868203367687, "loss": 0.1315, "num_input_tokens_seen": 25887552, "step": 29900 }, { "epoch": 14.0994813767091, "grad_norm": 0.00603363336995244, "learning_rate": 0.044736709028766426, "loss": 0.1542, "num_input_tokens_seen": 25891664, "step": 29905 }, { "epoch": 14.101838755304103, "grad_norm": 0.011149140074849129, "learning_rate": 0.04469475225677832, "loss": 0.153, "num_input_tokens_seen": 25895584, "step": 29910 }, { "epoch": 14.104196133899103, "grad_norm": 0.00668503949418664, "learning_rate": 0.04465281172418273, "loss": 0.16, "num_input_tokens_seen": 25899584, "step": 29915 }, { "epoch": 14.106553512494106, "grad_norm": 0.004469035658985376, "learning_rate": 0.044610887437447476, "loss": 0.1903, "num_input_tokens_seen": 25904576, "step": 29920 }, { "epoch": 14.108910891089108, "grad_norm": 0.0034630061127245426, "learning_rate": 0.044568979403037744, "loss": 0.1406, "num_input_tokens_seen": 25909168, "step": 29925 }, { "epoch": 14.11126826968411, "grad_norm": 0.006076529622077942, "learning_rate": 0.04452708762741631, "loss": 0.2503, "num_input_tokens_seen": 25914672, "step": 29930 }, { "epoch": 14.113625648279113, "grad_norm": 0.013170834630727768, "learning_rate": 0.044485212117043475, "loss": 0.2124, "num_input_tokens_seen": 25918960, "step": 29935 }, { "epoch": 14.115983026874115, "grad_norm": 0.004110877402126789, "learning_rate": 0.04444335287837687, "loss": 0.1282, "num_input_tokens_seen": 25923808, "step": 29940 }, { "epoch": 14.118340405469118, "grad_norm": 0.007103634998202324, "learning_rate": 0.04440150991787179, "loss": 0.1413, "num_input_tokens_seen": 25928528, "step": 29945 }, { "epoch": 14.12069778406412, "grad_norm": 0.004805678967386484, "learning_rate": 0.04435968324198088, "loss": 0.161, "num_input_tokens_seen": 25933408, "step": 29950 }, { "epoch": 14.123055162659123, "grad_norm": 0.010558979585766792, "learning_rate": 0.04431787285715442, "loss": 0.1442, "num_input_tokens_seen": 25938464, "step": 29955 }, { "epoch": 14.125412541254125, "grad_norm": 0.007052351254969835, "learning_rate": 0.04427607876984004, "loss": 0.2052, "num_input_tokens_seen": 25943168, "step": 29960 }, { "epoch": 14.127769919849127, "grad_norm": 0.007279901299625635, "learning_rate": 0.044234300986482886, "loss": 0.1669, "num_input_tokens_seen": 25947872, "step": 29965 }, { "epoch": 14.13012729844413, "grad_norm": 0.004778577946126461, "learning_rate": 0.04419253951352566, "loss": 0.1448, "num_input_tokens_seen": 25952608, "step": 29970 }, { "epoch": 14.132484677039132, "grad_norm": 0.006930164061486721, "learning_rate": 0.044150794357408533, "loss": 0.1422, "num_input_tokens_seen": 25956704, "step": 29975 }, { "epoch": 14.134842055634135, "grad_norm": 0.002455734880641103, "learning_rate": 0.044109065524569065, "loss": 0.1383, "num_input_tokens_seen": 25962080, "step": 29980 }, { "epoch": 14.137199434229137, "grad_norm": 0.007291420828551054, "learning_rate": 0.0440673530214424, "loss": 0.2469, "num_input_tokens_seen": 25966352, "step": 29985 }, { "epoch": 14.13955681282414, "grad_norm": 0.009213839657604694, "learning_rate": 0.04402565685446117, "loss": 0.2125, "num_input_tokens_seen": 25970224, "step": 29990 }, { "epoch": 14.141914191419142, "grad_norm": 0.00779935996979475, "learning_rate": 0.04398397703005536, "loss": 0.1142, "num_input_tokens_seen": 25973968, "step": 29995 }, { "epoch": 14.144271570014144, "grad_norm": 0.008030143566429615, "learning_rate": 0.043942313554652626, "loss": 0.2108, "num_input_tokens_seen": 25978896, "step": 30000 }, { "epoch": 14.144271570014144, "eval_loss": 0.2241128385066986, "eval_runtime": 21.9301, "eval_samples_per_second": 43.0, "eval_steps_per_second": 21.523, "num_input_tokens_seen": 25978896, "step": 30000 }, { "epoch": 14.146628948609147, "grad_norm": 0.006169900763779879, "learning_rate": 0.0439006664346779, "loss": 0.1378, "num_input_tokens_seen": 25983040, "step": 30005 }, { "epoch": 14.148986327204149, "grad_norm": 0.0065635498613119125, "learning_rate": 0.043859035676553755, "loss": 0.1813, "num_input_tokens_seen": 25987488, "step": 30010 }, { "epoch": 14.151343705799151, "grad_norm": 0.0036043680738657713, "learning_rate": 0.043817421286700194, "loss": 0.1615, "num_input_tokens_seen": 25992016, "step": 30015 }, { "epoch": 14.153701084394154, "grad_norm": 0.003938963636755943, "learning_rate": 0.043775823271534585, "loss": 0.1707, "num_input_tokens_seen": 25996160, "step": 30020 }, { "epoch": 14.156058462989156, "grad_norm": 0.004694386385381222, "learning_rate": 0.04373424163747197, "loss": 0.186, "num_input_tokens_seen": 26000816, "step": 30025 }, { "epoch": 14.158415841584159, "grad_norm": 0.008192849345505238, "learning_rate": 0.04369267639092473, "loss": 0.1356, "num_input_tokens_seen": 26005232, "step": 30030 }, { "epoch": 14.160773220179161, "grad_norm": 0.007724672090262175, "learning_rate": 0.04365112753830268, "loss": 0.2168, "num_input_tokens_seen": 26009088, "step": 30035 }, { "epoch": 14.163130598774163, "grad_norm": 0.007274783682078123, "learning_rate": 0.04360959508601327, "loss": 0.1388, "num_input_tokens_seen": 26013232, "step": 30040 }, { "epoch": 14.165487977369166, "grad_norm": 0.014119734056293964, "learning_rate": 0.04356807904046123, "loss": 0.1811, "num_input_tokens_seen": 26016976, "step": 30045 }, { "epoch": 14.167845355964168, "grad_norm": 0.004459151532500982, "learning_rate": 0.04352657940804892, "loss": 0.2109, "num_input_tokens_seen": 26021872, "step": 30050 }, { "epoch": 14.17020273455917, "grad_norm": 0.0042907241731882095, "learning_rate": 0.04348509619517613, "loss": 0.1266, "num_input_tokens_seen": 26026352, "step": 30055 }, { "epoch": 14.172560113154173, "grad_norm": 0.012276563793420792, "learning_rate": 0.04344362940824002, "loss": 0.242, "num_input_tokens_seen": 26031728, "step": 30060 }, { "epoch": 14.174917491749175, "grad_norm": 0.006126211956143379, "learning_rate": 0.04340217905363533, "loss": 0.1632, "num_input_tokens_seen": 26035840, "step": 30065 }, { "epoch": 14.177274870344178, "grad_norm": 0.0074376813136041164, "learning_rate": 0.04336074513775425, "loss": 0.1767, "num_input_tokens_seen": 26039360, "step": 30070 }, { "epoch": 14.17963224893918, "grad_norm": 0.0062879156321287155, "learning_rate": 0.04331932766698636, "loss": 0.1471, "num_input_tokens_seen": 26043696, "step": 30075 }, { "epoch": 14.181989627534183, "grad_norm": 0.005722375586628914, "learning_rate": 0.0432779266477188, "loss": 0.1465, "num_input_tokens_seen": 26047680, "step": 30080 }, { "epoch": 14.184347006129185, "grad_norm": 0.0045995512045919895, "learning_rate": 0.04323654208633607, "loss": 0.153, "num_input_tokens_seen": 26052208, "step": 30085 }, { "epoch": 14.186704384724187, "grad_norm": 0.007991827093064785, "learning_rate": 0.04319517398922024, "loss": 0.122, "num_input_tokens_seen": 26057408, "step": 30090 }, { "epoch": 14.18906176331919, "grad_norm": 0.0077020348981022835, "learning_rate": 0.04315382236275079, "loss": 0.1831, "num_input_tokens_seen": 26062224, "step": 30095 }, { "epoch": 14.191419141914192, "grad_norm": 0.0075521995313465595, "learning_rate": 0.043112487213304664, "loss": 0.2506, "num_input_tokens_seen": 26067664, "step": 30100 }, { "epoch": 14.193776520509195, "grad_norm": 0.010488913394510746, "learning_rate": 0.04307116854725618, "loss": 0.1798, "num_input_tokens_seen": 26071520, "step": 30105 }, { "epoch": 14.196133899104197, "grad_norm": 0.005848318804055452, "learning_rate": 0.043029866370977325, "loss": 0.1824, "num_input_tokens_seen": 26076064, "step": 30110 }, { "epoch": 14.198491277699198, "grad_norm": 0.0107017382979393, "learning_rate": 0.04298858069083728, "loss": 0.1961, "num_input_tokens_seen": 26080656, "step": 30115 }, { "epoch": 14.2008486562942, "grad_norm": 0.007990701124072075, "learning_rate": 0.04294731151320295, "loss": 0.1797, "num_input_tokens_seen": 26085328, "step": 30120 }, { "epoch": 14.203206034889202, "grad_norm": 0.008625784888863564, "learning_rate": 0.04290605884443841, "loss": 0.1824, "num_input_tokens_seen": 26089568, "step": 30125 }, { "epoch": 14.205563413484205, "grad_norm": 0.006376832723617554, "learning_rate": 0.04286482269090545, "loss": 0.1769, "num_input_tokens_seen": 26094128, "step": 30130 }, { "epoch": 14.207920792079207, "grad_norm": 0.004033069591969252, "learning_rate": 0.04282360305896323, "loss": 0.1243, "num_input_tokens_seen": 26098976, "step": 30135 }, { "epoch": 14.21027817067421, "grad_norm": 0.0039036825764924288, "learning_rate": 0.04278239995496822, "loss": 0.1649, "num_input_tokens_seen": 26103744, "step": 30140 }, { "epoch": 14.212635549269212, "grad_norm": 0.008768069557845592, "learning_rate": 0.042741213385274514, "loss": 0.1505, "num_input_tokens_seen": 26107776, "step": 30145 }, { "epoch": 14.214992927864214, "grad_norm": 0.00647443812340498, "learning_rate": 0.04270004335623366, "loss": 0.1826, "num_input_tokens_seen": 26112032, "step": 30150 }, { "epoch": 14.217350306459217, "grad_norm": 0.007099487353116274, "learning_rate": 0.04265888987419448, "loss": 0.2343, "num_input_tokens_seen": 26116384, "step": 30155 }, { "epoch": 14.21970768505422, "grad_norm": 0.007134127430617809, "learning_rate": 0.04261775294550346, "loss": 0.1467, "num_input_tokens_seen": 26120640, "step": 30160 }, { "epoch": 14.222065063649222, "grad_norm": 0.003391087520867586, "learning_rate": 0.042576632576504354, "loss": 0.1374, "num_input_tokens_seen": 26125216, "step": 30165 }, { "epoch": 14.224422442244224, "grad_norm": 0.004971641581505537, "learning_rate": 0.0425355287735385, "loss": 0.2146, "num_input_tokens_seen": 26129376, "step": 30170 }, { "epoch": 14.226779820839226, "grad_norm": 0.006574847269803286, "learning_rate": 0.0424944415429446, "loss": 0.1633, "num_input_tokens_seen": 26133616, "step": 30175 }, { "epoch": 14.229137199434229, "grad_norm": 0.007146715652197599, "learning_rate": 0.04245337089105877, "loss": 0.1989, "num_input_tokens_seen": 26137392, "step": 30180 }, { "epoch": 14.231494578029231, "grad_norm": 0.005999289453029633, "learning_rate": 0.04241231682421467, "loss": 0.1603, "num_input_tokens_seen": 26141616, "step": 30185 }, { "epoch": 14.233851956624234, "grad_norm": 0.007023707497864962, "learning_rate": 0.04237127934874337, "loss": 0.1752, "num_input_tokens_seen": 26146592, "step": 30190 }, { "epoch": 14.236209335219236, "grad_norm": 0.0046423571184277534, "learning_rate": 0.042330258470973305, "loss": 0.1473, "num_input_tokens_seen": 26150960, "step": 30195 }, { "epoch": 14.238566713814238, "grad_norm": 0.005026846192777157, "learning_rate": 0.042289254197230515, "loss": 0.192, "num_input_tokens_seen": 26156672, "step": 30200 }, { "epoch": 14.238566713814238, "eval_loss": 0.22202041745185852, "eval_runtime": 21.8938, "eval_samples_per_second": 43.072, "eval_steps_per_second": 21.559, "num_input_tokens_seen": 26156672, "step": 30200 }, { "epoch": 14.24092409240924, "grad_norm": 0.007946528494358063, "learning_rate": 0.04224826653383823, "loss": 0.2082, "num_input_tokens_seen": 26161312, "step": 30205 }, { "epoch": 14.243281471004243, "grad_norm": 0.0075263893231749535, "learning_rate": 0.04220729548711735, "loss": 0.1782, "num_input_tokens_seen": 26165568, "step": 30210 }, { "epoch": 14.245638849599246, "grad_norm": 0.002777648624032736, "learning_rate": 0.04216634106338616, "loss": 0.1293, "num_input_tokens_seen": 26169584, "step": 30215 }, { "epoch": 14.247996228194248, "grad_norm": 0.004664845298975706, "learning_rate": 0.04212540326896025, "loss": 0.1864, "num_input_tokens_seen": 26174208, "step": 30220 }, { "epoch": 14.25035360678925, "grad_norm": 0.004095825832337141, "learning_rate": 0.0420844821101528, "loss": 0.1872, "num_input_tokens_seen": 26179088, "step": 30225 }, { "epoch": 14.252710985384253, "grad_norm": 0.00579098192974925, "learning_rate": 0.04204357759327441, "loss": 0.2115, "num_input_tokens_seen": 26182768, "step": 30230 }, { "epoch": 14.255068363979255, "grad_norm": 0.005154747981578112, "learning_rate": 0.042002689724632954, "loss": 0.1998, "num_input_tokens_seen": 26186880, "step": 30235 }, { "epoch": 14.257425742574258, "grad_norm": 0.005254998337477446, "learning_rate": 0.04196181851053398, "loss": 0.1485, "num_input_tokens_seen": 26191472, "step": 30240 }, { "epoch": 14.25978312116926, "grad_norm": 0.004900936037302017, "learning_rate": 0.041920963957280295, "loss": 0.1421, "num_input_tokens_seen": 26197504, "step": 30245 }, { "epoch": 14.262140499764262, "grad_norm": 0.006543243769556284, "learning_rate": 0.04188012607117212, "loss": 0.2116, "num_input_tokens_seen": 26201056, "step": 30250 }, { "epoch": 14.264497878359265, "grad_norm": 0.011217565275728703, "learning_rate": 0.04183930485850725, "loss": 0.2502, "num_input_tokens_seen": 26205952, "step": 30255 }, { "epoch": 14.266855256954267, "grad_norm": 0.004521931055933237, "learning_rate": 0.04179850032558078, "loss": 0.1897, "num_input_tokens_seen": 26209904, "step": 30260 }, { "epoch": 14.26921263554927, "grad_norm": 0.005287121050059795, "learning_rate": 0.041757712478685295, "loss": 0.1911, "num_input_tokens_seen": 26214720, "step": 30265 }, { "epoch": 14.271570014144272, "grad_norm": 0.005865566898137331, "learning_rate": 0.04171694132411085, "loss": 0.1773, "num_input_tokens_seen": 26219120, "step": 30270 }, { "epoch": 14.273927392739274, "grad_norm": 0.0062105427496135235, "learning_rate": 0.04167618686814479, "loss": 0.1474, "num_input_tokens_seen": 26223776, "step": 30275 }, { "epoch": 14.276284771334277, "grad_norm": 0.004904657602310181, "learning_rate": 0.041635449117072024, "loss": 0.1408, "num_input_tokens_seen": 26228448, "step": 30280 }, { "epoch": 14.27864214992928, "grad_norm": 0.010240467265248299, "learning_rate": 0.04159472807717477, "loss": 0.1152, "num_input_tokens_seen": 26233024, "step": 30285 }, { "epoch": 14.280999528524282, "grad_norm": 0.007853321731090546, "learning_rate": 0.041554023754732744, "loss": 0.1457, "num_input_tokens_seen": 26238400, "step": 30290 }, { "epoch": 14.283356907119284, "grad_norm": 0.0038395572919398546, "learning_rate": 0.04151333615602311, "loss": 0.1365, "num_input_tokens_seen": 26242304, "step": 30295 }, { "epoch": 14.285714285714286, "grad_norm": 0.005419599357992411, "learning_rate": 0.04147266528732034, "loss": 0.1455, "num_input_tokens_seen": 26246752, "step": 30300 }, { "epoch": 14.288071664309289, "grad_norm": 0.006951130460947752, "learning_rate": 0.0414320111548964, "loss": 0.1205, "num_input_tokens_seen": 26251440, "step": 30305 }, { "epoch": 14.290429042904291, "grad_norm": 0.008912274613976479, "learning_rate": 0.04139137376502076, "loss": 0.1555, "num_input_tokens_seen": 26255776, "step": 30310 }, { "epoch": 14.292786421499294, "grad_norm": 0.0076253036968410015, "learning_rate": 0.04135075312396014, "loss": 0.2262, "num_input_tokens_seen": 26259888, "step": 30315 }, { "epoch": 14.295143800094294, "grad_norm": 0.008812078274786472, "learning_rate": 0.04131014923797875, "loss": 0.1804, "num_input_tokens_seen": 26263712, "step": 30320 }, { "epoch": 14.297501178689297, "grad_norm": 0.007465405855327845, "learning_rate": 0.04126956211333819, "loss": 0.1957, "num_input_tokens_seen": 26267760, "step": 30325 }, { "epoch": 14.299858557284299, "grad_norm": 0.011151977814733982, "learning_rate": 0.041228991756297545, "loss": 0.2282, "num_input_tokens_seen": 26272304, "step": 30330 }, { "epoch": 14.302215935879302, "grad_norm": 0.0072767664678394794, "learning_rate": 0.04118843817311332, "loss": 0.192, "num_input_tokens_seen": 26276784, "step": 30335 }, { "epoch": 14.304573314474304, "grad_norm": 0.006412986665964127, "learning_rate": 0.0411479013700393, "loss": 0.1613, "num_input_tokens_seen": 26281104, "step": 30340 }, { "epoch": 14.306930693069306, "grad_norm": 0.0033187298104166985, "learning_rate": 0.0411073813533268, "loss": 0.1639, "num_input_tokens_seen": 26285536, "step": 30345 }, { "epoch": 14.309288071664309, "grad_norm": 0.00820187944918871, "learning_rate": 0.04106687812922456, "loss": 0.1562, "num_input_tokens_seen": 26289616, "step": 30350 }, { "epoch": 14.311645450259311, "grad_norm": 0.0035016450565308332, "learning_rate": 0.041026391703978635, "loss": 0.1093, "num_input_tokens_seen": 26294000, "step": 30355 }, { "epoch": 14.314002828854314, "grad_norm": 0.006643793545663357, "learning_rate": 0.04098592208383259, "loss": 0.1349, "num_input_tokens_seen": 26298496, "step": 30360 }, { "epoch": 14.316360207449316, "grad_norm": 0.007168669253587723, "learning_rate": 0.040945469275027256, "loss": 0.1422, "num_input_tokens_seen": 26302880, "step": 30365 }, { "epoch": 14.318717586044318, "grad_norm": 0.006721879355609417, "learning_rate": 0.04090503328380104, "loss": 0.1911, "num_input_tokens_seen": 26307008, "step": 30370 }, { "epoch": 14.32107496463932, "grad_norm": 0.00941894855350256, "learning_rate": 0.04086461411638971, "loss": 0.1879, "num_input_tokens_seen": 26310720, "step": 30375 }, { "epoch": 14.323432343234323, "grad_norm": 0.01057712733745575, "learning_rate": 0.04082421177902631, "loss": 0.2031, "num_input_tokens_seen": 26314832, "step": 30380 }, { "epoch": 14.325789721829326, "grad_norm": 0.003999708686023951, "learning_rate": 0.04078382627794149, "loss": 0.1286, "num_input_tokens_seen": 26319152, "step": 30385 }, { "epoch": 14.328147100424328, "grad_norm": 0.006086652632802725, "learning_rate": 0.04074345761936316, "loss": 0.1931, "num_input_tokens_seen": 26322992, "step": 30390 }, { "epoch": 14.33050447901933, "grad_norm": 0.007569155655801296, "learning_rate": 0.04070310580951663, "loss": 0.167, "num_input_tokens_seen": 26326352, "step": 30395 }, { "epoch": 14.332861857614333, "grad_norm": 0.007616538088768721, "learning_rate": 0.040662770854624726, "loss": 0.1962, "num_input_tokens_seen": 26330592, "step": 30400 }, { "epoch": 14.332861857614333, "eval_loss": 0.22741538286209106, "eval_runtime": 21.9146, "eval_samples_per_second": 43.031, "eval_steps_per_second": 21.538, "num_input_tokens_seen": 26330592, "step": 30400 }, { "epoch": 14.335219236209335, "grad_norm": 0.008938301354646683, "learning_rate": 0.040622452760907535, "loss": 0.1621, "num_input_tokens_seen": 26334704, "step": 30405 }, { "epoch": 14.337576614804338, "grad_norm": 0.0030964931938797235, "learning_rate": 0.04058215153458265, "loss": 0.1396, "num_input_tokens_seen": 26340400, "step": 30410 }, { "epoch": 14.33993399339934, "grad_norm": 0.010010600090026855, "learning_rate": 0.04054186718186507, "loss": 0.1972, "num_input_tokens_seen": 26344192, "step": 30415 }, { "epoch": 14.342291371994342, "grad_norm": 0.003785910317674279, "learning_rate": 0.04050159970896708, "loss": 0.1787, "num_input_tokens_seen": 26348128, "step": 30420 }, { "epoch": 14.344648750589345, "grad_norm": 0.00928839948028326, "learning_rate": 0.04046134912209843, "loss": 0.2316, "num_input_tokens_seen": 26352320, "step": 30425 }, { "epoch": 14.347006129184347, "grad_norm": 0.020709043368697166, "learning_rate": 0.040421115427466354, "loss": 0.1709, "num_input_tokens_seen": 26356384, "step": 30430 }, { "epoch": 14.34936350777935, "grad_norm": 0.005872691050171852, "learning_rate": 0.04038089863127529, "loss": 0.1923, "num_input_tokens_seen": 26361152, "step": 30435 }, { "epoch": 14.351720886374352, "grad_norm": 0.0058170282281935215, "learning_rate": 0.04034069873972727, "loss": 0.1519, "num_input_tokens_seen": 26366256, "step": 30440 }, { "epoch": 14.354078264969354, "grad_norm": 0.006014326121658087, "learning_rate": 0.040300515759021514, "loss": 0.2227, "num_input_tokens_seen": 26370368, "step": 30445 }, { "epoch": 14.356435643564357, "grad_norm": 0.006300934124737978, "learning_rate": 0.04026034969535478, "loss": 0.2189, "num_input_tokens_seen": 26374864, "step": 30450 }, { "epoch": 14.35879302215936, "grad_norm": 0.009482618421316147, "learning_rate": 0.040220200554921266, "loss": 0.1866, "num_input_tokens_seen": 26378960, "step": 30455 }, { "epoch": 14.361150400754362, "grad_norm": 0.0073554557748138905, "learning_rate": 0.0401800683439124, "loss": 0.1783, "num_input_tokens_seen": 26383616, "step": 30460 }, { "epoch": 14.363507779349364, "grad_norm": 0.01084878109395504, "learning_rate": 0.04013995306851704, "loss": 0.2351, "num_input_tokens_seen": 26387696, "step": 30465 }, { "epoch": 14.365865157944366, "grad_norm": 0.006201889365911484, "learning_rate": 0.040099854734921545, "loss": 0.1994, "num_input_tokens_seen": 26391312, "step": 30470 }, { "epoch": 14.368222536539369, "grad_norm": 0.005605104845017195, "learning_rate": 0.0400597733493095, "loss": 0.1883, "num_input_tokens_seen": 26395440, "step": 30475 }, { "epoch": 14.370579915134371, "grad_norm": 0.005972946994006634, "learning_rate": 0.04001970891786203, "loss": 0.2143, "num_input_tokens_seen": 26399616, "step": 30480 }, { "epoch": 14.372937293729374, "grad_norm": 0.008432709611952305, "learning_rate": 0.03997966144675752, "loss": 0.2105, "num_input_tokens_seen": 26404800, "step": 30485 }, { "epoch": 14.375294672324376, "grad_norm": 0.009495237842202187, "learning_rate": 0.039939630942171796, "loss": 0.2216, "num_input_tokens_seen": 26408848, "step": 30490 }, { "epoch": 14.377652050919378, "grad_norm": 0.004275960847735405, "learning_rate": 0.03989961741027815, "loss": 0.1534, "num_input_tokens_seen": 26413328, "step": 30495 }, { "epoch": 14.38000942951438, "grad_norm": 0.004292964935302734, "learning_rate": 0.03985962085724704, "loss": 0.1506, "num_input_tokens_seen": 26417072, "step": 30500 }, { "epoch": 14.382366808109383, "grad_norm": 0.0053281886503100395, "learning_rate": 0.03981964128924656, "loss": 0.1375, "num_input_tokens_seen": 26420544, "step": 30505 }, { "epoch": 14.384724186704386, "grad_norm": 0.004367724061012268, "learning_rate": 0.03977967871244197, "loss": 0.2169, "num_input_tokens_seen": 26424464, "step": 30510 }, { "epoch": 14.387081565299386, "grad_norm": 0.007210994604974985, "learning_rate": 0.03973973313299602, "loss": 0.183, "num_input_tokens_seen": 26428464, "step": 30515 }, { "epoch": 14.389438943894389, "grad_norm": 0.0048298304900527, "learning_rate": 0.0396998045570689, "loss": 0.2452, "num_input_tokens_seen": 26432704, "step": 30520 }, { "epoch": 14.391796322489391, "grad_norm": 0.008039802312850952, "learning_rate": 0.03965989299081798, "loss": 0.1431, "num_input_tokens_seen": 26437472, "step": 30525 }, { "epoch": 14.394153701084393, "grad_norm": 0.007098461501300335, "learning_rate": 0.039619998440398235, "loss": 0.1644, "num_input_tokens_seen": 26441680, "step": 30530 }, { "epoch": 14.396511079679396, "grad_norm": 0.009242547675967216, "learning_rate": 0.03958012091196184, "loss": 0.1678, "num_input_tokens_seen": 26446608, "step": 30535 }, { "epoch": 14.398868458274398, "grad_norm": 0.005284856539219618, "learning_rate": 0.039540260411658396, "loss": 0.1304, "num_input_tokens_seen": 26451152, "step": 30540 }, { "epoch": 14.4012258368694, "grad_norm": 0.008283290080726147, "learning_rate": 0.03950041694563496, "loss": 0.21, "num_input_tokens_seen": 26455328, "step": 30545 }, { "epoch": 14.403583215464403, "grad_norm": 0.004776647314429283, "learning_rate": 0.0394605905200358, "loss": 0.1726, "num_input_tokens_seen": 26459616, "step": 30550 }, { "epoch": 14.405940594059405, "grad_norm": 0.005495809018611908, "learning_rate": 0.03942078114100272, "loss": 0.1369, "num_input_tokens_seen": 26464336, "step": 30555 }, { "epoch": 14.408297972654408, "grad_norm": 0.008281443268060684, "learning_rate": 0.03938098881467485, "loss": 0.2248, "num_input_tokens_seen": 26468160, "step": 30560 }, { "epoch": 14.41065535124941, "grad_norm": 0.007201177999377251, "learning_rate": 0.039341213547188586, "loss": 0.2077, "num_input_tokens_seen": 26471936, "step": 30565 }, { "epoch": 14.413012729844413, "grad_norm": 0.009223357774317265, "learning_rate": 0.03930145534467782, "loss": 0.1638, "num_input_tokens_seen": 26476480, "step": 30570 }, { "epoch": 14.415370108439415, "grad_norm": 0.003936063963919878, "learning_rate": 0.0392617142132738, "loss": 0.1289, "num_input_tokens_seen": 26481536, "step": 30575 }, { "epoch": 14.417727487034417, "grad_norm": 0.007489695213735104, "learning_rate": 0.03922199015910504, "loss": 0.1849, "num_input_tokens_seen": 26485952, "step": 30580 }, { "epoch": 14.42008486562942, "grad_norm": 0.008202260360121727, "learning_rate": 0.039182283188297556, "loss": 0.2161, "num_input_tokens_seen": 26490160, "step": 30585 }, { "epoch": 14.422442244224422, "grad_norm": 0.006327849812805653, "learning_rate": 0.039142593306974595, "loss": 0.1878, "num_input_tokens_seen": 26494624, "step": 30590 }, { "epoch": 14.424799622819425, "grad_norm": 0.004990175366401672, "learning_rate": 0.039102920521256856, "loss": 0.179, "num_input_tokens_seen": 26498400, "step": 30595 }, { "epoch": 14.427157001414427, "grad_norm": 0.006220838986337185, "learning_rate": 0.03906326483726243, "loss": 0.1747, "num_input_tokens_seen": 26502800, "step": 30600 }, { "epoch": 14.427157001414427, "eval_loss": 0.21772323548793793, "eval_runtime": 21.9051, "eval_samples_per_second": 43.049, "eval_steps_per_second": 21.548, "num_input_tokens_seen": 26502800, "step": 30600 }, { "epoch": 14.42951438000943, "grad_norm": 0.008114648051559925, "learning_rate": 0.039023626261106704, "loss": 0.1938, "num_input_tokens_seen": 26507264, "step": 30605 }, { "epoch": 14.431871758604432, "grad_norm": 0.005522445775568485, "learning_rate": 0.03898400479890237, "loss": 0.2231, "num_input_tokens_seen": 26511008, "step": 30610 }, { "epoch": 14.434229137199434, "grad_norm": 0.0071020303294062614, "learning_rate": 0.038944400456759655, "loss": 0.158, "num_input_tokens_seen": 26515712, "step": 30615 }, { "epoch": 14.436586515794437, "grad_norm": 0.007140698377043009, "learning_rate": 0.038904813240785964, "loss": 0.2201, "num_input_tokens_seen": 26519616, "step": 30620 }, { "epoch": 14.438943894389439, "grad_norm": 0.007632260676473379, "learning_rate": 0.03886524315708621, "loss": 0.1753, "num_input_tokens_seen": 26523728, "step": 30625 }, { "epoch": 14.441301272984441, "grad_norm": 0.005710085388273001, "learning_rate": 0.03882569021176255, "loss": 0.1729, "num_input_tokens_seen": 26527664, "step": 30630 }, { "epoch": 14.443658651579444, "grad_norm": 0.004584199283272028, "learning_rate": 0.038786154410914535, "loss": 0.1316, "num_input_tokens_seen": 26532528, "step": 30635 }, { "epoch": 14.446016030174446, "grad_norm": 0.003939956892281771, "learning_rate": 0.03874663576063917, "loss": 0.1594, "num_input_tokens_seen": 26536304, "step": 30640 }, { "epoch": 14.448373408769449, "grad_norm": 0.00636113528162241, "learning_rate": 0.038707134267030624, "loss": 0.1187, "num_input_tokens_seen": 26540192, "step": 30645 }, { "epoch": 14.450730787364451, "grad_norm": 0.004779467824846506, "learning_rate": 0.038667649936180555, "loss": 0.1771, "num_input_tokens_seen": 26544720, "step": 30650 }, { "epoch": 14.453088165959453, "grad_norm": 0.004768576472997665, "learning_rate": 0.038628182774178, "loss": 0.1833, "num_input_tokens_seen": 26549248, "step": 30655 }, { "epoch": 14.455445544554456, "grad_norm": 0.005710358265787363, "learning_rate": 0.038588732787109226, "loss": 0.1693, "num_input_tokens_seen": 26553152, "step": 30660 }, { "epoch": 14.457802923149458, "grad_norm": 0.004045804496854544, "learning_rate": 0.03854929998105795, "loss": 0.2033, "num_input_tokens_seen": 26557520, "step": 30665 }, { "epoch": 14.46016030174446, "grad_norm": 0.005419635213911533, "learning_rate": 0.03850988436210518, "loss": 0.238, "num_input_tokens_seen": 26561616, "step": 30670 }, { "epoch": 14.462517680339463, "grad_norm": 0.004515789449214935, "learning_rate": 0.03847048593632933, "loss": 0.1574, "num_input_tokens_seen": 26566496, "step": 30675 }, { "epoch": 14.464875058934465, "grad_norm": 0.00880457554012537, "learning_rate": 0.038431104709806096, "loss": 0.1744, "num_input_tokens_seen": 26570784, "step": 30680 }, { "epoch": 14.467232437529468, "grad_norm": 0.0048517887480556965, "learning_rate": 0.0383917406886086, "loss": 0.2031, "num_input_tokens_seen": 26574800, "step": 30685 }, { "epoch": 14.46958981612447, "grad_norm": 0.005601128097623587, "learning_rate": 0.03835239387880722, "loss": 0.1905, "num_input_tokens_seen": 26579152, "step": 30690 }, { "epoch": 14.471947194719473, "grad_norm": 0.006484190467745066, "learning_rate": 0.03831306428646979, "loss": 0.1749, "num_input_tokens_seen": 26583664, "step": 30695 }, { "epoch": 14.474304573314475, "grad_norm": 0.004080051556229591, "learning_rate": 0.03827375191766135, "loss": 0.1428, "num_input_tokens_seen": 26587744, "step": 30700 }, { "epoch": 14.476661951909477, "grad_norm": 0.0036432817578315735, "learning_rate": 0.03823445677844446, "loss": 0.1077, "num_input_tokens_seen": 26592720, "step": 30705 }, { "epoch": 14.47901933050448, "grad_norm": 0.0063684056513011456, "learning_rate": 0.03819517887487881, "loss": 0.1862, "num_input_tokens_seen": 26596736, "step": 30710 }, { "epoch": 14.481376709099482, "grad_norm": 0.00417819619178772, "learning_rate": 0.03815591821302161, "loss": 0.1615, "num_input_tokens_seen": 26600592, "step": 30715 }, { "epoch": 14.483734087694483, "grad_norm": 0.006939438171684742, "learning_rate": 0.03811667479892739, "loss": 0.2114, "num_input_tokens_seen": 26604896, "step": 30720 }, { "epoch": 14.486091466289485, "grad_norm": 0.0059495954774320126, "learning_rate": 0.03807744863864788, "loss": 0.155, "num_input_tokens_seen": 26608512, "step": 30725 }, { "epoch": 14.488448844884488, "grad_norm": 0.009088069200515747, "learning_rate": 0.03803823973823229, "loss": 0.182, "num_input_tokens_seen": 26612624, "step": 30730 }, { "epoch": 14.49080622347949, "grad_norm": 0.005793694406747818, "learning_rate": 0.03799904810372719, "loss": 0.2084, "num_input_tokens_seen": 26617072, "step": 30735 }, { "epoch": 14.493163602074493, "grad_norm": 0.00692905206233263, "learning_rate": 0.03795987374117632, "loss": 0.1498, "num_input_tokens_seen": 26621312, "step": 30740 }, { "epoch": 14.495520980669495, "grad_norm": 0.0051119206473231316, "learning_rate": 0.03792071665662093, "loss": 0.1586, "num_input_tokens_seen": 26625456, "step": 30745 }, { "epoch": 14.497878359264497, "grad_norm": 0.009916132315993309, "learning_rate": 0.03788157685609952, "loss": 0.1904, "num_input_tokens_seen": 26629456, "step": 30750 }, { "epoch": 14.5002357378595, "grad_norm": 0.003021150827407837, "learning_rate": 0.037842454345647876, "loss": 0.1779, "num_input_tokens_seen": 26634224, "step": 30755 }, { "epoch": 14.502593116454502, "grad_norm": 0.006743459030985832, "learning_rate": 0.03780334913129929, "loss": 0.1963, "num_input_tokens_seen": 26638624, "step": 30760 }, { "epoch": 14.504950495049505, "grad_norm": 0.001900395262055099, "learning_rate": 0.037764261219084175, "loss": 0.1511, "num_input_tokens_seen": 26642368, "step": 30765 }, { "epoch": 14.507307873644507, "grad_norm": 0.010378874838352203, "learning_rate": 0.037725190615030414, "loss": 0.2182, "num_input_tokens_seen": 26646336, "step": 30770 }, { "epoch": 14.50966525223951, "grad_norm": 0.005705143325030804, "learning_rate": 0.037686137325163224, "loss": 0.156, "num_input_tokens_seen": 26650016, "step": 30775 }, { "epoch": 14.512022630834512, "grad_norm": 0.005963718052953482, "learning_rate": 0.037647101355505065, "loss": 0.2239, "num_input_tokens_seen": 26653664, "step": 30780 }, { "epoch": 14.514380009429514, "grad_norm": 0.009307513013482094, "learning_rate": 0.03760808271207581, "loss": 0.2198, "num_input_tokens_seen": 26658112, "step": 30785 }, { "epoch": 14.516737388024517, "grad_norm": 0.005175680387765169, "learning_rate": 0.03756908140089258, "loss": 0.1737, "num_input_tokens_seen": 26663104, "step": 30790 }, { "epoch": 14.519094766619519, "grad_norm": 0.0032260785810649395, "learning_rate": 0.03753009742796989, "loss": 0.1403, "num_input_tokens_seen": 26667280, "step": 30795 }, { "epoch": 14.521452145214521, "grad_norm": 0.004864353686571121, "learning_rate": 0.037491130799319615, "loss": 0.2064, "num_input_tokens_seen": 26671584, "step": 30800 }, { "epoch": 14.521452145214521, "eval_loss": 0.21666614711284637, "eval_runtime": 21.928, "eval_samples_per_second": 43.004, "eval_steps_per_second": 21.525, "num_input_tokens_seen": 26671584, "step": 30800 }, { "epoch": 14.523809523809524, "grad_norm": 0.004547072574496269, "learning_rate": 0.03745218152095079, "loss": 0.1435, "num_input_tokens_seen": 26675920, "step": 30805 }, { "epoch": 14.526166902404526, "grad_norm": 0.0031764600425958633, "learning_rate": 0.037413249598869935, "loss": 0.1119, "num_input_tokens_seen": 26680016, "step": 30810 }, { "epoch": 14.528524280999529, "grad_norm": 0.0048849573358893394, "learning_rate": 0.037374335039080886, "loss": 0.1895, "num_input_tokens_seen": 26684416, "step": 30815 }, { "epoch": 14.530881659594531, "grad_norm": 0.00830648560076952, "learning_rate": 0.037335437847584724, "loss": 0.1839, "num_input_tokens_seen": 26688736, "step": 30820 }, { "epoch": 14.533239038189533, "grad_norm": 0.006046365015208721, "learning_rate": 0.03729655803037983, "loss": 0.1501, "num_input_tokens_seen": 26692848, "step": 30825 }, { "epoch": 14.535596416784536, "grad_norm": 0.006857164669781923, "learning_rate": 0.03725769559346207, "loss": 0.1747, "num_input_tokens_seen": 26697888, "step": 30830 }, { "epoch": 14.537953795379538, "grad_norm": 0.005469634663313627, "learning_rate": 0.03721885054282439, "loss": 0.1818, "num_input_tokens_seen": 26701648, "step": 30835 }, { "epoch": 14.54031117397454, "grad_norm": 0.01010989025235176, "learning_rate": 0.03718002288445731, "loss": 0.1389, "num_input_tokens_seen": 26706240, "step": 30840 }, { "epoch": 14.542668552569543, "grad_norm": 0.003932561259716749, "learning_rate": 0.03714121262434844, "loss": 0.1906, "num_input_tokens_seen": 26710656, "step": 30845 }, { "epoch": 14.545025931164545, "grad_norm": 0.0035110118333250284, "learning_rate": 0.037102419768482844, "loss": 0.148, "num_input_tokens_seen": 26715264, "step": 30850 }, { "epoch": 14.547383309759548, "grad_norm": 0.0077673569321632385, "learning_rate": 0.03706364432284293, "loss": 0.2135, "num_input_tokens_seen": 26720176, "step": 30855 }, { "epoch": 14.54974068835455, "grad_norm": 0.003392778569832444, "learning_rate": 0.03702488629340828, "loss": 0.1964, "num_input_tokens_seen": 26724448, "step": 30860 }, { "epoch": 14.552098066949553, "grad_norm": 0.007430478930473328, "learning_rate": 0.036986145686155915, "loss": 0.1767, "num_input_tokens_seen": 26728944, "step": 30865 }, { "epoch": 14.554455445544555, "grad_norm": 0.006181610282510519, "learning_rate": 0.036947422507060075, "loss": 0.1622, "num_input_tokens_seen": 26732432, "step": 30870 }, { "epoch": 14.556812824139557, "grad_norm": 0.007989470846951008, "learning_rate": 0.0369087167620924, "loss": 0.1908, "num_input_tokens_seen": 26736368, "step": 30875 }, { "epoch": 14.55917020273456, "grad_norm": 0.0057749636471271515, "learning_rate": 0.03687002845722183, "loss": 0.1814, "num_input_tokens_seen": 26740752, "step": 30880 }, { "epoch": 14.561527581329562, "grad_norm": 0.005783790722489357, "learning_rate": 0.03683135759841451, "loss": 0.1758, "num_input_tokens_seen": 26744560, "step": 30885 }, { "epoch": 14.563884959924565, "grad_norm": 0.007686834316700697, "learning_rate": 0.03679270419163406, "loss": 0.2042, "num_input_tokens_seen": 26748480, "step": 30890 }, { "epoch": 14.566242338519567, "grad_norm": 0.008002380840480328, "learning_rate": 0.03675406824284127, "loss": 0.1778, "num_input_tokens_seen": 26752448, "step": 30895 }, { "epoch": 14.56859971711457, "grad_norm": 0.0067858281545341015, "learning_rate": 0.03671544975799425, "loss": 0.1815, "num_input_tokens_seen": 26756816, "step": 30900 }, { "epoch": 14.570957095709572, "grad_norm": 0.005919996183365583, "learning_rate": 0.03667684874304854, "loss": 0.1594, "num_input_tokens_seen": 26760992, "step": 30905 }, { "epoch": 14.573314474304574, "grad_norm": 0.007145118899643421, "learning_rate": 0.03663826520395683, "loss": 0.1607, "num_input_tokens_seen": 26765184, "step": 30910 }, { "epoch": 14.575671852899575, "grad_norm": 0.006249037571251392, "learning_rate": 0.03659969914666922, "loss": 0.1857, "num_input_tokens_seen": 26769472, "step": 30915 }, { "epoch": 14.578029231494579, "grad_norm": 0.00717057753354311, "learning_rate": 0.036561150577133106, "loss": 0.1554, "num_input_tokens_seen": 26774112, "step": 30920 }, { "epoch": 14.58038661008958, "grad_norm": 0.005977169144898653, "learning_rate": 0.036522619501293103, "loss": 0.1653, "num_input_tokens_seen": 26778480, "step": 30925 }, { "epoch": 14.582743988684582, "grad_norm": 0.010449199937283993, "learning_rate": 0.03648410592509122, "loss": 0.1706, "num_input_tokens_seen": 26782720, "step": 30930 }, { "epoch": 14.585101367279584, "grad_norm": 0.007821097038686275, "learning_rate": 0.03644560985446676, "loss": 0.1978, "num_input_tokens_seen": 26786624, "step": 30935 }, { "epoch": 14.587458745874587, "grad_norm": 0.006361290346831083, "learning_rate": 0.036407131295356256, "loss": 0.1564, "num_input_tokens_seen": 26790512, "step": 30940 }, { "epoch": 14.58981612446959, "grad_norm": 0.007763554807752371, "learning_rate": 0.03636867025369362, "loss": 0.1472, "num_input_tokens_seen": 26795648, "step": 30945 }, { "epoch": 14.592173503064592, "grad_norm": 0.006469822023063898, "learning_rate": 0.03633022673540999, "loss": 0.1516, "num_input_tokens_seen": 26800896, "step": 30950 }, { "epoch": 14.594530881659594, "grad_norm": 0.005037639755755663, "learning_rate": 0.03629180074643385, "loss": 0.1573, "num_input_tokens_seen": 26805776, "step": 30955 }, { "epoch": 14.596888260254596, "grad_norm": 0.006903060246258974, "learning_rate": 0.03625339229269102, "loss": 0.1698, "num_input_tokens_seen": 26810432, "step": 30960 }, { "epoch": 14.599245638849599, "grad_norm": 0.009457863867282867, "learning_rate": 0.036215001380104535, "loss": 0.2112, "num_input_tokens_seen": 26814848, "step": 30965 }, { "epoch": 14.601603017444601, "grad_norm": 0.013125145807862282, "learning_rate": 0.03617662801459471, "loss": 0.2344, "num_input_tokens_seen": 26820000, "step": 30970 }, { "epoch": 14.603960396039604, "grad_norm": 0.0061284014955163, "learning_rate": 0.036138272202079276, "loss": 0.1606, "num_input_tokens_seen": 26824288, "step": 30975 }, { "epoch": 14.606317774634606, "grad_norm": 0.005944352131336927, "learning_rate": 0.036099933948473106, "loss": 0.1725, "num_input_tokens_seen": 26829280, "step": 30980 }, { "epoch": 14.608675153229608, "grad_norm": 0.0058021689765155315, "learning_rate": 0.03606161325968851, "loss": 0.1529, "num_input_tokens_seen": 26833456, "step": 30985 }, { "epoch": 14.61103253182461, "grad_norm": 0.004104289226233959, "learning_rate": 0.03602331014163496, "loss": 0.1988, "num_input_tokens_seen": 26837472, "step": 30990 }, { "epoch": 14.613389910419613, "grad_norm": 0.008772805333137512, "learning_rate": 0.035985024600219295, "loss": 0.1689, "num_input_tokens_seen": 26841904, "step": 30995 }, { "epoch": 14.615747289014616, "grad_norm": 0.006049638614058495, "learning_rate": 0.03594675664134569, "loss": 0.1868, "num_input_tokens_seen": 26845568, "step": 31000 }, { "epoch": 14.615747289014616, "eval_loss": 0.22147604823112488, "eval_runtime": 21.8964, "eval_samples_per_second": 43.067, "eval_steps_per_second": 21.556, "num_input_tokens_seen": 26845568, "step": 31000 }, { "epoch": 14.618104667609618, "grad_norm": 0.005350709427148104, "learning_rate": 0.03590850627091545, "loss": 0.1604, "num_input_tokens_seen": 26849472, "step": 31005 }, { "epoch": 14.62046204620462, "grad_norm": 0.006496004294604063, "learning_rate": 0.03587027349482731, "loss": 0.1983, "num_input_tokens_seen": 26854160, "step": 31010 }, { "epoch": 14.622819424799623, "grad_norm": 0.010208907537162304, "learning_rate": 0.035832058318977275, "loss": 0.2148, "num_input_tokens_seen": 26859184, "step": 31015 }, { "epoch": 14.625176803394625, "grad_norm": 0.009728824719786644, "learning_rate": 0.03579386074925853, "loss": 0.1955, "num_input_tokens_seen": 26862912, "step": 31020 }, { "epoch": 14.627534181989628, "grad_norm": 0.006865161005407572, "learning_rate": 0.035755680791561696, "loss": 0.2106, "num_input_tokens_seen": 26868416, "step": 31025 }, { "epoch": 14.62989156058463, "grad_norm": 0.006134893745183945, "learning_rate": 0.03571751845177454, "loss": 0.1957, "num_input_tokens_seen": 26871872, "step": 31030 }, { "epoch": 14.632248939179632, "grad_norm": 0.007742851972579956, "learning_rate": 0.03567937373578225, "loss": 0.2745, "num_input_tokens_seen": 26875488, "step": 31035 }, { "epoch": 14.634606317774635, "grad_norm": 0.0035222978331148624, "learning_rate": 0.03564124664946711, "loss": 0.2, "num_input_tokens_seen": 26879584, "step": 31040 }, { "epoch": 14.636963696369637, "grad_norm": 0.0058174519799649715, "learning_rate": 0.035603137198708924, "loss": 0.1808, "num_input_tokens_seen": 26883344, "step": 31045 }, { "epoch": 14.63932107496464, "grad_norm": 0.004803921561688185, "learning_rate": 0.035565045389384514, "loss": 0.2108, "num_input_tokens_seen": 26887840, "step": 31050 }, { "epoch": 14.641678453559642, "grad_norm": 0.005257738754153252, "learning_rate": 0.03552697122736823, "loss": 0.1544, "num_input_tokens_seen": 26891648, "step": 31055 }, { "epoch": 14.644035832154644, "grad_norm": 0.0031367656774818897, "learning_rate": 0.03548891471853153, "loss": 0.1669, "num_input_tokens_seen": 26896416, "step": 31060 }, { "epoch": 14.646393210749647, "grad_norm": 0.005687291268259287, "learning_rate": 0.03545087586874322, "loss": 0.1147, "num_input_tokens_seen": 26901696, "step": 31065 }, { "epoch": 14.64875058934465, "grad_norm": 0.005886594764888287, "learning_rate": 0.03541285468386935, "loss": 0.1493, "num_input_tokens_seen": 26905696, "step": 31070 }, { "epoch": 14.651107967939652, "grad_norm": 0.005129913333803415, "learning_rate": 0.03537485116977327, "loss": 0.178, "num_input_tokens_seen": 26909600, "step": 31075 }, { "epoch": 14.653465346534654, "grad_norm": 0.009497813880443573, "learning_rate": 0.03533686533231565, "loss": 0.1959, "num_input_tokens_seen": 26914304, "step": 31080 }, { "epoch": 14.655822725129656, "grad_norm": 0.005658426322042942, "learning_rate": 0.0352988971773543, "loss": 0.1635, "num_input_tokens_seen": 26918400, "step": 31085 }, { "epoch": 14.658180103724659, "grad_norm": 0.005134064704179764, "learning_rate": 0.03526094671074443, "loss": 0.1503, "num_input_tokens_seen": 26922720, "step": 31090 }, { "epoch": 14.660537482319661, "grad_norm": 0.005945050157606602, "learning_rate": 0.03522301393833852, "loss": 0.1871, "num_input_tokens_seen": 26927504, "step": 31095 }, { "epoch": 14.662894860914664, "grad_norm": 0.0078046247363090515, "learning_rate": 0.035185098865986204, "loss": 0.2361, "num_input_tokens_seen": 26932000, "step": 31100 }, { "epoch": 14.665252239509666, "grad_norm": 0.008537321351468563, "learning_rate": 0.03514720149953453, "loss": 0.2006, "num_input_tokens_seen": 26936704, "step": 31105 }, { "epoch": 14.667609618104667, "grad_norm": 0.006612342782318592, "learning_rate": 0.03510932184482773, "loss": 0.2019, "num_input_tokens_seen": 26941296, "step": 31110 }, { "epoch": 14.66996699669967, "grad_norm": 0.0032411497086286545, "learning_rate": 0.03507145990770724, "loss": 0.1692, "num_input_tokens_seen": 26946336, "step": 31115 }, { "epoch": 14.672324375294671, "grad_norm": 0.0038390459958463907, "learning_rate": 0.035033615694011984, "loss": 0.1127, "num_input_tokens_seen": 26950544, "step": 31120 }, { "epoch": 14.674681753889674, "grad_norm": 0.006515556015074253, "learning_rate": 0.03499578920957788, "loss": 0.1673, "num_input_tokens_seen": 26955088, "step": 31125 }, { "epoch": 14.677039132484676, "grad_norm": 0.004634056705981493, "learning_rate": 0.034957980460238375, "loss": 0.1804, "num_input_tokens_seen": 26959920, "step": 31130 }, { "epoch": 14.679396511079679, "grad_norm": 0.009707312099635601, "learning_rate": 0.03492018945182393, "loss": 0.2016, "num_input_tokens_seen": 26964160, "step": 31135 }, { "epoch": 14.681753889674681, "grad_norm": 0.005718636326491833, "learning_rate": 0.03488241619016247, "loss": 0.1753, "num_input_tokens_seen": 26968160, "step": 31140 }, { "epoch": 14.684111268269683, "grad_norm": 0.00897898431867361, "learning_rate": 0.03484466068107913, "loss": 0.169, "num_input_tokens_seen": 26972560, "step": 31145 }, { "epoch": 14.686468646864686, "grad_norm": 0.010096997022628784, "learning_rate": 0.034806922930396195, "loss": 0.2276, "num_input_tokens_seen": 26976464, "step": 31150 }, { "epoch": 14.688826025459688, "grad_norm": 0.003829028457403183, "learning_rate": 0.03476920294393337, "loss": 0.169, "num_input_tokens_seen": 26981040, "step": 31155 }, { "epoch": 14.69118340405469, "grad_norm": 0.012253743596374989, "learning_rate": 0.03473150072750755, "loss": 0.2044, "num_input_tokens_seen": 26985152, "step": 31160 }, { "epoch": 14.693540782649693, "grad_norm": 0.00750062195584178, "learning_rate": 0.03469381628693284, "loss": 0.2548, "num_input_tokens_seen": 26989168, "step": 31165 }, { "epoch": 14.695898161244696, "grad_norm": 0.0058988165110349655, "learning_rate": 0.03465614962802072, "loss": 0.2135, "num_input_tokens_seen": 26994112, "step": 31170 }, { "epoch": 14.698255539839698, "grad_norm": 0.005652610212564468, "learning_rate": 0.0346185007565798, "loss": 0.1797, "num_input_tokens_seen": 26998272, "step": 31175 }, { "epoch": 14.7006129184347, "grad_norm": 0.006031505297869444, "learning_rate": 0.03458086967841609, "loss": 0.1574, "num_input_tokens_seen": 27002240, "step": 31180 }, { "epoch": 14.702970297029703, "grad_norm": 0.007745372597128153, "learning_rate": 0.03454325639933266, "loss": 0.2028, "num_input_tokens_seen": 27006240, "step": 31185 }, { "epoch": 14.705327675624705, "grad_norm": 0.005938663613051176, "learning_rate": 0.03450566092513007, "loss": 0.1504, "num_input_tokens_seen": 27010288, "step": 31190 }, { "epoch": 14.707685054219708, "grad_norm": 0.004308302886784077, "learning_rate": 0.034468083261605914, "loss": 0.1725, "num_input_tokens_seen": 27014144, "step": 31195 }, { "epoch": 14.71004243281471, "grad_norm": 0.006093740463256836, "learning_rate": 0.03443052341455522, "loss": 0.179, "num_input_tokens_seen": 27017952, "step": 31200 }, { "epoch": 14.71004243281471, "eval_loss": 0.21963630616664886, "eval_runtime": 21.939, "eval_samples_per_second": 42.983, "eval_steps_per_second": 21.514, "num_input_tokens_seen": 27017952, "step": 31200 }, { "epoch": 14.712399811409712, "grad_norm": 0.006688254419714212, "learning_rate": 0.0343929813897701, "loss": 0.1483, "num_input_tokens_seen": 27021600, "step": 31205 }, { "epoch": 14.714757190004715, "grad_norm": 0.009602392092347145, "learning_rate": 0.034355457193040125, "loss": 0.1753, "num_input_tokens_seen": 27026400, "step": 31210 }, { "epoch": 14.717114568599717, "grad_norm": 0.003857169533148408, "learning_rate": 0.03431795083015186, "loss": 0.1761, "num_input_tokens_seen": 27030352, "step": 31215 }, { "epoch": 14.71947194719472, "grad_norm": 0.004795355722308159, "learning_rate": 0.03428046230688936, "loss": 0.1524, "num_input_tokens_seen": 27034528, "step": 31220 }, { "epoch": 14.721829325789722, "grad_norm": 0.007884330116212368, "learning_rate": 0.034242991629033805, "loss": 0.2026, "num_input_tokens_seen": 27038640, "step": 31225 }, { "epoch": 14.724186704384724, "grad_norm": 0.0042383079417049885, "learning_rate": 0.03420553880236362, "loss": 0.1221, "num_input_tokens_seen": 27042960, "step": 31230 }, { "epoch": 14.726544082979727, "grad_norm": 0.007879868149757385, "learning_rate": 0.03416810383265449, "loss": 0.2238, "num_input_tokens_seen": 27048592, "step": 31235 }, { "epoch": 14.72890146157473, "grad_norm": 0.005207326728850603, "learning_rate": 0.03413068672567944, "loss": 0.174, "num_input_tokens_seen": 27053072, "step": 31240 }, { "epoch": 14.731258840169732, "grad_norm": 0.006050557363778353, "learning_rate": 0.034093287487208565, "loss": 0.1566, "num_input_tokens_seen": 27056784, "step": 31245 }, { "epoch": 14.733616218764734, "grad_norm": 0.008078157901763916, "learning_rate": 0.03405590612300937, "loss": 0.2179, "num_input_tokens_seen": 27060288, "step": 31250 }, { "epoch": 14.735973597359736, "grad_norm": 0.007427724078297615, "learning_rate": 0.03401854263884646, "loss": 0.1677, "num_input_tokens_seen": 27065344, "step": 31255 }, { "epoch": 14.738330975954739, "grad_norm": 0.010830432176589966, "learning_rate": 0.033981197040481824, "loss": 0.2085, "num_input_tokens_seen": 27069184, "step": 31260 }, { "epoch": 14.740688354549741, "grad_norm": 0.010542825795710087, "learning_rate": 0.03394386933367459, "loss": 0.1572, "num_input_tokens_seen": 27074496, "step": 31265 }, { "epoch": 14.743045733144744, "grad_norm": 0.0029538762755692005, "learning_rate": 0.033906559524181104, "loss": 0.1462, "num_input_tokens_seen": 27078304, "step": 31270 }, { "epoch": 14.745403111739746, "grad_norm": 0.003832175163552165, "learning_rate": 0.033869267617755085, "loss": 0.2213, "num_input_tokens_seen": 27082848, "step": 31275 }, { "epoch": 14.747760490334748, "grad_norm": 0.008046318776905537, "learning_rate": 0.0338319936201474, "loss": 0.1584, "num_input_tokens_seen": 27087184, "step": 31280 }, { "epoch": 14.75011786892975, "grad_norm": 0.009863331913948059, "learning_rate": 0.033794737537106136, "loss": 0.2069, "num_input_tokens_seen": 27091072, "step": 31285 }, { "epoch": 14.752475247524753, "grad_norm": 0.0031112448778003454, "learning_rate": 0.03375749937437671, "loss": 0.1178, "num_input_tokens_seen": 27095680, "step": 31290 }, { "epoch": 14.754832626119756, "grad_norm": 0.005577183328568935, "learning_rate": 0.033720279137701634, "loss": 0.1804, "num_input_tokens_seen": 27100160, "step": 31295 }, { "epoch": 14.757190004714758, "grad_norm": 0.008729926310479641, "learning_rate": 0.03368307683282078, "loss": 0.205, "num_input_tokens_seen": 27104384, "step": 31300 }, { "epoch": 14.75954738330976, "grad_norm": 0.003878445364534855, "learning_rate": 0.033645892465471235, "loss": 0.1249, "num_input_tokens_seen": 27108208, "step": 31305 }, { "epoch": 14.761904761904763, "grad_norm": 0.0052953679114580154, "learning_rate": 0.03360872604138724, "loss": 0.1733, "num_input_tokens_seen": 27112384, "step": 31310 }, { "epoch": 14.764262140499763, "grad_norm": 0.00709918886423111, "learning_rate": 0.03357157756630034, "loss": 0.2248, "num_input_tokens_seen": 27116704, "step": 31315 }, { "epoch": 14.766619519094768, "grad_norm": 0.008181417360901833, "learning_rate": 0.033534447045939365, "loss": 0.2452, "num_input_tokens_seen": 27121776, "step": 31320 }, { "epoch": 14.768976897689768, "grad_norm": 0.007131804712116718, "learning_rate": 0.03349733448603026, "loss": 0.1893, "num_input_tokens_seen": 27127392, "step": 31325 }, { "epoch": 14.77133427628477, "grad_norm": 0.008365755900740623, "learning_rate": 0.03346023989229619, "loss": 0.173, "num_input_tokens_seen": 27131360, "step": 31330 }, { "epoch": 14.773691654879773, "grad_norm": 0.007154990918934345, "learning_rate": 0.03342316327045769, "loss": 0.1618, "num_input_tokens_seen": 27135536, "step": 31335 }, { "epoch": 14.776049033474775, "grad_norm": 0.005592873319983482, "learning_rate": 0.033386104626232385, "loss": 0.1738, "num_input_tokens_seen": 27140528, "step": 31340 }, { "epoch": 14.778406412069778, "grad_norm": 0.0059942808002233505, "learning_rate": 0.03334906396533525, "loss": 0.2613, "num_input_tokens_seen": 27145216, "step": 31345 }, { "epoch": 14.78076379066478, "grad_norm": 0.004958879202604294, "learning_rate": 0.033312041293478326, "loss": 0.1538, "num_input_tokens_seen": 27149792, "step": 31350 }, { "epoch": 14.783121169259783, "grad_norm": 0.004528856370598078, "learning_rate": 0.03327503661637103, "loss": 0.2081, "num_input_tokens_seen": 27154832, "step": 31355 }, { "epoch": 14.785478547854785, "grad_norm": 0.003867924911901355, "learning_rate": 0.03323804993971998, "loss": 0.1411, "num_input_tokens_seen": 27158352, "step": 31360 }, { "epoch": 14.787835926449787, "grad_norm": 0.00661414535716176, "learning_rate": 0.033201081269228924, "loss": 0.174, "num_input_tokens_seen": 27162208, "step": 31365 }, { "epoch": 14.79019330504479, "grad_norm": 0.007284439634531736, "learning_rate": 0.03316413061059895, "loss": 0.1541, "num_input_tokens_seen": 27166272, "step": 31370 }, { "epoch": 14.792550683639792, "grad_norm": 0.004178010392934084, "learning_rate": 0.03312719796952827, "loss": 0.1765, "num_input_tokens_seen": 27170208, "step": 31375 }, { "epoch": 14.794908062234795, "grad_norm": 0.010041406378149986, "learning_rate": 0.03309028335171236, "loss": 0.1686, "num_input_tokens_seen": 27174336, "step": 31380 }, { "epoch": 14.797265440829797, "grad_norm": 0.005167939700186253, "learning_rate": 0.03305338676284398, "loss": 0.1528, "num_input_tokens_seen": 27179008, "step": 31385 }, { "epoch": 14.7996228194248, "grad_norm": 0.004662918392568827, "learning_rate": 0.03301650820861296, "loss": 0.1553, "num_input_tokens_seen": 27183536, "step": 31390 }, { "epoch": 14.801980198019802, "grad_norm": 0.00854891911149025, "learning_rate": 0.03297964769470652, "loss": 0.2318, "num_input_tokens_seen": 27187728, "step": 31395 }, { "epoch": 14.804337576614804, "grad_norm": 0.007363725453615189, "learning_rate": 0.032942805226808945, "loss": 0.2227, "num_input_tokens_seen": 27191600, "step": 31400 }, { "epoch": 14.804337576614804, "eval_loss": 0.22277867794036865, "eval_runtime": 21.9277, "eval_samples_per_second": 43.005, "eval_steps_per_second": 21.525, "num_input_tokens_seen": 27191600, "step": 31400 }, { "epoch": 14.806694955209807, "grad_norm": 0.007005356252193451, "learning_rate": 0.03290598081060187, "loss": 0.2265, "num_input_tokens_seen": 27195712, "step": 31405 }, { "epoch": 14.809052333804809, "grad_norm": 0.008061530068516731, "learning_rate": 0.03286917445176407, "loss": 0.1856, "num_input_tokens_seen": 27199472, "step": 31410 }, { "epoch": 14.811409712399811, "grad_norm": 0.007801678031682968, "learning_rate": 0.032832386155971456, "loss": 0.1855, "num_input_tokens_seen": 27204000, "step": 31415 }, { "epoch": 14.813767090994814, "grad_norm": 0.008730773814022541, "learning_rate": 0.032795615928897334, "loss": 0.2689, "num_input_tokens_seen": 27208208, "step": 31420 }, { "epoch": 14.816124469589816, "grad_norm": 0.0058345007710158825, "learning_rate": 0.03275886377621215, "loss": 0.1538, "num_input_tokens_seen": 27212256, "step": 31425 }, { "epoch": 14.818481848184819, "grad_norm": 0.01002301275730133, "learning_rate": 0.03272212970358348, "loss": 0.2258, "num_input_tokens_seen": 27216736, "step": 31430 }, { "epoch": 14.820839226779821, "grad_norm": 0.004367421381175518, "learning_rate": 0.032685413716676215, "loss": 0.1584, "num_input_tokens_seen": 27221072, "step": 31435 }, { "epoch": 14.823196605374823, "grad_norm": 0.004373235162347555, "learning_rate": 0.032648715821152474, "loss": 0.192, "num_input_tokens_seen": 27224688, "step": 31440 }, { "epoch": 14.825553983969826, "grad_norm": 0.005812883842736483, "learning_rate": 0.03261203602267143, "loss": 0.1724, "num_input_tokens_seen": 27228656, "step": 31445 }, { "epoch": 14.827911362564828, "grad_norm": 0.005029674619436264, "learning_rate": 0.03257537432688966, "loss": 0.2359, "num_input_tokens_seen": 27232704, "step": 31450 }, { "epoch": 14.83026874115983, "grad_norm": 0.005447214934974909, "learning_rate": 0.03253873073946077, "loss": 0.2082, "num_input_tokens_seen": 27236640, "step": 31455 }, { "epoch": 14.832626119754833, "grad_norm": 0.0024080912116914988, "learning_rate": 0.03250210526603572, "loss": 0.1413, "num_input_tokens_seen": 27240704, "step": 31460 }, { "epoch": 14.834983498349835, "grad_norm": 0.008191054686903954, "learning_rate": 0.03246549791226266, "loss": 0.1989, "num_input_tokens_seen": 27244944, "step": 31465 }, { "epoch": 14.837340876944838, "grad_norm": 0.00554154347628355, "learning_rate": 0.03242890868378679, "loss": 0.1632, "num_input_tokens_seen": 27249680, "step": 31470 }, { "epoch": 14.83969825553984, "grad_norm": 0.004086107946932316, "learning_rate": 0.03239233758625074, "loss": 0.1764, "num_input_tokens_seen": 27254688, "step": 31475 }, { "epoch": 14.842055634134843, "grad_norm": 0.0054254597052931786, "learning_rate": 0.032355784625294204, "loss": 0.1736, "num_input_tokens_seen": 27258448, "step": 31480 }, { "epoch": 14.844413012729845, "grad_norm": 0.009827401489019394, "learning_rate": 0.03231924980655402, "loss": 0.2116, "num_input_tokens_seen": 27262848, "step": 31485 }, { "epoch": 14.846770391324847, "grad_norm": 0.005223838612437248, "learning_rate": 0.032282733135664446, "loss": 0.1797, "num_input_tokens_seen": 27267520, "step": 31490 }, { "epoch": 14.84912776991985, "grad_norm": 0.006793577689677477, "learning_rate": 0.03224623461825669, "loss": 0.1354, "num_input_tokens_seen": 27271456, "step": 31495 }, { "epoch": 14.851485148514852, "grad_norm": 0.004696226213127375, "learning_rate": 0.03220975425995937, "loss": 0.1191, "num_input_tokens_seen": 27275360, "step": 31500 }, { "epoch": 14.853842527109855, "grad_norm": 0.005338048562407494, "learning_rate": 0.032173292066398206, "loss": 0.2282, "num_input_tokens_seen": 27279312, "step": 31505 }, { "epoch": 14.856199905704855, "grad_norm": 0.0020173415541648865, "learning_rate": 0.03213684804319606, "loss": 0.1578, "num_input_tokens_seen": 27284432, "step": 31510 }, { "epoch": 14.85855728429986, "grad_norm": 0.007944039069116116, "learning_rate": 0.03210042219597312, "loss": 0.144, "num_input_tokens_seen": 27288432, "step": 31515 }, { "epoch": 14.86091466289486, "grad_norm": 0.004040560685098171, "learning_rate": 0.03206401453034675, "loss": 0.1777, "num_input_tokens_seen": 27293024, "step": 31520 }, { "epoch": 14.863272041489862, "grad_norm": 0.00574663607403636, "learning_rate": 0.03202762505193136, "loss": 0.1726, "num_input_tokens_seen": 27297424, "step": 31525 }, { "epoch": 14.865629420084865, "grad_norm": 0.005537412595003843, "learning_rate": 0.031991253766338754, "loss": 0.1859, "num_input_tokens_seen": 27301696, "step": 31530 }, { "epoch": 14.867986798679867, "grad_norm": 0.004909951239824295, "learning_rate": 0.03195490067917778, "loss": 0.1735, "num_input_tokens_seen": 27306608, "step": 31535 }, { "epoch": 14.87034417727487, "grad_norm": 0.005349540151655674, "learning_rate": 0.03191856579605461, "loss": 0.1299, "num_input_tokens_seen": 27310464, "step": 31540 }, { "epoch": 14.872701555869872, "grad_norm": 0.00814803596585989, "learning_rate": 0.031882249122572454, "loss": 0.1749, "num_input_tokens_seen": 27314992, "step": 31545 }, { "epoch": 14.875058934464874, "grad_norm": 0.004097393248230219, "learning_rate": 0.03184595066433188, "loss": 0.1773, "num_input_tokens_seen": 27319104, "step": 31550 }, { "epoch": 14.877416313059877, "grad_norm": 0.01075207069516182, "learning_rate": 0.03180967042693049, "loss": 0.2199, "num_input_tokens_seen": 27323168, "step": 31555 }, { "epoch": 14.87977369165488, "grad_norm": 0.0035620720591396093, "learning_rate": 0.03177340841596323, "loss": 0.2038, "num_input_tokens_seen": 27327408, "step": 31560 }, { "epoch": 14.882131070249882, "grad_norm": 0.005206974223256111, "learning_rate": 0.03173716463702209, "loss": 0.1774, "num_input_tokens_seen": 27331456, "step": 31565 }, { "epoch": 14.884488448844884, "grad_norm": 0.008503629826009274, "learning_rate": 0.03170093909569638, "loss": 0.201, "num_input_tokens_seen": 27335168, "step": 31570 }, { "epoch": 14.886845827439886, "grad_norm": 0.005828293971717358, "learning_rate": 0.03166473179757246, "loss": 0.1851, "num_input_tokens_seen": 27339392, "step": 31575 }, { "epoch": 14.889203206034889, "grad_norm": 0.004316146485507488, "learning_rate": 0.031628542748234005, "loss": 0.1419, "num_input_tokens_seen": 27343776, "step": 31580 }, { "epoch": 14.891560584629891, "grad_norm": 0.005087959114462137, "learning_rate": 0.03159237195326184, "loss": 0.1946, "num_input_tokens_seen": 27348016, "step": 31585 }, { "epoch": 14.893917963224894, "grad_norm": 0.007090835366398096, "learning_rate": 0.031556219418233875, "loss": 0.215, "num_input_tokens_seen": 27352528, "step": 31590 }, { "epoch": 14.896275341819896, "grad_norm": 0.006120446603745222, "learning_rate": 0.03152008514872533, "loss": 0.172, "num_input_tokens_seen": 27357856, "step": 31595 }, { "epoch": 14.898632720414899, "grad_norm": 0.00725463917478919, "learning_rate": 0.03148396915030862, "loss": 0.1798, "num_input_tokens_seen": 27362144, "step": 31600 }, { "epoch": 14.898632720414899, "eval_loss": 0.22433394193649292, "eval_runtime": 21.916, "eval_samples_per_second": 43.028, "eval_steps_per_second": 21.537, "num_input_tokens_seen": 27362144, "step": 31600 }, { "epoch": 14.900990099009901, "grad_norm": 0.010752711445093155, "learning_rate": 0.03144787142855318, "loss": 0.1813, "num_input_tokens_seen": 27366016, "step": 31605 }, { "epoch": 14.903347477604903, "grad_norm": 0.005783167667686939, "learning_rate": 0.031411791989025835, "loss": 0.2004, "num_input_tokens_seen": 27370320, "step": 31610 }, { "epoch": 14.905704856199906, "grad_norm": 0.008216471411287785, "learning_rate": 0.031375730837290394, "loss": 0.1683, "num_input_tokens_seen": 27374096, "step": 31615 }, { "epoch": 14.908062234794908, "grad_norm": 0.006392409093677998, "learning_rate": 0.031339687978908015, "loss": 0.1438, "num_input_tokens_seen": 27377984, "step": 31620 }, { "epoch": 14.91041961338991, "grad_norm": 0.00805665273219347, "learning_rate": 0.03130366341943694, "loss": 0.2728, "num_input_tokens_seen": 27382368, "step": 31625 }, { "epoch": 14.912776991984913, "grad_norm": 0.006121763028204441, "learning_rate": 0.031267657164432555, "loss": 0.1722, "num_input_tokens_seen": 27386832, "step": 31630 }, { "epoch": 14.915134370579915, "grad_norm": 0.010816319845616817, "learning_rate": 0.03123166921944752, "loss": 0.2638, "num_input_tokens_seen": 27390736, "step": 31635 }, { "epoch": 14.917491749174918, "grad_norm": 0.005303744226694107, "learning_rate": 0.031195699590031666, "loss": 0.1103, "num_input_tokens_seen": 27394976, "step": 31640 }, { "epoch": 14.91984912776992, "grad_norm": 0.007325414568185806, "learning_rate": 0.031159748281731885, "loss": 0.1846, "num_input_tokens_seen": 27399840, "step": 31645 }, { "epoch": 14.922206506364923, "grad_norm": 0.0028772037476301193, "learning_rate": 0.031123815300092394, "loss": 0.1757, "num_input_tokens_seen": 27404400, "step": 31650 }, { "epoch": 14.924563884959925, "grad_norm": 0.005404707510024309, "learning_rate": 0.031087900650654424, "loss": 0.1953, "num_input_tokens_seen": 27408176, "step": 31655 }, { "epoch": 14.926921263554927, "grad_norm": 0.007426152937114239, "learning_rate": 0.031052004338956534, "loss": 0.1566, "num_input_tokens_seen": 27412720, "step": 31660 }, { "epoch": 14.92927864214993, "grad_norm": 0.007629053201526403, "learning_rate": 0.031016126370534407, "loss": 0.1839, "num_input_tokens_seen": 27416688, "step": 31665 }, { "epoch": 14.931636020744932, "grad_norm": 0.0050683533772826195, "learning_rate": 0.030980266750920804, "loss": 0.1493, "num_input_tokens_seen": 27420976, "step": 31670 }, { "epoch": 14.933993399339935, "grad_norm": 0.009494072757661343, "learning_rate": 0.030944425485645747, "loss": 0.1675, "num_input_tokens_seen": 27425088, "step": 31675 }, { "epoch": 14.936350777934937, "grad_norm": 0.007550706155598164, "learning_rate": 0.03090860258023647, "loss": 0.1449, "num_input_tokens_seen": 27429952, "step": 31680 }, { "epoch": 14.93870815652994, "grad_norm": 0.003941238857805729, "learning_rate": 0.030872798040217236, "loss": 0.1754, "num_input_tokens_seen": 27434144, "step": 31685 }, { "epoch": 14.941065535124942, "grad_norm": 0.006010878831148148, "learning_rate": 0.03083701187110964, "loss": 0.1578, "num_input_tokens_seen": 27438640, "step": 31690 }, { "epoch": 14.943422913719944, "grad_norm": 0.0045889937318861485, "learning_rate": 0.030801244078432294, "loss": 0.1354, "num_input_tokens_seen": 27444000, "step": 31695 }, { "epoch": 14.945780292314947, "grad_norm": 0.004919214174151421, "learning_rate": 0.030765494667701024, "loss": 0.1437, "num_input_tokens_seen": 27448160, "step": 31700 }, { "epoch": 14.948137670909949, "grad_norm": 0.008427339605987072, "learning_rate": 0.030729763644428913, "loss": 0.1554, "num_input_tokens_seen": 27453040, "step": 31705 }, { "epoch": 14.950495049504951, "grad_norm": 0.010007768869400024, "learning_rate": 0.030694051014126048, "loss": 0.171, "num_input_tokens_seen": 27457280, "step": 31710 }, { "epoch": 14.952852428099952, "grad_norm": 0.00824948213994503, "learning_rate": 0.030658356782299792, "loss": 0.2098, "num_input_tokens_seen": 27461136, "step": 31715 }, { "epoch": 14.955209806694956, "grad_norm": 0.010132517665624619, "learning_rate": 0.030622680954454726, "loss": 0.143, "num_input_tokens_seen": 27465216, "step": 31720 }, { "epoch": 14.957567185289957, "grad_norm": 0.009320967830717564, "learning_rate": 0.030587023536092398, "loss": 0.1813, "num_input_tokens_seen": 27470448, "step": 31725 }, { "epoch": 14.95992456388496, "grad_norm": 0.006410930305719376, "learning_rate": 0.03055138453271171, "loss": 0.1967, "num_input_tokens_seen": 27474912, "step": 31730 }, { "epoch": 14.962281942479962, "grad_norm": 0.0030152248218655586, "learning_rate": 0.03051576394980858, "loss": 0.1735, "num_input_tokens_seen": 27479776, "step": 31735 }, { "epoch": 14.964639321074964, "grad_norm": 0.004947801120579243, "learning_rate": 0.030480161792876187, "loss": 0.114, "num_input_tokens_seen": 27484400, "step": 31740 }, { "epoch": 14.966996699669966, "grad_norm": 0.008130255155265331, "learning_rate": 0.030444578067404846, "loss": 0.1892, "num_input_tokens_seen": 27488912, "step": 31745 }, { "epoch": 14.969354078264969, "grad_norm": 0.006962026469409466, "learning_rate": 0.030409012778881975, "loss": 0.1437, "num_input_tokens_seen": 27493024, "step": 31750 }, { "epoch": 14.971711456859971, "grad_norm": 0.00539055373519659, "learning_rate": 0.030373465932792235, "loss": 0.18, "num_input_tokens_seen": 27497456, "step": 31755 }, { "epoch": 14.974068835454974, "grad_norm": 0.00877026654779911, "learning_rate": 0.030337937534617342, "loss": 0.2359, "num_input_tokens_seen": 27501968, "step": 31760 }, { "epoch": 14.976426214049976, "grad_norm": 0.0077596926130354404, "learning_rate": 0.030302427589836277, "loss": 0.2131, "num_input_tokens_seen": 27505936, "step": 31765 }, { "epoch": 14.978783592644978, "grad_norm": 0.004632691852748394, "learning_rate": 0.030266936103925095, "loss": 0.1598, "num_input_tokens_seen": 27510208, "step": 31770 }, { "epoch": 14.98114097123998, "grad_norm": 0.009307975880801678, "learning_rate": 0.030231463082356982, "loss": 0.2037, "num_input_tokens_seen": 27514640, "step": 31775 }, { "epoch": 14.983498349834983, "grad_norm": 0.003248799592256546, "learning_rate": 0.030196008530602367, "loss": 0.0909, "num_input_tokens_seen": 27519344, "step": 31780 }, { "epoch": 14.985855728429986, "grad_norm": 0.006391605362296104, "learning_rate": 0.030160572454128842, "loss": 0.2045, "num_input_tokens_seen": 27523408, "step": 31785 }, { "epoch": 14.988213107024988, "grad_norm": 0.006999519187957048, "learning_rate": 0.03012515485840098, "loss": 0.2363, "num_input_tokens_seen": 27528176, "step": 31790 }, { "epoch": 14.99057048561999, "grad_norm": 0.005662770010530949, "learning_rate": 0.030089755748880734, "loss": 0.1688, "num_input_tokens_seen": 27532640, "step": 31795 }, { "epoch": 14.992927864214993, "grad_norm": 0.004766967613250017, "learning_rate": 0.030054375131027003, "loss": 0.157, "num_input_tokens_seen": 27536992, "step": 31800 }, { "epoch": 14.992927864214993, "eval_loss": 0.21607425808906555, "eval_runtime": 21.9233, "eval_samples_per_second": 43.014, "eval_steps_per_second": 21.53, "num_input_tokens_seen": 27536992, "step": 31800 }, { "epoch": 14.995285242809995, "grad_norm": 0.0028982185758650303, "learning_rate": 0.030019013010295942, "loss": 0.2854, "num_input_tokens_seen": 27541840, "step": 31805 }, { "epoch": 14.997642621404998, "grad_norm": 0.0073151970282197, "learning_rate": 0.029983669392140897, "loss": 0.209, "num_input_tokens_seen": 27546128, "step": 31810 }, { "epoch": 15.0, "grad_norm": 0.007877274416387081, "learning_rate": 0.029948344282012217, "loss": 0.2426, "num_input_tokens_seen": 27550432, "step": 31815 }, { "epoch": 15.002357378595002, "grad_norm": 0.003405193565413356, "learning_rate": 0.029913037685357507, "loss": 0.1081, "num_input_tokens_seen": 27554720, "step": 31820 }, { "epoch": 15.004714757190005, "grad_norm": 0.002649395726621151, "learning_rate": 0.029877749607621528, "loss": 0.1458, "num_input_tokens_seen": 27559920, "step": 31825 }, { "epoch": 15.007072135785007, "grad_norm": 0.00851258635520935, "learning_rate": 0.029842480054246077, "loss": 0.1222, "num_input_tokens_seen": 27563696, "step": 31830 }, { "epoch": 15.00942951438001, "grad_norm": 0.005291992798447609, "learning_rate": 0.02980722903067022, "loss": 0.1369, "num_input_tokens_seen": 27567888, "step": 31835 }, { "epoch": 15.011786892975012, "grad_norm": 0.005410695448517799, "learning_rate": 0.029771996542330113, "loss": 0.1323, "num_input_tokens_seen": 27572704, "step": 31840 }, { "epoch": 15.014144271570014, "grad_norm": 0.0050464291125535965, "learning_rate": 0.029736782594658954, "loss": 0.1253, "num_input_tokens_seen": 27576832, "step": 31845 }, { "epoch": 15.016501650165017, "grad_norm": 0.008322654291987419, "learning_rate": 0.029701587193087284, "loss": 0.2082, "num_input_tokens_seen": 27581008, "step": 31850 }, { "epoch": 15.01885902876002, "grad_norm": 0.0066838338971138, "learning_rate": 0.0296664103430426, "loss": 0.1698, "num_input_tokens_seen": 27585152, "step": 31855 }, { "epoch": 15.021216407355022, "grad_norm": 0.005041365046054125, "learning_rate": 0.029631252049949652, "loss": 0.1442, "num_input_tokens_seen": 27590336, "step": 31860 }, { "epoch": 15.023573785950024, "grad_norm": 0.006057805847376585, "learning_rate": 0.02959611231923031, "loss": 0.1657, "num_input_tokens_seen": 27594208, "step": 31865 }, { "epoch": 15.025931164545026, "grad_norm": 0.007288984954357147, "learning_rate": 0.029560991156303507, "loss": 0.156, "num_input_tokens_seen": 27598272, "step": 31870 }, { "epoch": 15.028288543140029, "grad_norm": 0.008319100365042686, "learning_rate": 0.02952588856658544, "loss": 0.1271, "num_input_tokens_seen": 27602064, "step": 31875 }, { "epoch": 15.030645921735031, "grad_norm": 0.00371488812379539, "learning_rate": 0.029490804555489296, "loss": 0.0898, "num_input_tokens_seen": 27605952, "step": 31880 }, { "epoch": 15.033003300330034, "grad_norm": 0.011071549728512764, "learning_rate": 0.029455739128425484, "loss": 0.1455, "num_input_tokens_seen": 27611088, "step": 31885 }, { "epoch": 15.035360678925036, "grad_norm": 0.0043593840673565865, "learning_rate": 0.029420692290801607, "loss": 0.1016, "num_input_tokens_seen": 27615952, "step": 31890 }, { "epoch": 15.037718057520038, "grad_norm": 0.0076629482209682465, "learning_rate": 0.02938566404802223, "loss": 0.1078, "num_input_tokens_seen": 27620432, "step": 31895 }, { "epoch": 15.04007543611504, "grad_norm": 0.006147939246147871, "learning_rate": 0.029350654405489195, "loss": 0.1657, "num_input_tokens_seen": 27624064, "step": 31900 }, { "epoch": 15.042432814710043, "grad_norm": 0.0037718666717410088, "learning_rate": 0.02931566336860145, "loss": 0.1231, "num_input_tokens_seen": 27628128, "step": 31905 }, { "epoch": 15.044790193305046, "grad_norm": 0.005786620546132326, "learning_rate": 0.02928069094275505, "loss": 0.1073, "num_input_tokens_seen": 27631920, "step": 31910 }, { "epoch": 15.047147571900048, "grad_norm": 0.0016655907966196537, "learning_rate": 0.02924573713334314, "loss": 0.1429, "num_input_tokens_seen": 27636032, "step": 31915 }, { "epoch": 15.049504950495049, "grad_norm": 0.008867889642715454, "learning_rate": 0.02921080194575603, "loss": 0.2735, "num_input_tokens_seen": 27639744, "step": 31920 }, { "epoch": 15.051862329090051, "grad_norm": 0.007612468209117651, "learning_rate": 0.029175885385381177, "loss": 0.1245, "num_input_tokens_seen": 27644064, "step": 31925 }, { "epoch": 15.054219707685053, "grad_norm": 0.005120874848216772, "learning_rate": 0.029140987457603223, "loss": 0.2033, "num_input_tokens_seen": 27648528, "step": 31930 }, { "epoch": 15.056577086280056, "grad_norm": 0.002650020644068718, "learning_rate": 0.029106108167803763, "loss": 0.1372, "num_input_tokens_seen": 27653072, "step": 31935 }, { "epoch": 15.058934464875058, "grad_norm": 0.005455195438116789, "learning_rate": 0.029071247521361674, "loss": 0.1963, "num_input_tokens_seen": 27657168, "step": 31940 }, { "epoch": 15.06129184347006, "grad_norm": 0.00237372238188982, "learning_rate": 0.029036405523652945, "loss": 0.0802, "num_input_tokens_seen": 27661248, "step": 31945 }, { "epoch": 15.063649222065063, "grad_norm": 0.009384174831211567, "learning_rate": 0.029001582180050577, "loss": 0.1473, "num_input_tokens_seen": 27665536, "step": 31950 }, { "epoch": 15.066006600660065, "grad_norm": 0.008022339083254337, "learning_rate": 0.02896677749592482, "loss": 0.2257, "num_input_tokens_seen": 27669392, "step": 31955 }, { "epoch": 15.068363979255068, "grad_norm": 0.009980757720768452, "learning_rate": 0.028931991476642938, "loss": 0.1437, "num_input_tokens_seen": 27673216, "step": 31960 }, { "epoch": 15.07072135785007, "grad_norm": 0.007686107885092497, "learning_rate": 0.028897224127569412, "loss": 0.1695, "num_input_tokens_seen": 27677744, "step": 31965 }, { "epoch": 15.073078736445073, "grad_norm": 0.006691863760352135, "learning_rate": 0.028862475454065832, "loss": 0.158, "num_input_tokens_seen": 27681504, "step": 31970 }, { "epoch": 15.075436115040075, "grad_norm": 0.008031461387872696, "learning_rate": 0.028827745461490806, "loss": 0.1719, "num_input_tokens_seen": 27685712, "step": 31975 }, { "epoch": 15.077793493635077, "grad_norm": 0.012139732018113136, "learning_rate": 0.028793034155200212, "loss": 0.1486, "num_input_tokens_seen": 27690000, "step": 31980 }, { "epoch": 15.08015087223008, "grad_norm": 0.010516410693526268, "learning_rate": 0.028758341540546944, "loss": 0.245, "num_input_tokens_seen": 27693968, "step": 31985 }, { "epoch": 15.082508250825082, "grad_norm": 0.005717732012271881, "learning_rate": 0.02872366762288098, "loss": 0.1468, "num_input_tokens_seen": 27698544, "step": 31990 }, { "epoch": 15.084865629420085, "grad_norm": 0.00811387225985527, "learning_rate": 0.028689012407549567, "loss": 0.1458, "num_input_tokens_seen": 27703424, "step": 31995 }, { "epoch": 15.087223008015087, "grad_norm": 0.006718137301504612, "learning_rate": 0.028654375899896892, "loss": 0.1691, "num_input_tokens_seen": 27707728, "step": 32000 }, { "epoch": 15.087223008015087, "eval_loss": 0.2260258048772812, "eval_runtime": 21.9299, "eval_samples_per_second": 43.001, "eval_steps_per_second": 21.523, "num_input_tokens_seen": 27707728, "step": 32000 }, { "epoch": 15.08958038661009, "grad_norm": 0.007443042006343603, "learning_rate": 0.02861975810526437, "loss": 0.1584, "num_input_tokens_seen": 27712640, "step": 32005 }, { "epoch": 15.091937765205092, "grad_norm": 0.009210008196532726, "learning_rate": 0.02858515902899056, "loss": 0.1056, "num_input_tokens_seen": 27717984, "step": 32010 }, { "epoch": 15.094295143800094, "grad_norm": 0.011758270673453808, "learning_rate": 0.028550578676410976, "loss": 0.146, "num_input_tokens_seen": 27722592, "step": 32015 }, { "epoch": 15.096652522395097, "grad_norm": 0.005912365857511759, "learning_rate": 0.02851601705285837, "loss": 0.1376, "num_input_tokens_seen": 27727152, "step": 32020 }, { "epoch": 15.099009900990099, "grad_norm": 0.006046711932867765, "learning_rate": 0.028481474163662666, "loss": 0.1638, "num_input_tokens_seen": 27730688, "step": 32025 }, { "epoch": 15.101367279585101, "grad_norm": 0.007585310842841864, "learning_rate": 0.028446950014150683, "loss": 0.1222, "num_input_tokens_seen": 27736304, "step": 32030 }, { "epoch": 15.103724658180104, "grad_norm": 0.011047706007957458, "learning_rate": 0.028412444609646596, "loss": 0.208, "num_input_tokens_seen": 27740560, "step": 32035 }, { "epoch": 15.106082036775106, "grad_norm": 0.0077481744810938835, "learning_rate": 0.028377957955471465, "loss": 0.1224, "num_input_tokens_seen": 27745744, "step": 32040 }, { "epoch": 15.108439415370109, "grad_norm": 0.007697937544435263, "learning_rate": 0.0283434900569436, "loss": 0.1679, "num_input_tokens_seen": 27749568, "step": 32045 }, { "epoch": 15.110796793965111, "grad_norm": 0.00943811982870102, "learning_rate": 0.028309040919378456, "loss": 0.1606, "num_input_tokens_seen": 27754656, "step": 32050 }, { "epoch": 15.113154172560114, "grad_norm": 0.009678568691015244, "learning_rate": 0.02827461054808848, "loss": 0.1234, "num_input_tokens_seen": 27759248, "step": 32055 }, { "epoch": 15.115511551155116, "grad_norm": 0.013338626362383366, "learning_rate": 0.028240198948383186, "loss": 0.1475, "num_input_tokens_seen": 27763824, "step": 32060 }, { "epoch": 15.117868929750118, "grad_norm": 0.007759434171020985, "learning_rate": 0.028205806125569402, "loss": 0.1533, "num_input_tokens_seen": 27768528, "step": 32065 }, { "epoch": 15.12022630834512, "grad_norm": 0.005333269480615854, "learning_rate": 0.028171432084950834, "loss": 0.1024, "num_input_tokens_seen": 27772736, "step": 32070 }, { "epoch": 15.122583686940123, "grad_norm": 0.013363608159124851, "learning_rate": 0.028137076831828478, "loss": 0.2082, "num_input_tokens_seen": 27776800, "step": 32075 }, { "epoch": 15.124941065535126, "grad_norm": 0.014401925727725029, "learning_rate": 0.028102740371500238, "loss": 0.1827, "num_input_tokens_seen": 27781232, "step": 32080 }, { "epoch": 15.127298444130128, "grad_norm": 0.0119165712967515, "learning_rate": 0.0280684227092613, "loss": 0.1402, "num_input_tokens_seen": 27785968, "step": 32085 }, { "epoch": 15.12965582272513, "grad_norm": 0.00829023215919733, "learning_rate": 0.02803412385040392, "loss": 0.1435, "num_input_tokens_seen": 27791472, "step": 32090 }, { "epoch": 15.132013201320133, "grad_norm": 0.004531940910965204, "learning_rate": 0.027999843800217306, "loss": 0.13, "num_input_tokens_seen": 27796064, "step": 32095 }, { "epoch": 15.134370579915135, "grad_norm": 0.007127948570996523, "learning_rate": 0.027965582563987932, "loss": 0.129, "num_input_tokens_seen": 27801072, "step": 32100 }, { "epoch": 15.136727958510138, "grad_norm": 0.012917138636112213, "learning_rate": 0.027931340146999346, "loss": 0.1769, "num_input_tokens_seen": 27805344, "step": 32105 }, { "epoch": 15.13908533710514, "grad_norm": 0.01270768791437149, "learning_rate": 0.02789711655453208, "loss": 0.1435, "num_input_tokens_seen": 27808880, "step": 32110 }, { "epoch": 15.14144271570014, "grad_norm": 0.007484283298254013, "learning_rate": 0.02786291179186392, "loss": 0.1309, "num_input_tokens_seen": 27813120, "step": 32115 }, { "epoch": 15.143800094295143, "grad_norm": 0.0063737197779119015, "learning_rate": 0.02782872586426961, "loss": 0.1326, "num_input_tokens_seen": 27816880, "step": 32120 }, { "epoch": 15.146157472890145, "grad_norm": 0.011078761890530586, "learning_rate": 0.027794558777021083, "loss": 0.176, "num_input_tokens_seen": 27820976, "step": 32125 }, { "epoch": 15.148514851485148, "grad_norm": 0.010743359103798866, "learning_rate": 0.02776041053538734, "loss": 0.1915, "num_input_tokens_seen": 27824944, "step": 32130 }, { "epoch": 15.15087223008015, "grad_norm": 0.006372596602886915, "learning_rate": 0.027726281144634407, "loss": 0.1796, "num_input_tokens_seen": 27830144, "step": 32135 }, { "epoch": 15.153229608675153, "grad_norm": 0.0029856194742023945, "learning_rate": 0.02769217061002552, "loss": 0.1138, "num_input_tokens_seen": 27834496, "step": 32140 }, { "epoch": 15.155586987270155, "grad_norm": 0.009944400750100613, "learning_rate": 0.027658078936820967, "loss": 0.1804, "num_input_tokens_seen": 27838944, "step": 32145 }, { "epoch": 15.157944365865157, "grad_norm": 0.008034724742174149, "learning_rate": 0.02762400613027805, "loss": 0.143, "num_input_tokens_seen": 27842752, "step": 32150 }, { "epoch": 15.16030174446016, "grad_norm": 0.00780272064730525, "learning_rate": 0.027589952195651295, "loss": 0.1631, "num_input_tokens_seen": 27846848, "step": 32155 }, { "epoch": 15.162659123055162, "grad_norm": 0.00837457925081253, "learning_rate": 0.027555917138192186, "loss": 0.1466, "num_input_tokens_seen": 27850992, "step": 32160 }, { "epoch": 15.165016501650165, "grad_norm": 0.0036268997937440872, "learning_rate": 0.027521900963149375, "loss": 0.117, "num_input_tokens_seen": 27854992, "step": 32165 }, { "epoch": 15.167373880245167, "grad_norm": 0.008317976258695126, "learning_rate": 0.027487903675768633, "loss": 0.1558, "num_input_tokens_seen": 27859584, "step": 32170 }, { "epoch": 15.16973125884017, "grad_norm": 0.005387894809246063, "learning_rate": 0.027453925281292677, "loss": 0.1251, "num_input_tokens_seen": 27864304, "step": 32175 }, { "epoch": 15.172088637435172, "grad_norm": 0.010136988945305347, "learning_rate": 0.027419965784961475, "loss": 0.1449, "num_input_tokens_seen": 27868416, "step": 32180 }, { "epoch": 15.174446016030174, "grad_norm": 0.006001410540193319, "learning_rate": 0.027386025192012015, "loss": 0.075, "num_input_tokens_seen": 27873520, "step": 32185 }, { "epoch": 15.176803394625177, "grad_norm": 0.010405183769762516, "learning_rate": 0.027352103507678277, "loss": 0.1941, "num_input_tokens_seen": 27878128, "step": 32190 }, { "epoch": 15.179160773220179, "grad_norm": 0.005759806837886572, "learning_rate": 0.027318200737191527, "loss": 0.1769, "num_input_tokens_seen": 27881968, "step": 32195 }, { "epoch": 15.181518151815181, "grad_norm": 0.007023729383945465, "learning_rate": 0.027284316885779935, "loss": 0.119, "num_input_tokens_seen": 27886368, "step": 32200 }, { "epoch": 15.181518151815181, "eval_loss": 0.23457427322864532, "eval_runtime": 21.9257, "eval_samples_per_second": 43.009, "eval_steps_per_second": 21.527, "num_input_tokens_seen": 27886368, "step": 32200 }, { "epoch": 15.183875530410184, "grad_norm": 0.011687003076076508, "learning_rate": 0.027250451958668785, "loss": 0.1364, "num_input_tokens_seen": 27891920, "step": 32205 }, { "epoch": 15.186232909005186, "grad_norm": 0.012156837619841099, "learning_rate": 0.027216605961080536, "loss": 0.1366, "num_input_tokens_seen": 27896736, "step": 32210 }, { "epoch": 15.188590287600189, "grad_norm": 0.01599881239235401, "learning_rate": 0.02718277889823461, "loss": 0.1653, "num_input_tokens_seen": 27901440, "step": 32215 }, { "epoch": 15.190947666195191, "grad_norm": 0.00851558055728674, "learning_rate": 0.027148970775347604, "loss": 0.1242, "num_input_tokens_seen": 27906208, "step": 32220 }, { "epoch": 15.193305044790193, "grad_norm": 0.010315995663404465, "learning_rate": 0.027115181597633174, "loss": 0.1324, "num_input_tokens_seen": 27911120, "step": 32225 }, { "epoch": 15.195662423385196, "grad_norm": 0.007938310503959656, "learning_rate": 0.027081411370301976, "loss": 0.1201, "num_input_tokens_seen": 27915488, "step": 32230 }, { "epoch": 15.198019801980198, "grad_norm": 0.009246890433132648, "learning_rate": 0.027047660098561875, "loss": 0.1125, "num_input_tokens_seen": 27920528, "step": 32235 }, { "epoch": 15.2003771805752, "grad_norm": 0.012387375347316265, "learning_rate": 0.02701392778761766, "loss": 0.1834, "num_input_tokens_seen": 27925120, "step": 32240 }, { "epoch": 15.202734559170203, "grad_norm": 0.010826407000422478, "learning_rate": 0.02698021444267133, "loss": 0.1961, "num_input_tokens_seen": 27929072, "step": 32245 }, { "epoch": 15.205091937765205, "grad_norm": 0.009084058925509453, "learning_rate": 0.026946520068921915, "loss": 0.1845, "num_input_tokens_seen": 27932976, "step": 32250 }, { "epoch": 15.207449316360208, "grad_norm": 0.004440312273800373, "learning_rate": 0.02691284467156547, "loss": 0.0927, "num_input_tokens_seen": 27936912, "step": 32255 }, { "epoch": 15.20980669495521, "grad_norm": 0.005042764358222485, "learning_rate": 0.026879188255795182, "loss": 0.1043, "num_input_tokens_seen": 27941568, "step": 32260 }, { "epoch": 15.212164073550213, "grad_norm": 0.014170131646096706, "learning_rate": 0.026845550826801328, "loss": 0.1204, "num_input_tokens_seen": 27946512, "step": 32265 }, { "epoch": 15.214521452145215, "grad_norm": 0.0058250706642866135, "learning_rate": 0.02681193238977121, "loss": 0.0894, "num_input_tokens_seen": 27950832, "step": 32270 }, { "epoch": 15.216878830740217, "grad_norm": 0.01427895575761795, "learning_rate": 0.026778332949889145, "loss": 0.1988, "num_input_tokens_seen": 27955056, "step": 32275 }, { "epoch": 15.21923620933522, "grad_norm": 0.010294276289641857, "learning_rate": 0.026744752512336673, "loss": 0.1178, "num_input_tokens_seen": 27959280, "step": 32280 }, { "epoch": 15.221593587930222, "grad_norm": 0.008662639185786247, "learning_rate": 0.02671119108229225, "loss": 0.1252, "num_input_tokens_seen": 27963424, "step": 32285 }, { "epoch": 15.223950966525225, "grad_norm": 0.013079563155770302, "learning_rate": 0.026677648664931556, "loss": 0.1099, "num_input_tokens_seen": 27966928, "step": 32290 }, { "epoch": 15.226308345120227, "grad_norm": 0.02364530600607395, "learning_rate": 0.026644125265427154, "loss": 0.2131, "num_input_tokens_seen": 27971360, "step": 32295 }, { "epoch": 15.22866572371523, "grad_norm": 0.0031454747077077627, "learning_rate": 0.026610620888948822, "loss": 0.1227, "num_input_tokens_seen": 27975328, "step": 32300 }, { "epoch": 15.231023102310232, "grad_norm": 0.009328093379735947, "learning_rate": 0.026577135540663408, "loss": 0.1263, "num_input_tokens_seen": 27979840, "step": 32305 }, { "epoch": 15.233380480905234, "grad_norm": 0.008871783502399921, "learning_rate": 0.026543669225734673, "loss": 0.1742, "num_input_tokens_seen": 27984320, "step": 32310 }, { "epoch": 15.235737859500237, "grad_norm": 0.016636818647384644, "learning_rate": 0.02651022194932363, "loss": 0.1467, "num_input_tokens_seen": 27988320, "step": 32315 }, { "epoch": 15.238095238095237, "grad_norm": 0.01185997948050499, "learning_rate": 0.026476793716588194, "loss": 0.1516, "num_input_tokens_seen": 27992304, "step": 32320 }, { "epoch": 15.24045261669024, "grad_norm": 0.015266108326613903, "learning_rate": 0.026443384532683467, "loss": 0.255, "num_input_tokens_seen": 27996496, "step": 32325 }, { "epoch": 15.242809995285242, "grad_norm": 0.007138614542782307, "learning_rate": 0.026409994402761584, "loss": 0.1003, "num_input_tokens_seen": 28000576, "step": 32330 }, { "epoch": 15.245167373880244, "grad_norm": 0.007402620743960142, "learning_rate": 0.026376623331971653, "loss": 0.1026, "num_input_tokens_seen": 28005040, "step": 32335 }, { "epoch": 15.247524752475247, "grad_norm": 0.004511822015047073, "learning_rate": 0.026343271325459997, "loss": 0.1355, "num_input_tokens_seen": 28008960, "step": 32340 }, { "epoch": 15.24988213107025, "grad_norm": 0.007138450164347887, "learning_rate": 0.02630993838836987, "loss": 0.1322, "num_input_tokens_seen": 28013392, "step": 32345 }, { "epoch": 15.252239509665252, "grad_norm": 0.014267225749790668, "learning_rate": 0.026276624525841584, "loss": 0.1655, "num_input_tokens_seen": 28016720, "step": 32350 }, { "epoch": 15.254596888260254, "grad_norm": 0.01218926440924406, "learning_rate": 0.026243329743012637, "loss": 0.1167, "num_input_tokens_seen": 28021424, "step": 32355 }, { "epoch": 15.256954266855256, "grad_norm": 0.015085089951753616, "learning_rate": 0.026210054045017438, "loss": 0.1404, "num_input_tokens_seen": 28026336, "step": 32360 }, { "epoch": 15.259311645450259, "grad_norm": 0.005296699237078428, "learning_rate": 0.02617679743698755, "loss": 0.1618, "num_input_tokens_seen": 28030656, "step": 32365 }, { "epoch": 15.261669024045261, "grad_norm": 0.0031926780939102173, "learning_rate": 0.02614355992405158, "loss": 0.0835, "num_input_tokens_seen": 28035152, "step": 32370 }, { "epoch": 15.264026402640264, "grad_norm": 0.004572013393044472, "learning_rate": 0.026110341511335115, "loss": 0.1672, "num_input_tokens_seen": 28039632, "step": 32375 }, { "epoch": 15.266383781235266, "grad_norm": 0.006843676790595055, "learning_rate": 0.02607714220396093, "loss": 0.1061, "num_input_tokens_seen": 28043680, "step": 32380 }, { "epoch": 15.268741159830268, "grad_norm": 0.018468564376235008, "learning_rate": 0.02604396200704869, "loss": 0.2295, "num_input_tokens_seen": 28048432, "step": 32385 }, { "epoch": 15.27109853842527, "grad_norm": 0.011436278000473976, "learning_rate": 0.02601080092571523, "loss": 0.303, "num_input_tokens_seen": 28052128, "step": 32390 }, { "epoch": 15.273455917020273, "grad_norm": 0.012343303300440311, "learning_rate": 0.025977658965074455, "loss": 0.1626, "num_input_tokens_seen": 28056176, "step": 32395 }, { "epoch": 15.275813295615276, "grad_norm": 0.004792632535099983, "learning_rate": 0.02594453613023719, "loss": 0.1075, "num_input_tokens_seen": 28061984, "step": 32400 }, { "epoch": 15.275813295615276, "eval_loss": 0.23268653452396393, "eval_runtime": 21.8986, "eval_samples_per_second": 43.062, "eval_steps_per_second": 21.554, "num_input_tokens_seen": 28061984, "step": 32400 }, { "epoch": 15.278170674210278, "grad_norm": 0.010497204028069973, "learning_rate": 0.025911432426311443, "loss": 0.1392, "num_input_tokens_seen": 28066528, "step": 32405 }, { "epoch": 15.28052805280528, "grad_norm": 0.0029115735087543726, "learning_rate": 0.025878347858402234, "loss": 0.1236, "num_input_tokens_seen": 28070688, "step": 32410 }, { "epoch": 15.282885431400283, "grad_norm": 0.013748629949986935, "learning_rate": 0.025845282431611598, "loss": 0.2091, "num_input_tokens_seen": 28074704, "step": 32415 }, { "epoch": 15.285242809995285, "grad_norm": 0.018689455464482307, "learning_rate": 0.025812236151038608, "loss": 0.1809, "num_input_tokens_seen": 28078816, "step": 32420 }, { "epoch": 15.287600188590288, "grad_norm": 0.009783794172108173, "learning_rate": 0.025779209021779468, "loss": 0.0955, "num_input_tokens_seen": 28082736, "step": 32425 }, { "epoch": 15.28995756718529, "grad_norm": 0.008072003722190857, "learning_rate": 0.025746201048927324, "loss": 0.1087, "num_input_tokens_seen": 28087248, "step": 32430 }, { "epoch": 15.292314945780292, "grad_norm": 0.005828424356877804, "learning_rate": 0.025713212237572485, "loss": 0.0935, "num_input_tokens_seen": 28091744, "step": 32435 }, { "epoch": 15.294672324375295, "grad_norm": 0.013339896686375141, "learning_rate": 0.025680242592802164, "loss": 0.2137, "num_input_tokens_seen": 28095360, "step": 32440 }, { "epoch": 15.297029702970297, "grad_norm": 0.011880171485245228, "learning_rate": 0.02564729211970073, "loss": 0.1427, "num_input_tokens_seen": 28099888, "step": 32445 }, { "epoch": 15.2993870815653, "grad_norm": 0.005814662203192711, "learning_rate": 0.025614360823349617, "loss": 0.1569, "num_input_tokens_seen": 28103472, "step": 32450 }, { "epoch": 15.301744460160302, "grad_norm": 0.016350964084267616, "learning_rate": 0.025581448708827146, "loss": 0.1274, "num_input_tokens_seen": 28107680, "step": 32455 }, { "epoch": 15.304101838755304, "grad_norm": 0.01193738728761673, "learning_rate": 0.025548555781208876, "loss": 0.1875, "num_input_tokens_seen": 28111872, "step": 32460 }, { "epoch": 15.306459217350307, "grad_norm": 0.008230690844357014, "learning_rate": 0.02551568204556721, "loss": 0.1123, "num_input_tokens_seen": 28117280, "step": 32465 }, { "epoch": 15.30881659594531, "grad_norm": 0.007615272421389818, "learning_rate": 0.02548282750697173, "loss": 0.1059, "num_input_tokens_seen": 28121136, "step": 32470 }, { "epoch": 15.311173974540312, "grad_norm": 0.00945815909653902, "learning_rate": 0.02544999217048909, "loss": 0.2026, "num_input_tokens_seen": 28127248, "step": 32475 }, { "epoch": 15.313531353135314, "grad_norm": 0.013622204773128033, "learning_rate": 0.025417176041182793, "loss": 0.2165, "num_input_tokens_seen": 28131184, "step": 32480 }, { "epoch": 15.315888731730317, "grad_norm": 0.010845323093235493, "learning_rate": 0.025384379124113596, "loss": 0.1234, "num_input_tokens_seen": 28135248, "step": 32485 }, { "epoch": 15.318246110325319, "grad_norm": 0.011358315125107765, "learning_rate": 0.025351601424339124, "loss": 0.1676, "num_input_tokens_seen": 28139424, "step": 32490 }, { "epoch": 15.320603488920321, "grad_norm": 0.009925664402544498, "learning_rate": 0.025318842946914184, "loss": 0.1459, "num_input_tokens_seen": 28143776, "step": 32495 }, { "epoch": 15.322960867515324, "grad_norm": 0.005954282823950052, "learning_rate": 0.025286103696890494, "loss": 0.1412, "num_input_tokens_seen": 28148032, "step": 32500 }, { "epoch": 15.325318246110326, "grad_norm": 0.004367055371403694, "learning_rate": 0.025253383679316836, "loss": 0.1272, "num_input_tokens_seen": 28151888, "step": 32505 }, { "epoch": 15.327675624705329, "grad_norm": 0.008908172138035297, "learning_rate": 0.025220682899239077, "loss": 0.1631, "num_input_tokens_seen": 28155760, "step": 32510 }, { "epoch": 15.33003300330033, "grad_norm": 0.011793509125709534, "learning_rate": 0.02518800136170013, "loss": 0.1322, "num_input_tokens_seen": 28160560, "step": 32515 }, { "epoch": 15.332390381895332, "grad_norm": 0.00810137391090393, "learning_rate": 0.02515533907173981, "loss": 0.1744, "num_input_tokens_seen": 28164672, "step": 32520 }, { "epoch": 15.334747760490334, "grad_norm": 0.003926781006157398, "learning_rate": 0.025122696034395115, "loss": 0.1067, "num_input_tokens_seen": 28169056, "step": 32525 }, { "epoch": 15.337105139085336, "grad_norm": 0.008796206675469875, "learning_rate": 0.025090072254700023, "loss": 0.143, "num_input_tokens_seen": 28173040, "step": 32530 }, { "epoch": 15.339462517680339, "grad_norm": 0.009944225661456585, "learning_rate": 0.025057467737685468, "loss": 0.1069, "num_input_tokens_seen": 28176736, "step": 32535 }, { "epoch": 15.341819896275341, "grad_norm": 0.01030266098678112, "learning_rate": 0.025024882488379557, "loss": 0.1501, "num_input_tokens_seen": 28182240, "step": 32540 }, { "epoch": 15.344177274870344, "grad_norm": 0.013475790619850159, "learning_rate": 0.02499231651180727, "loss": 0.12, "num_input_tokens_seen": 28186784, "step": 32545 }, { "epoch": 15.346534653465346, "grad_norm": 0.012259495444595814, "learning_rate": 0.024959769812990713, "loss": 0.2102, "num_input_tokens_seen": 28191776, "step": 32550 }, { "epoch": 15.348892032060348, "grad_norm": 0.0028135532047599554, "learning_rate": 0.024927242396949045, "loss": 0.0531, "num_input_tokens_seen": 28195568, "step": 32555 }, { "epoch": 15.35124941065535, "grad_norm": 0.015003572218120098, "learning_rate": 0.02489473426869836, "loss": 0.1547, "num_input_tokens_seen": 28199488, "step": 32560 }, { "epoch": 15.353606789250353, "grad_norm": 0.011886035092175007, "learning_rate": 0.024862245433251776, "loss": 0.1522, "num_input_tokens_seen": 28204384, "step": 32565 }, { "epoch": 15.355964167845356, "grad_norm": 0.005336808506399393, "learning_rate": 0.024829775895619577, "loss": 0.176, "num_input_tokens_seen": 28208464, "step": 32570 }, { "epoch": 15.358321546440358, "grad_norm": 0.008342361077666283, "learning_rate": 0.024797325660808882, "loss": 0.1445, "num_input_tokens_seen": 28212000, "step": 32575 }, { "epoch": 15.36067892503536, "grad_norm": 0.00827929936349392, "learning_rate": 0.02476489473382401, "loss": 0.1003, "num_input_tokens_seen": 28216064, "step": 32580 }, { "epoch": 15.363036303630363, "grad_norm": 0.015575846657156944, "learning_rate": 0.024732483119666127, "loss": 0.1585, "num_input_tokens_seen": 28220320, "step": 32585 }, { "epoch": 15.365393682225365, "grad_norm": 0.006159981247037649, "learning_rate": 0.024700090823333548, "loss": 0.1183, "num_input_tokens_seen": 28224240, "step": 32590 }, { "epoch": 15.367751060820368, "grad_norm": 0.006392660550773144, "learning_rate": 0.02466771784982163, "loss": 0.1169, "num_input_tokens_seen": 28229184, "step": 32595 }, { "epoch": 15.37010843941537, "grad_norm": 0.015322172082960606, "learning_rate": 0.024635364204122594, "loss": 0.1668, "num_input_tokens_seen": 28233360, "step": 32600 }, { "epoch": 15.37010843941537, "eval_loss": 0.23528781533241272, "eval_runtime": 21.9257, "eval_samples_per_second": 43.009, "eval_steps_per_second": 21.527, "num_input_tokens_seen": 28233360, "step": 32600 }, { "epoch": 15.372465818010372, "grad_norm": 0.008008717559278011, "learning_rate": 0.024603029891225852, "loss": 0.1526, "num_input_tokens_seen": 28237472, "step": 32605 }, { "epoch": 15.374823196605375, "grad_norm": 0.018177973106503487, "learning_rate": 0.024570714916117748, "loss": 0.1215, "num_input_tokens_seen": 28242368, "step": 32610 }, { "epoch": 15.377180575200377, "grad_norm": 0.01232017669826746, "learning_rate": 0.024538419283781625, "loss": 0.1661, "num_input_tokens_seen": 28247520, "step": 32615 }, { "epoch": 15.37953795379538, "grad_norm": 0.012947475537657738, "learning_rate": 0.024506142999197938, "loss": 0.2171, "num_input_tokens_seen": 28251488, "step": 32620 }, { "epoch": 15.381895332390382, "grad_norm": 0.017344379797577858, "learning_rate": 0.024473886067344002, "loss": 0.1159, "num_input_tokens_seen": 28256240, "step": 32625 }, { "epoch": 15.384252710985384, "grad_norm": 0.010178913362324238, "learning_rate": 0.02444164849319434, "loss": 0.1642, "num_input_tokens_seen": 28260000, "step": 32630 }, { "epoch": 15.386610089580387, "grad_norm": 0.010834285989403725, "learning_rate": 0.024409430281720306, "loss": 0.1919, "num_input_tokens_seen": 28264800, "step": 32635 }, { "epoch": 15.38896746817539, "grad_norm": 0.008635144680738449, "learning_rate": 0.024377231437890428, "loss": 0.151, "num_input_tokens_seen": 28269808, "step": 32640 }, { "epoch": 15.391324846770392, "grad_norm": 0.011760484427213669, "learning_rate": 0.024345051966670115, "loss": 0.1578, "num_input_tokens_seen": 28273936, "step": 32645 }, { "epoch": 15.393682225365394, "grad_norm": 0.011929628439247608, "learning_rate": 0.024312891873021884, "loss": 0.2178, "num_input_tokens_seen": 28278048, "step": 32650 }, { "epoch": 15.396039603960396, "grad_norm": 0.008102137595415115, "learning_rate": 0.024280751161905183, "loss": 0.1768, "num_input_tokens_seen": 28282160, "step": 32655 }, { "epoch": 15.398396982555399, "grad_norm": 0.014904254116117954, "learning_rate": 0.02424862983827658, "loss": 0.1655, "num_input_tokens_seen": 28286256, "step": 32660 }, { "epoch": 15.400754361150401, "grad_norm": 0.010600886307656765, "learning_rate": 0.024216527907089495, "loss": 0.1873, "num_input_tokens_seen": 28290256, "step": 32665 }, { "epoch": 15.403111739745404, "grad_norm": 0.008769355714321136, "learning_rate": 0.024184445373294505, "loss": 0.1756, "num_input_tokens_seen": 28295280, "step": 32670 }, { "epoch": 15.405469118340406, "grad_norm": 0.009978369809687138, "learning_rate": 0.02415238224183918, "loss": 0.1877, "num_input_tokens_seen": 28299328, "step": 32675 }, { "epoch": 15.407826496935408, "grad_norm": 0.01037834957242012, "learning_rate": 0.024120338517667973, "loss": 0.1389, "num_input_tokens_seen": 28303296, "step": 32680 }, { "epoch": 15.41018387553041, "grad_norm": 0.004951676353812218, "learning_rate": 0.02408831420572247, "loss": 0.1269, "num_input_tokens_seen": 28307712, "step": 32685 }, { "epoch": 15.412541254125413, "grad_norm": 0.0045358226634562016, "learning_rate": 0.024056309310941264, "loss": 0.0969, "num_input_tokens_seen": 28311696, "step": 32690 }, { "epoch": 15.414898632720416, "grad_norm": 0.0031718246173113585, "learning_rate": 0.02402432383825982, "loss": 0.1114, "num_input_tokens_seen": 28316256, "step": 32695 }, { "epoch": 15.417256011315418, "grad_norm": 0.020901277661323547, "learning_rate": 0.023992357792610792, "loss": 0.1903, "num_input_tokens_seen": 28320960, "step": 32700 }, { "epoch": 15.41961338991042, "grad_norm": 0.012570049613714218, "learning_rate": 0.0239604111789237, "loss": 0.1551, "num_input_tokens_seen": 28325232, "step": 32705 }, { "epoch": 15.421970768505423, "grad_norm": 0.0062584588304162025, "learning_rate": 0.023928484002125095, "loss": 0.1091, "num_input_tokens_seen": 28329472, "step": 32710 }, { "epoch": 15.424328147100425, "grad_norm": 0.007957147434353828, "learning_rate": 0.023896576267138595, "loss": 0.1437, "num_input_tokens_seen": 28333232, "step": 32715 }, { "epoch": 15.426685525695426, "grad_norm": 0.014776704832911491, "learning_rate": 0.02386468797888471, "loss": 0.1503, "num_input_tokens_seen": 28337312, "step": 32720 }, { "epoch": 15.429042904290428, "grad_norm": 0.012422189116477966, "learning_rate": 0.023832819142281057, "loss": 0.2676, "num_input_tokens_seen": 28342336, "step": 32725 }, { "epoch": 15.43140028288543, "grad_norm": 0.008457721211016178, "learning_rate": 0.02380096976224225, "loss": 0.165, "num_input_tokens_seen": 28347264, "step": 32730 }, { "epoch": 15.433757661480433, "grad_norm": 0.014382749795913696, "learning_rate": 0.023769139843679777, "loss": 0.1552, "num_input_tokens_seen": 28350816, "step": 32735 }, { "epoch": 15.436115040075435, "grad_norm": 0.006935228127986193, "learning_rate": 0.023737329391502287, "loss": 0.1641, "num_input_tokens_seen": 28355712, "step": 32740 }, { "epoch": 15.438472418670438, "grad_norm": 0.011144048534333706, "learning_rate": 0.023705538410615293, "loss": 0.2076, "num_input_tokens_seen": 28359856, "step": 32745 }, { "epoch": 15.44082979726544, "grad_norm": 0.012775902636349201, "learning_rate": 0.023673766905921396, "loss": 0.1565, "num_input_tokens_seen": 28364464, "step": 32750 }, { "epoch": 15.443187175860443, "grad_norm": 0.0087564866989851, "learning_rate": 0.0236420148823202, "loss": 0.1157, "num_input_tokens_seen": 28368704, "step": 32755 }, { "epoch": 15.445544554455445, "grad_norm": 0.007694755215197802, "learning_rate": 0.02361028234470816, "loss": 0.1123, "num_input_tokens_seen": 28374096, "step": 32760 }, { "epoch": 15.447901933050447, "grad_norm": 0.016539813950657845, "learning_rate": 0.023578569297978913, "loss": 0.1311, "num_input_tokens_seen": 28380000, "step": 32765 }, { "epoch": 15.45025931164545, "grad_norm": 0.0044408938847482204, "learning_rate": 0.023546875747023025, "loss": 0.1436, "num_input_tokens_seen": 28383792, "step": 32770 }, { "epoch": 15.452616690240452, "grad_norm": 0.015383691526949406, "learning_rate": 0.02351520169672801, "loss": 0.1408, "num_input_tokens_seen": 28388000, "step": 32775 }, { "epoch": 15.454974068835455, "grad_norm": 0.013608130626380444, "learning_rate": 0.023483547151978357, "loss": 0.2037, "num_input_tokens_seen": 28392400, "step": 32780 }, { "epoch": 15.457331447430457, "grad_norm": 0.008210189640522003, "learning_rate": 0.023451912117655675, "loss": 0.1, "num_input_tokens_seen": 28396736, "step": 32785 }, { "epoch": 15.45968882602546, "grad_norm": 0.014253182336688042, "learning_rate": 0.023420296598638417, "loss": 0.1575, "num_input_tokens_seen": 28401616, "step": 32790 }, { "epoch": 15.462046204620462, "grad_norm": 0.013996430672705173, "learning_rate": 0.023388700599802165, "loss": 0.199, "num_input_tokens_seen": 28405552, "step": 32795 }, { "epoch": 15.464403583215464, "grad_norm": 0.0024401608388870955, "learning_rate": 0.023357124126019334, "loss": 0.1905, "num_input_tokens_seen": 28411200, "step": 32800 }, { "epoch": 15.464403583215464, "eval_loss": 0.22916005551815033, "eval_runtime": 21.9385, "eval_samples_per_second": 42.984, "eval_steps_per_second": 21.515, "num_input_tokens_seen": 28411200, "step": 32800 }, { "epoch": 15.466760961810467, "grad_norm": 0.01107438188046217, "learning_rate": 0.02332556718215945, "loss": 0.1225, "num_input_tokens_seen": 28414832, "step": 32805 }, { "epoch": 15.469118340405469, "grad_norm": 0.012400981970131397, "learning_rate": 0.023294029773089035, "loss": 0.1532, "num_input_tokens_seen": 28419440, "step": 32810 }, { "epoch": 15.471475719000471, "grad_norm": 0.015646561980247498, "learning_rate": 0.023262511903671484, "loss": 0.1513, "num_input_tokens_seen": 28423136, "step": 32815 }, { "epoch": 15.473833097595474, "grad_norm": 0.008530253544449806, "learning_rate": 0.023231013578767324, "loss": 0.1741, "num_input_tokens_seen": 28427616, "step": 32820 }, { "epoch": 15.476190476190476, "grad_norm": 0.00967363826930523, "learning_rate": 0.0231995348032339, "loss": 0.1868, "num_input_tokens_seen": 28432768, "step": 32825 }, { "epoch": 15.478547854785479, "grad_norm": 0.011692674830555916, "learning_rate": 0.023168075581925685, "loss": 0.2336, "num_input_tokens_seen": 28437488, "step": 32830 }, { "epoch": 15.480905233380481, "grad_norm": 0.01673063263297081, "learning_rate": 0.023136635919694126, "loss": 0.204, "num_input_tokens_seen": 28442176, "step": 32835 }, { "epoch": 15.483262611975483, "grad_norm": 0.012262137606739998, "learning_rate": 0.02310521582138753, "loss": 0.2024, "num_input_tokens_seen": 28446512, "step": 32840 }, { "epoch": 15.485619990570486, "grad_norm": 0.006570841651409864, "learning_rate": 0.023073815291851357, "loss": 0.1525, "num_input_tokens_seen": 28450288, "step": 32845 }, { "epoch": 15.487977369165488, "grad_norm": 0.002762504620477557, "learning_rate": 0.02304243433592788, "loss": 0.0825, "num_input_tokens_seen": 28454208, "step": 32850 }, { "epoch": 15.49033474776049, "grad_norm": 0.007456216495484114, "learning_rate": 0.023011072958456513, "loss": 0.1826, "num_input_tokens_seen": 28458736, "step": 32855 }, { "epoch": 15.492692126355493, "grad_norm": 0.0064378040842711926, "learning_rate": 0.022979731164273536, "loss": 0.1267, "num_input_tokens_seen": 28462528, "step": 32860 }, { "epoch": 15.495049504950495, "grad_norm": 0.008060024119913578, "learning_rate": 0.022948408958212218, "loss": 0.1064, "num_input_tokens_seen": 28467744, "step": 32865 }, { "epoch": 15.497406883545498, "grad_norm": 0.006059776991605759, "learning_rate": 0.022917106345102876, "loss": 0.1255, "num_input_tokens_seen": 28471824, "step": 32870 }, { "epoch": 15.4997642621405, "grad_norm": 0.006881760898977518, "learning_rate": 0.022885823329772785, "loss": 0.137, "num_input_tokens_seen": 28476288, "step": 32875 }, { "epoch": 15.502121640735503, "grad_norm": 0.022588232532143593, "learning_rate": 0.02285455991704612, "loss": 0.2013, "num_input_tokens_seen": 28481152, "step": 32880 }, { "epoch": 15.504479019330505, "grad_norm": 0.009498757310211658, "learning_rate": 0.022823316111744117, "loss": 0.1672, "num_input_tokens_seen": 28485920, "step": 32885 }, { "epoch": 15.506836397925507, "grad_norm": 0.008905145339667797, "learning_rate": 0.022792091918685014, "loss": 0.1441, "num_input_tokens_seen": 28490096, "step": 32890 }, { "epoch": 15.50919377652051, "grad_norm": 0.01259632222354412, "learning_rate": 0.022760887342683906, "loss": 0.1796, "num_input_tokens_seen": 28493312, "step": 32895 }, { "epoch": 15.511551155115512, "grad_norm": 0.006613368634134531, "learning_rate": 0.022729702388552975, "loss": 0.1888, "num_input_tokens_seen": 28497328, "step": 32900 }, { "epoch": 15.513908533710515, "grad_norm": 0.009593715891242027, "learning_rate": 0.022698537061101292, "loss": 0.1592, "num_input_tokens_seen": 28501648, "step": 32905 }, { "epoch": 15.516265912305517, "grad_norm": 0.01375671848654747, "learning_rate": 0.022667391365134962, "loss": 0.1351, "num_input_tokens_seen": 28505808, "step": 32910 }, { "epoch": 15.518623290900518, "grad_norm": 0.011229746043682098, "learning_rate": 0.022636265305457065, "loss": 0.1989, "num_input_tokens_seen": 28510592, "step": 32915 }, { "epoch": 15.520980669495522, "grad_norm": 0.00799809955060482, "learning_rate": 0.02260515888686764, "loss": 0.1701, "num_input_tokens_seen": 28514336, "step": 32920 }, { "epoch": 15.523338048090523, "grad_norm": 0.010452601127326488, "learning_rate": 0.022574072114163596, "loss": 0.1602, "num_input_tokens_seen": 28518512, "step": 32925 }, { "epoch": 15.525695426685525, "grad_norm": 0.0068704853765666485, "learning_rate": 0.022543004992139005, "loss": 0.1985, "num_input_tokens_seen": 28522544, "step": 32930 }, { "epoch": 15.528052805280527, "grad_norm": 0.0053809937089681625, "learning_rate": 0.022511957525584745, "loss": 0.1423, "num_input_tokens_seen": 28526352, "step": 32935 }, { "epoch": 15.53041018387553, "grad_norm": 0.009612223133444786, "learning_rate": 0.022480929719288778, "loss": 0.1964, "num_input_tokens_seen": 28531200, "step": 32940 }, { "epoch": 15.532767562470532, "grad_norm": 0.0064047579653561115, "learning_rate": 0.02244992157803592, "loss": 0.1792, "num_input_tokens_seen": 28535616, "step": 32945 }, { "epoch": 15.535124941065535, "grad_norm": 0.007627466693520546, "learning_rate": 0.022418933106608047, "loss": 0.1245, "num_input_tokens_seen": 28540560, "step": 32950 }, { "epoch": 15.537482319660537, "grad_norm": 0.005930561572313309, "learning_rate": 0.022387964309784018, "loss": 0.1853, "num_input_tokens_seen": 28544432, "step": 32955 }, { "epoch": 15.53983969825554, "grad_norm": 0.010000316426157951, "learning_rate": 0.022357015192339517, "loss": 0.2115, "num_input_tokens_seen": 28548528, "step": 32960 }, { "epoch": 15.542197076850542, "grad_norm": 0.0052168527618050575, "learning_rate": 0.02232608575904734, "loss": 0.1203, "num_input_tokens_seen": 28552448, "step": 32965 }, { "epoch": 15.544554455445544, "grad_norm": 0.010701922699809074, "learning_rate": 0.022295176014677225, "loss": 0.1806, "num_input_tokens_seen": 28557136, "step": 32970 }, { "epoch": 15.546911834040547, "grad_norm": 0.007220966275781393, "learning_rate": 0.02226428596399577, "loss": 0.1931, "num_input_tokens_seen": 28561424, "step": 32975 }, { "epoch": 15.549269212635549, "grad_norm": 0.006489204242825508, "learning_rate": 0.02223341561176669, "loss": 0.0887, "num_input_tokens_seen": 28565792, "step": 32980 }, { "epoch": 15.551626591230551, "grad_norm": 0.006054729223251343, "learning_rate": 0.0222025649627505, "loss": 0.1257, "num_input_tokens_seen": 28569376, "step": 32985 }, { "epoch": 15.553983969825554, "grad_norm": 0.004876998718827963, "learning_rate": 0.022171734021704814, "loss": 0.0981, "num_input_tokens_seen": 28573824, "step": 32990 }, { "epoch": 15.556341348420556, "grad_norm": 0.012059914879500866, "learning_rate": 0.022140922793384116, "loss": 0.1754, "num_input_tokens_seen": 28577872, "step": 32995 }, { "epoch": 15.558698727015559, "grad_norm": 0.003697044914588332, "learning_rate": 0.022110131282539934, "loss": 0.1303, "num_input_tokens_seen": 28582944, "step": 33000 }, { "epoch": 15.558698727015559, "eval_loss": 0.22632470726966858, "eval_runtime": 21.9125, "eval_samples_per_second": 43.035, "eval_steps_per_second": 21.54, "num_input_tokens_seen": 28582944, "step": 33000 }, { "epoch": 15.561056105610561, "grad_norm": 0.005530308000743389, "learning_rate": 0.022079359493920675, "loss": 0.1171, "num_input_tokens_seen": 28587136, "step": 33005 }, { "epoch": 15.563413484205563, "grad_norm": 0.014832360669970512, "learning_rate": 0.02204860743227169, "loss": 0.1795, "num_input_tokens_seen": 28591248, "step": 33010 }, { "epoch": 15.565770862800566, "grad_norm": 0.013271516188979149, "learning_rate": 0.022017875102335365, "loss": 0.1751, "num_input_tokens_seen": 28595344, "step": 33015 }, { "epoch": 15.568128241395568, "grad_norm": 0.009843386709690094, "learning_rate": 0.02198716250885108, "loss": 0.1233, "num_input_tokens_seen": 28600016, "step": 33020 }, { "epoch": 15.57048561999057, "grad_norm": 0.009697039611637592, "learning_rate": 0.021956469656555, "loss": 0.1388, "num_input_tokens_seen": 28603984, "step": 33025 }, { "epoch": 15.572842998585573, "grad_norm": 0.014772730879485607, "learning_rate": 0.0219257965501804, "loss": 0.201, "num_input_tokens_seen": 28608672, "step": 33030 }, { "epoch": 15.575200377180575, "grad_norm": 0.007973120547831059, "learning_rate": 0.021895143194457494, "loss": 0.134, "num_input_tokens_seen": 28613008, "step": 33035 }, { "epoch": 15.577557755775578, "grad_norm": 0.009184773080050945, "learning_rate": 0.021864509594113322, "loss": 0.1354, "num_input_tokens_seen": 28617552, "step": 33040 }, { "epoch": 15.57991513437058, "grad_norm": 0.010211185552179813, "learning_rate": 0.02183389575387207, "loss": 0.1181, "num_input_tokens_seen": 28621200, "step": 33045 }, { "epoch": 15.582272512965583, "grad_norm": 0.0062277293764054775, "learning_rate": 0.021803301678454682, "loss": 0.1405, "num_input_tokens_seen": 28625152, "step": 33050 }, { "epoch": 15.584629891560585, "grad_norm": 0.011773655191063881, "learning_rate": 0.021772727372579213, "loss": 0.2368, "num_input_tokens_seen": 28629168, "step": 33055 }, { "epoch": 15.586987270155587, "grad_norm": 0.008062131702899933, "learning_rate": 0.02174217284096061, "loss": 0.1005, "num_input_tokens_seen": 28633808, "step": 33060 }, { "epoch": 15.58934464875059, "grad_norm": 0.004558291286230087, "learning_rate": 0.0217116380883107, "loss": 0.1445, "num_input_tokens_seen": 28637760, "step": 33065 }, { "epoch": 15.591702027345592, "grad_norm": 0.008162260986864567, "learning_rate": 0.021681123119338425, "loss": 0.191, "num_input_tokens_seen": 28642960, "step": 33070 }, { "epoch": 15.594059405940595, "grad_norm": 0.007699870970100164, "learning_rate": 0.02165062793874951, "loss": 0.096, "num_input_tokens_seen": 28646848, "step": 33075 }, { "epoch": 15.596416784535597, "grad_norm": 0.004813583567738533, "learning_rate": 0.021620152551246666, "loss": 0.1478, "num_input_tokens_seen": 28651952, "step": 33080 }, { "epoch": 15.5987741631306, "grad_norm": 0.00886498298496008, "learning_rate": 0.02158969696152967, "loss": 0.1688, "num_input_tokens_seen": 28656064, "step": 33085 }, { "epoch": 15.601131541725602, "grad_norm": 0.011855809018015862, "learning_rate": 0.021559261174295057, "loss": 0.1867, "num_input_tokens_seen": 28660832, "step": 33090 }, { "epoch": 15.603488920320604, "grad_norm": 0.014365799725055695, "learning_rate": 0.02152884519423646, "loss": 0.1611, "num_input_tokens_seen": 28665184, "step": 33095 }, { "epoch": 15.605846298915607, "grad_norm": 0.007142855320125818, "learning_rate": 0.021498449026044447, "loss": 0.1594, "num_input_tokens_seen": 28668816, "step": 33100 }, { "epoch": 15.608203677510609, "grad_norm": 0.010502105578780174, "learning_rate": 0.021468072674406414, "loss": 0.1483, "num_input_tokens_seen": 28673296, "step": 33105 }, { "epoch": 15.61056105610561, "grad_norm": 0.0037844388280063868, "learning_rate": 0.021437716144006795, "loss": 0.1121, "num_input_tokens_seen": 28677376, "step": 33110 }, { "epoch": 15.612918434700614, "grad_norm": 0.006872489582747221, "learning_rate": 0.021407379439527002, "loss": 0.1363, "num_input_tokens_seen": 28681648, "step": 33115 }, { "epoch": 15.615275813295614, "grad_norm": 0.005345071200281382, "learning_rate": 0.021377062565645255, "loss": 0.1524, "num_input_tokens_seen": 28686144, "step": 33120 }, { "epoch": 15.617633191890617, "grad_norm": 0.0056538814678788185, "learning_rate": 0.02134676552703688, "loss": 0.1361, "num_input_tokens_seen": 28690688, "step": 33125 }, { "epoch": 15.61999057048562, "grad_norm": 0.003952366299927235, "learning_rate": 0.02131648832837398, "loss": 0.1043, "num_input_tokens_seen": 28695232, "step": 33130 }, { "epoch": 15.622347949080622, "grad_norm": 0.006569413933902979, "learning_rate": 0.02128623097432574, "loss": 0.1617, "num_input_tokens_seen": 28699584, "step": 33135 }, { "epoch": 15.624705327675624, "grad_norm": 0.0025312684010714293, "learning_rate": 0.021255993469558192, "loss": 0.1215, "num_input_tokens_seen": 28704720, "step": 33140 }, { "epoch": 15.627062706270626, "grad_norm": 0.013719763606786728, "learning_rate": 0.021225775818734364, "loss": 0.1913, "num_input_tokens_seen": 28708528, "step": 33145 }, { "epoch": 15.629420084865629, "grad_norm": 0.009359954856336117, "learning_rate": 0.021195578026514166, "loss": 0.1774, "num_input_tokens_seen": 28713104, "step": 33150 }, { "epoch": 15.631777463460631, "grad_norm": 0.01313064992427826, "learning_rate": 0.02116540009755452, "loss": 0.1637, "num_input_tokens_seen": 28718208, "step": 33155 }, { "epoch": 15.634134842055634, "grad_norm": 0.006834178231656551, "learning_rate": 0.021135242036509173, "loss": 0.1665, "num_input_tokens_seen": 28722096, "step": 33160 }, { "epoch": 15.636492220650636, "grad_norm": 0.015169400721788406, "learning_rate": 0.021105103848028967, "loss": 0.1416, "num_input_tokens_seen": 28726368, "step": 33165 }, { "epoch": 15.638849599245638, "grad_norm": 0.006066963076591492, "learning_rate": 0.021074985536761504, "loss": 0.1379, "num_input_tokens_seen": 28731856, "step": 33170 }, { "epoch": 15.64120697784064, "grad_norm": 0.010219521820545197, "learning_rate": 0.021044887107351435, "loss": 0.1324, "num_input_tokens_seen": 28736592, "step": 33175 }, { "epoch": 15.643564356435643, "grad_norm": 0.011590374633669853, "learning_rate": 0.021014808564440362, "loss": 0.1324, "num_input_tokens_seen": 28740144, "step": 33180 }, { "epoch": 15.645921735030646, "grad_norm": 0.005227481480687857, "learning_rate": 0.02098474991266671, "loss": 0.1142, "num_input_tokens_seen": 28744080, "step": 33185 }, { "epoch": 15.648279113625648, "grad_norm": 0.010854237712919712, "learning_rate": 0.02095471115666592, "loss": 0.1438, "num_input_tokens_seen": 28748320, "step": 33190 }, { "epoch": 15.65063649222065, "grad_norm": 0.013706477358937263, "learning_rate": 0.020924692301070406, "loss": 0.1998, "num_input_tokens_seen": 28752832, "step": 33195 }, { "epoch": 15.652993870815653, "grad_norm": 0.010388951748609543, "learning_rate": 0.020894693350509346, "loss": 0.1954, "num_input_tokens_seen": 28756240, "step": 33200 }, { "epoch": 15.652993870815653, "eval_loss": 0.2356317937374115, "eval_runtime": 21.9273, "eval_samples_per_second": 43.006, "eval_steps_per_second": 21.526, "num_input_tokens_seen": 28756240, "step": 33200 }, { "epoch": 15.655351249410655, "grad_norm": 0.014369914308190346, "learning_rate": 0.020864714309609057, "loss": 0.1867, "num_input_tokens_seen": 28760752, "step": 33205 }, { "epoch": 15.657708628005658, "grad_norm": 0.009331897832453251, "learning_rate": 0.020834755182992604, "loss": 0.1384, "num_input_tokens_seen": 28764896, "step": 33210 }, { "epoch": 15.66006600660066, "grad_norm": 0.004013505764305592, "learning_rate": 0.02080481597528011, "loss": 0.1087, "num_input_tokens_seen": 28769168, "step": 33215 }, { "epoch": 15.662423385195662, "grad_norm": 0.014961844310164452, "learning_rate": 0.020774896691088583, "loss": 0.1583, "num_input_tokens_seen": 28772704, "step": 33220 }, { "epoch": 15.664780763790665, "grad_norm": 0.0034656417556107044, "learning_rate": 0.020744997335031882, "loss": 0.0825, "num_input_tokens_seen": 28776736, "step": 33225 }, { "epoch": 15.667138142385667, "grad_norm": 0.009257973171770573, "learning_rate": 0.02071511791172092, "loss": 0.1528, "num_input_tokens_seen": 28780704, "step": 33230 }, { "epoch": 15.66949552098067, "grad_norm": 0.015145860612392426, "learning_rate": 0.02068525842576351, "loss": 0.1493, "num_input_tokens_seen": 28784576, "step": 33235 }, { "epoch": 15.671852899575672, "grad_norm": 0.00983085110783577, "learning_rate": 0.020655418881764264, "loss": 0.1725, "num_input_tokens_seen": 28789024, "step": 33240 }, { "epoch": 15.674210278170674, "grad_norm": 0.009945601224899292, "learning_rate": 0.020625599284324923, "loss": 0.1173, "num_input_tokens_seen": 28793648, "step": 33245 }, { "epoch": 15.676567656765677, "grad_norm": 0.010404549539089203, "learning_rate": 0.02059579963804396, "loss": 0.1638, "num_input_tokens_seen": 28798048, "step": 33250 }, { "epoch": 15.67892503536068, "grad_norm": 0.01724894344806671, "learning_rate": 0.02056601994751688, "loss": 0.1953, "num_input_tokens_seen": 28802128, "step": 33255 }, { "epoch": 15.681282413955682, "grad_norm": 0.007362672593444586, "learning_rate": 0.02053626021733614, "loss": 0.1876, "num_input_tokens_seen": 28806688, "step": 33260 }, { "epoch": 15.683639792550684, "grad_norm": 0.00902239978313446, "learning_rate": 0.02050652045209097, "loss": 0.1657, "num_input_tokens_seen": 28811072, "step": 33265 }, { "epoch": 15.685997171145686, "grad_norm": 0.007616869639605284, "learning_rate": 0.020476800656367672, "loss": 0.1479, "num_input_tokens_seen": 28815520, "step": 33270 }, { "epoch": 15.688354549740689, "grad_norm": 0.006913819350302219, "learning_rate": 0.020447100834749425, "loss": 0.1482, "num_input_tokens_seen": 28820880, "step": 33275 }, { "epoch": 15.690711928335691, "grad_norm": 0.01078998763114214, "learning_rate": 0.02041742099181627, "loss": 0.0884, "num_input_tokens_seen": 28825440, "step": 33280 }, { "epoch": 15.693069306930694, "grad_norm": 0.010669144801795483, "learning_rate": 0.02038776113214526, "loss": 0.1015, "num_input_tokens_seen": 28829584, "step": 33285 }, { "epoch": 15.695426685525696, "grad_norm": 0.007945742458105087, "learning_rate": 0.0203581212603103, "loss": 0.1498, "num_input_tokens_seen": 28833712, "step": 33290 }, { "epoch": 15.697784064120698, "grad_norm": 0.01267133466899395, "learning_rate": 0.02032850138088219, "loss": 0.1393, "num_input_tokens_seen": 28837920, "step": 33295 }, { "epoch": 15.700141442715701, "grad_norm": 0.00546290585771203, "learning_rate": 0.020298901498428754, "loss": 0.1067, "num_input_tokens_seen": 28842176, "step": 33300 }, { "epoch": 15.702498821310703, "grad_norm": 0.013740873895585537, "learning_rate": 0.020269321617514595, "loss": 0.2519, "num_input_tokens_seen": 28846192, "step": 33305 }, { "epoch": 15.704856199905706, "grad_norm": 0.01193067617714405, "learning_rate": 0.020239761742701343, "loss": 0.1727, "num_input_tokens_seen": 28851712, "step": 33310 }, { "epoch": 15.707213578500706, "grad_norm": 0.013204175047576427, "learning_rate": 0.02021022187854754, "loss": 0.1607, "num_input_tokens_seen": 28855472, "step": 33315 }, { "epoch": 15.70957095709571, "grad_norm": 0.007259982172399759, "learning_rate": 0.020180702029608522, "loss": 0.1463, "num_input_tokens_seen": 28859088, "step": 33320 }, { "epoch": 15.711928335690711, "grad_norm": 0.015096140094101429, "learning_rate": 0.020151202200436695, "loss": 0.1688, "num_input_tokens_seen": 28863456, "step": 33325 }, { "epoch": 15.714285714285714, "grad_norm": 0.005905061028897762, "learning_rate": 0.020121722395581226, "loss": 0.1493, "num_input_tokens_seen": 28867360, "step": 33330 }, { "epoch": 15.716643092880716, "grad_norm": 0.011165544390678406, "learning_rate": 0.020092262619588342, "loss": 0.1188, "num_input_tokens_seen": 28872000, "step": 33335 }, { "epoch": 15.719000471475718, "grad_norm": 0.01145342830568552, "learning_rate": 0.02006282287700109, "loss": 0.1876, "num_input_tokens_seen": 28876192, "step": 33340 }, { "epoch": 15.72135785007072, "grad_norm": 0.010686317458748817, "learning_rate": 0.020033403172359427, "loss": 0.0789, "num_input_tokens_seen": 28879824, "step": 33345 }, { "epoch": 15.723715228665723, "grad_norm": 0.010326235555112362, "learning_rate": 0.020004003510200284, "loss": 0.1262, "num_input_tokens_seen": 28883776, "step": 33350 }, { "epoch": 15.726072607260726, "grad_norm": 0.005926843732595444, "learning_rate": 0.019974623895057407, "loss": 0.0664, "num_input_tokens_seen": 28887904, "step": 33355 }, { "epoch": 15.728429985855728, "grad_norm": 0.009972752071917057, "learning_rate": 0.019945264331461553, "loss": 0.2003, "num_input_tokens_seen": 28892256, "step": 33360 }, { "epoch": 15.73078736445073, "grad_norm": 0.00403845589607954, "learning_rate": 0.019915924823940317, "loss": 0.1447, "num_input_tokens_seen": 28897184, "step": 33365 }, { "epoch": 15.733144743045733, "grad_norm": 0.015507747419178486, "learning_rate": 0.01988660537701816, "loss": 0.3053, "num_input_tokens_seen": 28901040, "step": 33370 }, { "epoch": 15.735502121640735, "grad_norm": 0.009736848063766956, "learning_rate": 0.01985730599521659, "loss": 0.1295, "num_input_tokens_seen": 28905600, "step": 33375 }, { "epoch": 15.737859500235738, "grad_norm": 0.005619247909635305, "learning_rate": 0.019828026683053918, "loss": 0.1238, "num_input_tokens_seen": 28909552, "step": 33380 }, { "epoch": 15.74021687883074, "grad_norm": 0.015858426690101624, "learning_rate": 0.01979876744504535, "loss": 0.1267, "num_input_tokens_seen": 28913632, "step": 33385 }, { "epoch": 15.742574257425742, "grad_norm": 0.019311323761940002, "learning_rate": 0.019769528285703046, "loss": 0.1957, "num_input_tokens_seen": 28917888, "step": 33390 }, { "epoch": 15.744931636020745, "grad_norm": 0.009724169969558716, "learning_rate": 0.019740309209536098, "loss": 0.1797, "num_input_tokens_seen": 28922080, "step": 33395 }, { "epoch": 15.747289014615747, "grad_norm": 0.008935193531215191, "learning_rate": 0.019711110221050387, "loss": 0.1933, "num_input_tokens_seen": 28926208, "step": 33400 }, { "epoch": 15.747289014615747, "eval_loss": 0.2358691394329071, "eval_runtime": 21.9343, "eval_samples_per_second": 42.992, "eval_steps_per_second": 21.519, "num_input_tokens_seen": 28926208, "step": 33400 }, { "epoch": 15.74964639321075, "grad_norm": 0.009668223559856415, "learning_rate": 0.019681931324748825, "loss": 0.1476, "num_input_tokens_seen": 28930288, "step": 33405 }, { "epoch": 15.752003771805752, "grad_norm": 0.006874902173876762, "learning_rate": 0.019652772525131094, "loss": 0.1687, "num_input_tokens_seen": 28935376, "step": 33410 }, { "epoch": 15.754361150400754, "grad_norm": 0.010143081657588482, "learning_rate": 0.019623633826693885, "loss": 0.1433, "num_input_tokens_seen": 28939248, "step": 33415 }, { "epoch": 15.756718528995757, "grad_norm": 0.014642305672168732, "learning_rate": 0.019594515233930788, "loss": 0.1338, "num_input_tokens_seen": 28943392, "step": 33420 }, { "epoch": 15.75907590759076, "grad_norm": 0.015993531793355942, "learning_rate": 0.019565416751332186, "loss": 0.17, "num_input_tokens_seen": 28947808, "step": 33425 }, { "epoch": 15.761433286185762, "grad_norm": 0.006854436360299587, "learning_rate": 0.019536338383385497, "loss": 0.1535, "num_input_tokens_seen": 28951984, "step": 33430 }, { "epoch": 15.763790664780764, "grad_norm": 0.005140727385878563, "learning_rate": 0.019507280134574933, "loss": 0.1544, "num_input_tokens_seen": 28956032, "step": 33435 }, { "epoch": 15.766148043375766, "grad_norm": 0.011818690225481987, "learning_rate": 0.019478242009381624, "loss": 0.134, "num_input_tokens_seen": 28960672, "step": 33440 }, { "epoch": 15.768505421970769, "grad_norm": 0.011516189202666283, "learning_rate": 0.01944922401228367, "loss": 0.1608, "num_input_tokens_seen": 28965248, "step": 33445 }, { "epoch": 15.770862800565771, "grad_norm": 0.00805049017071724, "learning_rate": 0.01942022614775593, "loss": 0.0847, "num_input_tokens_seen": 28969824, "step": 33450 }, { "epoch": 15.773220179160774, "grad_norm": 0.011468291282653809, "learning_rate": 0.01939124842027029, "loss": 0.1301, "num_input_tokens_seen": 28974368, "step": 33455 }, { "epoch": 15.775577557755776, "grad_norm": 0.014600384049117565, "learning_rate": 0.01936229083429551, "loss": 0.2214, "num_input_tokens_seen": 28978128, "step": 33460 }, { "epoch": 15.777934936350778, "grad_norm": 0.010637049563229084, "learning_rate": 0.019333353394297148, "loss": 0.1204, "num_input_tokens_seen": 28982048, "step": 33465 }, { "epoch": 15.78029231494578, "grad_norm": 0.01019955798983574, "learning_rate": 0.019304436104737754, "loss": 0.1452, "num_input_tokens_seen": 28986336, "step": 33470 }, { "epoch": 15.782649693540783, "grad_norm": 0.012605104595422745, "learning_rate": 0.019275538970076778, "loss": 0.1324, "num_input_tokens_seen": 28990560, "step": 33475 }, { "epoch": 15.785007072135786, "grad_norm": 0.006216756068170071, "learning_rate": 0.019246661994770434, "loss": 0.1466, "num_input_tokens_seen": 28994416, "step": 33480 }, { "epoch": 15.787364450730788, "grad_norm": 0.011961767449975014, "learning_rate": 0.019217805183271985, "loss": 0.1477, "num_input_tokens_seen": 28998304, "step": 33485 }, { "epoch": 15.78972182932579, "grad_norm": 0.011355184018611908, "learning_rate": 0.019188968540031465, "loss": 0.143, "num_input_tokens_seen": 29002160, "step": 33490 }, { "epoch": 15.792079207920793, "grad_norm": 0.010464862920343876, "learning_rate": 0.019160152069495867, "loss": 0.1324, "num_input_tokens_seen": 29006768, "step": 33495 }, { "epoch": 15.794436586515795, "grad_norm": 0.008440044708549976, "learning_rate": 0.019131355776109103, "loss": 0.2708, "num_input_tokens_seen": 29011472, "step": 33500 }, { "epoch": 15.796793965110798, "grad_norm": 0.00617208331823349, "learning_rate": 0.019102579664311857, "loss": 0.2133, "num_input_tokens_seen": 29015984, "step": 33505 }, { "epoch": 15.799151343705798, "grad_norm": 0.006971802096813917, "learning_rate": 0.019073823738541763, "loss": 0.1476, "num_input_tokens_seen": 29020672, "step": 33510 }, { "epoch": 15.801508722300802, "grad_norm": 0.011096819303929806, "learning_rate": 0.0190450880032334, "loss": 0.1167, "num_input_tokens_seen": 29024608, "step": 33515 }, { "epoch": 15.803866100895803, "grad_norm": 0.008110621944069862, "learning_rate": 0.019016372462818114, "loss": 0.1247, "num_input_tokens_seen": 29029024, "step": 33520 }, { "epoch": 15.806223479490805, "grad_norm": 0.005207305308431387, "learning_rate": 0.018987677121724278, "loss": 0.144, "num_input_tokens_seen": 29033280, "step": 33525 }, { "epoch": 15.808580858085808, "grad_norm": 0.01477016881108284, "learning_rate": 0.018959001984377, "loss": 0.2034, "num_input_tokens_seen": 29037584, "step": 33530 }, { "epoch": 15.81093823668081, "grad_norm": 0.011677647940814495, "learning_rate": 0.018930347055198377, "loss": 0.1885, "num_input_tokens_seen": 29041312, "step": 33535 }, { "epoch": 15.813295615275813, "grad_norm": 0.00546837504953146, "learning_rate": 0.01890171233860739, "loss": 0.1615, "num_input_tokens_seen": 29045360, "step": 33540 }, { "epoch": 15.815652993870815, "grad_norm": 0.01631225273013115, "learning_rate": 0.018873097839019807, "loss": 0.1882, "num_input_tokens_seen": 29049792, "step": 33545 }, { "epoch": 15.818010372465817, "grad_norm": 0.01510283350944519, "learning_rate": 0.0188445035608484, "loss": 0.2107, "num_input_tokens_seen": 29054096, "step": 33550 }, { "epoch": 15.82036775106082, "grad_norm": 0.013078639283776283, "learning_rate": 0.018815929508502777, "loss": 0.1537, "num_input_tokens_seen": 29057952, "step": 33555 }, { "epoch": 15.822725129655822, "grad_norm": 0.012507554143667221, "learning_rate": 0.01878737568638934, "loss": 0.1465, "num_input_tokens_seen": 29062544, "step": 33560 }, { "epoch": 15.825082508250825, "grad_norm": 0.011849344708025455, "learning_rate": 0.01875884209891152, "loss": 0.1219, "num_input_tokens_seen": 29066992, "step": 33565 }, { "epoch": 15.827439886845827, "grad_norm": 0.004110679030418396, "learning_rate": 0.018730328750469514, "loss": 0.1161, "num_input_tokens_seen": 29070704, "step": 33570 }, { "epoch": 15.82979726544083, "grad_norm": 0.014476512558758259, "learning_rate": 0.018701835645460473, "loss": 0.1797, "num_input_tokens_seen": 29074736, "step": 33575 }, { "epoch": 15.832154644035832, "grad_norm": 0.005878007505089045, "learning_rate": 0.01867336278827838, "loss": 0.1282, "num_input_tokens_seen": 29079136, "step": 33580 }, { "epoch": 15.834512022630834, "grad_norm": 0.009585722349584103, "learning_rate": 0.018644910183314056, "loss": 0.1112, "num_input_tokens_seen": 29083904, "step": 33585 }, { "epoch": 15.836869401225837, "grad_norm": 0.011633875779807568, "learning_rate": 0.01861647783495531, "loss": 0.1665, "num_input_tokens_seen": 29088608, "step": 33590 }, { "epoch": 15.839226779820839, "grad_norm": 0.005122533068060875, "learning_rate": 0.01858806574758676, "loss": 0.1591, "num_input_tokens_seen": 29092688, "step": 33595 }, { "epoch": 15.841584158415841, "grad_norm": 0.009182100184261799, "learning_rate": 0.01855967392558988, "loss": 0.1241, "num_input_tokens_seen": 29096816, "step": 33600 }, { "epoch": 15.841584158415841, "eval_loss": 0.23447120189666748, "eval_runtime": 21.9341, "eval_samples_per_second": 42.992, "eval_steps_per_second": 21.519, "num_input_tokens_seen": 29096816, "step": 33600 }, { "epoch": 15.843941537010844, "grad_norm": 0.007577728945761919, "learning_rate": 0.018531302373343096, "loss": 0.1571, "num_input_tokens_seen": 29101056, "step": 33605 }, { "epoch": 15.846298915605846, "grad_norm": 0.01298991683870554, "learning_rate": 0.018502951095221588, "loss": 0.1744, "num_input_tokens_seen": 29105568, "step": 33610 }, { "epoch": 15.848656294200849, "grad_norm": 0.01238330453634262, "learning_rate": 0.01847462009559751, "loss": 0.2041, "num_input_tokens_seen": 29109104, "step": 33615 }, { "epoch": 15.851013672795851, "grad_norm": 0.010474913753569126, "learning_rate": 0.01844630937883992, "loss": 0.2178, "num_input_tokens_seen": 29113312, "step": 33620 }, { "epoch": 15.853371051390853, "grad_norm": 0.006106297019869089, "learning_rate": 0.018418018949314573, "loss": 0.0885, "num_input_tokens_seen": 29117600, "step": 33625 }, { "epoch": 15.855728429985856, "grad_norm": 0.010363249108195305, "learning_rate": 0.018389748811384315, "loss": 0.1538, "num_input_tokens_seen": 29122096, "step": 33630 }, { "epoch": 15.858085808580858, "grad_norm": 0.010607166215777397, "learning_rate": 0.018361498969408658, "loss": 0.13, "num_input_tokens_seen": 29125824, "step": 33635 }, { "epoch": 15.86044318717586, "grad_norm": 0.009844205342233181, "learning_rate": 0.01833326942774415, "loss": 0.1411, "num_input_tokens_seen": 29130240, "step": 33640 }, { "epoch": 15.862800565770863, "grad_norm": 0.003402505535632372, "learning_rate": 0.018305060190744155, "loss": 0.0859, "num_input_tokens_seen": 29134912, "step": 33645 }, { "epoch": 15.865157944365865, "grad_norm": 0.009187222458422184, "learning_rate": 0.018276871262758846, "loss": 0.1345, "num_input_tokens_seen": 29139456, "step": 33650 }, { "epoch": 15.867515322960868, "grad_norm": 0.0094832768663764, "learning_rate": 0.0182487026481353, "loss": 0.1225, "num_input_tokens_seen": 29143600, "step": 33655 }, { "epoch": 15.86987270155587, "grad_norm": 0.013676558621227741, "learning_rate": 0.018220554351217538, "loss": 0.1362, "num_input_tokens_seen": 29147712, "step": 33660 }, { "epoch": 15.872230080150873, "grad_norm": 0.009403099305927753, "learning_rate": 0.01819242637634629, "loss": 0.1227, "num_input_tokens_seen": 29151776, "step": 33665 }, { "epoch": 15.874587458745875, "grad_norm": 0.008690647780895233, "learning_rate": 0.01816431872785933, "loss": 0.1529, "num_input_tokens_seen": 29156512, "step": 33670 }, { "epoch": 15.876944837340877, "grad_norm": 0.007721145171672106, "learning_rate": 0.018136231410091148, "loss": 0.1352, "num_input_tokens_seen": 29161456, "step": 33675 }, { "epoch": 15.87930221593588, "grad_norm": 0.00759466178715229, "learning_rate": 0.018108164427373175, "loss": 0.1698, "num_input_tokens_seen": 29165488, "step": 33680 }, { "epoch": 15.881659594530882, "grad_norm": 0.004428298212587833, "learning_rate": 0.01808011778403375, "loss": 0.1291, "num_input_tokens_seen": 29169648, "step": 33685 }, { "epoch": 15.884016973125885, "grad_norm": 0.004801807459443808, "learning_rate": 0.01805209148439793, "loss": 0.1274, "num_input_tokens_seen": 29173344, "step": 33690 }, { "epoch": 15.886374351720887, "grad_norm": 0.008379009552299976, "learning_rate": 0.018024085532787757, "loss": 0.158, "num_input_tokens_seen": 29177312, "step": 33695 }, { "epoch": 15.88873173031589, "grad_norm": 0.008536525070667267, "learning_rate": 0.017996099933522164, "loss": 0.1474, "num_input_tokens_seen": 29181696, "step": 33700 }, { "epoch": 15.891089108910892, "grad_norm": 0.016969969496130943, "learning_rate": 0.017968134690916775, "loss": 0.2143, "num_input_tokens_seen": 29185808, "step": 33705 }, { "epoch": 15.893446487505894, "grad_norm": 0.007745060138404369, "learning_rate": 0.017940189809284263, "loss": 0.2069, "num_input_tokens_seen": 29190064, "step": 33710 }, { "epoch": 15.895803866100895, "grad_norm": 0.012543420307338238, "learning_rate": 0.017912265292934024, "loss": 0.2238, "num_input_tokens_seen": 29193632, "step": 33715 }, { "epoch": 15.898161244695899, "grad_norm": 0.005619990173727274, "learning_rate": 0.017884361146172423, "loss": 0.1212, "num_input_tokens_seen": 29197520, "step": 33720 }, { "epoch": 15.9005186232909, "grad_norm": 0.011378907598555088, "learning_rate": 0.01785647737330261, "loss": 0.147, "num_input_tokens_seen": 29202544, "step": 33725 }, { "epoch": 15.902876001885902, "grad_norm": 0.01348639465868473, "learning_rate": 0.017828613978624563, "loss": 0.1805, "num_input_tokens_seen": 29207168, "step": 33730 }, { "epoch": 15.905233380480905, "grad_norm": 0.004668994806706905, "learning_rate": 0.01780077096643523, "loss": 0.1572, "num_input_tokens_seen": 29211728, "step": 33735 }, { "epoch": 15.907590759075907, "grad_norm": 0.011346595361828804, "learning_rate": 0.017772948341028345, "loss": 0.1175, "num_input_tokens_seen": 29215616, "step": 33740 }, { "epoch": 15.90994813767091, "grad_norm": 0.01404068898409605, "learning_rate": 0.01774514610669447, "loss": 0.1313, "num_input_tokens_seen": 29220928, "step": 33745 }, { "epoch": 15.912305516265912, "grad_norm": 0.018039526417851448, "learning_rate": 0.017717364267721112, "loss": 0.1942, "num_input_tokens_seen": 29224608, "step": 33750 }, { "epoch": 15.914662894860914, "grad_norm": 0.014733195304870605, "learning_rate": 0.017689602828392513, "loss": 0.173, "num_input_tokens_seen": 29229120, "step": 33755 }, { "epoch": 15.917020273455917, "grad_norm": 0.008668680675327778, "learning_rate": 0.017661861792989897, "loss": 0.1692, "num_input_tokens_seen": 29233408, "step": 33760 }, { "epoch": 15.919377652050919, "grad_norm": 0.00762353278696537, "learning_rate": 0.017634141165791272, "loss": 0.1764, "num_input_tokens_seen": 29237008, "step": 33765 }, { "epoch": 15.921735030645921, "grad_norm": 0.019390935078263283, "learning_rate": 0.017606440951071455, "loss": 0.2788, "num_input_tokens_seen": 29240384, "step": 33770 }, { "epoch": 15.924092409240924, "grad_norm": 0.012553144246339798, "learning_rate": 0.017578761153102213, "loss": 0.134, "num_input_tokens_seen": 29244832, "step": 33775 }, { "epoch": 15.926449787835926, "grad_norm": 0.008866786025464535, "learning_rate": 0.017551101776152146, "loss": 0.1484, "num_input_tokens_seen": 29250896, "step": 33780 }, { "epoch": 15.928807166430929, "grad_norm": 0.008158031851053238, "learning_rate": 0.017523462824486608, "loss": 0.1289, "num_input_tokens_seen": 29255168, "step": 33785 }, { "epoch": 15.931164545025931, "grad_norm": 0.01971360668540001, "learning_rate": 0.01749584430236794, "loss": 0.1636, "num_input_tokens_seen": 29258880, "step": 33790 }, { "epoch": 15.933521923620933, "grad_norm": 0.011381003074347973, "learning_rate": 0.01746824621405524, "loss": 0.1776, "num_input_tokens_seen": 29263200, "step": 33795 }, { "epoch": 15.935879302215936, "grad_norm": 0.015404395759105682, "learning_rate": 0.017440668563804412, "loss": 0.1931, "num_input_tokens_seen": 29267072, "step": 33800 }, { "epoch": 15.935879302215936, "eval_loss": 0.23921844363212585, "eval_runtime": 21.9135, "eval_samples_per_second": 43.033, "eval_steps_per_second": 21.539, "num_input_tokens_seen": 29267072, "step": 33800 }, { "epoch": 15.938236680810938, "grad_norm": 0.015389579348266125, "learning_rate": 0.017413111355868392, "loss": 0.194, "num_input_tokens_seen": 29272208, "step": 33805 }, { "epoch": 15.94059405940594, "grad_norm": 0.003462106455117464, "learning_rate": 0.017385574594496748, "loss": 0.1395, "num_input_tokens_seen": 29276528, "step": 33810 }, { "epoch": 15.942951438000943, "grad_norm": 0.017849478870630264, "learning_rate": 0.01735805828393605, "loss": 0.155, "num_input_tokens_seen": 29280416, "step": 33815 }, { "epoch": 15.945308816595945, "grad_norm": 0.008412381634116173, "learning_rate": 0.017330562428429667, "loss": 0.1658, "num_input_tokens_seen": 29285088, "step": 33820 }, { "epoch": 15.947666195190948, "grad_norm": 0.012100562453269958, "learning_rate": 0.01730308703221776, "loss": 0.1381, "num_input_tokens_seen": 29289168, "step": 33825 }, { "epoch": 15.95002357378595, "grad_norm": 0.00923791155219078, "learning_rate": 0.01727563209953744, "loss": 0.1078, "num_input_tokens_seen": 29293296, "step": 33830 }, { "epoch": 15.952380952380953, "grad_norm": 0.012612216174602509, "learning_rate": 0.017248197634622535, "loss": 0.1394, "num_input_tokens_seen": 29297696, "step": 33835 }, { "epoch": 15.954738330975955, "grad_norm": 0.01085917092859745, "learning_rate": 0.01722078364170383, "loss": 0.1766, "num_input_tokens_seen": 29301664, "step": 33840 }, { "epoch": 15.957095709570957, "grad_norm": 0.008514147251844406, "learning_rate": 0.017193390125008905, "loss": 0.1367, "num_input_tokens_seen": 29305776, "step": 33845 }, { "epoch": 15.95945308816596, "grad_norm": 0.006366985384374857, "learning_rate": 0.017166017088762153, "loss": 0.105, "num_input_tokens_seen": 29309856, "step": 33850 }, { "epoch": 15.961810466760962, "grad_norm": 0.003963693045079708, "learning_rate": 0.017138664537184878, "loss": 0.0947, "num_input_tokens_seen": 29313328, "step": 33855 }, { "epoch": 15.964167845355965, "grad_norm": 0.0078471340239048, "learning_rate": 0.017111332474495172, "loss": 0.1166, "num_input_tokens_seen": 29317344, "step": 33860 }, { "epoch": 15.966525223950967, "grad_norm": 0.004692547954618931, "learning_rate": 0.017084020904907998, "loss": 0.1121, "num_input_tokens_seen": 29321344, "step": 33865 }, { "epoch": 15.96888260254597, "grad_norm": 0.007308132015168667, "learning_rate": 0.017056729832635103, "loss": 0.1648, "num_input_tokens_seen": 29325200, "step": 33870 }, { "epoch": 15.971239981140972, "grad_norm": 0.01259162649512291, "learning_rate": 0.017029459261885153, "loss": 0.1617, "num_input_tokens_seen": 29329248, "step": 33875 }, { "epoch": 15.973597359735974, "grad_norm": 0.013526651076972485, "learning_rate": 0.01700220919686359, "loss": 0.1016, "num_input_tokens_seen": 29333248, "step": 33880 }, { "epoch": 15.975954738330977, "grad_norm": 0.0137288523837924, "learning_rate": 0.016974979641772723, "loss": 0.2651, "num_input_tokens_seen": 29337312, "step": 33885 }, { "epoch": 15.978312116925979, "grad_norm": 0.009665065445005894, "learning_rate": 0.01694777060081169, "loss": 0.1286, "num_input_tokens_seen": 29340896, "step": 33890 }, { "epoch": 15.980669495520981, "grad_norm": 0.012758394703269005, "learning_rate": 0.016920582078176444, "loss": 0.1184, "num_input_tokens_seen": 29344928, "step": 33895 }, { "epoch": 15.983026874115984, "grad_norm": 0.009794783778488636, "learning_rate": 0.016893414078059863, "loss": 0.1799, "num_input_tokens_seen": 29349616, "step": 33900 }, { "epoch": 15.985384252710986, "grad_norm": 0.012445393018424511, "learning_rate": 0.016866266604651535, "loss": 0.1709, "num_input_tokens_seen": 29353760, "step": 33905 }, { "epoch": 15.987741631305987, "grad_norm": 0.013093075715005398, "learning_rate": 0.016839139662137976, "loss": 0.1578, "num_input_tokens_seen": 29357584, "step": 33910 }, { "epoch": 15.990099009900991, "grad_norm": 0.0046911947429180145, "learning_rate": 0.01681203325470245, "loss": 0.0715, "num_input_tokens_seen": 29362144, "step": 33915 }, { "epoch": 15.992456388495992, "grad_norm": 0.009837457910180092, "learning_rate": 0.016784947386525157, "loss": 0.1248, "num_input_tokens_seen": 29367296, "step": 33920 }, { "epoch": 15.994813767090994, "grad_norm": 0.008478086441755295, "learning_rate": 0.01675788206178308, "loss": 0.131, "num_input_tokens_seen": 29371600, "step": 33925 }, { "epoch": 15.997171145685996, "grad_norm": 0.011700219474732876, "learning_rate": 0.016730837284649986, "loss": 0.1461, "num_input_tokens_seen": 29376656, "step": 33930 }, { "epoch": 15.999528524280999, "grad_norm": 0.013135616667568684, "learning_rate": 0.016703813059296583, "loss": 0.1576, "num_input_tokens_seen": 29379904, "step": 33935 }, { "epoch": 16.001885902876, "grad_norm": 0.0085666598752141, "learning_rate": 0.016676809389890294, "loss": 0.1269, "num_input_tokens_seen": 29384608, "step": 33940 }, { "epoch": 16.004243281471005, "grad_norm": 0.006550874561071396, "learning_rate": 0.016649826280595435, "loss": 0.0718, "num_input_tokens_seen": 29388624, "step": 33945 }, { "epoch": 16.006600660066006, "grad_norm": 0.014352720230817795, "learning_rate": 0.016622863735573163, "loss": 0.1668, "num_input_tokens_seen": 29391808, "step": 33950 }, { "epoch": 16.00895803866101, "grad_norm": 0.009166819974780083, "learning_rate": 0.016595921758981395, "loss": 0.1297, "num_input_tokens_seen": 29395840, "step": 33955 }, { "epoch": 16.01131541725601, "grad_norm": 0.0112850870937109, "learning_rate": 0.01656900035497495, "loss": 0.0972, "num_input_tokens_seen": 29400608, "step": 33960 }, { "epoch": 16.013672795851015, "grad_norm": 0.007533607073128223, "learning_rate": 0.016542099527705485, "loss": 0.1238, "num_input_tokens_seen": 29404592, "step": 33965 }, { "epoch": 16.016030174446016, "grad_norm": 0.00611309427767992, "learning_rate": 0.01651521928132138, "loss": 0.158, "num_input_tokens_seen": 29409552, "step": 33970 }, { "epoch": 16.01838755304102, "grad_norm": 0.01574980467557907, "learning_rate": 0.01648835961996794, "loss": 0.2151, "num_input_tokens_seen": 29413168, "step": 33975 }, { "epoch": 16.02074493163602, "grad_norm": 0.010679124854505062, "learning_rate": 0.016461520547787285, "loss": 0.1013, "num_input_tokens_seen": 29418032, "step": 33980 }, { "epoch": 16.023102310231025, "grad_norm": 0.006675763987004757, "learning_rate": 0.016434702068918266, "loss": 0.0614, "num_input_tokens_seen": 29422608, "step": 33985 }, { "epoch": 16.025459688826025, "grad_norm": 0.011866366490721703, "learning_rate": 0.01640790418749673, "loss": 0.1228, "num_input_tokens_seen": 29426800, "step": 33990 }, { "epoch": 16.02781706742103, "grad_norm": 0.012925009243190289, "learning_rate": 0.016381126907655134, "loss": 0.1378, "num_input_tokens_seen": 29431328, "step": 33995 }, { "epoch": 16.03017444601603, "grad_norm": 0.00643345108255744, "learning_rate": 0.016354370233522948, "loss": 0.1113, "num_input_tokens_seen": 29435360, "step": 34000 }, { "epoch": 16.03017444601603, "eval_loss": 0.2413286715745926, "eval_runtime": 21.9184, "eval_samples_per_second": 43.023, "eval_steps_per_second": 21.534, "num_input_tokens_seen": 29435360, "step": 34000 }, { "epoch": 16.032531824611034, "grad_norm": 0.0017099065007641912, "learning_rate": 0.016327634169226394, "loss": 0.0813, "num_input_tokens_seen": 29439888, "step": 34005 }, { "epoch": 16.034889203206035, "grad_norm": 0.015047271735966206, "learning_rate": 0.016300918718888485, "loss": 0.1037, "num_input_tokens_seen": 29444192, "step": 34010 }, { "epoch": 16.03724658180104, "grad_norm": 0.011723118834197521, "learning_rate": 0.016274223886629052, "loss": 0.1355, "num_input_tokens_seen": 29448560, "step": 34015 }, { "epoch": 16.03960396039604, "grad_norm": 0.012239443138241768, "learning_rate": 0.01624754967656482, "loss": 0.1516, "num_input_tokens_seen": 29452960, "step": 34020 }, { "epoch": 16.04196133899104, "grad_norm": 0.008117412216961384, "learning_rate": 0.016220896092809235, "loss": 0.1357, "num_input_tokens_seen": 29458400, "step": 34025 }, { "epoch": 16.044318717586044, "grad_norm": 0.021050719544291496, "learning_rate": 0.01619426313947267, "loss": 0.1378, "num_input_tokens_seen": 29462000, "step": 34030 }, { "epoch": 16.046676096181045, "grad_norm": 0.0037561622448265553, "learning_rate": 0.016167650820662228, "loss": 0.1174, "num_input_tokens_seen": 29465824, "step": 34035 }, { "epoch": 16.04903347477605, "grad_norm": 0.00926631223410368, "learning_rate": 0.016141059140481855, "loss": 0.1479, "num_input_tokens_seen": 29471280, "step": 34040 }, { "epoch": 16.05139085337105, "grad_norm": 0.005544474348425865, "learning_rate": 0.016114488103032374, "loss": 0.1105, "num_input_tokens_seen": 29475728, "step": 34045 }, { "epoch": 16.053748231966054, "grad_norm": 0.009672624059021473, "learning_rate": 0.016087937712411293, "loss": 0.0985, "num_input_tokens_seen": 29479824, "step": 34050 }, { "epoch": 16.056105610561055, "grad_norm": 0.0070793721824884415, "learning_rate": 0.01606140797271308, "loss": 0.083, "num_input_tokens_seen": 29484112, "step": 34055 }, { "epoch": 16.05846298915606, "grad_norm": 0.015368602238595486, "learning_rate": 0.01603489888802897, "loss": 0.0985, "num_input_tokens_seen": 29488272, "step": 34060 }, { "epoch": 16.06082036775106, "grad_norm": 0.009617436677217484, "learning_rate": 0.016008410462446918, "loss": 0.1189, "num_input_tokens_seen": 29493088, "step": 34065 }, { "epoch": 16.063177746346064, "grad_norm": 0.014716090634465218, "learning_rate": 0.01598194270005185, "loss": 0.1195, "num_input_tokens_seen": 29497168, "step": 34070 }, { "epoch": 16.065535124941064, "grad_norm": 0.009490973316133022, "learning_rate": 0.015955495604925356, "loss": 0.0911, "num_input_tokens_seen": 29501760, "step": 34075 }, { "epoch": 16.06789250353607, "grad_norm": 0.008850552141666412, "learning_rate": 0.01592906918114598, "loss": 0.0988, "num_input_tokens_seen": 29506336, "step": 34080 }, { "epoch": 16.07024988213107, "grad_norm": 0.013250972144305706, "learning_rate": 0.015902663432788965, "loss": 0.1062, "num_input_tokens_seen": 29511888, "step": 34085 }, { "epoch": 16.072607260726073, "grad_norm": 0.011141255497932434, "learning_rate": 0.01587627836392643, "loss": 0.0778, "num_input_tokens_seen": 29516096, "step": 34090 }, { "epoch": 16.074964639321074, "grad_norm": 0.022495299577713013, "learning_rate": 0.01584991397862726, "loss": 0.1076, "num_input_tokens_seen": 29520144, "step": 34095 }, { "epoch": 16.077322017916078, "grad_norm": 0.007952597923576832, "learning_rate": 0.015823570280957214, "loss": 0.121, "num_input_tokens_seen": 29524640, "step": 34100 }, { "epoch": 16.07967939651108, "grad_norm": 0.009321251884102821, "learning_rate": 0.015797247274978766, "loss": 0.1009, "num_input_tokens_seen": 29529504, "step": 34105 }, { "epoch": 16.082036775106083, "grad_norm": 0.008311799727380276, "learning_rate": 0.015770944964751326, "loss": 0.1105, "num_input_tokens_seen": 29533328, "step": 34110 }, { "epoch": 16.084394153701083, "grad_norm": 0.01356179267168045, "learning_rate": 0.015744663354330956, "loss": 0.1256, "num_input_tokens_seen": 29538080, "step": 34115 }, { "epoch": 16.086751532296088, "grad_norm": 0.005060568451881409, "learning_rate": 0.015718402447770664, "loss": 0.1516, "num_input_tokens_seen": 29542032, "step": 34120 }, { "epoch": 16.08910891089109, "grad_norm": 0.012311452999711037, "learning_rate": 0.015692162249120224, "loss": 0.1631, "num_input_tokens_seen": 29546432, "step": 34125 }, { "epoch": 16.091466289486092, "grad_norm": 0.020479561761021614, "learning_rate": 0.01566594276242615, "loss": 0.1478, "num_input_tokens_seen": 29550944, "step": 34130 }, { "epoch": 16.093823668081093, "grad_norm": 0.007709856610745192, "learning_rate": 0.015639743991731857, "loss": 0.1055, "num_input_tokens_seen": 29555520, "step": 34135 }, { "epoch": 16.096181046676097, "grad_norm": 0.012737628072500229, "learning_rate": 0.01561356594107755, "loss": 0.1283, "num_input_tokens_seen": 29559856, "step": 34140 }, { "epoch": 16.098538425271098, "grad_norm": 0.005424327217042446, "learning_rate": 0.015587408614500147, "loss": 0.1503, "num_input_tokens_seen": 29563696, "step": 34145 }, { "epoch": 16.100895803866102, "grad_norm": 0.01442993525415659, "learning_rate": 0.015561272016033505, "loss": 0.1722, "num_input_tokens_seen": 29569152, "step": 34150 }, { "epoch": 16.103253182461103, "grad_norm": 0.004335211124271154, "learning_rate": 0.015535156149708167, "loss": 0.0425, "num_input_tokens_seen": 29573120, "step": 34155 }, { "epoch": 16.105610561056107, "grad_norm": 0.003364636329934001, "learning_rate": 0.015509061019551528, "loss": 0.0883, "num_input_tokens_seen": 29577456, "step": 34160 }, { "epoch": 16.107967939651108, "grad_norm": 0.019437070935964584, "learning_rate": 0.015482986629587818, "loss": 0.1475, "num_input_tokens_seen": 29581424, "step": 34165 }, { "epoch": 16.11032531824611, "grad_norm": 0.007611352950334549, "learning_rate": 0.01545693298383799, "loss": 0.1431, "num_input_tokens_seen": 29585872, "step": 34170 }, { "epoch": 16.112682696841112, "grad_norm": 0.015791453421115875, "learning_rate": 0.015430900086319858, "loss": 0.1353, "num_input_tokens_seen": 29589984, "step": 34175 }, { "epoch": 16.115040075436116, "grad_norm": 0.005937471054494381, "learning_rate": 0.015404887941048084, "loss": 0.0874, "num_input_tokens_seen": 29594672, "step": 34180 }, { "epoch": 16.117397454031117, "grad_norm": 0.005718894302845001, "learning_rate": 0.01537889655203397, "loss": 0.1511, "num_input_tokens_seen": 29598992, "step": 34185 }, { "epoch": 16.11975483262612, "grad_norm": 0.005655954591929913, "learning_rate": 0.015352925923285798, "loss": 0.1737, "num_input_tokens_seen": 29602528, "step": 34190 }, { "epoch": 16.122112211221122, "grad_norm": 0.0035175681114196777, "learning_rate": 0.015326976058808511, "loss": 0.0605, "num_input_tokens_seen": 29606384, "step": 34195 }, { "epoch": 16.124469589816126, "grad_norm": 0.011124420911073685, "learning_rate": 0.015301046962603908, "loss": 0.1501, "num_input_tokens_seen": 29610720, "step": 34200 }, { "epoch": 16.124469589816126, "eval_loss": 0.2551458179950714, "eval_runtime": 21.8876, "eval_samples_per_second": 43.084, "eval_steps_per_second": 21.565, "num_input_tokens_seen": 29610720, "step": 34200 }, { "epoch": 16.126826968411127, "grad_norm": 0.011172446422278881, "learning_rate": 0.015275138638670626, "loss": 0.1045, "num_input_tokens_seen": 29615744, "step": 34205 }, { "epoch": 16.12918434700613, "grad_norm": 0.003085650037974119, "learning_rate": 0.015249251091004001, "loss": 0.1339, "num_input_tokens_seen": 29619392, "step": 34210 }, { "epoch": 16.13154172560113, "grad_norm": 0.01371519360691309, "learning_rate": 0.01522338432359624, "loss": 0.161, "num_input_tokens_seen": 29623248, "step": 34215 }, { "epoch": 16.133899104196132, "grad_norm": 0.020540283992886543, "learning_rate": 0.01519753834043635, "loss": 0.1606, "num_input_tokens_seen": 29626816, "step": 34220 }, { "epoch": 16.136256482791136, "grad_norm": 0.020900508388876915, "learning_rate": 0.015171713145510095, "loss": 0.2167, "num_input_tokens_seen": 29632784, "step": 34225 }, { "epoch": 16.138613861386137, "grad_norm": 0.005213696975260973, "learning_rate": 0.01514590874279999, "loss": 0.0919, "num_input_tokens_seen": 29637200, "step": 34230 }, { "epoch": 16.14097123998114, "grad_norm": 0.004236621782183647, "learning_rate": 0.015120125136285467, "loss": 0.0758, "num_input_tokens_seen": 29641216, "step": 34235 }, { "epoch": 16.14332861857614, "grad_norm": 0.008980554528534412, "learning_rate": 0.015094362329942629, "loss": 0.1154, "num_input_tokens_seen": 29645856, "step": 34240 }, { "epoch": 16.145685997171146, "grad_norm": 0.014634541235864162, "learning_rate": 0.01506862032774448, "loss": 0.1558, "num_input_tokens_seen": 29649776, "step": 34245 }, { "epoch": 16.148043375766147, "grad_norm": 0.025339709594845772, "learning_rate": 0.015042899133660697, "loss": 0.1495, "num_input_tokens_seen": 29654016, "step": 34250 }, { "epoch": 16.15040075436115, "grad_norm": 0.01310226134955883, "learning_rate": 0.01501719875165789, "loss": 0.209, "num_input_tokens_seen": 29657952, "step": 34255 }, { "epoch": 16.15275813295615, "grad_norm": 0.018156424164772034, "learning_rate": 0.014991519185699286, "loss": 0.161, "num_input_tokens_seen": 29661968, "step": 34260 }, { "epoch": 16.155115511551156, "grad_norm": 0.018348902463912964, "learning_rate": 0.014965860439745054, "loss": 0.148, "num_input_tokens_seen": 29666480, "step": 34265 }, { "epoch": 16.157472890146156, "grad_norm": 0.019490554928779602, "learning_rate": 0.01494022251775211, "loss": 0.1927, "num_input_tokens_seen": 29670512, "step": 34270 }, { "epoch": 16.15983026874116, "grad_norm": 0.009946472942829132, "learning_rate": 0.014914605423674109, "loss": 0.1582, "num_input_tokens_seen": 29674432, "step": 34275 }, { "epoch": 16.16218764733616, "grad_norm": 0.010929797776043415, "learning_rate": 0.014889009161461525, "loss": 0.15, "num_input_tokens_seen": 29678720, "step": 34280 }, { "epoch": 16.164545025931165, "grad_norm": 0.008386784233152866, "learning_rate": 0.014863433735061665, "loss": 0.1338, "num_input_tokens_seen": 29683696, "step": 34285 }, { "epoch": 16.166902404526166, "grad_norm": 0.008403047919273376, "learning_rate": 0.014837879148418541, "loss": 0.1101, "num_input_tokens_seen": 29687824, "step": 34290 }, { "epoch": 16.16925978312117, "grad_norm": 0.009904601611196995, "learning_rate": 0.01481234540547302, "loss": 0.1115, "num_input_tokens_seen": 29692160, "step": 34295 }, { "epoch": 16.17161716171617, "grad_norm": 0.02155277132987976, "learning_rate": 0.014786832510162717, "loss": 0.1864, "num_input_tokens_seen": 29696304, "step": 34300 }, { "epoch": 16.173974540311175, "grad_norm": 0.008372167125344276, "learning_rate": 0.014761340466422017, "loss": 0.1728, "num_input_tokens_seen": 29700208, "step": 34305 }, { "epoch": 16.176331918906175, "grad_norm": 0.020202049985527992, "learning_rate": 0.014735869278182144, "loss": 0.1581, "num_input_tokens_seen": 29703920, "step": 34310 }, { "epoch": 16.17868929750118, "grad_norm": 0.008375285193324089, "learning_rate": 0.014710418949371057, "loss": 0.0898, "num_input_tokens_seen": 29708400, "step": 34315 }, { "epoch": 16.18104667609618, "grad_norm": 0.01661125011742115, "learning_rate": 0.014684989483913495, "loss": 0.1036, "num_input_tokens_seen": 29712656, "step": 34320 }, { "epoch": 16.183404054691184, "grad_norm": 0.024773744866251945, "learning_rate": 0.014659580885731077, "loss": 0.1323, "num_input_tokens_seen": 29717376, "step": 34325 }, { "epoch": 16.185761433286185, "grad_norm": 0.0049279904924333096, "learning_rate": 0.014634193158742047, "loss": 0.117, "num_input_tokens_seen": 29721200, "step": 34330 }, { "epoch": 16.18811881188119, "grad_norm": 0.009918139316141605, "learning_rate": 0.014608826306861576, "loss": 0.1296, "num_input_tokens_seen": 29725472, "step": 34335 }, { "epoch": 16.19047619047619, "grad_norm": 0.003971131052821875, "learning_rate": 0.014583480334001486, "loss": 0.139, "num_input_tokens_seen": 29729376, "step": 34340 }, { "epoch": 16.192833569071194, "grad_norm": 0.007473228499293327, "learning_rate": 0.014558155244070496, "loss": 0.1976, "num_input_tokens_seen": 29733648, "step": 34345 }, { "epoch": 16.195190947666195, "grad_norm": 0.006171034183353186, "learning_rate": 0.014532851040974036, "loss": 0.1324, "num_input_tokens_seen": 29737168, "step": 34350 }, { "epoch": 16.1975483262612, "grad_norm": 0.014796918258070946, "learning_rate": 0.014507567728614335, "loss": 0.0887, "num_input_tokens_seen": 29742768, "step": 34355 }, { "epoch": 16.1999057048562, "grad_norm": 0.004246920812875032, "learning_rate": 0.01448230531089037, "loss": 0.085, "num_input_tokens_seen": 29747328, "step": 34360 }, { "epoch": 16.202263083451204, "grad_norm": 0.002736500231549144, "learning_rate": 0.014457063791697993, "loss": 0.1054, "num_input_tokens_seen": 29751328, "step": 34365 }, { "epoch": 16.204620462046204, "grad_norm": 0.008553816936910152, "learning_rate": 0.01443184317492971, "loss": 0.1654, "num_input_tokens_seen": 29755504, "step": 34370 }, { "epoch": 16.20697784064121, "grad_norm": 0.011818241328001022, "learning_rate": 0.014406643464474822, "loss": 0.1231, "num_input_tokens_seen": 29760336, "step": 34375 }, { "epoch": 16.20933521923621, "grad_norm": 0.010860808193683624, "learning_rate": 0.014381464664219539, "loss": 0.0831, "num_input_tokens_seen": 29764752, "step": 34380 }, { "epoch": 16.211692597831213, "grad_norm": 0.009971903637051582, "learning_rate": 0.014356306778046656, "loss": 0.0758, "num_input_tokens_seen": 29768816, "step": 34385 }, { "epoch": 16.214049976426214, "grad_norm": 0.021600401028990746, "learning_rate": 0.014331169809835885, "loss": 0.1825, "num_input_tokens_seen": 29772992, "step": 34390 }, { "epoch": 16.216407355021218, "grad_norm": 0.014235047623515129, "learning_rate": 0.014306053763463644, "loss": 0.153, "num_input_tokens_seen": 29776848, "step": 34395 }, { "epoch": 16.21876473361622, "grad_norm": 0.009120653383433819, "learning_rate": 0.014280958642803147, "loss": 0.1441, "num_input_tokens_seen": 29781472, "step": 34400 }, { "epoch": 16.21876473361622, "eval_loss": 0.24954015016555786, "eval_runtime": 21.9481, "eval_samples_per_second": 42.965, "eval_steps_per_second": 21.505, "num_input_tokens_seen": 29781472, "step": 34400 }, { "epoch": 16.221122112211223, "grad_norm": 0.00858235638588667, "learning_rate": 0.014255884451724404, "loss": 0.0996, "num_input_tokens_seen": 29786416, "step": 34405 }, { "epoch": 16.223479490806223, "grad_norm": 0.010800439864397049, "learning_rate": 0.014230831194094101, "loss": 0.0756, "num_input_tokens_seen": 29790480, "step": 34410 }, { "epoch": 16.225836869401228, "grad_norm": 0.008445179089903831, "learning_rate": 0.014205798873775865, "loss": 0.1106, "num_input_tokens_seen": 29795136, "step": 34415 }, { "epoch": 16.22819424799623, "grad_norm": 0.01712847501039505, "learning_rate": 0.014180787494629893, "loss": 0.0673, "num_input_tokens_seen": 29800080, "step": 34420 }, { "epoch": 16.23055162659123, "grad_norm": 0.0026848330162465572, "learning_rate": 0.014155797060513314, "loss": 0.0754, "num_input_tokens_seen": 29804224, "step": 34425 }, { "epoch": 16.232909005186233, "grad_norm": 0.021138524636626244, "learning_rate": 0.014130827575279963, "loss": 0.1533, "num_input_tokens_seen": 29808896, "step": 34430 }, { "epoch": 16.235266383781234, "grad_norm": 0.00976987462490797, "learning_rate": 0.014105879042780427, "loss": 0.0743, "num_input_tokens_seen": 29813664, "step": 34435 }, { "epoch": 16.237623762376238, "grad_norm": 0.004852553363889456, "learning_rate": 0.014080951466862113, "loss": 0.0926, "num_input_tokens_seen": 29817184, "step": 34440 }, { "epoch": 16.23998114097124, "grad_norm": 0.004030173644423485, "learning_rate": 0.014056044851369126, "loss": 0.0587, "num_input_tokens_seen": 29821408, "step": 34445 }, { "epoch": 16.242338519566243, "grad_norm": 0.0057837339118123055, "learning_rate": 0.014031159200142428, "loss": 0.127, "num_input_tokens_seen": 29826368, "step": 34450 }, { "epoch": 16.244695898161243, "grad_norm": 0.014906048774719238, "learning_rate": 0.014006294517019667, "loss": 0.1672, "num_input_tokens_seen": 29830912, "step": 34455 }, { "epoch": 16.247053276756247, "grad_norm": 0.0037369453348219395, "learning_rate": 0.013981450805835276, "loss": 0.1381, "num_input_tokens_seen": 29835872, "step": 34460 }, { "epoch": 16.249410655351248, "grad_norm": 0.012458516284823418, "learning_rate": 0.01395662807042049, "loss": 0.1128, "num_input_tokens_seen": 29839936, "step": 34465 }, { "epoch": 16.251768033946252, "grad_norm": 0.006020260043442249, "learning_rate": 0.013931826314603296, "loss": 0.1025, "num_input_tokens_seen": 29844016, "step": 34470 }, { "epoch": 16.254125412541253, "grad_norm": 0.006804877892136574, "learning_rate": 0.013907045542208401, "loss": 0.1231, "num_input_tokens_seen": 29848512, "step": 34475 }, { "epoch": 16.256482791136257, "grad_norm": 0.017228273674845695, "learning_rate": 0.013882285757057333, "loss": 0.2225, "num_input_tokens_seen": 29853648, "step": 34480 }, { "epoch": 16.258840169731258, "grad_norm": 0.009123594500124454, "learning_rate": 0.013857546962968403, "loss": 0.1231, "num_input_tokens_seen": 29857552, "step": 34485 }, { "epoch": 16.261197548326262, "grad_norm": 0.0066408757120370865, "learning_rate": 0.013832829163756577, "loss": 0.0607, "num_input_tokens_seen": 29861472, "step": 34490 }, { "epoch": 16.263554926921262, "grad_norm": 0.012659543193876743, "learning_rate": 0.013808132363233689, "loss": 0.1055, "num_input_tokens_seen": 29865616, "step": 34495 }, { "epoch": 16.265912305516267, "grad_norm": 0.008858910761773586, "learning_rate": 0.013783456565208256, "loss": 0.0772, "num_input_tokens_seen": 29870112, "step": 34500 }, { "epoch": 16.268269684111267, "grad_norm": 0.016577212139964104, "learning_rate": 0.01375880177348564, "loss": 0.1375, "num_input_tokens_seen": 29873840, "step": 34505 }, { "epoch": 16.27062706270627, "grad_norm": 0.009850556030869484, "learning_rate": 0.013734167991867928, "loss": 0.1311, "num_input_tokens_seen": 29878320, "step": 34510 }, { "epoch": 16.272984441301272, "grad_norm": 0.0062917061150074005, "learning_rate": 0.013709555224153935, "loss": 0.1423, "num_input_tokens_seen": 29883280, "step": 34515 }, { "epoch": 16.275341819896276, "grad_norm": 0.0030026084277778864, "learning_rate": 0.013684963474139222, "loss": 0.1527, "num_input_tokens_seen": 29887600, "step": 34520 }, { "epoch": 16.277699198491277, "grad_norm": 0.01357159111648798, "learning_rate": 0.013660392745616224, "loss": 0.1067, "num_input_tokens_seen": 29891840, "step": 34525 }, { "epoch": 16.28005657708628, "grad_norm": 0.008259536698460579, "learning_rate": 0.013635843042373974, "loss": 0.1329, "num_input_tokens_seen": 29896608, "step": 34530 }, { "epoch": 16.28241395568128, "grad_norm": 0.004500230774283409, "learning_rate": 0.01361131436819843, "loss": 0.1115, "num_input_tokens_seen": 29901792, "step": 34535 }, { "epoch": 16.284771334276286, "grad_norm": 0.01196984015405178, "learning_rate": 0.013586806726872147, "loss": 0.1022, "num_input_tokens_seen": 29906048, "step": 34540 }, { "epoch": 16.287128712871286, "grad_norm": 0.023855049163103104, "learning_rate": 0.013562320122174537, "loss": 0.1964, "num_input_tokens_seen": 29910000, "step": 34545 }, { "epoch": 16.28948609146629, "grad_norm": 0.011452157981693745, "learning_rate": 0.013537854557881762, "loss": 0.1599, "num_input_tokens_seen": 29914608, "step": 34550 }, { "epoch": 16.29184347006129, "grad_norm": 0.004847514443099499, "learning_rate": 0.013513410037766687, "loss": 0.0825, "num_input_tokens_seen": 29918528, "step": 34555 }, { "epoch": 16.294200848656295, "grad_norm": 0.013607257045805454, "learning_rate": 0.013488986565598998, "loss": 0.1309, "num_input_tokens_seen": 29923232, "step": 34560 }, { "epoch": 16.296558227251296, "grad_norm": 0.00858873501420021, "learning_rate": 0.013464584145145097, "loss": 0.1285, "num_input_tokens_seen": 29927520, "step": 34565 }, { "epoch": 16.2989156058463, "grad_norm": 0.007658522110432386, "learning_rate": 0.013440202780168109, "loss": 0.1252, "num_input_tokens_seen": 29932176, "step": 34570 }, { "epoch": 16.3012729844413, "grad_norm": 0.004131216090172529, "learning_rate": 0.01341584247442799, "loss": 0.1646, "num_input_tokens_seen": 29936880, "step": 34575 }, { "epoch": 16.303630363036305, "grad_norm": 0.004858119413256645, "learning_rate": 0.013391503231681355, "loss": 0.0712, "num_input_tokens_seen": 29941360, "step": 34580 }, { "epoch": 16.305987741631306, "grad_norm": 0.004984191618859768, "learning_rate": 0.013367185055681685, "loss": 0.0622, "num_input_tokens_seen": 29945920, "step": 34585 }, { "epoch": 16.30834512022631, "grad_norm": 0.010351119562983513, "learning_rate": 0.013342887950179095, "loss": 0.1319, "num_input_tokens_seen": 29950912, "step": 34590 }, { "epoch": 16.31070249882131, "grad_norm": 0.008251049555838108, "learning_rate": 0.013318611918920554, "loss": 0.083, "num_input_tokens_seen": 29955408, "step": 34595 }, { "epoch": 16.313059877416315, "grad_norm": 0.017348725348711014, "learning_rate": 0.01329435696564965, "loss": 0.1606, "num_input_tokens_seen": 29959568, "step": 34600 }, { "epoch": 16.313059877416315, "eval_loss": 0.25914889574050903, "eval_runtime": 21.9054, "eval_samples_per_second": 43.049, "eval_steps_per_second": 21.547, "num_input_tokens_seen": 29959568, "step": 34600 }, { "epoch": 16.315417256011315, "grad_norm": 0.017015403136610985, "learning_rate": 0.013270123094106894, "loss": 0.226, "num_input_tokens_seen": 29964400, "step": 34605 }, { "epoch": 16.31777463460632, "grad_norm": 0.022247862070798874, "learning_rate": 0.013245910308029395, "loss": 0.1817, "num_input_tokens_seen": 29968304, "step": 34610 }, { "epoch": 16.32013201320132, "grad_norm": 0.008536193519830704, "learning_rate": 0.0132217186111511, "loss": 0.1196, "num_input_tokens_seen": 29973008, "step": 34615 }, { "epoch": 16.32248939179632, "grad_norm": 0.019585397094488144, "learning_rate": 0.013197548007202626, "loss": 0.1354, "num_input_tokens_seen": 29977168, "step": 34620 }, { "epoch": 16.324846770391325, "grad_norm": 0.014536433853209019, "learning_rate": 0.01317339849991142, "loss": 0.1086, "num_input_tokens_seen": 29981760, "step": 34625 }, { "epoch": 16.327204148986326, "grad_norm": 0.007709859870374203, "learning_rate": 0.013149270093001675, "loss": 0.1206, "num_input_tokens_seen": 29985456, "step": 34630 }, { "epoch": 16.32956152758133, "grad_norm": 0.019150685518980026, "learning_rate": 0.013125162790194227, "loss": 0.1136, "num_input_tokens_seen": 29989616, "step": 34635 }, { "epoch": 16.33191890617633, "grad_norm": 0.0024337507784366608, "learning_rate": 0.01310107659520674, "loss": 0.0821, "num_input_tokens_seen": 29994160, "step": 34640 }, { "epoch": 16.334276284771335, "grad_norm": 0.022555958479642868, "learning_rate": 0.013077011511753655, "loss": 0.1087, "num_input_tokens_seen": 29998432, "step": 34645 }, { "epoch": 16.336633663366335, "grad_norm": 0.020689763128757477, "learning_rate": 0.013052967543546056, "loss": 0.1549, "num_input_tokens_seen": 30002992, "step": 34650 }, { "epoch": 16.33899104196134, "grad_norm": 0.009866178967058659, "learning_rate": 0.01302894469429186, "loss": 0.1435, "num_input_tokens_seen": 30006832, "step": 34655 }, { "epoch": 16.34134842055634, "grad_norm": 0.012270376086235046, "learning_rate": 0.013004942967695653, "loss": 0.1944, "num_input_tokens_seen": 30010400, "step": 34660 }, { "epoch": 16.343705799151344, "grad_norm": 0.00685886712744832, "learning_rate": 0.012980962367458859, "loss": 0.1062, "num_input_tokens_seen": 30014880, "step": 34665 }, { "epoch": 16.346063177746345, "grad_norm": 0.005834975279867649, "learning_rate": 0.012957002897279567, "loss": 0.0517, "num_input_tokens_seen": 30019328, "step": 34670 }, { "epoch": 16.34842055634135, "grad_norm": 0.01022653840482235, "learning_rate": 0.012933064560852576, "loss": 0.1214, "num_input_tokens_seen": 30023888, "step": 34675 }, { "epoch": 16.35077793493635, "grad_norm": 0.008776349015533924, "learning_rate": 0.012909147361869527, "loss": 0.0944, "num_input_tokens_seen": 30030016, "step": 34680 }, { "epoch": 16.353135313531354, "grad_norm": 0.001837646821513772, "learning_rate": 0.012885251304018774, "loss": 0.0604, "num_input_tokens_seen": 30034256, "step": 34685 }, { "epoch": 16.355492692126354, "grad_norm": 0.018766913563013077, "learning_rate": 0.012861376390985335, "loss": 0.1473, "num_input_tokens_seen": 30039488, "step": 34690 }, { "epoch": 16.35785007072136, "grad_norm": 0.01562689244747162, "learning_rate": 0.012837522626451063, "loss": 0.1066, "num_input_tokens_seen": 30044512, "step": 34695 }, { "epoch": 16.36020744931636, "grad_norm": 0.0022444245405495167, "learning_rate": 0.01281369001409447, "loss": 0.0492, "num_input_tokens_seen": 30048272, "step": 34700 }, { "epoch": 16.362564827911363, "grad_norm": 0.00583273358643055, "learning_rate": 0.012789878557590877, "loss": 0.1079, "num_input_tokens_seen": 30052768, "step": 34705 }, { "epoch": 16.364922206506364, "grad_norm": 0.007334201596677303, "learning_rate": 0.012766088260612334, "loss": 0.1351, "num_input_tokens_seen": 30057568, "step": 34710 }, { "epoch": 16.367279585101368, "grad_norm": 0.008131795562803745, "learning_rate": 0.012742319126827523, "loss": 0.0645, "num_input_tokens_seen": 30061856, "step": 34715 }, { "epoch": 16.36963696369637, "grad_norm": 0.017093287780880928, "learning_rate": 0.012718571159902008, "loss": 0.1468, "num_input_tokens_seen": 30066704, "step": 34720 }, { "epoch": 16.371994342291373, "grad_norm": 0.007563186343759298, "learning_rate": 0.01269484436349803, "loss": 0.1399, "num_input_tokens_seen": 30071040, "step": 34725 }, { "epoch": 16.374351720886374, "grad_norm": 0.011832297779619694, "learning_rate": 0.012671138741274528, "loss": 0.1369, "num_input_tokens_seen": 30075776, "step": 34730 }, { "epoch": 16.376709099481378, "grad_norm": 0.01238256972283125, "learning_rate": 0.012647454296887194, "loss": 0.0931, "num_input_tokens_seen": 30079520, "step": 34735 }, { "epoch": 16.37906647807638, "grad_norm": 0.0019004677888005972, "learning_rate": 0.012623791033988507, "loss": 0.1415, "num_input_tokens_seen": 30083888, "step": 34740 }, { "epoch": 16.381423856671383, "grad_norm": 0.011642579920589924, "learning_rate": 0.012600148956227597, "loss": 0.1875, "num_input_tokens_seen": 30087936, "step": 34745 }, { "epoch": 16.383781235266383, "grad_norm": 0.01616702228784561, "learning_rate": 0.012576528067250414, "loss": 0.1014, "num_input_tokens_seen": 30092272, "step": 34750 }, { "epoch": 16.386138613861387, "grad_norm": 0.01834462396800518, "learning_rate": 0.012552928370699561, "loss": 0.1512, "num_input_tokens_seen": 30096144, "step": 34755 }, { "epoch": 16.388495992456388, "grad_norm": 0.0014152417425066233, "learning_rate": 0.012529349870214411, "loss": 0.1572, "num_input_tokens_seen": 30100464, "step": 34760 }, { "epoch": 16.390853371051392, "grad_norm": 0.014970878139138222, "learning_rate": 0.012505792569431106, "loss": 0.1194, "num_input_tokens_seen": 30104960, "step": 34765 }, { "epoch": 16.393210749646393, "grad_norm": 0.016629910096526146, "learning_rate": 0.012482256471982422, "loss": 0.0894, "num_input_tokens_seen": 30109552, "step": 34770 }, { "epoch": 16.395568128241397, "grad_norm": 0.009407327510416508, "learning_rate": 0.012458741581497956, "loss": 0.0917, "num_input_tokens_seen": 30113648, "step": 34775 }, { "epoch": 16.397925506836398, "grad_norm": 0.008620122447609901, "learning_rate": 0.012435247901603974, "loss": 0.0731, "num_input_tokens_seen": 30118576, "step": 34780 }, { "epoch": 16.400282885431402, "grad_norm": 0.006362468469887972, "learning_rate": 0.012411775435923528, "loss": 0.1279, "num_input_tokens_seen": 30122752, "step": 34785 }, { "epoch": 16.402640264026402, "grad_norm": 0.016277378425002098, "learning_rate": 0.012388324188076354, "loss": 0.0719, "num_input_tokens_seen": 30127472, "step": 34790 }, { "epoch": 16.404997642621407, "grad_norm": 0.003680358175188303, "learning_rate": 0.012364894161678913, "loss": 0.1331, "num_input_tokens_seen": 30131008, "step": 34795 }, { "epoch": 16.407355021216407, "grad_norm": 0.01733982376754284, "learning_rate": 0.012341485360344445, "loss": 0.1382, "num_input_tokens_seen": 30134704, "step": 34800 }, { "epoch": 16.407355021216407, "eval_loss": 0.26583540439605713, "eval_runtime": 21.8943, "eval_samples_per_second": 43.071, "eval_steps_per_second": 21.558, "num_input_tokens_seen": 30134704, "step": 34800 }, { "epoch": 16.40971239981141, "grad_norm": 0.017474906519055367, "learning_rate": 0.01231809778768283, "loss": 0.0805, "num_input_tokens_seen": 30138832, "step": 34805 }, { "epoch": 16.412069778406412, "grad_norm": 0.006551632657647133, "learning_rate": 0.012294731447300799, "loss": 0.0908, "num_input_tokens_seen": 30143408, "step": 34810 }, { "epoch": 16.414427157001413, "grad_norm": 0.00411498686298728, "learning_rate": 0.012271386342801671, "loss": 0.0802, "num_input_tokens_seen": 30148176, "step": 34815 }, { "epoch": 16.416784535596417, "grad_norm": 0.0105154262855649, "learning_rate": 0.012248062477785565, "loss": 0.09, "num_input_tokens_seen": 30152240, "step": 34820 }, { "epoch": 16.419141914191417, "grad_norm": 0.008995122276246548, "learning_rate": 0.012224759855849305, "loss": 0.1283, "num_input_tokens_seen": 30156832, "step": 34825 }, { "epoch": 16.42149929278642, "grad_norm": 0.012466702610254288, "learning_rate": 0.012201478480586513, "loss": 0.1027, "num_input_tokens_seen": 30161280, "step": 34830 }, { "epoch": 16.423856671381422, "grad_norm": 0.006256289314478636, "learning_rate": 0.012178218355587389, "loss": 0.1305, "num_input_tokens_seen": 30166240, "step": 34835 }, { "epoch": 16.426214049976426, "grad_norm": 0.005141161382198334, "learning_rate": 0.01215497948443896, "loss": 0.0952, "num_input_tokens_seen": 30170624, "step": 34840 }, { "epoch": 16.428571428571427, "grad_norm": 0.006387734320014715, "learning_rate": 0.012131761870724993, "loss": 0.1236, "num_input_tokens_seen": 30174816, "step": 34845 }, { "epoch": 16.43092880716643, "grad_norm": 0.014991546981036663, "learning_rate": 0.012108565518025893, "loss": 0.1224, "num_input_tokens_seen": 30179520, "step": 34850 }, { "epoch": 16.433286185761432, "grad_norm": 0.011947426944971085, "learning_rate": 0.012085390429918862, "loss": 0.1436, "num_input_tokens_seen": 30183328, "step": 34855 }, { "epoch": 16.435643564356436, "grad_norm": 0.009861820377409458, "learning_rate": 0.012062236609977744, "loss": 0.1611, "num_input_tokens_seen": 30187344, "step": 34860 }, { "epoch": 16.438000942951437, "grad_norm": 0.013246297836303711, "learning_rate": 0.01203910406177318, "loss": 0.0905, "num_input_tokens_seen": 30191168, "step": 34865 }, { "epoch": 16.44035832154644, "grad_norm": 0.012861029244959354, "learning_rate": 0.01201599278887252, "loss": 0.1707, "num_input_tokens_seen": 30195776, "step": 34870 }, { "epoch": 16.44271570014144, "grad_norm": 0.011228815652430058, "learning_rate": 0.011992902794839744, "loss": 0.1348, "num_input_tokens_seen": 30199904, "step": 34875 }, { "epoch": 16.445073078736446, "grad_norm": 0.002383790910243988, "learning_rate": 0.011969834083235703, "loss": 0.0874, "num_input_tokens_seen": 30204144, "step": 34880 }, { "epoch": 16.447430457331446, "grad_norm": 0.014992046169936657, "learning_rate": 0.011946786657617836, "loss": 0.1404, "num_input_tokens_seen": 30208304, "step": 34885 }, { "epoch": 16.44978783592645, "grad_norm": 0.011577366851270199, "learning_rate": 0.011923760521540332, "loss": 0.1465, "num_input_tokens_seen": 30213600, "step": 34890 }, { "epoch": 16.45214521452145, "grad_norm": 0.020672926679253578, "learning_rate": 0.011900755678554153, "loss": 0.2086, "num_input_tokens_seen": 30217296, "step": 34895 }, { "epoch": 16.454502593116455, "grad_norm": 0.014294707216322422, "learning_rate": 0.011877772132206893, "loss": 0.1212, "num_input_tokens_seen": 30222096, "step": 34900 }, { "epoch": 16.456859971711456, "grad_norm": 0.003543161554262042, "learning_rate": 0.011854809886042915, "loss": 0.1296, "num_input_tokens_seen": 30226448, "step": 34905 }, { "epoch": 16.45921735030646, "grad_norm": 0.010926477611064911, "learning_rate": 0.011831868943603325, "loss": 0.1214, "num_input_tokens_seen": 30230176, "step": 34910 }, { "epoch": 16.46157472890146, "grad_norm": 0.014689477160573006, "learning_rate": 0.011808949308425836, "loss": 0.0857, "num_input_tokens_seen": 30234080, "step": 34915 }, { "epoch": 16.463932107496465, "grad_norm": 0.016087887808680534, "learning_rate": 0.01178605098404501, "loss": 0.1082, "num_input_tokens_seen": 30238112, "step": 34920 }, { "epoch": 16.466289486091465, "grad_norm": 0.006383042316883802, "learning_rate": 0.011763173973992002, "loss": 0.1542, "num_input_tokens_seen": 30242384, "step": 34925 }, { "epoch": 16.46864686468647, "grad_norm": 0.006607703864574432, "learning_rate": 0.011740318281794776, "loss": 0.1027, "num_input_tokens_seen": 30246880, "step": 34930 }, { "epoch": 16.47100424328147, "grad_norm": 0.005224226973950863, "learning_rate": 0.01171748391097796, "loss": 0.0954, "num_input_tokens_seen": 30251216, "step": 34935 }, { "epoch": 16.473361621876474, "grad_norm": 0.015584144741296768, "learning_rate": 0.011694670865062873, "loss": 0.1191, "num_input_tokens_seen": 30254464, "step": 34940 }, { "epoch": 16.475719000471475, "grad_norm": 0.0060554808005690575, "learning_rate": 0.011671879147567616, "loss": 0.1256, "num_input_tokens_seen": 30259136, "step": 34945 }, { "epoch": 16.47807637906648, "grad_norm": 0.008802007883787155, "learning_rate": 0.011649108762006893, "loss": 0.0973, "num_input_tokens_seen": 30263200, "step": 34950 }, { "epoch": 16.48043375766148, "grad_norm": 0.013043027371168137, "learning_rate": 0.011626359711892265, "loss": 0.0796, "num_input_tokens_seen": 30268096, "step": 34955 }, { "epoch": 16.482791136256484, "grad_norm": 0.0042444998398423195, "learning_rate": 0.01160363200073189, "loss": 0.1074, "num_input_tokens_seen": 30272480, "step": 34960 }, { "epoch": 16.485148514851485, "grad_norm": 0.01721043325960636, "learning_rate": 0.011580925632030614, "loss": 0.1518, "num_input_tokens_seen": 30276624, "step": 34965 }, { "epoch": 16.48750589344649, "grad_norm": 0.014687670394778252, "learning_rate": 0.011558240609290104, "loss": 0.1819, "num_input_tokens_seen": 30280544, "step": 34970 }, { "epoch": 16.48986327204149, "grad_norm": 0.0046437084674835205, "learning_rate": 0.011535576936008679, "loss": 0.1382, "num_input_tokens_seen": 30284112, "step": 34975 }, { "epoch": 16.492220650636494, "grad_norm": 0.020965280011296272, "learning_rate": 0.011512934615681309, "loss": 0.1359, "num_input_tokens_seen": 30288672, "step": 34980 }, { "epoch": 16.494578029231494, "grad_norm": 0.013096784241497517, "learning_rate": 0.011490313651799765, "loss": 0.0841, "num_input_tokens_seen": 30293120, "step": 34985 }, { "epoch": 16.4969354078265, "grad_norm": 0.04140080139040947, "learning_rate": 0.011467714047852512, "loss": 0.1904, "num_input_tokens_seen": 30297040, "step": 34990 }, { "epoch": 16.4992927864215, "grad_norm": 0.009016991592943668, "learning_rate": 0.011445135807324624, "loss": 0.0999, "num_input_tokens_seen": 30301328, "step": 34995 }, { "epoch": 16.501650165016503, "grad_norm": 0.015973450616002083, "learning_rate": 0.011422578933698002, "loss": 0.096, "num_input_tokens_seen": 30305200, "step": 35000 }, { "epoch": 16.501650165016503, "eval_loss": 0.26546040177345276, "eval_runtime": 21.8685, "eval_samples_per_second": 43.121, "eval_steps_per_second": 21.584, "num_input_tokens_seen": 30305200, "step": 35000 }, { "epoch": 16.504007543611504, "grad_norm": 0.0027341393288224936, "learning_rate": 0.011400043430451161, "loss": 0.1348, "num_input_tokens_seen": 30309360, "step": 35005 }, { "epoch": 16.506364922206508, "grad_norm": 0.013696285896003246, "learning_rate": 0.011377529301059392, "loss": 0.0945, "num_input_tokens_seen": 30314256, "step": 35010 }, { "epoch": 16.50872230080151, "grad_norm": 0.005790447350591421, "learning_rate": 0.011355036548994646, "loss": 0.0959, "num_input_tokens_seen": 30318544, "step": 35015 }, { "epoch": 16.51107967939651, "grad_norm": 0.01679118163883686, "learning_rate": 0.011332565177725584, "loss": 0.1827, "num_input_tokens_seen": 30322432, "step": 35020 }, { "epoch": 16.513437057991514, "grad_norm": 0.01996285282075405, "learning_rate": 0.011310115190717585, "loss": 0.1285, "num_input_tokens_seen": 30326176, "step": 35025 }, { "epoch": 16.515794436586514, "grad_norm": 0.016931995749473572, "learning_rate": 0.01128768659143271, "loss": 0.1119, "num_input_tokens_seen": 30330944, "step": 35030 }, { "epoch": 16.51815181518152, "grad_norm": 0.023037662729620934, "learning_rate": 0.011265279383329713, "loss": 0.1837, "num_input_tokens_seen": 30334944, "step": 35035 }, { "epoch": 16.52050919377652, "grad_norm": 0.0054449113085865974, "learning_rate": 0.01124289356986411, "loss": 0.1025, "num_input_tokens_seen": 30338688, "step": 35040 }, { "epoch": 16.522866572371523, "grad_norm": 0.0142629723995924, "learning_rate": 0.011220529154488023, "loss": 0.1037, "num_input_tokens_seen": 30342880, "step": 35045 }, { "epoch": 16.525223950966524, "grad_norm": 0.014100898988544941, "learning_rate": 0.011198186140650346, "loss": 0.0923, "num_input_tokens_seen": 30347168, "step": 35050 }, { "epoch": 16.527581329561528, "grad_norm": 0.019583038985729218, "learning_rate": 0.011175864531796685, "loss": 0.1557, "num_input_tokens_seen": 30351568, "step": 35055 }, { "epoch": 16.52993870815653, "grad_norm": 0.007302261423319578, "learning_rate": 0.011153564331369258, "loss": 0.1035, "num_input_tokens_seen": 30356048, "step": 35060 }, { "epoch": 16.532296086751533, "grad_norm": 0.014552667737007141, "learning_rate": 0.011131285542807078, "loss": 0.1251, "num_input_tokens_seen": 30360112, "step": 35065 }, { "epoch": 16.534653465346533, "grad_norm": 0.018458813428878784, "learning_rate": 0.011109028169545815, "loss": 0.1578, "num_input_tokens_seen": 30364288, "step": 35070 }, { "epoch": 16.537010843941538, "grad_norm": 0.004091160837560892, "learning_rate": 0.011086792215017804, "loss": 0.0844, "num_input_tokens_seen": 30369280, "step": 35075 }, { "epoch": 16.539368222536538, "grad_norm": 0.014887602999806404, "learning_rate": 0.011064577682652137, "loss": 0.1375, "num_input_tokens_seen": 30373584, "step": 35080 }, { "epoch": 16.541725601131542, "grad_norm": 0.017657272517681122, "learning_rate": 0.011042384575874559, "loss": 0.073, "num_input_tokens_seen": 30378016, "step": 35085 }, { "epoch": 16.544082979726543, "grad_norm": 0.00416394229978323, "learning_rate": 0.011020212898107512, "loss": 0.107, "num_input_tokens_seen": 30382080, "step": 35090 }, { "epoch": 16.546440358321547, "grad_norm": 0.01989159733057022, "learning_rate": 0.010998062652770197, "loss": 0.2031, "num_input_tokens_seen": 30386656, "step": 35095 }, { "epoch": 16.548797736916548, "grad_norm": 0.010066955350339413, "learning_rate": 0.010975933843278428, "loss": 0.1106, "num_input_tokens_seen": 30391568, "step": 35100 }, { "epoch": 16.551155115511552, "grad_norm": 0.026892730966210365, "learning_rate": 0.010953826473044714, "loss": 0.1546, "num_input_tokens_seen": 30395712, "step": 35105 }, { "epoch": 16.553512494106553, "grad_norm": 0.010290831327438354, "learning_rate": 0.010931740545478357, "loss": 0.1838, "num_input_tokens_seen": 30400624, "step": 35110 }, { "epoch": 16.555869872701557, "grad_norm": 0.020022213459014893, "learning_rate": 0.010909676063985218, "loss": 0.1727, "num_input_tokens_seen": 30404512, "step": 35115 }, { "epoch": 16.558227251296557, "grad_norm": 0.00794756505638361, "learning_rate": 0.010887633031967974, "loss": 0.1039, "num_input_tokens_seen": 30410432, "step": 35120 }, { "epoch": 16.56058462989156, "grad_norm": 0.025536654517054558, "learning_rate": 0.01086561145282589, "loss": 0.1242, "num_input_tokens_seen": 30415120, "step": 35125 }, { "epoch": 16.562942008486562, "grad_norm": 0.012409742921590805, "learning_rate": 0.010843611329954983, "loss": 0.1031, "num_input_tokens_seen": 30419056, "step": 35130 }, { "epoch": 16.565299387081566, "grad_norm": 0.008502207696437836, "learning_rate": 0.010821632666747988, "loss": 0.094, "num_input_tokens_seen": 30423296, "step": 35135 }, { "epoch": 16.567656765676567, "grad_norm": 0.00784034002572298, "learning_rate": 0.010799675466594244, "loss": 0.0763, "num_input_tokens_seen": 30427296, "step": 35140 }, { "epoch": 16.57001414427157, "grad_norm": 0.018319740891456604, "learning_rate": 0.010777739732879826, "loss": 0.1287, "num_input_tokens_seen": 30431152, "step": 35145 }, { "epoch": 16.572371522866572, "grad_norm": 0.009106172248721123, "learning_rate": 0.010755825468987562, "loss": 0.0909, "num_input_tokens_seen": 30435728, "step": 35150 }, { "epoch": 16.574728901461576, "grad_norm": 0.010628446005284786, "learning_rate": 0.010733932678296814, "loss": 0.1358, "num_input_tokens_seen": 30440368, "step": 35155 }, { "epoch": 16.577086280056577, "grad_norm": 0.016367647796869278, "learning_rate": 0.010712061364183817, "loss": 0.1205, "num_input_tokens_seen": 30444336, "step": 35160 }, { "epoch": 16.57944365865158, "grad_norm": 0.0246882364153862, "learning_rate": 0.010690211530021337, "loss": 0.0974, "num_input_tokens_seen": 30448768, "step": 35165 }, { "epoch": 16.58180103724658, "grad_norm": 0.011260773055255413, "learning_rate": 0.01066838317917893, "loss": 0.1205, "num_input_tokens_seen": 30453632, "step": 35170 }, { "epoch": 16.584158415841586, "grad_norm": 0.015185331925749779, "learning_rate": 0.010646576315022787, "loss": 0.1065, "num_input_tokens_seen": 30458192, "step": 35175 }, { "epoch": 16.586515794436586, "grad_norm": 0.019020820036530495, "learning_rate": 0.010624790940915785, "loss": 0.119, "num_input_tokens_seen": 30461632, "step": 35180 }, { "epoch": 16.58887317303159, "grad_norm": 0.015028572641313076, "learning_rate": 0.0106030270602175, "loss": 0.1548, "num_input_tokens_seen": 30466064, "step": 35185 }, { "epoch": 16.59123055162659, "grad_norm": 0.0103147579357028, "learning_rate": 0.010581284676284252, "loss": 0.1212, "num_input_tokens_seen": 30470256, "step": 35190 }, { "epoch": 16.593587930221595, "grad_norm": 0.01812771148979664, "learning_rate": 0.010559563792468923, "loss": 0.0839, "num_input_tokens_seen": 30474096, "step": 35195 }, { "epoch": 16.595945308816596, "grad_norm": 0.006949461996555328, "learning_rate": 0.010537864412121217, "loss": 0.1434, "num_input_tokens_seen": 30478576, "step": 35200 }, { "epoch": 16.595945308816596, "eval_loss": 0.27300241589546204, "eval_runtime": 21.9431, "eval_samples_per_second": 42.975, "eval_steps_per_second": 21.51, "num_input_tokens_seen": 30478576, "step": 35200 }, { "epoch": 16.5983026874116, "grad_norm": 0.014065522700548172, "learning_rate": 0.010516186538587357, "loss": 0.1572, "num_input_tokens_seen": 30483200, "step": 35205 }, { "epoch": 16.6006600660066, "grad_norm": 0.027471262961626053, "learning_rate": 0.01049453017521042, "loss": 0.1338, "num_input_tokens_seen": 30486960, "step": 35210 }, { "epoch": 16.603017444601605, "grad_norm": 0.009548719972372055, "learning_rate": 0.010472895325330083, "loss": 0.1447, "num_input_tokens_seen": 30491792, "step": 35215 }, { "epoch": 16.605374823196605, "grad_norm": 0.02331313118338585, "learning_rate": 0.010451281992282662, "loss": 0.1631, "num_input_tokens_seen": 30495712, "step": 35220 }, { "epoch": 16.607732201791606, "grad_norm": 0.006815020926296711, "learning_rate": 0.01042969017940124, "loss": 0.0761, "num_input_tokens_seen": 30500096, "step": 35225 }, { "epoch": 16.61008958038661, "grad_norm": 0.02551700361073017, "learning_rate": 0.01040811989001557, "loss": 0.1431, "num_input_tokens_seen": 30505152, "step": 35230 }, { "epoch": 16.61244695898161, "grad_norm": 0.014257312752306461, "learning_rate": 0.010386571127451992, "loss": 0.1543, "num_input_tokens_seen": 30509504, "step": 35235 }, { "epoch": 16.614804337576615, "grad_norm": 0.026753578335046768, "learning_rate": 0.010365043895033682, "loss": 0.1776, "num_input_tokens_seen": 30513616, "step": 35240 }, { "epoch": 16.617161716171616, "grad_norm": 0.021302232518792152, "learning_rate": 0.010343538196080365, "loss": 0.1102, "num_input_tokens_seen": 30517792, "step": 35245 }, { "epoch": 16.61951909476662, "grad_norm": 0.021725891157984734, "learning_rate": 0.010322054033908457, "loss": 0.1398, "num_input_tokens_seen": 30522896, "step": 35250 }, { "epoch": 16.62187647336162, "grad_norm": 0.004303533583879471, "learning_rate": 0.010300591411831156, "loss": 0.0753, "num_input_tokens_seen": 30527232, "step": 35255 }, { "epoch": 16.624233851956625, "grad_norm": 0.008826233446598053, "learning_rate": 0.010279150333158198, "loss": 0.0834, "num_input_tokens_seen": 30531872, "step": 35260 }, { "epoch": 16.626591230551625, "grad_norm": 0.0048581636510789394, "learning_rate": 0.010257730801196107, "loss": 0.1003, "num_input_tokens_seen": 30536256, "step": 35265 }, { "epoch": 16.62894860914663, "grad_norm": 0.005547201726585627, "learning_rate": 0.010236332819248056, "loss": 0.0819, "num_input_tokens_seen": 30540528, "step": 35270 }, { "epoch": 16.63130598774163, "grad_norm": 0.012261842377483845, "learning_rate": 0.010214956390613854, "loss": 0.221, "num_input_tokens_seen": 30544784, "step": 35275 }, { "epoch": 16.633663366336634, "grad_norm": 0.004975807853043079, "learning_rate": 0.010193601518590034, "loss": 0.1179, "num_input_tokens_seen": 30548768, "step": 35280 }, { "epoch": 16.636020744931635, "grad_norm": 0.003555319970473647, "learning_rate": 0.010172268206469758, "loss": 0.1024, "num_input_tokens_seen": 30552656, "step": 35285 }, { "epoch": 16.63837812352664, "grad_norm": 0.0189500842243433, "learning_rate": 0.010150956457542897, "loss": 0.0873, "num_input_tokens_seen": 30556960, "step": 35290 }, { "epoch": 16.64073550212164, "grad_norm": 0.019520742818713188, "learning_rate": 0.010129666275096054, "loss": 0.1037, "num_input_tokens_seen": 30561088, "step": 35295 }, { "epoch": 16.643092880716644, "grad_norm": 0.016071228310465813, "learning_rate": 0.010108397662412338, "loss": 0.1011, "num_input_tokens_seen": 30564768, "step": 35300 }, { "epoch": 16.645450259311644, "grad_norm": 0.020050635561347008, "learning_rate": 0.010087150622771707, "loss": 0.2454, "num_input_tokens_seen": 30569632, "step": 35305 }, { "epoch": 16.64780763790665, "grad_norm": 0.005001400131732225, "learning_rate": 0.010065925159450739, "loss": 0.0897, "num_input_tokens_seen": 30574224, "step": 35310 }, { "epoch": 16.65016501650165, "grad_norm": 0.006814736872911453, "learning_rate": 0.010044721275722618, "loss": 0.0678, "num_input_tokens_seen": 30578512, "step": 35315 }, { "epoch": 16.652522395096653, "grad_norm": 0.015536016784608364, "learning_rate": 0.01002353897485726, "loss": 0.1224, "num_input_tokens_seen": 30582864, "step": 35320 }, { "epoch": 16.654879773691654, "grad_norm": 0.012418797239661217, "learning_rate": 0.010002378260121236, "loss": 0.0846, "num_input_tokens_seen": 30587216, "step": 35325 }, { "epoch": 16.65723715228666, "grad_norm": 0.0077856313437223434, "learning_rate": 0.009981239134777786, "loss": 0.0882, "num_input_tokens_seen": 30591120, "step": 35330 }, { "epoch": 16.65959453088166, "grad_norm": 0.02454352378845215, "learning_rate": 0.009960121602086884, "loss": 0.18, "num_input_tokens_seen": 30595472, "step": 35335 }, { "epoch": 16.661951909476663, "grad_norm": 0.021875590085983276, "learning_rate": 0.009939025665305062, "loss": 0.1213, "num_input_tokens_seen": 30599776, "step": 35340 }, { "epoch": 16.664309288071664, "grad_norm": 0.01761482283473015, "learning_rate": 0.009917951327685597, "loss": 0.1064, "num_input_tokens_seen": 30603616, "step": 35345 }, { "epoch": 16.666666666666668, "grad_norm": 0.02375260926783085, "learning_rate": 0.009896898592478425, "loss": 0.0884, "num_input_tokens_seen": 30607376, "step": 35350 }, { "epoch": 16.66902404526167, "grad_norm": 0.014958813786506653, "learning_rate": 0.009875867462930132, "loss": 0.1322, "num_input_tokens_seen": 30611936, "step": 35355 }, { "epoch": 16.671381423856673, "grad_norm": 0.01618078164756298, "learning_rate": 0.009854857942284006, "loss": 0.2136, "num_input_tokens_seen": 30615552, "step": 35360 }, { "epoch": 16.673738802451673, "grad_norm": 0.010000337846577168, "learning_rate": 0.009833870033779923, "loss": 0.0936, "num_input_tokens_seen": 30619312, "step": 35365 }, { "epoch": 16.676096181046677, "grad_norm": 0.004790876992046833, "learning_rate": 0.009812903740654527, "loss": 0.0983, "num_input_tokens_seen": 30623456, "step": 35370 }, { "epoch": 16.678453559641678, "grad_norm": 0.015369550324976444, "learning_rate": 0.009791959066141097, "loss": 0.1077, "num_input_tokens_seen": 30627392, "step": 35375 }, { "epoch": 16.680810938236682, "grad_norm": 0.01710900105535984, "learning_rate": 0.009771036013469537, "loss": 0.1505, "num_input_tokens_seen": 30630864, "step": 35380 }, { "epoch": 16.683168316831683, "grad_norm": 0.007363800425082445, "learning_rate": 0.00975013458586646, "loss": 0.0919, "num_input_tokens_seen": 30634592, "step": 35385 }, { "epoch": 16.685525695426687, "grad_norm": 0.015121519565582275, "learning_rate": 0.009729254786555107, "loss": 0.0972, "num_input_tokens_seen": 30639120, "step": 35390 }, { "epoch": 16.687883074021688, "grad_norm": 0.003069967729970813, "learning_rate": 0.009708396618755421, "loss": 0.1346, "num_input_tokens_seen": 30643104, "step": 35395 }, { "epoch": 16.690240452616692, "grad_norm": 0.010551847517490387, "learning_rate": 0.009687560085683994, "loss": 0.0859, "num_input_tokens_seen": 30647744, "step": 35400 }, { "epoch": 16.690240452616692, "eval_loss": 0.26213181018829346, "eval_runtime": 21.9246, "eval_samples_per_second": 43.011, "eval_steps_per_second": 21.528, "num_input_tokens_seen": 30647744, "step": 35400 }, { "epoch": 16.692597831211692, "grad_norm": 0.016356129199266434, "learning_rate": 0.009666745190554054, "loss": 0.1053, "num_input_tokens_seen": 30652144, "step": 35405 }, { "epoch": 16.694955209806697, "grad_norm": 0.013923811726272106, "learning_rate": 0.009645951936575553, "loss": 0.1144, "num_input_tokens_seen": 30656496, "step": 35410 }, { "epoch": 16.697312588401697, "grad_norm": 0.012098482809960842, "learning_rate": 0.00962518032695509, "loss": 0.0996, "num_input_tokens_seen": 30660528, "step": 35415 }, { "epoch": 16.6996699669967, "grad_norm": 0.0025649776216596365, "learning_rate": 0.009604430364895855, "loss": 0.0971, "num_input_tokens_seen": 30665744, "step": 35420 }, { "epoch": 16.702027345591702, "grad_norm": 0.021852515637874603, "learning_rate": 0.00958370205359777, "loss": 0.0825, "num_input_tokens_seen": 30669904, "step": 35425 }, { "epoch": 16.704384724186703, "grad_norm": 0.01130243856459856, "learning_rate": 0.009562995396257445, "loss": 0.0917, "num_input_tokens_seen": 30673920, "step": 35430 }, { "epoch": 16.706742102781707, "grad_norm": 0.01743270270526409, "learning_rate": 0.009542310396068026, "loss": 0.1056, "num_input_tokens_seen": 30678784, "step": 35435 }, { "epoch": 16.709099481376708, "grad_norm": 0.016530361026525497, "learning_rate": 0.009521647056219495, "loss": 0.1001, "num_input_tokens_seen": 30683168, "step": 35440 }, { "epoch": 16.71145685997171, "grad_norm": 0.023032763972878456, "learning_rate": 0.00950100537989832, "loss": 0.1294, "num_input_tokens_seen": 30687664, "step": 35445 }, { "epoch": 16.713814238566712, "grad_norm": 0.014949853532016277, "learning_rate": 0.00948038537028772, "loss": 0.1489, "num_input_tokens_seen": 30691904, "step": 35450 }, { "epoch": 16.716171617161717, "grad_norm": 0.004398577380925417, "learning_rate": 0.009459787030567617, "loss": 0.0698, "num_input_tokens_seen": 30695600, "step": 35455 }, { "epoch": 16.718528995756717, "grad_norm": 0.010042480193078518, "learning_rate": 0.00943921036391449, "loss": 0.0964, "num_input_tokens_seen": 30699968, "step": 35460 }, { "epoch": 16.72088637435172, "grad_norm": 0.009190640412271023, "learning_rate": 0.009418655373501483, "loss": 0.1267, "num_input_tokens_seen": 30703792, "step": 35465 }, { "epoch": 16.723243752946722, "grad_norm": 0.017326461151242256, "learning_rate": 0.00939812206249851, "loss": 0.1226, "num_input_tokens_seen": 30708096, "step": 35470 }, { "epoch": 16.725601131541726, "grad_norm": 0.018404478207230568, "learning_rate": 0.009377610434072004, "loss": 0.1431, "num_input_tokens_seen": 30713792, "step": 35475 }, { "epoch": 16.727958510136727, "grad_norm": 0.010921269655227661, "learning_rate": 0.009357120491385167, "loss": 0.1303, "num_input_tokens_seen": 30718288, "step": 35480 }, { "epoch": 16.73031588873173, "grad_norm": 0.024027390405535698, "learning_rate": 0.009336652237597743, "loss": 0.1221, "num_input_tokens_seen": 30722384, "step": 35485 }, { "epoch": 16.73267326732673, "grad_norm": 0.005403308197855949, "learning_rate": 0.009316205675866251, "loss": 0.1742, "num_input_tokens_seen": 30726592, "step": 35490 }, { "epoch": 16.735030645921736, "grad_norm": 0.015990082174539566, "learning_rate": 0.00929578080934379, "loss": 0.1412, "num_input_tokens_seen": 30730784, "step": 35495 }, { "epoch": 16.737388024516736, "grad_norm": 0.011869063600897789, "learning_rate": 0.00927537764118012, "loss": 0.1431, "num_input_tokens_seen": 30734608, "step": 35500 }, { "epoch": 16.73974540311174, "grad_norm": 0.010554048232734203, "learning_rate": 0.009254996174521678, "loss": 0.1018, "num_input_tokens_seen": 30738656, "step": 35505 }, { "epoch": 16.74210278170674, "grad_norm": 0.003269665874540806, "learning_rate": 0.009234636412511531, "loss": 0.0592, "num_input_tokens_seen": 30742720, "step": 35510 }, { "epoch": 16.744460160301745, "grad_norm": 0.01641368679702282, "learning_rate": 0.009214298358289418, "loss": 0.0878, "num_input_tokens_seen": 30747392, "step": 35515 }, { "epoch": 16.746817538896746, "grad_norm": 0.01102248951792717, "learning_rate": 0.00919398201499173, "loss": 0.0769, "num_input_tokens_seen": 30752608, "step": 35520 }, { "epoch": 16.74917491749175, "grad_norm": 0.0121776657178998, "learning_rate": 0.009173687385751495, "loss": 0.1128, "num_input_tokens_seen": 30757664, "step": 35525 }, { "epoch": 16.75153229608675, "grad_norm": 0.005921770818531513, "learning_rate": 0.009153414473698407, "loss": 0.1375, "num_input_tokens_seen": 30761872, "step": 35530 }, { "epoch": 16.753889674681755, "grad_norm": 0.01893548108637333, "learning_rate": 0.009133163281958784, "loss": 0.1208, "num_input_tokens_seen": 30766176, "step": 35535 }, { "epoch": 16.756247053276756, "grad_norm": 0.009933964349329472, "learning_rate": 0.009112933813655627, "loss": 0.2145, "num_input_tokens_seen": 30770704, "step": 35540 }, { "epoch": 16.75860443187176, "grad_norm": 0.01219438947737217, "learning_rate": 0.009092726071908573, "loss": 0.0722, "num_input_tokens_seen": 30775024, "step": 35545 }, { "epoch": 16.76096181046676, "grad_norm": 0.014504989609122276, "learning_rate": 0.0090725400598339, "loss": 0.1209, "num_input_tokens_seen": 30779200, "step": 35550 }, { "epoch": 16.763319189061765, "grad_norm": 0.012825874611735344, "learning_rate": 0.009052375780544563, "loss": 0.1016, "num_input_tokens_seen": 30783088, "step": 35555 }, { "epoch": 16.765676567656765, "grad_norm": 0.01118024718016386, "learning_rate": 0.009032233237150144, "loss": 0.0723, "num_input_tokens_seen": 30787376, "step": 35560 }, { "epoch": 16.76803394625177, "grad_norm": 0.0103830611333251, "learning_rate": 0.009012112432756875, "loss": 0.0592, "num_input_tokens_seen": 30791376, "step": 35565 }, { "epoch": 16.77039132484677, "grad_norm": 0.008351908065378666, "learning_rate": 0.008992013370467605, "loss": 0.0842, "num_input_tokens_seen": 30795456, "step": 35570 }, { "epoch": 16.772748703441774, "grad_norm": 0.019620757550001144, "learning_rate": 0.008971936053381924, "loss": 0.1991, "num_input_tokens_seen": 30800320, "step": 35575 }, { "epoch": 16.775106082036775, "grad_norm": 0.007302840705960989, "learning_rate": 0.008951880484595953, "loss": 0.0479, "num_input_tokens_seen": 30804560, "step": 35580 }, { "epoch": 16.77746346063178, "grad_norm": 0.005145778879523277, "learning_rate": 0.008931846667202552, "loss": 0.1362, "num_input_tokens_seen": 30809232, "step": 35585 }, { "epoch": 16.77982083922678, "grad_norm": 0.02542269416153431, "learning_rate": 0.008911834604291152, "loss": 0.1742, "num_input_tokens_seen": 30813760, "step": 35590 }, { "epoch": 16.782178217821784, "grad_norm": 0.0035323547199368477, "learning_rate": 0.008891844298947882, "loss": 0.1113, "num_input_tokens_seen": 30818512, "step": 35595 }, { "epoch": 16.784535596416784, "grad_norm": 0.016081709414720535, "learning_rate": 0.008871875754255508, "loss": 0.1035, "num_input_tokens_seen": 30823072, "step": 35600 }, { "epoch": 16.784535596416784, "eval_loss": 0.2634243071079254, "eval_runtime": 21.9317, "eval_samples_per_second": 42.997, "eval_steps_per_second": 21.521, "num_input_tokens_seen": 30823072, "step": 35600 }, { "epoch": 16.78689297501179, "grad_norm": 0.007513964083045721, "learning_rate": 0.008851928973293422, "loss": 0.155, "num_input_tokens_seen": 30827136, "step": 35605 }, { "epoch": 16.78925035360679, "grad_norm": 0.008744494058191776, "learning_rate": 0.00883200395913764, "loss": 0.1164, "num_input_tokens_seen": 30831248, "step": 35610 }, { "epoch": 16.79160773220179, "grad_norm": 0.020081497728824615, "learning_rate": 0.00881210071486091, "loss": 0.1239, "num_input_tokens_seen": 30837152, "step": 35615 }, { "epoch": 16.793965110796794, "grad_norm": 0.014561597257852554, "learning_rate": 0.008792219243532505, "loss": 0.13, "num_input_tokens_seen": 30841760, "step": 35620 }, { "epoch": 16.796322489391795, "grad_norm": 0.017971096560359, "learning_rate": 0.008772359548218428, "loss": 0.1277, "num_input_tokens_seen": 30846032, "step": 35625 }, { "epoch": 16.7986798679868, "grad_norm": 0.004171531647443771, "learning_rate": 0.008752521631981274, "loss": 0.1071, "num_input_tokens_seen": 30850496, "step": 35630 }, { "epoch": 16.8010372465818, "grad_norm": 0.011229939758777618, "learning_rate": 0.008732705497880315, "loss": 0.0896, "num_input_tokens_seen": 30854496, "step": 35635 }, { "epoch": 16.803394625176804, "grad_norm": 0.010215594433248043, "learning_rate": 0.008712911148971459, "loss": 0.121, "num_input_tokens_seen": 30858320, "step": 35640 }, { "epoch": 16.805752003771804, "grad_norm": 0.003503407584503293, "learning_rate": 0.008693138588307208, "loss": 0.0724, "num_input_tokens_seen": 30862992, "step": 35645 }, { "epoch": 16.80810938236681, "grad_norm": 0.004218435380607843, "learning_rate": 0.008673387818936762, "loss": 0.0946, "num_input_tokens_seen": 30867136, "step": 35650 }, { "epoch": 16.81046676096181, "grad_norm": 0.0130458427593112, "learning_rate": 0.008653658843905948, "loss": 0.1564, "num_input_tokens_seen": 30871216, "step": 35655 }, { "epoch": 16.812824139556813, "grad_norm": 0.023598428815603256, "learning_rate": 0.0086339516662572, "loss": 0.1514, "num_input_tokens_seen": 30875856, "step": 35660 }, { "epoch": 16.815181518151814, "grad_norm": 0.012036167085170746, "learning_rate": 0.008614266289029638, "loss": 0.0686, "num_input_tokens_seen": 30880016, "step": 35665 }, { "epoch": 16.817538896746818, "grad_norm": 0.006943972781300545, "learning_rate": 0.008594602715258965, "loss": 0.0945, "num_input_tokens_seen": 30884048, "step": 35670 }, { "epoch": 16.81989627534182, "grad_norm": 0.022596141323447227, "learning_rate": 0.008574960947977573, "loss": 0.2305, "num_input_tokens_seen": 30888448, "step": 35675 }, { "epoch": 16.822253653936823, "grad_norm": 0.0193775687366724, "learning_rate": 0.008555340990214438, "loss": 0.0987, "num_input_tokens_seen": 30892176, "step": 35680 }, { "epoch": 16.824611032531823, "grad_norm": 0.013672865927219391, "learning_rate": 0.008535742844995258, "loss": 0.11, "num_input_tokens_seen": 30896640, "step": 35685 }, { "epoch": 16.826968411126828, "grad_norm": 0.015574382618069649, "learning_rate": 0.008516166515342266, "loss": 0.084, "num_input_tokens_seen": 30900720, "step": 35690 }, { "epoch": 16.82932578972183, "grad_norm": 0.02158981002867222, "learning_rate": 0.008496612004274411, "loss": 0.2111, "num_input_tokens_seen": 30905616, "step": 35695 }, { "epoch": 16.831683168316832, "grad_norm": 0.01221825648099184, "learning_rate": 0.008477079314807201, "loss": 0.1449, "num_input_tokens_seen": 30909600, "step": 35700 }, { "epoch": 16.834040546911833, "grad_norm": 0.012631695717573166, "learning_rate": 0.008457568449952874, "loss": 0.2199, "num_input_tokens_seen": 30913568, "step": 35705 }, { "epoch": 16.836397925506837, "grad_norm": 0.015725458040833473, "learning_rate": 0.008438079412720189, "loss": 0.153, "num_input_tokens_seen": 30918080, "step": 35710 }, { "epoch": 16.838755304101838, "grad_norm": 0.0046507553197443485, "learning_rate": 0.00841861220611466, "loss": 0.0709, "num_input_tokens_seen": 30923120, "step": 35715 }, { "epoch": 16.841112682696842, "grad_norm": 0.01059041265398264, "learning_rate": 0.008399166833138355, "loss": 0.1082, "num_input_tokens_seen": 30927104, "step": 35720 }, { "epoch": 16.843470061291843, "grad_norm": 0.018893718719482422, "learning_rate": 0.008379743296789987, "loss": 0.0789, "num_input_tokens_seen": 30931920, "step": 35725 }, { "epoch": 16.845827439886847, "grad_norm": 0.010083929635584354, "learning_rate": 0.008360341600064896, "loss": 0.0878, "num_input_tokens_seen": 30936608, "step": 35730 }, { "epoch": 16.848184818481847, "grad_norm": 0.011624534614384174, "learning_rate": 0.008340961745955121, "loss": 0.1578, "num_input_tokens_seen": 30941024, "step": 35735 }, { "epoch": 16.85054219707685, "grad_norm": 0.007596930954605341, "learning_rate": 0.008321603737449224, "loss": 0.0673, "num_input_tokens_seen": 30945200, "step": 35740 }, { "epoch": 16.852899575671852, "grad_norm": 0.012930074706673622, "learning_rate": 0.008302267577532479, "loss": 0.1238, "num_input_tokens_seen": 30950000, "step": 35745 }, { "epoch": 16.855256954266856, "grad_norm": 0.005511323921382427, "learning_rate": 0.008282953269186771, "loss": 0.0535, "num_input_tokens_seen": 30953760, "step": 35750 }, { "epoch": 16.857614332861857, "grad_norm": 0.024114711210131645, "learning_rate": 0.008263660815390567, "loss": 0.1489, "num_input_tokens_seen": 30957376, "step": 35755 }, { "epoch": 16.85997171145686, "grad_norm": 0.006128882523626089, "learning_rate": 0.008244390219119069, "loss": 0.1692, "num_input_tokens_seen": 30961712, "step": 35760 }, { "epoch": 16.862329090051862, "grad_norm": 0.023384280502796173, "learning_rate": 0.008225141483343967, "loss": 0.167, "num_input_tokens_seen": 30966480, "step": 35765 }, { "epoch": 16.864686468646866, "grad_norm": 0.01856972463428974, "learning_rate": 0.00820591461103372, "loss": 0.1353, "num_input_tokens_seen": 30971152, "step": 35770 }, { "epoch": 16.867043847241867, "grad_norm": 0.009614518843591213, "learning_rate": 0.008186709605153358, "loss": 0.1783, "num_input_tokens_seen": 30975088, "step": 35775 }, { "epoch": 16.86940122583687, "grad_norm": 0.026372816413640976, "learning_rate": 0.008167526468664492, "loss": 0.1407, "num_input_tokens_seen": 30979264, "step": 35780 }, { "epoch": 16.87175860443187, "grad_norm": 0.007105548400431871, "learning_rate": 0.008148365204525443, "loss": 0.1258, "num_input_tokens_seen": 30982960, "step": 35785 }, { "epoch": 16.874115983026876, "grad_norm": 0.013276240788400173, "learning_rate": 0.00812922581569106, "loss": 0.1649, "num_input_tokens_seen": 30987056, "step": 35790 }, { "epoch": 16.876473361621876, "grad_norm": 0.009276241064071655, "learning_rate": 0.008110108305112934, "loss": 0.1078, "num_input_tokens_seen": 30991456, "step": 35795 }, { "epoch": 16.87883074021688, "grad_norm": 0.011670474894344807, "learning_rate": 0.008091012675739223, "loss": 0.1649, "num_input_tokens_seen": 30996032, "step": 35800 }, { "epoch": 16.87883074021688, "eval_loss": 0.26129886507987976, "eval_runtime": 22.0031, "eval_samples_per_second": 42.858, "eval_steps_per_second": 21.451, "num_input_tokens_seen": 30996032, "step": 35800 }, { "epoch": 16.88118811881188, "grad_norm": 0.01039219368249178, "learning_rate": 0.008071938930514671, "loss": 0.0599, "num_input_tokens_seen": 30999968, "step": 35805 }, { "epoch": 16.883545497406885, "grad_norm": 0.003808434819802642, "learning_rate": 0.008052887072380726, "loss": 0.1325, "num_input_tokens_seen": 31003792, "step": 35810 }, { "epoch": 16.885902876001886, "grad_norm": 0.013571172021329403, "learning_rate": 0.008033857104275437, "loss": 0.1751, "num_input_tokens_seen": 31008944, "step": 35815 }, { "epoch": 16.888260254596887, "grad_norm": 0.008013325743377209, "learning_rate": 0.008014849029133424, "loss": 0.0871, "num_input_tokens_seen": 31012976, "step": 35820 }, { "epoch": 16.89061763319189, "grad_norm": 0.008567462675273418, "learning_rate": 0.007995862849885975, "loss": 0.1596, "num_input_tokens_seen": 31017664, "step": 35825 }, { "epoch": 16.89297501178689, "grad_norm": 0.008565583266317844, "learning_rate": 0.007976898569461032, "loss": 0.1298, "num_input_tokens_seen": 31022704, "step": 35830 }, { "epoch": 16.895332390381895, "grad_norm": 0.009018138982355595, "learning_rate": 0.007957956190783088, "loss": 0.0979, "num_input_tokens_seen": 31026752, "step": 35835 }, { "epoch": 16.897689768976896, "grad_norm": 0.009172054007649422, "learning_rate": 0.007939035716773324, "loss": 0.1101, "num_input_tokens_seen": 31031136, "step": 35840 }, { "epoch": 16.9000471475719, "grad_norm": 0.013981117866933346, "learning_rate": 0.007920137150349487, "loss": 0.1484, "num_input_tokens_seen": 31036208, "step": 35845 }, { "epoch": 16.9024045261669, "grad_norm": 0.014014261774718761, "learning_rate": 0.007901260494425981, "loss": 0.0813, "num_input_tokens_seen": 31040816, "step": 35850 }, { "epoch": 16.904761904761905, "grad_norm": 0.014437993057072163, "learning_rate": 0.007882405751913861, "loss": 0.118, "num_input_tokens_seen": 31044928, "step": 35855 }, { "epoch": 16.907119283356906, "grad_norm": 0.012140852399170399, "learning_rate": 0.007863572925720702, "loss": 0.11, "num_input_tokens_seen": 31048736, "step": 35860 }, { "epoch": 16.90947666195191, "grad_norm": 0.005972868762910366, "learning_rate": 0.007844762018750827, "loss": 0.0821, "num_input_tokens_seen": 31053136, "step": 35865 }, { "epoch": 16.91183404054691, "grad_norm": 0.00921252928674221, "learning_rate": 0.007825973033905054, "loss": 0.1153, "num_input_tokens_seen": 31057216, "step": 35870 }, { "epoch": 16.914191419141915, "grad_norm": 0.007058992981910706, "learning_rate": 0.007807205974080927, "loss": 0.0655, "num_input_tokens_seen": 31061616, "step": 35875 }, { "epoch": 16.916548797736915, "grad_norm": 0.003664696356281638, "learning_rate": 0.007788460842172551, "loss": 0.0817, "num_input_tokens_seen": 31065200, "step": 35880 }, { "epoch": 16.91890617633192, "grad_norm": 0.006061555817723274, "learning_rate": 0.0077697376410706285, "loss": 0.089, "num_input_tokens_seen": 31069584, "step": 35885 }, { "epoch": 16.92126355492692, "grad_norm": 0.02002040669322014, "learning_rate": 0.007751036373662567, "loss": 0.1716, "num_input_tokens_seen": 31074464, "step": 35890 }, { "epoch": 16.923620933521924, "grad_norm": 0.01151377335190773, "learning_rate": 0.00773235704283231, "loss": 0.1197, "num_input_tokens_seen": 31079008, "step": 35895 }, { "epoch": 16.925978312116925, "grad_norm": 0.0063186693005263805, "learning_rate": 0.007713699651460437, "loss": 0.1289, "num_input_tokens_seen": 31083264, "step": 35900 }, { "epoch": 16.92833569071193, "grad_norm": 0.011716816574335098, "learning_rate": 0.007695064202424162, "loss": 0.0883, "num_input_tokens_seen": 31087792, "step": 35905 }, { "epoch": 16.93069306930693, "grad_norm": 0.014431536197662354, "learning_rate": 0.007676450698597286, "loss": 0.089, "num_input_tokens_seen": 31091840, "step": 35910 }, { "epoch": 16.933050447901934, "grad_norm": 0.013445715419948101, "learning_rate": 0.007657859142850265, "loss": 0.1364, "num_input_tokens_seen": 31095872, "step": 35915 }, { "epoch": 16.935407826496935, "grad_norm": 0.009531930088996887, "learning_rate": 0.0076392895380501535, "loss": 0.1752, "num_input_tokens_seen": 31100464, "step": 35920 }, { "epoch": 16.93776520509194, "grad_norm": 0.007896044291555882, "learning_rate": 0.007620741887060611, "loss": 0.1096, "num_input_tokens_seen": 31104544, "step": 35925 }, { "epoch": 16.94012258368694, "grad_norm": 0.001596235204488039, "learning_rate": 0.007602216192741901, "loss": 0.0922, "num_input_tokens_seen": 31109072, "step": 35930 }, { "epoch": 16.942479962281944, "grad_norm": 0.016933150589466095, "learning_rate": 0.007583712457950969, "loss": 0.1624, "num_input_tokens_seen": 31113824, "step": 35935 }, { "epoch": 16.944837340876944, "grad_norm": 0.018517998978495598, "learning_rate": 0.007565230685541269, "loss": 0.1173, "num_input_tokens_seen": 31117440, "step": 35940 }, { "epoch": 16.94719471947195, "grad_norm": 0.02614550292491913, "learning_rate": 0.007546770878362968, "loss": 0.1174, "num_input_tokens_seen": 31121584, "step": 35945 }, { "epoch": 16.94955209806695, "grad_norm": 0.0169843640178442, "learning_rate": 0.0075283330392627405, "loss": 0.0855, "num_input_tokens_seen": 31126080, "step": 35950 }, { "epoch": 16.951909476661953, "grad_norm": 0.02208230458199978, "learning_rate": 0.007509917171083979, "loss": 0.1417, "num_input_tokens_seen": 31130432, "step": 35955 }, { "epoch": 16.954266855256954, "grad_norm": 0.01811923459172249, "learning_rate": 0.007491523276666662, "loss": 0.1465, "num_input_tokens_seen": 31134464, "step": 35960 }, { "epoch": 16.956624233851958, "grad_norm": 0.01501999981701374, "learning_rate": 0.007473151358847318, "loss": 0.0654, "num_input_tokens_seen": 31138544, "step": 35965 }, { "epoch": 16.95898161244696, "grad_norm": 0.008531957864761353, "learning_rate": 0.007454801420459117, "loss": 0.0611, "num_input_tokens_seen": 31143408, "step": 35970 }, { "epoch": 16.961338991041963, "grad_norm": 0.006068440619856119, "learning_rate": 0.0074364734643319105, "loss": 0.1054, "num_input_tokens_seen": 31147120, "step": 35975 }, { "epoch": 16.963696369636963, "grad_norm": 0.0061237686313688755, "learning_rate": 0.007418167493292022, "loss": 0.109, "num_input_tokens_seen": 31150800, "step": 35980 }, { "epoch": 16.966053748231968, "grad_norm": 0.024726686999201775, "learning_rate": 0.0073998835101625245, "loss": 0.1689, "num_input_tokens_seen": 31155008, "step": 35985 }, { "epoch": 16.968411126826968, "grad_norm": 0.02818484976887703, "learning_rate": 0.007381621517762998, "loss": 0.3027, "num_input_tokens_seen": 31158960, "step": 35990 }, { "epoch": 16.970768505421972, "grad_norm": 0.005384627263993025, "learning_rate": 0.007363381518909689, "loss": 0.1312, "num_input_tokens_seen": 31163936, "step": 35995 }, { "epoch": 16.973125884016973, "grad_norm": 0.006879071239382029, "learning_rate": 0.007345163516415448, "loss": 0.0938, "num_input_tokens_seen": 31167328, "step": 36000 }, { "epoch": 16.973125884016973, "eval_loss": 0.27088505029678345, "eval_runtime": 21.9078, "eval_samples_per_second": 43.044, "eval_steps_per_second": 21.545, "num_input_tokens_seen": 31167328, "step": 36000 }, { "epoch": 16.975483262611977, "grad_norm": 0.007064170204102993, "learning_rate": 0.007326967513089693, "loss": 0.0866, "num_input_tokens_seen": 31171424, "step": 36005 }, { "epoch": 16.977840641206978, "grad_norm": 0.008873811922967434, "learning_rate": 0.0073087935117384815, "loss": 0.0857, "num_input_tokens_seen": 31175696, "step": 36010 }, { "epoch": 16.980198019801982, "grad_norm": 0.017500873655080795, "learning_rate": 0.007290641515164503, "loss": 0.1068, "num_input_tokens_seen": 31180064, "step": 36015 }, { "epoch": 16.982555398396983, "grad_norm": 0.009193282574415207, "learning_rate": 0.007272511526166986, "loss": 0.1669, "num_input_tokens_seen": 31184976, "step": 36020 }, { "epoch": 16.984912776991983, "grad_norm": 0.016908008605241776, "learning_rate": 0.0072544035475418265, "loss": 0.1368, "num_input_tokens_seen": 31189392, "step": 36025 }, { "epoch": 16.987270155586987, "grad_norm": 0.018625443801283836, "learning_rate": 0.007236317582081475, "loss": 0.2236, "num_input_tokens_seen": 31193456, "step": 36030 }, { "epoch": 16.989627534181988, "grad_norm": 0.004359911661595106, "learning_rate": 0.007218253632575066, "loss": 0.1539, "num_input_tokens_seen": 31198288, "step": 36035 }, { "epoch": 16.991984912776992, "grad_norm": 0.01650894433259964, "learning_rate": 0.007200211701808223, "loss": 0.0892, "num_input_tokens_seen": 31202336, "step": 36040 }, { "epoch": 16.994342291371993, "grad_norm": 0.001986259827390313, "learning_rate": 0.007182191792563286, "loss": 0.1659, "num_input_tokens_seen": 31206832, "step": 36045 }, { "epoch": 16.996699669966997, "grad_norm": 0.003989676013588905, "learning_rate": 0.0071641939076191145, "loss": 0.157, "num_input_tokens_seen": 31210880, "step": 36050 }, { "epoch": 16.999057048561998, "grad_norm": 0.0046168118715286255, "learning_rate": 0.007146218049751257, "loss": 0.154, "num_input_tokens_seen": 31215248, "step": 36055 }, { "epoch": 17.001414427157002, "grad_norm": 0.006854538805782795, "learning_rate": 0.0071282642217317775, "loss": 0.0742, "num_input_tokens_seen": 31219808, "step": 36060 }, { "epoch": 17.003771805752002, "grad_norm": 0.01015361025929451, "learning_rate": 0.007110332426329396, "loss": 0.1169, "num_input_tokens_seen": 31223520, "step": 36065 }, { "epoch": 17.006129184347007, "grad_norm": 0.013338311575353146, "learning_rate": 0.007092422666309417, "loss": 0.1183, "num_input_tokens_seen": 31227456, "step": 36070 }, { "epoch": 17.008486562942007, "grad_norm": 0.012907693162560463, "learning_rate": 0.0070745349444337295, "loss": 0.1056, "num_input_tokens_seen": 31231344, "step": 36075 }, { "epoch": 17.01084394153701, "grad_norm": 0.01949482597410679, "learning_rate": 0.007056669263460913, "loss": 0.1236, "num_input_tokens_seen": 31235552, "step": 36080 }, { "epoch": 17.013201320132012, "grad_norm": 0.0028664893470704556, "learning_rate": 0.007038825626145995, "loss": 0.1003, "num_input_tokens_seen": 31240016, "step": 36085 }, { "epoch": 17.015558698727016, "grad_norm": 0.01643681339919567, "learning_rate": 0.007021004035240724, "loss": 0.0792, "num_input_tokens_seen": 31244368, "step": 36090 }, { "epoch": 17.017916077322017, "grad_norm": 0.004617566708475351, "learning_rate": 0.007003204493493453, "loss": 0.098, "num_input_tokens_seen": 31248432, "step": 36095 }, { "epoch": 17.02027345591702, "grad_norm": 0.01235467940568924, "learning_rate": 0.006985427003649036, "loss": 0.1168, "num_input_tokens_seen": 31252800, "step": 36100 }, { "epoch": 17.02263083451202, "grad_norm": 0.014176882803440094, "learning_rate": 0.006967671568449013, "loss": 0.115, "num_input_tokens_seen": 31256992, "step": 36105 }, { "epoch": 17.024988213107026, "grad_norm": 0.006037629209458828, "learning_rate": 0.006949938190631511, "loss": 0.0715, "num_input_tokens_seen": 31262096, "step": 36110 }, { "epoch": 17.027345591702026, "grad_norm": 0.011909185908734798, "learning_rate": 0.0069322268729311905, "loss": 0.1197, "num_input_tokens_seen": 31265872, "step": 36115 }, { "epoch": 17.02970297029703, "grad_norm": 0.0054299538023769855, "learning_rate": 0.006914537618079403, "loss": 0.0802, "num_input_tokens_seen": 31270304, "step": 36120 }, { "epoch": 17.03206034889203, "grad_norm": 0.00811014510691166, "learning_rate": 0.006896870428804031, "loss": 0.0864, "num_input_tokens_seen": 31274336, "step": 36125 }, { "epoch": 17.034417727487035, "grad_norm": 0.011875255033373833, "learning_rate": 0.006879225307829595, "loss": 0.0917, "num_input_tokens_seen": 31278784, "step": 36130 }, { "epoch": 17.036775106082036, "grad_norm": 0.015378271229565144, "learning_rate": 0.00686160225787717, "loss": 0.0952, "num_input_tokens_seen": 31282736, "step": 36135 }, { "epoch": 17.03913248467704, "grad_norm": 0.017506049945950508, "learning_rate": 0.006844001281664463, "loss": 0.0727, "num_input_tokens_seen": 31288480, "step": 36140 }, { "epoch": 17.04148986327204, "grad_norm": 0.01013952773064375, "learning_rate": 0.006826422381905789, "loss": 0.0668, "num_input_tokens_seen": 31292400, "step": 36145 }, { "epoch": 17.043847241867045, "grad_norm": 0.01578451693058014, "learning_rate": 0.006808865561311994, "loss": 0.1547, "num_input_tokens_seen": 31296400, "step": 36150 }, { "epoch": 17.046204620462046, "grad_norm": 0.005914626177400351, "learning_rate": 0.00679133082259058, "loss": 0.1096, "num_input_tokens_seen": 31300304, "step": 36155 }, { "epoch": 17.04856199905705, "grad_norm": 0.01096775010228157, "learning_rate": 0.00677381816844565, "loss": 0.1028, "num_input_tokens_seen": 31305040, "step": 36160 }, { "epoch": 17.05091937765205, "grad_norm": 0.014520931988954544, "learning_rate": 0.0067563276015778434, "loss": 0.0705, "num_input_tokens_seen": 31310576, "step": 36165 }, { "epoch": 17.053276756247055, "grad_norm": 0.010059935040771961, "learning_rate": 0.006738859124684437, "loss": 0.0985, "num_input_tokens_seen": 31314544, "step": 36170 }, { "epoch": 17.055634134842055, "grad_norm": 0.010816458612680435, "learning_rate": 0.006721412740459259, "loss": 0.0632, "num_input_tokens_seen": 31320272, "step": 36175 }, { "epoch": 17.05799151343706, "grad_norm": 0.00584813579916954, "learning_rate": 0.006703988451592824, "loss": 0.0643, "num_input_tokens_seen": 31325104, "step": 36180 }, { "epoch": 17.06034889203206, "grad_norm": 0.016626687720417976, "learning_rate": 0.006686586260772114, "loss": 0.0701, "num_input_tokens_seen": 31329152, "step": 36185 }, { "epoch": 17.062706270627064, "grad_norm": 0.007504918146878481, "learning_rate": 0.006669206170680819, "loss": 0.0725, "num_input_tokens_seen": 31332896, "step": 36190 }, { "epoch": 17.065063649222065, "grad_norm": 0.005399907939136028, "learning_rate": 0.0066518481839991095, "loss": 0.0581, "num_input_tokens_seen": 31336656, "step": 36195 }, { "epoch": 17.06742102781707, "grad_norm": 0.006314376834779978, "learning_rate": 0.006634512303403861, "loss": 0.0595, "num_input_tokens_seen": 31341392, "step": 36200 }, { "epoch": 17.06742102781707, "eval_loss": 0.27376726269721985, "eval_runtime": 21.9028, "eval_samples_per_second": 43.054, "eval_steps_per_second": 21.55, "num_input_tokens_seen": 31341392, "step": 36200 }, { "epoch": 17.06977840641207, "grad_norm": 0.01618790626525879, "learning_rate": 0.0066171985315684355, "loss": 0.1527, "num_input_tokens_seen": 31346032, "step": 36205 }, { "epoch": 17.072135785007074, "grad_norm": 0.010968578048050404, "learning_rate": 0.0065999068711628806, "loss": 0.1294, "num_input_tokens_seen": 31350432, "step": 36210 }, { "epoch": 17.074493163602074, "grad_norm": 0.010473438538610935, "learning_rate": 0.0065826373248537295, "loss": 0.1055, "num_input_tokens_seen": 31354528, "step": 36215 }, { "epoch": 17.076850542197075, "grad_norm": 0.007763137575238943, "learning_rate": 0.006565389895304218, "loss": 0.1101, "num_input_tokens_seen": 31358544, "step": 36220 }, { "epoch": 17.07920792079208, "grad_norm": 0.011175322346389294, "learning_rate": 0.006548164585174104, "loss": 0.1229, "num_input_tokens_seen": 31363040, "step": 36225 }, { "epoch": 17.08156529938708, "grad_norm": 0.017585547640919685, "learning_rate": 0.006530961397119728, "loss": 0.1129, "num_input_tokens_seen": 31367280, "step": 36230 }, { "epoch": 17.083922677982084, "grad_norm": 0.013315394520759583, "learning_rate": 0.00651378033379405, "loss": 0.1171, "num_input_tokens_seen": 31371264, "step": 36235 }, { "epoch": 17.086280056577085, "grad_norm": 0.0194857195019722, "learning_rate": 0.006496621397846619, "loss": 0.1135, "num_input_tokens_seen": 31375488, "step": 36240 }, { "epoch": 17.08863743517209, "grad_norm": 0.009498114697635174, "learning_rate": 0.006479484591923518, "loss": 0.0885, "num_input_tokens_seen": 31381280, "step": 36245 }, { "epoch": 17.09099481376709, "grad_norm": 0.011644668877124786, "learning_rate": 0.006462369918667515, "loss": 0.1026, "num_input_tokens_seen": 31385120, "step": 36250 }, { "epoch": 17.093352192362094, "grad_norm": 0.010010889731347561, "learning_rate": 0.006445277380717851, "loss": 0.1565, "num_input_tokens_seen": 31389888, "step": 36255 }, { "epoch": 17.095709570957094, "grad_norm": 0.0033402724657207727, "learning_rate": 0.006428206980710466, "loss": 0.0593, "num_input_tokens_seen": 31393648, "step": 36260 }, { "epoch": 17.0980669495521, "grad_norm": 0.009152665734291077, "learning_rate": 0.006411158721277788, "loss": 0.0546, "num_input_tokens_seen": 31397888, "step": 36265 }, { "epoch": 17.1004243281471, "grad_norm": 0.013551130890846252, "learning_rate": 0.00639413260504888, "loss": 0.0776, "num_input_tokens_seen": 31402640, "step": 36270 }, { "epoch": 17.102781706742103, "grad_norm": 0.025572702288627625, "learning_rate": 0.006377128634649376, "loss": 0.1097, "num_input_tokens_seen": 31406928, "step": 36275 }, { "epoch": 17.105139085337104, "grad_norm": 0.010415523312985897, "learning_rate": 0.006360146812701528, "loss": 0.0728, "num_input_tokens_seen": 31411104, "step": 36280 }, { "epoch": 17.107496463932108, "grad_norm": 0.002753691514953971, "learning_rate": 0.006343187141824125, "loss": 0.0604, "num_input_tokens_seen": 31415632, "step": 36285 }, { "epoch": 17.10985384252711, "grad_norm": 0.002575148129835725, "learning_rate": 0.00632624962463259, "loss": 0.0488, "num_input_tokens_seen": 31420240, "step": 36290 }, { "epoch": 17.112211221122113, "grad_norm": 0.02011752687394619, "learning_rate": 0.006309334263738853, "loss": 0.1385, "num_input_tokens_seen": 31424592, "step": 36295 }, { "epoch": 17.114568599717114, "grad_norm": 0.004235713742673397, "learning_rate": 0.006292441061751508, "loss": 0.0985, "num_input_tokens_seen": 31429568, "step": 36300 }, { "epoch": 17.116925978312118, "grad_norm": 0.004790938459336758, "learning_rate": 0.0062755700212757054, "loss": 0.037, "num_input_tokens_seen": 31433520, "step": 36305 }, { "epoch": 17.11928335690712, "grad_norm": 0.005081883165985346, "learning_rate": 0.006258721144913148, "loss": 0.1048, "num_input_tokens_seen": 31439184, "step": 36310 }, { "epoch": 17.121640735502123, "grad_norm": 0.014255238696932793, "learning_rate": 0.0062418944352621575, "loss": 0.1235, "num_input_tokens_seen": 31443600, "step": 36315 }, { "epoch": 17.123998114097123, "grad_norm": 0.01183360442519188, "learning_rate": 0.0062250898949176405, "loss": 0.0711, "num_input_tokens_seen": 31448288, "step": 36320 }, { "epoch": 17.126355492692127, "grad_norm": 0.004065865650773048, "learning_rate": 0.006208307526471041, "loss": 0.0982, "num_input_tokens_seen": 31452864, "step": 36325 }, { "epoch": 17.128712871287128, "grad_norm": 0.013872957788407803, "learning_rate": 0.006191547332510405, "loss": 0.0717, "num_input_tokens_seen": 31457056, "step": 36330 }, { "epoch": 17.131070249882132, "grad_norm": 0.011521593667566776, "learning_rate": 0.006174809315620416, "loss": 0.1234, "num_input_tokens_seen": 31461584, "step": 36335 }, { "epoch": 17.133427628477133, "grad_norm": 0.015593203715980053, "learning_rate": 0.00615809347838221, "loss": 0.1182, "num_input_tokens_seen": 31465328, "step": 36340 }, { "epoch": 17.135785007072137, "grad_norm": 0.003055104287341237, "learning_rate": 0.006141399823373655, "loss": 0.125, "num_input_tokens_seen": 31469744, "step": 36345 }, { "epoch": 17.138142385667138, "grad_norm": 0.018332447856664658, "learning_rate": 0.0061247283531690455, "loss": 0.0922, "num_input_tokens_seen": 31473824, "step": 36350 }, { "epoch": 17.14049976426214, "grad_norm": 0.01814332976937294, "learning_rate": 0.0061080790703393895, "loss": 0.0632, "num_input_tokens_seen": 31477648, "step": 36355 }, { "epoch": 17.142857142857142, "grad_norm": 0.013918413780629635, "learning_rate": 0.006091451977452217, "loss": 0.1405, "num_input_tokens_seen": 31481808, "step": 36360 }, { "epoch": 17.145214521452147, "grad_norm": 0.018977725878357887, "learning_rate": 0.00607484707707161, "loss": 0.1416, "num_input_tokens_seen": 31486016, "step": 36365 }, { "epoch": 17.147571900047147, "grad_norm": 0.007560989819467068, "learning_rate": 0.006058264371758254, "loss": 0.1006, "num_input_tokens_seen": 31490192, "step": 36370 }, { "epoch": 17.14992927864215, "grad_norm": 0.01759473606944084, "learning_rate": 0.00604170386406942, "loss": 0.0929, "num_input_tokens_seen": 31494528, "step": 36375 }, { "epoch": 17.152286657237152, "grad_norm": 0.005675950087606907, "learning_rate": 0.006025165556558931, "loss": 0.072, "num_input_tokens_seen": 31498144, "step": 36380 }, { "epoch": 17.154644035832156, "grad_norm": 0.00676286406815052, "learning_rate": 0.006008649451777248, "loss": 0.0876, "num_input_tokens_seen": 31502688, "step": 36385 }, { "epoch": 17.157001414427157, "grad_norm": 0.012616056017577648, "learning_rate": 0.005992155552271283, "loss": 0.0496, "num_input_tokens_seen": 31507296, "step": 36390 }, { "epoch": 17.15935879302216, "grad_norm": 0.012391744181513786, "learning_rate": 0.005975683860584685, "loss": 0.1831, "num_input_tokens_seen": 31511680, "step": 36395 }, { "epoch": 17.16171617161716, "grad_norm": 0.011679720133543015, "learning_rate": 0.0059592343792575385, "loss": 0.1804, "num_input_tokens_seen": 31515648, "step": 36400 }, { "epoch": 17.16171617161716, "eval_loss": 0.2758013606071472, "eval_runtime": 21.9094, "eval_samples_per_second": 43.041, "eval_steps_per_second": 21.543, "num_input_tokens_seen": 31515648, "step": 36400 }, { "epoch": 17.164073550212166, "grad_norm": 0.01688370667397976, "learning_rate": 0.0059428071108265975, "loss": 0.1466, "num_input_tokens_seen": 31520352, "step": 36405 }, { "epoch": 17.166430928807166, "grad_norm": 0.021355103701353073, "learning_rate": 0.005926402057825136, "loss": 0.066, "num_input_tokens_seen": 31524528, "step": 36410 }, { "epoch": 17.16878830740217, "grad_norm": 0.00760915782302618, "learning_rate": 0.005910019222782997, "loss": 0.1206, "num_input_tokens_seen": 31529040, "step": 36415 }, { "epoch": 17.17114568599717, "grad_norm": 0.005009378306567669, "learning_rate": 0.005893658608226643, "loss": 0.0877, "num_input_tokens_seen": 31533392, "step": 36420 }, { "epoch": 17.173503064592172, "grad_norm": 0.0205161664634943, "learning_rate": 0.0058773202166791045, "loss": 0.1297, "num_input_tokens_seen": 31537488, "step": 36425 }, { "epoch": 17.175860443187176, "grad_norm": 0.005842353217303753, "learning_rate": 0.005861004050659918, "loss": 0.0474, "num_input_tokens_seen": 31541952, "step": 36430 }, { "epoch": 17.178217821782177, "grad_norm": 0.006939308252185583, "learning_rate": 0.005844710112685286, "loss": 0.1122, "num_input_tokens_seen": 31546480, "step": 36435 }, { "epoch": 17.18057520037718, "grad_norm": 0.016216833144426346, "learning_rate": 0.005828438405267933, "loss": 0.0868, "num_input_tokens_seen": 31551216, "step": 36440 }, { "epoch": 17.18293257897218, "grad_norm": 0.004711097106337547, "learning_rate": 0.00581218893091715, "loss": 0.0915, "num_input_tokens_seen": 31555744, "step": 36445 }, { "epoch": 17.185289957567186, "grad_norm": 0.006320019252598286, "learning_rate": 0.005795961692138801, "loss": 0.0638, "num_input_tokens_seen": 31559920, "step": 36450 }, { "epoch": 17.187647336162186, "grad_norm": 0.018396077677607536, "learning_rate": 0.00577975669143535, "loss": 0.0774, "num_input_tokens_seen": 31563328, "step": 36455 }, { "epoch": 17.19000471475719, "grad_norm": 0.007439781446009874, "learning_rate": 0.005763573931305782, "loss": 0.0857, "num_input_tokens_seen": 31567808, "step": 36460 }, { "epoch": 17.19236209335219, "grad_norm": 0.0021173127461224794, "learning_rate": 0.005747413414245733, "loss": 0.0778, "num_input_tokens_seen": 31572032, "step": 36465 }, { "epoch": 17.194719471947195, "grad_norm": 0.01380198821425438, "learning_rate": 0.005731275142747294, "loss": 0.085, "num_input_tokens_seen": 31576048, "step": 36470 }, { "epoch": 17.197076850542196, "grad_norm": 0.017338458448648453, "learning_rate": 0.005715159119299256, "loss": 0.116, "num_input_tokens_seen": 31580208, "step": 36475 }, { "epoch": 17.1994342291372, "grad_norm": 0.014047152362763882, "learning_rate": 0.005699065346386867, "loss": 0.1103, "num_input_tokens_seen": 31584976, "step": 36480 }, { "epoch": 17.2017916077322, "grad_norm": 0.009470172226428986, "learning_rate": 0.0056829938264919885, "loss": 0.091, "num_input_tokens_seen": 31588768, "step": 36485 }, { "epoch": 17.204148986327205, "grad_norm": 0.006561388727277517, "learning_rate": 0.005666944562093074, "loss": 0.0972, "num_input_tokens_seen": 31592832, "step": 36490 }, { "epoch": 17.206506364922205, "grad_norm": 0.001638320041820407, "learning_rate": 0.005650917555665108, "loss": 0.1218, "num_input_tokens_seen": 31596736, "step": 36495 }, { "epoch": 17.20886374351721, "grad_norm": 0.02003917470574379, "learning_rate": 0.005634912809679632, "loss": 0.1084, "num_input_tokens_seen": 31602352, "step": 36500 }, { "epoch": 17.21122112211221, "grad_norm": 0.018580028787255287, "learning_rate": 0.005618930326604854, "loss": 0.1284, "num_input_tokens_seen": 31606976, "step": 36505 }, { "epoch": 17.213578500707214, "grad_norm": 0.018456248566508293, "learning_rate": 0.005602970108905386, "loss": 0.1167, "num_input_tokens_seen": 31610592, "step": 36510 }, { "epoch": 17.215935879302215, "grad_norm": 0.012837285175919533, "learning_rate": 0.005587032159042543, "loss": 0.0669, "num_input_tokens_seen": 31614768, "step": 36515 }, { "epoch": 17.21829325789722, "grad_norm": 0.006123372353613377, "learning_rate": 0.005571116479474158, "loss": 0.0915, "num_input_tokens_seen": 31618688, "step": 36520 }, { "epoch": 17.22065063649222, "grad_norm": 0.016067981719970703, "learning_rate": 0.005555223072654619, "loss": 0.081, "num_input_tokens_seen": 31623152, "step": 36525 }, { "epoch": 17.223008015087224, "grad_norm": 0.0017877136124297976, "learning_rate": 0.005539351941034881, "loss": 0.1142, "num_input_tokens_seen": 31627040, "step": 36530 }, { "epoch": 17.225365393682225, "grad_norm": 0.01693052425980568, "learning_rate": 0.0055235030870624865, "loss": 0.1117, "num_input_tokens_seen": 31632656, "step": 36535 }, { "epoch": 17.22772277227723, "grad_norm": 0.01036886591464281, "learning_rate": 0.005507676513181514, "loss": 0.1088, "num_input_tokens_seen": 31636832, "step": 36540 }, { "epoch": 17.23008015087223, "grad_norm": 0.005394837353378534, "learning_rate": 0.005491872221832628, "loss": 0.0537, "num_input_tokens_seen": 31640992, "step": 36545 }, { "epoch": 17.232437529467234, "grad_norm": 0.0036144896876066923, "learning_rate": 0.005476090215453061, "loss": 0.0577, "num_input_tokens_seen": 31644992, "step": 36550 }, { "epoch": 17.234794908062234, "grad_norm": 0.011230967938899994, "learning_rate": 0.0054603304964765675, "loss": 0.1145, "num_input_tokens_seen": 31649024, "step": 36555 }, { "epoch": 17.23715228665724, "grad_norm": 0.013110796920955181, "learning_rate": 0.005444593067333519, "loss": 0.0993, "num_input_tokens_seen": 31652752, "step": 36560 }, { "epoch": 17.23950966525224, "grad_norm": 0.0106892054900527, "learning_rate": 0.00542887793045081, "loss": 0.0765, "num_input_tokens_seen": 31657472, "step": 36565 }, { "epoch": 17.241867043847243, "grad_norm": 0.007662664633244276, "learning_rate": 0.005413185088251932, "loss": 0.0818, "num_input_tokens_seen": 31661632, "step": 36570 }, { "epoch": 17.244224422442244, "grad_norm": 0.004474200773984194, "learning_rate": 0.005397514543156884, "loss": 0.1605, "num_input_tokens_seen": 31666256, "step": 36575 }, { "epoch": 17.246581801037248, "grad_norm": 0.008614257909357548, "learning_rate": 0.0053818662975822825, "loss": 0.0905, "num_input_tokens_seen": 31670368, "step": 36580 }, { "epoch": 17.24893917963225, "grad_norm": 0.01151672936975956, "learning_rate": 0.005366240353941315, "loss": 0.0907, "num_input_tokens_seen": 31675328, "step": 36585 }, { "epoch": 17.251296558227253, "grad_norm": 0.015184842981398106, "learning_rate": 0.005350636714643636, "loss": 0.113, "num_input_tokens_seen": 31680224, "step": 36590 }, { "epoch": 17.253653936822253, "grad_norm": 0.007778584025800228, "learning_rate": 0.005335055382095555, "loss": 0.1056, "num_input_tokens_seen": 31684864, "step": 36595 }, { "epoch": 17.256011315417258, "grad_norm": 0.004454773850739002, "learning_rate": 0.005319496358699915, "loss": 0.0988, "num_input_tokens_seen": 31690208, "step": 36600 }, { "epoch": 17.256011315417258, "eval_loss": 0.28145697712898254, "eval_runtime": 21.9155, "eval_samples_per_second": 43.029, "eval_steps_per_second": 21.537, "num_input_tokens_seen": 31690208, "step": 36600 }, { "epoch": 17.25836869401226, "grad_norm": 0.007907221093773842, "learning_rate": 0.005303959646856099, "loss": 0.0831, "num_input_tokens_seen": 31694384, "step": 36605 }, { "epoch": 17.260726072607262, "grad_norm": 0.0030305185355246067, "learning_rate": 0.005288445248960089, "loss": 0.0916, "num_input_tokens_seen": 31698480, "step": 36610 }, { "epoch": 17.263083451202263, "grad_norm": 0.009816888719797134, "learning_rate": 0.005272953167404354, "loss": 0.1345, "num_input_tokens_seen": 31702576, "step": 36615 }, { "epoch": 17.265440829797264, "grad_norm": 0.00972907803952694, "learning_rate": 0.005257483404578017, "loss": 0.0894, "num_input_tokens_seen": 31707184, "step": 36620 }, { "epoch": 17.267798208392268, "grad_norm": 0.017799345776438713, "learning_rate": 0.0052420359628666865, "loss": 0.0963, "num_input_tokens_seen": 31711552, "step": 36625 }, { "epoch": 17.27015558698727, "grad_norm": 0.018719591200351715, "learning_rate": 0.00522661084465254, "loss": 0.0842, "num_input_tokens_seen": 31716048, "step": 36630 }, { "epoch": 17.272512965582273, "grad_norm": 0.005683872848749161, "learning_rate": 0.005211208052314326, "loss": 0.1484, "num_input_tokens_seen": 31719872, "step": 36635 }, { "epoch": 17.274870344177273, "grad_norm": 0.007124336436390877, "learning_rate": 0.005195827588227391, "loss": 0.0869, "num_input_tokens_seen": 31724800, "step": 36640 }, { "epoch": 17.277227722772277, "grad_norm": 0.011073101311922073, "learning_rate": 0.0051804694547635255, "loss": 0.0945, "num_input_tokens_seen": 31729344, "step": 36645 }, { "epoch": 17.279585101367278, "grad_norm": 0.013655728660523891, "learning_rate": 0.005165133654291232, "loss": 0.0653, "num_input_tokens_seen": 31734240, "step": 36650 }, { "epoch": 17.281942479962282, "grad_norm": 0.013372094370424747, "learning_rate": 0.005149820189175402, "loss": 0.0794, "num_input_tokens_seen": 31738464, "step": 36655 }, { "epoch": 17.284299858557283, "grad_norm": 0.002947862260043621, "learning_rate": 0.005134529061777598, "loss": 0.062, "num_input_tokens_seen": 31742816, "step": 36660 }, { "epoch": 17.286657237152287, "grad_norm": 0.00851537100970745, "learning_rate": 0.005119260274455933, "loss": 0.0936, "num_input_tokens_seen": 31746960, "step": 36665 }, { "epoch": 17.289014615747288, "grad_norm": 0.003180151106789708, "learning_rate": 0.005104013829565007, "loss": 0.1269, "num_input_tokens_seen": 31752064, "step": 36670 }, { "epoch": 17.291371994342292, "grad_norm": 0.03190287947654724, "learning_rate": 0.005088789729456006, "loss": 0.2212, "num_input_tokens_seen": 31756368, "step": 36675 }, { "epoch": 17.293729372937293, "grad_norm": 0.00527909304946661, "learning_rate": 0.005073587976476735, "loss": 0.0618, "num_input_tokens_seen": 31761120, "step": 36680 }, { "epoch": 17.296086751532297, "grad_norm": 0.001020805211737752, "learning_rate": 0.005058408572971418, "loss": 0.0795, "num_input_tokens_seen": 31766048, "step": 36685 }, { "epoch": 17.298444130127297, "grad_norm": 0.009127741679549217, "learning_rate": 0.005043251521280983, "loss": 0.1165, "num_input_tokens_seen": 31770368, "step": 36690 }, { "epoch": 17.3008015087223, "grad_norm": 0.02393074706196785, "learning_rate": 0.005028116823742795, "loss": 0.1032, "num_input_tokens_seen": 31774096, "step": 36695 }, { "epoch": 17.303158887317302, "grad_norm": 0.025998570024967194, "learning_rate": 0.005013004482690819, "loss": 0.1377, "num_input_tokens_seen": 31778576, "step": 36700 }, { "epoch": 17.305516265912306, "grad_norm": 0.012036054395139217, "learning_rate": 0.0049979145004555746, "loss": 0.0424, "num_input_tokens_seen": 31783184, "step": 36705 }, { "epoch": 17.307873644507307, "grad_norm": 0.020190933719277382, "learning_rate": 0.004982846879364116, "loss": 0.1003, "num_input_tokens_seen": 31787840, "step": 36710 }, { "epoch": 17.31023102310231, "grad_norm": 0.010709051042795181, "learning_rate": 0.0049678016217400535, "loss": 0.0799, "num_input_tokens_seen": 31792320, "step": 36715 }, { "epoch": 17.31258840169731, "grad_norm": 0.015421408228576183, "learning_rate": 0.004952778729903595, "loss": 0.1102, "num_input_tokens_seen": 31796272, "step": 36720 }, { "epoch": 17.314945780292316, "grad_norm": 0.00610466580837965, "learning_rate": 0.004937778206171422, "loss": 0.0916, "num_input_tokens_seen": 31800640, "step": 36725 }, { "epoch": 17.317303158887317, "grad_norm": 0.00515762809664011, "learning_rate": 0.004922800052856835, "loss": 0.075, "num_input_tokens_seen": 31805872, "step": 36730 }, { "epoch": 17.31966053748232, "grad_norm": 0.010027341544628143, "learning_rate": 0.004907844272269602, "loss": 0.1794, "num_input_tokens_seen": 31810608, "step": 36735 }, { "epoch": 17.32201791607732, "grad_norm": 0.012927073054015636, "learning_rate": 0.004892910866716144, "loss": 0.0927, "num_input_tokens_seen": 31814688, "step": 36740 }, { "epoch": 17.324375294672326, "grad_norm": 0.02078653685748577, "learning_rate": 0.004877999838499369, "loss": 0.0788, "num_input_tokens_seen": 31819520, "step": 36745 }, { "epoch": 17.326732673267326, "grad_norm": 0.0125935859978199, "learning_rate": 0.0048631111899187065, "loss": 0.1159, "num_input_tokens_seen": 31824912, "step": 36750 }, { "epoch": 17.32909005186233, "grad_norm": 0.021209241822361946, "learning_rate": 0.0048482449232702335, "loss": 0.1389, "num_input_tokens_seen": 31829104, "step": 36755 }, { "epoch": 17.33144743045733, "grad_norm": 0.007866098545491695, "learning_rate": 0.004833401040846469, "loss": 0.0447, "num_input_tokens_seen": 31833824, "step": 36760 }, { "epoch": 17.333804809052335, "grad_norm": 0.029457172378897667, "learning_rate": 0.004818579544936546, "loss": 0.1531, "num_input_tokens_seen": 31837712, "step": 36765 }, { "epoch": 17.336162187647336, "grad_norm": 0.004547744523733854, "learning_rate": 0.004803780437826121, "loss": 0.077, "num_input_tokens_seen": 31841824, "step": 36770 }, { "epoch": 17.33851956624234, "grad_norm": 0.002559335669502616, "learning_rate": 0.004789003721797402, "loss": 0.1112, "num_input_tokens_seen": 31845952, "step": 36775 }, { "epoch": 17.34087694483734, "grad_norm": 0.02416328713297844, "learning_rate": 0.004774249399129132, "loss": 0.1575, "num_input_tokens_seen": 31850112, "step": 36780 }, { "epoch": 17.343234323432345, "grad_norm": 0.017543796449899673, "learning_rate": 0.004759517472096642, "loss": 0.1209, "num_input_tokens_seen": 31853936, "step": 36785 }, { "epoch": 17.345591702027345, "grad_norm": 0.013840384781360626, "learning_rate": 0.004744807942971746, "loss": 0.0726, "num_input_tokens_seen": 31858480, "step": 36790 }, { "epoch": 17.34794908062235, "grad_norm": 0.01507110521197319, "learning_rate": 0.004730120814022881, "loss": 0.1331, "num_input_tokens_seen": 31863376, "step": 36795 }, { "epoch": 17.35030645921735, "grad_norm": 0.016000894829630852, "learning_rate": 0.004715456087514935, "loss": 0.1109, "num_input_tokens_seen": 31868288, "step": 36800 }, { "epoch": 17.35030645921735, "eval_loss": 0.28111526370048523, "eval_runtime": 21.8597, "eval_samples_per_second": 43.139, "eval_steps_per_second": 21.592, "num_input_tokens_seen": 31868288, "step": 36800 }, { "epoch": 17.352663837812354, "grad_norm": 0.028043637052178383, "learning_rate": 0.004700813765709432, "loss": 0.0703, "num_input_tokens_seen": 31872656, "step": 36805 }, { "epoch": 17.355021216407355, "grad_norm": 0.012592381797730923, "learning_rate": 0.004686193850864401, "loss": 0.0941, "num_input_tokens_seen": 31877552, "step": 36810 }, { "epoch": 17.35737859500236, "grad_norm": 0.01725969836115837, "learning_rate": 0.004671596345234385, "loss": 0.1153, "num_input_tokens_seen": 31882400, "step": 36815 }, { "epoch": 17.35973597359736, "grad_norm": 0.015227221883833408, "learning_rate": 0.00465702125107052, "loss": 0.1542, "num_input_tokens_seen": 31886560, "step": 36820 }, { "epoch": 17.36209335219236, "grad_norm": 0.021076224744319916, "learning_rate": 0.004642468570620506, "loss": 0.1175, "num_input_tokens_seen": 31890512, "step": 36825 }, { "epoch": 17.364450730787365, "grad_norm": 0.01934572495520115, "learning_rate": 0.004627938306128482, "loss": 0.1155, "num_input_tokens_seen": 31895040, "step": 36830 }, { "epoch": 17.366808109382365, "grad_norm": 0.00551070598885417, "learning_rate": 0.004613430459835255, "loss": 0.0965, "num_input_tokens_seen": 31899312, "step": 36835 }, { "epoch": 17.36916548797737, "grad_norm": 0.01007936429232359, "learning_rate": 0.004598945033978085, "loss": 0.0826, "num_input_tokens_seen": 31904112, "step": 36840 }, { "epoch": 17.37152286657237, "grad_norm": 0.014137782156467438, "learning_rate": 0.004584482030790804, "loss": 0.0985, "num_input_tokens_seen": 31908048, "step": 36845 }, { "epoch": 17.373880245167374, "grad_norm": 0.007444418501108885, "learning_rate": 0.004570041452503826, "loss": 0.1, "num_input_tokens_seen": 31911696, "step": 36850 }, { "epoch": 17.376237623762375, "grad_norm": 0.005004009697586298, "learning_rate": 0.004555623301344003, "loss": 0.0356, "num_input_tokens_seen": 31916944, "step": 36855 }, { "epoch": 17.37859500235738, "grad_norm": 0.007047620601952076, "learning_rate": 0.004541227579534857, "loss": 0.1214, "num_input_tokens_seen": 31921264, "step": 36860 }, { "epoch": 17.38095238095238, "grad_norm": 0.018108729273080826, "learning_rate": 0.004526854289296378, "loss": 0.088, "num_input_tokens_seen": 31925520, "step": 36865 }, { "epoch": 17.383309759547384, "grad_norm": 0.01568642072379589, "learning_rate": 0.004512503432845078, "loss": 0.0701, "num_input_tokens_seen": 31929104, "step": 36870 }, { "epoch": 17.385667138142384, "grad_norm": 0.010210195556282997, "learning_rate": 0.004498175012394068, "loss": 0.0627, "num_input_tokens_seen": 31933136, "step": 36875 }, { "epoch": 17.38802451673739, "grad_norm": 0.019430244341492653, "learning_rate": 0.004483869030152965, "loss": 0.1142, "num_input_tokens_seen": 31936912, "step": 36880 }, { "epoch": 17.39038189533239, "grad_norm": 0.018688926473259926, "learning_rate": 0.004469585488327904, "loss": 0.1067, "num_input_tokens_seen": 31941040, "step": 36885 }, { "epoch": 17.392739273927393, "grad_norm": 0.00512363575398922, "learning_rate": 0.0044553243891216395, "loss": 0.0963, "num_input_tokens_seen": 31945024, "step": 36890 }, { "epoch": 17.395096652522394, "grad_norm": 0.004080599173903465, "learning_rate": 0.004441085734733363, "loss": 0.0656, "num_input_tokens_seen": 31949312, "step": 36895 }, { "epoch": 17.397454031117398, "grad_norm": 0.021944599226117134, "learning_rate": 0.004426869527358884, "loss": 0.1399, "num_input_tokens_seen": 31953536, "step": 36900 }, { "epoch": 17.3998114097124, "grad_norm": 0.017604153603315353, "learning_rate": 0.0044126757691905156, "loss": 0.0798, "num_input_tokens_seen": 31957680, "step": 36905 }, { "epoch": 17.402168788307403, "grad_norm": 0.006458121817559004, "learning_rate": 0.004398504462417107, "loss": 0.0651, "num_input_tokens_seen": 31962800, "step": 36910 }, { "epoch": 17.404526166902404, "grad_norm": 0.006085680797696114, "learning_rate": 0.0043843556092240605, "loss": 0.0947, "num_input_tokens_seen": 31967152, "step": 36915 }, { "epoch": 17.406883545497408, "grad_norm": 0.012713879346847534, "learning_rate": 0.004370229211793281, "loss": 0.1065, "num_input_tokens_seen": 31971056, "step": 36920 }, { "epoch": 17.40924092409241, "grad_norm": 0.01980450563132763, "learning_rate": 0.0043561252723032405, "loss": 0.0906, "num_input_tokens_seen": 31975152, "step": 36925 }, { "epoch": 17.411598302687413, "grad_norm": 0.00924790557473898, "learning_rate": 0.004342043792929001, "loss": 0.044, "num_input_tokens_seen": 31979184, "step": 36930 }, { "epoch": 17.413955681282413, "grad_norm": 0.017202438786625862, "learning_rate": 0.004327984775842025, "loss": 0.0663, "num_input_tokens_seen": 31982544, "step": 36935 }, { "epoch": 17.416313059877417, "grad_norm": 0.00657540000975132, "learning_rate": 0.004313948223210428, "loss": 0.0665, "num_input_tokens_seen": 31987136, "step": 36940 }, { "epoch": 17.418670438472418, "grad_norm": 0.011688865721225739, "learning_rate": 0.004299934137198846, "loss": 0.0376, "num_input_tokens_seen": 31992144, "step": 36945 }, { "epoch": 17.421027817067422, "grad_norm": 0.012884492054581642, "learning_rate": 0.004285942519968383, "loss": 0.1052, "num_input_tokens_seen": 31996064, "step": 36950 }, { "epoch": 17.423385195662423, "grad_norm": 0.007845276035368443, "learning_rate": 0.004271973373676746, "loss": 0.052, "num_input_tokens_seen": 32000448, "step": 36955 }, { "epoch": 17.425742574257427, "grad_norm": 0.008708599023520947, "learning_rate": 0.004258026700478146, "loss": 0.0847, "num_input_tokens_seen": 32005968, "step": 36960 }, { "epoch": 17.428099952852428, "grad_norm": 0.009054476395249367, "learning_rate": 0.004244102502523328, "loss": 0.0729, "num_input_tokens_seen": 32010976, "step": 36965 }, { "epoch": 17.430457331447432, "grad_norm": 0.0228553656488657, "learning_rate": 0.004230200781959592, "loss": 0.1265, "num_input_tokens_seen": 32016032, "step": 36970 }, { "epoch": 17.432814710042432, "grad_norm": 0.016372598707675934, "learning_rate": 0.004216321540930756, "loss": 0.0657, "num_input_tokens_seen": 32020304, "step": 36975 }, { "epoch": 17.435172088637437, "grad_norm": 0.0333210825920105, "learning_rate": 0.004202464781577175, "loss": 0.3029, "num_input_tokens_seen": 32024144, "step": 36980 }, { "epoch": 17.437529467232437, "grad_norm": 0.01952928863465786, "learning_rate": 0.00418863050603574, "loss": 0.1064, "num_input_tokens_seen": 32028864, "step": 36985 }, { "epoch": 17.43988684582744, "grad_norm": 0.018115228042006493, "learning_rate": 0.004174818716439843, "loss": 0.1168, "num_input_tokens_seen": 32032688, "step": 36990 }, { "epoch": 17.442244224422442, "grad_norm": 0.005769358482211828, "learning_rate": 0.004161029414919464, "loss": 0.0621, "num_input_tokens_seen": 32037008, "step": 36995 }, { "epoch": 17.444601603017446, "grad_norm": 0.019282463937997818, "learning_rate": 0.004147262603601071, "loss": 0.0748, "num_input_tokens_seen": 32041536, "step": 37000 }, { "epoch": 17.444601603017446, "eval_loss": 0.28019195795059204, "eval_runtime": 21.8982, "eval_samples_per_second": 43.063, "eval_steps_per_second": 21.554, "num_input_tokens_seen": 32041536, "step": 37000 }, { "epoch": 17.446958981612447, "grad_norm": 0.016871048137545586, "learning_rate": 0.004133518284607679, "loss": 0.0904, "num_input_tokens_seen": 32045568, "step": 37005 }, { "epoch": 17.44931636020745, "grad_norm": 0.025413470342755318, "learning_rate": 0.004119796460058861, "loss": 0.1198, "num_input_tokens_seen": 32050144, "step": 37010 }, { "epoch": 17.45167373880245, "grad_norm": 0.009033769369125366, "learning_rate": 0.00410609713207064, "loss": 0.0735, "num_input_tokens_seen": 32055296, "step": 37015 }, { "epoch": 17.454031117397456, "grad_norm": 0.023872872814536095, "learning_rate": 0.004092420302755678, "loss": 0.1033, "num_input_tokens_seen": 32059872, "step": 37020 }, { "epoch": 17.456388495992456, "grad_norm": 0.022432398051023483, "learning_rate": 0.004078765974223103, "loss": 0.0854, "num_input_tokens_seen": 32064128, "step": 37025 }, { "epoch": 17.458745874587457, "grad_norm": 0.01602267660200596, "learning_rate": 0.004065134148578564, "loss": 0.147, "num_input_tokens_seen": 32069232, "step": 37030 }, { "epoch": 17.46110325318246, "grad_norm": 0.008524245582520962, "learning_rate": 0.004051524827924279, "loss": 0.0823, "num_input_tokens_seen": 32073760, "step": 37035 }, { "epoch": 17.463460631777462, "grad_norm": 0.003973803948611021, "learning_rate": 0.004037938014358955, "loss": 0.0722, "num_input_tokens_seen": 32078944, "step": 37040 }, { "epoch": 17.465818010372466, "grad_norm": 0.00881690438836813, "learning_rate": 0.004024373709977863, "loss": 0.1095, "num_input_tokens_seen": 32082992, "step": 37045 }, { "epoch": 17.468175388967467, "grad_norm": 0.018982673063874245, "learning_rate": 0.004010831916872814, "loss": 0.0927, "num_input_tokens_seen": 32087584, "step": 37050 }, { "epoch": 17.47053276756247, "grad_norm": 0.011893784627318382, "learning_rate": 0.003997312637132089, "loss": 0.1159, "num_input_tokens_seen": 32091600, "step": 37055 }, { "epoch": 17.47289014615747, "grad_norm": 0.014219461008906364, "learning_rate": 0.003983815872840535, "loss": 0.122, "num_input_tokens_seen": 32096144, "step": 37060 }, { "epoch": 17.475247524752476, "grad_norm": 0.011056758463382721, "learning_rate": 0.003970341626079521, "loss": 0.0861, "num_input_tokens_seen": 32100624, "step": 37065 }, { "epoch": 17.477604903347476, "grad_norm": 0.007328271400183439, "learning_rate": 0.003956889898926952, "loss": 0.0917, "num_input_tokens_seen": 32104768, "step": 37070 }, { "epoch": 17.47996228194248, "grad_norm": 0.012774522416293621, "learning_rate": 0.0039434606934572675, "loss": 0.0815, "num_input_tokens_seen": 32108448, "step": 37075 }, { "epoch": 17.48231966053748, "grad_norm": 0.014154463075101376, "learning_rate": 0.003930054011741396, "loss": 0.0798, "num_input_tokens_seen": 32112784, "step": 37080 }, { "epoch": 17.484677039132485, "grad_norm": 0.005905671510845423, "learning_rate": 0.0039166698558468155, "loss": 0.0844, "num_input_tokens_seen": 32117104, "step": 37085 }, { "epoch": 17.487034417727486, "grad_norm": 0.007756276521831751, "learning_rate": 0.0039033082278375594, "loss": 0.0918, "num_input_tokens_seen": 32121984, "step": 37090 }, { "epoch": 17.48939179632249, "grad_norm": 0.017373360693454742, "learning_rate": 0.003889969129774112, "loss": 0.1276, "num_input_tokens_seen": 32126096, "step": 37095 }, { "epoch": 17.49174917491749, "grad_norm": 0.014663071371614933, "learning_rate": 0.0038766525637135784, "loss": 0.076, "num_input_tokens_seen": 32130016, "step": 37100 }, { "epoch": 17.494106553512495, "grad_norm": 0.0149783194065094, "learning_rate": 0.0038633585317095318, "loss": 0.1117, "num_input_tokens_seen": 32133632, "step": 37105 }, { "epoch": 17.496463932107496, "grad_norm": 0.017553722485899925, "learning_rate": 0.00385008703581205, "loss": 0.0894, "num_input_tokens_seen": 32137472, "step": 37110 }, { "epoch": 17.4988213107025, "grad_norm": 0.0034000156447291374, "learning_rate": 0.0038368380780677944, "loss": 0.0387, "num_input_tokens_seen": 32141472, "step": 37115 }, { "epoch": 17.5011786892975, "grad_norm": 0.010670600458979607, "learning_rate": 0.003823611660519882, "loss": 0.049, "num_input_tokens_seen": 32145840, "step": 37120 }, { "epoch": 17.503536067892504, "grad_norm": 0.012972015887498856, "learning_rate": 0.0038104077852080475, "loss": 0.0847, "num_input_tokens_seen": 32149664, "step": 37125 }, { "epoch": 17.505893446487505, "grad_norm": 0.01400159765034914, "learning_rate": 0.003797226454168462, "loss": 0.1395, "num_input_tokens_seen": 32154208, "step": 37130 }, { "epoch": 17.50825082508251, "grad_norm": 0.00900962669402361, "learning_rate": 0.003784067669433849, "loss": 0.0655, "num_input_tokens_seen": 32158688, "step": 37135 }, { "epoch": 17.51060820367751, "grad_norm": 0.010161323472857475, "learning_rate": 0.0037709314330334528, "loss": 0.0773, "num_input_tokens_seen": 32162512, "step": 37140 }, { "epoch": 17.512965582272514, "grad_norm": 0.009984783828258514, "learning_rate": 0.003757817746993086, "loss": 0.1263, "num_input_tokens_seen": 32166288, "step": 37145 }, { "epoch": 17.515322960867515, "grad_norm": 0.005558244418352842, "learning_rate": 0.0037447266133349977, "loss": 0.0796, "num_input_tokens_seen": 32171024, "step": 37150 }, { "epoch": 17.51768033946252, "grad_norm": 0.010638963431119919, "learning_rate": 0.003731658034078039, "loss": 0.0561, "num_input_tokens_seen": 32174496, "step": 37155 }, { "epoch": 17.52003771805752, "grad_norm": 0.012513309717178345, "learning_rate": 0.0037186120112375153, "loss": 0.1451, "num_input_tokens_seen": 32179120, "step": 37160 }, { "epoch": 17.522395096652524, "grad_norm": 0.011733729392290115, "learning_rate": 0.003705588546825317, "loss": 0.0608, "num_input_tokens_seen": 32182800, "step": 37165 }, { "epoch": 17.524752475247524, "grad_norm": 0.007248970679938793, "learning_rate": 0.0036925876428498205, "loss": 0.0571, "num_input_tokens_seen": 32187936, "step": 37170 }, { "epoch": 17.52710985384253, "grad_norm": 0.020274773240089417, "learning_rate": 0.0036796093013159057, "loss": 0.0995, "num_input_tokens_seen": 32191984, "step": 37175 }, { "epoch": 17.52946723243753, "grad_norm": 0.006151061039417982, "learning_rate": 0.0036666535242250217, "loss": 0.0581, "num_input_tokens_seen": 32196512, "step": 37180 }, { "epoch": 17.531824611032533, "grad_norm": 0.015256330370903015, "learning_rate": 0.003653720313575104, "loss": 0.1234, "num_input_tokens_seen": 32201168, "step": 37185 }, { "epoch": 17.534181989627534, "grad_norm": 0.011192667298018932, "learning_rate": 0.003640809671360623, "loss": 0.0523, "num_input_tokens_seen": 32205120, "step": 37190 }, { "epoch": 17.536539368222538, "grad_norm": 0.023285001516342163, "learning_rate": 0.003627921599572553, "loss": 0.0831, "num_input_tokens_seen": 32209440, "step": 37195 }, { "epoch": 17.53889674681754, "grad_norm": 0.028738277032971382, "learning_rate": 0.003615056100198405, "loss": 0.105, "num_input_tokens_seen": 32213584, "step": 37200 }, { "epoch": 17.53889674681754, "eval_loss": 0.28394991159439087, "eval_runtime": 21.9443, "eval_samples_per_second": 42.972, "eval_steps_per_second": 21.509, "num_input_tokens_seen": 32213584, "step": 37200 }, { "epoch": 17.541254125412543, "grad_norm": 0.006572001148015261, "learning_rate": 0.003602213175222174, "loss": 0.1017, "num_input_tokens_seen": 32217664, "step": 37205 }, { "epoch": 17.543611504007544, "grad_norm": 0.013822491280734539, "learning_rate": 0.0035893928266244432, "loss": 0.0583, "num_input_tokens_seen": 32222288, "step": 37210 }, { "epoch": 17.545968882602544, "grad_norm": 0.00720096193253994, "learning_rate": 0.003576595056382248, "loss": 0.0867, "num_input_tokens_seen": 32226720, "step": 37215 }, { "epoch": 17.54832626119755, "grad_norm": 0.0058188047260046005, "learning_rate": 0.0035638198664691423, "loss": 0.1175, "num_input_tokens_seen": 32230160, "step": 37220 }, { "epoch": 17.55068363979255, "grad_norm": 0.008298723958432674, "learning_rate": 0.003551067258855267, "loss": 0.1041, "num_input_tokens_seen": 32234784, "step": 37225 }, { "epoch": 17.553041018387553, "grad_norm": 0.016752494499087334, "learning_rate": 0.0035383372355071996, "loss": 0.1477, "num_input_tokens_seen": 32239920, "step": 37230 }, { "epoch": 17.555398396982554, "grad_norm": 0.025318287312984467, "learning_rate": 0.0035256297983881023, "loss": 0.2348, "num_input_tokens_seen": 32243888, "step": 37235 }, { "epoch": 17.557755775577558, "grad_norm": 0.01791287213563919, "learning_rate": 0.0035129449494575747, "loss": 0.0963, "num_input_tokens_seen": 32248480, "step": 37240 }, { "epoch": 17.56011315417256, "grad_norm": 0.002624685177579522, "learning_rate": 0.0035002826906718187, "loss": 0.0536, "num_input_tokens_seen": 32252848, "step": 37245 }, { "epoch": 17.562470532767563, "grad_norm": 0.01675652340054512, "learning_rate": 0.003487643023983522, "loss": 0.0984, "num_input_tokens_seen": 32257040, "step": 37250 }, { "epoch": 17.564827911362563, "grad_norm": 0.010680273175239563, "learning_rate": 0.003475025951341842, "loss": 0.1299, "num_input_tokens_seen": 32260768, "step": 37255 }, { "epoch": 17.567185289957568, "grad_norm": 0.028947947546839714, "learning_rate": 0.00346243147469249, "loss": 0.2178, "num_input_tokens_seen": 32264688, "step": 37260 }, { "epoch": 17.569542668552568, "grad_norm": 0.0035651070065796375, "learning_rate": 0.0034498595959777446, "loss": 0.1528, "num_input_tokens_seen": 32269792, "step": 37265 }, { "epoch": 17.571900047147572, "grad_norm": 0.014632598496973515, "learning_rate": 0.003437310317136305, "loss": 0.0711, "num_input_tokens_seen": 32274320, "step": 37270 }, { "epoch": 17.574257425742573, "grad_norm": 0.01866121217608452, "learning_rate": 0.0034247836401034236, "loss": 0.1379, "num_input_tokens_seen": 32278816, "step": 37275 }, { "epoch": 17.576614804337577, "grad_norm": 0.006002443376928568, "learning_rate": 0.003412279566810905, "loss": 0.0563, "num_input_tokens_seen": 32282816, "step": 37280 }, { "epoch": 17.578972182932578, "grad_norm": 0.02452298067510128, "learning_rate": 0.00339979809918699, "loss": 0.1561, "num_input_tokens_seen": 32286432, "step": 37285 }, { "epoch": 17.581329561527582, "grad_norm": 0.007970267906785011, "learning_rate": 0.0033873392391565228, "loss": 0.0857, "num_input_tokens_seen": 32290608, "step": 37290 }, { "epoch": 17.583686940122583, "grad_norm": 0.014925443567335606, "learning_rate": 0.003374902988640782, "loss": 0.1255, "num_input_tokens_seen": 32295344, "step": 37295 }, { "epoch": 17.586044318717587, "grad_norm": 0.013276524841785431, "learning_rate": 0.0033624893495576014, "loss": 0.0919, "num_input_tokens_seen": 32299728, "step": 37300 }, { "epoch": 17.588401697312587, "grad_norm": 0.0026049637235701084, "learning_rate": 0.0033500983238213323, "loss": 0.0503, "num_input_tokens_seen": 32303872, "step": 37305 }, { "epoch": 17.59075907590759, "grad_norm": 0.022344935685396194, "learning_rate": 0.0033377299133428126, "loss": 0.1128, "num_input_tokens_seen": 32307600, "step": 37310 }, { "epoch": 17.593116454502592, "grad_norm": 0.009955333545804024, "learning_rate": 0.003325384120029434, "loss": 0.0765, "num_input_tokens_seen": 32312016, "step": 37315 }, { "epoch": 17.595473833097596, "grad_norm": 0.006067886482924223, "learning_rate": 0.0033130609457850233, "loss": 0.0423, "num_input_tokens_seen": 32316160, "step": 37320 }, { "epoch": 17.597831211692597, "grad_norm": 0.01821281388401985, "learning_rate": 0.0033007603925100104, "loss": 0.1538, "num_input_tokens_seen": 32320480, "step": 37325 }, { "epoch": 17.6001885902876, "grad_norm": 0.025433285161852837, "learning_rate": 0.003288482462101294, "loss": 0.0863, "num_input_tokens_seen": 32325520, "step": 37330 }, { "epoch": 17.602545968882602, "grad_norm": 0.00830509327352047, "learning_rate": 0.0032762271564522605, "loss": 0.1245, "num_input_tokens_seen": 32330288, "step": 37335 }, { "epoch": 17.604903347477606, "grad_norm": 0.015903238207101822, "learning_rate": 0.003263994477452864, "loss": 0.1014, "num_input_tokens_seen": 32333824, "step": 37340 }, { "epoch": 17.607260726072607, "grad_norm": 0.023675723001360893, "learning_rate": 0.0032517844269895125, "loss": 0.0691, "num_input_tokens_seen": 32338240, "step": 37345 }, { "epoch": 17.60961810466761, "grad_norm": 0.009697126224637032, "learning_rate": 0.0032395970069451496, "loss": 0.096, "num_input_tokens_seen": 32342336, "step": 37350 }, { "epoch": 17.61197548326261, "grad_norm": 0.020435182377696037, "learning_rate": 0.0032274322191992388, "loss": 0.1761, "num_input_tokens_seen": 32346512, "step": 37355 }, { "epoch": 17.614332861857616, "grad_norm": 0.00927786622196436, "learning_rate": 0.0032152900656277294, "loss": 0.1245, "num_input_tokens_seen": 32350672, "step": 37360 }, { "epoch": 17.616690240452616, "grad_norm": 0.01766171306371689, "learning_rate": 0.0032031705481030902, "loss": 0.0984, "num_input_tokens_seen": 32355408, "step": 37365 }, { "epoch": 17.61904761904762, "grad_norm": 0.012787561863660812, "learning_rate": 0.0031910736684943428, "loss": 0.1077, "num_input_tokens_seen": 32359648, "step": 37370 }, { "epoch": 17.62140499764262, "grad_norm": 0.016698474064469337, "learning_rate": 0.0031789994286669453, "loss": 0.0796, "num_input_tokens_seen": 32363952, "step": 37375 }, { "epoch": 17.623762376237625, "grad_norm": 0.028920453041791916, "learning_rate": 0.003166947830482908, "loss": 0.0849, "num_input_tokens_seen": 32368544, "step": 37380 }, { "epoch": 17.626119754832626, "grad_norm": 0.005374092608690262, "learning_rate": 0.003154918875800727, "loss": 0.0817, "num_input_tokens_seen": 32373184, "step": 37385 }, { "epoch": 17.62847713342763, "grad_norm": 0.009427309036254883, "learning_rate": 0.00314291256647542, "loss": 0.0646, "num_input_tokens_seen": 32377232, "step": 37390 }, { "epoch": 17.63083451202263, "grad_norm": 0.0013187407748773694, "learning_rate": 0.0031309289043585375, "loss": 0.0657, "num_input_tokens_seen": 32381008, "step": 37395 }, { "epoch": 17.633191890617635, "grad_norm": 0.01300282496958971, "learning_rate": 0.003118967891298069, "loss": 0.0659, "num_input_tokens_seen": 32385920, "step": 37400 }, { "epoch": 17.633191890617635, "eval_loss": 0.2861294150352478, "eval_runtime": 21.9042, "eval_samples_per_second": 43.051, "eval_steps_per_second": 21.548, "num_input_tokens_seen": 32385920, "step": 37400 }, { "epoch": 17.635549269212635, "grad_norm": 0.011255573481321335, "learning_rate": 0.003107029529138572, "loss": 0.0893, "num_input_tokens_seen": 32390528, "step": 37405 }, { "epoch": 17.63790664780764, "grad_norm": 0.0017460254020988941, "learning_rate": 0.0030951138197211235, "loss": 0.1768, "num_input_tokens_seen": 32395040, "step": 37410 }, { "epoch": 17.64026402640264, "grad_norm": 0.012948630377650261, "learning_rate": 0.0030832207648832377, "loss": 0.0875, "num_input_tokens_seen": 32399440, "step": 37415 }, { "epoch": 17.64262140499764, "grad_norm": 0.008558126166462898, "learning_rate": 0.0030713503664589635, "loss": 0.0797, "num_input_tokens_seen": 32403184, "step": 37420 }, { "epoch": 17.644978783592645, "grad_norm": 0.004658039193600416, "learning_rate": 0.0030595026262788872, "loss": 0.1151, "num_input_tokens_seen": 32406848, "step": 37425 }, { "epoch": 17.647336162187646, "grad_norm": 0.016357524320483208, "learning_rate": 0.00304767754617008, "loss": 0.0802, "num_input_tokens_seen": 32411280, "step": 37430 }, { "epoch": 17.64969354078265, "grad_norm": 0.017679346725344658, "learning_rate": 0.003035875127956117, "loss": 0.0913, "num_input_tokens_seen": 32416000, "step": 37435 }, { "epoch": 17.65205091937765, "grad_norm": 0.007143513765186071, "learning_rate": 0.0030240953734570752, "loss": 0.12, "num_input_tokens_seen": 32419760, "step": 37440 }, { "epoch": 17.654408297972655, "grad_norm": 0.01890089362859726, "learning_rate": 0.003012338284489535, "loss": 0.0869, "num_input_tokens_seen": 32423408, "step": 37445 }, { "epoch": 17.656765676567655, "grad_norm": 0.009493334218859673, "learning_rate": 0.0030006038628665964, "loss": 0.0664, "num_input_tokens_seen": 32427728, "step": 37450 }, { "epoch": 17.65912305516266, "grad_norm": 0.019914016127586365, "learning_rate": 0.002988892110397845, "loss": 0.0883, "num_input_tokens_seen": 32431920, "step": 37455 }, { "epoch": 17.66148043375766, "grad_norm": 0.016373831778764725, "learning_rate": 0.0029772030288894025, "loss": 0.1019, "num_input_tokens_seen": 32435696, "step": 37460 }, { "epoch": 17.663837812352664, "grad_norm": 0.014126284047961235, "learning_rate": 0.0029655366201438438, "loss": 0.0993, "num_input_tokens_seen": 32440208, "step": 37465 }, { "epoch": 17.666195190947665, "grad_norm": 0.01077905111014843, "learning_rate": 0.0029538928859602965, "loss": 0.1074, "num_input_tokens_seen": 32444576, "step": 37470 }, { "epoch": 17.66855256954267, "grad_norm": 0.009381688199937344, "learning_rate": 0.002942271828134374, "loss": 0.1603, "num_input_tokens_seen": 32448864, "step": 37475 }, { "epoch": 17.67090994813767, "grad_norm": 0.016609903424978256, "learning_rate": 0.00293067344845816, "loss": 0.0819, "num_input_tokens_seen": 32452800, "step": 37480 }, { "epoch": 17.673267326732674, "grad_norm": 0.013592839241027832, "learning_rate": 0.0029190977487202896, "loss": 0.1395, "num_input_tokens_seen": 32456576, "step": 37485 }, { "epoch": 17.675624705327674, "grad_norm": 0.006814047694206238, "learning_rate": 0.0029075447307058853, "loss": 0.0666, "num_input_tokens_seen": 32461504, "step": 37490 }, { "epoch": 17.67798208392268, "grad_norm": 0.006972964387387037, "learning_rate": 0.0028960143961965722, "loss": 0.0777, "num_input_tokens_seen": 32465904, "step": 37495 }, { "epoch": 17.68033946251768, "grad_norm": 0.009058712050318718, "learning_rate": 0.002884506746970461, "loss": 0.1298, "num_input_tokens_seen": 32469888, "step": 37500 }, { "epoch": 17.682696841112683, "grad_norm": 0.011921214871108532, "learning_rate": 0.0028730217848021654, "loss": 0.075, "num_input_tokens_seen": 32473696, "step": 37505 }, { "epoch": 17.685054219707684, "grad_norm": 0.01218518428504467, "learning_rate": 0.0028615595114628188, "loss": 0.1105, "num_input_tokens_seen": 32478128, "step": 37510 }, { "epoch": 17.68741159830269, "grad_norm": 0.014437354169785976, "learning_rate": 0.002850119928720074, "loss": 0.1168, "num_input_tokens_seen": 32482352, "step": 37515 }, { "epoch": 17.68976897689769, "grad_norm": 0.005329999607056379, "learning_rate": 0.0028387030383380195, "loss": 0.0968, "num_input_tokens_seen": 32486432, "step": 37520 }, { "epoch": 17.692126355492693, "grad_norm": 0.011276536621153355, "learning_rate": 0.0028273088420772974, "loss": 0.0545, "num_input_tokens_seen": 32490784, "step": 37525 }, { "epoch": 17.694483734087694, "grad_norm": 0.01851036585867405, "learning_rate": 0.002815937341695068, "loss": 0.0982, "num_input_tokens_seen": 32495744, "step": 37530 }, { "epoch": 17.696841112682698, "grad_norm": 0.02714168280363083, "learning_rate": 0.0028045885389448963, "loss": 0.1136, "num_input_tokens_seen": 32499408, "step": 37535 }, { "epoch": 17.6991984912777, "grad_norm": 0.015347837470471859, "learning_rate": 0.002793262435576965, "loss": 0.1077, "num_input_tokens_seen": 32504096, "step": 37540 }, { "epoch": 17.701555869872703, "grad_norm": 0.0026802239008247852, "learning_rate": 0.0027819590333378772, "loss": 0.0792, "num_input_tokens_seen": 32508144, "step": 37545 }, { "epoch": 17.703913248467703, "grad_norm": 0.01572924666106701, "learning_rate": 0.002770678333970755, "loss": 0.1436, "num_input_tokens_seen": 32512352, "step": 37550 }, { "epoch": 17.706270627062707, "grad_norm": 0.004579825326800346, "learning_rate": 0.0027594203392152573, "loss": 0.0852, "num_input_tokens_seen": 32516576, "step": 37555 }, { "epoch": 17.708628005657708, "grad_norm": 0.017606457695364952, "learning_rate": 0.002748185050807478, "loss": 0.1346, "num_input_tokens_seen": 32521264, "step": 37560 }, { "epoch": 17.710985384252712, "grad_norm": 0.005874218884855509, "learning_rate": 0.002736972470480031, "loss": 0.1962, "num_input_tokens_seen": 32525456, "step": 37565 }, { "epoch": 17.713342762847713, "grad_norm": 0.015978962182998657, "learning_rate": 0.002725782599962068, "loss": 0.1892, "num_input_tokens_seen": 32529648, "step": 37570 }, { "epoch": 17.715700141442717, "grad_norm": 0.020400643348693848, "learning_rate": 0.0027146154409791734, "loss": 0.1237, "num_input_tokens_seen": 32534176, "step": 37575 }, { "epoch": 17.718057520037718, "grad_norm": 0.018990276381373405, "learning_rate": 0.002703470995253504, "loss": 0.1115, "num_input_tokens_seen": 32538080, "step": 37580 }, { "epoch": 17.720414898632722, "grad_norm": 0.006479684263467789, "learning_rate": 0.0026923492645036184, "loss": 0.0851, "num_input_tokens_seen": 32542848, "step": 37585 }, { "epoch": 17.722772277227723, "grad_norm": 0.003159410087391734, "learning_rate": 0.0026812502504446776, "loss": 0.0339, "num_input_tokens_seen": 32547456, "step": 37590 }, { "epoch": 17.725129655822727, "grad_norm": 0.029272079467773438, "learning_rate": 0.0026701739547882798, "loss": 0.1044, "num_input_tokens_seen": 32551168, "step": 37595 }, { "epoch": 17.727487034417727, "grad_norm": 0.019184965640306473, "learning_rate": 0.0026591203792425077, "loss": 0.094, "num_input_tokens_seen": 32555856, "step": 37600 }, { "epoch": 17.727487034417727, "eval_loss": 0.28277096152305603, "eval_runtime": 21.8781, "eval_samples_per_second": 43.102, "eval_steps_per_second": 21.574, "num_input_tokens_seen": 32555856, "step": 37600 }, { "epoch": 17.72984441301273, "grad_norm": 0.012827574275434017, "learning_rate": 0.0026480895255119818, "loss": 0.0603, "num_input_tokens_seen": 32560112, "step": 37605 }, { "epoch": 17.732201791607732, "grad_norm": 0.006732284557074308, "learning_rate": 0.002637081395297791, "loss": 0.0978, "num_input_tokens_seen": 32563312, "step": 37610 }, { "epoch": 17.734559170202736, "grad_norm": 0.0031669391319155693, "learning_rate": 0.0026260959902975113, "loss": 0.0983, "num_input_tokens_seen": 32567104, "step": 37615 }, { "epoch": 17.736916548797737, "grad_norm": 0.030518831685185432, "learning_rate": 0.00261513331220527, "loss": 0.1412, "num_input_tokens_seen": 32570896, "step": 37620 }, { "epoch": 17.739273927392738, "grad_norm": 0.02441331557929516, "learning_rate": 0.0026041933627116154, "loss": 0.0618, "num_input_tokens_seen": 32574768, "step": 37625 }, { "epoch": 17.74163130598774, "grad_norm": 0.005071558523923159, "learning_rate": 0.0025932761435036476, "loss": 0.0604, "num_input_tokens_seen": 32579024, "step": 37630 }, { "epoch": 17.743988684582742, "grad_norm": 0.012346313335001469, "learning_rate": 0.002582381656264904, "loss": 0.0949, "num_input_tokens_seen": 32584624, "step": 37635 }, { "epoch": 17.746346063177747, "grad_norm": 0.014568074606359005, "learning_rate": 0.0025715099026754895, "loss": 0.1068, "num_input_tokens_seen": 32589120, "step": 37640 }, { "epoch": 17.748703441772747, "grad_norm": 0.016098206862807274, "learning_rate": 0.002560660884411947, "loss": 0.0926, "num_input_tokens_seen": 32593872, "step": 37645 }, { "epoch": 17.75106082036775, "grad_norm": 0.00982583686709404, "learning_rate": 0.0025498346031473385, "loss": 0.0875, "num_input_tokens_seen": 32598016, "step": 37650 }, { "epoch": 17.753418198962752, "grad_norm": 0.012439287267625332, "learning_rate": 0.0025390310605511945, "loss": 0.1056, "num_input_tokens_seen": 32602240, "step": 37655 }, { "epoch": 17.755775577557756, "grad_norm": 0.01763077825307846, "learning_rate": 0.0025282502582895995, "loss": 0.0586, "num_input_tokens_seen": 32607216, "step": 37660 }, { "epoch": 17.758132956152757, "grad_norm": 0.025673573836684227, "learning_rate": 0.002517492198025023, "loss": 0.0939, "num_input_tokens_seen": 32611392, "step": 37665 }, { "epoch": 17.76049033474776, "grad_norm": 0.009384541772305965, "learning_rate": 0.0025067568814165554, "loss": 0.0787, "num_input_tokens_seen": 32614752, "step": 37670 }, { "epoch": 17.76284771334276, "grad_norm": 0.04071333259344101, "learning_rate": 0.0024960443101196884, "loss": 0.1187, "num_input_tokens_seen": 32618864, "step": 37675 }, { "epoch": 17.765205091937766, "grad_norm": 0.0332196019589901, "learning_rate": 0.002485354485786434, "loss": 0.1351, "num_input_tokens_seen": 32622656, "step": 37680 }, { "epoch": 17.767562470532766, "grad_norm": 0.01608564704656601, "learning_rate": 0.002474687410065307, "loss": 0.0966, "num_input_tokens_seen": 32626544, "step": 37685 }, { "epoch": 17.76991984912777, "grad_norm": 0.007058768067508936, "learning_rate": 0.002464043084601308, "loss": 0.0811, "num_input_tokens_seen": 32632688, "step": 37690 }, { "epoch": 17.77227722772277, "grad_norm": 0.019096404314041138, "learning_rate": 0.0024534215110358915, "loss": 0.0927, "num_input_tokens_seen": 32637120, "step": 37695 }, { "epoch": 17.774634606317775, "grad_norm": 0.008591579273343086, "learning_rate": 0.002442822691007096, "loss": 0.0939, "num_input_tokens_seen": 32641328, "step": 37700 }, { "epoch": 17.776991984912776, "grad_norm": 0.004804564174264669, "learning_rate": 0.002432246626149348, "loss": 0.0823, "num_input_tokens_seen": 32645216, "step": 37705 }, { "epoch": 17.77934936350778, "grad_norm": 0.009266759268939495, "learning_rate": 0.002421693318093626, "loss": 0.119, "num_input_tokens_seen": 32649456, "step": 37710 }, { "epoch": 17.78170674210278, "grad_norm": 0.0020745634101331234, "learning_rate": 0.0024111627684673784, "loss": 0.1856, "num_input_tokens_seen": 32654400, "step": 37715 }, { "epoch": 17.784064120697785, "grad_norm": 0.013148010708391666, "learning_rate": 0.0024006549788945395, "loss": 0.1064, "num_input_tokens_seen": 32659600, "step": 37720 }, { "epoch": 17.786421499292786, "grad_norm": 0.0368446446955204, "learning_rate": 0.0023901699509955463, "loss": 0.1446, "num_input_tokens_seen": 32663968, "step": 37725 }, { "epoch": 17.78877887788779, "grad_norm": 0.004968029912561178, "learning_rate": 0.0023797076863873554, "loss": 0.0615, "num_input_tokens_seen": 32668160, "step": 37730 }, { "epoch": 17.79113625648279, "grad_norm": 0.00618192320689559, "learning_rate": 0.0023692681866833262, "loss": 0.0591, "num_input_tokens_seen": 32672480, "step": 37735 }, { "epoch": 17.793493635077795, "grad_norm": 0.024555517360568047, "learning_rate": 0.0023588514534934046, "loss": 0.1072, "num_input_tokens_seen": 32676688, "step": 37740 }, { "epoch": 17.795851013672795, "grad_norm": 0.012568702921271324, "learning_rate": 0.002348457488423955, "loss": 0.0657, "num_input_tokens_seen": 32682080, "step": 37745 }, { "epoch": 17.7982083922678, "grad_norm": 0.011449404992163181, "learning_rate": 0.0023380862930778624, "loss": 0.1276, "num_input_tokens_seen": 32686432, "step": 37750 }, { "epoch": 17.8005657708628, "grad_norm": 0.025019392371177673, "learning_rate": 0.0023277378690545135, "loss": 0.0928, "num_input_tokens_seen": 32690384, "step": 37755 }, { "epoch": 17.802923149457804, "grad_norm": 0.003095651278272271, "learning_rate": 0.0023174122179497325, "loss": 0.1078, "num_input_tokens_seen": 32694912, "step": 37760 }, { "epoch": 17.805280528052805, "grad_norm": 0.0078053465113043785, "learning_rate": 0.0023071093413558784, "loss": 0.1156, "num_input_tokens_seen": 32699856, "step": 37765 }, { "epoch": 17.80763790664781, "grad_norm": 0.016472671180963516, "learning_rate": 0.002296829240861814, "loss": 0.1026, "num_input_tokens_seen": 32704544, "step": 37770 }, { "epoch": 17.80999528524281, "grad_norm": 0.007889578118920326, "learning_rate": 0.002286571918052821, "loss": 0.0774, "num_input_tokens_seen": 32708256, "step": 37775 }, { "epoch": 17.812352663837814, "grad_norm": 0.017487863078713417, "learning_rate": 0.0022763373745107174, "loss": 0.1092, "num_input_tokens_seen": 32712368, "step": 37780 }, { "epoch": 17.814710042432814, "grad_norm": 0.013089492917060852, "learning_rate": 0.0022661256118138074, "loss": 0.0532, "num_input_tokens_seen": 32717088, "step": 37785 }, { "epoch": 17.81706742102782, "grad_norm": 0.007651366759091616, "learning_rate": 0.0022559366315368645, "loss": 0.0673, "num_input_tokens_seen": 32720816, "step": 37790 }, { "epoch": 17.81942479962282, "grad_norm": 0.011137564666569233, "learning_rate": 0.002245770435251182, "loss": 0.0738, "num_input_tokens_seen": 32725120, "step": 37795 }, { "epoch": 17.821782178217823, "grad_norm": 0.011898873373866081, "learning_rate": 0.002235627024524456, "loss": 0.113, "num_input_tokens_seen": 32729024, "step": 37800 }, { "epoch": 17.821782178217823, "eval_loss": 0.2855503559112549, "eval_runtime": 21.9134, "eval_samples_per_second": 43.033, "eval_steps_per_second": 21.539, "num_input_tokens_seen": 32729024, "step": 37800 }, { "epoch": 17.824139556812824, "grad_norm": 0.013530240394175053, "learning_rate": 0.0022255064009209847, "loss": 0.0791, "num_input_tokens_seen": 32733504, "step": 37805 }, { "epoch": 17.826496935407828, "grad_norm": 0.0020145338494330645, "learning_rate": 0.0022154085660014864, "loss": 0.0682, "num_input_tokens_seen": 32738192, "step": 37810 }, { "epoch": 17.82885431400283, "grad_norm": 0.00931021198630333, "learning_rate": 0.0022053335213231494, "loss": 0.1012, "num_input_tokens_seen": 32742304, "step": 37815 }, { "epoch": 17.831211692597833, "grad_norm": 0.012183364480733871, "learning_rate": 0.002195281268439697, "loss": 0.115, "num_input_tokens_seen": 32747328, "step": 37820 }, { "epoch": 17.833569071192834, "grad_norm": 0.01888197660446167, "learning_rate": 0.002185251808901306, "loss": 0.0931, "num_input_tokens_seen": 32751552, "step": 37825 }, { "epoch": 17.835926449787834, "grad_norm": 0.020053604617714882, "learning_rate": 0.0021752451442546227, "loss": 0.0929, "num_input_tokens_seen": 32756144, "step": 37830 }, { "epoch": 17.83828382838284, "grad_norm": 0.03165505826473236, "learning_rate": 0.0021652612760428456, "loss": 0.0873, "num_input_tokens_seen": 32760208, "step": 37835 }, { "epoch": 17.84064120697784, "grad_norm": 0.024324428290128708, "learning_rate": 0.0021553002058055603, "loss": 0.1281, "num_input_tokens_seen": 32764960, "step": 37840 }, { "epoch": 17.842998585572843, "grad_norm": 0.0026885471306741238, "learning_rate": 0.0021453619350789376, "loss": 0.0637, "num_input_tokens_seen": 32769760, "step": 37845 }, { "epoch": 17.845355964167844, "grad_norm": 0.004285980481654406, "learning_rate": 0.0021354464653955516, "loss": 0.0907, "num_input_tokens_seen": 32774432, "step": 37850 }, { "epoch": 17.847713342762848, "grad_norm": 0.008758455514907837, "learning_rate": 0.002125553798284513, "loss": 0.045, "num_input_tokens_seen": 32779056, "step": 37855 }, { "epoch": 17.85007072135785, "grad_norm": 0.010427780449390411, "learning_rate": 0.002115683935271384, "loss": 0.1111, "num_input_tokens_seen": 32784080, "step": 37860 }, { "epoch": 17.852428099952853, "grad_norm": 0.016013622283935547, "learning_rate": 0.0021058368778782144, "loss": 0.165, "num_input_tokens_seen": 32788208, "step": 37865 }, { "epoch": 17.854785478547853, "grad_norm": 0.01217940729111433, "learning_rate": 0.002096012627623539, "loss": 0.0988, "num_input_tokens_seen": 32792576, "step": 37870 }, { "epoch": 17.857142857142858, "grad_norm": 0.007289068307727575, "learning_rate": 0.00208621118602243, "loss": 0.0632, "num_input_tokens_seen": 32796928, "step": 37875 }, { "epoch": 17.85950023573786, "grad_norm": 0.03092428669333458, "learning_rate": 0.002076432554586327, "loss": 0.1224, "num_input_tokens_seen": 32801008, "step": 37880 }, { "epoch": 17.861857614332862, "grad_norm": 0.019624466076493263, "learning_rate": 0.002066676734823258, "loss": 0.1862, "num_input_tokens_seen": 32805520, "step": 37885 }, { "epoch": 17.864214992927863, "grad_norm": 0.00908893533051014, "learning_rate": 0.0020569437282376866, "loss": 0.0858, "num_input_tokens_seen": 32809808, "step": 37890 }, { "epoch": 17.866572371522867, "grad_norm": 0.013917027972638607, "learning_rate": 0.002047233536330545, "loss": 0.0693, "num_input_tokens_seen": 32814624, "step": 37895 }, { "epoch": 17.868929750117868, "grad_norm": 0.0022596593480557203, "learning_rate": 0.0020375461605993015, "loss": 0.1049, "num_input_tokens_seen": 32818208, "step": 37900 }, { "epoch": 17.871287128712872, "grad_norm": 0.009653734974563122, "learning_rate": 0.002027881602537845, "loss": 0.0833, "num_input_tokens_seen": 32821760, "step": 37905 }, { "epoch": 17.873644507307873, "grad_norm": 0.038967959582805634, "learning_rate": 0.002018239863636567, "loss": 0.0915, "num_input_tokens_seen": 32825344, "step": 37910 }, { "epoch": 17.876001885902877, "grad_norm": 0.0022974100429564714, "learning_rate": 0.002008620945382378, "loss": 0.0888, "num_input_tokens_seen": 32829920, "step": 37915 }, { "epoch": 17.878359264497877, "grad_norm": 0.007134982384741306, "learning_rate": 0.001999024849258607, "loss": 0.1368, "num_input_tokens_seen": 32834112, "step": 37920 }, { "epoch": 17.88071664309288, "grad_norm": 0.013345448300242424, "learning_rate": 0.001989451576745105, "loss": 0.1075, "num_input_tokens_seen": 32838240, "step": 37925 }, { "epoch": 17.883074021687882, "grad_norm": 0.02551189623773098, "learning_rate": 0.00197990112931819, "loss": 0.2088, "num_input_tokens_seen": 32842432, "step": 37930 }, { "epoch": 17.885431400282886, "grad_norm": 0.012903011403977871, "learning_rate": 0.0019703735084506345, "loss": 0.0801, "num_input_tokens_seen": 32846864, "step": 37935 }, { "epoch": 17.887788778877887, "grad_norm": 0.01229048054665327, "learning_rate": 0.001960868715611763, "loss": 0.1194, "num_input_tokens_seen": 32851632, "step": 37940 }, { "epoch": 17.89014615747289, "grad_norm": 0.004790354520082474, "learning_rate": 0.0019513867522673034, "loss": 0.0713, "num_input_tokens_seen": 32856464, "step": 37945 }, { "epoch": 17.892503536067892, "grad_norm": 0.003282822435721755, "learning_rate": 0.001941927619879502, "loss": 0.0602, "num_input_tokens_seen": 32860912, "step": 37950 }, { "epoch": 17.894860914662896, "grad_norm": 0.009283834137022495, "learning_rate": 0.0019324913199070758, "loss": 0.1098, "num_input_tokens_seen": 32864816, "step": 37955 }, { "epoch": 17.897218293257897, "grad_norm": 0.003941444214433432, "learning_rate": 0.0019230778538052106, "loss": 0.0812, "num_input_tokens_seen": 32869056, "step": 37960 }, { "epoch": 17.8995756718529, "grad_norm": 0.02222757413983345, "learning_rate": 0.0019136872230255952, "loss": 0.1188, "num_input_tokens_seen": 32872816, "step": 37965 }, { "epoch": 17.9019330504479, "grad_norm": 0.030342984944581985, "learning_rate": 0.0019043194290164045, "loss": 0.1176, "num_input_tokens_seen": 32876128, "step": 37970 }, { "epoch": 17.904290429042906, "grad_norm": 0.0014170166105031967, "learning_rate": 0.0018949744732222162, "loss": 0.112, "num_input_tokens_seen": 32880960, "step": 37975 }, { "epoch": 17.906647807637906, "grad_norm": 0.012112445197999477, "learning_rate": 0.0018856523570841776, "loss": 0.0843, "num_input_tokens_seen": 32885552, "step": 37980 }, { "epoch": 17.90900518623291, "grad_norm": 0.012128538452088833, "learning_rate": 0.0018763530820398555, "loss": 0.1095, "num_input_tokens_seen": 32889664, "step": 37985 }, { "epoch": 17.91136256482791, "grad_norm": 0.018887383863329887, "learning_rate": 0.0018670766495233525, "loss": 0.0871, "num_input_tokens_seen": 32894832, "step": 37990 }, { "epoch": 17.913719943422915, "grad_norm": 0.002926361048594117, "learning_rate": 0.001857823060965158, "loss": 0.1675, "num_input_tokens_seen": 32898880, "step": 37995 }, { "epoch": 17.916077322017916, "grad_norm": 0.011991100385785103, "learning_rate": 0.0018485923177923467, "loss": 0.0887, "num_input_tokens_seen": 32902832, "step": 38000 }, { "epoch": 17.916077322017916, "eval_loss": 0.28853240609169006, "eval_runtime": 21.8656, "eval_samples_per_second": 43.127, "eval_steps_per_second": 21.586, "num_input_tokens_seen": 32902832, "step": 38000 }, { "epoch": 17.91843470061292, "grad_norm": 0.01959872432053089, "learning_rate": 0.001839384421428364, "loss": 0.1125, "num_input_tokens_seen": 32906896, "step": 38005 }, { "epoch": 17.92079207920792, "grad_norm": 0.012782145291566849, "learning_rate": 0.0018301993732932065, "loss": 0.1213, "num_input_tokens_seen": 32911168, "step": 38010 }, { "epoch": 17.92314945780292, "grad_norm": 0.012649822980165482, "learning_rate": 0.0018210371748033248, "loss": 0.0448, "num_input_tokens_seen": 32915664, "step": 38015 }, { "epoch": 17.925506836397926, "grad_norm": 0.00946555845439434, "learning_rate": 0.0018118978273716556, "loss": 0.0908, "num_input_tokens_seen": 32919824, "step": 38020 }, { "epoch": 17.927864214992926, "grad_norm": 0.014002281241118908, "learning_rate": 0.001802781332407588, "loss": 0.1055, "num_input_tokens_seen": 32925040, "step": 38025 }, { "epoch": 17.93022159358793, "grad_norm": 0.006152820307761431, "learning_rate": 0.0017936876913169806, "loss": 0.0682, "num_input_tokens_seen": 32929040, "step": 38030 }, { "epoch": 17.93257897218293, "grad_norm": 0.006718302145600319, "learning_rate": 0.0017846169055022287, "loss": 0.0528, "num_input_tokens_seen": 32933312, "step": 38035 }, { "epoch": 17.934936350777935, "grad_norm": 0.011343470774590969, "learning_rate": 0.0017755689763621295, "loss": 0.155, "num_input_tokens_seen": 32937648, "step": 38040 }, { "epoch": 17.937293729372936, "grad_norm": 0.025656158104538918, "learning_rate": 0.0017665439052920173, "loss": 0.1125, "num_input_tokens_seen": 32942048, "step": 38045 }, { "epoch": 17.93965110796794, "grad_norm": 0.008439374156296253, "learning_rate": 0.0017575416936836286, "loss": 0.0561, "num_input_tokens_seen": 32946272, "step": 38050 }, { "epoch": 17.94200848656294, "grad_norm": 0.02078508585691452, "learning_rate": 0.0017485623429252528, "loss": 0.1623, "num_input_tokens_seen": 32950704, "step": 38055 }, { "epoch": 17.944365865157945, "grad_norm": 0.022240743041038513, "learning_rate": 0.0017396058544016156, "loss": 0.1168, "num_input_tokens_seen": 32955328, "step": 38060 }, { "epoch": 17.946723243752945, "grad_norm": 0.024186881259083748, "learning_rate": 0.0017306722294938958, "loss": 0.1279, "num_input_tokens_seen": 32958928, "step": 38065 }, { "epoch": 17.94908062234795, "grad_norm": 0.021847285330295563, "learning_rate": 0.0017217614695798078, "loss": 0.0798, "num_input_tokens_seen": 32963472, "step": 38070 }, { "epoch": 17.95143800094295, "grad_norm": 0.007008741144090891, "learning_rate": 0.001712873576033469, "loss": 0.054, "num_input_tokens_seen": 32968128, "step": 38075 }, { "epoch": 17.953795379537954, "grad_norm": 0.0070619480684399605, "learning_rate": 0.0017040085502255163, "loss": 0.1337, "num_input_tokens_seen": 32973120, "step": 38080 }, { "epoch": 17.956152758132955, "grad_norm": 0.007161016575992107, "learning_rate": 0.0016951663935230565, "loss": 0.1014, "num_input_tokens_seen": 32978080, "step": 38085 }, { "epoch": 17.95851013672796, "grad_norm": 0.020568722859025, "learning_rate": 0.0016863471072896485, "loss": 0.0657, "num_input_tokens_seen": 32982240, "step": 38090 }, { "epoch": 17.96086751532296, "grad_norm": 0.002584498142823577, "learning_rate": 0.0016775506928853377, "loss": 0.1138, "num_input_tokens_seen": 32986400, "step": 38095 }, { "epoch": 17.963224893917964, "grad_norm": 0.003965984098613262, "learning_rate": 0.001668777151666656, "loss": 0.1601, "num_input_tokens_seen": 32990608, "step": 38100 }, { "epoch": 17.965582272512965, "grad_norm": 0.01800120621919632, "learning_rate": 0.0016600264849865709, "loss": 0.1195, "num_input_tokens_seen": 32994368, "step": 38105 }, { "epoch": 17.96793965110797, "grad_norm": 0.007404701318591833, "learning_rate": 0.0016512986941945695, "loss": 0.0976, "num_input_tokens_seen": 32998592, "step": 38110 }, { "epoch": 17.97029702970297, "grad_norm": 0.013168561272323132, "learning_rate": 0.0016425937806365753, "loss": 0.0402, "num_input_tokens_seen": 33003184, "step": 38115 }, { "epoch": 17.972654408297974, "grad_norm": 0.01746150106191635, "learning_rate": 0.0016339117456549979, "loss": 0.1067, "num_input_tokens_seen": 33006816, "step": 38120 }, { "epoch": 17.975011786892974, "grad_norm": 0.006270723883062601, "learning_rate": 0.0016252525905886995, "loss": 0.1301, "num_input_tokens_seen": 33011440, "step": 38125 }, { "epoch": 17.97736916548798, "grad_norm": 0.009296282194554806, "learning_rate": 0.0016166163167730617, "loss": 0.1074, "num_input_tokens_seen": 33015744, "step": 38130 }, { "epoch": 17.97972654408298, "grad_norm": 0.004519122187048197, "learning_rate": 0.0016080029255398864, "loss": 0.0748, "num_input_tokens_seen": 33019536, "step": 38135 }, { "epoch": 17.982083922677983, "grad_norm": 0.008839458227157593, "learning_rate": 0.0015994124182174606, "loss": 0.0579, "num_input_tokens_seen": 33024384, "step": 38140 }, { "epoch": 17.984441301272984, "grad_norm": 0.008435183204710484, "learning_rate": 0.001590844796130575, "loss": 0.0952, "num_input_tokens_seen": 33028816, "step": 38145 }, { "epoch": 17.986798679867988, "grad_norm": 0.012483861297369003, "learning_rate": 0.001582300060600439, "loss": 0.1705, "num_input_tokens_seen": 33033296, "step": 38150 }, { "epoch": 17.98915605846299, "grad_norm": 0.026790892705321312, "learning_rate": 0.0015737782129447652, "loss": 0.1218, "num_input_tokens_seen": 33037552, "step": 38155 }, { "epoch": 17.991513437057993, "grad_norm": 0.0038666073232889175, "learning_rate": 0.0015652792544777361, "loss": 0.095, "num_input_tokens_seen": 33041776, "step": 38160 }, { "epoch": 17.993870815652993, "grad_norm": 0.017153222113847733, "learning_rate": 0.0015568031865099863, "loss": 0.1281, "num_input_tokens_seen": 33046288, "step": 38165 }, { "epoch": 17.996228194247998, "grad_norm": 0.005000031553208828, "learning_rate": 0.0015483500103486369, "loss": 0.184, "num_input_tokens_seen": 33050224, "step": 38170 }, { "epoch": 17.998585572842998, "grad_norm": 0.029735101386904716, "learning_rate": 0.0015399197272972787, "loss": 0.1663, "num_input_tokens_seen": 33054960, "step": 38175 }, { "epoch": 18.000942951438002, "grad_norm": 0.01599138416349888, "learning_rate": 0.0015315123386559714, "loss": 0.1346, "num_input_tokens_seen": 33059456, "step": 38180 }, { "epoch": 18.003300330033003, "grad_norm": 0.010431255213916302, "learning_rate": 0.0015231278457212283, "loss": 0.093, "num_input_tokens_seen": 33065056, "step": 38185 }, { "epoch": 18.005657708628007, "grad_norm": 0.024192914366722107, "learning_rate": 0.001514766249786048, "loss": 0.1063, "num_input_tokens_seen": 33068992, "step": 38190 }, { "epoch": 18.008015087223008, "grad_norm": 0.007937935180962086, "learning_rate": 0.0015064275521398994, "loss": 0.0773, "num_input_tokens_seen": 33072944, "step": 38195 }, { "epoch": 18.010372465818012, "grad_norm": 0.007735913153737783, "learning_rate": 0.0014981117540686872, "loss": 0.1239, "num_input_tokens_seen": 33076912, "step": 38200 }, { "epoch": 18.010372465818012, "eval_loss": 0.2858264744281769, "eval_runtime": 21.9032, "eval_samples_per_second": 43.053, "eval_steps_per_second": 21.549, "num_input_tokens_seen": 33076912, "step": 38200 }, { "epoch": 18.012729844413013, "grad_norm": 0.022645853459835052, "learning_rate": 0.0014898188568548687, "loss": 0.0661, "num_input_tokens_seen": 33081536, "step": 38205 }, { "epoch": 18.015087223008017, "grad_norm": 0.030319707468152046, "learning_rate": 0.0014815488617772542, "loss": 0.0843, "num_input_tokens_seen": 33085600, "step": 38210 }, { "epoch": 18.017444601603017, "grad_norm": 0.024839386343955994, "learning_rate": 0.0014733017701112072, "loss": 0.1385, "num_input_tokens_seen": 33089824, "step": 38215 }, { "epoch": 18.019801980198018, "grad_norm": 0.011745426803827286, "learning_rate": 0.0014650775831285435, "loss": 0.0825, "num_input_tokens_seen": 33093936, "step": 38220 }, { "epoch": 18.022159358793022, "grad_norm": 0.010720506310462952, "learning_rate": 0.001456876302097515, "loss": 0.172, "num_input_tokens_seen": 33098208, "step": 38225 }, { "epoch": 18.024516737388023, "grad_norm": 0.005856699775904417, "learning_rate": 0.0014486979282828604, "loss": 0.0557, "num_input_tokens_seen": 33102528, "step": 38230 }, { "epoch": 18.026874115983027, "grad_norm": 0.003233224619179964, "learning_rate": 0.001440542462945804, "loss": 0.0857, "num_input_tokens_seen": 33106416, "step": 38235 }, { "epoch": 18.029231494578028, "grad_norm": 0.013712835498154163, "learning_rate": 0.0014324099073440232, "loss": 0.1145, "num_input_tokens_seen": 33110800, "step": 38240 }, { "epoch": 18.031588873173032, "grad_norm": 0.009180348366498947, "learning_rate": 0.0014243002627316482, "loss": 0.1125, "num_input_tokens_seen": 33115136, "step": 38245 }, { "epoch": 18.033946251768032, "grad_norm": 0.0068691992200911045, "learning_rate": 0.0014162135303592781, "loss": 0.0739, "num_input_tokens_seen": 33119488, "step": 38250 }, { "epoch": 18.036303630363037, "grad_norm": 0.010211888700723648, "learning_rate": 0.001408149711474016, "loss": 0.052, "num_input_tokens_seen": 33123568, "step": 38255 }, { "epoch": 18.038661008958037, "grad_norm": 0.0283662062138319, "learning_rate": 0.0014001088073193834, "loss": 0.1295, "num_input_tokens_seen": 33127952, "step": 38260 }, { "epoch": 18.04101838755304, "grad_norm": 0.015019943937659264, "learning_rate": 0.0013920908191354052, "loss": 0.0768, "num_input_tokens_seen": 33132976, "step": 38265 }, { "epoch": 18.043375766148042, "grad_norm": 0.002399677410721779, "learning_rate": 0.001384095748158526, "loss": 0.1325, "num_input_tokens_seen": 33137184, "step": 38270 }, { "epoch": 18.045733144743046, "grad_norm": 0.004782942123711109, "learning_rate": 0.0013761235956217255, "loss": 0.0619, "num_input_tokens_seen": 33141200, "step": 38275 }, { "epoch": 18.048090523338047, "grad_norm": 0.017798256129026413, "learning_rate": 0.0013681743627543873, "loss": 0.1282, "num_input_tokens_seen": 33146288, "step": 38280 }, { "epoch": 18.05044790193305, "grad_norm": 0.010104597546160221, "learning_rate": 0.001360248050782381, "loss": 0.077, "num_input_tokens_seen": 33150992, "step": 38285 }, { "epoch": 18.05280528052805, "grad_norm": 0.017527736723423004, "learning_rate": 0.001352344660928062, "loss": 0.1082, "num_input_tokens_seen": 33154768, "step": 38290 }, { "epoch": 18.055162659123056, "grad_norm": 0.011829888448119164, "learning_rate": 0.0013444641944102052, "loss": 0.064, "num_input_tokens_seen": 33158480, "step": 38295 }, { "epoch": 18.057520037718056, "grad_norm": 0.00579990865662694, "learning_rate": 0.0013366066524441056, "loss": 0.0422, "num_input_tokens_seen": 33162528, "step": 38300 }, { "epoch": 18.05987741631306, "grad_norm": 0.006139148026704788, "learning_rate": 0.0013287720362414768, "loss": 0.0921, "num_input_tokens_seen": 33166736, "step": 38305 }, { "epoch": 18.06223479490806, "grad_norm": 0.012056229636073112, "learning_rate": 0.0013209603470105025, "loss": 0.1282, "num_input_tokens_seen": 33170624, "step": 38310 }, { "epoch": 18.064592173503065, "grad_norm": 0.03021955117583275, "learning_rate": 0.0013131715859558857, "loss": 0.1081, "num_input_tokens_seen": 33176048, "step": 38315 }, { "epoch": 18.066949552098066, "grad_norm": 0.009102693758904934, "learning_rate": 0.001305405754278699, "loss": 0.0979, "num_input_tokens_seen": 33180768, "step": 38320 }, { "epoch": 18.06930693069307, "grad_norm": 0.003484122222289443, "learning_rate": 0.0012976628531765843, "loss": 0.0885, "num_input_tokens_seen": 33185248, "step": 38325 }, { "epoch": 18.07166430928807, "grad_norm": 0.004599242936819792, "learning_rate": 0.0012899428838435533, "loss": 0.059, "num_input_tokens_seen": 33189696, "step": 38330 }, { "epoch": 18.074021687883075, "grad_norm": 0.019366301596164703, "learning_rate": 0.001282245847470137, "loss": 0.0698, "num_input_tokens_seen": 33193312, "step": 38335 }, { "epoch": 18.076379066478076, "grad_norm": 0.013607708737254143, "learning_rate": 0.001274571745243319, "loss": 0.0964, "num_input_tokens_seen": 33198416, "step": 38340 }, { "epoch": 18.07873644507308, "grad_norm": 0.006907926872372627, "learning_rate": 0.0012669205783465364, "loss": 0.0819, "num_input_tokens_seen": 33202912, "step": 38345 }, { "epoch": 18.08109382366808, "grad_norm": 0.015390957705676556, "learning_rate": 0.001259292347959695, "loss": 0.0895, "num_input_tokens_seen": 33206768, "step": 38350 }, { "epoch": 18.083451202263085, "grad_norm": 0.014342605136334896, "learning_rate": 0.0012516870552591707, "loss": 0.0898, "num_input_tokens_seen": 33210896, "step": 38355 }, { "epoch": 18.085808580858085, "grad_norm": 0.005119668319821358, "learning_rate": 0.001244104701417792, "loss": 0.1223, "num_input_tokens_seen": 33215168, "step": 38360 }, { "epoch": 18.08816595945309, "grad_norm": 0.014188064262270927, "learning_rate": 0.0012365452876048565, "loss": 0.0436, "num_input_tokens_seen": 33218752, "step": 38365 }, { "epoch": 18.09052333804809, "grad_norm": 0.005307248793542385, "learning_rate": 0.001229008814986099, "loss": 0.0626, "num_input_tokens_seen": 33223248, "step": 38370 }, { "epoch": 18.092880716643094, "grad_norm": 0.014490621164441109, "learning_rate": 0.0012214952847237725, "loss": 0.092, "num_input_tokens_seen": 33227696, "step": 38375 }, { "epoch": 18.095238095238095, "grad_norm": 0.01472401525825262, "learning_rate": 0.0012140046979765339, "loss": 0.0652, "num_input_tokens_seen": 33231392, "step": 38380 }, { "epoch": 18.0975954738331, "grad_norm": 0.012282131239771843, "learning_rate": 0.0012065370558995258, "loss": 0.0879, "num_input_tokens_seen": 33235472, "step": 38385 }, { "epoch": 18.0999528524281, "grad_norm": 0.012077976949512959, "learning_rate": 0.0011990923596443602, "loss": 0.0714, "num_input_tokens_seen": 33239872, "step": 38390 }, { "epoch": 18.102310231023104, "grad_norm": 0.01181839220225811, "learning_rate": 0.001191670610359119, "loss": 0.0616, "num_input_tokens_seen": 33244768, "step": 38395 }, { "epoch": 18.104667609618105, "grad_norm": 0.016474559903144836, "learning_rate": 0.0011842718091882865, "loss": 0.0903, "num_input_tokens_seen": 33248832, "step": 38400 }, { "epoch": 18.104667609618105, "eval_loss": 0.28742265701293945, "eval_runtime": 21.9177, "eval_samples_per_second": 43.025, "eval_steps_per_second": 21.535, "num_input_tokens_seen": 33248832, "step": 38400 }, { "epoch": 18.10702498821311, "grad_norm": 0.019123561680316925, "learning_rate": 0.0011768959572729, "loss": 0.0999, "num_input_tokens_seen": 33253296, "step": 38405 }, { "epoch": 18.10938236680811, "grad_norm": 0.010630922392010689, "learning_rate": 0.001169543055750366, "loss": 0.1409, "num_input_tokens_seen": 33257136, "step": 38410 }, { "epoch": 18.111739745403113, "grad_norm": 0.01099251490086317, "learning_rate": 0.0011622131057546115, "loss": 0.0981, "num_input_tokens_seen": 33261504, "step": 38415 }, { "epoch": 18.114097123998114, "grad_norm": 0.015359519049525261, "learning_rate": 0.0011549061084160316, "loss": 0.1045, "num_input_tokens_seen": 33265600, "step": 38420 }, { "epoch": 18.116454502593115, "grad_norm": 0.030503666028380394, "learning_rate": 0.0011476220648614088, "loss": 0.1859, "num_input_tokens_seen": 33269104, "step": 38425 }, { "epoch": 18.11881188118812, "grad_norm": 0.014417004771530628, "learning_rate": 0.0011403609762140777, "loss": 0.1513, "num_input_tokens_seen": 33273632, "step": 38430 }, { "epoch": 18.12116925978312, "grad_norm": 0.018436715006828308, "learning_rate": 0.0011331228435937756, "loss": 0.1662, "num_input_tokens_seen": 33278704, "step": 38435 }, { "epoch": 18.123526638378124, "grad_norm": 0.008053172379732132, "learning_rate": 0.0011259076681166935, "loss": 0.0726, "num_input_tokens_seen": 33282768, "step": 38440 }, { "epoch": 18.125884016973124, "grad_norm": 0.012580829672515392, "learning_rate": 0.0011187154508955244, "loss": 0.0909, "num_input_tokens_seen": 33286416, "step": 38445 }, { "epoch": 18.12824139556813, "grad_norm": 0.010426346212625504, "learning_rate": 0.001111546193039381, "loss": 0.0684, "num_input_tokens_seen": 33290400, "step": 38450 }, { "epoch": 18.13059877416313, "grad_norm": 0.009272490628063679, "learning_rate": 0.0011043998956538792, "loss": 0.079, "num_input_tokens_seen": 33295120, "step": 38455 }, { "epoch": 18.132956152758133, "grad_norm": 0.018105091527104378, "learning_rate": 0.0010972765598410538, "loss": 0.0702, "num_input_tokens_seen": 33299280, "step": 38460 }, { "epoch": 18.135313531353134, "grad_norm": 0.01514426525682211, "learning_rate": 0.0010901761866993931, "loss": 0.1328, "num_input_tokens_seen": 33303856, "step": 38465 }, { "epoch": 18.137670909948138, "grad_norm": 0.005651990417391062, "learning_rate": 0.0010830987773238876, "loss": 0.064, "num_input_tokens_seen": 33308048, "step": 38470 }, { "epoch": 18.14002828854314, "grad_norm": 0.010926861315965652, "learning_rate": 0.0010760443328059644, "loss": 0.0569, "num_input_tokens_seen": 33312288, "step": 38475 }, { "epoch": 18.142385667138143, "grad_norm": 0.008650146424770355, "learning_rate": 0.001069012854233503, "loss": 0.0549, "num_input_tokens_seen": 33316864, "step": 38480 }, { "epoch": 18.144743045733144, "grad_norm": 0.0202487763017416, "learning_rate": 0.0010620043426908365, "loss": 0.1778, "num_input_tokens_seen": 33321184, "step": 38485 }, { "epoch": 18.147100424328148, "grad_norm": 0.016532903537154198, "learning_rate": 0.0010550187992587833, "loss": 0.1384, "num_input_tokens_seen": 33325168, "step": 38490 }, { "epoch": 18.14945780292315, "grad_norm": 0.01289845909923315, "learning_rate": 0.0010480562250145653, "loss": 0.0975, "num_input_tokens_seen": 33329104, "step": 38495 }, { "epoch": 18.151815181518153, "grad_norm": 0.016277363523840904, "learning_rate": 0.0010411166210319567, "loss": 0.1157, "num_input_tokens_seen": 33333424, "step": 38500 }, { "epoch": 18.154172560113153, "grad_norm": 0.0018365181749686599, "learning_rate": 0.0010341999883810848, "loss": 0.0574, "num_input_tokens_seen": 33337008, "step": 38505 }, { "epoch": 18.156529938708157, "grad_norm": 0.030944960191845894, "learning_rate": 0.0010273063281285965, "loss": 0.1482, "num_input_tokens_seen": 33341840, "step": 38510 }, { "epoch": 18.158887317303158, "grad_norm": 0.024335481226444244, "learning_rate": 0.0010204356413375747, "loss": 0.1128, "num_input_tokens_seen": 33346752, "step": 38515 }, { "epoch": 18.161244695898162, "grad_norm": 0.01192509476095438, "learning_rate": 0.001013587929067572, "loss": 0.1025, "num_input_tokens_seen": 33350400, "step": 38520 }, { "epoch": 18.163602074493163, "grad_norm": 0.00975954532623291, "learning_rate": 0.00100676319237461, "loss": 0.1119, "num_input_tokens_seen": 33354624, "step": 38525 }, { "epoch": 18.165959453088167, "grad_norm": 0.0026107397861778736, "learning_rate": 0.0009999614323110972, "loss": 0.1259, "num_input_tokens_seen": 33358816, "step": 38530 }, { "epoch": 18.168316831683168, "grad_norm": 0.00970445852726698, "learning_rate": 0.000993182649926011, "loss": 0.0637, "num_input_tokens_seen": 33363568, "step": 38535 }, { "epoch": 18.17067421027817, "grad_norm": 0.01386620756238699, "learning_rate": 0.000986426846264682, "loss": 0.0497, "num_input_tokens_seen": 33367920, "step": 38540 }, { "epoch": 18.173031588873172, "grad_norm": 0.015404759906232357, "learning_rate": 0.00097969402236896, "loss": 0.1391, "num_input_tokens_seen": 33372208, "step": 38545 }, { "epoch": 18.175388967468177, "grad_norm": 0.011873593553900719, "learning_rate": 0.0009729841792771143, "loss": 0.158, "num_input_tokens_seen": 33377040, "step": 38550 }, { "epoch": 18.177746346063177, "grad_norm": 0.008375647477805614, "learning_rate": 0.0009662973180239176, "loss": 0.0997, "num_input_tokens_seen": 33380976, "step": 38555 }, { "epoch": 18.18010372465818, "grad_norm": 0.012741712853312492, "learning_rate": 0.0009596334396405448, "loss": 0.1198, "num_input_tokens_seen": 33385504, "step": 38560 }, { "epoch": 18.182461103253182, "grad_norm": 0.0021736614871770144, "learning_rate": 0.0009529925451546406, "loss": 0.0653, "num_input_tokens_seen": 33389408, "step": 38565 }, { "epoch": 18.184818481848186, "grad_norm": 0.00871674157679081, "learning_rate": 0.0009463746355903357, "loss": 0.047, "num_input_tokens_seen": 33393344, "step": 38570 }, { "epoch": 18.187175860443187, "grad_norm": 0.00917226355522871, "learning_rate": 0.0009397797119681971, "loss": 0.1292, "num_input_tokens_seen": 33397856, "step": 38575 }, { "epoch": 18.18953323903819, "grad_norm": 0.014530804008245468, "learning_rate": 0.0009332077753052281, "loss": 0.0802, "num_input_tokens_seen": 33402176, "step": 38580 }, { "epoch": 18.19189061763319, "grad_norm": 0.013889393769204617, "learning_rate": 0.0009266588266149011, "loss": 0.1366, "num_input_tokens_seen": 33406656, "step": 38585 }, { "epoch": 18.194247996228196, "grad_norm": 0.006645447574555874, "learning_rate": 0.0009201328669071584, "loss": 0.102, "num_input_tokens_seen": 33411088, "step": 38590 }, { "epoch": 18.196605374823196, "grad_norm": 0.007802989333868027, "learning_rate": 0.0009136298971883949, "loss": 0.0591, "num_input_tokens_seen": 33416176, "step": 38595 }, { "epoch": 18.1989627534182, "grad_norm": 0.0016977130435407162, "learning_rate": 0.0009071499184614251, "loss": 0.0294, "num_input_tokens_seen": 33420800, "step": 38600 }, { "epoch": 18.1989627534182, "eval_loss": 0.28807637095451355, "eval_runtime": 21.919, "eval_samples_per_second": 43.022, "eval_steps_per_second": 21.534, "num_input_tokens_seen": 33420800, "step": 38600 }, { "epoch": 18.2013201320132, "grad_norm": 0.002798629691824317, "learning_rate": 0.0009006929317255663, "loss": 0.0974, "num_input_tokens_seen": 33424832, "step": 38605 }, { "epoch": 18.203677510608205, "grad_norm": 0.014146587811410427, "learning_rate": 0.0008942589379765387, "loss": 0.0799, "num_input_tokens_seen": 33428672, "step": 38610 }, { "epoch": 18.206034889203206, "grad_norm": 0.01277405396103859, "learning_rate": 0.0008878479382065817, "loss": 0.115, "num_input_tokens_seen": 33432528, "step": 38615 }, { "epoch": 18.208392267798207, "grad_norm": 0.017665691673755646, "learning_rate": 0.0008814599334043215, "loss": 0.1295, "num_input_tokens_seen": 33436816, "step": 38620 }, { "epoch": 18.21074964639321, "grad_norm": 0.01662038080394268, "learning_rate": 0.0008750949245548866, "loss": 0.0778, "num_input_tokens_seen": 33441536, "step": 38625 }, { "epoch": 18.21310702498821, "grad_norm": 0.0032254511024802923, "learning_rate": 0.0008687529126398252, "loss": 0.0592, "num_input_tokens_seen": 33445664, "step": 38630 }, { "epoch": 18.215464403583216, "grad_norm": 0.00875080842524767, "learning_rate": 0.0008624338986371715, "loss": 0.0471, "num_input_tokens_seen": 33450240, "step": 38635 }, { "epoch": 18.217821782178216, "grad_norm": 0.019642548635601997, "learning_rate": 0.0008561378835213962, "loss": 0.0872, "num_input_tokens_seen": 33454448, "step": 38640 }, { "epoch": 18.22017916077322, "grad_norm": 0.009133032523095608, "learning_rate": 0.0008498648682634058, "loss": 0.0613, "num_input_tokens_seen": 33458720, "step": 38645 }, { "epoch": 18.22253653936822, "grad_norm": 0.01762106828391552, "learning_rate": 0.0008436148538306099, "loss": 0.1159, "num_input_tokens_seen": 33462880, "step": 38650 }, { "epoch": 18.224893917963225, "grad_norm": 0.005270893685519695, "learning_rate": 0.0008373878411868041, "loss": 0.0545, "num_input_tokens_seen": 33466944, "step": 38655 }, { "epoch": 18.227251296558226, "grad_norm": 0.012603041715919971, "learning_rate": 0.000831183831292287, "loss": 0.1236, "num_input_tokens_seen": 33470752, "step": 38660 }, { "epoch": 18.22960867515323, "grad_norm": 0.024409467354416847, "learning_rate": 0.0008250028251037933, "loss": 0.1279, "num_input_tokens_seen": 33475504, "step": 38665 }, { "epoch": 18.23196605374823, "grad_norm": 0.0029145460575819016, "learning_rate": 0.0008188448235745271, "loss": 0.0601, "num_input_tokens_seen": 33480128, "step": 38670 }, { "epoch": 18.234323432343235, "grad_norm": 0.007935485802590847, "learning_rate": 0.0008127098276541122, "loss": 0.0464, "num_input_tokens_seen": 33484144, "step": 38675 }, { "epoch": 18.236680810938235, "grad_norm": 0.015196803025901318, "learning_rate": 0.0008065978382886418, "loss": 0.117, "num_input_tokens_seen": 33488608, "step": 38680 }, { "epoch": 18.23903818953324, "grad_norm": 0.018609551712870598, "learning_rate": 0.0008005088564206785, "loss": 0.0589, "num_input_tokens_seen": 33492864, "step": 38685 }, { "epoch": 18.24139556812824, "grad_norm": 0.002322407905012369, "learning_rate": 0.0007944428829891881, "loss": 0.048, "num_input_tokens_seen": 33496512, "step": 38690 }, { "epoch": 18.243752946723244, "grad_norm": 0.0090817641466856, "learning_rate": 0.0007883999189296386, "loss": 0.08, "num_input_tokens_seen": 33502368, "step": 38695 }, { "epoch": 18.246110325318245, "grad_norm": 0.009893090464174747, "learning_rate": 0.0007823799651739515, "loss": 0.124, "num_input_tokens_seen": 33506272, "step": 38700 }, { "epoch": 18.24846770391325, "grad_norm": 0.004182750359177589, "learning_rate": 0.0007763830226504509, "loss": 0.0946, "num_input_tokens_seen": 33511584, "step": 38705 }, { "epoch": 18.25082508250825, "grad_norm": 0.004378094337880611, "learning_rate": 0.0007704090922839468, "loss": 0.0733, "num_input_tokens_seen": 33516128, "step": 38710 }, { "epoch": 18.253182461103254, "grad_norm": 0.012883923947811127, "learning_rate": 0.0007644581749957025, "loss": 0.1308, "num_input_tokens_seen": 33520928, "step": 38715 }, { "epoch": 18.255539839698255, "grad_norm": 0.012955124489963055, "learning_rate": 0.000758530271703417, "loss": 0.0949, "num_input_tokens_seen": 33527792, "step": 38720 }, { "epoch": 18.25789721829326, "grad_norm": 0.010785473510622978, "learning_rate": 0.0007526253833212426, "loss": 0.1339, "num_input_tokens_seen": 33531728, "step": 38725 }, { "epoch": 18.26025459688826, "grad_norm": 0.0064979540184140205, "learning_rate": 0.0007467435107598008, "loss": 0.0825, "num_input_tokens_seen": 33535664, "step": 38730 }, { "epoch": 18.262611975483264, "grad_norm": 0.02031964249908924, "learning_rate": 0.0007408846549261328, "loss": 0.0981, "num_input_tokens_seen": 33539664, "step": 38735 }, { "epoch": 18.264969354078264, "grad_norm": 0.012543831020593643, "learning_rate": 0.0007350488167237656, "loss": 0.0655, "num_input_tokens_seen": 33543664, "step": 38740 }, { "epoch": 18.26732673267327, "grad_norm": 0.012430505827069283, "learning_rate": 0.0007292359970526629, "loss": 0.1133, "num_input_tokens_seen": 33547456, "step": 38745 }, { "epoch": 18.26968411126827, "grad_norm": 0.017103560268878937, "learning_rate": 0.0007234461968092076, "loss": 0.0955, "num_input_tokens_seen": 33551024, "step": 38750 }, { "epoch": 18.272041489863273, "grad_norm": 0.011629187501966953, "learning_rate": 0.0007176794168862854, "loss": 0.0532, "num_input_tokens_seen": 33555792, "step": 38755 }, { "epoch": 18.274398868458274, "grad_norm": 0.007442745845764875, "learning_rate": 0.000711935658173185, "loss": 0.0611, "num_input_tokens_seen": 33559872, "step": 38760 }, { "epoch": 18.276756247053278, "grad_norm": 0.013606538064777851, "learning_rate": 0.0007062149215556812, "loss": 0.097, "num_input_tokens_seen": 33564096, "step": 38765 }, { "epoch": 18.27911362564828, "grad_norm": 0.008590647019445896, "learning_rate": 0.0007005172079159849, "loss": 0.1599, "num_input_tokens_seen": 33567840, "step": 38770 }, { "epoch": 18.281471004243283, "grad_norm": 0.02148733288049698, "learning_rate": 0.0006948425181327267, "loss": 0.1307, "num_input_tokens_seen": 33573280, "step": 38775 }, { "epoch": 18.283828382838283, "grad_norm": 0.005320890340954065, "learning_rate": 0.000689190853081073, "loss": 0.0971, "num_input_tokens_seen": 33577424, "step": 38780 }, { "epoch": 18.286185761433288, "grad_norm": 0.023171983659267426, "learning_rate": 0.000683562213632527, "loss": 0.1271, "num_input_tokens_seen": 33581184, "step": 38785 }, { "epoch": 18.28854314002829, "grad_norm": 0.002641556551679969, "learning_rate": 0.0006779566006551108, "loss": 0.0439, "num_input_tokens_seen": 33585248, "step": 38790 }, { "epoch": 18.290900518623292, "grad_norm": 0.006319163367152214, "learning_rate": 0.0006723740150132995, "loss": 0.097, "num_input_tokens_seen": 33589664, "step": 38795 }, { "epoch": 18.293257897218293, "grad_norm": 0.005997419822961092, "learning_rate": 0.0006668144575679713, "loss": 0.1241, "num_input_tokens_seen": 33594000, "step": 38800 }, { "epoch": 18.293257897218293, "eval_loss": 0.2859748601913452, "eval_runtime": 21.9404, "eval_samples_per_second": 42.98, "eval_steps_per_second": 21.513, "num_input_tokens_seen": 33594000, "step": 38800 }, { "epoch": 18.295615275813297, "grad_norm": 0.002386396750807762, "learning_rate": 0.0006612779291765069, "loss": 0.1636, "num_input_tokens_seen": 33598000, "step": 38805 }, { "epoch": 18.297972654408298, "grad_norm": 0.014595330692827702, "learning_rate": 0.0006557644306926736, "loss": 0.0827, "num_input_tokens_seen": 33602512, "step": 38810 }, { "epoch": 18.300330033003302, "grad_norm": 0.013365499675273895, "learning_rate": 0.0006502739629667575, "loss": 0.0478, "num_input_tokens_seen": 33607040, "step": 38815 }, { "epoch": 18.302687411598303, "grad_norm": 0.017048876732587814, "learning_rate": 0.0006448065268454317, "loss": 0.1279, "num_input_tokens_seen": 33611728, "step": 38820 }, { "epoch": 18.305044790193303, "grad_norm": 0.008202873170375824, "learning_rate": 0.0006393621231718549, "loss": 0.0375, "num_input_tokens_seen": 33615488, "step": 38825 }, { "epoch": 18.307402168788308, "grad_norm": 0.004397777374833822, "learning_rate": 0.0006339407527856389, "loss": 0.0996, "num_input_tokens_seen": 33619968, "step": 38830 }, { "epoch": 18.309759547383308, "grad_norm": 0.017157312482595444, "learning_rate": 0.0006285424165227982, "loss": 0.0706, "num_input_tokens_seen": 33623424, "step": 38835 }, { "epoch": 18.312116925978312, "grad_norm": 0.008174706250429153, "learning_rate": 0.0006231671152158169, "loss": 0.0586, "num_input_tokens_seen": 33627392, "step": 38840 }, { "epoch": 18.314474304573313, "grad_norm": 0.010457228869199753, "learning_rate": 0.0006178148496936819, "loss": 0.0758, "num_input_tokens_seen": 33631392, "step": 38845 }, { "epoch": 18.316831683168317, "grad_norm": 0.01212691143155098, "learning_rate": 0.000612485620781733, "loss": 0.0619, "num_input_tokens_seen": 33636064, "step": 38850 }, { "epoch": 18.319189061763318, "grad_norm": 0.008400465361773968, "learning_rate": 0.0006071794293018296, "loss": 0.0908, "num_input_tokens_seen": 33639936, "step": 38855 }, { "epoch": 18.321546440358322, "grad_norm": 0.017973627895116806, "learning_rate": 0.0006018962760722501, "loss": 0.0725, "num_input_tokens_seen": 33644160, "step": 38860 }, { "epoch": 18.323903818953323, "grad_norm": 0.01630215346813202, "learning_rate": 0.0005966361619077098, "loss": 0.0601, "num_input_tokens_seen": 33647936, "step": 38865 }, { "epoch": 18.326261197548327, "grad_norm": 0.010439412668347359, "learning_rate": 0.000591399087619393, "loss": 0.0801, "num_input_tokens_seen": 33652208, "step": 38870 }, { "epoch": 18.328618576143327, "grad_norm": 0.008847573772072792, "learning_rate": 0.0005861850540149371, "loss": 0.0699, "num_input_tokens_seen": 33655792, "step": 38875 }, { "epoch": 18.33097595473833, "grad_norm": 0.023052891716361046, "learning_rate": 0.0005809940618983822, "loss": 0.0634, "num_input_tokens_seen": 33660176, "step": 38880 }, { "epoch": 18.333333333333332, "grad_norm": 0.02688494697213173, "learning_rate": 0.0005758261120702712, "loss": 0.0729, "num_input_tokens_seen": 33664768, "step": 38885 }, { "epoch": 18.335690711928336, "grad_norm": 0.02152164839208126, "learning_rate": 0.0005706812053275501, "loss": 0.1185, "num_input_tokens_seen": 33668832, "step": 38890 }, { "epoch": 18.338048090523337, "grad_norm": 0.004761905875056982, "learning_rate": 0.0005655593424636173, "loss": 0.0747, "num_input_tokens_seen": 33673408, "step": 38895 }, { "epoch": 18.34040546911834, "grad_norm": 0.014462285675108433, "learning_rate": 0.0005604605242683746, "loss": 0.0388, "num_input_tokens_seen": 33678672, "step": 38900 }, { "epoch": 18.34276284771334, "grad_norm": 0.015734845772385597, "learning_rate": 0.0005553847515280596, "loss": 0.0972, "num_input_tokens_seen": 33683120, "step": 38905 }, { "epoch": 18.345120226308346, "grad_norm": 0.007995336316525936, "learning_rate": 0.0005503320250254795, "loss": 0.1078, "num_input_tokens_seen": 33687504, "step": 38910 }, { "epoch": 18.347477604903347, "grad_norm": 0.0031270708423107862, "learning_rate": 0.0005453023455397943, "loss": 0.0559, "num_input_tokens_seen": 33691952, "step": 38915 }, { "epoch": 18.34983498349835, "grad_norm": 0.005457921419292688, "learning_rate": 0.0005402957138466502, "loss": 0.1132, "num_input_tokens_seen": 33696016, "step": 38920 }, { "epoch": 18.35219236209335, "grad_norm": 0.010574528016149998, "learning_rate": 0.0005353121307181463, "loss": 0.0572, "num_input_tokens_seen": 33700144, "step": 38925 }, { "epoch": 18.354549740688356, "grad_norm": 0.01068148948252201, "learning_rate": 0.0005303515969227845, "loss": 0.0801, "num_input_tokens_seen": 33704144, "step": 38930 }, { "epoch": 18.356907119283356, "grad_norm": 0.006465582642704248, "learning_rate": 0.0005254141132255862, "loss": 0.1394, "num_input_tokens_seen": 33710192, "step": 38935 }, { "epoch": 18.35926449787836, "grad_norm": 0.013842878863215446, "learning_rate": 0.0005204996803879258, "loss": 0.0779, "num_input_tokens_seen": 33714960, "step": 38940 }, { "epoch": 18.36162187647336, "grad_norm": 0.01643979363143444, "learning_rate": 0.0005156082991676969, "loss": 0.0877, "num_input_tokens_seen": 33719136, "step": 38945 }, { "epoch": 18.363979255068365, "grad_norm": 0.014679396525025368, "learning_rate": 0.0005107399703192127, "loss": 0.0943, "num_input_tokens_seen": 33722768, "step": 38950 }, { "epoch": 18.366336633663366, "grad_norm": 0.009714308194816113, "learning_rate": 0.0005058946945932063, "loss": 0.0871, "num_input_tokens_seen": 33727312, "step": 38955 }, { "epoch": 18.36869401225837, "grad_norm": 0.011285386979579926, "learning_rate": 0.0005010724727369131, "loss": 0.0735, "num_input_tokens_seen": 33731296, "step": 38960 }, { "epoch": 18.37105139085337, "grad_norm": 0.010397360660135746, "learning_rate": 0.000496273305493955, "loss": 0.0726, "num_input_tokens_seen": 33735200, "step": 38965 }, { "epoch": 18.373408769448375, "grad_norm": 0.005112814251333475, "learning_rate": 0.0004914971936044399, "loss": 0.1086, "num_input_tokens_seen": 33740064, "step": 38970 }, { "epoch": 18.375766148043375, "grad_norm": 0.004232257138937712, "learning_rate": 0.00048674413780491196, "loss": 0.0759, "num_input_tokens_seen": 33744464, "step": 38975 }, { "epoch": 18.37812352663838, "grad_norm": 0.02125425450503826, "learning_rate": 0.0004820141388283183, "loss": 0.1586, "num_input_tokens_seen": 33749792, "step": 38980 }, { "epoch": 18.38048090523338, "grad_norm": 0.009054761379957199, "learning_rate": 0.00047730719740410874, "loss": 0.0652, "num_input_tokens_seen": 33754096, "step": 38985 }, { "epoch": 18.382838283828384, "grad_norm": 0.01772669330239296, "learning_rate": 0.00047262331425816927, "loss": 0.0602, "num_input_tokens_seen": 33758160, "step": 38990 }, { "epoch": 18.385195662423385, "grad_norm": 0.010442155413329601, "learning_rate": 0.00046796249011277213, "loss": 0.0911, "num_input_tokens_seen": 33761584, "step": 38995 }, { "epoch": 18.38755304101839, "grad_norm": 0.010579949244856834, "learning_rate": 0.00046332472568669236, "loss": 0.0538, "num_input_tokens_seen": 33765936, "step": 39000 }, { "epoch": 18.38755304101839, "eval_loss": 0.28802064061164856, "eval_runtime": 21.9082, "eval_samples_per_second": 43.043, "eval_steps_per_second": 21.544, "num_input_tokens_seen": 33765936, "step": 39000 }, { "epoch": 18.38991041961339, "grad_norm": 0.00810024980455637, "learning_rate": 0.0004587100216951578, "loss": 0.0772, "num_input_tokens_seen": 33770512, "step": 39005 }, { "epoch": 18.392267798208394, "grad_norm": 0.010942600667476654, "learning_rate": 0.00045411837884978265, "loss": 0.0615, "num_input_tokens_seen": 33773904, "step": 39010 }, { "epoch": 18.394625176803395, "grad_norm": 0.023365259170532227, "learning_rate": 0.00044954979785865045, "loss": 0.0819, "num_input_tokens_seen": 33777920, "step": 39015 }, { "epoch": 18.396982555398395, "grad_norm": 0.021406477317214012, "learning_rate": 0.00044500427942631426, "loss": 0.0769, "num_input_tokens_seen": 33781344, "step": 39020 }, { "epoch": 18.3993399339934, "grad_norm": 0.00809391401708126, "learning_rate": 0.0004404818242537467, "loss": 0.0583, "num_input_tokens_seen": 33785200, "step": 39025 }, { "epoch": 18.4016973125884, "grad_norm": 0.015666499733924866, "learning_rate": 0.00043598243303837324, "loss": 0.0897, "num_input_tokens_seen": 33790320, "step": 39030 }, { "epoch": 18.404054691183404, "grad_norm": 0.004839923698455095, "learning_rate": 0.00043150610647403885, "loss": 0.0712, "num_input_tokens_seen": 33794992, "step": 39035 }, { "epoch": 18.406412069778405, "grad_norm": 0.006537731271237135, "learning_rate": 0.00042705284525104134, "loss": 0.0926, "num_input_tokens_seen": 33799728, "step": 39040 }, { "epoch": 18.40876944837341, "grad_norm": 0.017217909917235374, "learning_rate": 0.0004226226500561647, "loss": 0.1121, "num_input_tokens_seen": 33803936, "step": 39045 }, { "epoch": 18.41112682696841, "grad_norm": 0.0037774937227368355, "learning_rate": 0.0004182155215725791, "loss": 0.0644, "num_input_tokens_seen": 33808144, "step": 39050 }, { "epoch": 18.413484205563414, "grad_norm": 0.0020505981519818306, "learning_rate": 0.00041383146047992424, "loss": 0.079, "num_input_tokens_seen": 33813280, "step": 39055 }, { "epoch": 18.415841584158414, "grad_norm": 0.003849760629236698, "learning_rate": 0.00040947046745427597, "loss": 0.0409, "num_input_tokens_seen": 33817440, "step": 39060 }, { "epoch": 18.41819896275342, "grad_norm": 0.006973329931497574, "learning_rate": 0.00040513254316814625, "loss": 0.0398, "num_input_tokens_seen": 33822800, "step": 39065 }, { "epoch": 18.42055634134842, "grad_norm": 0.0318143405020237, "learning_rate": 0.0004008176882905168, "loss": 0.1382, "num_input_tokens_seen": 33827376, "step": 39070 }, { "epoch": 18.422913719943423, "grad_norm": 0.0020538035314530134, "learning_rate": 0.00039652590348677184, "loss": 0.0646, "num_input_tokens_seen": 33831248, "step": 39075 }, { "epoch": 18.425271098538424, "grad_norm": 0.020781999453902245, "learning_rate": 0.00039225718941878206, "loss": 0.0765, "num_input_tokens_seen": 33835056, "step": 39080 }, { "epoch": 18.427628477133428, "grad_norm": 0.010085498914122581, "learning_rate": 0.00038801154674480417, "loss": 0.1413, "num_input_tokens_seen": 33839040, "step": 39085 }, { "epoch": 18.42998585572843, "grad_norm": 0.016100183129310608, "learning_rate": 0.00038378897611959784, "loss": 0.0943, "num_input_tokens_seen": 33843024, "step": 39090 }, { "epoch": 18.432343234323433, "grad_norm": 0.01310497522354126, "learning_rate": 0.00037958947819430875, "loss": 0.0866, "num_input_tokens_seen": 33847232, "step": 39095 }, { "epoch": 18.434700612918434, "grad_norm": 0.01717170886695385, "learning_rate": 0.0003754130536165856, "loss": 0.0897, "num_input_tokens_seen": 33851760, "step": 39100 }, { "epoch": 18.437057991513438, "grad_norm": 0.013882094994187355, "learning_rate": 0.0003712597030304632, "loss": 0.0592, "num_input_tokens_seen": 33855824, "step": 39105 }, { "epoch": 18.43941537010844, "grad_norm": 0.010565144941210747, "learning_rate": 0.00036712942707646247, "loss": 0.0508, "num_input_tokens_seen": 33860128, "step": 39110 }, { "epoch": 18.441772748703443, "grad_norm": 0.01672755368053913, "learning_rate": 0.00036302222639149063, "loss": 0.0826, "num_input_tokens_seen": 33864352, "step": 39115 }, { "epoch": 18.444130127298443, "grad_norm": 0.017068596556782722, "learning_rate": 0.000358938101608941, "loss": 0.0858, "num_input_tokens_seen": 33869280, "step": 39120 }, { "epoch": 18.446487505893447, "grad_norm": 0.025402406230568886, "learning_rate": 0.0003548770533586598, "loss": 0.0797, "num_input_tokens_seen": 33873472, "step": 39125 }, { "epoch": 18.448844884488448, "grad_norm": 0.017581013962626457, "learning_rate": 0.0003508390822668961, "loss": 0.0654, "num_input_tokens_seen": 33877344, "step": 39130 }, { "epoch": 18.451202263083452, "grad_norm": 0.016756225377321243, "learning_rate": 0.00034682418895633503, "loss": 0.067, "num_input_tokens_seen": 33881600, "step": 39135 }, { "epoch": 18.453559641678453, "grad_norm": 0.019894467666745186, "learning_rate": 0.0003428323740461647, "loss": 0.0974, "num_input_tokens_seen": 33885920, "step": 39140 }, { "epoch": 18.455917020273457, "grad_norm": 0.012173019349575043, "learning_rate": 0.00033886363815194276, "loss": 0.0806, "num_input_tokens_seen": 33890192, "step": 39145 }, { "epoch": 18.458274398868458, "grad_norm": 0.008509778417646885, "learning_rate": 0.0003349179818857129, "loss": 0.1277, "num_input_tokens_seen": 33895168, "step": 39150 }, { "epoch": 18.460631777463462, "grad_norm": 0.022791439667344093, "learning_rate": 0.0003309954058559383, "loss": 0.0956, "num_input_tokens_seen": 33899120, "step": 39155 }, { "epoch": 18.462989156058462, "grad_norm": 0.023058075457811356, "learning_rate": 0.0003270959106675186, "loss": 0.1027, "num_input_tokens_seen": 33902800, "step": 39160 }, { "epoch": 18.465346534653467, "grad_norm": 0.0047293771058321, "learning_rate": 0.0003232194969218227, "loss": 0.0686, "num_input_tokens_seen": 33906704, "step": 39165 }, { "epoch": 18.467703913248467, "grad_norm": 0.013443795032799244, "learning_rate": 0.00031936616521663905, "loss": 0.0996, "num_input_tokens_seen": 33911184, "step": 39170 }, { "epoch": 18.47006129184347, "grad_norm": 0.003442867659032345, "learning_rate": 0.00031553591614619236, "loss": 0.044, "num_input_tokens_seen": 33915376, "step": 39175 }, { "epoch": 18.472418670438472, "grad_norm": 0.008020518347620964, "learning_rate": 0.00031172875030117676, "loss": 0.0681, "num_input_tokens_seen": 33919600, "step": 39180 }, { "epoch": 18.474776049033476, "grad_norm": 0.012869726866483688, "learning_rate": 0.0003079446682686726, "loss": 0.0754, "num_input_tokens_seen": 33923520, "step": 39185 }, { "epoch": 18.477133427628477, "grad_norm": 0.00506545789539814, "learning_rate": 0.0003041836706322465, "loss": 0.0601, "num_input_tokens_seen": 33927808, "step": 39190 }, { "epoch": 18.47949080622348, "grad_norm": 0.006123701110482216, "learning_rate": 0.0003004457579719011, "loss": 0.0746, "num_input_tokens_seen": 33932240, "step": 39195 }, { "epoch": 18.48184818481848, "grad_norm": 0.003660484217107296, "learning_rate": 0.00029673093086405867, "loss": 0.0482, "num_input_tokens_seen": 33936896, "step": 39200 }, { "epoch": 18.48184818481848, "eval_loss": 0.2888562083244324, "eval_runtime": 21.9014, "eval_samples_per_second": 43.057, "eval_steps_per_second": 21.551, "num_input_tokens_seen": 33936896, "step": 39200 }, { "epoch": 18.484205563413486, "grad_norm": 0.01670616865158081, "learning_rate": 0.00029303918988159426, "loss": 0.0942, "num_input_tokens_seen": 33940832, "step": 39205 }, { "epoch": 18.486562942008486, "grad_norm": 0.02119067683815956, "learning_rate": 0.0002893705355938192, "loss": 0.0776, "num_input_tokens_seen": 33944624, "step": 39210 }, { "epoch": 18.48892032060349, "grad_norm": 0.010340698063373566, "learning_rate": 0.0002857249685664975, "loss": 0.09, "num_input_tokens_seen": 33948960, "step": 39215 }, { "epoch": 18.49127769919849, "grad_norm": 0.01002565398812294, "learning_rate": 0.0002821024893618129, "loss": 0.0819, "num_input_tokens_seen": 33954144, "step": 39220 }, { "epoch": 18.493635077793492, "grad_norm": 0.024105137214064598, "learning_rate": 0.0002785030985383852, "loss": 0.1095, "num_input_tokens_seen": 33958944, "step": 39225 }, { "epoch": 18.495992456388496, "grad_norm": 0.010402134619653225, "learning_rate": 0.00027492679665130356, "loss": 0.0413, "num_input_tokens_seen": 33963248, "step": 39230 }, { "epoch": 18.498349834983497, "grad_norm": 0.01282298844307661, "learning_rate": 0.000271373584252077, "loss": 0.0862, "num_input_tokens_seen": 33967392, "step": 39235 }, { "epoch": 18.5007072135785, "grad_norm": 0.02711041457951069, "learning_rate": 0.00026784346188865046, "loss": 0.1415, "num_input_tokens_seen": 33972096, "step": 39240 }, { "epoch": 18.5030645921735, "grad_norm": 0.0017261919565498829, "learning_rate": 0.0002643364301054218, "loss": 0.0836, "num_input_tokens_seen": 33977808, "step": 39245 }, { "epoch": 18.505421970768506, "grad_norm": 0.015207303687930107, "learning_rate": 0.0002608524894431918, "loss": 0.1021, "num_input_tokens_seen": 33982192, "step": 39250 }, { "epoch": 18.507779349363506, "grad_norm": 0.017558734863996506, "learning_rate": 0.000257391640439264, "loss": 0.0991, "num_input_tokens_seen": 33986544, "step": 39255 }, { "epoch": 18.51013672795851, "grad_norm": 0.016638997942209244, "learning_rate": 0.00025395388362732806, "loss": 0.1141, "num_input_tokens_seen": 33991456, "step": 39260 }, { "epoch": 18.51249410655351, "grad_norm": 0.025268755853176117, "learning_rate": 0.00025053921953751, "loss": 0.1638, "num_input_tokens_seen": 33995264, "step": 39265 }, { "epoch": 18.514851485148515, "grad_norm": 0.010673661716282368, "learning_rate": 0.00024714764869643855, "loss": 0.0827, "num_input_tokens_seen": 33999584, "step": 39270 }, { "epoch": 18.517208863743516, "grad_norm": 0.018715115264058113, "learning_rate": 0.0002437791716270954, "loss": 0.1168, "num_input_tokens_seen": 34002752, "step": 39275 }, { "epoch": 18.51956624233852, "grad_norm": 0.008257167413830757, "learning_rate": 0.00024043378884896493, "loss": 0.0574, "num_input_tokens_seen": 34007200, "step": 39280 }, { "epoch": 18.52192362093352, "grad_norm": 0.022653305903077126, "learning_rate": 0.00023711150087793453, "loss": 0.0848, "num_input_tokens_seen": 34011072, "step": 39285 }, { "epoch": 18.524280999528525, "grad_norm": 0.002676167991012335, "learning_rate": 0.000233812308226361, "loss": 0.0625, "num_input_tokens_seen": 34015216, "step": 39290 }, { "epoch": 18.526638378123526, "grad_norm": 0.011457450687885284, "learning_rate": 0.00023053621140300406, "loss": 0.177, "num_input_tokens_seen": 34020048, "step": 39295 }, { "epoch": 18.52899575671853, "grad_norm": 0.024237683042883873, "learning_rate": 0.00022728321091307623, "loss": 0.1379, "num_input_tokens_seen": 34024352, "step": 39300 }, { "epoch": 18.53135313531353, "grad_norm": 0.016124477609992027, "learning_rate": 0.0002240533072582429, "loss": 0.0943, "num_input_tokens_seen": 34027952, "step": 39305 }, { "epoch": 18.533710513908535, "grad_norm": 0.017075084149837494, "learning_rate": 0.00022084650093658897, "loss": 0.1281, "num_input_tokens_seen": 34031856, "step": 39310 }, { "epoch": 18.536067892503535, "grad_norm": 0.0207653921097517, "learning_rate": 0.0002176627924426522, "loss": 0.1554, "num_input_tokens_seen": 34036608, "step": 39315 }, { "epoch": 18.53842527109854, "grad_norm": 0.013970598578453064, "learning_rate": 0.0002145021822673898, "loss": 0.0782, "num_input_tokens_seen": 34040656, "step": 39320 }, { "epoch": 18.54078264969354, "grad_norm": 0.005550181493163109, "learning_rate": 0.00021136467089822862, "loss": 0.0913, "num_input_tokens_seen": 34044832, "step": 39325 }, { "epoch": 18.543140028288544, "grad_norm": 0.0037104832008481026, "learning_rate": 0.00020825025881898162, "loss": 0.0655, "num_input_tokens_seen": 34049808, "step": 39330 }, { "epoch": 18.545497406883545, "grad_norm": 0.0077162547968328, "learning_rate": 0.0002051589465099479, "loss": 0.0722, "num_input_tokens_seen": 34054528, "step": 39335 }, { "epoch": 18.54785478547855, "grad_norm": 0.013427382335066795, "learning_rate": 0.0002020907344478462, "loss": 0.0727, "num_input_tokens_seen": 34058912, "step": 39340 }, { "epoch": 18.55021216407355, "grad_norm": 0.004311059135943651, "learning_rate": 0.0001990456231058313, "loss": 0.068, "num_input_tokens_seen": 34063504, "step": 39345 }, { "epoch": 18.552569542668554, "grad_norm": 0.01876750960946083, "learning_rate": 0.00019602361295349423, "loss": 0.0627, "num_input_tokens_seen": 34067856, "step": 39350 }, { "epoch": 18.554926921263554, "grad_norm": 0.02497601881623268, "learning_rate": 0.0001930247044568789, "loss": 0.1235, "num_input_tokens_seen": 34071872, "step": 39355 }, { "epoch": 18.55728429985856, "grad_norm": 0.009049373678863049, "learning_rate": 0.00019004889807843205, "loss": 0.0594, "num_input_tokens_seen": 34076736, "step": 39360 }, { "epoch": 18.55964167845356, "grad_norm": 0.01647106185555458, "learning_rate": 0.00018709619427708656, "loss": 0.1006, "num_input_tokens_seen": 34081408, "step": 39365 }, { "epoch": 18.561999057048563, "grad_norm": 0.008814402855932713, "learning_rate": 0.00018416659350817822, "loss": 0.0695, "num_input_tokens_seen": 34085552, "step": 39370 }, { "epoch": 18.564356435643564, "grad_norm": 0.017760606482625008, "learning_rate": 0.00018126009622346229, "loss": 0.079, "num_input_tokens_seen": 34090320, "step": 39375 }, { "epoch": 18.566713814238568, "grad_norm": 0.01971689984202385, "learning_rate": 0.00017837670287119687, "loss": 0.1752, "num_input_tokens_seen": 34094208, "step": 39380 }, { "epoch": 18.56907119283357, "grad_norm": 0.009442835114896297, "learning_rate": 0.00017551641389602633, "loss": 0.09, "num_input_tokens_seen": 34098960, "step": 39385 }, { "epoch": 18.571428571428573, "grad_norm": 0.010458395816385746, "learning_rate": 0.00017267922973903115, "loss": 0.056, "num_input_tokens_seen": 34102944, "step": 39390 }, { "epoch": 18.573785950023574, "grad_norm": 0.009192125871777534, "learning_rate": 0.00016986515083774467, "loss": 0.1425, "num_input_tokens_seen": 34106752, "step": 39395 }, { "epoch": 18.576143328618578, "grad_norm": 0.009287843480706215, "learning_rate": 0.00016707417762611975, "loss": 0.0872, "num_input_tokens_seen": 34110592, "step": 39400 }, { "epoch": 18.576143328618578, "eval_loss": 0.28899332880973816, "eval_runtime": 21.8862, "eval_samples_per_second": 43.086, "eval_steps_per_second": 21.566, "num_input_tokens_seen": 34110592, "step": 39400 }, { "epoch": 18.57850070721358, "grad_norm": 0.017889028415083885, "learning_rate": 0.00016430631053459543, "loss": 0.0842, "num_input_tokens_seen": 34114256, "step": 39405 }, { "epoch": 18.580858085808583, "grad_norm": 0.015459234826266766, "learning_rate": 0.0001615615499899803, "loss": 0.1131, "num_input_tokens_seen": 34118240, "step": 39410 }, { "epoch": 18.583215464403583, "grad_norm": 0.008754379115998745, "learning_rate": 0.00015883989641556905, "loss": 0.1322, "num_input_tokens_seen": 34122896, "step": 39415 }, { "epoch": 18.585572842998587, "grad_norm": 0.012524116784334183, "learning_rate": 0.00015614135023105934, "loss": 0.1273, "num_input_tokens_seen": 34127008, "step": 39420 }, { "epoch": 18.587930221593588, "grad_norm": 0.008522243238985538, "learning_rate": 0.00015346591185261827, "loss": 0.0663, "num_input_tokens_seen": 34131632, "step": 39425 }, { "epoch": 18.59028760018859, "grad_norm": 0.01596761867403984, "learning_rate": 0.00015081358169281576, "loss": 0.0841, "num_input_tokens_seen": 34136000, "step": 39430 }, { "epoch": 18.592644978783593, "grad_norm": 0.017875174060463905, "learning_rate": 0.00014818436016069135, "loss": 0.0996, "num_input_tokens_seen": 34140672, "step": 39435 }, { "epoch": 18.595002357378593, "grad_norm": 0.020815974101424217, "learning_rate": 0.00014557824766168735, "loss": 0.0791, "num_input_tokens_seen": 34144512, "step": 39440 }, { "epoch": 18.597359735973598, "grad_norm": 0.01341498177498579, "learning_rate": 0.00014299524459769896, "loss": 0.0811, "num_input_tokens_seen": 34148288, "step": 39445 }, { "epoch": 18.599717114568598, "grad_norm": 0.008876568637788296, "learning_rate": 0.0001404353513670742, "loss": 0.0695, "num_input_tokens_seen": 34152128, "step": 39450 }, { "epoch": 18.602074493163602, "grad_norm": 0.0228035319596529, "learning_rate": 0.0001378985683645806, "loss": 0.0976, "num_input_tokens_seen": 34156192, "step": 39455 }, { "epoch": 18.604431871758603, "grad_norm": 0.01861092634499073, "learning_rate": 0.0001353848959813886, "loss": 0.1109, "num_input_tokens_seen": 34160064, "step": 39460 }, { "epoch": 18.606789250353607, "grad_norm": 0.006291537545621395, "learning_rate": 0.00013289433460517142, "loss": 0.0594, "num_input_tokens_seen": 34164160, "step": 39465 }, { "epoch": 18.609146628948608, "grad_norm": 0.007573523558676243, "learning_rate": 0.00013042688462000518, "loss": 0.055, "num_input_tokens_seen": 34168944, "step": 39470 }, { "epoch": 18.611504007543612, "grad_norm": 0.014548714272677898, "learning_rate": 0.0001279825464063855, "loss": 0.0792, "num_input_tokens_seen": 34173856, "step": 39475 }, { "epoch": 18.613861386138613, "grad_norm": 0.009897487238049507, "learning_rate": 0.00012556132034126087, "loss": 0.0692, "num_input_tokens_seen": 34178624, "step": 39480 }, { "epoch": 18.616218764733617, "grad_norm": 0.005154917947947979, "learning_rate": 0.0001231632067980326, "loss": 0.0747, "num_input_tokens_seen": 34184048, "step": 39485 }, { "epoch": 18.618576143328617, "grad_norm": 0.015009189024567604, "learning_rate": 0.00012078820614650486, "loss": 0.1342, "num_input_tokens_seen": 34188256, "step": 39490 }, { "epoch": 18.62093352192362, "grad_norm": 0.006844956427812576, "learning_rate": 0.00011843631875291804, "loss": 0.0606, "num_input_tokens_seen": 34192544, "step": 39495 }, { "epoch": 18.623290900518622, "grad_norm": 0.012442706152796745, "learning_rate": 0.00011610754497999863, "loss": 0.0645, "num_input_tokens_seen": 34196896, "step": 39500 }, { "epoch": 18.625648279113626, "grad_norm": 0.019810406491160393, "learning_rate": 0.0001138018851868594, "loss": 0.0687, "num_input_tokens_seen": 34200512, "step": 39505 }, { "epoch": 18.628005657708627, "grad_norm": 0.005730709992349148, "learning_rate": 0.0001115193397290326, "loss": 0.1299, "num_input_tokens_seen": 34204672, "step": 39510 }, { "epoch": 18.63036303630363, "grad_norm": 0.01674485206604004, "learning_rate": 0.00010925990895856996, "loss": 0.1067, "num_input_tokens_seen": 34209472, "step": 39515 }, { "epoch": 18.632720414898632, "grad_norm": 0.0068301050923764706, "learning_rate": 0.00010702359322385946, "loss": 0.0784, "num_input_tokens_seen": 34214064, "step": 39520 }, { "epoch": 18.635077793493636, "grad_norm": 0.006076378747820854, "learning_rate": 0.00010481039286977523, "loss": 0.0817, "num_input_tokens_seen": 34218864, "step": 39525 }, { "epoch": 18.637435172088637, "grad_norm": 0.016974911093711853, "learning_rate": 0.00010262030823764423, "loss": 0.0899, "num_input_tokens_seen": 34222768, "step": 39530 }, { "epoch": 18.63979255068364, "grad_norm": 0.0067319804802536964, "learning_rate": 0.00010045333966517966, "loss": 0.0666, "num_input_tokens_seen": 34226896, "step": 39535 }, { "epoch": 18.64214992927864, "grad_norm": 0.02635984681546688, "learning_rate": 9.83094874865642e-05, "loss": 0.09, "num_input_tokens_seen": 34231472, "step": 39540 }, { "epoch": 18.644507307873646, "grad_norm": 0.01712076924741268, "learning_rate": 9.618875203241672e-05, "loss": 0.0551, "num_input_tokens_seen": 34235696, "step": 39545 }, { "epoch": 18.646864686468646, "grad_norm": 0.027750233188271523, "learning_rate": 9.409113362977561e-05, "loss": 0.1756, "num_input_tokens_seen": 34240256, "step": 39550 }, { "epoch": 18.64922206506365, "grad_norm": 0.00917709618806839, "learning_rate": 9.20166326020988e-05, "loss": 0.0678, "num_input_tokens_seen": 34243760, "step": 39555 }, { "epoch": 18.65157944365865, "grad_norm": 0.009253164753317833, "learning_rate": 8.996524926933035e-05, "loss": 0.1028, "num_input_tokens_seen": 34249200, "step": 39560 }, { "epoch": 18.653936822253655, "grad_norm": 0.008726068772375584, "learning_rate": 8.793698394781723e-05, "loss": 0.1132, "num_input_tokens_seen": 34252656, "step": 39565 }, { "epoch": 18.656294200848656, "grad_norm": 0.02091221511363983, "learning_rate": 8.593183695030926e-05, "loss": 0.102, "num_input_tokens_seen": 34256688, "step": 39570 }, { "epoch": 18.65865157944366, "grad_norm": 0.003981017041951418, "learning_rate": 8.39498085860757e-05, "loss": 0.0513, "num_input_tokens_seen": 34261440, "step": 39575 }, { "epoch": 18.66100895803866, "grad_norm": 0.015008358284831047, "learning_rate": 8.199089916072211e-05, "loss": 0.0707, "num_input_tokens_seen": 34265776, "step": 39580 }, { "epoch": 18.663366336633665, "grad_norm": 0.0066367038525640965, "learning_rate": 8.005510897637346e-05, "loss": 0.0622, "num_input_tokens_seen": 34270784, "step": 39585 }, { "epoch": 18.665723715228665, "grad_norm": 0.02477482333779335, "learning_rate": 7.8142438331541e-05, "loss": 0.085, "num_input_tokens_seen": 34274736, "step": 39590 }, { "epoch": 18.66808109382367, "grad_norm": 0.009845632128417492, "learning_rate": 7.625288752117209e-05, "loss": 0.1083, "num_input_tokens_seen": 34280352, "step": 39595 }, { "epoch": 18.67043847241867, "grad_norm": 0.018831059336662292, "learning_rate": 7.4386456836667e-05, "loss": 0.1452, "num_input_tokens_seen": 34284208, "step": 39600 }, { "epoch": 18.67043847241867, "eval_loss": 0.2887762784957886, "eval_runtime": 21.8799, "eval_samples_per_second": 43.099, "eval_steps_per_second": 21.572, "num_input_tokens_seen": 34284208, "step": 39600 }, { "epoch": 18.672795851013674, "grad_norm": 0.004413051530718803, "learning_rate": 7.254314656586214e-05, "loss": 0.0723, "num_input_tokens_seen": 34288928, "step": 39605 }, { "epoch": 18.675153229608675, "grad_norm": 0.022885063663125038, "learning_rate": 7.07229569929968e-05, "loss": 0.0919, "num_input_tokens_seen": 34293136, "step": 39610 }, { "epoch": 18.677510608203676, "grad_norm": 0.005017272662371397, "learning_rate": 6.892588839879643e-05, "loss": 0.0891, "num_input_tokens_seen": 34297984, "step": 39615 }, { "epoch": 18.67986798679868, "grad_norm": 0.01828698441386223, "learning_rate": 6.71519410603727e-05, "loss": 0.0809, "num_input_tokens_seen": 34301904, "step": 39620 }, { "epoch": 18.68222536539368, "grad_norm": 0.007361389696598053, "learning_rate": 6.540111525129011e-05, "loss": 0.0947, "num_input_tokens_seen": 34306368, "step": 39625 }, { "epoch": 18.684582743988685, "grad_norm": 0.01193610206246376, "learning_rate": 6.367341124154934e-05, "loss": 0.0519, "num_input_tokens_seen": 34310880, "step": 39630 }, { "epoch": 18.686940122583685, "grad_norm": 0.008383829146623611, "learning_rate": 6.19688292975873e-05, "loss": 0.0656, "num_input_tokens_seen": 34314592, "step": 39635 }, { "epoch": 18.68929750117869, "grad_norm": 0.016446270048618317, "learning_rate": 6.0287369682260336e-05, "loss": 0.0594, "num_input_tokens_seen": 34318864, "step": 39640 }, { "epoch": 18.69165487977369, "grad_norm": 0.0070274299941957, "learning_rate": 5.8629032654894384e-05, "loss": 0.1585, "num_input_tokens_seen": 34324000, "step": 39645 }, { "epoch": 18.694012258368694, "grad_norm": 0.01622469164431095, "learning_rate": 5.699381847120155e-05, "loss": 0.0575, "num_input_tokens_seen": 34327840, "step": 39650 }, { "epoch": 18.696369636963695, "grad_norm": 0.016749674454331398, "learning_rate": 5.5381727383380094e-05, "loss": 0.1757, "num_input_tokens_seen": 34331664, "step": 39655 }, { "epoch": 18.6987270155587, "grad_norm": 0.008495217189192772, "learning_rate": 5.379275964001451e-05, "loss": 0.0253, "num_input_tokens_seen": 34336016, "step": 39660 }, { "epoch": 18.7010843941537, "grad_norm": 0.025003831833600998, "learning_rate": 5.222691548614211e-05, "loss": 0.1114, "num_input_tokens_seen": 34341072, "step": 39665 }, { "epoch": 18.703441772748704, "grad_norm": 0.013879910111427307, "learning_rate": 5.068419516323641e-05, "loss": 0.0991, "num_input_tokens_seen": 34345824, "step": 39670 }, { "epoch": 18.705799151343705, "grad_norm": 0.031011074781417847, "learning_rate": 4.91645989092071e-05, "loss": 0.1449, "num_input_tokens_seen": 34350784, "step": 39675 }, { "epoch": 18.70815652993871, "grad_norm": 0.014465633779764175, "learning_rate": 4.7668126958400056e-05, "loss": 0.1564, "num_input_tokens_seen": 34354624, "step": 39680 }, { "epoch": 18.71051390853371, "grad_norm": 0.01345904916524887, "learning_rate": 4.619477954159734e-05, "loss": 0.0976, "num_input_tokens_seen": 34359088, "step": 39685 }, { "epoch": 18.712871287128714, "grad_norm": 0.013596734963357449, "learning_rate": 4.4744556885983884e-05, "loss": 0.1842, "num_input_tokens_seen": 34363008, "step": 39690 }, { "epoch": 18.715228665723714, "grad_norm": 0.021314403042197227, "learning_rate": 4.331745921523078e-05, "loss": 0.1558, "num_input_tokens_seen": 34367504, "step": 39695 }, { "epoch": 18.71758604431872, "grad_norm": 0.015984216704964638, "learning_rate": 4.191348674937867e-05, "loss": 0.0977, "num_input_tokens_seen": 34371760, "step": 39700 }, { "epoch": 18.71994342291372, "grad_norm": 0.005226494744420052, "learning_rate": 4.0532639704971006e-05, "loss": 0.0756, "num_input_tokens_seen": 34375440, "step": 39705 }, { "epoch": 18.722300801508723, "grad_norm": 0.009761794470250607, "learning_rate": 3.917491829493747e-05, "loss": 0.109, "num_input_tokens_seen": 34379440, "step": 39710 }, { "epoch": 18.724658180103724, "grad_norm": 0.02526105009019375, "learning_rate": 3.78403227286439e-05, "loss": 0.0891, "num_input_tokens_seen": 34383552, "step": 39715 }, { "epoch": 18.727015558698728, "grad_norm": 0.0034098774194717407, "learning_rate": 3.652885321192567e-05, "loss": 0.0728, "num_input_tokens_seen": 34387136, "step": 39720 }, { "epoch": 18.72937293729373, "grad_norm": 0.0038812693674117327, "learning_rate": 3.524050994702099e-05, "loss": 0.0716, "num_input_tokens_seen": 34391536, "step": 39725 }, { "epoch": 18.731730315888733, "grad_norm": 0.017734356224536896, "learning_rate": 3.3975293132604276e-05, "loss": 0.1053, "num_input_tokens_seen": 34395200, "step": 39730 }, { "epoch": 18.734087694483733, "grad_norm": 0.007793187629431486, "learning_rate": 3.2733202963786125e-05, "loss": 0.1325, "num_input_tokens_seen": 34399584, "step": 39735 }, { "epoch": 18.736445073078738, "grad_norm": 0.015179858542978764, "learning_rate": 3.15142396321133e-05, "loss": 0.1031, "num_input_tokens_seen": 34405296, "step": 39740 }, { "epoch": 18.738802451673738, "grad_norm": 0.0029584902804344893, "learning_rate": 3.0318403325552132e-05, "loss": 0.1028, "num_input_tokens_seen": 34409360, "step": 39745 }, { "epoch": 18.741159830268742, "grad_norm": 0.03023880161345005, "learning_rate": 2.914569422855506e-05, "loss": 0.1374, "num_input_tokens_seen": 34413136, "step": 39750 }, { "epoch": 18.743517208863743, "grad_norm": 0.011132280342280865, "learning_rate": 2.7996112521927462e-05, "loss": 0.0472, "num_input_tokens_seen": 34417488, "step": 39755 }, { "epoch": 18.745874587458747, "grad_norm": 0.010531708598136902, "learning_rate": 2.68696583829775e-05, "loss": 0.0466, "num_input_tokens_seen": 34421680, "step": 39760 }, { "epoch": 18.748231966053748, "grad_norm": 0.010450134985148907, "learning_rate": 2.576633198539957e-05, "loss": 0.0966, "num_input_tokens_seen": 34425296, "step": 39765 }, { "epoch": 18.750589344648752, "grad_norm": 0.013927256688475609, "learning_rate": 2.46861334993409e-05, "loss": 0.1092, "num_input_tokens_seen": 34430256, "step": 39770 }, { "epoch": 18.752946723243753, "grad_norm": 0.014284822158515453, "learning_rate": 2.3629063091384903e-05, "loss": 0.0797, "num_input_tokens_seen": 34435024, "step": 39775 }, { "epoch": 18.755304101838757, "grad_norm": 0.01127520203590393, "learning_rate": 2.2595120924567834e-05, "loss": 0.1079, "num_input_tokens_seen": 34440256, "step": 39780 }, { "epoch": 18.757661480433757, "grad_norm": 0.0074231550097465515, "learning_rate": 2.158430715829551e-05, "loss": 0.1007, "num_input_tokens_seen": 34444752, "step": 39785 }, { "epoch": 18.76001885902876, "grad_norm": 0.008589277043938637, "learning_rate": 2.059662194849321e-05, "loss": 0.0689, "num_input_tokens_seen": 34449952, "step": 39790 }, { "epoch": 18.762376237623762, "grad_norm": 0.030056290328502655, "learning_rate": 1.9632065447422463e-05, "loss": 0.1302, "num_input_tokens_seen": 34454112, "step": 39795 }, { "epoch": 18.764733616218766, "grad_norm": 0.008744222111999989, "learning_rate": 1.8690637803880916e-05, "loss": 0.1472, "num_input_tokens_seen": 34458576, "step": 39800 }, { "epoch": 18.764733616218766, "eval_loss": 0.28791555762290955, "eval_runtime": 21.9114, "eval_samples_per_second": 43.037, "eval_steps_per_second": 21.541, "num_input_tokens_seen": 34458576, "step": 39800 }, { "epoch": 18.767090994813767, "grad_norm": 0.0026055576745420694, "learning_rate": 1.7772339163019123e-05, "loss": 0.0218, "num_input_tokens_seen": 34462688, "step": 39805 }, { "epoch": 18.76944837340877, "grad_norm": 0.022913895547389984, "learning_rate": 1.6877169666457138e-05, "loss": 0.1083, "num_input_tokens_seen": 34466688, "step": 39810 }, { "epoch": 18.77180575200377, "grad_norm": 0.018014898523688316, "learning_rate": 1.6005129452234532e-05, "loss": 0.0876, "num_input_tokens_seen": 34471056, "step": 39815 }, { "epoch": 18.774163130598772, "grad_norm": 0.0030772502068430185, "learning_rate": 1.5156218654843733e-05, "loss": 0.0894, "num_input_tokens_seen": 34476880, "step": 39820 }, { "epoch": 18.776520509193777, "grad_norm": 0.017863677814602852, "learning_rate": 1.4330437405196683e-05, "loss": 0.091, "num_input_tokens_seen": 34480576, "step": 39825 }, { "epoch": 18.778877887788777, "grad_norm": 0.02051074430346489, "learning_rate": 1.352778583062486e-05, "loss": 0.09, "num_input_tokens_seen": 34484960, "step": 39830 }, { "epoch": 18.78123526638378, "grad_norm": 0.015396296046674252, "learning_rate": 1.2748264054929237e-05, "loss": 0.0557, "num_input_tokens_seen": 34489392, "step": 39835 }, { "epoch": 18.783592644978782, "grad_norm": 0.008194101974368095, "learning_rate": 1.1991872198297004e-05, "loss": 0.0634, "num_input_tokens_seen": 34494096, "step": 39840 }, { "epoch": 18.785950023573786, "grad_norm": 0.013768205419182777, "learning_rate": 1.1258610377384847e-05, "loss": 0.1238, "num_input_tokens_seen": 34499232, "step": 39845 }, { "epoch": 18.788307402168787, "grad_norm": 0.013058130629360676, "learning_rate": 1.0548478705268982e-05, "loss": 0.1082, "num_input_tokens_seen": 34503120, "step": 39850 }, { "epoch": 18.79066478076379, "grad_norm": 0.014570710249245167, "learning_rate": 9.86147729147846e-06, "loss": 0.1095, "num_input_tokens_seen": 34506656, "step": 39855 }, { "epoch": 18.79302215935879, "grad_norm": 0.014433410950005054, "learning_rate": 9.197606241928557e-06, "loss": 0.0733, "num_input_tokens_seen": 34509840, "step": 39860 }, { "epoch": 18.795379537953796, "grad_norm": 0.003787419991567731, "learning_rate": 8.556865659004042e-06, "loss": 0.0696, "num_input_tokens_seen": 34514320, "step": 39865 }, { "epoch": 18.797736916548796, "grad_norm": 0.012462849728763103, "learning_rate": 7.939255641525867e-06, "loss": 0.1178, "num_input_tokens_seen": 34518480, "step": 39870 }, { "epoch": 18.8000942951438, "grad_norm": 0.013562124222517014, "learning_rate": 7.344776284751164e-06, "loss": 0.0991, "num_input_tokens_seen": 34522592, "step": 39875 }, { "epoch": 18.8024516737388, "grad_norm": 0.01157116424292326, "learning_rate": 6.773427680323296e-06, "loss": 0.0594, "num_input_tokens_seen": 34527728, "step": 39880 }, { "epoch": 18.804809052333805, "grad_norm": 0.023624740540981293, "learning_rate": 6.225209916355112e-06, "loss": 0.1066, "num_input_tokens_seen": 34531584, "step": 39885 }, { "epoch": 18.807166430928806, "grad_norm": 0.0068284086883068085, "learning_rate": 5.7001230774123e-06, "loss": 0.0671, "num_input_tokens_seen": 34535776, "step": 39890 }, { "epoch": 18.80952380952381, "grad_norm": 0.008424722589552402, "learning_rate": 5.198167244446772e-06, "loss": 0.0672, "num_input_tokens_seen": 34540704, "step": 39895 }, { "epoch": 18.81188118811881, "grad_norm": 0.02570182830095291, "learning_rate": 4.71934249487993e-06, "loss": 0.1173, "num_input_tokens_seen": 34545504, "step": 39900 }, { "epoch": 18.814238566713815, "grad_norm": 0.0018481033621355891, "learning_rate": 4.2636489025527075e-06, "loss": 0.1157, "num_input_tokens_seen": 34549600, "step": 39905 }, { "epoch": 18.816595945308816, "grad_norm": 0.019510336220264435, "learning_rate": 3.831086537742223e-06, "loss": 0.1069, "num_input_tokens_seen": 34553872, "step": 39910 }, { "epoch": 18.81895332390382, "grad_norm": 0.009640456177294254, "learning_rate": 3.4216554671451236e-06, "loss": 0.1426, "num_input_tokens_seen": 34558400, "step": 39915 }, { "epoch": 18.82131070249882, "grad_norm": 0.021989544853568077, "learning_rate": 3.035355753894242e-06, "loss": 0.0716, "num_input_tokens_seen": 34562400, "step": 39920 }, { "epoch": 18.823668081093825, "grad_norm": 0.008589406497776508, "learning_rate": 2.6721874575752477e-06, "loss": 0.0641, "num_input_tokens_seen": 34567024, "step": 39925 }, { "epoch": 18.826025459688825, "grad_norm": 0.010782629251480103, "learning_rate": 2.3321506341933418e-06, "loss": 0.0532, "num_input_tokens_seen": 34571696, "step": 39930 }, { "epoch": 18.82838283828383, "grad_norm": 0.02195165306329727, "learning_rate": 2.0152453361732546e-06, "loss": 0.1307, "num_input_tokens_seen": 34575904, "step": 39935 }, { "epoch": 18.83074021687883, "grad_norm": 0.02232843078672886, "learning_rate": 1.7214716123925554e-06, "loss": 0.1079, "num_input_tokens_seen": 34579440, "step": 39940 }, { "epoch": 18.833097595473834, "grad_norm": 0.019134260714054108, "learning_rate": 1.4508295081649968e-06, "loss": 0.0735, "num_input_tokens_seen": 34583632, "step": 39945 }, { "epoch": 18.835454974068835, "grad_norm": 0.024946492165327072, "learning_rate": 1.2033190652238623e-06, "loss": 0.1048, "num_input_tokens_seen": 34588304, "step": 39950 }, { "epoch": 18.83781235266384, "grad_norm": 0.01863667368888855, "learning_rate": 9.78940321721966e-07, "loss": 0.084, "num_input_tokens_seen": 34592464, "step": 39955 }, { "epoch": 18.84016973125884, "grad_norm": 0.005115286447107792, "learning_rate": 7.776933122816132e-07, "loss": 0.1006, "num_input_tokens_seen": 34597024, "step": 39960 }, { "epoch": 18.842527109853844, "grad_norm": 0.004415691830217838, "learning_rate": 5.99578067927986e-07, "loss": 0.0979, "num_input_tokens_seen": 34601488, "step": 39965 }, { "epoch": 18.844884488448844, "grad_norm": 0.007164729293435812, "learning_rate": 4.445946161224512e-07, "loss": 0.035, "num_input_tokens_seen": 34606048, "step": 39970 }, { "epoch": 18.84724186704385, "grad_norm": 0.010828171856701374, "learning_rate": 3.127429807792126e-07, "loss": 0.1639, "num_input_tokens_seen": 34611376, "step": 39975 }, { "epoch": 18.84959924563885, "grad_norm": 0.006289685145020485, "learning_rate": 2.040231822320049e-07, "loss": 0.0383, "num_input_tokens_seen": 34616288, "step": 39980 }, { "epoch": 18.851956624233853, "grad_norm": 0.010136693716049194, "learning_rate": 1.1843523723409354e-07, "loss": 0.0553, "num_input_tokens_seen": 34620368, "step": 39985 }, { "epoch": 18.854314002828854, "grad_norm": 0.008962450549006462, "learning_rate": 5.597915897492811e-08, "loss": 0.0717, "num_input_tokens_seen": 34624976, "step": 39990 }, { "epoch": 18.85667138142386, "grad_norm": 0.020498232915997505, "learning_rate": 1.6654957113448885e-08, "loss": 0.0731, "num_input_tokens_seen": 34629024, "step": 39995 }, { "epoch": 18.85902876001886, "grad_norm": 0.016933605074882507, "learning_rate": 4.626377114735902e-10, "loss": 0.0763, "num_input_tokens_seen": 34633072, "step": 40000 }, { "epoch": 18.85902876001886, "eval_loss": 0.288237065076828, "eval_runtime": 21.9439, "eval_samples_per_second": 42.973, "eval_steps_per_second": 21.509, "num_input_tokens_seen": 34633072, "step": 40000 }, { "epoch": 18.85902876001886, "num_input_tokens_seen": 34633072, "step": 40000, "total_flos": 1.4502170779817165e+17, "train_loss": 0.20583191509507595, "train_runtime": 12813.2622, "train_samples_per_second": 12.487, "train_steps_per_second": 3.122 } ], "logging_steps": 5, "max_steps": 40000, "num_input_tokens_seen": 34633072, "num_train_epochs": 19, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4502170779817165e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }