{ "best_global_step": 35600, "best_metric": 0.08715619146823883, "best_model_checkpoint": "saves/lntuning/gemma-3-1b-it/train_qqp_1744902595/checkpoint-35600", "epoch": 1.9544377397210075, "eval_steps": 200, "global_step": 40000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00024430166369432974, "grad_norm": 12.042793273925781, "learning_rate": 4.999999876629946e-05, "loss": 3.4202, "num_input_tokens_seen": 6432, "step": 5 }, { "epoch": 0.0004886033273886595, "grad_norm": 12.35019302368164, "learning_rate": 4.999999375439123e-05, "loss": 3.3232, "num_input_tokens_seen": 13184, "step": 10 }, { "epoch": 0.0007329049910829893, "grad_norm": 10.97464656829834, "learning_rate": 4.9999984887169785e-05, "loss": 2.9841, "num_input_tokens_seen": 19808, "step": 15 }, { "epoch": 0.000977206654777319, "grad_norm": 10.63893985748291, "learning_rate": 4.9999972164636506e-05, "loss": 2.7556, "num_input_tokens_seen": 26432, "step": 20 }, { "epoch": 0.0012215083184716488, "grad_norm": 8.924328804016113, "learning_rate": 4.999995558679334e-05, "loss": 2.5035, "num_input_tokens_seen": 32896, "step": 25 }, { "epoch": 0.0014658099821659785, "grad_norm": 9.062905311584473, "learning_rate": 4.999993515364287e-05, "loss": 2.1348, "num_input_tokens_seen": 39072, "step": 30 }, { "epoch": 0.0017101116458603082, "grad_norm": 9.490468978881836, "learning_rate": 4.999991086518822e-05, "loss": 1.8569, "num_input_tokens_seen": 45312, "step": 35 }, { "epoch": 0.001954413309554638, "grad_norm": 8.826730728149414, "learning_rate": 4.999988272143315e-05, "loss": 1.4306, "num_input_tokens_seen": 51680, "step": 40 }, { "epoch": 0.002198714973248968, "grad_norm": 6.776498794555664, "learning_rate": 4.999985072238199e-05, "loss": 1.4439, "num_input_tokens_seen": 57952, "step": 45 }, { "epoch": 0.0024430166369432977, "grad_norm": 4.70786190032959, "learning_rate": 4.999981486803969e-05, "loss": 1.0112, "num_input_tokens_seen": 64256, "step": 50 }, { "epoch": 0.0026873183006376274, "grad_norm": 4.770380973815918, "learning_rate": 4.999977515841176e-05, "loss": 0.8746, "num_input_tokens_seen": 71264, "step": 55 }, { "epoch": 0.002931619964331957, "grad_norm": 5.296840667724609, "learning_rate": 4.9999731593504344e-05, "loss": 0.8706, "num_input_tokens_seen": 77952, "step": 60 }, { "epoch": 0.0031759216280262867, "grad_norm": 2.7573764324188232, "learning_rate": 4.999968417332415e-05, "loss": 0.7904, "num_input_tokens_seen": 84416, "step": 65 }, { "epoch": 0.0034202232917206164, "grad_norm": 4.256185531616211, "learning_rate": 4.999963289787848e-05, "loss": 0.8231, "num_input_tokens_seen": 90880, "step": 70 }, { "epoch": 0.0036645249554149465, "grad_norm": 2.491360902786255, "learning_rate": 4.999957776717526e-05, "loss": 0.7673, "num_input_tokens_seen": 97568, "step": 75 }, { "epoch": 0.003908826619109276, "grad_norm": 2.6319215297698975, "learning_rate": 4.9999518781222984e-05, "loss": 0.5618, "num_input_tokens_seen": 103968, "step": 80 }, { "epoch": 0.004153128282803606, "grad_norm": 3.272073984146118, "learning_rate": 4.9999455940030746e-05, "loss": 0.4763, "num_input_tokens_seen": 110400, "step": 85 }, { "epoch": 0.004397429946497936, "grad_norm": 3.0982444286346436, "learning_rate": 4.999938924360824e-05, "loss": 0.4971, "num_input_tokens_seen": 116768, "step": 90 }, { "epoch": 0.004641731610192265, "grad_norm": 1.904009461402893, "learning_rate": 4.999931869196575e-05, "loss": 0.5055, "num_input_tokens_seen": 123232, "step": 95 }, { "epoch": 0.004886033273886595, "grad_norm": 2.3682374954223633, "learning_rate": 4.999924428511416e-05, "loss": 0.5061, "num_input_tokens_seen": 129536, "step": 100 }, { "epoch": 0.005130334937580925, "grad_norm": 1.9445445537567139, "learning_rate": 4.999916602306494e-05, "loss": 0.4286, "num_input_tokens_seen": 136448, "step": 105 }, { "epoch": 0.005374636601275255, "grad_norm": 2.5411665439605713, "learning_rate": 4.999908390583016e-05, "loss": 0.3977, "num_input_tokens_seen": 143168, "step": 110 }, { "epoch": 0.005618938264969585, "grad_norm": 1.329473853111267, "learning_rate": 4.999899793342247e-05, "loss": 0.29, "num_input_tokens_seen": 149696, "step": 115 }, { "epoch": 0.005863239928663914, "grad_norm": 2.3808672428131104, "learning_rate": 4.999890810585516e-05, "loss": 0.3197, "num_input_tokens_seen": 156832, "step": 120 }, { "epoch": 0.006107541592358244, "grad_norm": 1.961336612701416, "learning_rate": 4.999881442314206e-05, "loss": 0.2885, "num_input_tokens_seen": 163456, "step": 125 }, { "epoch": 0.0063518432560525735, "grad_norm": 1.0185242891311646, "learning_rate": 4.9998716885297617e-05, "loss": 0.2598, "num_input_tokens_seen": 169920, "step": 130 }, { "epoch": 0.006596144919746904, "grad_norm": 3.092215061187744, "learning_rate": 4.999861549233688e-05, "loss": 0.3697, "num_input_tokens_seen": 176480, "step": 135 }, { "epoch": 0.006840446583441233, "grad_norm": 0.9019033908843994, "learning_rate": 4.999851024427548e-05, "loss": 0.3265, "num_input_tokens_seen": 182464, "step": 140 }, { "epoch": 0.007084748247135563, "grad_norm": 0.7969025373458862, "learning_rate": 4.999840114112965e-05, "loss": 0.298, "num_input_tokens_seen": 188800, "step": 145 }, { "epoch": 0.007329049910829893, "grad_norm": 1.1580439805984497, "learning_rate": 4.999828818291621e-05, "loss": 0.2641, "num_input_tokens_seen": 195424, "step": 150 }, { "epoch": 0.007573351574524222, "grad_norm": 1.0794167518615723, "learning_rate": 4.999817136965259e-05, "loss": 0.2537, "num_input_tokens_seen": 201952, "step": 155 }, { "epoch": 0.007817653238218552, "grad_norm": 1.5792232751846313, "learning_rate": 4.9998050701356794e-05, "loss": 0.2597, "num_input_tokens_seen": 209408, "step": 160 }, { "epoch": 0.008061954901912883, "grad_norm": 1.5190260410308838, "learning_rate": 4.999792617804744e-05, "loss": 0.2964, "num_input_tokens_seen": 215808, "step": 165 }, { "epoch": 0.008306256565607212, "grad_norm": 1.2116179466247559, "learning_rate": 4.9997797799743724e-05, "loss": 0.2757, "num_input_tokens_seen": 222496, "step": 170 }, { "epoch": 0.008550558229301541, "grad_norm": 0.8956277966499329, "learning_rate": 4.999766556646545e-05, "loss": 0.2482, "num_input_tokens_seen": 229152, "step": 175 }, { "epoch": 0.008794859892995872, "grad_norm": 0.7525573968887329, "learning_rate": 4.9997529478232996e-05, "loss": 0.2772, "num_input_tokens_seen": 235712, "step": 180 }, { "epoch": 0.009039161556690201, "grad_norm": 1.534791111946106, "learning_rate": 4.9997389535067365e-05, "loss": 0.2298, "num_input_tokens_seen": 242048, "step": 185 }, { "epoch": 0.00928346322038453, "grad_norm": 2.7378363609313965, "learning_rate": 4.999724573699012e-05, "loss": 0.283, "num_input_tokens_seen": 248352, "step": 190 }, { "epoch": 0.00952776488407886, "grad_norm": 1.6957545280456543, "learning_rate": 4.9997098084023457e-05, "loss": 0.2072, "num_input_tokens_seen": 254688, "step": 195 }, { "epoch": 0.00977206654777319, "grad_norm": 1.0662879943847656, "learning_rate": 4.999694657619013e-05, "loss": 0.2209, "num_input_tokens_seen": 260832, "step": 200 }, { "epoch": 0.00977206654777319, "eval_loss": 0.2407904416322708, "eval_runtime": 374.0515, "eval_samples_per_second": 97.273, "eval_steps_per_second": 24.32, "num_input_tokens_seen": 260832, "step": 200 }, { "epoch": 0.01001636821146752, "grad_norm": 0.9394246339797974, "learning_rate": 4.999679121351352e-05, "loss": 0.2383, "num_input_tokens_seen": 267232, "step": 205 }, { "epoch": 0.01026066987516185, "grad_norm": 0.4818631410598755, "learning_rate": 4.9996631996017565e-05, "loss": 0.2159, "num_input_tokens_seen": 273472, "step": 210 }, { "epoch": 0.01050497153885618, "grad_norm": 0.4660932421684265, "learning_rate": 4.9996468923726835e-05, "loss": 0.1822, "num_input_tokens_seen": 280096, "step": 215 }, { "epoch": 0.01074927320255051, "grad_norm": 1.8490161895751953, "learning_rate": 4.999630199666647e-05, "loss": 0.2626, "num_input_tokens_seen": 286912, "step": 220 }, { "epoch": 0.010993574866244839, "grad_norm": 1.153332233428955, "learning_rate": 4.999613121486222e-05, "loss": 0.2187, "num_input_tokens_seen": 293216, "step": 225 }, { "epoch": 0.01123787652993917, "grad_norm": 1.1386908292770386, "learning_rate": 4.999595657834041e-05, "loss": 0.1963, "num_input_tokens_seen": 299744, "step": 230 }, { "epoch": 0.011482178193633499, "grad_norm": 0.5686959624290466, "learning_rate": 4.999577808712798e-05, "loss": 0.2133, "num_input_tokens_seen": 306240, "step": 235 }, { "epoch": 0.011726479857327828, "grad_norm": 0.6818825602531433, "learning_rate": 4.999559574125244e-05, "loss": 0.1814, "num_input_tokens_seen": 312800, "step": 240 }, { "epoch": 0.011970781521022157, "grad_norm": 1.4523073434829712, "learning_rate": 4.9995409540741934e-05, "loss": 0.2502, "num_input_tokens_seen": 319200, "step": 245 }, { "epoch": 0.012215083184716488, "grad_norm": 0.9568024277687073, "learning_rate": 4.999521948562516e-05, "loss": 0.1741, "num_input_tokens_seen": 325856, "step": 250 }, { "epoch": 0.012459384848410818, "grad_norm": 1.1009211540222168, "learning_rate": 4.999502557593143e-05, "loss": 0.1569, "num_input_tokens_seen": 332064, "step": 255 }, { "epoch": 0.012703686512105147, "grad_norm": 1.66846764087677, "learning_rate": 4.999482781169066e-05, "loss": 0.2227, "num_input_tokens_seen": 338400, "step": 260 }, { "epoch": 0.012947988175799478, "grad_norm": 0.45701903104782104, "learning_rate": 4.9994626192933324e-05, "loss": 0.1732, "num_input_tokens_seen": 344864, "step": 265 }, { "epoch": 0.013192289839493807, "grad_norm": 0.8389227986335754, "learning_rate": 4.999442071969054e-05, "loss": 0.1966, "num_input_tokens_seen": 351232, "step": 270 }, { "epoch": 0.013436591503188136, "grad_norm": 0.8445138931274414, "learning_rate": 4.999421139199397e-05, "loss": 0.194, "num_input_tokens_seen": 358144, "step": 275 }, { "epoch": 0.013680893166882466, "grad_norm": 0.6054143905639648, "learning_rate": 4.999399820987592e-05, "loss": 0.2152, "num_input_tokens_seen": 364832, "step": 280 }, { "epoch": 0.013925194830576797, "grad_norm": 1.3213456869125366, "learning_rate": 4.999378117336924e-05, "loss": 0.1754, "num_input_tokens_seen": 371488, "step": 285 }, { "epoch": 0.014169496494271126, "grad_norm": 0.9229286313056946, "learning_rate": 4.9993560282507415e-05, "loss": 0.1788, "num_input_tokens_seen": 377984, "step": 290 }, { "epoch": 0.014413798157965455, "grad_norm": 0.6117981672286987, "learning_rate": 4.9993335537324495e-05, "loss": 0.175, "num_input_tokens_seen": 384256, "step": 295 }, { "epoch": 0.014658099821659786, "grad_norm": 0.5989497303962708, "learning_rate": 4.999310693785516e-05, "loss": 0.2493, "num_input_tokens_seen": 390400, "step": 300 }, { "epoch": 0.014902401485354115, "grad_norm": 0.626385509967804, "learning_rate": 4.9992874484134653e-05, "loss": 0.1869, "num_input_tokens_seen": 396576, "step": 305 }, { "epoch": 0.015146703149048445, "grad_norm": 1.2640793323516846, "learning_rate": 4.999263817619882e-05, "loss": 0.1869, "num_input_tokens_seen": 403200, "step": 310 }, { "epoch": 0.015391004812742774, "grad_norm": 0.4844975173473358, "learning_rate": 4.9992398014084105e-05, "loss": 0.1586, "num_input_tokens_seen": 409536, "step": 315 }, { "epoch": 0.015635306476437103, "grad_norm": 0.43938714265823364, "learning_rate": 4.999215399782754e-05, "loss": 0.1542, "num_input_tokens_seen": 416128, "step": 320 }, { "epoch": 0.015879608140131434, "grad_norm": 0.6372065544128418, "learning_rate": 4.999190612746675e-05, "loss": 0.2279, "num_input_tokens_seen": 422336, "step": 325 }, { "epoch": 0.016123909803825765, "grad_norm": 1.050950527191162, "learning_rate": 4.999165440303998e-05, "loss": 0.161, "num_input_tokens_seen": 429152, "step": 330 }, { "epoch": 0.016368211467520093, "grad_norm": 0.7614061832427979, "learning_rate": 4.999139882458603e-05, "loss": 0.1741, "num_input_tokens_seen": 435488, "step": 335 }, { "epoch": 0.016612513131214424, "grad_norm": 0.7215339541435242, "learning_rate": 4.9991139392144314e-05, "loss": 0.1385, "num_input_tokens_seen": 441952, "step": 340 }, { "epoch": 0.016856814794908755, "grad_norm": 3.1718385219573975, "learning_rate": 4.999087610575485e-05, "loss": 0.1969, "num_input_tokens_seen": 448672, "step": 345 }, { "epoch": 0.017101116458603082, "grad_norm": 0.5116874575614929, "learning_rate": 4.999060896545824e-05, "loss": 0.2058, "num_input_tokens_seen": 454720, "step": 350 }, { "epoch": 0.017345418122297413, "grad_norm": 0.663537323474884, "learning_rate": 4.999033797129568e-05, "loss": 0.1744, "num_input_tokens_seen": 461216, "step": 355 }, { "epoch": 0.017589719785991744, "grad_norm": 1.0356642007827759, "learning_rate": 4.999006312330894e-05, "loss": 0.1709, "num_input_tokens_seen": 467584, "step": 360 }, { "epoch": 0.01783402144968607, "grad_norm": 1.0372087955474854, "learning_rate": 4.998978442154043e-05, "loss": 0.1562, "num_input_tokens_seen": 473696, "step": 365 }, { "epoch": 0.018078323113380403, "grad_norm": 1.0467535257339478, "learning_rate": 4.9989501866033125e-05, "loss": 0.2204, "num_input_tokens_seen": 480288, "step": 370 }, { "epoch": 0.018322624777074734, "grad_norm": 1.1813163757324219, "learning_rate": 4.998921545683059e-05, "loss": 0.1824, "num_input_tokens_seen": 486720, "step": 375 }, { "epoch": 0.01856692644076906, "grad_norm": 0.5512180924415588, "learning_rate": 4.9988925193976996e-05, "loss": 0.1957, "num_input_tokens_seen": 493120, "step": 380 }, { "epoch": 0.018811228104463392, "grad_norm": 1.9954239130020142, "learning_rate": 4.998863107751711e-05, "loss": 0.1998, "num_input_tokens_seen": 499456, "step": 385 }, { "epoch": 0.01905552976815772, "grad_norm": 0.4591430127620697, "learning_rate": 4.998833310749629e-05, "loss": 0.1564, "num_input_tokens_seen": 505792, "step": 390 }, { "epoch": 0.01929983143185205, "grad_norm": 1.1354262828826904, "learning_rate": 4.998803128396047e-05, "loss": 0.1851, "num_input_tokens_seen": 512064, "step": 395 }, { "epoch": 0.01954413309554638, "grad_norm": 0.4885125160217285, "learning_rate": 4.9987725606956215e-05, "loss": 0.137, "num_input_tokens_seen": 518880, "step": 400 }, { "epoch": 0.01954413309554638, "eval_loss": 0.16603943705558777, "eval_runtime": 375.3918, "eval_samples_per_second": 96.925, "eval_steps_per_second": 24.233, "num_input_tokens_seen": 518880, "step": 400 }, { "epoch": 0.01978843475924071, "grad_norm": 0.4015945792198181, "learning_rate": 4.998741607653066e-05, "loss": 0.162, "num_input_tokens_seen": 526240, "step": 405 }, { "epoch": 0.02003273642293504, "grad_norm": 0.5113167762756348, "learning_rate": 4.9987102692731523e-05, "loss": 0.2119, "num_input_tokens_seen": 532768, "step": 410 }, { "epoch": 0.02027703808662937, "grad_norm": 0.7100062370300293, "learning_rate": 4.9986785455607157e-05, "loss": 0.1978, "num_input_tokens_seen": 539296, "step": 415 }, { "epoch": 0.0205213397503237, "grad_norm": 0.5853425860404968, "learning_rate": 4.9986464365206456e-05, "loss": 0.1445, "num_input_tokens_seen": 546464, "step": 420 }, { "epoch": 0.02076564141401803, "grad_norm": 0.8775040507316589, "learning_rate": 4.9986139421578956e-05, "loss": 0.1282, "num_input_tokens_seen": 552928, "step": 425 }, { "epoch": 0.02100994307771236, "grad_norm": 1.3816280364990234, "learning_rate": 4.998581062477477e-05, "loss": 0.1409, "num_input_tokens_seen": 559168, "step": 430 }, { "epoch": 0.021254244741406688, "grad_norm": 0.4245508015155792, "learning_rate": 4.998547797484458e-05, "loss": 0.2203, "num_input_tokens_seen": 565312, "step": 435 }, { "epoch": 0.02149854640510102, "grad_norm": 0.7351658344268799, "learning_rate": 4.9985141471839706e-05, "loss": 0.1883, "num_input_tokens_seen": 571488, "step": 440 }, { "epoch": 0.02174284806879535, "grad_norm": 1.1929872035980225, "learning_rate": 4.998480111581203e-05, "loss": 0.1571, "num_input_tokens_seen": 578016, "step": 445 }, { "epoch": 0.021987149732489678, "grad_norm": 0.7868479490280151, "learning_rate": 4.998445690681405e-05, "loss": 0.1619, "num_input_tokens_seen": 584480, "step": 450 }, { "epoch": 0.02223145139618401, "grad_norm": 0.4149157404899597, "learning_rate": 4.9984108844898834e-05, "loss": 0.1568, "num_input_tokens_seen": 591648, "step": 455 }, { "epoch": 0.02247575305987834, "grad_norm": 0.34102684259414673, "learning_rate": 4.9983756930120076e-05, "loss": 0.1296, "num_input_tokens_seen": 597760, "step": 460 }, { "epoch": 0.022720054723572667, "grad_norm": 0.48406094312667847, "learning_rate": 4.9983401162532025e-05, "loss": 0.1491, "num_input_tokens_seen": 604320, "step": 465 }, { "epoch": 0.022964356387266998, "grad_norm": 0.6906338930130005, "learning_rate": 4.998304154218955e-05, "loss": 0.151, "num_input_tokens_seen": 610592, "step": 470 }, { "epoch": 0.023208658050961326, "grad_norm": 0.8350521326065063, "learning_rate": 4.998267806914812e-05, "loss": 0.1084, "num_input_tokens_seen": 617120, "step": 475 }, { "epoch": 0.023452959714655656, "grad_norm": 0.5681911706924438, "learning_rate": 4.998231074346378e-05, "loss": 0.1946, "num_input_tokens_seen": 623296, "step": 480 }, { "epoch": 0.023697261378349987, "grad_norm": 0.6131694912910461, "learning_rate": 4.998193956519317e-05, "loss": 0.1383, "num_input_tokens_seen": 629984, "step": 485 }, { "epoch": 0.023941563042044315, "grad_norm": 1.1965006589889526, "learning_rate": 4.9981564534393545e-05, "loss": 0.1599, "num_input_tokens_seen": 636800, "step": 490 }, { "epoch": 0.024185864705738646, "grad_norm": 0.7094550728797913, "learning_rate": 4.998118565112272e-05, "loss": 0.1639, "num_input_tokens_seen": 643456, "step": 495 }, { "epoch": 0.024430166369432977, "grad_norm": 0.40330803394317627, "learning_rate": 4.998080291543914e-05, "loss": 0.1597, "num_input_tokens_seen": 650720, "step": 500 }, { "epoch": 0.024674468033127304, "grad_norm": 0.3551614284515381, "learning_rate": 4.9980416327401826e-05, "loss": 0.1539, "num_input_tokens_seen": 657440, "step": 505 }, { "epoch": 0.024918769696821635, "grad_norm": 1.0106029510498047, "learning_rate": 4.998002588707038e-05, "loss": 0.169, "num_input_tokens_seen": 663680, "step": 510 }, { "epoch": 0.025163071360515966, "grad_norm": 1.0352987051010132, "learning_rate": 4.997963159450503e-05, "loss": 0.1552, "num_input_tokens_seen": 670784, "step": 515 }, { "epoch": 0.025407373024210294, "grad_norm": 0.7572924494743347, "learning_rate": 4.9979233449766575e-05, "loss": 0.1874, "num_input_tokens_seen": 677760, "step": 520 }, { "epoch": 0.025651674687904625, "grad_norm": 0.8729822039604187, "learning_rate": 4.997883145291641e-05, "loss": 0.169, "num_input_tokens_seen": 684192, "step": 525 }, { "epoch": 0.025895976351598956, "grad_norm": 0.6148104071617126, "learning_rate": 4.9978425604016536e-05, "loss": 0.1621, "num_input_tokens_seen": 690432, "step": 530 }, { "epoch": 0.026140278015293283, "grad_norm": 0.7394568920135498, "learning_rate": 4.9978015903129536e-05, "loss": 0.1811, "num_input_tokens_seen": 696800, "step": 535 }, { "epoch": 0.026384579678987614, "grad_norm": 0.5455793142318726, "learning_rate": 4.997760235031859e-05, "loss": 0.1543, "num_input_tokens_seen": 703392, "step": 540 }, { "epoch": 0.026628881342681942, "grad_norm": 0.7559114098548889, "learning_rate": 4.9977184945647473e-05, "loss": 0.117, "num_input_tokens_seen": 709728, "step": 545 }, { "epoch": 0.026873183006376273, "grad_norm": 0.5582893490791321, "learning_rate": 4.997676368918055e-05, "loss": 0.1538, "num_input_tokens_seen": 715904, "step": 550 }, { "epoch": 0.027117484670070604, "grad_norm": 0.26657506823539734, "learning_rate": 4.9976338580982794e-05, "loss": 0.1603, "num_input_tokens_seen": 722304, "step": 555 }, { "epoch": 0.02736178633376493, "grad_norm": 0.2515397071838379, "learning_rate": 4.9975909621119755e-05, "loss": 0.1694, "num_input_tokens_seen": 728320, "step": 560 }, { "epoch": 0.027606087997459262, "grad_norm": 0.41713330149650574, "learning_rate": 4.997547680965758e-05, "loss": 0.139, "num_input_tokens_seen": 734496, "step": 565 }, { "epoch": 0.027850389661153593, "grad_norm": 0.5668849349021912, "learning_rate": 4.997504014666302e-05, "loss": 0.1344, "num_input_tokens_seen": 741248, "step": 570 }, { "epoch": 0.02809469132484792, "grad_norm": 0.3737994432449341, "learning_rate": 4.997459963220342e-05, "loss": 0.1758, "num_input_tokens_seen": 747712, "step": 575 }, { "epoch": 0.028338992988542252, "grad_norm": 0.30534040927886963, "learning_rate": 4.997415526634671e-05, "loss": 0.1196, "num_input_tokens_seen": 755040, "step": 580 }, { "epoch": 0.028583294652236583, "grad_norm": 0.3336613178253174, "learning_rate": 4.99737070491614e-05, "loss": 0.1117, "num_input_tokens_seen": 761536, "step": 585 }, { "epoch": 0.02882759631593091, "grad_norm": 1.1663768291473389, "learning_rate": 4.997325498071663e-05, "loss": 0.1323, "num_input_tokens_seen": 768000, "step": 590 }, { "epoch": 0.02907189797962524, "grad_norm": 1.267972469329834, "learning_rate": 4.997279906108211e-05, "loss": 0.129, "num_input_tokens_seen": 774112, "step": 595 }, { "epoch": 0.029316199643319572, "grad_norm": 0.5521332025527954, "learning_rate": 4.9972339290328155e-05, "loss": 0.1244, "num_input_tokens_seen": 780768, "step": 600 }, { "epoch": 0.029316199643319572, "eval_loss": 0.14877179265022278, "eval_runtime": 375.2888, "eval_samples_per_second": 96.952, "eval_steps_per_second": 24.24, "num_input_tokens_seen": 780768, "step": 600 }, { "epoch": 0.0295605013070139, "grad_norm": 0.5564755201339722, "learning_rate": 4.9971875668525646e-05, "loss": 0.1263, "num_input_tokens_seen": 787104, "step": 605 }, { "epoch": 0.02980480297070823, "grad_norm": 0.3615465760231018, "learning_rate": 4.997140819574609e-05, "loss": 0.1403, "num_input_tokens_seen": 793856, "step": 610 }, { "epoch": 0.030049104634402562, "grad_norm": 0.7862145304679871, "learning_rate": 4.997093687206159e-05, "loss": 0.1737, "num_input_tokens_seen": 800448, "step": 615 }, { "epoch": 0.03029340629809689, "grad_norm": 0.580680787563324, "learning_rate": 4.997046169754482e-05, "loss": 0.15, "num_input_tokens_seen": 807232, "step": 620 }, { "epoch": 0.03053770796179122, "grad_norm": 0.3152447044849396, "learning_rate": 4.996998267226905e-05, "loss": 0.1654, "num_input_tokens_seen": 813600, "step": 625 }, { "epoch": 0.030782009625485548, "grad_norm": 0.21116594970226288, "learning_rate": 4.996949979630817e-05, "loss": 0.1172, "num_input_tokens_seen": 820000, "step": 630 }, { "epoch": 0.03102631128917988, "grad_norm": 0.5218799710273743, "learning_rate": 4.996901306973663e-05, "loss": 0.1256, "num_input_tokens_seen": 826560, "step": 635 }, { "epoch": 0.031270612952874206, "grad_norm": 0.5859628915786743, "learning_rate": 4.996852249262949e-05, "loss": 0.1371, "num_input_tokens_seen": 832960, "step": 640 }, { "epoch": 0.03151491461656854, "grad_norm": 0.6644803285598755, "learning_rate": 4.996802806506241e-05, "loss": 0.1105, "num_input_tokens_seen": 839680, "step": 645 }, { "epoch": 0.03175921628026287, "grad_norm": 0.5046868324279785, "learning_rate": 4.996752978711164e-05, "loss": 0.153, "num_input_tokens_seen": 846592, "step": 650 }, { "epoch": 0.0320035179439572, "grad_norm": 0.8195663690567017, "learning_rate": 4.996702765885401e-05, "loss": 0.1085, "num_input_tokens_seen": 853088, "step": 655 }, { "epoch": 0.03224781960765153, "grad_norm": 1.456371784210205, "learning_rate": 4.9966521680366964e-05, "loss": 0.1631, "num_input_tokens_seen": 859424, "step": 660 }, { "epoch": 0.03249212127134586, "grad_norm": 0.5620949268341064, "learning_rate": 4.9966011851728524e-05, "loss": 0.1313, "num_input_tokens_seen": 866080, "step": 665 }, { "epoch": 0.032736422935040185, "grad_norm": 0.5160529613494873, "learning_rate": 4.996549817301731e-05, "loss": 0.1301, "num_input_tokens_seen": 872608, "step": 670 }, { "epoch": 0.032980724598734516, "grad_norm": 0.36695095896720886, "learning_rate": 4.9964980644312544e-05, "loss": 0.1636, "num_input_tokens_seen": 879008, "step": 675 }, { "epoch": 0.03322502626242885, "grad_norm": 0.4375484585762024, "learning_rate": 4.996445926569403e-05, "loss": 0.1455, "num_input_tokens_seen": 885408, "step": 680 }, { "epoch": 0.03346932792612318, "grad_norm": 0.3357987403869629, "learning_rate": 4.996393403724218e-05, "loss": 0.1142, "num_input_tokens_seen": 891552, "step": 685 }, { "epoch": 0.03371362958981751, "grad_norm": 0.35807204246520996, "learning_rate": 4.9963404959037985e-05, "loss": 0.1546, "num_input_tokens_seen": 897984, "step": 690 }, { "epoch": 0.03395793125351183, "grad_norm": 0.7181982398033142, "learning_rate": 4.996287203116303e-05, "loss": 0.156, "num_input_tokens_seen": 904480, "step": 695 }, { "epoch": 0.034202232917206164, "grad_norm": 0.2947688102722168, "learning_rate": 4.996233525369951e-05, "loss": 0.133, "num_input_tokens_seen": 910880, "step": 700 }, { "epoch": 0.034446534580900495, "grad_norm": 0.42973268032073975, "learning_rate": 4.99617946267302e-05, "loss": 0.1622, "num_input_tokens_seen": 917216, "step": 705 }, { "epoch": 0.034690836244594826, "grad_norm": 0.3220207691192627, "learning_rate": 4.996125015033846e-05, "loss": 0.1292, "num_input_tokens_seen": 923232, "step": 710 }, { "epoch": 0.03493513790828916, "grad_norm": 0.492258757352829, "learning_rate": 4.996070182460827e-05, "loss": 0.1773, "num_input_tokens_seen": 929824, "step": 715 }, { "epoch": 0.03517943957198349, "grad_norm": 0.624816358089447, "learning_rate": 4.996014964962418e-05, "loss": 0.1603, "num_input_tokens_seen": 936224, "step": 720 }, { "epoch": 0.03542374123567781, "grad_norm": 0.9788967370986938, "learning_rate": 4.9959593625471344e-05, "loss": 0.1607, "num_input_tokens_seen": 942432, "step": 725 }, { "epoch": 0.03566804289937214, "grad_norm": 0.332538366317749, "learning_rate": 4.995903375223552e-05, "loss": 0.106, "num_input_tokens_seen": 948672, "step": 730 }, { "epoch": 0.035912344563066474, "grad_norm": 0.4185684323310852, "learning_rate": 4.995847003000302e-05, "loss": 0.1571, "num_input_tokens_seen": 954880, "step": 735 }, { "epoch": 0.036156646226760805, "grad_norm": 0.8561639189720154, "learning_rate": 4.9957902458860804e-05, "loss": 0.1567, "num_input_tokens_seen": 961120, "step": 740 }, { "epoch": 0.036400947890455136, "grad_norm": 0.4978964626789093, "learning_rate": 4.995733103889639e-05, "loss": 0.1435, "num_input_tokens_seen": 967616, "step": 745 }, { "epoch": 0.03664524955414947, "grad_norm": 0.663611888885498, "learning_rate": 4.99567557701979e-05, "loss": 0.1858, "num_input_tokens_seen": 974272, "step": 750 }, { "epoch": 0.03688955121784379, "grad_norm": 0.24765193462371826, "learning_rate": 4.995617665285403e-05, "loss": 0.1183, "num_input_tokens_seen": 980608, "step": 755 }, { "epoch": 0.03713385288153812, "grad_norm": 0.7581337094306946, "learning_rate": 4.99555936869541e-05, "loss": 0.134, "num_input_tokens_seen": 987168, "step": 760 }, { "epoch": 0.03737815454523245, "grad_norm": 0.3382894694805145, "learning_rate": 4.995500687258803e-05, "loss": 0.2114, "num_input_tokens_seen": 993632, "step": 765 }, { "epoch": 0.037622456208926784, "grad_norm": 0.7266979217529297, "learning_rate": 4.995441620984628e-05, "loss": 0.1594, "num_input_tokens_seen": 999968, "step": 770 }, { "epoch": 0.037866757872621115, "grad_norm": 0.572500467300415, "learning_rate": 4.995382169881996e-05, "loss": 0.1407, "num_input_tokens_seen": 1006688, "step": 775 }, { "epoch": 0.03811105953631544, "grad_norm": 0.7732391357421875, "learning_rate": 4.9953223339600755e-05, "loss": 0.1235, "num_input_tokens_seen": 1012896, "step": 780 }, { "epoch": 0.03835536120000977, "grad_norm": 0.9629490375518799, "learning_rate": 4.995262113228091e-05, "loss": 0.1352, "num_input_tokens_seen": 1019136, "step": 785 }, { "epoch": 0.0385996628637041, "grad_norm": 0.3862694799900055, "learning_rate": 4.995201507695332e-05, "loss": 0.1549, "num_input_tokens_seen": 1025792, "step": 790 }, { "epoch": 0.03884396452739843, "grad_norm": 0.5082911252975464, "learning_rate": 4.995140517371144e-05, "loss": 0.1182, "num_input_tokens_seen": 1032032, "step": 795 }, { "epoch": 0.03908826619109276, "grad_norm": 0.460388720035553, "learning_rate": 4.995079142264932e-05, "loss": 0.1629, "num_input_tokens_seen": 1038304, "step": 800 }, { "epoch": 0.03908826619109276, "eval_loss": 0.13663995265960693, "eval_runtime": 374.7636, "eval_samples_per_second": 97.088, "eval_steps_per_second": 24.274, "num_input_tokens_seen": 1038304, "step": 800 }, { "epoch": 0.039332567854787094, "grad_norm": 0.9324464201927185, "learning_rate": 4.995017382386162e-05, "loss": 0.1663, "num_input_tokens_seen": 1044384, "step": 805 }, { "epoch": 0.03957686951848142, "grad_norm": 0.8469566106796265, "learning_rate": 4.994955237744356e-05, "loss": 0.1471, "num_input_tokens_seen": 1051040, "step": 810 }, { "epoch": 0.03982117118217575, "grad_norm": 1.2157275676727295, "learning_rate": 4.994892708349101e-05, "loss": 0.1344, "num_input_tokens_seen": 1057664, "step": 815 }, { "epoch": 0.04006547284587008, "grad_norm": 0.7170956134796143, "learning_rate": 4.994829794210035e-05, "loss": 0.1345, "num_input_tokens_seen": 1063968, "step": 820 }, { "epoch": 0.04030977450956441, "grad_norm": 0.7175129055976868, "learning_rate": 4.994766495336864e-05, "loss": 0.136, "num_input_tokens_seen": 1070624, "step": 825 }, { "epoch": 0.04055407617325874, "grad_norm": 0.23009970784187317, "learning_rate": 4.994702811739348e-05, "loss": 0.1322, "num_input_tokens_seen": 1077376, "step": 830 }, { "epoch": 0.04079837783695307, "grad_norm": 0.5560957789421082, "learning_rate": 4.994638743427308e-05, "loss": 0.1247, "num_input_tokens_seen": 1083840, "step": 835 }, { "epoch": 0.0410426795006474, "grad_norm": 0.36965784430503845, "learning_rate": 4.994574290410624e-05, "loss": 0.1387, "num_input_tokens_seen": 1090272, "step": 840 }, { "epoch": 0.04128698116434173, "grad_norm": 0.870733916759491, "learning_rate": 4.9945094526992364e-05, "loss": 0.1138, "num_input_tokens_seen": 1096992, "step": 845 }, { "epoch": 0.04153128282803606, "grad_norm": 0.8376825451850891, "learning_rate": 4.994444230303142e-05, "loss": 0.1332, "num_input_tokens_seen": 1103392, "step": 850 }, { "epoch": 0.04177558449173039, "grad_norm": 0.3101494312286377, "learning_rate": 4.994378623232402e-05, "loss": 0.1263, "num_input_tokens_seen": 1109664, "step": 855 }, { "epoch": 0.04201988615542472, "grad_norm": 0.7708681225776672, "learning_rate": 4.99431263149713e-05, "loss": 0.1225, "num_input_tokens_seen": 1116096, "step": 860 }, { "epoch": 0.042264187819119045, "grad_norm": 0.49696481227874756, "learning_rate": 4.9942462551075056e-05, "loss": 0.1263, "num_input_tokens_seen": 1122048, "step": 865 }, { "epoch": 0.042508489482813376, "grad_norm": 0.16828277707099915, "learning_rate": 4.994179494073764e-05, "loss": 0.1216, "num_input_tokens_seen": 1128512, "step": 870 }, { "epoch": 0.04275279114650771, "grad_norm": 0.21603140234947205, "learning_rate": 4.9941123484062e-05, "loss": 0.1184, "num_input_tokens_seen": 1134912, "step": 875 }, { "epoch": 0.04299709281020204, "grad_norm": 0.30149996280670166, "learning_rate": 4.99404481811517e-05, "loss": 0.1457, "num_input_tokens_seen": 1141504, "step": 880 }, { "epoch": 0.04324139447389637, "grad_norm": 0.6964754462242126, "learning_rate": 4.9939769032110864e-05, "loss": 0.1313, "num_input_tokens_seen": 1147872, "step": 885 }, { "epoch": 0.0434856961375907, "grad_norm": 0.4144536554813385, "learning_rate": 4.993908603704423e-05, "loss": 0.1355, "num_input_tokens_seen": 1154848, "step": 890 }, { "epoch": 0.043729997801285024, "grad_norm": 0.2719855010509491, "learning_rate": 4.9938399196057126e-05, "loss": 0.1021, "num_input_tokens_seen": 1161248, "step": 895 }, { "epoch": 0.043974299464979355, "grad_norm": 0.7866916060447693, "learning_rate": 4.993770850925547e-05, "loss": 0.1495, "num_input_tokens_seen": 1168032, "step": 900 }, { "epoch": 0.044218601128673686, "grad_norm": 1.2078666687011719, "learning_rate": 4.993701397674577e-05, "loss": 0.1702, "num_input_tokens_seen": 1174304, "step": 905 }, { "epoch": 0.04446290279236802, "grad_norm": 0.8769565224647522, "learning_rate": 4.993631559863515e-05, "loss": 0.1108, "num_input_tokens_seen": 1180640, "step": 910 }, { "epoch": 0.04470720445606235, "grad_norm": 0.9617577791213989, "learning_rate": 4.9935613375031283e-05, "loss": 0.1539, "num_input_tokens_seen": 1187104, "step": 915 }, { "epoch": 0.04495150611975668, "grad_norm": 1.879855990409851, "learning_rate": 4.993490730604248e-05, "loss": 0.1742, "num_input_tokens_seen": 1193600, "step": 920 }, { "epoch": 0.045195807783451, "grad_norm": 0.2622809112071991, "learning_rate": 4.993419739177761e-05, "loss": 0.1404, "num_input_tokens_seen": 1200384, "step": 925 }, { "epoch": 0.045440109447145334, "grad_norm": 0.3291712701320648, "learning_rate": 4.9933483632346164e-05, "loss": 0.1493, "num_input_tokens_seen": 1206528, "step": 930 }, { "epoch": 0.045684411110839665, "grad_norm": 0.24598684906959534, "learning_rate": 4.993276602785821e-05, "loss": 0.1164, "num_input_tokens_seen": 1212800, "step": 935 }, { "epoch": 0.045928712774533996, "grad_norm": 0.8889023661613464, "learning_rate": 4.993204457842441e-05, "loss": 0.1124, "num_input_tokens_seen": 1218976, "step": 940 }, { "epoch": 0.04617301443822833, "grad_norm": 0.5684680342674255, "learning_rate": 4.993131928415602e-05, "loss": 0.1341, "num_input_tokens_seen": 1225376, "step": 945 }, { "epoch": 0.04641731610192265, "grad_norm": 0.22609254717826843, "learning_rate": 4.993059014516489e-05, "loss": 0.099, "num_input_tokens_seen": 1231360, "step": 950 }, { "epoch": 0.04666161776561698, "grad_norm": 0.36800000071525574, "learning_rate": 4.9929857161563464e-05, "loss": 0.1366, "num_input_tokens_seen": 1237856, "step": 955 }, { "epoch": 0.04690591942931131, "grad_norm": 0.5549322962760925, "learning_rate": 4.992912033346477e-05, "loss": 0.1543, "num_input_tokens_seen": 1244384, "step": 960 }, { "epoch": 0.047150221093005644, "grad_norm": 0.9379312992095947, "learning_rate": 4.992837966098245e-05, "loss": 0.1251, "num_input_tokens_seen": 1250816, "step": 965 }, { "epoch": 0.047394522756699975, "grad_norm": 0.2533070147037506, "learning_rate": 4.992763514423071e-05, "loss": 0.093, "num_input_tokens_seen": 1257504, "step": 970 }, { "epoch": 0.047638824420394306, "grad_norm": 0.4473751485347748, "learning_rate": 4.992688678332437e-05, "loss": 0.1137, "num_input_tokens_seen": 1264160, "step": 975 }, { "epoch": 0.04788312608408863, "grad_norm": 0.23214903473854065, "learning_rate": 4.992613457837884e-05, "loss": 0.1109, "num_input_tokens_seen": 1271008, "step": 980 }, { "epoch": 0.04812742774778296, "grad_norm": 1.091675877571106, "learning_rate": 4.992537852951011e-05, "loss": 0.1303, "num_input_tokens_seen": 1277248, "step": 985 }, { "epoch": 0.04837172941147729, "grad_norm": 0.25854024291038513, "learning_rate": 4.9924618636834785e-05, "loss": 0.112, "num_input_tokens_seen": 1283712, "step": 990 }, { "epoch": 0.04861603107517162, "grad_norm": 0.48087549209594727, "learning_rate": 4.9923854900470046e-05, "loss": 0.1475, "num_input_tokens_seen": 1289632, "step": 995 }, { "epoch": 0.048860332738865954, "grad_norm": 0.3105148375034332, "learning_rate": 4.992308732053367e-05, "loss": 0.1414, "num_input_tokens_seen": 1296288, "step": 1000 }, { "epoch": 0.048860332738865954, "eval_loss": 0.12839382886886597, "eval_runtime": 374.7543, "eval_samples_per_second": 97.09, "eval_steps_per_second": 24.275, "num_input_tokens_seen": 1296288, "step": 1000 }, { "epoch": 0.049104634402560285, "grad_norm": 0.464666485786438, "learning_rate": 4.992231589714402e-05, "loss": 0.1227, "num_input_tokens_seen": 1302752, "step": 1005 }, { "epoch": 0.04934893606625461, "grad_norm": 0.37693437933921814, "learning_rate": 4.992154063042007e-05, "loss": 0.1258, "num_input_tokens_seen": 1309152, "step": 1010 }, { "epoch": 0.04959323772994894, "grad_norm": 0.2615901827812195, "learning_rate": 4.992076152048136e-05, "loss": 0.1349, "num_input_tokens_seen": 1315584, "step": 1015 }, { "epoch": 0.04983753939364327, "grad_norm": 0.8857989311218262, "learning_rate": 4.991997856744807e-05, "loss": 0.1317, "num_input_tokens_seen": 1322368, "step": 1020 }, { "epoch": 0.0500818410573376, "grad_norm": 0.4845932424068451, "learning_rate": 4.9919191771440905e-05, "loss": 0.1048, "num_input_tokens_seen": 1329152, "step": 1025 }, { "epoch": 0.05032614272103193, "grad_norm": 0.3887306749820709, "learning_rate": 4.991840113258122e-05, "loss": 0.1339, "num_input_tokens_seen": 1335712, "step": 1030 }, { "epoch": 0.05057044438472626, "grad_norm": 0.2786475718021393, "learning_rate": 4.9917606650990933e-05, "loss": 0.1094, "num_input_tokens_seen": 1341632, "step": 1035 }, { "epoch": 0.05081474604842059, "grad_norm": 0.32800808548927307, "learning_rate": 4.9916808326792566e-05, "loss": 0.1322, "num_input_tokens_seen": 1348096, "step": 1040 }, { "epoch": 0.05105904771211492, "grad_norm": 0.32484838366508484, "learning_rate": 4.9916006160109235e-05, "loss": 0.1542, "num_input_tokens_seen": 1354432, "step": 1045 }, { "epoch": 0.05130334937580925, "grad_norm": 0.9136307239532471, "learning_rate": 4.991520015106464e-05, "loss": 0.1192, "num_input_tokens_seen": 1360864, "step": 1050 }, { "epoch": 0.05154765103950358, "grad_norm": 0.7490929961204529, "learning_rate": 4.991439029978308e-05, "loss": 0.1354, "num_input_tokens_seen": 1366848, "step": 1055 }, { "epoch": 0.05179195270319791, "grad_norm": 0.3712176978588104, "learning_rate": 4.9913576606389434e-05, "loss": 0.1207, "num_input_tokens_seen": 1372960, "step": 1060 }, { "epoch": 0.052036254366892236, "grad_norm": 0.30699291825294495, "learning_rate": 4.991275907100919e-05, "loss": 0.1439, "num_input_tokens_seen": 1380032, "step": 1065 }, { "epoch": 0.05228055603058657, "grad_norm": 0.8964342474937439, "learning_rate": 4.9911937693768434e-05, "loss": 0.1263, "num_input_tokens_seen": 1386464, "step": 1070 }, { "epoch": 0.0525248576942809, "grad_norm": 0.3400002419948578, "learning_rate": 4.991111247479382e-05, "loss": 0.1403, "num_input_tokens_seen": 1392992, "step": 1075 }, { "epoch": 0.05276915935797523, "grad_norm": 0.4651097059249878, "learning_rate": 4.9910283414212605e-05, "loss": 0.121, "num_input_tokens_seen": 1399360, "step": 1080 }, { "epoch": 0.05301346102166956, "grad_norm": 2.50156831741333, "learning_rate": 4.990945051215265e-05, "loss": 0.2052, "num_input_tokens_seen": 1405760, "step": 1085 }, { "epoch": 0.053257762685363884, "grad_norm": 0.8468628525733948, "learning_rate": 4.99086137687424e-05, "loss": 0.1419, "num_input_tokens_seen": 1412000, "step": 1090 }, { "epoch": 0.053502064349058215, "grad_norm": 1.4272916316986084, "learning_rate": 4.9907773184110874e-05, "loss": 0.1594, "num_input_tokens_seen": 1418240, "step": 1095 }, { "epoch": 0.053746366012752546, "grad_norm": 0.5385130643844604, "learning_rate": 4.9906928758387715e-05, "loss": 0.1133, "num_input_tokens_seen": 1424576, "step": 1100 }, { "epoch": 0.05399066767644688, "grad_norm": 0.9526497721672058, "learning_rate": 4.9906080491703146e-05, "loss": 0.1237, "num_input_tokens_seen": 1430720, "step": 1105 }, { "epoch": 0.05423496934014121, "grad_norm": 1.9801167249679565, "learning_rate": 4.990522838418797e-05, "loss": 0.1622, "num_input_tokens_seen": 1437280, "step": 1110 }, { "epoch": 0.05447927100383554, "grad_norm": 0.17620107531547546, "learning_rate": 4.9904372435973604e-05, "loss": 0.1207, "num_input_tokens_seen": 1443872, "step": 1115 }, { "epoch": 0.05472357266752986, "grad_norm": 0.25610843300819397, "learning_rate": 4.990351264719203e-05, "loss": 0.112, "num_input_tokens_seen": 1450304, "step": 1120 }, { "epoch": 0.054967874331224194, "grad_norm": 0.569233238697052, "learning_rate": 4.990264901797586e-05, "loss": 0.1362, "num_input_tokens_seen": 1456288, "step": 1125 }, { "epoch": 0.055212175994918525, "grad_norm": 0.2744321823120117, "learning_rate": 4.990178154845826e-05, "loss": 0.1366, "num_input_tokens_seen": 1462944, "step": 1130 }, { "epoch": 0.055456477658612856, "grad_norm": 0.3421759009361267, "learning_rate": 4.9900910238773014e-05, "loss": 0.1245, "num_input_tokens_seen": 1469472, "step": 1135 }, { "epoch": 0.05570077932230719, "grad_norm": 0.24267233908176422, "learning_rate": 4.990003508905448e-05, "loss": 0.1394, "num_input_tokens_seen": 1475648, "step": 1140 }, { "epoch": 0.05594508098600152, "grad_norm": 0.35325953364372253, "learning_rate": 4.989915609943763e-05, "loss": 0.1396, "num_input_tokens_seen": 1482432, "step": 1145 }, { "epoch": 0.05618938264969584, "grad_norm": 0.24340598285198212, "learning_rate": 4.9898273270058e-05, "loss": 0.1245, "num_input_tokens_seen": 1489152, "step": 1150 }, { "epoch": 0.05643368431339017, "grad_norm": 0.17413754761219025, "learning_rate": 4.989738660105174e-05, "loss": 0.1002, "num_input_tokens_seen": 1495232, "step": 1155 }, { "epoch": 0.056677985977084504, "grad_norm": 0.34656408429145813, "learning_rate": 4.989649609255559e-05, "loss": 0.1257, "num_input_tokens_seen": 1502176, "step": 1160 }, { "epoch": 0.056922287640778835, "grad_norm": 0.5852197408676147, "learning_rate": 4.989560174470687e-05, "loss": 0.0929, "num_input_tokens_seen": 1509408, "step": 1165 }, { "epoch": 0.057166589304473166, "grad_norm": 0.25255104899406433, "learning_rate": 4.989470355764351e-05, "loss": 0.1066, "num_input_tokens_seen": 1515808, "step": 1170 }, { "epoch": 0.05741089096816749, "grad_norm": 0.48087653517723083, "learning_rate": 4.9893801531504e-05, "loss": 0.14, "num_input_tokens_seen": 1521856, "step": 1175 }, { "epoch": 0.05765519263186182, "grad_norm": 0.29383769631385803, "learning_rate": 4.9892895666427475e-05, "loss": 0.1239, "num_input_tokens_seen": 1528000, "step": 1180 }, { "epoch": 0.05789949429555615, "grad_norm": 0.5967001914978027, "learning_rate": 4.9891985962553606e-05, "loss": 0.1242, "num_input_tokens_seen": 1534816, "step": 1185 }, { "epoch": 0.05814379595925048, "grad_norm": 0.5850064754486084, "learning_rate": 4.989107242002269e-05, "loss": 0.1282, "num_input_tokens_seen": 1541568, "step": 1190 }, { "epoch": 0.058388097622944814, "grad_norm": 0.6399086713790894, "learning_rate": 4.989015503897561e-05, "loss": 0.1039, "num_input_tokens_seen": 1547680, "step": 1195 }, { "epoch": 0.058632399286639145, "grad_norm": 0.19738279283046722, "learning_rate": 4.988923381955383e-05, "loss": 0.1101, "num_input_tokens_seen": 1554400, "step": 1200 }, { "epoch": 0.058632399286639145, "eval_loss": 0.12483752518892288, "eval_runtime": 374.3803, "eval_samples_per_second": 97.187, "eval_steps_per_second": 24.299, "num_input_tokens_seen": 1554400, "step": 1200 }, { "epoch": 0.05887670095033347, "grad_norm": 0.28427350521087646, "learning_rate": 4.988830876189942e-05, "loss": 0.1044, "num_input_tokens_seen": 1560960, "step": 1205 }, { "epoch": 0.0591210026140278, "grad_norm": 0.42542320489883423, "learning_rate": 4.988737986615503e-05, "loss": 0.1265, "num_input_tokens_seen": 1567648, "step": 1210 }, { "epoch": 0.05936530427772213, "grad_norm": 0.23856550455093384, "learning_rate": 4.988644713246391e-05, "loss": 0.1211, "num_input_tokens_seen": 1573728, "step": 1215 }, { "epoch": 0.05960960594141646, "grad_norm": 0.3160989284515381, "learning_rate": 4.988551056096991e-05, "loss": 0.1254, "num_input_tokens_seen": 1580416, "step": 1220 }, { "epoch": 0.05985390760511079, "grad_norm": 0.2695930004119873, "learning_rate": 4.988457015181743e-05, "loss": 0.1171, "num_input_tokens_seen": 1586816, "step": 1225 }, { "epoch": 0.060098209268805124, "grad_norm": 1.4598729610443115, "learning_rate": 4.988362590515153e-05, "loss": 0.1384, "num_input_tokens_seen": 1593344, "step": 1230 }, { "epoch": 0.06034251093249945, "grad_norm": 0.7046808004379272, "learning_rate": 4.9882677821117805e-05, "loss": 0.131, "num_input_tokens_seen": 1599648, "step": 1235 }, { "epoch": 0.06058681259619378, "grad_norm": 0.6008304357528687, "learning_rate": 4.988172589986246e-05, "loss": 0.1131, "num_input_tokens_seen": 1606144, "step": 1240 }, { "epoch": 0.06083111425988811, "grad_norm": 0.4933989346027374, "learning_rate": 4.9880770141532304e-05, "loss": 0.1356, "num_input_tokens_seen": 1612640, "step": 1245 }, { "epoch": 0.06107541592358244, "grad_norm": 0.6385255455970764, "learning_rate": 4.987981054627472e-05, "loss": 0.0979, "num_input_tokens_seen": 1619040, "step": 1250 }, { "epoch": 0.06131971758727677, "grad_norm": 0.3269837498664856, "learning_rate": 4.987884711423769e-05, "loss": 0.1354, "num_input_tokens_seen": 1625184, "step": 1255 }, { "epoch": 0.061564019250971096, "grad_norm": 0.4979533553123474, "learning_rate": 4.9877879845569784e-05, "loss": 0.1549, "num_input_tokens_seen": 1631520, "step": 1260 }, { "epoch": 0.06180832091466543, "grad_norm": 0.25080054998397827, "learning_rate": 4.9876908740420175e-05, "loss": 0.1137, "num_input_tokens_seen": 1638080, "step": 1265 }, { "epoch": 0.06205262257835976, "grad_norm": 0.41899144649505615, "learning_rate": 4.987593379893861e-05, "loss": 0.1482, "num_input_tokens_seen": 1644384, "step": 1270 }, { "epoch": 0.06229692424205409, "grad_norm": 0.5510353446006775, "learning_rate": 4.987495502127545e-05, "loss": 0.1381, "num_input_tokens_seen": 1650816, "step": 1275 }, { "epoch": 0.06254122590574841, "grad_norm": 0.3925706148147583, "learning_rate": 4.987397240758162e-05, "loss": 0.1102, "num_input_tokens_seen": 1657152, "step": 1280 }, { "epoch": 0.06278552756944275, "grad_norm": 0.5624771118164062, "learning_rate": 4.9872985958008664e-05, "loss": 0.1314, "num_input_tokens_seen": 1663360, "step": 1285 }, { "epoch": 0.06302982923313707, "grad_norm": 0.4418310523033142, "learning_rate": 4.987199567270871e-05, "loss": 0.129, "num_input_tokens_seen": 1670080, "step": 1290 }, { "epoch": 0.06327413089683141, "grad_norm": 0.863349437713623, "learning_rate": 4.9871001551834444e-05, "loss": 0.1059, "num_input_tokens_seen": 1676992, "step": 1295 }, { "epoch": 0.06351843256052574, "grad_norm": 0.24372227489948273, "learning_rate": 4.98700035955392e-05, "loss": 0.1067, "num_input_tokens_seen": 1683168, "step": 1300 }, { "epoch": 0.06376273422422006, "grad_norm": 0.25625118613243103, "learning_rate": 4.986900180397686e-05, "loss": 0.1417, "num_input_tokens_seen": 1689632, "step": 1305 }, { "epoch": 0.0640070358879144, "grad_norm": 0.41220057010650635, "learning_rate": 4.9867996177301926e-05, "loss": 0.1387, "num_input_tokens_seen": 1696320, "step": 1310 }, { "epoch": 0.06425133755160872, "grad_norm": 0.17168757319450378, "learning_rate": 4.9866986715669464e-05, "loss": 0.0955, "num_input_tokens_seen": 1702656, "step": 1315 }, { "epoch": 0.06449563921530306, "grad_norm": 0.7754583358764648, "learning_rate": 4.9865973419235155e-05, "loss": 0.1205, "num_input_tokens_seen": 1709344, "step": 1320 }, { "epoch": 0.06473994087899738, "grad_norm": 0.7110973000526428, "learning_rate": 4.986495628815526e-05, "loss": 0.1112, "num_input_tokens_seen": 1716064, "step": 1325 }, { "epoch": 0.06498424254269172, "grad_norm": 0.25429001450538635, "learning_rate": 4.986393532258663e-05, "loss": 0.1062, "num_input_tokens_seen": 1722592, "step": 1330 }, { "epoch": 0.06522854420638605, "grad_norm": 0.312883585691452, "learning_rate": 4.986291052268671e-05, "loss": 0.1436, "num_input_tokens_seen": 1729120, "step": 1335 }, { "epoch": 0.06547284587008037, "grad_norm": 0.2793474495410919, "learning_rate": 4.986188188861355e-05, "loss": 0.106, "num_input_tokens_seen": 1735424, "step": 1340 }, { "epoch": 0.06571714753377471, "grad_norm": 0.5432543754577637, "learning_rate": 4.9860849420525766e-05, "loss": 0.0916, "num_input_tokens_seen": 1742208, "step": 1345 }, { "epoch": 0.06596144919746903, "grad_norm": 0.37033435702323914, "learning_rate": 4.9859813118582575e-05, "loss": 0.0942, "num_input_tokens_seen": 1748736, "step": 1350 }, { "epoch": 0.06620575086116337, "grad_norm": 0.1890176236629486, "learning_rate": 4.98587729829438e-05, "loss": 0.1086, "num_input_tokens_seen": 1754912, "step": 1355 }, { "epoch": 0.0664500525248577, "grad_norm": 0.3513653874397278, "learning_rate": 4.985772901376983e-05, "loss": 0.1144, "num_input_tokens_seen": 1761120, "step": 1360 }, { "epoch": 0.06669435418855202, "grad_norm": 0.5658845901489258, "learning_rate": 4.9856681211221666e-05, "loss": 0.1167, "num_input_tokens_seen": 1767360, "step": 1365 }, { "epoch": 0.06693865585224636, "grad_norm": 0.5734747648239136, "learning_rate": 4.985562957546089e-05, "loss": 0.1143, "num_input_tokens_seen": 1774656, "step": 1370 }, { "epoch": 0.06718295751594068, "grad_norm": 0.3257296681404114, "learning_rate": 4.9854574106649686e-05, "loss": 0.0987, "num_input_tokens_seen": 1781216, "step": 1375 }, { "epoch": 0.06742725917963502, "grad_norm": 0.49185460805892944, "learning_rate": 4.985351480495081e-05, "loss": 0.1551, "num_input_tokens_seen": 1788064, "step": 1380 }, { "epoch": 0.06767156084332934, "grad_norm": 0.38879406452178955, "learning_rate": 4.985245167052762e-05, "loss": 0.0855, "num_input_tokens_seen": 1794720, "step": 1385 }, { "epoch": 0.06791586250702367, "grad_norm": 0.7322312593460083, "learning_rate": 4.9851384703544066e-05, "loss": 0.1308, "num_input_tokens_seen": 1800864, "step": 1390 }, { "epoch": 0.068160164170718, "grad_norm": 0.23179581761360168, "learning_rate": 4.985031390416469e-05, "loss": 0.1009, "num_input_tokens_seen": 1807424, "step": 1395 }, { "epoch": 0.06840446583441233, "grad_norm": 0.26655733585357666, "learning_rate": 4.984923927255461e-05, "loss": 0.1429, "num_input_tokens_seen": 1813856, "step": 1400 }, { "epoch": 0.06840446583441233, "eval_loss": 0.12042272090911865, "eval_runtime": 374.4959, "eval_samples_per_second": 97.157, "eval_steps_per_second": 24.291, "num_input_tokens_seen": 1813856, "step": 1400 }, { "epoch": 0.06864876749810667, "grad_norm": 0.6458513736724854, "learning_rate": 4.984816080887958e-05, "loss": 0.1357, "num_input_tokens_seen": 1820544, "step": 1405 }, { "epoch": 0.06889306916180099, "grad_norm": 0.840027928352356, "learning_rate": 4.9847078513305875e-05, "loss": 0.0938, "num_input_tokens_seen": 1827616, "step": 1410 }, { "epoch": 0.06913737082549533, "grad_norm": 0.24412764608860016, "learning_rate": 4.984599238600043e-05, "loss": 0.1092, "num_input_tokens_seen": 1834112, "step": 1415 }, { "epoch": 0.06938167248918965, "grad_norm": 0.23427866399288177, "learning_rate": 4.9844902427130716e-05, "loss": 0.1098, "num_input_tokens_seen": 1840320, "step": 1420 }, { "epoch": 0.06962597415288398, "grad_norm": 1.0325570106506348, "learning_rate": 4.984380863686482e-05, "loss": 0.1518, "num_input_tokens_seen": 1846816, "step": 1425 }, { "epoch": 0.06987027581657831, "grad_norm": 0.31596192717552185, "learning_rate": 4.984271101537143e-05, "loss": 0.1173, "num_input_tokens_seen": 1852864, "step": 1430 }, { "epoch": 0.07011457748027264, "grad_norm": 0.8340107202529907, "learning_rate": 4.9841609562819816e-05, "loss": 0.1158, "num_input_tokens_seen": 1859328, "step": 1435 }, { "epoch": 0.07035887914396698, "grad_norm": 0.6839359998703003, "learning_rate": 4.984050427937983e-05, "loss": 0.1353, "num_input_tokens_seen": 1865856, "step": 1440 }, { "epoch": 0.0706031808076613, "grad_norm": 0.7426912188529968, "learning_rate": 4.983939516522191e-05, "loss": 0.1195, "num_input_tokens_seen": 1872672, "step": 1445 }, { "epoch": 0.07084748247135562, "grad_norm": 0.4870670437812805, "learning_rate": 4.983828222051711e-05, "loss": 0.1267, "num_input_tokens_seen": 1879392, "step": 1450 }, { "epoch": 0.07109178413504996, "grad_norm": 0.323607861995697, "learning_rate": 4.983716544543705e-05, "loss": 0.1277, "num_input_tokens_seen": 1886688, "step": 1455 }, { "epoch": 0.07133608579874429, "grad_norm": 0.6219181418418884, "learning_rate": 4.983604484015395e-05, "loss": 0.1089, "num_input_tokens_seen": 1892352, "step": 1460 }, { "epoch": 0.07158038746243862, "grad_norm": 0.35047176480293274, "learning_rate": 4.983492040484064e-05, "loss": 0.1116, "num_input_tokens_seen": 1899136, "step": 1465 }, { "epoch": 0.07182468912613295, "grad_norm": 0.2843092381954193, "learning_rate": 4.98337921396705e-05, "loss": 0.1187, "num_input_tokens_seen": 1905856, "step": 1470 }, { "epoch": 0.07206899078982727, "grad_norm": 0.3645287752151489, "learning_rate": 4.983266004481753e-05, "loss": 0.1078, "num_input_tokens_seen": 1912608, "step": 1475 }, { "epoch": 0.07231329245352161, "grad_norm": 0.28492534160614014, "learning_rate": 4.9831524120456316e-05, "loss": 0.1002, "num_input_tokens_seen": 1918816, "step": 1480 }, { "epoch": 0.07255759411721593, "grad_norm": 0.5144096612930298, "learning_rate": 4.9830384366762026e-05, "loss": 0.1165, "num_input_tokens_seen": 1925440, "step": 1485 }, { "epoch": 0.07280189578091027, "grad_norm": 0.2551226019859314, "learning_rate": 4.9829240783910436e-05, "loss": 0.1069, "num_input_tokens_seen": 1932064, "step": 1490 }, { "epoch": 0.0730461974446046, "grad_norm": 0.3466969430446625, "learning_rate": 4.982809337207789e-05, "loss": 0.102, "num_input_tokens_seen": 1938176, "step": 1495 }, { "epoch": 0.07329049910829893, "grad_norm": 0.2258131057024002, "learning_rate": 4.9826942131441337e-05, "loss": 0.1001, "num_input_tokens_seen": 1945152, "step": 1500 }, { "epoch": 0.07353480077199326, "grad_norm": 0.43081921339035034, "learning_rate": 4.9825787062178315e-05, "loss": 0.1211, "num_input_tokens_seen": 1951520, "step": 1505 }, { "epoch": 0.07377910243568758, "grad_norm": 0.5383859276771545, "learning_rate": 4.9824628164466945e-05, "loss": 0.1209, "num_input_tokens_seen": 1958176, "step": 1510 }, { "epoch": 0.07402340409938192, "grad_norm": 0.8575022220611572, "learning_rate": 4.982346543848595e-05, "loss": 0.1088, "num_input_tokens_seen": 1964608, "step": 1515 }, { "epoch": 0.07426770576307624, "grad_norm": 1.019161581993103, "learning_rate": 4.9822298884414626e-05, "loss": 0.1192, "num_input_tokens_seen": 1971168, "step": 1520 }, { "epoch": 0.07451200742677058, "grad_norm": 0.6277984976768494, "learning_rate": 4.982112850243288e-05, "loss": 0.09, "num_input_tokens_seen": 1977632, "step": 1525 }, { "epoch": 0.0747563090904649, "grad_norm": 0.48934563994407654, "learning_rate": 4.98199542927212e-05, "loss": 0.1155, "num_input_tokens_seen": 1984032, "step": 1530 }, { "epoch": 0.07500061075415923, "grad_norm": 0.292464941740036, "learning_rate": 4.981877625546066e-05, "loss": 0.1132, "num_input_tokens_seen": 1990432, "step": 1535 }, { "epoch": 0.07524491241785357, "grad_norm": 0.2558807134628296, "learning_rate": 4.981759439083293e-05, "loss": 0.1232, "num_input_tokens_seen": 1996992, "step": 1540 }, { "epoch": 0.07548921408154789, "grad_norm": 0.2713088393211365, "learning_rate": 4.981640869902027e-05, "loss": 0.1174, "num_input_tokens_seen": 2004032, "step": 1545 }, { "epoch": 0.07573351574524223, "grad_norm": 0.49059486389160156, "learning_rate": 4.9815219180205517e-05, "loss": 0.1092, "num_input_tokens_seen": 2010464, "step": 1550 }, { "epoch": 0.07597781740893655, "grad_norm": 0.7369285225868225, "learning_rate": 4.9814025834572126e-05, "loss": 0.1221, "num_input_tokens_seen": 2016864, "step": 1555 }, { "epoch": 0.07622211907263088, "grad_norm": 0.2653580605983734, "learning_rate": 4.981282866230411e-05, "loss": 0.1107, "num_input_tokens_seen": 2023680, "step": 1560 }, { "epoch": 0.07646642073632522, "grad_norm": 0.2456459254026413, "learning_rate": 4.981162766358611e-05, "loss": 0.1062, "num_input_tokens_seen": 2030112, "step": 1565 }, { "epoch": 0.07671072240001954, "grad_norm": 0.17351789772510529, "learning_rate": 4.9810422838603316e-05, "loss": 0.087, "num_input_tokens_seen": 2036256, "step": 1570 }, { "epoch": 0.07695502406371388, "grad_norm": 0.5521295666694641, "learning_rate": 4.9809214187541533e-05, "loss": 0.1204, "num_input_tokens_seen": 2042368, "step": 1575 }, { "epoch": 0.0771993257274082, "grad_norm": 0.35773274302482605, "learning_rate": 4.980800171058715e-05, "loss": 0.1178, "num_input_tokens_seen": 2048736, "step": 1580 }, { "epoch": 0.07744362739110254, "grad_norm": 0.2242223024368286, "learning_rate": 4.980678540792715e-05, "loss": 0.1213, "num_input_tokens_seen": 2055168, "step": 1585 }, { "epoch": 0.07768792905479686, "grad_norm": 0.22164283692836761, "learning_rate": 4.980556527974909e-05, "loss": 0.1264, "num_input_tokens_seen": 2061824, "step": 1590 }, { "epoch": 0.07793223071849119, "grad_norm": 0.5737943649291992, "learning_rate": 4.980434132624114e-05, "loss": 0.1056, "num_input_tokens_seen": 2068192, "step": 1595 }, { "epoch": 0.07817653238218553, "grad_norm": 0.22074872255325317, "learning_rate": 4.980311354759205e-05, "loss": 0.1061, "num_input_tokens_seen": 2074816, "step": 1600 }, { "epoch": 0.07817653238218553, "eval_loss": 0.11742368340492249, "eval_runtime": 374.889, "eval_samples_per_second": 97.055, "eval_steps_per_second": 24.266, "num_input_tokens_seen": 2074816, "step": 1600 }, { "epoch": 0.07842083404587985, "grad_norm": 0.2360423058271408, "learning_rate": 4.980188194399116e-05, "loss": 0.1353, "num_input_tokens_seen": 2081152, "step": 1605 }, { "epoch": 0.07866513570957419, "grad_norm": 0.21458959579467773, "learning_rate": 4.9800646515628384e-05, "loss": 0.1257, "num_input_tokens_seen": 2087680, "step": 1610 }, { "epoch": 0.07890943737326851, "grad_norm": 0.2771640121936798, "learning_rate": 4.979940726269426e-05, "loss": 0.1086, "num_input_tokens_seen": 2094272, "step": 1615 }, { "epoch": 0.07915373903696284, "grad_norm": 0.36047831177711487, "learning_rate": 4.979816418537988e-05, "loss": 0.1353, "num_input_tokens_seen": 2100576, "step": 1620 }, { "epoch": 0.07939804070065717, "grad_norm": 0.3074171245098114, "learning_rate": 4.979691728387696e-05, "loss": 0.1318, "num_input_tokens_seen": 2106816, "step": 1625 }, { "epoch": 0.0796423423643515, "grad_norm": 0.29594576358795166, "learning_rate": 4.979566655837776e-05, "loss": 0.1071, "num_input_tokens_seen": 2112960, "step": 1630 }, { "epoch": 0.07988664402804584, "grad_norm": 0.3125322461128235, "learning_rate": 4.9794412009075184e-05, "loss": 0.1103, "num_input_tokens_seen": 2119360, "step": 1635 }, { "epoch": 0.08013094569174016, "grad_norm": 0.26007863879203796, "learning_rate": 4.979315363616269e-05, "loss": 0.1252, "num_input_tokens_seen": 2126048, "step": 1640 }, { "epoch": 0.08037524735543448, "grad_norm": 0.200529083609581, "learning_rate": 4.979189143983434e-05, "loss": 0.1036, "num_input_tokens_seen": 2132736, "step": 1645 }, { "epoch": 0.08061954901912882, "grad_norm": 0.2557392716407776, "learning_rate": 4.979062542028478e-05, "loss": 0.1221, "num_input_tokens_seen": 2138880, "step": 1650 }, { "epoch": 0.08086385068282315, "grad_norm": 0.1946401447057724, "learning_rate": 4.978935557770923e-05, "loss": 0.1124, "num_input_tokens_seen": 2145856, "step": 1655 }, { "epoch": 0.08110815234651748, "grad_norm": 0.30987998843193054, "learning_rate": 4.978808191230353e-05, "loss": 0.1041, "num_input_tokens_seen": 2152480, "step": 1660 }, { "epoch": 0.08135245401021181, "grad_norm": 0.4040214717388153, "learning_rate": 4.9786804424264085e-05, "loss": 0.1, "num_input_tokens_seen": 2158720, "step": 1665 }, { "epoch": 0.08159675567390615, "grad_norm": 0.2724892795085907, "learning_rate": 4.978552311378792e-05, "loss": 0.1534, "num_input_tokens_seen": 2165536, "step": 1670 }, { "epoch": 0.08184105733760047, "grad_norm": 0.22152768075466156, "learning_rate": 4.978423798107261e-05, "loss": 0.1264, "num_input_tokens_seen": 2172128, "step": 1675 }, { "epoch": 0.0820853590012948, "grad_norm": 0.24465692043304443, "learning_rate": 4.978294902631635e-05, "loss": 0.116, "num_input_tokens_seen": 2178400, "step": 1680 }, { "epoch": 0.08232966066498913, "grad_norm": 0.43770745396614075, "learning_rate": 4.9781656249717914e-05, "loss": 0.1234, "num_input_tokens_seen": 2184736, "step": 1685 }, { "epoch": 0.08257396232868346, "grad_norm": 0.8486392498016357, "learning_rate": 4.9780359651476645e-05, "loss": 0.1053, "num_input_tokens_seen": 2191328, "step": 1690 }, { "epoch": 0.0828182639923778, "grad_norm": 0.5837977528572083, "learning_rate": 4.977905923179251e-05, "loss": 0.1328, "num_input_tokens_seen": 2197632, "step": 1695 }, { "epoch": 0.08306256565607212, "grad_norm": 0.1940063238143921, "learning_rate": 4.977775499086606e-05, "loss": 0.0942, "num_input_tokens_seen": 2204000, "step": 1700 }, { "epoch": 0.08330686731976644, "grad_norm": 0.26092010736465454, "learning_rate": 4.97764469288984e-05, "loss": 0.1141, "num_input_tokens_seen": 2210272, "step": 1705 }, { "epoch": 0.08355116898346078, "grad_norm": 0.8512888550758362, "learning_rate": 4.977513504609127e-05, "loss": 0.1084, "num_input_tokens_seen": 2217248, "step": 1710 }, { "epoch": 0.0837954706471551, "grad_norm": 0.3453160226345062, "learning_rate": 4.9773819342646965e-05, "loss": 0.1165, "num_input_tokens_seen": 2223584, "step": 1715 }, { "epoch": 0.08403977231084944, "grad_norm": 0.33305060863494873, "learning_rate": 4.97724998187684e-05, "loss": 0.1169, "num_input_tokens_seen": 2230720, "step": 1720 }, { "epoch": 0.08428407397454377, "grad_norm": 0.23264551162719727, "learning_rate": 4.9771176474659045e-05, "loss": 0.1042, "num_input_tokens_seen": 2237216, "step": 1725 }, { "epoch": 0.08452837563823809, "grad_norm": 0.41015365719795227, "learning_rate": 4.976984931052299e-05, "loss": 0.131, "num_input_tokens_seen": 2243584, "step": 1730 }, { "epoch": 0.08477267730193243, "grad_norm": 0.5348792672157288, "learning_rate": 4.976851832656489e-05, "loss": 0.1215, "num_input_tokens_seen": 2250240, "step": 1735 }, { "epoch": 0.08501697896562675, "grad_norm": 0.9558844566345215, "learning_rate": 4.9767183522990004e-05, "loss": 0.1231, "num_input_tokens_seen": 2256480, "step": 1740 }, { "epoch": 0.08526128062932109, "grad_norm": 0.24336354434490204, "learning_rate": 4.9765844900004176e-05, "loss": 0.1269, "num_input_tokens_seen": 2263232, "step": 1745 }, { "epoch": 0.08550558229301541, "grad_norm": 1.0774308443069458, "learning_rate": 4.9764502457813834e-05, "loss": 0.125, "num_input_tokens_seen": 2269568, "step": 1750 }, { "epoch": 0.08574988395670975, "grad_norm": 0.3472192883491516, "learning_rate": 4.9763156196626005e-05, "loss": 0.0857, "num_input_tokens_seen": 2275840, "step": 1755 }, { "epoch": 0.08599418562040408, "grad_norm": 0.38795536756515503, "learning_rate": 4.97618061166483e-05, "loss": 0.1311, "num_input_tokens_seen": 2281824, "step": 1760 }, { "epoch": 0.0862384872840984, "grad_norm": 0.19003154337406158, "learning_rate": 4.9760452218088915e-05, "loss": 0.0867, "num_input_tokens_seen": 2288288, "step": 1765 }, { "epoch": 0.08648278894779274, "grad_norm": 0.28503385186195374, "learning_rate": 4.975909450115663e-05, "loss": 0.1118, "num_input_tokens_seen": 2294496, "step": 1770 }, { "epoch": 0.08672709061148706, "grad_norm": 0.2934000492095947, "learning_rate": 4.975773296606084e-05, "loss": 0.1381, "num_input_tokens_seen": 2300992, "step": 1775 }, { "epoch": 0.0869713922751814, "grad_norm": 0.8414890170097351, "learning_rate": 4.97563676130115e-05, "loss": 0.124, "num_input_tokens_seen": 2307264, "step": 1780 }, { "epoch": 0.08721569393887572, "grad_norm": 0.46648070216178894, "learning_rate": 4.9754998442219166e-05, "loss": 0.128, "num_input_tokens_seen": 2313568, "step": 1785 }, { "epoch": 0.08745999560257005, "grad_norm": 0.4186483919620514, "learning_rate": 4.9753625453894984e-05, "loss": 0.1139, "num_input_tokens_seen": 2319808, "step": 1790 }, { "epoch": 0.08770429726626439, "grad_norm": 0.35556352138519287, "learning_rate": 4.975224864825068e-05, "loss": 0.1401, "num_input_tokens_seen": 2326208, "step": 1795 }, { "epoch": 0.08794859892995871, "grad_norm": 0.4760929048061371, "learning_rate": 4.9750868025498576e-05, "loss": 0.0867, "num_input_tokens_seen": 2332544, "step": 1800 }, { "epoch": 0.08794859892995871, "eval_loss": 0.1143551766872406, "eval_runtime": 375.2711, "eval_samples_per_second": 96.957, "eval_steps_per_second": 24.241, "num_input_tokens_seen": 2332544, "step": 1800 }, { "epoch": 0.08819290059365305, "grad_norm": 0.2766828238964081, "learning_rate": 4.974948358585158e-05, "loss": 0.124, "num_input_tokens_seen": 2338784, "step": 1805 }, { "epoch": 0.08843720225734737, "grad_norm": 0.5105414390563965, "learning_rate": 4.9748095329523205e-05, "loss": 0.1232, "num_input_tokens_seen": 2345152, "step": 1810 }, { "epoch": 0.0886815039210417, "grad_norm": 0.49127939343452454, "learning_rate": 4.974670325672752e-05, "loss": 0.1089, "num_input_tokens_seen": 2351744, "step": 1815 }, { "epoch": 0.08892580558473603, "grad_norm": 0.6023644208908081, "learning_rate": 4.974530736767921e-05, "loss": 0.1026, "num_input_tokens_seen": 2358336, "step": 1820 }, { "epoch": 0.08917010724843036, "grad_norm": 0.3855728507041931, "learning_rate": 4.9743907662593524e-05, "loss": 0.1179, "num_input_tokens_seen": 2365056, "step": 1825 }, { "epoch": 0.0894144089121247, "grad_norm": 0.24879060685634613, "learning_rate": 4.974250414168633e-05, "loss": 0.1134, "num_input_tokens_seen": 2371424, "step": 1830 }, { "epoch": 0.08965871057581902, "grad_norm": 0.4857349097728729, "learning_rate": 4.974109680517407e-05, "loss": 0.115, "num_input_tokens_seen": 2378240, "step": 1835 }, { "epoch": 0.08990301223951336, "grad_norm": 0.3328619599342346, "learning_rate": 4.973968565327376e-05, "loss": 0.1136, "num_input_tokens_seen": 2384832, "step": 1840 }, { "epoch": 0.09014731390320768, "grad_norm": 0.17763246595859528, "learning_rate": 4.973827068620303e-05, "loss": 0.0832, "num_input_tokens_seen": 2391648, "step": 1845 }, { "epoch": 0.090391615566902, "grad_norm": 0.31517016887664795, "learning_rate": 4.973685190418008e-05, "loss": 0.093, "num_input_tokens_seen": 2397856, "step": 1850 }, { "epoch": 0.09063591723059634, "grad_norm": 0.4536077678203583, "learning_rate": 4.97354293074237e-05, "loss": 0.1184, "num_input_tokens_seen": 2404096, "step": 1855 }, { "epoch": 0.09088021889429067, "grad_norm": 0.3108906149864197, "learning_rate": 4.9734002896153276e-05, "loss": 0.1096, "num_input_tokens_seen": 2411008, "step": 1860 }, { "epoch": 0.091124520557985, "grad_norm": 0.26283735036849976, "learning_rate": 4.973257267058877e-05, "loss": 0.1218, "num_input_tokens_seen": 2417376, "step": 1865 }, { "epoch": 0.09136882222167933, "grad_norm": 0.2777526080608368, "learning_rate": 4.973113863095076e-05, "loss": 0.1117, "num_input_tokens_seen": 2423776, "step": 1870 }, { "epoch": 0.09161312388537365, "grad_norm": 0.20730936527252197, "learning_rate": 4.9729700777460384e-05, "loss": 0.1169, "num_input_tokens_seen": 2430560, "step": 1875 }, { "epoch": 0.09185742554906799, "grad_norm": 0.30349382758140564, "learning_rate": 4.972825911033937e-05, "loss": 0.1406, "num_input_tokens_seen": 2436864, "step": 1880 }, { "epoch": 0.09210172721276232, "grad_norm": 0.35597240924835205, "learning_rate": 4.9726813629810056e-05, "loss": 0.126, "num_input_tokens_seen": 2443136, "step": 1885 }, { "epoch": 0.09234602887645665, "grad_norm": 0.3540005683898926, "learning_rate": 4.9725364336095326e-05, "loss": 0.1111, "num_input_tokens_seen": 2449824, "step": 1890 }, { "epoch": 0.09259033054015098, "grad_norm": 0.5296183228492737, "learning_rate": 4.972391122941871e-05, "loss": 0.1116, "num_input_tokens_seen": 2456704, "step": 1895 }, { "epoch": 0.0928346322038453, "grad_norm": 0.26731249690055847, "learning_rate": 4.972245431000428e-05, "loss": 0.1014, "num_input_tokens_seen": 2463360, "step": 1900 }, { "epoch": 0.09307893386753964, "grad_norm": 0.7292331457138062, "learning_rate": 4.972099357807671e-05, "loss": 0.1268, "num_input_tokens_seen": 2470112, "step": 1905 }, { "epoch": 0.09332323553123396, "grad_norm": 0.6846416592597961, "learning_rate": 4.971952903386127e-05, "loss": 0.1213, "num_input_tokens_seen": 2476256, "step": 1910 }, { "epoch": 0.0935675371949283, "grad_norm": 0.5834847688674927, "learning_rate": 4.971806067758381e-05, "loss": 0.146, "num_input_tokens_seen": 2482336, "step": 1915 }, { "epoch": 0.09381183885862263, "grad_norm": 0.4531136453151703, "learning_rate": 4.971658850947076e-05, "loss": 0.0984, "num_input_tokens_seen": 2488320, "step": 1920 }, { "epoch": 0.09405614052231696, "grad_norm": 0.37176135182380676, "learning_rate": 4.9715112529749165e-05, "loss": 0.0936, "num_input_tokens_seen": 2494944, "step": 1925 }, { "epoch": 0.09430044218601129, "grad_norm": 0.6348533034324646, "learning_rate": 4.9713632738646624e-05, "loss": 0.1023, "num_input_tokens_seen": 2501568, "step": 1930 }, { "epoch": 0.09454474384970561, "grad_norm": 0.45230045914649963, "learning_rate": 4.971214913639134e-05, "loss": 0.0988, "num_input_tokens_seen": 2508224, "step": 1935 }, { "epoch": 0.09478904551339995, "grad_norm": 0.20600281655788422, "learning_rate": 4.9710661723212104e-05, "loss": 0.1139, "num_input_tokens_seen": 2515360, "step": 1940 }, { "epoch": 0.09503334717709427, "grad_norm": 0.4734657108783722, "learning_rate": 4.9709170499338295e-05, "loss": 0.1096, "num_input_tokens_seen": 2522080, "step": 1945 }, { "epoch": 0.09527764884078861, "grad_norm": 0.4797411859035492, "learning_rate": 4.9707675464999895e-05, "loss": 0.1101, "num_input_tokens_seen": 2528736, "step": 1950 }, { "epoch": 0.09552195050448294, "grad_norm": 0.4922773540019989, "learning_rate": 4.970617662042743e-05, "loss": 0.1033, "num_input_tokens_seen": 2534880, "step": 1955 }, { "epoch": 0.09576625216817726, "grad_norm": 0.36018142104148865, "learning_rate": 4.970467396585206e-05, "loss": 0.1083, "num_input_tokens_seen": 2541472, "step": 1960 }, { "epoch": 0.0960105538318716, "grad_norm": 0.3105052411556244, "learning_rate": 4.97031675015055e-05, "loss": 0.0916, "num_input_tokens_seen": 2548320, "step": 1965 }, { "epoch": 0.09625485549556592, "grad_norm": 0.6247346997261047, "learning_rate": 4.9701657227620075e-05, "loss": 0.0974, "num_input_tokens_seen": 2554944, "step": 1970 }, { "epoch": 0.09649915715926026, "grad_norm": 0.6833367943763733, "learning_rate": 4.9700143144428685e-05, "loss": 0.1162, "num_input_tokens_seen": 2561056, "step": 1975 }, { "epoch": 0.09674345882295458, "grad_norm": 0.31758812069892883, "learning_rate": 4.969862525216482e-05, "loss": 0.09, "num_input_tokens_seen": 2567488, "step": 1980 }, { "epoch": 0.09698776048664891, "grad_norm": 0.2805930972099304, "learning_rate": 4.9697103551062556e-05, "loss": 0.0984, "num_input_tokens_seen": 2574176, "step": 1985 }, { "epoch": 0.09723206215034325, "grad_norm": 0.5740191340446472, "learning_rate": 4.9695578041356565e-05, "loss": 0.1098, "num_input_tokens_seen": 2580992, "step": 1990 }, { "epoch": 0.09747636381403757, "grad_norm": 0.2029186189174652, "learning_rate": 4.969404872328209e-05, "loss": 0.1239, "num_input_tokens_seen": 2587712, "step": 1995 }, { "epoch": 0.09772066547773191, "grad_norm": 0.6568037867546082, "learning_rate": 4.969251559707498e-05, "loss": 0.1038, "num_input_tokens_seen": 2594720, "step": 2000 }, { "epoch": 0.09772066547773191, "eval_loss": 0.11247672885656357, "eval_runtime": 375.1865, "eval_samples_per_second": 96.978, "eval_steps_per_second": 24.247, "num_input_tokens_seen": 2594720, "step": 2000 }, { "epoch": 0.09796496714142623, "grad_norm": 0.32047000527381897, "learning_rate": 4.9690978662971674e-05, "loss": 0.1225, "num_input_tokens_seen": 2600928, "step": 2005 }, { "epoch": 0.09820926880512057, "grad_norm": 0.16640885174274445, "learning_rate": 4.968943792120916e-05, "loss": 0.1073, "num_input_tokens_seen": 2607488, "step": 2010 }, { "epoch": 0.0984535704688149, "grad_norm": 0.40855225920677185, "learning_rate": 4.9687893372025046e-05, "loss": 0.105, "num_input_tokens_seen": 2613984, "step": 2015 }, { "epoch": 0.09869787213250922, "grad_norm": 0.22109673917293549, "learning_rate": 4.9686345015657535e-05, "loss": 0.1009, "num_input_tokens_seen": 2620832, "step": 2020 }, { "epoch": 0.09894217379620356, "grad_norm": 0.16116957366466522, "learning_rate": 4.968479285234538e-05, "loss": 0.0913, "num_input_tokens_seen": 2627104, "step": 2025 }, { "epoch": 0.09918647545989788, "grad_norm": 0.5361090898513794, "learning_rate": 4.9683236882327974e-05, "loss": 0.091, "num_input_tokens_seen": 2633504, "step": 2030 }, { "epoch": 0.09943077712359222, "grad_norm": 0.4863675832748413, "learning_rate": 4.968167710584526e-05, "loss": 0.1087, "num_input_tokens_seen": 2639648, "step": 2035 }, { "epoch": 0.09967507878728654, "grad_norm": 0.4151296019554138, "learning_rate": 4.968011352313775e-05, "loss": 0.0877, "num_input_tokens_seen": 2645984, "step": 2040 }, { "epoch": 0.09991938045098087, "grad_norm": 0.2402655929327011, "learning_rate": 4.967854613444659e-05, "loss": 0.0919, "num_input_tokens_seen": 2652576, "step": 2045 }, { "epoch": 0.1001636821146752, "grad_norm": 0.42378661036491394, "learning_rate": 4.967697494001349e-05, "loss": 0.128, "num_input_tokens_seen": 2659008, "step": 2050 }, { "epoch": 0.10040798377836953, "grad_norm": 0.2899245321750641, "learning_rate": 4.9675399940080736e-05, "loss": 0.1441, "num_input_tokens_seen": 2665280, "step": 2055 }, { "epoch": 0.10065228544206387, "grad_norm": 0.3569321632385254, "learning_rate": 4.9673821134891226e-05, "loss": 0.1145, "num_input_tokens_seen": 2671584, "step": 2060 }, { "epoch": 0.10089658710575819, "grad_norm": 0.7115597128868103, "learning_rate": 4.967223852468842e-05, "loss": 0.1237, "num_input_tokens_seen": 2677792, "step": 2065 }, { "epoch": 0.10114088876945251, "grad_norm": 0.1377585232257843, "learning_rate": 4.967065210971639e-05, "loss": 0.0799, "num_input_tokens_seen": 2684192, "step": 2070 }, { "epoch": 0.10138519043314685, "grad_norm": 0.3476821482181549, "learning_rate": 4.966906189021977e-05, "loss": 0.1311, "num_input_tokens_seen": 2690624, "step": 2075 }, { "epoch": 0.10162949209684118, "grad_norm": 0.30361223220825195, "learning_rate": 4.966746786644379e-05, "loss": 0.1066, "num_input_tokens_seen": 2697088, "step": 2080 }, { "epoch": 0.10187379376053551, "grad_norm": 0.6206225752830505, "learning_rate": 4.966587003863429e-05, "loss": 0.1344, "num_input_tokens_seen": 2703232, "step": 2085 }, { "epoch": 0.10211809542422984, "grad_norm": 0.2770300507545471, "learning_rate": 4.966426840703765e-05, "loss": 0.1, "num_input_tokens_seen": 2709824, "step": 2090 }, { "epoch": 0.10236239708792418, "grad_norm": 0.42208847403526306, "learning_rate": 4.9662662971900875e-05, "loss": 0.1152, "num_input_tokens_seen": 2716032, "step": 2095 }, { "epoch": 0.1026066987516185, "grad_norm": 0.2312735915184021, "learning_rate": 4.9661053733471534e-05, "loss": 0.1222, "num_input_tokens_seen": 2722048, "step": 2100 }, { "epoch": 0.10285100041531282, "grad_norm": 0.42259305715560913, "learning_rate": 4.965944069199781e-05, "loss": 0.0967, "num_input_tokens_seen": 2728416, "step": 2105 }, { "epoch": 0.10309530207900716, "grad_norm": 0.32355839014053345, "learning_rate": 4.965782384772842e-05, "loss": 0.112, "num_input_tokens_seen": 2734720, "step": 2110 }, { "epoch": 0.10333960374270149, "grad_norm": 0.6239001154899597, "learning_rate": 4.9656203200912734e-05, "loss": 0.1115, "num_input_tokens_seen": 2741376, "step": 2115 }, { "epoch": 0.10358390540639582, "grad_norm": 0.2798544466495514, "learning_rate": 4.965457875180067e-05, "loss": 0.1131, "num_input_tokens_seen": 2747552, "step": 2120 }, { "epoch": 0.10382820707009015, "grad_norm": 0.22770383954048157, "learning_rate": 4.9652950500642724e-05, "loss": 0.1208, "num_input_tokens_seen": 2753824, "step": 2125 }, { "epoch": 0.10407250873378447, "grad_norm": 0.3251630961894989, "learning_rate": 4.965131844769001e-05, "loss": 0.1298, "num_input_tokens_seen": 2760256, "step": 2130 }, { "epoch": 0.10431681039747881, "grad_norm": 0.28261303901672363, "learning_rate": 4.96496825931942e-05, "loss": 0.1245, "num_input_tokens_seen": 2767264, "step": 2135 }, { "epoch": 0.10456111206117313, "grad_norm": 0.32841169834136963, "learning_rate": 4.9648042937407566e-05, "loss": 0.0738, "num_input_tokens_seen": 2773920, "step": 2140 }, { "epoch": 0.10480541372486747, "grad_norm": 0.7063871026039124, "learning_rate": 4.964639948058297e-05, "loss": 0.0935, "num_input_tokens_seen": 2780352, "step": 2145 }, { "epoch": 0.1050497153885618, "grad_norm": 0.2859821319580078, "learning_rate": 4.9644752222973846e-05, "loss": 0.0808, "num_input_tokens_seen": 2787584, "step": 2150 }, { "epoch": 0.10529401705225612, "grad_norm": 0.5543833374977112, "learning_rate": 4.964310116483422e-05, "loss": 0.1152, "num_input_tokens_seen": 2793888, "step": 2155 }, { "epoch": 0.10553831871595046, "grad_norm": 0.6162224411964417, "learning_rate": 4.964144630641872e-05, "loss": 0.171, "num_input_tokens_seen": 2800480, "step": 2160 }, { "epoch": 0.10578262037964478, "grad_norm": 0.3495533764362335, "learning_rate": 4.9639787647982525e-05, "loss": 0.1029, "num_input_tokens_seen": 2806944, "step": 2165 }, { "epoch": 0.10602692204333912, "grad_norm": 0.2763596773147583, "learning_rate": 4.963812518978143e-05, "loss": 0.0914, "num_input_tokens_seen": 2814176, "step": 2170 }, { "epoch": 0.10627122370703344, "grad_norm": 0.19909362494945526, "learning_rate": 4.963645893207182e-05, "loss": 0.121, "num_input_tokens_seen": 2820768, "step": 2175 }, { "epoch": 0.10651552537072777, "grad_norm": 0.8844020366668701, "learning_rate": 4.963478887511063e-05, "loss": 0.108, "num_input_tokens_seen": 2827424, "step": 2180 }, { "epoch": 0.1067598270344221, "grad_norm": 0.49041569232940674, "learning_rate": 4.963311501915542e-05, "loss": 0.1279, "num_input_tokens_seen": 2833920, "step": 2185 }, { "epoch": 0.10700412869811643, "grad_norm": 0.4018978774547577, "learning_rate": 4.963143736446432e-05, "loss": 0.1145, "num_input_tokens_seen": 2840352, "step": 2190 }, { "epoch": 0.10724843036181077, "grad_norm": 0.8591128587722778, "learning_rate": 4.962975591129603e-05, "loss": 0.1193, "num_input_tokens_seen": 2847456, "step": 2195 }, { "epoch": 0.10749273202550509, "grad_norm": 0.25134462118148804, "learning_rate": 4.962807065990986e-05, "loss": 0.1141, "num_input_tokens_seen": 2853856, "step": 2200 }, { "epoch": 0.10749273202550509, "eval_loss": 0.1110469400882721, "eval_runtime": 375.187, "eval_samples_per_second": 96.978, "eval_steps_per_second": 24.247, "num_input_tokens_seen": 2853856, "step": 2200 }, { "epoch": 0.10773703368919943, "grad_norm": 0.27591672539711, "learning_rate": 4.9626381610565714e-05, "loss": 0.1058, "num_input_tokens_seen": 2860320, "step": 2205 }, { "epoch": 0.10798133535289375, "grad_norm": 0.36234596371650696, "learning_rate": 4.9624688763524043e-05, "loss": 0.0956, "num_input_tokens_seen": 2866432, "step": 2210 }, { "epoch": 0.10822563701658808, "grad_norm": 0.3157033324241638, "learning_rate": 4.962299211904591e-05, "loss": 0.0953, "num_input_tokens_seen": 2872864, "step": 2215 }, { "epoch": 0.10846993868028242, "grad_norm": 0.29220831394195557, "learning_rate": 4.962129167739296e-05, "loss": 0.1116, "num_input_tokens_seen": 2879296, "step": 2220 }, { "epoch": 0.10871424034397674, "grad_norm": 0.23068754374980927, "learning_rate": 4.961958743882742e-05, "loss": 0.1092, "num_input_tokens_seen": 2885344, "step": 2225 }, { "epoch": 0.10895854200767108, "grad_norm": 0.3926211893558502, "learning_rate": 4.961787940361211e-05, "loss": 0.1208, "num_input_tokens_seen": 2891744, "step": 2230 }, { "epoch": 0.1092028436713654, "grad_norm": 0.2874603867530823, "learning_rate": 4.961616757201043e-05, "loss": 0.1344, "num_input_tokens_seen": 2898368, "step": 2235 }, { "epoch": 0.10944714533505973, "grad_norm": 0.35483625531196594, "learning_rate": 4.961445194428637e-05, "loss": 0.1083, "num_input_tokens_seen": 2904640, "step": 2240 }, { "epoch": 0.10969144699875406, "grad_norm": 0.9282806515693665, "learning_rate": 4.9612732520704486e-05, "loss": 0.135, "num_input_tokens_seen": 2910880, "step": 2245 }, { "epoch": 0.10993574866244839, "grad_norm": 0.31976518034935, "learning_rate": 4.961100930152994e-05, "loss": 0.1075, "num_input_tokens_seen": 2917376, "step": 2250 }, { "epoch": 0.11018005032614273, "grad_norm": 0.2711617946624756, "learning_rate": 4.960928228702849e-05, "loss": 0.1152, "num_input_tokens_seen": 2923936, "step": 2255 }, { "epoch": 0.11042435198983705, "grad_norm": 0.37334439158439636, "learning_rate": 4.960755147746645e-05, "loss": 0.1442, "num_input_tokens_seen": 2930784, "step": 2260 }, { "epoch": 0.11066865365353137, "grad_norm": 0.45091140270233154, "learning_rate": 4.9605816873110736e-05, "loss": 0.1308, "num_input_tokens_seen": 2937056, "step": 2265 }, { "epoch": 0.11091295531722571, "grad_norm": 0.47351792454719543, "learning_rate": 4.960407847422883e-05, "loss": 0.1019, "num_input_tokens_seen": 2943648, "step": 2270 }, { "epoch": 0.11115725698092004, "grad_norm": 0.39979615807533264, "learning_rate": 4.960233628108885e-05, "loss": 0.0944, "num_input_tokens_seen": 2950816, "step": 2275 }, { "epoch": 0.11140155864461437, "grad_norm": 0.2776520848274231, "learning_rate": 4.960059029395942e-05, "loss": 0.0983, "num_input_tokens_seen": 2957088, "step": 2280 }, { "epoch": 0.1116458603083087, "grad_norm": 0.4199182689189911, "learning_rate": 4.959884051310983e-05, "loss": 0.0988, "num_input_tokens_seen": 2963680, "step": 2285 }, { "epoch": 0.11189016197200304, "grad_norm": 0.29123207926750183, "learning_rate": 4.959708693880991e-05, "loss": 0.1154, "num_input_tokens_seen": 2970016, "step": 2290 }, { "epoch": 0.11213446363569736, "grad_norm": 0.5552732944488525, "learning_rate": 4.9595329571330074e-05, "loss": 0.072, "num_input_tokens_seen": 2976544, "step": 2295 }, { "epoch": 0.11237876529939168, "grad_norm": 0.4160250723361969, "learning_rate": 4.9593568410941326e-05, "loss": 0.1083, "num_input_tokens_seen": 2982688, "step": 2300 }, { "epoch": 0.11262306696308602, "grad_norm": 0.37391069531440735, "learning_rate": 4.959180345791528e-05, "loss": 0.1349, "num_input_tokens_seen": 2988960, "step": 2305 }, { "epoch": 0.11286736862678035, "grad_norm": 0.6179954409599304, "learning_rate": 4.9590034712524086e-05, "loss": 0.1114, "num_input_tokens_seen": 2995328, "step": 2310 }, { "epoch": 0.11311167029047468, "grad_norm": 0.33775007724761963, "learning_rate": 4.958826217504053e-05, "loss": 0.1027, "num_input_tokens_seen": 3002240, "step": 2315 }, { "epoch": 0.11335597195416901, "grad_norm": 1.3158366680145264, "learning_rate": 4.958648584573795e-05, "loss": 0.1338, "num_input_tokens_seen": 3009408, "step": 2320 }, { "epoch": 0.11360027361786333, "grad_norm": 0.6338576078414917, "learning_rate": 4.958470572489028e-05, "loss": 0.1193, "num_input_tokens_seen": 3015488, "step": 2325 }, { "epoch": 0.11384457528155767, "grad_norm": 0.15295006334781647, "learning_rate": 4.958292181277203e-05, "loss": 0.1107, "num_input_tokens_seen": 3022528, "step": 2330 }, { "epoch": 0.114088876945252, "grad_norm": 0.8060551285743713, "learning_rate": 4.958113410965832e-05, "loss": 0.1094, "num_input_tokens_seen": 3029088, "step": 2335 }, { "epoch": 0.11433317860894633, "grad_norm": 0.6259481906890869, "learning_rate": 4.957934261582481e-05, "loss": 0.1123, "num_input_tokens_seen": 3035648, "step": 2340 }, { "epoch": 0.11457748027264066, "grad_norm": 0.260140597820282, "learning_rate": 4.95775473315478e-05, "loss": 0.0924, "num_input_tokens_seen": 3042016, "step": 2345 }, { "epoch": 0.11482178193633498, "grad_norm": 0.21781635284423828, "learning_rate": 4.9575748257104124e-05, "loss": 0.1078, "num_input_tokens_seen": 3048928, "step": 2350 }, { "epoch": 0.11506608360002932, "grad_norm": 0.2835567891597748, "learning_rate": 4.9573945392771224e-05, "loss": 0.0947, "num_input_tokens_seen": 3055168, "step": 2355 }, { "epoch": 0.11531038526372364, "grad_norm": 0.22197020053863525, "learning_rate": 4.9572138738827134e-05, "loss": 0.1397, "num_input_tokens_seen": 3061120, "step": 2360 }, { "epoch": 0.11555468692741798, "grad_norm": 0.18790999054908752, "learning_rate": 4.957032829555046e-05, "loss": 0.0916, "num_input_tokens_seen": 3067584, "step": 2365 }, { "epoch": 0.1157989885911123, "grad_norm": 0.291366308927536, "learning_rate": 4.956851406322039e-05, "loss": 0.1063, "num_input_tokens_seen": 3073728, "step": 2370 }, { "epoch": 0.11604329025480664, "grad_norm": 0.3768998980522156, "learning_rate": 4.9566696042116704e-05, "loss": 0.0695, "num_input_tokens_seen": 3080416, "step": 2375 }, { "epoch": 0.11628759191850097, "grad_norm": 0.16083434224128723, "learning_rate": 4.9564874232519766e-05, "loss": 0.0844, "num_input_tokens_seen": 3086912, "step": 2380 }, { "epoch": 0.11653189358219529, "grad_norm": 0.39923274517059326, "learning_rate": 4.9563048634710516e-05, "loss": 0.092, "num_input_tokens_seen": 3093472, "step": 2385 }, { "epoch": 0.11677619524588963, "grad_norm": 0.3909435570240021, "learning_rate": 4.956121924897049e-05, "loss": 0.1046, "num_input_tokens_seen": 3100096, "step": 2390 }, { "epoch": 0.11702049690958395, "grad_norm": 0.5854181051254272, "learning_rate": 4.955938607558181e-05, "loss": 0.1093, "num_input_tokens_seen": 3106528, "step": 2395 }, { "epoch": 0.11726479857327829, "grad_norm": 0.34343552589416504, "learning_rate": 4.955754911482715e-05, "loss": 0.1021, "num_input_tokens_seen": 3112928, "step": 2400 }, { "epoch": 0.11726479857327829, "eval_loss": 0.10908334702253342, "eval_runtime": 374.0613, "eval_samples_per_second": 97.27, "eval_steps_per_second": 24.32, "num_input_tokens_seen": 3112928, "step": 2400 }, { "epoch": 0.11750910023697261, "grad_norm": 2.899653434753418, "learning_rate": 4.9555708366989804e-05, "loss": 0.1378, "num_input_tokens_seen": 3119904, "step": 2405 }, { "epoch": 0.11775340190066694, "grad_norm": 0.4083545207977295, "learning_rate": 4.9553863832353655e-05, "loss": 0.0765, "num_input_tokens_seen": 3126336, "step": 2410 }, { "epoch": 0.11799770356436128, "grad_norm": 0.4240977168083191, "learning_rate": 4.955201551120313e-05, "loss": 0.1161, "num_input_tokens_seen": 3132352, "step": 2415 }, { "epoch": 0.1182420052280556, "grad_norm": 0.7656815648078918, "learning_rate": 4.955016340382328e-05, "loss": 0.1059, "num_input_tokens_seen": 3138560, "step": 2420 }, { "epoch": 0.11848630689174994, "grad_norm": 0.20192773640155792, "learning_rate": 4.954830751049972e-05, "loss": 0.0923, "num_input_tokens_seen": 3145120, "step": 2425 }, { "epoch": 0.11873060855544426, "grad_norm": 0.511275053024292, "learning_rate": 4.954644783151864e-05, "loss": 0.1192, "num_input_tokens_seen": 3151776, "step": 2430 }, { "epoch": 0.11897491021913859, "grad_norm": 0.4316409230232239, "learning_rate": 4.954458436716684e-05, "loss": 0.1048, "num_input_tokens_seen": 3158496, "step": 2435 }, { "epoch": 0.11921921188283292, "grad_norm": 0.3730885088443756, "learning_rate": 4.954271711773168e-05, "loss": 0.0919, "num_input_tokens_seen": 3164640, "step": 2440 }, { "epoch": 0.11946351354652725, "grad_norm": 0.3955337405204773, "learning_rate": 4.9540846083501115e-05, "loss": 0.0852, "num_input_tokens_seen": 3170784, "step": 2445 }, { "epoch": 0.11970781521022159, "grad_norm": 0.6891440153121948, "learning_rate": 4.953897126476369e-05, "loss": 0.1163, "num_input_tokens_seen": 3177088, "step": 2450 }, { "epoch": 0.11995211687391591, "grad_norm": 0.28474563360214233, "learning_rate": 4.9537092661808514e-05, "loss": 0.12, "num_input_tokens_seen": 3183776, "step": 2455 }, { "epoch": 0.12019641853761025, "grad_norm": 0.749542236328125, "learning_rate": 4.9535210274925306e-05, "loss": 0.1018, "num_input_tokens_seen": 3190368, "step": 2460 }, { "epoch": 0.12044072020130457, "grad_norm": 0.5006524920463562, "learning_rate": 4.953332410440435e-05, "loss": 0.0815, "num_input_tokens_seen": 3196384, "step": 2465 }, { "epoch": 0.1206850218649989, "grad_norm": 0.8808647990226746, "learning_rate": 4.9531434150536496e-05, "loss": 0.1153, "num_input_tokens_seen": 3202944, "step": 2470 }, { "epoch": 0.12092932352869323, "grad_norm": 0.21755556762218475, "learning_rate": 4.952954041361322e-05, "loss": 0.1244, "num_input_tokens_seen": 3209376, "step": 2475 }, { "epoch": 0.12117362519238756, "grad_norm": 0.4461488723754883, "learning_rate": 4.952764289392655e-05, "loss": 0.112, "num_input_tokens_seen": 3215744, "step": 2480 }, { "epoch": 0.1214179268560819, "grad_norm": 0.2450903356075287, "learning_rate": 4.952574159176912e-05, "loss": 0.1081, "num_input_tokens_seen": 3222144, "step": 2485 }, { "epoch": 0.12166222851977622, "grad_norm": 0.3190363943576813, "learning_rate": 4.952383650743413e-05, "loss": 0.1044, "num_input_tokens_seen": 3228672, "step": 2490 }, { "epoch": 0.12190653018347054, "grad_norm": 0.3846518397331238, "learning_rate": 4.952192764121536e-05, "loss": 0.1008, "num_input_tokens_seen": 3235264, "step": 2495 }, { "epoch": 0.12215083184716488, "grad_norm": 0.3253364861011505, "learning_rate": 4.9520014993407185e-05, "loss": 0.1147, "num_input_tokens_seen": 3241760, "step": 2500 }, { "epoch": 0.1223951335108592, "grad_norm": 0.14753422141075134, "learning_rate": 4.951809856430456e-05, "loss": 0.092, "num_input_tokens_seen": 3248512, "step": 2505 }, { "epoch": 0.12263943517455354, "grad_norm": 0.4583525061607361, "learning_rate": 4.951617835420303e-05, "loss": 0.1111, "num_input_tokens_seen": 3254720, "step": 2510 }, { "epoch": 0.12288373683824787, "grad_norm": 0.3161800503730774, "learning_rate": 4.951425436339869e-05, "loss": 0.1063, "num_input_tokens_seen": 3261408, "step": 2515 }, { "epoch": 0.12312803850194219, "grad_norm": 0.6954147219657898, "learning_rate": 4.9512326592188274e-05, "loss": 0.1098, "num_input_tokens_seen": 3267808, "step": 2520 }, { "epoch": 0.12337234016563653, "grad_norm": 0.4778003692626953, "learning_rate": 4.9510395040869054e-05, "loss": 0.1252, "num_input_tokens_seen": 3274176, "step": 2525 }, { "epoch": 0.12361664182933085, "grad_norm": 0.5424783825874329, "learning_rate": 4.9508459709738905e-05, "loss": 0.1044, "num_input_tokens_seen": 3281440, "step": 2530 }, { "epoch": 0.12386094349302519, "grad_norm": 0.1634962260723114, "learning_rate": 4.950652059909627e-05, "loss": 0.1245, "num_input_tokens_seen": 3288352, "step": 2535 }, { "epoch": 0.12410524515671952, "grad_norm": 0.6924607157707214, "learning_rate": 4.95045777092402e-05, "loss": 0.0981, "num_input_tokens_seen": 3294848, "step": 2540 }, { "epoch": 0.12434954682041385, "grad_norm": 0.33260148763656616, "learning_rate": 4.950263104047031e-05, "loss": 0.1078, "num_input_tokens_seen": 3301536, "step": 2545 }, { "epoch": 0.12459384848410818, "grad_norm": 0.41015586256980896, "learning_rate": 4.9500680593086775e-05, "loss": 0.0998, "num_input_tokens_seen": 3308000, "step": 2550 }, { "epoch": 0.1248381501478025, "grad_norm": 0.7595974206924438, "learning_rate": 4.94987263673904e-05, "loss": 0.1187, "num_input_tokens_seen": 3314528, "step": 2555 }, { "epoch": 0.12508245181149683, "grad_norm": 0.3328864872455597, "learning_rate": 4.949676836368256e-05, "loss": 0.112, "num_input_tokens_seen": 3321248, "step": 2560 }, { "epoch": 0.12532675347519118, "grad_norm": 0.4019787311553955, "learning_rate": 4.949480658226518e-05, "loss": 0.1225, "num_input_tokens_seen": 3327680, "step": 2565 }, { "epoch": 0.1255710551388855, "grad_norm": 0.719825029373169, "learning_rate": 4.949284102344082e-05, "loss": 0.0765, "num_input_tokens_seen": 3334016, "step": 2570 }, { "epoch": 0.12581535680257983, "grad_norm": 0.24257870018482208, "learning_rate": 4.9490871687512565e-05, "loss": 0.0859, "num_input_tokens_seen": 3340960, "step": 2575 }, { "epoch": 0.12605965846627415, "grad_norm": 0.2587718367576599, "learning_rate": 4.948889857478413e-05, "loss": 0.11, "num_input_tokens_seen": 3347488, "step": 2580 }, { "epoch": 0.12630396012996847, "grad_norm": 0.1687050610780716, "learning_rate": 4.948692168555978e-05, "loss": 0.0916, "num_input_tokens_seen": 3353792, "step": 2585 }, { "epoch": 0.12654826179366283, "grad_norm": 0.4405125379562378, "learning_rate": 4.94849410201444e-05, "loss": 0.1031, "num_input_tokens_seen": 3360864, "step": 2590 }, { "epoch": 0.12679256345735715, "grad_norm": 0.24648703634738922, "learning_rate": 4.948295657884341e-05, "loss": 0.0942, "num_input_tokens_seen": 3367680, "step": 2595 }, { "epoch": 0.12703686512105147, "grad_norm": 0.4259985387325287, "learning_rate": 4.9480968361962835e-05, "loss": 0.1091, "num_input_tokens_seen": 3374048, "step": 2600 }, { "epoch": 0.12703686512105147, "eval_loss": 0.10898370295763016, "eval_runtime": 375.4631, "eval_samples_per_second": 96.907, "eval_steps_per_second": 24.229, "num_input_tokens_seen": 3374048, "step": 2600 }, { "epoch": 0.1272811667847458, "grad_norm": 0.19760194420814514, "learning_rate": 4.9478976369809305e-05, "loss": 0.1189, "num_input_tokens_seen": 3380288, "step": 2605 }, { "epoch": 0.12752546844844012, "grad_norm": 0.49808987975120544, "learning_rate": 4.947698060268999e-05, "loss": 0.1123, "num_input_tokens_seen": 3387040, "step": 2610 }, { "epoch": 0.12776977011213447, "grad_norm": 0.7513079643249512, "learning_rate": 4.9474981060912665e-05, "loss": 0.1088, "num_input_tokens_seen": 3393920, "step": 2615 }, { "epoch": 0.1280140717758288, "grad_norm": 0.7433413863182068, "learning_rate": 4.94729777447857e-05, "loss": 0.1223, "num_input_tokens_seen": 3400896, "step": 2620 }, { "epoch": 0.12825837343952312, "grad_norm": 0.3235704004764557, "learning_rate": 4.947097065461801e-05, "loss": 0.1245, "num_input_tokens_seen": 3407392, "step": 2625 }, { "epoch": 0.12850267510321745, "grad_norm": 0.4167018234729767, "learning_rate": 4.9468959790719125e-05, "loss": 0.0781, "num_input_tokens_seen": 3414496, "step": 2630 }, { "epoch": 0.12874697676691177, "grad_norm": 0.6193057894706726, "learning_rate": 4.9466945153399146e-05, "loss": 0.0964, "num_input_tokens_seen": 3420800, "step": 2635 }, { "epoch": 0.12899127843060612, "grad_norm": 0.8685640692710876, "learning_rate": 4.9464926742968755e-05, "loss": 0.1508, "num_input_tokens_seen": 3427712, "step": 2640 }, { "epoch": 0.12923558009430045, "grad_norm": 0.28778019547462463, "learning_rate": 4.946290455973921e-05, "loss": 0.13, "num_input_tokens_seen": 3434400, "step": 2645 }, { "epoch": 0.12947988175799477, "grad_norm": 0.3853325843811035, "learning_rate": 4.9460878604022365e-05, "loss": 0.0712, "num_input_tokens_seen": 3441088, "step": 2650 }, { "epoch": 0.1297241834216891, "grad_norm": 0.6106685400009155, "learning_rate": 4.945884887613065e-05, "loss": 0.0935, "num_input_tokens_seen": 3447968, "step": 2655 }, { "epoch": 0.12996848508538345, "grad_norm": 0.20751303434371948, "learning_rate": 4.9456815376377055e-05, "loss": 0.1016, "num_input_tokens_seen": 3454080, "step": 2660 }, { "epoch": 0.13021278674907777, "grad_norm": 0.3016695976257324, "learning_rate": 4.9454778105075195e-05, "loss": 0.1103, "num_input_tokens_seen": 3460608, "step": 2665 }, { "epoch": 0.1304570884127721, "grad_norm": 0.3965684473514557, "learning_rate": 4.945273706253924e-05, "loss": 0.1277, "num_input_tokens_seen": 3466976, "step": 2670 }, { "epoch": 0.13070139007646642, "grad_norm": 0.3191719055175781, "learning_rate": 4.9450692249083925e-05, "loss": 0.1149, "num_input_tokens_seen": 3473632, "step": 2675 }, { "epoch": 0.13094569174016074, "grad_norm": 0.3440818786621094, "learning_rate": 4.9448643665024605e-05, "loss": 0.1, "num_input_tokens_seen": 3480160, "step": 2680 }, { "epoch": 0.1311899934038551, "grad_norm": 0.2801845967769623, "learning_rate": 4.944659131067719e-05, "loss": 0.12, "num_input_tokens_seen": 3486784, "step": 2685 }, { "epoch": 0.13143429506754942, "grad_norm": 0.24411845207214355, "learning_rate": 4.944453518635818e-05, "loss": 0.105, "num_input_tokens_seen": 3493376, "step": 2690 }, { "epoch": 0.13167859673124374, "grad_norm": 0.24412250518798828, "learning_rate": 4.944247529238465e-05, "loss": 0.0961, "num_input_tokens_seen": 3500128, "step": 2695 }, { "epoch": 0.13192289839493807, "grad_norm": 0.28985652327537537, "learning_rate": 4.944041162907427e-05, "loss": 0.127, "num_input_tokens_seen": 3507264, "step": 2700 }, { "epoch": 0.1321672000586324, "grad_norm": 0.24599529802799225, "learning_rate": 4.943834419674529e-05, "loss": 0.0856, "num_input_tokens_seen": 3513664, "step": 2705 }, { "epoch": 0.13241150172232674, "grad_norm": 0.6879613399505615, "learning_rate": 4.9436272995716506e-05, "loss": 0.1031, "num_input_tokens_seen": 3520256, "step": 2710 }, { "epoch": 0.13265580338602107, "grad_norm": 0.19225242733955383, "learning_rate": 4.943419802630735e-05, "loss": 0.0946, "num_input_tokens_seen": 3526272, "step": 2715 }, { "epoch": 0.1329001050497154, "grad_norm": 0.35074564814567566, "learning_rate": 4.94321192888378e-05, "loss": 0.1096, "num_input_tokens_seen": 3532672, "step": 2720 }, { "epoch": 0.1331444067134097, "grad_norm": 0.46173757314682007, "learning_rate": 4.943003678362842e-05, "loss": 0.1287, "num_input_tokens_seen": 3538688, "step": 2725 }, { "epoch": 0.13338870837710404, "grad_norm": 0.2687634527683258, "learning_rate": 4.942795051100036e-05, "loss": 0.0786, "num_input_tokens_seen": 3545056, "step": 2730 }, { "epoch": 0.1336330100407984, "grad_norm": 0.21908371150493622, "learning_rate": 4.942586047127536e-05, "loss": 0.0814, "num_input_tokens_seen": 3551584, "step": 2735 }, { "epoch": 0.1338773117044927, "grad_norm": 0.384898841381073, "learning_rate": 4.942376666477571e-05, "loss": 0.1181, "num_input_tokens_seen": 3557888, "step": 2740 }, { "epoch": 0.13412161336818704, "grad_norm": 0.16934804618358612, "learning_rate": 4.9421669091824304e-05, "loss": 0.0967, "num_input_tokens_seen": 3564704, "step": 2745 }, { "epoch": 0.13436591503188136, "grad_norm": 0.3291660249233246, "learning_rate": 4.9419567752744634e-05, "loss": 0.14, "num_input_tokens_seen": 3571744, "step": 2750 }, { "epoch": 0.13461021669557569, "grad_norm": 0.32547396421432495, "learning_rate": 4.941746264786074e-05, "loss": 0.1172, "num_input_tokens_seen": 3578048, "step": 2755 }, { "epoch": 0.13485451835927004, "grad_norm": 0.3035357892513275, "learning_rate": 4.9415353777497254e-05, "loss": 0.1064, "num_input_tokens_seen": 3584416, "step": 2760 }, { "epoch": 0.13509882002296436, "grad_norm": 0.4989929497241974, "learning_rate": 4.9413241141979394e-05, "loss": 0.0832, "num_input_tokens_seen": 3591168, "step": 2765 }, { "epoch": 0.13534312168665869, "grad_norm": 0.17996270954608917, "learning_rate": 4.9411124741632956e-05, "loss": 0.0847, "num_input_tokens_seen": 3598240, "step": 2770 }, { "epoch": 0.135587423350353, "grad_norm": 0.2633976638317108, "learning_rate": 4.940900457678431e-05, "loss": 0.1149, "num_input_tokens_seen": 3604512, "step": 2775 }, { "epoch": 0.13583172501404733, "grad_norm": 0.2827676832675934, "learning_rate": 4.9406880647760425e-05, "loss": 0.104, "num_input_tokens_seen": 3610688, "step": 2780 }, { "epoch": 0.13607602667774168, "grad_norm": 0.4741869568824768, "learning_rate": 4.9404752954888824e-05, "loss": 0.1197, "num_input_tokens_seen": 3617056, "step": 2785 }, { "epoch": 0.136320328341436, "grad_norm": 0.20409773290157318, "learning_rate": 4.940262149849762e-05, "loss": 0.0838, "num_input_tokens_seen": 3623872, "step": 2790 }, { "epoch": 0.13656463000513033, "grad_norm": 0.2955840229988098, "learning_rate": 4.9400486278915526e-05, "loss": 0.0973, "num_input_tokens_seen": 3630976, "step": 2795 }, { "epoch": 0.13680893166882466, "grad_norm": 0.7070250511169434, "learning_rate": 4.939834729647181e-05, "loss": 0.0882, "num_input_tokens_seen": 3637152, "step": 2800 }, { "epoch": 0.13680893166882466, "eval_loss": 0.1059521958231926, "eval_runtime": 374.9064, "eval_samples_per_second": 97.051, "eval_steps_per_second": 24.265, "num_input_tokens_seen": 3637152, "step": 2800 }, { "epoch": 0.13705323333251898, "grad_norm": 0.21789734065532684, "learning_rate": 4.9396204551496326e-05, "loss": 0.0977, "num_input_tokens_seen": 3643488, "step": 2805 }, { "epoch": 0.13729753499621333, "grad_norm": 0.21557167172431946, "learning_rate": 4.939405804431952e-05, "loss": 0.0979, "num_input_tokens_seen": 3650144, "step": 2810 }, { "epoch": 0.13754183665990766, "grad_norm": 0.35683897137641907, "learning_rate": 4.9391907775272414e-05, "loss": 0.0963, "num_input_tokens_seen": 3656544, "step": 2815 }, { "epoch": 0.13778613832360198, "grad_norm": 0.2595210373401642, "learning_rate": 4.9389753744686604e-05, "loss": 0.143, "num_input_tokens_seen": 3662848, "step": 2820 }, { "epoch": 0.1380304399872963, "grad_norm": 0.7544244527816772, "learning_rate": 4.938759595289426e-05, "loss": 0.0836, "num_input_tokens_seen": 3669696, "step": 2825 }, { "epoch": 0.13827474165099066, "grad_norm": 0.2535622715950012, "learning_rate": 4.938543440022815e-05, "loss": 0.1273, "num_input_tokens_seen": 3676096, "step": 2830 }, { "epoch": 0.13851904331468498, "grad_norm": 0.17676971852779388, "learning_rate": 4.938326908702161e-05, "loss": 0.0826, "num_input_tokens_seen": 3682656, "step": 2835 }, { "epoch": 0.1387633449783793, "grad_norm": 0.3538903594017029, "learning_rate": 4.9381100013608554e-05, "loss": 0.1074, "num_input_tokens_seen": 3688928, "step": 2840 }, { "epoch": 0.13900764664207363, "grad_norm": 0.2835029065608978, "learning_rate": 4.9378927180323485e-05, "loss": 0.1035, "num_input_tokens_seen": 3695840, "step": 2845 }, { "epoch": 0.13925194830576795, "grad_norm": 0.14181870222091675, "learning_rate": 4.937675058750148e-05, "loss": 0.0752, "num_input_tokens_seen": 3702080, "step": 2850 }, { "epoch": 0.1394962499694623, "grad_norm": 0.29517894983291626, "learning_rate": 4.937457023547819e-05, "loss": 0.0982, "num_input_tokens_seen": 3708928, "step": 2855 }, { "epoch": 0.13974055163315663, "grad_norm": 0.5180371999740601, "learning_rate": 4.9372386124589876e-05, "loss": 0.0713, "num_input_tokens_seen": 3715648, "step": 2860 }, { "epoch": 0.13998485329685095, "grad_norm": 0.2214623987674713, "learning_rate": 4.937019825517333e-05, "loss": 0.1092, "num_input_tokens_seen": 3721984, "step": 2865 }, { "epoch": 0.14022915496054528, "grad_norm": 0.18132929503917694, "learning_rate": 4.9368006627565954e-05, "loss": 0.1113, "num_input_tokens_seen": 3728384, "step": 2870 }, { "epoch": 0.1404734566242396, "grad_norm": 0.3473208546638489, "learning_rate": 4.936581124210573e-05, "loss": 0.1321, "num_input_tokens_seen": 3734656, "step": 2875 }, { "epoch": 0.14071775828793395, "grad_norm": 0.4239959716796875, "learning_rate": 4.9363612099131216e-05, "loss": 0.0744, "num_input_tokens_seen": 3741120, "step": 2880 }, { "epoch": 0.14096205995162828, "grad_norm": 0.1716606467962265, "learning_rate": 4.936140919898155e-05, "loss": 0.1042, "num_input_tokens_seen": 3747488, "step": 2885 }, { "epoch": 0.1412063616153226, "grad_norm": 0.2317160964012146, "learning_rate": 4.9359202541996426e-05, "loss": 0.0853, "num_input_tokens_seen": 3754176, "step": 2890 }, { "epoch": 0.14145066327901692, "grad_norm": 0.17952261865139008, "learning_rate": 4.935699212851616e-05, "loss": 0.0886, "num_input_tokens_seen": 3760128, "step": 2895 }, { "epoch": 0.14169496494271125, "grad_norm": 0.27974775433540344, "learning_rate": 4.935477795888162e-05, "loss": 0.0862, "num_input_tokens_seen": 3766688, "step": 2900 }, { "epoch": 0.1419392666064056, "grad_norm": 0.20051373541355133, "learning_rate": 4.935256003343426e-05, "loss": 0.0891, "num_input_tokens_seen": 3773024, "step": 2905 }, { "epoch": 0.14218356827009992, "grad_norm": 0.36654072999954224, "learning_rate": 4.93503383525161e-05, "loss": 0.1338, "num_input_tokens_seen": 3779328, "step": 2910 }, { "epoch": 0.14242786993379425, "grad_norm": 0.26979678869247437, "learning_rate": 4.934811291646977e-05, "loss": 0.1312, "num_input_tokens_seen": 3785248, "step": 2915 }, { "epoch": 0.14267217159748857, "grad_norm": 0.548829197883606, "learning_rate": 4.934588372563845e-05, "loss": 0.1005, "num_input_tokens_seen": 3791552, "step": 2920 }, { "epoch": 0.1429164732611829, "grad_norm": 0.35269302129745483, "learning_rate": 4.93436507803659e-05, "loss": 0.0904, "num_input_tokens_seen": 3797952, "step": 2925 }, { "epoch": 0.14316077492487725, "grad_norm": 0.4241536557674408, "learning_rate": 4.934141408099649e-05, "loss": 0.1019, "num_input_tokens_seen": 3804064, "step": 2930 }, { "epoch": 0.14340507658857157, "grad_norm": 0.5729320645332336, "learning_rate": 4.9339173627875135e-05, "loss": 0.1129, "num_input_tokens_seen": 3810880, "step": 2935 }, { "epoch": 0.1436493782522659, "grad_norm": 0.39077186584472656, "learning_rate": 4.9336929421347335e-05, "loss": 0.095, "num_input_tokens_seen": 3817600, "step": 2940 }, { "epoch": 0.14389367991596022, "grad_norm": 0.3870016634464264, "learning_rate": 4.933468146175918e-05, "loss": 0.0934, "num_input_tokens_seen": 3824160, "step": 2945 }, { "epoch": 0.14413798157965454, "grad_norm": 0.7967491745948792, "learning_rate": 4.933242974945734e-05, "loss": 0.1073, "num_input_tokens_seen": 3830528, "step": 2950 }, { "epoch": 0.1443822832433489, "grad_norm": 0.28630563616752625, "learning_rate": 4.933017428478906e-05, "loss": 0.0893, "num_input_tokens_seen": 3837120, "step": 2955 }, { "epoch": 0.14462658490704322, "grad_norm": 0.2918873131275177, "learning_rate": 4.932791506810214e-05, "loss": 0.1036, "num_input_tokens_seen": 3843968, "step": 2960 }, { "epoch": 0.14487088657073754, "grad_norm": 0.18034060299396515, "learning_rate": 4.932565209974499e-05, "loss": 0.0785, "num_input_tokens_seen": 3850400, "step": 2965 }, { "epoch": 0.14511518823443187, "grad_norm": 0.3954705595970154, "learning_rate": 4.93233853800666e-05, "loss": 0.1085, "num_input_tokens_seen": 3856896, "step": 2970 }, { "epoch": 0.1453594898981262, "grad_norm": 0.5757956504821777, "learning_rate": 4.932111490941651e-05, "loss": 0.0954, "num_input_tokens_seen": 3863520, "step": 2975 }, { "epoch": 0.14560379156182054, "grad_norm": 0.4281691312789917, "learning_rate": 4.9318840688144876e-05, "loss": 0.0677, "num_input_tokens_seen": 3870272, "step": 2980 }, { "epoch": 0.14584809322551487, "grad_norm": 0.48630380630493164, "learning_rate": 4.9316562716602387e-05, "loss": 0.0896, "num_input_tokens_seen": 3877088, "step": 2985 }, { "epoch": 0.1460923948892092, "grad_norm": 0.2498846799135208, "learning_rate": 4.9314280995140346e-05, "loss": 0.1211, "num_input_tokens_seen": 3883776, "step": 2990 }, { "epoch": 0.14633669655290352, "grad_norm": 0.32741162180900574, "learning_rate": 4.931199552411063e-05, "loss": 0.1074, "num_input_tokens_seen": 3890112, "step": 2995 }, { "epoch": 0.14658099821659787, "grad_norm": 0.5714840292930603, "learning_rate": 4.930970630386568e-05, "loss": 0.1053, "num_input_tokens_seen": 3896512, "step": 3000 }, { "epoch": 0.14658099821659787, "eval_loss": 0.10580465197563171, "eval_runtime": 374.7746, "eval_samples_per_second": 97.085, "eval_steps_per_second": 24.273, "num_input_tokens_seen": 3896512, "step": 3000 }, { "epoch": 0.1468252998802922, "grad_norm": 0.18489399552345276, "learning_rate": 4.9307413334758524e-05, "loss": 0.1023, "num_input_tokens_seen": 3902688, "step": 3005 }, { "epoch": 0.14706960154398652, "grad_norm": 0.4483068883419037, "learning_rate": 4.930511661714276e-05, "loss": 0.1153, "num_input_tokens_seen": 3909408, "step": 3010 }, { "epoch": 0.14731390320768084, "grad_norm": 0.3131495714187622, "learning_rate": 4.9302816151372576e-05, "loss": 0.11, "num_input_tokens_seen": 3915488, "step": 3015 }, { "epoch": 0.14755820487137516, "grad_norm": 0.24794407188892365, "learning_rate": 4.930051193780274e-05, "loss": 0.1107, "num_input_tokens_seen": 3921632, "step": 3020 }, { "epoch": 0.14780250653506952, "grad_norm": 0.32746610045433044, "learning_rate": 4.929820397678858e-05, "loss": 0.0874, "num_input_tokens_seen": 3928640, "step": 3025 }, { "epoch": 0.14804680819876384, "grad_norm": 0.34705451130867004, "learning_rate": 4.9295892268686015e-05, "loss": 0.0951, "num_input_tokens_seen": 3935200, "step": 3030 }, { "epoch": 0.14829110986245816, "grad_norm": 0.22469554841518402, "learning_rate": 4.9293576813851536e-05, "loss": 0.1031, "num_input_tokens_seen": 3941856, "step": 3035 }, { "epoch": 0.1485354115261525, "grad_norm": 0.4869438111782074, "learning_rate": 4.929125761264223e-05, "loss": 0.0911, "num_input_tokens_seen": 3948640, "step": 3040 }, { "epoch": 0.1487797131898468, "grad_norm": 0.6551709771156311, "learning_rate": 4.928893466541573e-05, "loss": 0.0928, "num_input_tokens_seen": 3955264, "step": 3045 }, { "epoch": 0.14902401485354116, "grad_norm": 0.2717861235141754, "learning_rate": 4.928660797253027e-05, "loss": 0.0934, "num_input_tokens_seen": 3961504, "step": 3050 }, { "epoch": 0.1492683165172355, "grad_norm": 0.2854444980621338, "learning_rate": 4.928427753434467e-05, "loss": 0.101, "num_input_tokens_seen": 3967424, "step": 3055 }, { "epoch": 0.1495126181809298, "grad_norm": 0.24392232298851013, "learning_rate": 4.9281943351218286e-05, "loss": 0.0907, "num_input_tokens_seen": 3974336, "step": 3060 }, { "epoch": 0.14975691984462414, "grad_norm": 0.29105085134506226, "learning_rate": 4.9279605423511095e-05, "loss": 0.1191, "num_input_tokens_seen": 3980832, "step": 3065 }, { "epoch": 0.15000122150831846, "grad_norm": 0.7889485955238342, "learning_rate": 4.927726375158363e-05, "loss": 0.0822, "num_input_tokens_seen": 3987008, "step": 3070 }, { "epoch": 0.1502455231720128, "grad_norm": 0.8466653823852539, "learning_rate": 4.9274918335797004e-05, "loss": 0.1418, "num_input_tokens_seen": 3993728, "step": 3075 }, { "epoch": 0.15048982483570714, "grad_norm": 0.47049853205680847, "learning_rate": 4.927256917651292e-05, "loss": 0.1199, "num_input_tokens_seen": 4000288, "step": 3080 }, { "epoch": 0.15073412649940146, "grad_norm": 0.20141738653182983, "learning_rate": 4.927021627409364e-05, "loss": 0.1002, "num_input_tokens_seen": 4006944, "step": 3085 }, { "epoch": 0.15097842816309578, "grad_norm": 0.4972009062767029, "learning_rate": 4.9267859628902005e-05, "loss": 0.1027, "num_input_tokens_seen": 4013472, "step": 3090 }, { "epoch": 0.1512227298267901, "grad_norm": 0.34713947772979736, "learning_rate": 4.9265499241301454e-05, "loss": 0.1276, "num_input_tokens_seen": 4019712, "step": 3095 }, { "epoch": 0.15146703149048446, "grad_norm": 0.13198287785053253, "learning_rate": 4.926313511165598e-05, "loss": 0.0659, "num_input_tokens_seen": 4026240, "step": 3100 }, { "epoch": 0.15171133315417878, "grad_norm": 0.21851368248462677, "learning_rate": 4.926076724033016e-05, "loss": 0.0851, "num_input_tokens_seen": 4032800, "step": 3105 }, { "epoch": 0.1519556348178731, "grad_norm": 0.1723506897687912, "learning_rate": 4.9258395627689146e-05, "loss": 0.07, "num_input_tokens_seen": 4039712, "step": 3110 }, { "epoch": 0.15219993648156743, "grad_norm": 1.371970534324646, "learning_rate": 4.925602027409868e-05, "loss": 0.1166, "num_input_tokens_seen": 4045792, "step": 3115 }, { "epoch": 0.15244423814526176, "grad_norm": 0.3835395872592926, "learning_rate": 4.925364117992507e-05, "loss": 0.1083, "num_input_tokens_seen": 4052384, "step": 3120 }, { "epoch": 0.1526885398089561, "grad_norm": 0.5239814519882202, "learning_rate": 4.92512583455352e-05, "loss": 0.1095, "num_input_tokens_seen": 4058720, "step": 3125 }, { "epoch": 0.15293284147265043, "grad_norm": 0.18027043342590332, "learning_rate": 4.9248871771296536e-05, "loss": 0.0915, "num_input_tokens_seen": 4065344, "step": 3130 }, { "epoch": 0.15317714313634476, "grad_norm": 0.37134891748428345, "learning_rate": 4.924648145757711e-05, "loss": 0.0856, "num_input_tokens_seen": 4071712, "step": 3135 }, { "epoch": 0.15342144480003908, "grad_norm": 0.9989535212516785, "learning_rate": 4.924408740474554e-05, "loss": 0.1188, "num_input_tokens_seen": 4078528, "step": 3140 }, { "epoch": 0.1536657464637334, "grad_norm": 0.5539145469665527, "learning_rate": 4.924168961317103e-05, "loss": 0.1096, "num_input_tokens_seen": 4085056, "step": 3145 }, { "epoch": 0.15391004812742776, "grad_norm": 0.3217032849788666, "learning_rate": 4.9239288083223334e-05, "loss": 0.1037, "num_input_tokens_seen": 4091296, "step": 3150 }, { "epoch": 0.15415434979112208, "grad_norm": 0.1857212632894516, "learning_rate": 4.9236882815272803e-05, "loss": 0.0936, "num_input_tokens_seen": 4097696, "step": 3155 }, { "epoch": 0.1543986514548164, "grad_norm": 0.28988367319107056, "learning_rate": 4.9234473809690365e-05, "loss": 0.1201, "num_input_tokens_seen": 4104000, "step": 3160 }, { "epoch": 0.15464295311851073, "grad_norm": 0.44423022866249084, "learning_rate": 4.923206106684752e-05, "loss": 0.099, "num_input_tokens_seen": 4110272, "step": 3165 }, { "epoch": 0.15488725478220508, "grad_norm": 0.1561242640018463, "learning_rate": 4.922964458711634e-05, "loss": 0.0866, "num_input_tokens_seen": 4117024, "step": 3170 }, { "epoch": 0.1551315564458994, "grad_norm": 0.38538601994514465, "learning_rate": 4.9227224370869474e-05, "loss": 0.1026, "num_input_tokens_seen": 4123360, "step": 3175 }, { "epoch": 0.15537585810959373, "grad_norm": 0.21226473152637482, "learning_rate": 4.9224800418480155e-05, "loss": 0.0944, "num_input_tokens_seen": 4129760, "step": 3180 }, { "epoch": 0.15562015977328805, "grad_norm": 0.5080082416534424, "learning_rate": 4.9222372730322176e-05, "loss": 0.0763, "num_input_tokens_seen": 4136832, "step": 3185 }, { "epoch": 0.15586446143698238, "grad_norm": 0.2779831290245056, "learning_rate": 4.921994130676993e-05, "loss": 0.1026, "num_input_tokens_seen": 4143264, "step": 3190 }, { "epoch": 0.15610876310067673, "grad_norm": 0.2681067883968353, "learning_rate": 4.9217506148198366e-05, "loss": 0.1136, "num_input_tokens_seen": 4150656, "step": 3195 }, { "epoch": 0.15635306476437105, "grad_norm": 0.32909417152404785, "learning_rate": 4.921506725498302e-05, "loss": 0.1077, "num_input_tokens_seen": 4157216, "step": 3200 }, { "epoch": 0.15635306476437105, "eval_loss": 0.10446102172136307, "eval_runtime": 374.807, "eval_samples_per_second": 97.077, "eval_steps_per_second": 24.271, "num_input_tokens_seen": 4157216, "step": 3200 }, { "epoch": 0.15659736642806538, "grad_norm": 0.6297775506973267, "learning_rate": 4.9212624627499994e-05, "loss": 0.123, "num_input_tokens_seen": 4163808, "step": 3205 }, { "epoch": 0.1568416680917597, "grad_norm": 0.2330852895975113, "learning_rate": 4.921017826612597e-05, "loss": 0.115, "num_input_tokens_seen": 4170240, "step": 3210 }, { "epoch": 0.15708596975545402, "grad_norm": 0.43239256739616394, "learning_rate": 4.9207728171238223e-05, "loss": 0.0855, "num_input_tokens_seen": 4177376, "step": 3215 }, { "epoch": 0.15733027141914838, "grad_norm": 0.41638144850730896, "learning_rate": 4.920527434321458e-05, "loss": 0.1037, "num_input_tokens_seen": 4184032, "step": 3220 }, { "epoch": 0.1575745730828427, "grad_norm": 0.26762592792510986, "learning_rate": 4.920281678243345e-05, "loss": 0.1197, "num_input_tokens_seen": 4190304, "step": 3225 }, { "epoch": 0.15781887474653702, "grad_norm": 0.34116131067276, "learning_rate": 4.920035548927381e-05, "loss": 0.0939, "num_input_tokens_seen": 4196960, "step": 3230 }, { "epoch": 0.15806317641023135, "grad_norm": 0.22725261747837067, "learning_rate": 4.919789046411525e-05, "loss": 0.1182, "num_input_tokens_seen": 4203552, "step": 3235 }, { "epoch": 0.15830747807392567, "grad_norm": 0.1410820037126541, "learning_rate": 4.919542170733787e-05, "loss": 0.1109, "num_input_tokens_seen": 4209696, "step": 3240 }, { "epoch": 0.15855177973762002, "grad_norm": 0.24684152007102966, "learning_rate": 4.919294921932242e-05, "loss": 0.0726, "num_input_tokens_seen": 4216128, "step": 3245 }, { "epoch": 0.15879608140131435, "grad_norm": 0.3351017236709595, "learning_rate": 4.919047300045016e-05, "loss": 0.0883, "num_input_tokens_seen": 4222912, "step": 3250 }, { "epoch": 0.15904038306500867, "grad_norm": 0.9794028401374817, "learning_rate": 4.918799305110299e-05, "loss": 0.0891, "num_input_tokens_seen": 4229792, "step": 3255 }, { "epoch": 0.159284684728703, "grad_norm": 1.2340092658996582, "learning_rate": 4.918550937166331e-05, "loss": 0.1327, "num_input_tokens_seen": 4236384, "step": 3260 }, { "epoch": 0.15952898639239732, "grad_norm": 0.3635033071041107, "learning_rate": 4.918302196251415e-05, "loss": 0.1173, "num_input_tokens_seen": 4243488, "step": 3265 }, { "epoch": 0.15977328805609167, "grad_norm": 0.6431196928024292, "learning_rate": 4.91805308240391e-05, "loss": 0.1062, "num_input_tokens_seen": 4250112, "step": 3270 }, { "epoch": 0.160017589719786, "grad_norm": 0.35542115569114685, "learning_rate": 4.9178035956622326e-05, "loss": 0.0812, "num_input_tokens_seen": 4256512, "step": 3275 }, { "epoch": 0.16026189138348032, "grad_norm": 0.6606799364089966, "learning_rate": 4.917553736064857e-05, "loss": 0.0886, "num_input_tokens_seen": 4263264, "step": 3280 }, { "epoch": 0.16050619304717464, "grad_norm": 0.5947329998016357, "learning_rate": 4.917303503650314e-05, "loss": 0.1001, "num_input_tokens_seen": 4269600, "step": 3285 }, { "epoch": 0.16075049471086897, "grad_norm": 0.16912928223609924, "learning_rate": 4.917052898457194e-05, "loss": 0.108, "num_input_tokens_seen": 4275648, "step": 3290 }, { "epoch": 0.16099479637456332, "grad_norm": 0.4137447774410248, "learning_rate": 4.916801920524141e-05, "loss": 0.1146, "num_input_tokens_seen": 4281856, "step": 3295 }, { "epoch": 0.16123909803825764, "grad_norm": 0.4112042784690857, "learning_rate": 4.916550569889862e-05, "loss": 0.1056, "num_input_tokens_seen": 4287968, "step": 3300 }, { "epoch": 0.16148339970195197, "grad_norm": 0.5525413155555725, "learning_rate": 4.916298846593116e-05, "loss": 0.1119, "num_input_tokens_seen": 4295104, "step": 3305 }, { "epoch": 0.1617277013656463, "grad_norm": 0.17503385245800018, "learning_rate": 4.916046750672722e-05, "loss": 0.104, "num_input_tokens_seen": 4301440, "step": 3310 }, { "epoch": 0.16197200302934062, "grad_norm": 0.38442251086235046, "learning_rate": 4.915794282167559e-05, "loss": 0.0967, "num_input_tokens_seen": 4307648, "step": 3315 }, { "epoch": 0.16221630469303497, "grad_norm": 0.4895223379135132, "learning_rate": 4.915541441116558e-05, "loss": 0.1093, "num_input_tokens_seen": 4313984, "step": 3320 }, { "epoch": 0.1624606063567293, "grad_norm": 0.1719665378332138, "learning_rate": 4.915288227558711e-05, "loss": 0.0915, "num_input_tokens_seen": 4320832, "step": 3325 }, { "epoch": 0.16270490802042362, "grad_norm": 0.20818914473056793, "learning_rate": 4.915034641533066e-05, "loss": 0.1111, "num_input_tokens_seen": 4327104, "step": 3330 }, { "epoch": 0.16294920968411794, "grad_norm": 0.3420466184616089, "learning_rate": 4.914780683078731e-05, "loss": 0.1399, "num_input_tokens_seen": 4333952, "step": 3335 }, { "epoch": 0.1631935113478123, "grad_norm": 0.237259179353714, "learning_rate": 4.9145263522348695e-05, "loss": 0.0906, "num_input_tokens_seen": 4340480, "step": 3340 }, { "epoch": 0.16343781301150662, "grad_norm": 0.3418973386287689, "learning_rate": 4.9142716490407e-05, "loss": 0.0939, "num_input_tokens_seen": 4347040, "step": 3345 }, { "epoch": 0.16368211467520094, "grad_norm": 0.23503553867340088, "learning_rate": 4.914016573535504e-05, "loss": 0.1203, "num_input_tokens_seen": 4352864, "step": 3350 }, { "epoch": 0.16392641633889526, "grad_norm": 0.4176494777202606, "learning_rate": 4.9137611257586154e-05, "loss": 0.1117, "num_input_tokens_seen": 4359328, "step": 3355 }, { "epoch": 0.1641707180025896, "grad_norm": 0.3596843183040619, "learning_rate": 4.9135053057494274e-05, "loss": 0.0975, "num_input_tokens_seen": 4365856, "step": 3360 }, { "epoch": 0.16441501966628394, "grad_norm": 0.23906013369560242, "learning_rate": 4.913249113547392e-05, "loss": 0.1079, "num_input_tokens_seen": 4372320, "step": 3365 }, { "epoch": 0.16465932132997826, "grad_norm": 0.2687692642211914, "learning_rate": 4.912992549192016e-05, "loss": 0.1382, "num_input_tokens_seen": 4378752, "step": 3370 }, { "epoch": 0.1649036229936726, "grad_norm": 0.36665603518486023, "learning_rate": 4.9127356127228665e-05, "loss": 0.088, "num_input_tokens_seen": 4385856, "step": 3375 }, { "epoch": 0.1651479246573669, "grad_norm": 0.44952574372291565, "learning_rate": 4.912478304179564e-05, "loss": 0.1135, "num_input_tokens_seen": 4392416, "step": 3380 }, { "epoch": 0.16539222632106124, "grad_norm": 0.3165920078754425, "learning_rate": 4.9122206236017896e-05, "loss": 0.1052, "num_input_tokens_seen": 4399072, "step": 3385 }, { "epoch": 0.1656365279847556, "grad_norm": 0.3126979470252991, "learning_rate": 4.911962571029282e-05, "loss": 0.1284, "num_input_tokens_seen": 4405472, "step": 3390 }, { "epoch": 0.1658808296484499, "grad_norm": 0.3851845860481262, "learning_rate": 4.9117041465018353e-05, "loss": 0.1109, "num_input_tokens_seen": 4411968, "step": 3395 }, { "epoch": 0.16612513131214424, "grad_norm": 0.2553825378417969, "learning_rate": 4.911445350059302e-05, "loss": 0.1086, "num_input_tokens_seen": 4418592, "step": 3400 }, { "epoch": 0.16612513131214424, "eval_loss": 0.10320249199867249, "eval_runtime": 375.3937, "eval_samples_per_second": 96.925, "eval_steps_per_second": 24.233, "num_input_tokens_seen": 4418592, "step": 3400 }, { "epoch": 0.16636943297583856, "grad_norm": 0.46209123730659485, "learning_rate": 4.9111861817415905e-05, "loss": 0.0935, "num_input_tokens_seen": 4425088, "step": 3405 }, { "epoch": 0.16661373463953288, "grad_norm": 0.39279115200042725, "learning_rate": 4.91092664158867e-05, "loss": 0.0818, "num_input_tokens_seen": 4431584, "step": 3410 }, { "epoch": 0.16685803630322724, "grad_norm": 0.16631941497325897, "learning_rate": 4.910666729640563e-05, "loss": 0.0912, "num_input_tokens_seen": 4437792, "step": 3415 }, { "epoch": 0.16710233796692156, "grad_norm": 0.2226773500442505, "learning_rate": 4.910406445937353e-05, "loss": 0.1062, "num_input_tokens_seen": 4443904, "step": 3420 }, { "epoch": 0.16734663963061588, "grad_norm": 0.20711901783943176, "learning_rate": 4.9101457905191774e-05, "loss": 0.1175, "num_input_tokens_seen": 4450592, "step": 3425 }, { "epoch": 0.1675909412943102, "grad_norm": 0.40769901871681213, "learning_rate": 4.909884763426233e-05, "loss": 0.1008, "num_input_tokens_seen": 4457376, "step": 3430 }, { "epoch": 0.16783524295800453, "grad_norm": 0.30804651975631714, "learning_rate": 4.9096233646987736e-05, "loss": 0.1068, "num_input_tokens_seen": 4463392, "step": 3435 }, { "epoch": 0.16807954462169888, "grad_norm": 0.3853948712348938, "learning_rate": 4.9093615943771104e-05, "loss": 0.1058, "num_input_tokens_seen": 4469856, "step": 3440 }, { "epoch": 0.1683238462853932, "grad_norm": 0.7512122392654419, "learning_rate": 4.909099452501611e-05, "loss": 0.134, "num_input_tokens_seen": 4476384, "step": 3445 }, { "epoch": 0.16856814794908753, "grad_norm": 0.3309578001499176, "learning_rate": 4.908836939112702e-05, "loss": 0.1054, "num_input_tokens_seen": 4483008, "step": 3450 }, { "epoch": 0.16881244961278186, "grad_norm": 0.3535977602005005, "learning_rate": 4.908574054250865e-05, "loss": 0.0941, "num_input_tokens_seen": 4489728, "step": 3455 }, { "epoch": 0.16905675127647618, "grad_norm": 0.15997314453125, "learning_rate": 4.9083107979566414e-05, "loss": 0.098, "num_input_tokens_seen": 4496320, "step": 3460 }, { "epoch": 0.16930105294017053, "grad_norm": 0.1850723773241043, "learning_rate": 4.908047170270628e-05, "loss": 0.0877, "num_input_tokens_seen": 4502752, "step": 3465 }, { "epoch": 0.16954535460386486, "grad_norm": 0.24979764223098755, "learning_rate": 4.9077831712334784e-05, "loss": 0.1075, "num_input_tokens_seen": 4509440, "step": 3470 }, { "epoch": 0.16978965626755918, "grad_norm": 0.6583161354064941, "learning_rate": 4.907518800885907e-05, "loss": 0.0978, "num_input_tokens_seen": 4515968, "step": 3475 }, { "epoch": 0.1700339579312535, "grad_norm": 0.4701288938522339, "learning_rate": 4.907254059268681e-05, "loss": 0.1271, "num_input_tokens_seen": 4522304, "step": 3480 }, { "epoch": 0.17027825959494783, "grad_norm": 0.20398879051208496, "learning_rate": 4.906988946422628e-05, "loss": 0.1015, "num_input_tokens_seen": 4528736, "step": 3485 }, { "epoch": 0.17052256125864218, "grad_norm": 0.3217414617538452, "learning_rate": 4.9067234623886315e-05, "loss": 0.1261, "num_input_tokens_seen": 4535168, "step": 3490 }, { "epoch": 0.1707668629223365, "grad_norm": 0.4604056179523468, "learning_rate": 4.9064576072076316e-05, "loss": 0.1074, "num_input_tokens_seen": 4541824, "step": 3495 }, { "epoch": 0.17101116458603083, "grad_norm": 0.23362739384174347, "learning_rate": 4.906191380920628e-05, "loss": 0.1005, "num_input_tokens_seen": 4548160, "step": 3500 }, { "epoch": 0.17125546624972515, "grad_norm": 0.17584556341171265, "learning_rate": 4.905924783568675e-05, "loss": 0.1227, "num_input_tokens_seen": 4554496, "step": 3505 }, { "epoch": 0.1714997679134195, "grad_norm": 0.42760708928108215, "learning_rate": 4.905657815192886e-05, "loss": 0.1101, "num_input_tokens_seen": 4560960, "step": 3510 }, { "epoch": 0.17174406957711383, "grad_norm": 0.392635315656662, "learning_rate": 4.90539047583443e-05, "loss": 0.0877, "num_input_tokens_seen": 4567200, "step": 3515 }, { "epoch": 0.17198837124080815, "grad_norm": 0.6329143643379211, "learning_rate": 4.905122765534534e-05, "loss": 0.0843, "num_input_tokens_seen": 4574336, "step": 3520 }, { "epoch": 0.17223267290450248, "grad_norm": 0.2642519772052765, "learning_rate": 4.9048546843344846e-05, "loss": 0.1069, "num_input_tokens_seen": 4580832, "step": 3525 }, { "epoch": 0.1724769745681968, "grad_norm": 0.22001618146896362, "learning_rate": 4.9045862322756206e-05, "loss": 0.1252, "num_input_tokens_seen": 4586752, "step": 3530 }, { "epoch": 0.17272127623189115, "grad_norm": 0.2789965867996216, "learning_rate": 4.904317409399342e-05, "loss": 0.0846, "num_input_tokens_seen": 4593088, "step": 3535 }, { "epoch": 0.17296557789558548, "grad_norm": 0.16924160718917847, "learning_rate": 4.904048215747104e-05, "loss": 0.0796, "num_input_tokens_seen": 4600128, "step": 3540 }, { "epoch": 0.1732098795592798, "grad_norm": 0.23727254569530487, "learning_rate": 4.90377865136042e-05, "loss": 0.0936, "num_input_tokens_seen": 4606592, "step": 3545 }, { "epoch": 0.17345418122297412, "grad_norm": 0.15535084903240204, "learning_rate": 4.90350871628086e-05, "loss": 0.0998, "num_input_tokens_seen": 4613248, "step": 3550 }, { "epoch": 0.17369848288666845, "grad_norm": 0.18941472470760345, "learning_rate": 4.903238410550052e-05, "loss": 0.1036, "num_input_tokens_seen": 4619712, "step": 3555 }, { "epoch": 0.1739427845503628, "grad_norm": 0.13183848559856415, "learning_rate": 4.90296773420968e-05, "loss": 0.0962, "num_input_tokens_seen": 4626016, "step": 3560 }, { "epoch": 0.17418708621405712, "grad_norm": 0.24361541867256165, "learning_rate": 4.902696687301486e-05, "loss": 0.1263, "num_input_tokens_seen": 4632288, "step": 3565 }, { "epoch": 0.17443138787775145, "grad_norm": 0.40500760078430176, "learning_rate": 4.902425269867268e-05, "loss": 0.0878, "num_input_tokens_seen": 4638880, "step": 3570 }, { "epoch": 0.17467568954144577, "grad_norm": 0.28029489517211914, "learning_rate": 4.902153481948883e-05, "loss": 0.1269, "num_input_tokens_seen": 4645056, "step": 3575 }, { "epoch": 0.1749199912051401, "grad_norm": 0.1580449342727661, "learning_rate": 4.901881323588244e-05, "loss": 0.0919, "num_input_tokens_seen": 4651328, "step": 3580 }, { "epoch": 0.17516429286883445, "grad_norm": 0.3875674307346344, "learning_rate": 4.90160879482732e-05, "loss": 0.1034, "num_input_tokens_seen": 4657888, "step": 3585 }, { "epoch": 0.17540859453252877, "grad_norm": 0.5818448662757874, "learning_rate": 4.9013358957081405e-05, "loss": 0.0977, "num_input_tokens_seen": 4664512, "step": 3590 }, { "epoch": 0.1756528961962231, "grad_norm": 0.5138587355613708, "learning_rate": 4.901062626272789e-05, "loss": 0.0878, "num_input_tokens_seen": 4671008, "step": 3595 }, { "epoch": 0.17589719785991742, "grad_norm": 0.20787884294986725, "learning_rate": 4.900788986563406e-05, "loss": 0.0749, "num_input_tokens_seen": 4677248, "step": 3600 }, { "epoch": 0.17589719785991742, "eval_loss": 0.10287588089704514, "eval_runtime": 374.9871, "eval_samples_per_second": 97.03, "eval_steps_per_second": 24.259, "num_input_tokens_seen": 4677248, "step": 3600 }, { "epoch": 0.17614149952361174, "grad_norm": 0.5410585403442383, "learning_rate": 4.9005149766221915e-05, "loss": 0.101, "num_input_tokens_seen": 4684000, "step": 3605 }, { "epoch": 0.1763858011873061, "grad_norm": 0.3667186200618744, "learning_rate": 4.9002405964914e-05, "loss": 0.0944, "num_input_tokens_seen": 4690336, "step": 3610 }, { "epoch": 0.17663010285100042, "grad_norm": 1.3637367486953735, "learning_rate": 4.899965846213346e-05, "loss": 0.1211, "num_input_tokens_seen": 4696672, "step": 3615 }, { "epoch": 0.17687440451469474, "grad_norm": 0.2625536024570465, "learning_rate": 4.899690725830399e-05, "loss": 0.1099, "num_input_tokens_seen": 4703456, "step": 3620 }, { "epoch": 0.17711870617838907, "grad_norm": 0.23707211017608643, "learning_rate": 4.899415235384985e-05, "loss": 0.0964, "num_input_tokens_seen": 4710432, "step": 3625 }, { "epoch": 0.1773630078420834, "grad_norm": 0.31565535068511963, "learning_rate": 4.899139374919589e-05, "loss": 0.1017, "num_input_tokens_seen": 4716736, "step": 3630 }, { "epoch": 0.17760730950577774, "grad_norm": 0.239191934466362, "learning_rate": 4.898863144476752e-05, "loss": 0.1067, "num_input_tokens_seen": 4723584, "step": 3635 }, { "epoch": 0.17785161116947207, "grad_norm": 0.29952242970466614, "learning_rate": 4.898586544099072e-05, "loss": 0.0897, "num_input_tokens_seen": 4729440, "step": 3640 }, { "epoch": 0.1780959128331664, "grad_norm": 0.6510632634162903, "learning_rate": 4.898309573829204e-05, "loss": 0.1077, "num_input_tokens_seen": 4735648, "step": 3645 }, { "epoch": 0.17834021449686072, "grad_norm": 0.6672237515449524, "learning_rate": 4.898032233709862e-05, "loss": 0.1061, "num_input_tokens_seen": 4741888, "step": 3650 }, { "epoch": 0.17858451616055504, "grad_norm": 0.6704319715499878, "learning_rate": 4.8977545237838123e-05, "loss": 0.0951, "num_input_tokens_seen": 4748544, "step": 3655 }, { "epoch": 0.1788288178242494, "grad_norm": 0.4821835160255432, "learning_rate": 4.8974764440938836e-05, "loss": 0.1006, "num_input_tokens_seen": 4754816, "step": 3660 }, { "epoch": 0.17907311948794372, "grad_norm": 0.2276570349931717, "learning_rate": 4.897197994682959e-05, "loss": 0.0896, "num_input_tokens_seen": 4761472, "step": 3665 }, { "epoch": 0.17931742115163804, "grad_norm": 0.2822791039943695, "learning_rate": 4.8969191755939786e-05, "loss": 0.0823, "num_input_tokens_seen": 4768064, "step": 3670 }, { "epoch": 0.17956172281533236, "grad_norm": 0.3884615898132324, "learning_rate": 4.8966399868699396e-05, "loss": 0.1007, "num_input_tokens_seen": 4774496, "step": 3675 }, { "epoch": 0.17980602447902672, "grad_norm": 0.3355526328086853, "learning_rate": 4.8963604285538965e-05, "loss": 0.0952, "num_input_tokens_seen": 4780640, "step": 3680 }, { "epoch": 0.18005032614272104, "grad_norm": 0.19822990894317627, "learning_rate": 4.8960805006889604e-05, "loss": 0.1193, "num_input_tokens_seen": 4786656, "step": 3685 }, { "epoch": 0.18029462780641536, "grad_norm": 0.29140037298202515, "learning_rate": 4.8958002033183004e-05, "loss": 0.0826, "num_input_tokens_seen": 4793504, "step": 3690 }, { "epoch": 0.1805389294701097, "grad_norm": 0.21023188531398773, "learning_rate": 4.8955195364851414e-05, "loss": 0.0879, "num_input_tokens_seen": 4799680, "step": 3695 }, { "epoch": 0.180783231133804, "grad_norm": 0.28604620695114136, "learning_rate": 4.895238500232766e-05, "loss": 0.0877, "num_input_tokens_seen": 4806144, "step": 3700 }, { "epoch": 0.18102753279749836, "grad_norm": 0.27740082144737244, "learning_rate": 4.8949570946045143e-05, "loss": 0.1026, "num_input_tokens_seen": 4812992, "step": 3705 }, { "epoch": 0.1812718344611927, "grad_norm": 0.27882155776023865, "learning_rate": 4.89467531964378e-05, "loss": 0.1256, "num_input_tokens_seen": 4819232, "step": 3710 }, { "epoch": 0.181516136124887, "grad_norm": 0.4372435212135315, "learning_rate": 4.894393175394019e-05, "loss": 0.0889, "num_input_tokens_seen": 4825568, "step": 3715 }, { "epoch": 0.18176043778858134, "grad_norm": 0.2988497018814087, "learning_rate": 4.8941106618987406e-05, "loss": 0.1022, "num_input_tokens_seen": 4832224, "step": 3720 }, { "epoch": 0.18200473945227566, "grad_norm": 0.2430124431848526, "learning_rate": 4.893827779201512e-05, "loss": 0.1054, "num_input_tokens_seen": 4838144, "step": 3725 }, { "epoch": 0.18224904111597, "grad_norm": 0.3479905128479004, "learning_rate": 4.893544527345957e-05, "loss": 0.1187, "num_input_tokens_seen": 4844320, "step": 3730 }, { "epoch": 0.18249334277966434, "grad_norm": 0.5549243688583374, "learning_rate": 4.8932609063757563e-05, "loss": 0.1056, "num_input_tokens_seen": 4851296, "step": 3735 }, { "epoch": 0.18273764444335866, "grad_norm": 0.25206050276756287, "learning_rate": 4.8929769163346484e-05, "loss": 0.1005, "num_input_tokens_seen": 4857536, "step": 3740 }, { "epoch": 0.18298194610705298, "grad_norm": 0.2146785408258438, "learning_rate": 4.892692557266429e-05, "loss": 0.0967, "num_input_tokens_seen": 4863808, "step": 3745 }, { "epoch": 0.1832262477707473, "grad_norm": 0.6826537847518921, "learning_rate": 4.8924078292149464e-05, "loss": 0.0826, "num_input_tokens_seen": 4870080, "step": 3750 }, { "epoch": 0.18347054943444166, "grad_norm": 0.6790493130683899, "learning_rate": 4.892122732224114e-05, "loss": 0.1139, "num_input_tokens_seen": 4876864, "step": 3755 }, { "epoch": 0.18371485109813598, "grad_norm": 0.2693096995353699, "learning_rate": 4.8918372663378944e-05, "loss": 0.1268, "num_input_tokens_seen": 4883456, "step": 3760 }, { "epoch": 0.1839591527618303, "grad_norm": 0.42686727643013, "learning_rate": 4.89155143160031e-05, "loss": 0.0977, "num_input_tokens_seen": 4889216, "step": 3765 }, { "epoch": 0.18420345442552463, "grad_norm": 0.40577277541160583, "learning_rate": 4.891265228055441e-05, "loss": 0.1054, "num_input_tokens_seen": 4895616, "step": 3770 }, { "epoch": 0.18444775608921896, "grad_norm": 0.3860841393470764, "learning_rate": 4.890978655747424e-05, "loss": 0.1141, "num_input_tokens_seen": 4902464, "step": 3775 }, { "epoch": 0.1846920577529133, "grad_norm": 0.12667778134346008, "learning_rate": 4.89069171472045e-05, "loss": 0.0854, "num_input_tokens_seen": 4909280, "step": 3780 }, { "epoch": 0.18493635941660763, "grad_norm": 0.6892995834350586, "learning_rate": 4.890404405018772e-05, "loss": 0.1044, "num_input_tokens_seen": 4915520, "step": 3785 }, { "epoch": 0.18518066108030196, "grad_norm": 0.47452694177627563, "learning_rate": 4.8901167266866934e-05, "loss": 0.0863, "num_input_tokens_seen": 4921856, "step": 3790 }, { "epoch": 0.18542496274399628, "grad_norm": 0.3534107804298401, "learning_rate": 4.88982867976858e-05, "loss": 0.1252, "num_input_tokens_seen": 4927776, "step": 3795 }, { "epoch": 0.1856692644076906, "grad_norm": 0.603075385093689, "learning_rate": 4.889540264308852e-05, "loss": 0.1233, "num_input_tokens_seen": 4934080, "step": 3800 }, { "epoch": 0.1856692644076906, "eval_loss": 0.10188094526529312, "eval_runtime": 375.5451, "eval_samples_per_second": 96.886, "eval_steps_per_second": 24.223, "num_input_tokens_seen": 4934080, "step": 3800 }, { "epoch": 0.18591356607138496, "grad_norm": 0.8413894176483154, "learning_rate": 4.889251480351986e-05, "loss": 0.1135, "num_input_tokens_seen": 4940224, "step": 3805 }, { "epoch": 0.18615786773507928, "grad_norm": 0.5739133954048157, "learning_rate": 4.888962327942517e-05, "loss": 0.116, "num_input_tokens_seen": 4947136, "step": 3810 }, { "epoch": 0.1864021693987736, "grad_norm": 0.2831670343875885, "learning_rate": 4.8886728071250356e-05, "loss": 0.1361, "num_input_tokens_seen": 4953344, "step": 3815 }, { "epoch": 0.18664647106246793, "grad_norm": 0.2159545123577118, "learning_rate": 4.8883829179441884e-05, "loss": 0.1056, "num_input_tokens_seen": 4959840, "step": 3820 }, { "epoch": 0.18689077272616225, "grad_norm": 0.20697370171546936, "learning_rate": 4.888092660444682e-05, "loss": 0.0953, "num_input_tokens_seen": 4966208, "step": 3825 }, { "epoch": 0.1871350743898566, "grad_norm": 0.2302820086479187, "learning_rate": 4.887802034671276e-05, "loss": 0.1097, "num_input_tokens_seen": 4972480, "step": 3830 }, { "epoch": 0.18737937605355093, "grad_norm": 0.5183796286582947, "learning_rate": 4.88751104066879e-05, "loss": 0.1307, "num_input_tokens_seen": 4978944, "step": 3835 }, { "epoch": 0.18762367771724525, "grad_norm": 0.3071126639842987, "learning_rate": 4.887219678482098e-05, "loss": 0.1218, "num_input_tokens_seen": 4985344, "step": 3840 }, { "epoch": 0.18786797938093958, "grad_norm": 0.5892391800880432, "learning_rate": 4.8869279481561316e-05, "loss": 0.1307, "num_input_tokens_seen": 4991744, "step": 3845 }, { "epoch": 0.18811228104463393, "grad_norm": 0.2765864431858063, "learning_rate": 4.88663584973588e-05, "loss": 0.116, "num_input_tokens_seen": 4998112, "step": 3850 }, { "epoch": 0.18835658270832825, "grad_norm": 0.5571781992912292, "learning_rate": 4.8863433832663874e-05, "loss": 0.1044, "num_input_tokens_seen": 5004192, "step": 3855 }, { "epoch": 0.18860088437202258, "grad_norm": 0.17146095633506775, "learning_rate": 4.886050548792757e-05, "loss": 0.0967, "num_input_tokens_seen": 5009984, "step": 3860 }, { "epoch": 0.1888451860357169, "grad_norm": 0.23618334531784058, "learning_rate": 4.8857573463601465e-05, "loss": 0.0908, "num_input_tokens_seen": 5016512, "step": 3865 }, { "epoch": 0.18908948769941122, "grad_norm": 0.13410717248916626, "learning_rate": 4.885463776013772e-05, "loss": 0.1152, "num_input_tokens_seen": 5022976, "step": 3870 }, { "epoch": 0.18933378936310558, "grad_norm": 0.5085498690605164, "learning_rate": 4.8851698377989056e-05, "loss": 0.1286, "num_input_tokens_seen": 5029184, "step": 3875 }, { "epoch": 0.1895780910267999, "grad_norm": 0.6826160550117493, "learning_rate": 4.884875531760876e-05, "loss": 0.0945, "num_input_tokens_seen": 5035616, "step": 3880 }, { "epoch": 0.18982239269049422, "grad_norm": 0.29011714458465576, "learning_rate": 4.88458085794507e-05, "loss": 0.1285, "num_input_tokens_seen": 5042176, "step": 3885 }, { "epoch": 0.19006669435418855, "grad_norm": 0.6251981854438782, "learning_rate": 4.884285816396929e-05, "loss": 0.0785, "num_input_tokens_seen": 5048800, "step": 3890 }, { "epoch": 0.19031099601788287, "grad_norm": 0.19209054112434387, "learning_rate": 4.8839904071619526e-05, "loss": 0.1272, "num_input_tokens_seen": 5055328, "step": 3895 }, { "epoch": 0.19055529768157722, "grad_norm": 0.18983730673789978, "learning_rate": 4.8836946302856955e-05, "loss": 0.0902, "num_input_tokens_seen": 5061888, "step": 3900 }, { "epoch": 0.19079959934527155, "grad_norm": 0.21171711385250092, "learning_rate": 4.8833984858137715e-05, "loss": 0.1075, "num_input_tokens_seen": 5068768, "step": 3905 }, { "epoch": 0.19104390100896587, "grad_norm": 0.19251412153244019, "learning_rate": 4.8831019737918494e-05, "loss": 0.0976, "num_input_tokens_seen": 5075104, "step": 3910 }, { "epoch": 0.1912882026726602, "grad_norm": 0.41949525475502014, "learning_rate": 4.882805094265655e-05, "loss": 0.1094, "num_input_tokens_seen": 5081312, "step": 3915 }, { "epoch": 0.19153250433635452, "grad_norm": 0.3621079921722412, "learning_rate": 4.8825078472809706e-05, "loss": 0.1023, "num_input_tokens_seen": 5088352, "step": 3920 }, { "epoch": 0.19177680600004887, "grad_norm": 0.420163094997406, "learning_rate": 4.882210232883635e-05, "loss": 0.0956, "num_input_tokens_seen": 5094528, "step": 3925 }, { "epoch": 0.1920211076637432, "grad_norm": 0.36049410700798035, "learning_rate": 4.881912251119546e-05, "loss": 0.086, "num_input_tokens_seen": 5101408, "step": 3930 }, { "epoch": 0.19226540932743752, "grad_norm": 0.1541660875082016, "learning_rate": 4.881613902034654e-05, "loss": 0.0828, "num_input_tokens_seen": 5107808, "step": 3935 }, { "epoch": 0.19250971099113184, "grad_norm": 0.5941118001937866, "learning_rate": 4.88131518567497e-05, "loss": 0.1028, "num_input_tokens_seen": 5114464, "step": 3940 }, { "epoch": 0.19275401265482617, "grad_norm": 0.13738802075386047, "learning_rate": 4.881016102086558e-05, "loss": 0.088, "num_input_tokens_seen": 5120864, "step": 3945 }, { "epoch": 0.19299831431852052, "grad_norm": 0.6087609529495239, "learning_rate": 4.8807166513155425e-05, "loss": 0.1177, "num_input_tokens_seen": 5127520, "step": 3950 }, { "epoch": 0.19324261598221484, "grad_norm": 0.16404642164707184, "learning_rate": 4.8804168334081004e-05, "loss": 0.1027, "num_input_tokens_seen": 5134112, "step": 3955 }, { "epoch": 0.19348691764590917, "grad_norm": 0.3320470154285431, "learning_rate": 4.880116648410468e-05, "loss": 0.1082, "num_input_tokens_seen": 5140480, "step": 3960 }, { "epoch": 0.1937312193096035, "grad_norm": 0.26744815707206726, "learning_rate": 4.879816096368939e-05, "loss": 0.091, "num_input_tokens_seen": 5146848, "step": 3965 }, { "epoch": 0.19397552097329782, "grad_norm": 0.2107800394296646, "learning_rate": 4.879515177329861e-05, "loss": 0.1137, "num_input_tokens_seen": 5153760, "step": 3970 }, { "epoch": 0.19421982263699217, "grad_norm": 0.21767747402191162, "learning_rate": 4.8792138913396394e-05, "loss": 0.0819, "num_input_tokens_seen": 5159680, "step": 3975 }, { "epoch": 0.1944641243006865, "grad_norm": 0.17346565425395966, "learning_rate": 4.8789122384447374e-05, "loss": 0.1061, "num_input_tokens_seen": 5166176, "step": 3980 }, { "epoch": 0.19470842596438082, "grad_norm": 0.5454160571098328, "learning_rate": 4.878610218691673e-05, "loss": 0.1177, "num_input_tokens_seen": 5172672, "step": 3985 }, { "epoch": 0.19495272762807514, "grad_norm": 0.158975288271904, "learning_rate": 4.87830783212702e-05, "loss": 0.0859, "num_input_tokens_seen": 5179040, "step": 3990 }, { "epoch": 0.19519702929176946, "grad_norm": 0.7711046934127808, "learning_rate": 4.878005078797413e-05, "loss": 0.0856, "num_input_tokens_seen": 5185664, "step": 3995 }, { "epoch": 0.19544133095546382, "grad_norm": 0.371036171913147, "learning_rate": 4.877701958749539e-05, "loss": 0.1171, "num_input_tokens_seen": 5191936, "step": 4000 }, { "epoch": 0.19544133095546382, "eval_loss": 0.10387411713600159, "eval_runtime": 374.5702, "eval_samples_per_second": 97.138, "eval_steps_per_second": 24.287, "num_input_tokens_seen": 5191936, "step": 4000 }, { "epoch": 0.19568563261915814, "grad_norm": 1.0557140111923218, "learning_rate": 4.877398472030142e-05, "loss": 0.1155, "num_input_tokens_seen": 5198176, "step": 4005 }, { "epoch": 0.19592993428285246, "grad_norm": 0.24083396792411804, "learning_rate": 4.877094618686024e-05, "loss": 0.0839, "num_input_tokens_seen": 5204320, "step": 4010 }, { "epoch": 0.1961742359465468, "grad_norm": 0.20059116184711456, "learning_rate": 4.876790398764045e-05, "loss": 0.1041, "num_input_tokens_seen": 5211232, "step": 4015 }, { "epoch": 0.19641853761024114, "grad_norm": 0.1865481287240982, "learning_rate": 4.8764858123111167e-05, "loss": 0.1038, "num_input_tokens_seen": 5217600, "step": 4020 }, { "epoch": 0.19666283927393546, "grad_norm": 0.7370781898498535, "learning_rate": 4.876180859374212e-05, "loss": 0.0977, "num_input_tokens_seen": 5224544, "step": 4025 }, { "epoch": 0.1969071409376298, "grad_norm": 0.18642541766166687, "learning_rate": 4.875875540000357e-05, "loss": 0.0769, "num_input_tokens_seen": 5231488, "step": 4030 }, { "epoch": 0.1971514426013241, "grad_norm": 0.15892145037651062, "learning_rate": 4.8755698542366376e-05, "loss": 0.0773, "num_input_tokens_seen": 5237760, "step": 4035 }, { "epoch": 0.19739574426501844, "grad_norm": 0.36837494373321533, "learning_rate": 4.875263802130193e-05, "loss": 0.1024, "num_input_tokens_seen": 5244032, "step": 4040 }, { "epoch": 0.1976400459287128, "grad_norm": 0.4443624019622803, "learning_rate": 4.8749573837282207e-05, "loss": 0.1186, "num_input_tokens_seen": 5250080, "step": 4045 }, { "epoch": 0.1978843475924071, "grad_norm": 0.24159017205238342, "learning_rate": 4.874650599077974e-05, "loss": 0.0778, "num_input_tokens_seen": 5256896, "step": 4050 }, { "epoch": 0.19812864925610144, "grad_norm": 0.6955461502075195, "learning_rate": 4.874343448226764e-05, "loss": 0.1029, "num_input_tokens_seen": 5263712, "step": 4055 }, { "epoch": 0.19837295091979576, "grad_norm": 0.1558971405029297, "learning_rate": 4.874035931221955e-05, "loss": 0.0816, "num_input_tokens_seen": 5270048, "step": 4060 }, { "epoch": 0.19861725258349008, "grad_norm": 0.761987030506134, "learning_rate": 4.8737280481109724e-05, "loss": 0.1221, "num_input_tokens_seen": 5276448, "step": 4065 }, { "epoch": 0.19886155424718444, "grad_norm": 0.2677485942840576, "learning_rate": 4.873419798941294e-05, "loss": 0.0915, "num_input_tokens_seen": 5282848, "step": 4070 }, { "epoch": 0.19910585591087876, "grad_norm": 0.5941776037216187, "learning_rate": 4.873111183760458e-05, "loss": 0.1147, "num_input_tokens_seen": 5289056, "step": 4075 }, { "epoch": 0.19935015757457308, "grad_norm": 0.5737611055374146, "learning_rate": 4.8728022026160537e-05, "loss": 0.1076, "num_input_tokens_seen": 5295328, "step": 4080 }, { "epoch": 0.1995944592382674, "grad_norm": 0.2650241255760193, "learning_rate": 4.872492855555732e-05, "loss": 0.0856, "num_input_tokens_seen": 5301632, "step": 4085 }, { "epoch": 0.19983876090196173, "grad_norm": 0.16801555454730988, "learning_rate": 4.8721831426271956e-05, "loss": 0.1034, "num_input_tokens_seen": 5308384, "step": 4090 }, { "epoch": 0.20008306256565608, "grad_norm": 0.3224966526031494, "learning_rate": 4.87187306387821e-05, "loss": 0.1162, "num_input_tokens_seen": 5314688, "step": 4095 }, { "epoch": 0.2003273642293504, "grad_norm": 0.3408365249633789, "learning_rate": 4.87156261935659e-05, "loss": 0.0884, "num_input_tokens_seen": 5320480, "step": 4100 }, { "epoch": 0.20057166589304473, "grad_norm": 0.21980075538158417, "learning_rate": 4.871251809110211e-05, "loss": 0.1071, "num_input_tokens_seen": 5327168, "step": 4105 }, { "epoch": 0.20081596755673906, "grad_norm": 0.6451396346092224, "learning_rate": 4.8709406331870044e-05, "loss": 0.1084, "num_input_tokens_seen": 5333920, "step": 4110 }, { "epoch": 0.20106026922043338, "grad_norm": 0.1855594664812088, "learning_rate": 4.8706290916349574e-05, "loss": 0.083, "num_input_tokens_seen": 5340320, "step": 4115 }, { "epoch": 0.20130457088412773, "grad_norm": 0.3617601990699768, "learning_rate": 4.8703171845021134e-05, "loss": 0.1023, "num_input_tokens_seen": 5346592, "step": 4120 }, { "epoch": 0.20154887254782206, "grad_norm": 0.40693724155426025, "learning_rate": 4.870004911836572e-05, "loss": 0.1037, "num_input_tokens_seen": 5353504, "step": 4125 }, { "epoch": 0.20179317421151638, "grad_norm": 0.3004808723926544, "learning_rate": 4.869692273686489e-05, "loss": 0.0865, "num_input_tokens_seen": 5360096, "step": 4130 }, { "epoch": 0.2020374758752107, "grad_norm": 0.2144905924797058, "learning_rate": 4.869379270100079e-05, "loss": 0.0971, "num_input_tokens_seen": 5366688, "step": 4135 }, { "epoch": 0.20228177753890503, "grad_norm": 0.35997840762138367, "learning_rate": 4.86906590112561e-05, "loss": 0.1099, "num_input_tokens_seen": 5373120, "step": 4140 }, { "epoch": 0.20252607920259938, "grad_norm": 0.3617027699947357, "learning_rate": 4.8687521668114064e-05, "loss": 0.0754, "num_input_tokens_seen": 5379840, "step": 4145 }, { "epoch": 0.2027703808662937, "grad_norm": 0.14943066239356995, "learning_rate": 4.868438067205853e-05, "loss": 0.1061, "num_input_tokens_seen": 5386464, "step": 4150 }, { "epoch": 0.20301468252998803, "grad_norm": 0.12271829694509506, "learning_rate": 4.8681236023573844e-05, "loss": 0.088, "num_input_tokens_seen": 5392896, "step": 4155 }, { "epoch": 0.20325898419368235, "grad_norm": 0.33892032504081726, "learning_rate": 4.867808772314497e-05, "loss": 0.0795, "num_input_tokens_seen": 5399168, "step": 4160 }, { "epoch": 0.20350328585737668, "grad_norm": 0.4778141379356384, "learning_rate": 4.867493577125741e-05, "loss": 0.095, "num_input_tokens_seen": 5405472, "step": 4165 }, { "epoch": 0.20374758752107103, "grad_norm": 0.26324188709259033, "learning_rate": 4.867178016839725e-05, "loss": 0.1235, "num_input_tokens_seen": 5411840, "step": 4170 }, { "epoch": 0.20399188918476535, "grad_norm": 0.25325173139572144, "learning_rate": 4.8668620915051094e-05, "loss": 0.1005, "num_input_tokens_seen": 5418528, "step": 4175 }, { "epoch": 0.20423619084845968, "grad_norm": 0.2998596131801605, "learning_rate": 4.866545801170616e-05, "loss": 0.0991, "num_input_tokens_seen": 5424832, "step": 4180 }, { "epoch": 0.204480492512154, "grad_norm": 0.38212522864341736, "learning_rate": 4.86622914588502e-05, "loss": 0.0953, "num_input_tokens_seen": 5431616, "step": 4185 }, { "epoch": 0.20472479417584835, "grad_norm": 0.341267466545105, "learning_rate": 4.865912125697154e-05, "loss": 0.116, "num_input_tokens_seen": 5438048, "step": 4190 }, { "epoch": 0.20496909583954268, "grad_norm": 0.5152976512908936, "learning_rate": 4.865594740655907e-05, "loss": 0.1097, "num_input_tokens_seen": 5444512, "step": 4195 }, { "epoch": 0.205213397503237, "grad_norm": 0.5311033725738525, "learning_rate": 4.865276990810222e-05, "loss": 0.0911, "num_input_tokens_seen": 5451200, "step": 4200 }, { "epoch": 0.205213397503237, "eval_loss": 0.10060615837574005, "eval_runtime": 374.3687, "eval_samples_per_second": 97.19, "eval_steps_per_second": 24.3, "num_input_tokens_seen": 5451200, "step": 4200 }, { "epoch": 0.20545769916693132, "grad_norm": 0.2574841380119324, "learning_rate": 4.8649588762091016e-05, "loss": 0.1084, "num_input_tokens_seen": 5457888, "step": 4205 }, { "epoch": 0.20570200083062565, "grad_norm": 0.43523451685905457, "learning_rate": 4.8646403969016016e-05, "loss": 0.1254, "num_input_tokens_seen": 5464256, "step": 4210 }, { "epoch": 0.20594630249432, "grad_norm": 0.2676774263381958, "learning_rate": 4.864321552936838e-05, "loss": 0.1316, "num_input_tokens_seen": 5471392, "step": 4215 }, { "epoch": 0.20619060415801432, "grad_norm": 0.12027907371520996, "learning_rate": 4.864002344363978e-05, "loss": 0.0905, "num_input_tokens_seen": 5477888, "step": 4220 }, { "epoch": 0.20643490582170865, "grad_norm": 0.21054042875766754, "learning_rate": 4.863682771232248e-05, "loss": 0.1066, "num_input_tokens_seen": 5484416, "step": 4225 }, { "epoch": 0.20667920748540297, "grad_norm": 0.4523102939128876, "learning_rate": 4.8633628335909324e-05, "loss": 0.1048, "num_input_tokens_seen": 5491136, "step": 4230 }, { "epoch": 0.2069235091490973, "grad_norm": 0.2838248312473297, "learning_rate": 4.8630425314893676e-05, "loss": 0.134, "num_input_tokens_seen": 5497280, "step": 4235 }, { "epoch": 0.20716781081279165, "grad_norm": 0.25560107827186584, "learning_rate": 4.862721864976948e-05, "loss": 0.0634, "num_input_tokens_seen": 5504128, "step": 4240 }, { "epoch": 0.20741211247648597, "grad_norm": 0.17390379309654236, "learning_rate": 4.862400834103125e-05, "loss": 0.0997, "num_input_tokens_seen": 5511872, "step": 4245 }, { "epoch": 0.2076564141401803, "grad_norm": 0.16208206117153168, "learning_rate": 4.862079438917406e-05, "loss": 0.0947, "num_input_tokens_seen": 5518336, "step": 4250 }, { "epoch": 0.20790071580387462, "grad_norm": 0.8291673064231873, "learning_rate": 4.8617576794693536e-05, "loss": 0.0894, "num_input_tokens_seen": 5524384, "step": 4255 }, { "epoch": 0.20814501746756894, "grad_norm": 0.29229894280433655, "learning_rate": 4.8614355558085875e-05, "loss": 0.0895, "num_input_tokens_seen": 5530752, "step": 4260 }, { "epoch": 0.2083893191312633, "grad_norm": 0.3092079758644104, "learning_rate": 4.861113067984783e-05, "loss": 0.099, "num_input_tokens_seen": 5537408, "step": 4265 }, { "epoch": 0.20863362079495762, "grad_norm": 0.15943773090839386, "learning_rate": 4.860790216047671e-05, "loss": 0.0989, "num_input_tokens_seen": 5543776, "step": 4270 }, { "epoch": 0.20887792245865194, "grad_norm": 0.19600644707679749, "learning_rate": 4.860467000047041e-05, "loss": 0.0783, "num_input_tokens_seen": 5550208, "step": 4275 }, { "epoch": 0.20912222412234627, "grad_norm": 0.17882364988327026, "learning_rate": 4.860143420032737e-05, "loss": 0.0789, "num_input_tokens_seen": 5556736, "step": 4280 }, { "epoch": 0.2093665257860406, "grad_norm": 0.19262869656085968, "learning_rate": 4.859819476054657e-05, "loss": 0.0961, "num_input_tokens_seen": 5563136, "step": 4285 }, { "epoch": 0.20961082744973494, "grad_norm": 0.13933752477169037, "learning_rate": 4.859495168162758e-05, "loss": 0.0701, "num_input_tokens_seen": 5569824, "step": 4290 }, { "epoch": 0.20985512911342927, "grad_norm": 0.24084554612636566, "learning_rate": 4.859170496407054e-05, "loss": 0.1056, "num_input_tokens_seen": 5576384, "step": 4295 }, { "epoch": 0.2100994307771236, "grad_norm": 0.8092536330223083, "learning_rate": 4.8588454608376114e-05, "loss": 0.1081, "num_input_tokens_seen": 5582656, "step": 4300 }, { "epoch": 0.21034373244081792, "grad_norm": 0.24642378091812134, "learning_rate": 4.8585200615045555e-05, "loss": 0.0832, "num_input_tokens_seen": 5589312, "step": 4305 }, { "epoch": 0.21058803410451224, "grad_norm": 0.1669519990682602, "learning_rate": 4.8581942984580674e-05, "loss": 0.0742, "num_input_tokens_seen": 5596000, "step": 4310 }, { "epoch": 0.2108323357682066, "grad_norm": 0.27017292380332947, "learning_rate": 4.857868171748384e-05, "loss": 0.114, "num_input_tokens_seen": 5602528, "step": 4315 }, { "epoch": 0.21107663743190092, "grad_norm": 0.3409595489501953, "learning_rate": 4.8575416814257976e-05, "loss": 0.1053, "num_input_tokens_seen": 5609184, "step": 4320 }, { "epoch": 0.21132093909559524, "grad_norm": 1.0198845863342285, "learning_rate": 4.857214827540657e-05, "loss": 0.1291, "num_input_tokens_seen": 5615648, "step": 4325 }, { "epoch": 0.21156524075928956, "grad_norm": 0.36481788754463196, "learning_rate": 4.856887610143367e-05, "loss": 0.089, "num_input_tokens_seen": 5622432, "step": 4330 }, { "epoch": 0.2118095424229839, "grad_norm": 0.6144345998764038, "learning_rate": 4.8565600292843896e-05, "loss": 0.0956, "num_input_tokens_seen": 5628832, "step": 4335 }, { "epoch": 0.21205384408667824, "grad_norm": 0.3283021152019501, "learning_rate": 4.856232085014241e-05, "loss": 0.1204, "num_input_tokens_seen": 5634912, "step": 4340 }, { "epoch": 0.21229814575037256, "grad_norm": 0.7142403721809387, "learning_rate": 4.855903777383495e-05, "loss": 0.1019, "num_input_tokens_seen": 5641184, "step": 4345 }, { "epoch": 0.2125424474140669, "grad_norm": 0.3685157895088196, "learning_rate": 4.85557510644278e-05, "loss": 0.1185, "num_input_tokens_seen": 5647648, "step": 4350 }, { "epoch": 0.2127867490777612, "grad_norm": 0.36058473587036133, "learning_rate": 4.855246072242782e-05, "loss": 0.1355, "num_input_tokens_seen": 5653920, "step": 4355 }, { "epoch": 0.21303105074145554, "grad_norm": 0.25963449478149414, "learning_rate": 4.8549166748342414e-05, "loss": 0.082, "num_input_tokens_seen": 5660320, "step": 4360 }, { "epoch": 0.2132753524051499, "grad_norm": 0.30008625984191895, "learning_rate": 4.8545869142679556e-05, "loss": 0.1147, "num_input_tokens_seen": 5666464, "step": 4365 }, { "epoch": 0.2135196540688442, "grad_norm": 0.14624229073524475, "learning_rate": 4.8542567905947776e-05, "loss": 0.078, "num_input_tokens_seen": 5672992, "step": 4370 }, { "epoch": 0.21376395573253854, "grad_norm": 0.35546597838401794, "learning_rate": 4.853926303865618e-05, "loss": 0.1193, "num_input_tokens_seen": 5679520, "step": 4375 }, { "epoch": 0.21400825739623286, "grad_norm": 0.3166263997554779, "learning_rate": 4.853595454131441e-05, "loss": 0.0955, "num_input_tokens_seen": 5685696, "step": 4380 }, { "epoch": 0.2142525590599272, "grad_norm": 0.45098671317100525, "learning_rate": 4.8532642414432674e-05, "loss": 0.1004, "num_input_tokens_seen": 5691968, "step": 4385 }, { "epoch": 0.21449686072362154, "grad_norm": 0.189096137881279, "learning_rate": 4.8529326658521754e-05, "loss": 0.096, "num_input_tokens_seen": 5698144, "step": 4390 }, { "epoch": 0.21474116238731586, "grad_norm": 0.1685340255498886, "learning_rate": 4.8526007274092965e-05, "loss": 0.0774, "num_input_tokens_seen": 5704800, "step": 4395 }, { "epoch": 0.21498546405101018, "grad_norm": 0.6751738786697388, "learning_rate": 4.852268426165822e-05, "loss": 0.1135, "num_input_tokens_seen": 5711648, "step": 4400 }, { "epoch": 0.21498546405101018, "eval_loss": 0.10004128515720367, "eval_runtime": 375.2471, "eval_samples_per_second": 96.963, "eval_steps_per_second": 24.243, "num_input_tokens_seen": 5711648, "step": 4400 }, { "epoch": 0.2152297657147045, "grad_norm": 0.5805772542953491, "learning_rate": 4.851935762172995e-05, "loss": 0.1035, "num_input_tokens_seen": 5718112, "step": 4405 }, { "epoch": 0.21547406737839886, "grad_norm": 0.2609540522098541, "learning_rate": 4.8516027354821175e-05, "loss": 0.1205, "num_input_tokens_seen": 5724576, "step": 4410 }, { "epoch": 0.21571836904209318, "grad_norm": 0.4593774080276489, "learning_rate": 4.851269346144546e-05, "loss": 0.077, "num_input_tokens_seen": 5730880, "step": 4415 }, { "epoch": 0.2159626707057875, "grad_norm": 0.5745564699172974, "learning_rate": 4.850935594211693e-05, "loss": 0.1227, "num_input_tokens_seen": 5737568, "step": 4420 }, { "epoch": 0.21620697236948183, "grad_norm": 0.306925892829895, "learning_rate": 4.850601479735029e-05, "loss": 0.1202, "num_input_tokens_seen": 5744000, "step": 4425 }, { "epoch": 0.21645127403317616, "grad_norm": 0.21234866976737976, "learning_rate": 4.850267002766076e-05, "loss": 0.0921, "num_input_tokens_seen": 5750560, "step": 4430 }, { "epoch": 0.2166955756968705, "grad_norm": 0.18064486980438232, "learning_rate": 4.849932163356417e-05, "loss": 0.1139, "num_input_tokens_seen": 5756800, "step": 4435 }, { "epoch": 0.21693987736056483, "grad_norm": 0.1552189737558365, "learning_rate": 4.8495969615576864e-05, "loss": 0.1129, "num_input_tokens_seen": 5762976, "step": 4440 }, { "epoch": 0.21718417902425916, "grad_norm": 0.2565259635448456, "learning_rate": 4.849261397421577e-05, "loss": 0.0873, "num_input_tokens_seen": 5770176, "step": 4445 }, { "epoch": 0.21742848068795348, "grad_norm": 0.6522253155708313, "learning_rate": 4.848925470999839e-05, "loss": 0.115, "num_input_tokens_seen": 5776512, "step": 4450 }, { "epoch": 0.2176727823516478, "grad_norm": 0.2482023686170578, "learning_rate": 4.848589182344273e-05, "loss": 0.1052, "num_input_tokens_seen": 5782720, "step": 4455 }, { "epoch": 0.21791708401534216, "grad_norm": 0.48158565163612366, "learning_rate": 4.848252531506742e-05, "loss": 0.0954, "num_input_tokens_seen": 5788736, "step": 4460 }, { "epoch": 0.21816138567903648, "grad_norm": 0.25046709179878235, "learning_rate": 4.847915518539161e-05, "loss": 0.091, "num_input_tokens_seen": 5795328, "step": 4465 }, { "epoch": 0.2184056873427308, "grad_norm": 0.6639329195022583, "learning_rate": 4.847578143493501e-05, "loss": 0.0986, "num_input_tokens_seen": 5801344, "step": 4470 }, { "epoch": 0.21864998900642513, "grad_norm": 0.2702140510082245, "learning_rate": 4.847240406421789e-05, "loss": 0.1288, "num_input_tokens_seen": 5807776, "step": 4475 }, { "epoch": 0.21889429067011945, "grad_norm": 0.24862462282180786, "learning_rate": 4.84690230737611e-05, "loss": 0.0931, "num_input_tokens_seen": 5814304, "step": 4480 }, { "epoch": 0.2191385923338138, "grad_norm": 0.2686789333820343, "learning_rate": 4.846563846408602e-05, "loss": 0.0846, "num_input_tokens_seen": 5820992, "step": 4485 }, { "epoch": 0.21938289399750813, "grad_norm": 0.6730853915214539, "learning_rate": 4.84622502357146e-05, "loss": 0.1252, "num_input_tokens_seen": 5827840, "step": 4490 }, { "epoch": 0.21962719566120245, "grad_norm": 0.6536193490028381, "learning_rate": 4.8458858389169345e-05, "loss": 0.0992, "num_input_tokens_seen": 5833920, "step": 4495 }, { "epoch": 0.21987149732489678, "grad_norm": 0.14918817579746246, "learning_rate": 4.8455462924973334e-05, "loss": 0.0958, "num_input_tokens_seen": 5840064, "step": 4500 }, { "epoch": 0.2201157989885911, "grad_norm": 0.24204787611961365, "learning_rate": 4.845206384365018e-05, "loss": 0.1052, "num_input_tokens_seen": 5846400, "step": 4505 }, { "epoch": 0.22036010065228545, "grad_norm": 0.670293390750885, "learning_rate": 4.844866114572405e-05, "loss": 0.1142, "num_input_tokens_seen": 5852544, "step": 4510 }, { "epoch": 0.22060440231597978, "grad_norm": 0.40738365054130554, "learning_rate": 4.8445254831719706e-05, "loss": 0.1133, "num_input_tokens_seen": 5858464, "step": 4515 }, { "epoch": 0.2208487039796741, "grad_norm": 0.4003482758998871, "learning_rate": 4.8441844902162434e-05, "loss": 0.0876, "num_input_tokens_seen": 5864864, "step": 4520 }, { "epoch": 0.22109300564336842, "grad_norm": 0.4605056047439575, "learning_rate": 4.843843135757809e-05, "loss": 0.0804, "num_input_tokens_seen": 5871552, "step": 4525 }, { "epoch": 0.22133730730706275, "grad_norm": 0.379544198513031, "learning_rate": 4.843501419849308e-05, "loss": 0.0947, "num_input_tokens_seen": 5877888, "step": 4530 }, { "epoch": 0.2215816089707571, "grad_norm": 0.1595461070537567, "learning_rate": 4.8431593425434386e-05, "loss": 0.0921, "num_input_tokens_seen": 5884416, "step": 4535 }, { "epoch": 0.22182591063445142, "grad_norm": 0.28352370858192444, "learning_rate": 4.8428169038929526e-05, "loss": 0.082, "num_input_tokens_seen": 5891456, "step": 4540 }, { "epoch": 0.22207021229814575, "grad_norm": 0.2848389446735382, "learning_rate": 4.8424741039506575e-05, "loss": 0.0865, "num_input_tokens_seen": 5897728, "step": 4545 }, { "epoch": 0.22231451396184007, "grad_norm": 0.43499240279197693, "learning_rate": 4.842130942769419e-05, "loss": 0.0863, "num_input_tokens_seen": 5904640, "step": 4550 }, { "epoch": 0.22255881562553442, "grad_norm": 0.5484145283699036, "learning_rate": 4.841787420402156e-05, "loss": 0.0923, "num_input_tokens_seen": 5910976, "step": 4555 }, { "epoch": 0.22280311728922875, "grad_norm": 0.16997934877872467, "learning_rate": 4.841443536901844e-05, "loss": 0.1326, "num_input_tokens_seen": 5917248, "step": 4560 }, { "epoch": 0.22304741895292307, "grad_norm": 0.5416204333305359, "learning_rate": 4.841099292321514e-05, "loss": 0.096, "num_input_tokens_seen": 5923520, "step": 4565 }, { "epoch": 0.2232917206166174, "grad_norm": 0.13790039718151093, "learning_rate": 4.8407546867142525e-05, "loss": 0.0824, "num_input_tokens_seen": 5929792, "step": 4570 }, { "epoch": 0.22353602228031172, "grad_norm": 0.18645775318145752, "learning_rate": 4.840409720133203e-05, "loss": 0.0981, "num_input_tokens_seen": 5936416, "step": 4575 }, { "epoch": 0.22378032394400607, "grad_norm": 0.3153879642486572, "learning_rate": 4.8400643926315634e-05, "loss": 0.0991, "num_input_tokens_seen": 5943360, "step": 4580 }, { "epoch": 0.2240246256077004, "grad_norm": 0.2689094543457031, "learning_rate": 4.839718704262587e-05, "loss": 0.0856, "num_input_tokens_seen": 5950080, "step": 4585 }, { "epoch": 0.22426892727139472, "grad_norm": 0.37858590483665466, "learning_rate": 4.839372655079585e-05, "loss": 0.084, "num_input_tokens_seen": 5957280, "step": 4590 }, { "epoch": 0.22451322893508904, "grad_norm": 0.30640289187431335, "learning_rate": 4.83902624513592e-05, "loss": 0.0972, "num_input_tokens_seen": 5963712, "step": 4595 }, { "epoch": 0.22475753059878337, "grad_norm": 0.32096734642982483, "learning_rate": 4.838679474485014e-05, "loss": 0.0905, "num_input_tokens_seen": 5970048, "step": 4600 }, { "epoch": 0.22475753059878337, "eval_loss": 0.1000811830163002, "eval_runtime": 374.0627, "eval_samples_per_second": 97.27, "eval_steps_per_second": 24.319, "num_input_tokens_seen": 5970048, "step": 4600 }, { "epoch": 0.22500183226247772, "grad_norm": 0.33103200793266296, "learning_rate": 4.838332343180343e-05, "loss": 0.0892, "num_input_tokens_seen": 5976416, "step": 4605 }, { "epoch": 0.22524613392617204, "grad_norm": 0.23447400331497192, "learning_rate": 4.83798485127544e-05, "loss": 0.0757, "num_input_tokens_seen": 5982976, "step": 4610 }, { "epoch": 0.22549043558986637, "grad_norm": 0.3131433129310608, "learning_rate": 4.837636998823892e-05, "loss": 0.1218, "num_input_tokens_seen": 5989152, "step": 4615 }, { "epoch": 0.2257347372535607, "grad_norm": 0.1896740198135376, "learning_rate": 4.8372887858793414e-05, "loss": 0.1092, "num_input_tokens_seen": 5995552, "step": 4620 }, { "epoch": 0.22597903891725502, "grad_norm": 0.2269807755947113, "learning_rate": 4.836940212495489e-05, "loss": 0.1017, "num_input_tokens_seen": 6001824, "step": 4625 }, { "epoch": 0.22622334058094937, "grad_norm": 0.3196618854999542, "learning_rate": 4.836591278726087e-05, "loss": 0.0727, "num_input_tokens_seen": 6007936, "step": 4630 }, { "epoch": 0.2264676422446437, "grad_norm": 0.6485865116119385, "learning_rate": 4.836241984624947e-05, "loss": 0.08, "num_input_tokens_seen": 6014496, "step": 4635 }, { "epoch": 0.22671194390833802, "grad_norm": 0.4322064518928528, "learning_rate": 4.8358923302459336e-05, "loss": 0.0978, "num_input_tokens_seen": 6020928, "step": 4640 }, { "epoch": 0.22695624557203234, "grad_norm": 0.5506956577301025, "learning_rate": 4.835542315642968e-05, "loss": 0.0951, "num_input_tokens_seen": 6027456, "step": 4645 }, { "epoch": 0.22720054723572666, "grad_norm": 0.679858922958374, "learning_rate": 4.8351919408700274e-05, "loss": 0.1119, "num_input_tokens_seen": 6033632, "step": 4650 }, { "epoch": 0.22744484889942101, "grad_norm": 0.21280938386917114, "learning_rate": 4.834841205981144e-05, "loss": 0.1193, "num_input_tokens_seen": 6039936, "step": 4655 }, { "epoch": 0.22768915056311534, "grad_norm": 0.4698774516582489, "learning_rate": 4.8344901110304054e-05, "loss": 0.0914, "num_input_tokens_seen": 6046848, "step": 4660 }, { "epoch": 0.22793345222680966, "grad_norm": 0.6195977926254272, "learning_rate": 4.8341386560719534e-05, "loss": 0.1166, "num_input_tokens_seen": 6052736, "step": 4665 }, { "epoch": 0.228177753890504, "grad_norm": 0.2780122756958008, "learning_rate": 4.833786841159989e-05, "loss": 0.1013, "num_input_tokens_seen": 6059008, "step": 4670 }, { "epoch": 0.2284220555541983, "grad_norm": 0.21261584758758545, "learning_rate": 4.833434666348765e-05, "loss": 0.0669, "num_input_tokens_seen": 6065536, "step": 4675 }, { "epoch": 0.22866635721789266, "grad_norm": 0.9506212472915649, "learning_rate": 4.833082131692592e-05, "loss": 0.1198, "num_input_tokens_seen": 6072288, "step": 4680 }, { "epoch": 0.228910658881587, "grad_norm": 0.29631468653678894, "learning_rate": 4.832729237245835e-05, "loss": 0.1094, "num_input_tokens_seen": 6078560, "step": 4685 }, { "epoch": 0.2291549605452813, "grad_norm": 0.17984680831432343, "learning_rate": 4.8323759830629145e-05, "loss": 0.0725, "num_input_tokens_seen": 6084960, "step": 4690 }, { "epoch": 0.22939926220897564, "grad_norm": 0.2347131371498108, "learning_rate": 4.8320223691983066e-05, "loss": 0.0948, "num_input_tokens_seen": 6091392, "step": 4695 }, { "epoch": 0.22964356387266996, "grad_norm": 0.19370122253894806, "learning_rate": 4.831668395706544e-05, "loss": 0.097, "num_input_tokens_seen": 6097728, "step": 4700 }, { "epoch": 0.2298878655363643, "grad_norm": 0.34870609641075134, "learning_rate": 4.8313140626422125e-05, "loss": 0.0951, "num_input_tokens_seen": 6104896, "step": 4705 }, { "epoch": 0.23013216720005863, "grad_norm": 0.2417076677083969, "learning_rate": 4.830959370059956e-05, "loss": 0.0995, "num_input_tokens_seen": 6111456, "step": 4710 }, { "epoch": 0.23037646886375296, "grad_norm": 0.6610593199729919, "learning_rate": 4.830604318014472e-05, "loss": 0.1142, "num_input_tokens_seen": 6117664, "step": 4715 }, { "epoch": 0.23062077052744728, "grad_norm": 0.3310040533542633, "learning_rate": 4.830248906560514e-05, "loss": 0.0977, "num_input_tokens_seen": 6124224, "step": 4720 }, { "epoch": 0.23086507219114163, "grad_norm": 0.6234028935432434, "learning_rate": 4.829893135752891e-05, "loss": 0.0929, "num_input_tokens_seen": 6130720, "step": 4725 }, { "epoch": 0.23110937385483596, "grad_norm": 0.4643978774547577, "learning_rate": 4.829537005646466e-05, "loss": 0.0959, "num_input_tokens_seen": 6136992, "step": 4730 }, { "epoch": 0.23135367551853028, "grad_norm": 0.2521135210990906, "learning_rate": 4.8291805162961615e-05, "loss": 0.0925, "num_input_tokens_seen": 6143264, "step": 4735 }, { "epoch": 0.2315979771822246, "grad_norm": 0.1642276495695114, "learning_rate": 4.82882366775695e-05, "loss": 0.1038, "num_input_tokens_seen": 6149792, "step": 4740 }, { "epoch": 0.23184227884591893, "grad_norm": 0.290963739156723, "learning_rate": 4.828466460083864e-05, "loss": 0.1057, "num_input_tokens_seen": 6156160, "step": 4745 }, { "epoch": 0.23208658050961328, "grad_norm": 0.6243970394134521, "learning_rate": 4.8281088933319877e-05, "loss": 0.1315, "num_input_tokens_seen": 6162336, "step": 4750 }, { "epoch": 0.2323308821733076, "grad_norm": 0.2341538518667221, "learning_rate": 4.827750967556464e-05, "loss": 0.0549, "num_input_tokens_seen": 6168896, "step": 4755 }, { "epoch": 0.23257518383700193, "grad_norm": 0.26842576265335083, "learning_rate": 4.827392682812488e-05, "loss": 0.1023, "num_input_tokens_seen": 6175200, "step": 4760 }, { "epoch": 0.23281948550069625, "grad_norm": 0.15265341103076935, "learning_rate": 4.827034039155312e-05, "loss": 0.066, "num_input_tokens_seen": 6181664, "step": 4765 }, { "epoch": 0.23306378716439058, "grad_norm": 0.6001848578453064, "learning_rate": 4.8266750366402445e-05, "loss": 0.0967, "num_input_tokens_seen": 6188032, "step": 4770 }, { "epoch": 0.23330808882808493, "grad_norm": 0.22116141021251678, "learning_rate": 4.8263156753226476e-05, "loss": 0.0819, "num_input_tokens_seen": 6194560, "step": 4775 }, { "epoch": 0.23355239049177925, "grad_norm": 0.3131448030471802, "learning_rate": 4.8259559552579394e-05, "loss": 0.1003, "num_input_tokens_seen": 6201024, "step": 4780 }, { "epoch": 0.23379669215547358, "grad_norm": 0.2262118011713028, "learning_rate": 4.825595876501593e-05, "loss": 0.1008, "num_input_tokens_seen": 6207392, "step": 4785 }, { "epoch": 0.2340409938191679, "grad_norm": 0.24905925989151, "learning_rate": 4.825235439109137e-05, "loss": 0.1074, "num_input_tokens_seen": 6213792, "step": 4790 }, { "epoch": 0.23428529548286223, "grad_norm": 0.4233328104019165, "learning_rate": 4.824874643136156e-05, "loss": 0.0918, "num_input_tokens_seen": 6220320, "step": 4795 }, { "epoch": 0.23452959714655658, "grad_norm": 0.5418888926506042, "learning_rate": 4.824513488638288e-05, "loss": 0.0945, "num_input_tokens_seen": 6226272, "step": 4800 }, { "epoch": 0.23452959714655658, "eval_loss": 0.09923525899648666, "eval_runtime": 374.5353, "eval_samples_per_second": 97.147, "eval_steps_per_second": 24.289, "num_input_tokens_seen": 6226272, "step": 4800 }, { "epoch": 0.2347738988102509, "grad_norm": 0.40068086981773376, "learning_rate": 4.8241519756712293e-05, "loss": 0.096, "num_input_tokens_seen": 6232832, "step": 4805 }, { "epoch": 0.23501820047394523, "grad_norm": 0.2445489764213562, "learning_rate": 4.8237901042907285e-05, "loss": 0.0958, "num_input_tokens_seen": 6239072, "step": 4810 }, { "epoch": 0.23526250213763955, "grad_norm": 0.1709393858909607, "learning_rate": 4.823427874552591e-05, "loss": 0.1086, "num_input_tokens_seen": 6245344, "step": 4815 }, { "epoch": 0.23550680380133387, "grad_norm": 0.15210258960723877, "learning_rate": 4.823065286512677e-05, "loss": 0.0939, "num_input_tokens_seen": 6251648, "step": 4820 }, { "epoch": 0.23575110546502823, "grad_norm": 0.21129359304904938, "learning_rate": 4.8227023402269025e-05, "loss": 0.1081, "num_input_tokens_seen": 6258048, "step": 4825 }, { "epoch": 0.23599540712872255, "grad_norm": 0.4030386507511139, "learning_rate": 4.822339035751239e-05, "loss": 0.0794, "num_input_tokens_seen": 6264448, "step": 4830 }, { "epoch": 0.23623970879241687, "grad_norm": 0.27764689922332764, "learning_rate": 4.8219753731417104e-05, "loss": 0.0685, "num_input_tokens_seen": 6271296, "step": 4835 }, { "epoch": 0.2364840104561112, "grad_norm": 0.6659578680992126, "learning_rate": 4.821611352454401e-05, "loss": 0.1222, "num_input_tokens_seen": 6277888, "step": 4840 }, { "epoch": 0.23672831211980552, "grad_norm": 0.6276007294654846, "learning_rate": 4.8212469737454444e-05, "loss": 0.1079, "num_input_tokens_seen": 6284416, "step": 4845 }, { "epoch": 0.23697261378349987, "grad_norm": 0.16919203102588654, "learning_rate": 4.820882237071035e-05, "loss": 0.1136, "num_input_tokens_seen": 6290976, "step": 4850 }, { "epoch": 0.2372169154471942, "grad_norm": 0.2960776388645172, "learning_rate": 4.820517142487417e-05, "loss": 0.0987, "num_input_tokens_seen": 6297248, "step": 4855 }, { "epoch": 0.23746121711088852, "grad_norm": 0.16876313090324402, "learning_rate": 4.8201516900508956e-05, "loss": 0.0843, "num_input_tokens_seen": 6303968, "step": 4860 }, { "epoch": 0.23770551877458285, "grad_norm": 0.2559508681297302, "learning_rate": 4.819785879817827e-05, "loss": 0.0991, "num_input_tokens_seen": 6310400, "step": 4865 }, { "epoch": 0.23794982043827717, "grad_norm": 0.28029659390449524, "learning_rate": 4.8194197118446226e-05, "loss": 0.1099, "num_input_tokens_seen": 6316864, "step": 4870 }, { "epoch": 0.23819412210197152, "grad_norm": 0.11515801399946213, "learning_rate": 4.819053186187752e-05, "loss": 0.0748, "num_input_tokens_seen": 6323744, "step": 4875 }, { "epoch": 0.23843842376566585, "grad_norm": 0.41088956594467163, "learning_rate": 4.818686302903736e-05, "loss": 0.1007, "num_input_tokens_seen": 6330272, "step": 4880 }, { "epoch": 0.23868272542936017, "grad_norm": 0.2706381380558014, "learning_rate": 4.818319062049154e-05, "loss": 0.1322, "num_input_tokens_seen": 6336800, "step": 4885 }, { "epoch": 0.2389270270930545, "grad_norm": 0.1707540601491928, "learning_rate": 4.817951463680639e-05, "loss": 0.0987, "num_input_tokens_seen": 6343008, "step": 4890 }, { "epoch": 0.23917132875674885, "grad_norm": 1.0068238973617554, "learning_rate": 4.817583507854879e-05, "loss": 0.0952, "num_input_tokens_seen": 6349280, "step": 4895 }, { "epoch": 0.23941563042044317, "grad_norm": 0.44353899359703064, "learning_rate": 4.817215194628617e-05, "loss": 0.1079, "num_input_tokens_seen": 6355808, "step": 4900 }, { "epoch": 0.2396599320841375, "grad_norm": 0.46983814239501953, "learning_rate": 4.816846524058653e-05, "loss": 0.0736, "num_input_tokens_seen": 6362208, "step": 4905 }, { "epoch": 0.23990423374783182, "grad_norm": 0.37135106325149536, "learning_rate": 4.816477496201839e-05, "loss": 0.1158, "num_input_tokens_seen": 6368576, "step": 4910 }, { "epoch": 0.24014853541152614, "grad_norm": 0.1776648461818695, "learning_rate": 4.8161081111150845e-05, "loss": 0.1034, "num_input_tokens_seen": 6374656, "step": 4915 }, { "epoch": 0.2403928370752205, "grad_norm": 0.41822242736816406, "learning_rate": 4.815738368855354e-05, "loss": 0.0991, "num_input_tokens_seen": 6381312, "step": 4920 }, { "epoch": 0.24063713873891482, "grad_norm": 0.6213194727897644, "learning_rate": 4.815368269479664e-05, "loss": 0.0908, "num_input_tokens_seen": 6388224, "step": 4925 }, { "epoch": 0.24088144040260914, "grad_norm": 0.2502274215221405, "learning_rate": 4.814997813045092e-05, "loss": 0.1021, "num_input_tokens_seen": 6394336, "step": 4930 }, { "epoch": 0.24112574206630347, "grad_norm": 0.2562873363494873, "learning_rate": 4.814626999608764e-05, "loss": 0.1327, "num_input_tokens_seen": 6400512, "step": 4935 }, { "epoch": 0.2413700437299978, "grad_norm": 0.3329147398471832, "learning_rate": 4.814255829227865e-05, "loss": 0.0953, "num_input_tokens_seen": 6407200, "step": 4940 }, { "epoch": 0.24161434539369214, "grad_norm": 0.114528588950634, "learning_rate": 4.813884301959635e-05, "loss": 0.084, "num_input_tokens_seen": 6413984, "step": 4945 }, { "epoch": 0.24185864705738647, "grad_norm": 0.21531781554222107, "learning_rate": 4.813512417861368e-05, "loss": 0.0948, "num_input_tokens_seen": 6420704, "step": 4950 }, { "epoch": 0.2421029487210808, "grad_norm": 0.22373464703559875, "learning_rate": 4.813140176990411e-05, "loss": 0.1048, "num_input_tokens_seen": 6427168, "step": 4955 }, { "epoch": 0.24234725038477511, "grad_norm": 0.28376305103302, "learning_rate": 4.8127675794041714e-05, "loss": 0.1038, "num_input_tokens_seen": 6433664, "step": 4960 }, { "epoch": 0.24259155204846944, "grad_norm": 0.33331298828125, "learning_rate": 4.812394625160107e-05, "loss": 0.0765, "num_input_tokens_seen": 6439744, "step": 4965 }, { "epoch": 0.2428358537121638, "grad_norm": 0.18574196100234985, "learning_rate": 4.812021314315732e-05, "loss": 0.0757, "num_input_tokens_seen": 6446528, "step": 4970 }, { "epoch": 0.24308015537585811, "grad_norm": 0.3322915732860565, "learning_rate": 4.811647646928616e-05, "loss": 0.1002, "num_input_tokens_seen": 6452896, "step": 4975 }, { "epoch": 0.24332445703955244, "grad_norm": 0.5332229733467102, "learning_rate": 4.8112736230563814e-05, "loss": 0.1106, "num_input_tokens_seen": 6459424, "step": 4980 }, { "epoch": 0.24356875870324676, "grad_norm": 0.23082447052001953, "learning_rate": 4.81089924275671e-05, "loss": 0.109, "num_input_tokens_seen": 6466208, "step": 4985 }, { "epoch": 0.2438130603669411, "grad_norm": 0.6705574989318848, "learning_rate": 4.810524506087335e-05, "loss": 0.0987, "num_input_tokens_seen": 6473152, "step": 4990 }, { "epoch": 0.24405736203063544, "grad_norm": 0.19442453980445862, "learning_rate": 4.810149413106044e-05, "loss": 0.0787, "num_input_tokens_seen": 6479680, "step": 4995 }, { "epoch": 0.24430166369432976, "grad_norm": 0.20138664543628693, "learning_rate": 4.809773963870684e-05, "loss": 0.0722, "num_input_tokens_seen": 6486336, "step": 5000 }, { "epoch": 0.24430166369432976, "eval_loss": 0.10016938298940659, "eval_runtime": 374.6549, "eval_samples_per_second": 97.116, "eval_steps_per_second": 24.281, "num_input_tokens_seen": 6486336, "step": 5000 }, { "epoch": 0.2445459653580241, "grad_norm": 0.34115108847618103, "learning_rate": 4.809398158439151e-05, "loss": 0.1157, "num_input_tokens_seen": 6492992, "step": 5005 }, { "epoch": 0.2447902670217184, "grad_norm": 0.20131319761276245, "learning_rate": 4.8090219968694005e-05, "loss": 0.0698, "num_input_tokens_seen": 6499520, "step": 5010 }, { "epoch": 0.24503456868541273, "grad_norm": 0.3498467206954956, "learning_rate": 4.808645479219442e-05, "loss": 0.0887, "num_input_tokens_seen": 6505568, "step": 5015 }, { "epoch": 0.2452788703491071, "grad_norm": 0.6198288202285767, "learning_rate": 4.8082686055473375e-05, "loss": 0.0811, "num_input_tokens_seen": 6512160, "step": 5020 }, { "epoch": 0.2455231720128014, "grad_norm": 0.2188481092453003, "learning_rate": 4.8078913759112066e-05, "loss": 0.1074, "num_input_tokens_seen": 6518144, "step": 5025 }, { "epoch": 0.24576747367649573, "grad_norm": 0.22426393628120422, "learning_rate": 4.807513790369223e-05, "loss": 0.0672, "num_input_tokens_seen": 6524576, "step": 5030 }, { "epoch": 0.24601177534019006, "grad_norm": 0.16184037923812866, "learning_rate": 4.8071358489796145e-05, "loss": 0.117, "num_input_tokens_seen": 6531008, "step": 5035 }, { "epoch": 0.24625607700388438, "grad_norm": 0.20978477597236633, "learning_rate": 4.806757551800665e-05, "loss": 0.0984, "num_input_tokens_seen": 6537344, "step": 5040 }, { "epoch": 0.24650037866757873, "grad_norm": 0.22238509356975555, "learning_rate": 4.806378898890713e-05, "loss": 0.0725, "num_input_tokens_seen": 6543904, "step": 5045 }, { "epoch": 0.24674468033127306, "grad_norm": 0.8903068900108337, "learning_rate": 4.80599989030815e-05, "loss": 0.1, "num_input_tokens_seen": 6550272, "step": 5050 }, { "epoch": 0.24698898199496738, "grad_norm": 0.33753281831741333, "learning_rate": 4.805620526111426e-05, "loss": 0.0847, "num_input_tokens_seen": 6556608, "step": 5055 }, { "epoch": 0.2472332836586617, "grad_norm": 0.2695518732070923, "learning_rate": 4.805240806359042e-05, "loss": 0.0973, "num_input_tokens_seen": 6563168, "step": 5060 }, { "epoch": 0.24747758532235606, "grad_norm": 0.2571481168270111, "learning_rate": 4.804860731109557e-05, "loss": 0.0934, "num_input_tokens_seen": 6569472, "step": 5065 }, { "epoch": 0.24772188698605038, "grad_norm": 0.23588858544826508, "learning_rate": 4.804480300421581e-05, "loss": 0.1001, "num_input_tokens_seen": 6575968, "step": 5070 }, { "epoch": 0.2479661886497447, "grad_norm": 0.22669905424118042, "learning_rate": 4.804099514353784e-05, "loss": 0.0872, "num_input_tokens_seen": 6582112, "step": 5075 }, { "epoch": 0.24821049031343903, "grad_norm": 0.17712034285068512, "learning_rate": 4.8037183729648867e-05, "loss": 0.0748, "num_input_tokens_seen": 6588576, "step": 5080 }, { "epoch": 0.24845479197713335, "grad_norm": 0.15757682919502258, "learning_rate": 4.803336876313666e-05, "loss": 0.096, "num_input_tokens_seen": 6594816, "step": 5085 }, { "epoch": 0.2486990936408277, "grad_norm": 0.6450384259223938, "learning_rate": 4.802955024458953e-05, "loss": 0.0888, "num_input_tokens_seen": 6601184, "step": 5090 }, { "epoch": 0.24894339530452203, "grad_norm": 0.11929396539926529, "learning_rate": 4.802572817459634e-05, "loss": 0.0772, "num_input_tokens_seen": 6607616, "step": 5095 }, { "epoch": 0.24918769696821635, "grad_norm": 0.5162287354469299, "learning_rate": 4.802190255374651e-05, "loss": 0.0821, "num_input_tokens_seen": 6613984, "step": 5100 }, { "epoch": 0.24943199863191068, "grad_norm": 0.5831402540206909, "learning_rate": 4.801807338263e-05, "loss": 0.0922, "num_input_tokens_seen": 6620544, "step": 5105 }, { "epoch": 0.249676300295605, "grad_norm": 0.3629477918148041, "learning_rate": 4.8014240661837306e-05, "loss": 0.1053, "num_input_tokens_seen": 6627232, "step": 5110 }, { "epoch": 0.24992060195929935, "grad_norm": 0.445982426404953, "learning_rate": 4.80104043919595e-05, "loss": 0.1147, "num_input_tokens_seen": 6633728, "step": 5115 }, { "epoch": 0.25016490362299365, "grad_norm": 0.5254073739051819, "learning_rate": 4.800656457358815e-05, "loss": 0.1091, "num_input_tokens_seen": 6640288, "step": 5120 }, { "epoch": 0.250409205286688, "grad_norm": 0.48671668767929077, "learning_rate": 4.800272120731544e-05, "loss": 0.1077, "num_input_tokens_seen": 6647072, "step": 5125 }, { "epoch": 0.25065350695038235, "grad_norm": 0.5298551917076111, "learning_rate": 4.799887429373404e-05, "loss": 0.0922, "num_input_tokens_seen": 6654112, "step": 5130 }, { "epoch": 0.2508978086140767, "grad_norm": 0.25157999992370605, "learning_rate": 4.79950238334372e-05, "loss": 0.0901, "num_input_tokens_seen": 6661056, "step": 5135 }, { "epoch": 0.251142110277771, "grad_norm": 0.17787069082260132, "learning_rate": 4.799116982701872e-05, "loss": 0.1198, "num_input_tokens_seen": 6667648, "step": 5140 }, { "epoch": 0.2513864119414653, "grad_norm": 0.9013302326202393, "learning_rate": 4.7987312275072926e-05, "loss": 0.0982, "num_input_tokens_seen": 6674208, "step": 5145 }, { "epoch": 0.25163071360515965, "grad_norm": 0.15136215090751648, "learning_rate": 4.79834511781947e-05, "loss": 0.0924, "num_input_tokens_seen": 6680544, "step": 5150 }, { "epoch": 0.251875015268854, "grad_norm": 0.38733071088790894, "learning_rate": 4.797958653697947e-05, "loss": 0.1295, "num_input_tokens_seen": 6687008, "step": 5155 }, { "epoch": 0.2521193169325483, "grad_norm": 0.13547161221504211, "learning_rate": 4.7975718352023225e-05, "loss": 0.1057, "num_input_tokens_seen": 6693248, "step": 5160 }, { "epoch": 0.2523636185962426, "grad_norm": 0.25823816657066345, "learning_rate": 4.7971846623922476e-05, "loss": 0.0961, "num_input_tokens_seen": 6700064, "step": 5165 }, { "epoch": 0.25260792025993695, "grad_norm": 0.16175507009029388, "learning_rate": 4.7967971353274294e-05, "loss": 0.0981, "num_input_tokens_seen": 6706528, "step": 5170 }, { "epoch": 0.2528522219236313, "grad_norm": 0.12653838098049164, "learning_rate": 4.79640925406763e-05, "loss": 0.0834, "num_input_tokens_seen": 6712832, "step": 5175 }, { "epoch": 0.25309652358732565, "grad_norm": 0.4269203245639801, "learning_rate": 4.796021018672664e-05, "loss": 0.116, "num_input_tokens_seen": 6719200, "step": 5180 }, { "epoch": 0.25334082525102, "grad_norm": 0.9791394472122192, "learning_rate": 4.795632429202405e-05, "loss": 0.1161, "num_input_tokens_seen": 6725440, "step": 5185 }, { "epoch": 0.2535851269147143, "grad_norm": 0.14905744791030884, "learning_rate": 4.795243485716775e-05, "loss": 0.0965, "num_input_tokens_seen": 6732064, "step": 5190 }, { "epoch": 0.2538294285784086, "grad_norm": 0.7017241716384888, "learning_rate": 4.794854188275757e-05, "loss": 0.1015, "num_input_tokens_seen": 6738432, "step": 5195 }, { "epoch": 0.25407373024210295, "grad_norm": 0.4210987985134125, "learning_rate": 4.794464536939384e-05, "loss": 0.103, "num_input_tokens_seen": 6744864, "step": 5200 }, { "epoch": 0.25407373024210295, "eval_loss": 0.09911303222179413, "eval_runtime": 375.0885, "eval_samples_per_second": 97.004, "eval_steps_per_second": 24.253, "num_input_tokens_seen": 6744864, "step": 5200 }, { "epoch": 0.25431803190579727, "grad_norm": 0.17937876284122467, "learning_rate": 4.794074531767745e-05, "loss": 0.0981, "num_input_tokens_seen": 6751328, "step": 5205 }, { "epoch": 0.2545623335694916, "grad_norm": 0.634772777557373, "learning_rate": 4.7936841728209834e-05, "loss": 0.0856, "num_input_tokens_seen": 6757504, "step": 5210 }, { "epoch": 0.2548066352331859, "grad_norm": 0.32283297181129456, "learning_rate": 4.7932934601593e-05, "loss": 0.1115, "num_input_tokens_seen": 6763744, "step": 5215 }, { "epoch": 0.25505093689688024, "grad_norm": 0.3150581419467926, "learning_rate": 4.792902393842943e-05, "loss": 0.1198, "num_input_tokens_seen": 6770400, "step": 5220 }, { "epoch": 0.2552952385605746, "grad_norm": 0.2056950330734253, "learning_rate": 4.792510973932225e-05, "loss": 0.1066, "num_input_tokens_seen": 6776960, "step": 5225 }, { "epoch": 0.25553954022426895, "grad_norm": 0.4465382397174835, "learning_rate": 4.7921192004875036e-05, "loss": 0.0932, "num_input_tokens_seen": 6783328, "step": 5230 }, { "epoch": 0.25578384188796327, "grad_norm": 0.4368572533130646, "learning_rate": 4.791727073569198e-05, "loss": 0.0801, "num_input_tokens_seen": 6789440, "step": 5235 }, { "epoch": 0.2560281435516576, "grad_norm": 0.33449724316596985, "learning_rate": 4.7913345932377775e-05, "loss": 0.1089, "num_input_tokens_seen": 6796832, "step": 5240 }, { "epoch": 0.2562724452153519, "grad_norm": 0.45520323514938354, "learning_rate": 4.790941759553769e-05, "loss": 0.0865, "num_input_tokens_seen": 6803328, "step": 5245 }, { "epoch": 0.25651674687904624, "grad_norm": 1.0047880411148071, "learning_rate": 4.79054857257775e-05, "loss": 0.1149, "num_input_tokens_seen": 6809728, "step": 5250 }, { "epoch": 0.25676104854274057, "grad_norm": 0.18023781478405, "learning_rate": 4.790155032370357e-05, "loss": 0.0851, "num_input_tokens_seen": 6816448, "step": 5255 }, { "epoch": 0.2570053502064349, "grad_norm": 0.17732560634613037, "learning_rate": 4.789761138992278e-05, "loss": 0.0907, "num_input_tokens_seen": 6823040, "step": 5260 }, { "epoch": 0.2572496518701292, "grad_norm": 0.4590163826942444, "learning_rate": 4.7893668925042565e-05, "loss": 0.0863, "num_input_tokens_seen": 6829600, "step": 5265 }, { "epoch": 0.25749395353382354, "grad_norm": 0.2049398571252823, "learning_rate": 4.78897229296709e-05, "loss": 0.1048, "num_input_tokens_seen": 6836128, "step": 5270 }, { "epoch": 0.2577382551975179, "grad_norm": 0.6104893684387207, "learning_rate": 4.7885773404416315e-05, "loss": 0.1017, "num_input_tokens_seen": 6842944, "step": 5275 }, { "epoch": 0.25798255686121224, "grad_norm": 0.14275409281253815, "learning_rate": 4.788182034988786e-05, "loss": 0.0955, "num_input_tokens_seen": 6849792, "step": 5280 }, { "epoch": 0.25822685852490657, "grad_norm": 0.43151482939720154, "learning_rate": 4.787786376669516e-05, "loss": 0.079, "num_input_tokens_seen": 6856064, "step": 5285 }, { "epoch": 0.2584711601886009, "grad_norm": 0.19392499327659607, "learning_rate": 4.787390365544837e-05, "loss": 0.1019, "num_input_tokens_seen": 6862272, "step": 5290 }, { "epoch": 0.2587154618522952, "grad_norm": 0.826473593711853, "learning_rate": 4.786994001675818e-05, "loss": 0.1265, "num_input_tokens_seen": 6869120, "step": 5295 }, { "epoch": 0.25895976351598954, "grad_norm": 0.4832525849342346, "learning_rate": 4.786597285123584e-05, "loss": 0.0672, "num_input_tokens_seen": 6875712, "step": 5300 }, { "epoch": 0.25920406517968386, "grad_norm": 0.14367614686489105, "learning_rate": 4.7862002159493135e-05, "loss": 0.0766, "num_input_tokens_seen": 6882784, "step": 5305 }, { "epoch": 0.2594483668433782, "grad_norm": 0.35569873452186584, "learning_rate": 4.785802794214239e-05, "loss": 0.0981, "num_input_tokens_seen": 6889056, "step": 5310 }, { "epoch": 0.2596926685070725, "grad_norm": 0.5622379183769226, "learning_rate": 4.7854050199796495e-05, "loss": 0.0994, "num_input_tokens_seen": 6895392, "step": 5315 }, { "epoch": 0.2599369701707669, "grad_norm": 0.33083730936050415, "learning_rate": 4.7850068933068845e-05, "loss": 0.118, "num_input_tokens_seen": 6901760, "step": 5320 }, { "epoch": 0.2601812718344612, "grad_norm": 0.23837953805923462, "learning_rate": 4.7846084142573425e-05, "loss": 0.0723, "num_input_tokens_seen": 6908800, "step": 5325 }, { "epoch": 0.26042557349815554, "grad_norm": 0.35588881373405457, "learning_rate": 4.7842095828924725e-05, "loss": 0.0974, "num_input_tokens_seen": 6915040, "step": 5330 }, { "epoch": 0.26066987516184986, "grad_norm": 0.38917893171310425, "learning_rate": 4.783810399273779e-05, "loss": 0.0883, "num_input_tokens_seen": 6921344, "step": 5335 }, { "epoch": 0.2609141768255442, "grad_norm": 0.20221443474292755, "learning_rate": 4.7834108634628226e-05, "loss": 0.0944, "num_input_tokens_seen": 6928480, "step": 5340 }, { "epoch": 0.2611584784892385, "grad_norm": 0.5295644998550415, "learning_rate": 4.783010975521216e-05, "loss": 0.0934, "num_input_tokens_seen": 6934976, "step": 5345 }, { "epoch": 0.26140278015293283, "grad_norm": 0.20685873925685883, "learning_rate": 4.782610735510626e-05, "loss": 0.1086, "num_input_tokens_seen": 6941600, "step": 5350 }, { "epoch": 0.26164708181662716, "grad_norm": 0.3303382396697998, "learning_rate": 4.782210143492776e-05, "loss": 0.1349, "num_input_tokens_seen": 6947904, "step": 5355 }, { "epoch": 0.2618913834803215, "grad_norm": 0.17354163527488708, "learning_rate": 4.781809199529442e-05, "loss": 0.0671, "num_input_tokens_seen": 6954624, "step": 5360 }, { "epoch": 0.2621356851440158, "grad_norm": 0.6633116602897644, "learning_rate": 4.781407903682454e-05, "loss": 0.1062, "num_input_tokens_seen": 6961152, "step": 5365 }, { "epoch": 0.2623799868077102, "grad_norm": 0.22418414056301117, "learning_rate": 4.781006256013698e-05, "loss": 0.1199, "num_input_tokens_seen": 6967584, "step": 5370 }, { "epoch": 0.2626242884714045, "grad_norm": 0.21350686252117157, "learning_rate": 4.7806042565851115e-05, "loss": 0.086, "num_input_tokens_seen": 6973952, "step": 5375 }, { "epoch": 0.26286859013509883, "grad_norm": 0.17392049729824066, "learning_rate": 4.7802019054586895e-05, "loss": 0.0871, "num_input_tokens_seen": 6980352, "step": 5380 }, { "epoch": 0.26311289179879316, "grad_norm": 0.20223842561244965, "learning_rate": 4.779799202696479e-05, "loss": 0.1085, "num_input_tokens_seen": 6986752, "step": 5385 }, { "epoch": 0.2633571934624875, "grad_norm": 0.7156997323036194, "learning_rate": 4.779396148360581e-05, "loss": 0.1064, "num_input_tokens_seen": 6993280, "step": 5390 }, { "epoch": 0.2636014951261818, "grad_norm": 0.30133265256881714, "learning_rate": 4.7789927425131517e-05, "loss": 0.1044, "num_input_tokens_seen": 6999904, "step": 5395 }, { "epoch": 0.26384579678987613, "grad_norm": 0.5046026706695557, "learning_rate": 4.778588985216403e-05, "loss": 0.102, "num_input_tokens_seen": 7006944, "step": 5400 }, { "epoch": 0.26384579678987613, "eval_loss": 0.09841569513082504, "eval_runtime": 375.3684, "eval_samples_per_second": 96.931, "eval_steps_per_second": 24.235, "num_input_tokens_seen": 7006944, "step": 5400 }, { "epoch": 0.26409009845357045, "grad_norm": 0.36075323820114136, "learning_rate": 4.778184876532598e-05, "loss": 0.104, "num_input_tokens_seen": 7013632, "step": 5405 }, { "epoch": 0.2643344001172648, "grad_norm": 0.4973292350769043, "learning_rate": 4.7777804165240556e-05, "loss": 0.0935, "num_input_tokens_seen": 7020192, "step": 5410 }, { "epoch": 0.2645787017809591, "grad_norm": 0.21577085554599762, "learning_rate": 4.7773756052531485e-05, "loss": 0.1036, "num_input_tokens_seen": 7026400, "step": 5415 }, { "epoch": 0.2648230034446535, "grad_norm": 0.1763906329870224, "learning_rate": 4.7769704427823035e-05, "loss": 0.0824, "num_input_tokens_seen": 7032544, "step": 5420 }, { "epoch": 0.2650673051083478, "grad_norm": 0.5164601802825928, "learning_rate": 4.776564929174003e-05, "loss": 0.0879, "num_input_tokens_seen": 7039136, "step": 5425 }, { "epoch": 0.26531160677204213, "grad_norm": 0.3575229048728943, "learning_rate": 4.7761590644907806e-05, "loss": 0.0864, "num_input_tokens_seen": 7045152, "step": 5430 }, { "epoch": 0.26555590843573645, "grad_norm": 0.3888121247291565, "learning_rate": 4.7757528487952263e-05, "loss": 0.0943, "num_input_tokens_seen": 7051680, "step": 5435 }, { "epoch": 0.2658002100994308, "grad_norm": 0.4909196197986603, "learning_rate": 4.7753462821499836e-05, "loss": 0.0918, "num_input_tokens_seen": 7058112, "step": 5440 }, { "epoch": 0.2660445117631251, "grad_norm": 0.5604592561721802, "learning_rate": 4.774939364617751e-05, "loss": 0.0892, "num_input_tokens_seen": 7064832, "step": 5445 }, { "epoch": 0.2662888134268194, "grad_norm": 0.28411784768104553, "learning_rate": 4.7745320962612795e-05, "loss": 0.1222, "num_input_tokens_seen": 7071520, "step": 5450 }, { "epoch": 0.26653311509051375, "grad_norm": 0.2408333420753479, "learning_rate": 4.7741244771433756e-05, "loss": 0.079, "num_input_tokens_seen": 7078016, "step": 5455 }, { "epoch": 0.2667774167542081, "grad_norm": 0.8751343488693237, "learning_rate": 4.7737165073268985e-05, "loss": 0.1114, "num_input_tokens_seen": 7084576, "step": 5460 }, { "epoch": 0.2670217184179024, "grad_norm": 0.40831953287124634, "learning_rate": 4.7733081868747626e-05, "loss": 0.1284, "num_input_tokens_seen": 7091360, "step": 5465 }, { "epoch": 0.2672660200815968, "grad_norm": 0.4135887920856476, "learning_rate": 4.772899515849936e-05, "loss": 0.1139, "num_input_tokens_seen": 7098016, "step": 5470 }, { "epoch": 0.2675103217452911, "grad_norm": 0.21352960169315338, "learning_rate": 4.7724904943154414e-05, "loss": 0.1201, "num_input_tokens_seen": 7104704, "step": 5475 }, { "epoch": 0.2677546234089854, "grad_norm": 0.35244008898735046, "learning_rate": 4.772081122334354e-05, "loss": 0.0736, "num_input_tokens_seen": 7111360, "step": 5480 }, { "epoch": 0.26799892507267975, "grad_norm": 0.29837408661842346, "learning_rate": 4.771671399969806e-05, "loss": 0.1164, "num_input_tokens_seen": 7117728, "step": 5485 }, { "epoch": 0.2682432267363741, "grad_norm": 0.16003601253032684, "learning_rate": 4.7712613272849794e-05, "loss": 0.0732, "num_input_tokens_seen": 7124192, "step": 5490 }, { "epoch": 0.2684875284000684, "grad_norm": 0.18756748735904694, "learning_rate": 4.770850904343114e-05, "loss": 0.0938, "num_input_tokens_seen": 7131072, "step": 5495 }, { "epoch": 0.2687318300637627, "grad_norm": 0.22769282758235931, "learning_rate": 4.770440131207502e-05, "loss": 0.1322, "num_input_tokens_seen": 7137504, "step": 5500 }, { "epoch": 0.26897613172745705, "grad_norm": 0.35585978627204895, "learning_rate": 4.7700290079414896e-05, "loss": 0.0968, "num_input_tokens_seen": 7143904, "step": 5505 }, { "epoch": 0.26922043339115137, "grad_norm": 0.36317166686058044, "learning_rate": 4.769617534608477e-05, "loss": 0.0901, "num_input_tokens_seen": 7150208, "step": 5510 }, { "epoch": 0.26946473505484575, "grad_norm": 0.39509397745132446, "learning_rate": 4.7692057112719193e-05, "loss": 0.081, "num_input_tokens_seen": 7157088, "step": 5515 }, { "epoch": 0.2697090367185401, "grad_norm": 0.4898931086063385, "learning_rate": 4.7687935379953234e-05, "loss": 0.0901, "num_input_tokens_seen": 7163904, "step": 5520 }, { "epoch": 0.2699533383822344, "grad_norm": 0.6648363471031189, "learning_rate": 4.7683810148422534e-05, "loss": 0.0986, "num_input_tokens_seen": 7170176, "step": 5525 }, { "epoch": 0.2701976400459287, "grad_norm": 0.22660405933856964, "learning_rate": 4.767968141876324e-05, "loss": 0.0837, "num_input_tokens_seen": 7176864, "step": 5530 }, { "epoch": 0.27044194170962305, "grad_norm": 0.6144121885299683, "learning_rate": 4.767554919161207e-05, "loss": 0.1438, "num_input_tokens_seen": 7183520, "step": 5535 }, { "epoch": 0.27068624337331737, "grad_norm": 0.6490930318832397, "learning_rate": 4.767141346760624e-05, "loss": 0.0994, "num_input_tokens_seen": 7190112, "step": 5540 }, { "epoch": 0.2709305450370117, "grad_norm": 0.24163183569908142, "learning_rate": 4.766727424738356e-05, "loss": 0.0801, "num_input_tokens_seen": 7196448, "step": 5545 }, { "epoch": 0.271174846700706, "grad_norm": 0.36561450362205505, "learning_rate": 4.7663131531582325e-05, "loss": 0.0968, "num_input_tokens_seen": 7202816, "step": 5550 }, { "epoch": 0.27141914836440034, "grad_norm": 0.39912697672843933, "learning_rate": 4.765898532084142e-05, "loss": 0.0967, "num_input_tokens_seen": 7209408, "step": 5555 }, { "epoch": 0.27166345002809467, "grad_norm": 0.6837323904037476, "learning_rate": 4.765483561580022e-05, "loss": 0.0871, "num_input_tokens_seen": 7215904, "step": 5560 }, { "epoch": 0.27190775169178905, "grad_norm": 0.15679307281970978, "learning_rate": 4.7650682417098666e-05, "loss": 0.0778, "num_input_tokens_seen": 7222240, "step": 5565 }, { "epoch": 0.27215205335548337, "grad_norm": 0.1570103019475937, "learning_rate": 4.7646525725377244e-05, "loss": 0.0752, "num_input_tokens_seen": 7228608, "step": 5570 }, { "epoch": 0.2723963550191777, "grad_norm": 0.8185784220695496, "learning_rate": 4.764236554127696e-05, "loss": 0.0649, "num_input_tokens_seen": 7235744, "step": 5575 }, { "epoch": 0.272640656682872, "grad_norm": 0.9368842840194702, "learning_rate": 4.7638201865439356e-05, "loss": 0.1134, "num_input_tokens_seen": 7242176, "step": 5580 }, { "epoch": 0.27288495834656634, "grad_norm": 0.5264523029327393, "learning_rate": 4.7634034698506545e-05, "loss": 0.0874, "num_input_tokens_seen": 7248192, "step": 5585 }, { "epoch": 0.27312926001026067, "grad_norm": 0.21157506108283997, "learning_rate": 4.762986404112115e-05, "loss": 0.0826, "num_input_tokens_seen": 7254720, "step": 5590 }, { "epoch": 0.273373561673955, "grad_norm": 0.2113313376903534, "learning_rate": 4.762568989392633e-05, "loss": 0.0833, "num_input_tokens_seen": 7261088, "step": 5595 }, { "epoch": 0.2736178633376493, "grad_norm": 0.5167922973632812, "learning_rate": 4.76215122575658e-05, "loss": 0.0918, "num_input_tokens_seen": 7267584, "step": 5600 }, { "epoch": 0.2736178633376493, "eval_loss": 0.09785854816436768, "eval_runtime": 374.6618, "eval_samples_per_second": 97.114, "eval_steps_per_second": 24.281, "num_input_tokens_seen": 7267584, "step": 5600 }, { "epoch": 0.27386216500134364, "grad_norm": 0.23885132372379303, "learning_rate": 4.7617331132683795e-05, "loss": 0.082, "num_input_tokens_seen": 7274080, "step": 5605 }, { "epoch": 0.27410646666503796, "grad_norm": 0.28353461623191833, "learning_rate": 4.7613146519925105e-05, "loss": 0.1147, "num_input_tokens_seen": 7281824, "step": 5610 }, { "epoch": 0.27435076832873234, "grad_norm": 0.5158082842826843, "learning_rate": 4.7608958419935045e-05, "loss": 0.0899, "num_input_tokens_seen": 7288384, "step": 5615 }, { "epoch": 0.27459506999242667, "grad_norm": 0.434513121843338, "learning_rate": 4.760476683335948e-05, "loss": 0.0832, "num_input_tokens_seen": 7294976, "step": 5620 }, { "epoch": 0.274839371656121, "grad_norm": 0.2884354889392853, "learning_rate": 4.760057176084479e-05, "loss": 0.1094, "num_input_tokens_seen": 7301568, "step": 5625 }, { "epoch": 0.2750836733198153, "grad_norm": 0.408586710691452, "learning_rate": 4.759637320303793e-05, "loss": 0.0653, "num_input_tokens_seen": 7308192, "step": 5630 }, { "epoch": 0.27532797498350964, "grad_norm": 0.43614497780799866, "learning_rate": 4.759217116058635e-05, "loss": 0.0914, "num_input_tokens_seen": 7314432, "step": 5635 }, { "epoch": 0.27557227664720396, "grad_norm": 0.9131243228912354, "learning_rate": 4.758796563413807e-05, "loss": 0.1192, "num_input_tokens_seen": 7320864, "step": 5640 }, { "epoch": 0.2758165783108983, "grad_norm": 0.5437683463096619, "learning_rate": 4.758375662434163e-05, "loss": 0.0962, "num_input_tokens_seen": 7327904, "step": 5645 }, { "epoch": 0.2760608799745926, "grad_norm": 0.3531283736228943, "learning_rate": 4.7579544131846114e-05, "loss": 0.104, "num_input_tokens_seen": 7334208, "step": 5650 }, { "epoch": 0.27630518163828693, "grad_norm": 0.3994404375553131, "learning_rate": 4.757532815730114e-05, "loss": 0.1031, "num_input_tokens_seen": 7340736, "step": 5655 }, { "epoch": 0.2765494833019813, "grad_norm": 0.4671729505062103, "learning_rate": 4.7571108701356865e-05, "loss": 0.1083, "num_input_tokens_seen": 7347488, "step": 5660 }, { "epoch": 0.27679378496567564, "grad_norm": 0.18640753626823425, "learning_rate": 4.756688576466398e-05, "loss": 0.0869, "num_input_tokens_seen": 7353856, "step": 5665 }, { "epoch": 0.27703808662936996, "grad_norm": 0.30254149436950684, "learning_rate": 4.756265934787372e-05, "loss": 0.1178, "num_input_tokens_seen": 7360192, "step": 5670 }, { "epoch": 0.2772823882930643, "grad_norm": 0.33370861411094666, "learning_rate": 4.755842945163785e-05, "loss": 0.1132, "num_input_tokens_seen": 7366528, "step": 5675 }, { "epoch": 0.2775266899567586, "grad_norm": 0.21751217544078827, "learning_rate": 4.755419607660867e-05, "loss": 0.1355, "num_input_tokens_seen": 7372672, "step": 5680 }, { "epoch": 0.27777099162045293, "grad_norm": 0.19933640956878662, "learning_rate": 4.7549959223439016e-05, "loss": 0.0901, "num_input_tokens_seen": 7379456, "step": 5685 }, { "epoch": 0.27801529328414726, "grad_norm": 0.18701939284801483, "learning_rate": 4.754571889278228e-05, "loss": 0.087, "num_input_tokens_seen": 7385600, "step": 5690 }, { "epoch": 0.2782595949478416, "grad_norm": 0.2818923890590668, "learning_rate": 4.754147508529235e-05, "loss": 0.0759, "num_input_tokens_seen": 7392224, "step": 5695 }, { "epoch": 0.2785038966115359, "grad_norm": 0.5969399809837341, "learning_rate": 4.75372278016237e-05, "loss": 0.1057, "num_input_tokens_seen": 7398656, "step": 5700 }, { "epoch": 0.27874819827523023, "grad_norm": 0.20581644773483276, "learning_rate": 4.753297704243129e-05, "loss": 0.1084, "num_input_tokens_seen": 7405216, "step": 5705 }, { "epoch": 0.2789924999389246, "grad_norm": 0.6486238241195679, "learning_rate": 4.752872280837066e-05, "loss": 0.0732, "num_input_tokens_seen": 7411584, "step": 5710 }, { "epoch": 0.27923680160261893, "grad_norm": 0.5031300187110901, "learning_rate": 4.752446510009786e-05, "loss": 0.091, "num_input_tokens_seen": 7418016, "step": 5715 }, { "epoch": 0.27948110326631326, "grad_norm": 0.2374240905046463, "learning_rate": 4.7520203918269476e-05, "loss": 0.0969, "num_input_tokens_seen": 7424384, "step": 5720 }, { "epoch": 0.2797254049300076, "grad_norm": 0.1924123466014862, "learning_rate": 4.751593926354265e-05, "loss": 0.0912, "num_input_tokens_seen": 7430912, "step": 5725 }, { "epoch": 0.2799697065937019, "grad_norm": 0.5250719785690308, "learning_rate": 4.751167113657503e-05, "loss": 0.0793, "num_input_tokens_seen": 7437504, "step": 5730 }, { "epoch": 0.28021400825739623, "grad_norm": 0.8130444288253784, "learning_rate": 4.7507399538024834e-05, "loss": 0.0985, "num_input_tokens_seen": 7444480, "step": 5735 }, { "epoch": 0.28045830992109055, "grad_norm": 0.14965616166591644, "learning_rate": 4.750312446855077e-05, "loss": 0.0871, "num_input_tokens_seen": 7450464, "step": 5740 }, { "epoch": 0.2807026115847849, "grad_norm": 0.34686577320098877, "learning_rate": 4.749884592881212e-05, "loss": 0.0571, "num_input_tokens_seen": 7457280, "step": 5745 }, { "epoch": 0.2809469132484792, "grad_norm": 0.3136548399925232, "learning_rate": 4.74945639194687e-05, "loss": 0.0925, "num_input_tokens_seen": 7463808, "step": 5750 }, { "epoch": 0.2811912149121735, "grad_norm": 0.34371986985206604, "learning_rate": 4.749027844118083e-05, "loss": 0.1086, "num_input_tokens_seen": 7469952, "step": 5755 }, { "epoch": 0.2814355165758679, "grad_norm": 0.37078747153282166, "learning_rate": 4.7485989494609395e-05, "loss": 0.1014, "num_input_tokens_seen": 7477088, "step": 5760 }, { "epoch": 0.28167981823956223, "grad_norm": 0.26033976674079895, "learning_rate": 4.748169708041581e-05, "loss": 0.0971, "num_input_tokens_seen": 7483392, "step": 5765 }, { "epoch": 0.28192411990325655, "grad_norm": 0.18689119815826416, "learning_rate": 4.7477401199262004e-05, "loss": 0.0807, "num_input_tokens_seen": 7489568, "step": 5770 }, { "epoch": 0.2821684215669509, "grad_norm": 0.21495281159877777, "learning_rate": 4.747310185181048e-05, "loss": 0.0662, "num_input_tokens_seen": 7496512, "step": 5775 }, { "epoch": 0.2824127232306452, "grad_norm": 0.32764092087745667, "learning_rate": 4.746879903872422e-05, "loss": 0.0903, "num_input_tokens_seen": 7503584, "step": 5780 }, { "epoch": 0.2826570248943395, "grad_norm": 0.12989720702171326, "learning_rate": 4.746449276066679e-05, "loss": 0.0808, "num_input_tokens_seen": 7510304, "step": 5785 }, { "epoch": 0.28290132655803385, "grad_norm": 0.32641634345054626, "learning_rate": 4.746018301830227e-05, "loss": 0.1135, "num_input_tokens_seen": 7516576, "step": 5790 }, { "epoch": 0.2831456282217282, "grad_norm": 0.6408804655075073, "learning_rate": 4.7455869812295275e-05, "loss": 0.1053, "num_input_tokens_seen": 7523104, "step": 5795 }, { "epoch": 0.2833899298854225, "grad_norm": 0.4874303638935089, "learning_rate": 4.7451553143310964e-05, "loss": 0.1174, "num_input_tokens_seen": 7529280, "step": 5800 }, { "epoch": 0.2833899298854225, "eval_loss": 0.09755180776119232, "eval_runtime": 374.4932, "eval_samples_per_second": 97.158, "eval_steps_per_second": 24.291, "num_input_tokens_seen": 7529280, "step": 5800 }, { "epoch": 0.2836342315491168, "grad_norm": 0.2821379601955414, "learning_rate": 4.744723301201501e-05, "loss": 0.0964, "num_input_tokens_seen": 7536064, "step": 5805 }, { "epoch": 0.2838785332128112, "grad_norm": 0.13833487033843994, "learning_rate": 4.744290941907364e-05, "loss": 0.1122, "num_input_tokens_seen": 7542688, "step": 5810 }, { "epoch": 0.2841228348765055, "grad_norm": 0.3113449811935425, "learning_rate": 4.7438582365153594e-05, "loss": 0.1109, "num_input_tokens_seen": 7548864, "step": 5815 }, { "epoch": 0.28436713654019985, "grad_norm": 0.20129358768463135, "learning_rate": 4.743425185092217e-05, "loss": 0.0978, "num_input_tokens_seen": 7555360, "step": 5820 }, { "epoch": 0.2846114382038942, "grad_norm": 0.30659839510917664, "learning_rate": 4.742991787704719e-05, "loss": 0.0679, "num_input_tokens_seen": 7561952, "step": 5825 }, { "epoch": 0.2848557398675885, "grad_norm": 0.4412655234336853, "learning_rate": 4.7425580444196994e-05, "loss": 0.1038, "num_input_tokens_seen": 7568416, "step": 5830 }, { "epoch": 0.2851000415312828, "grad_norm": 0.686520516872406, "learning_rate": 4.742123955304048e-05, "loss": 0.1083, "num_input_tokens_seen": 7575008, "step": 5835 }, { "epoch": 0.28534434319497715, "grad_norm": 0.15650558471679688, "learning_rate": 4.741689520424706e-05, "loss": 0.0939, "num_input_tokens_seen": 7581760, "step": 5840 }, { "epoch": 0.28558864485867147, "grad_norm": 0.18703249096870422, "learning_rate": 4.741254739848669e-05, "loss": 0.077, "num_input_tokens_seen": 7588192, "step": 5845 }, { "epoch": 0.2858329465223658, "grad_norm": 0.5926378965377808, "learning_rate": 4.740819613642987e-05, "loss": 0.096, "num_input_tokens_seen": 7594304, "step": 5850 }, { "epoch": 0.2860772481860602, "grad_norm": 0.32568830251693726, "learning_rate": 4.74038414187476e-05, "loss": 0.0792, "num_input_tokens_seen": 7600352, "step": 5855 }, { "epoch": 0.2863215498497545, "grad_norm": 0.2654265761375427, "learning_rate": 4.739948324611144e-05, "loss": 0.1059, "num_input_tokens_seen": 7607296, "step": 5860 }, { "epoch": 0.2865658515134488, "grad_norm": 0.27822670340538025, "learning_rate": 4.7395121619193465e-05, "loss": 0.1217, "num_input_tokens_seen": 7614016, "step": 5865 }, { "epoch": 0.28681015317714315, "grad_norm": 1.142453908920288, "learning_rate": 4.7390756538666313e-05, "loss": 0.1313, "num_input_tokens_seen": 7620320, "step": 5870 }, { "epoch": 0.28705445484083747, "grad_norm": 0.180776908993721, "learning_rate": 4.738638800520311e-05, "loss": 0.0879, "num_input_tokens_seen": 7627040, "step": 5875 }, { "epoch": 0.2872987565045318, "grad_norm": 0.54304438829422, "learning_rate": 4.738201601947757e-05, "loss": 0.1102, "num_input_tokens_seen": 7633664, "step": 5880 }, { "epoch": 0.2875430581682261, "grad_norm": 0.30125194787979126, "learning_rate": 4.7377640582163876e-05, "loss": 0.0973, "num_input_tokens_seen": 7640032, "step": 5885 }, { "epoch": 0.28778735983192044, "grad_norm": 0.22093643248081207, "learning_rate": 4.7373261693936786e-05, "loss": 0.0933, "num_input_tokens_seen": 7646688, "step": 5890 }, { "epoch": 0.28803166149561477, "grad_norm": 0.15861466526985168, "learning_rate": 4.7368879355471595e-05, "loss": 0.0846, "num_input_tokens_seen": 7653408, "step": 5895 }, { "epoch": 0.2882759631593091, "grad_norm": 0.4896460473537445, "learning_rate": 4.736449356744409e-05, "loss": 0.1013, "num_input_tokens_seen": 7659648, "step": 5900 }, { "epoch": 0.28852026482300347, "grad_norm": 0.44210943579673767, "learning_rate": 4.736010433053064e-05, "loss": 0.1006, "num_input_tokens_seen": 7666400, "step": 5905 }, { "epoch": 0.2887645664866978, "grad_norm": 0.6441329717636108, "learning_rate": 4.73557116454081e-05, "loss": 0.0846, "num_input_tokens_seen": 7672768, "step": 5910 }, { "epoch": 0.2890088681503921, "grad_norm": 0.22064858675003052, "learning_rate": 4.735131551275389e-05, "loss": 0.0855, "num_input_tokens_seen": 7679680, "step": 5915 }, { "epoch": 0.28925316981408644, "grad_norm": 0.692341148853302, "learning_rate": 4.734691593324594e-05, "loss": 0.1014, "num_input_tokens_seen": 7686240, "step": 5920 }, { "epoch": 0.28949747147778077, "grad_norm": 0.30786728858947754, "learning_rate": 4.734251290756272e-05, "loss": 0.1022, "num_input_tokens_seen": 7692640, "step": 5925 }, { "epoch": 0.2897417731414751, "grad_norm": 0.28139230608940125, "learning_rate": 4.7338106436383246e-05, "loss": 0.1143, "num_input_tokens_seen": 7699072, "step": 5930 }, { "epoch": 0.2899860748051694, "grad_norm": 0.4800257980823517, "learning_rate": 4.733369652038703e-05, "loss": 0.0842, "num_input_tokens_seen": 7705248, "step": 5935 }, { "epoch": 0.29023037646886374, "grad_norm": 0.3081100285053253, "learning_rate": 4.7329283160254156e-05, "loss": 0.0799, "num_input_tokens_seen": 7711968, "step": 5940 }, { "epoch": 0.29047467813255806, "grad_norm": 0.31846678256988525, "learning_rate": 4.732486635666521e-05, "loss": 0.1062, "num_input_tokens_seen": 7718496, "step": 5945 }, { "epoch": 0.2907189797962524, "grad_norm": 0.1676880270242691, "learning_rate": 4.732044611030132e-05, "loss": 0.103, "num_input_tokens_seen": 7725440, "step": 5950 }, { "epoch": 0.29096328145994677, "grad_norm": 0.1924290806055069, "learning_rate": 4.731602242184414e-05, "loss": 0.1142, "num_input_tokens_seen": 7731424, "step": 5955 }, { "epoch": 0.2912075831236411, "grad_norm": 0.2702045440673828, "learning_rate": 4.7311595291975864e-05, "loss": 0.1022, "num_input_tokens_seen": 7737984, "step": 5960 }, { "epoch": 0.2914518847873354, "grad_norm": 0.1499556303024292, "learning_rate": 4.7307164721379216e-05, "loss": 0.0995, "num_input_tokens_seen": 7744288, "step": 5965 }, { "epoch": 0.29169618645102974, "grad_norm": 0.3814457356929779, "learning_rate": 4.730273071073743e-05, "loss": 0.0815, "num_input_tokens_seen": 7750944, "step": 5970 }, { "epoch": 0.29194048811472406, "grad_norm": 0.16399218142032623, "learning_rate": 4.729829326073429e-05, "loss": 0.1005, "num_input_tokens_seen": 7757024, "step": 5975 }, { "epoch": 0.2921847897784184, "grad_norm": 0.20710714161396027, "learning_rate": 4.7293852372054126e-05, "loss": 0.0956, "num_input_tokens_seen": 7763360, "step": 5980 }, { "epoch": 0.2924290914421127, "grad_norm": 0.29068928956985474, "learning_rate": 4.728940804538176e-05, "loss": 0.1065, "num_input_tokens_seen": 7770144, "step": 5985 }, { "epoch": 0.29267339310580703, "grad_norm": 0.3473339378833771, "learning_rate": 4.7284960281402556e-05, "loss": 0.1072, "num_input_tokens_seen": 7776192, "step": 5990 }, { "epoch": 0.29291769476950136, "grad_norm": 0.1538001000881195, "learning_rate": 4.728050908080244e-05, "loss": 0.1083, "num_input_tokens_seen": 7782368, "step": 5995 }, { "epoch": 0.29316199643319574, "grad_norm": 0.3088107109069824, "learning_rate": 4.727605444426782e-05, "loss": 0.0869, "num_input_tokens_seen": 7788992, "step": 6000 }, { "epoch": 0.29316199643319574, "eval_loss": 0.0975622683763504, "eval_runtime": 374.5757, "eval_samples_per_second": 97.137, "eval_steps_per_second": 24.286, "num_input_tokens_seen": 7788992, "step": 6000 }, { "epoch": 0.29340629809689006, "grad_norm": 0.15291574597358704, "learning_rate": 4.727159637248567e-05, "loss": 0.0656, "num_input_tokens_seen": 7795872, "step": 6005 }, { "epoch": 0.2936505997605844, "grad_norm": 0.36411401629447937, "learning_rate": 4.7267134866143474e-05, "loss": 0.1188, "num_input_tokens_seen": 7802592, "step": 6010 }, { "epoch": 0.2938949014242787, "grad_norm": 1.2406153678894043, "learning_rate": 4.726266992592926e-05, "loss": 0.1097, "num_input_tokens_seen": 7809376, "step": 6015 }, { "epoch": 0.29413920308797303, "grad_norm": 0.412537157535553, "learning_rate": 4.725820155253157e-05, "loss": 0.0704, "num_input_tokens_seen": 7815872, "step": 6020 }, { "epoch": 0.29438350475166736, "grad_norm": 0.19983835518360138, "learning_rate": 4.725372974663948e-05, "loss": 0.0865, "num_input_tokens_seen": 7822592, "step": 6025 }, { "epoch": 0.2946278064153617, "grad_norm": 0.14307938516139984, "learning_rate": 4.724925450894262e-05, "loss": 0.0824, "num_input_tokens_seen": 7829184, "step": 6030 }, { "epoch": 0.294872108079056, "grad_norm": 0.45066019892692566, "learning_rate": 4.72447758401311e-05, "loss": 0.0881, "num_input_tokens_seen": 7835744, "step": 6035 }, { "epoch": 0.29511640974275033, "grad_norm": 0.16637641191482544, "learning_rate": 4.7240293740895616e-05, "loss": 0.0789, "num_input_tokens_seen": 7842624, "step": 6040 }, { "epoch": 0.29536071140644465, "grad_norm": 0.21520176529884338, "learning_rate": 4.723580821192733e-05, "loss": 0.0894, "num_input_tokens_seen": 7848864, "step": 6045 }, { "epoch": 0.29560501307013903, "grad_norm": 0.45462849736213684, "learning_rate": 4.7231319253917996e-05, "loss": 0.0738, "num_input_tokens_seen": 7856064, "step": 6050 }, { "epoch": 0.29584931473383336, "grad_norm": 0.349864661693573, "learning_rate": 4.722682686755986e-05, "loss": 0.1012, "num_input_tokens_seen": 7862432, "step": 6055 }, { "epoch": 0.2960936163975277, "grad_norm": 0.10131508111953735, "learning_rate": 4.722233105354569e-05, "loss": 0.099, "num_input_tokens_seen": 7868800, "step": 6060 }, { "epoch": 0.296337918061222, "grad_norm": 0.2167479246854782, "learning_rate": 4.7217831812568815e-05, "loss": 0.0791, "num_input_tokens_seen": 7875040, "step": 6065 }, { "epoch": 0.29658221972491633, "grad_norm": 0.1902344673871994, "learning_rate": 4.721332914532307e-05, "loss": 0.08, "num_input_tokens_seen": 7881664, "step": 6070 }, { "epoch": 0.29682652138861065, "grad_norm": 0.1869397908449173, "learning_rate": 4.720882305250281e-05, "loss": 0.1029, "num_input_tokens_seen": 7888320, "step": 6075 }, { "epoch": 0.297070823052305, "grad_norm": 0.4186192750930786, "learning_rate": 4.720431353480295e-05, "loss": 0.0746, "num_input_tokens_seen": 7894400, "step": 6080 }, { "epoch": 0.2973151247159993, "grad_norm": 0.20166410505771637, "learning_rate": 4.719980059291891e-05, "loss": 0.1003, "num_input_tokens_seen": 7900864, "step": 6085 }, { "epoch": 0.2975594263796936, "grad_norm": 0.5871819257736206, "learning_rate": 4.7195284227546634e-05, "loss": 0.0896, "num_input_tokens_seen": 7907488, "step": 6090 }, { "epoch": 0.29780372804338795, "grad_norm": 0.5040030479431152, "learning_rate": 4.7190764439382604e-05, "loss": 0.0775, "num_input_tokens_seen": 7913728, "step": 6095 }, { "epoch": 0.29804802970708233, "grad_norm": 0.4891963601112366, "learning_rate": 4.7186241229123826e-05, "loss": 0.0918, "num_input_tokens_seen": 7920384, "step": 6100 }, { "epoch": 0.29829233137077665, "grad_norm": 0.31751564145088196, "learning_rate": 4.718171459746785e-05, "loss": 0.112, "num_input_tokens_seen": 7927072, "step": 6105 }, { "epoch": 0.298536633034471, "grad_norm": 0.30168282985687256, "learning_rate": 4.717718454511273e-05, "loss": 0.0791, "num_input_tokens_seen": 7933952, "step": 6110 }, { "epoch": 0.2987809346981653, "grad_norm": 0.4341393709182739, "learning_rate": 4.7172651072757056e-05, "loss": 0.0896, "num_input_tokens_seen": 7940544, "step": 6115 }, { "epoch": 0.2990252363618596, "grad_norm": 0.3220139443874359, "learning_rate": 4.7168114181099945e-05, "loss": 0.0694, "num_input_tokens_seen": 7947040, "step": 6120 }, { "epoch": 0.29926953802555395, "grad_norm": 0.5119746327400208, "learning_rate": 4.716357387084105e-05, "loss": 0.0805, "num_input_tokens_seen": 7953792, "step": 6125 }, { "epoch": 0.2995138396892483, "grad_norm": 0.24669881165027618, "learning_rate": 4.715903014268054e-05, "loss": 0.0814, "num_input_tokens_seen": 7960448, "step": 6130 }, { "epoch": 0.2997581413529426, "grad_norm": 0.23228824138641357, "learning_rate": 4.715448299731911e-05, "loss": 0.0689, "num_input_tokens_seen": 7967360, "step": 6135 }, { "epoch": 0.3000024430166369, "grad_norm": 0.547822117805481, "learning_rate": 4.7149932435457986e-05, "loss": 0.0846, "num_input_tokens_seen": 7973664, "step": 6140 }, { "epoch": 0.30024674468033125, "grad_norm": 0.41425660252571106, "learning_rate": 4.714537845779894e-05, "loss": 0.0672, "num_input_tokens_seen": 7980064, "step": 6145 }, { "epoch": 0.3004910463440256, "grad_norm": 0.6489253044128418, "learning_rate": 4.714082106504423e-05, "loss": 0.068, "num_input_tokens_seen": 7986720, "step": 6150 }, { "epoch": 0.30073534800771995, "grad_norm": 0.1696435511112213, "learning_rate": 4.713626025789667e-05, "loss": 0.1311, "num_input_tokens_seen": 7992960, "step": 6155 }, { "epoch": 0.3009796496714143, "grad_norm": 0.2527689039707184, "learning_rate": 4.7131696037059606e-05, "loss": 0.0776, "num_input_tokens_seen": 8000544, "step": 6160 }, { "epoch": 0.3012239513351086, "grad_norm": 0.18133632838726044, "learning_rate": 4.712712840323689e-05, "loss": 0.1012, "num_input_tokens_seen": 8007424, "step": 6165 }, { "epoch": 0.3014682529988029, "grad_norm": 0.2757152020931244, "learning_rate": 4.71225573571329e-05, "loss": 0.0808, "num_input_tokens_seen": 8013920, "step": 6170 }, { "epoch": 0.30171255466249725, "grad_norm": 0.27862662076950073, "learning_rate": 4.711798289945256e-05, "loss": 0.0848, "num_input_tokens_seen": 8020576, "step": 6175 }, { "epoch": 0.30195685632619157, "grad_norm": 0.18740317225456238, "learning_rate": 4.71134050309013e-05, "loss": 0.1043, "num_input_tokens_seen": 8027008, "step": 6180 }, { "epoch": 0.3022011579898859, "grad_norm": 0.16726990044116974, "learning_rate": 4.710882375218509e-05, "loss": 0.0954, "num_input_tokens_seen": 8033696, "step": 6185 }, { "epoch": 0.3024454596535802, "grad_norm": 0.5069397687911987, "learning_rate": 4.7104239064010424e-05, "loss": 0.0945, "num_input_tokens_seen": 8039808, "step": 6190 }, { "epoch": 0.3026897613172746, "grad_norm": 0.2747390866279602, "learning_rate": 4.709965096708432e-05, "loss": 0.0748, "num_input_tokens_seen": 8046208, "step": 6195 }, { "epoch": 0.3029340629809689, "grad_norm": 0.24998725950717926, "learning_rate": 4.709505946211431e-05, "loss": 0.0825, "num_input_tokens_seen": 8052736, "step": 6200 }, { "epoch": 0.3029340629809689, "eval_loss": 0.09710023552179337, "eval_runtime": 375.5642, "eval_samples_per_second": 96.881, "eval_steps_per_second": 24.222, "num_input_tokens_seen": 8052736, "step": 6200 }, { "epoch": 0.30317836464466325, "grad_norm": 0.3044418692588806, "learning_rate": 4.709046454980846e-05, "loss": 0.0597, "num_input_tokens_seen": 8059104, "step": 6205 }, { "epoch": 0.30342266630835757, "grad_norm": 0.19415827095508575, "learning_rate": 4.708586623087538e-05, "loss": 0.0901, "num_input_tokens_seen": 8065280, "step": 6210 }, { "epoch": 0.3036669679720519, "grad_norm": 0.6562895774841309, "learning_rate": 4.708126450602418e-05, "loss": 0.0924, "num_input_tokens_seen": 8071808, "step": 6215 }, { "epoch": 0.3039112696357462, "grad_norm": 0.5076685547828674, "learning_rate": 4.7076659375964495e-05, "loss": 0.0967, "num_input_tokens_seen": 8078528, "step": 6220 }, { "epoch": 0.30415557129944054, "grad_norm": 1.5332947969436646, "learning_rate": 4.707205084140651e-05, "loss": 0.1436, "num_input_tokens_seen": 8084800, "step": 6225 }, { "epoch": 0.30439987296313487, "grad_norm": 0.27832502126693726, "learning_rate": 4.7067438903060904e-05, "loss": 0.0523, "num_input_tokens_seen": 8091296, "step": 6230 }, { "epoch": 0.3046441746268292, "grad_norm": 0.1640067994594574, "learning_rate": 4.70628235616389e-05, "loss": 0.0782, "num_input_tokens_seen": 8097376, "step": 6235 }, { "epoch": 0.3048884762905235, "grad_norm": 0.46255233883857727, "learning_rate": 4.7058204817852256e-05, "loss": 0.0851, "num_input_tokens_seen": 8104128, "step": 6240 }, { "epoch": 0.3051327779542179, "grad_norm": 0.1567063331604004, "learning_rate": 4.705358267241322e-05, "loss": 0.0661, "num_input_tokens_seen": 8110368, "step": 6245 }, { "epoch": 0.3053770796179122, "grad_norm": 0.5842726826667786, "learning_rate": 4.704895712603459e-05, "loss": 0.1, "num_input_tokens_seen": 8116704, "step": 6250 }, { "epoch": 0.30562138128160654, "grad_norm": 0.15723298490047455, "learning_rate": 4.704432817942969e-05, "loss": 0.1046, "num_input_tokens_seen": 8123200, "step": 6255 }, { "epoch": 0.30586568294530087, "grad_norm": 0.2831327021121979, "learning_rate": 4.703969583331236e-05, "loss": 0.0742, "num_input_tokens_seen": 8129440, "step": 6260 }, { "epoch": 0.3061099846089952, "grad_norm": 0.18067342042922974, "learning_rate": 4.7035060088396965e-05, "loss": 0.0954, "num_input_tokens_seen": 8135968, "step": 6265 }, { "epoch": 0.3063542862726895, "grad_norm": 0.5640091896057129, "learning_rate": 4.703042094539839e-05, "loss": 0.096, "num_input_tokens_seen": 8142752, "step": 6270 }, { "epoch": 0.30659858793638384, "grad_norm": 0.19749034941196442, "learning_rate": 4.702577840503206e-05, "loss": 0.0888, "num_input_tokens_seen": 8148928, "step": 6275 }, { "epoch": 0.30684288960007816, "grad_norm": 0.7259510159492493, "learning_rate": 4.70211324680139e-05, "loss": 0.0918, "num_input_tokens_seen": 8155328, "step": 6280 }, { "epoch": 0.3070871912637725, "grad_norm": 0.23447658121585846, "learning_rate": 4.7016483135060386e-05, "loss": 0.0872, "num_input_tokens_seen": 8162080, "step": 6285 }, { "epoch": 0.3073314929274668, "grad_norm": 0.2876020669937134, "learning_rate": 4.701183040688849e-05, "loss": 0.0997, "num_input_tokens_seen": 8169184, "step": 6290 }, { "epoch": 0.3075757945911612, "grad_norm": 0.5281079411506653, "learning_rate": 4.700717428421573e-05, "loss": 0.0943, "num_input_tokens_seen": 8175680, "step": 6295 }, { "epoch": 0.3078200962548555, "grad_norm": 0.29680153727531433, "learning_rate": 4.700251476776014e-05, "loss": 0.0897, "num_input_tokens_seen": 8182496, "step": 6300 }, { "epoch": 0.30806439791854984, "grad_norm": 0.41140469908714294, "learning_rate": 4.699785185824026e-05, "loss": 0.0995, "num_input_tokens_seen": 8189088, "step": 6305 }, { "epoch": 0.30830869958224416, "grad_norm": 0.2377861738204956, "learning_rate": 4.699318555637519e-05, "loss": 0.0974, "num_input_tokens_seen": 8195776, "step": 6310 }, { "epoch": 0.3085530012459385, "grad_norm": 0.4154195785522461, "learning_rate": 4.6988515862884525e-05, "loss": 0.102, "num_input_tokens_seen": 8202272, "step": 6315 }, { "epoch": 0.3087973029096328, "grad_norm": 1.0901411771774292, "learning_rate": 4.698384277848838e-05, "loss": 0.0896, "num_input_tokens_seen": 8208640, "step": 6320 }, { "epoch": 0.30904160457332713, "grad_norm": 0.4970794916152954, "learning_rate": 4.6979166303907425e-05, "loss": 0.1121, "num_input_tokens_seen": 8214944, "step": 6325 }, { "epoch": 0.30928590623702146, "grad_norm": 0.32521796226501465, "learning_rate": 4.697448643986281e-05, "loss": 0.0921, "num_input_tokens_seen": 8221696, "step": 6330 }, { "epoch": 0.3095302079007158, "grad_norm": 0.4863957166671753, "learning_rate": 4.696980318707624e-05, "loss": 0.0998, "num_input_tokens_seen": 8227904, "step": 6335 }, { "epoch": 0.30977450956441016, "grad_norm": 0.1828237622976303, "learning_rate": 4.6965116546269924e-05, "loss": 0.0876, "num_input_tokens_seen": 8234400, "step": 6340 }, { "epoch": 0.3100188112281045, "grad_norm": 0.6501533389091492, "learning_rate": 4.6960426518166615e-05, "loss": 0.0966, "num_input_tokens_seen": 8241088, "step": 6345 }, { "epoch": 0.3102631128917988, "grad_norm": 0.5146931409835815, "learning_rate": 4.6955733103489556e-05, "loss": 0.1301, "num_input_tokens_seen": 8247136, "step": 6350 }, { "epoch": 0.31050741455549313, "grad_norm": 0.302685409784317, "learning_rate": 4.695103630296255e-05, "loss": 0.113, "num_input_tokens_seen": 8253280, "step": 6355 }, { "epoch": 0.31075171621918746, "grad_norm": 0.6133214235305786, "learning_rate": 4.694633611730988e-05, "loss": 0.1246, "num_input_tokens_seen": 8259392, "step": 6360 }, { "epoch": 0.3109960178828818, "grad_norm": 0.2739103436470032, "learning_rate": 4.694163254725639e-05, "loss": 0.0982, "num_input_tokens_seen": 8265920, "step": 6365 }, { "epoch": 0.3112403195465761, "grad_norm": 0.272920697927475, "learning_rate": 4.693692559352743e-05, "loss": 0.0859, "num_input_tokens_seen": 8272960, "step": 6370 }, { "epoch": 0.31148462121027043, "grad_norm": 0.4265994131565094, "learning_rate": 4.693221525684886e-05, "loss": 0.0946, "num_input_tokens_seen": 8280000, "step": 6375 }, { "epoch": 0.31172892287396475, "grad_norm": 0.23218931257724762, "learning_rate": 4.6927501537947084e-05, "loss": 0.0988, "num_input_tokens_seen": 8286144, "step": 6380 }, { "epoch": 0.3119732245376591, "grad_norm": 0.3957938849925995, "learning_rate": 4.692278443754901e-05, "loss": 0.0949, "num_input_tokens_seen": 8292320, "step": 6385 }, { "epoch": 0.31221752620135346, "grad_norm": 0.3962356448173523, "learning_rate": 4.691806395638208e-05, "loss": 0.0733, "num_input_tokens_seen": 8298848, "step": 6390 }, { "epoch": 0.3124618278650478, "grad_norm": 0.3808154761791229, "learning_rate": 4.6913340095174255e-05, "loss": 0.1076, "num_input_tokens_seen": 8305120, "step": 6395 }, { "epoch": 0.3127061295287421, "grad_norm": 0.5401763916015625, "learning_rate": 4.690861285465399e-05, "loss": 0.0997, "num_input_tokens_seen": 8311808, "step": 6400 }, { "epoch": 0.3127061295287421, "eval_loss": 0.09652689844369888, "eval_runtime": 374.9334, "eval_samples_per_second": 97.044, "eval_steps_per_second": 24.263, "num_input_tokens_seen": 8311808, "step": 6400 }, { "epoch": 0.31295043119243643, "grad_norm": 0.29489344358444214, "learning_rate": 4.690388223555031e-05, "loss": 0.0961, "num_input_tokens_seen": 8317856, "step": 6405 }, { "epoch": 0.31319473285613075, "grad_norm": 0.5153646469116211, "learning_rate": 4.689914823859273e-05, "loss": 0.1062, "num_input_tokens_seen": 8324608, "step": 6410 }, { "epoch": 0.3134390345198251, "grad_norm": 0.19459591805934906, "learning_rate": 4.689441086451129e-05, "loss": 0.1105, "num_input_tokens_seen": 8331456, "step": 6415 }, { "epoch": 0.3136833361835194, "grad_norm": 0.8802443146705627, "learning_rate": 4.688967011403655e-05, "loss": 0.098, "num_input_tokens_seen": 8337792, "step": 6420 }, { "epoch": 0.3139276378472137, "grad_norm": 0.6191151738166809, "learning_rate": 4.68849259878996e-05, "loss": 0.0979, "num_input_tokens_seen": 8344032, "step": 6425 }, { "epoch": 0.31417193951090805, "grad_norm": 0.20685191452503204, "learning_rate": 4.6880178486832036e-05, "loss": 0.0834, "num_input_tokens_seen": 8350272, "step": 6430 }, { "epoch": 0.3144162411746024, "grad_norm": 0.5125857591629028, "learning_rate": 4.687542761156598e-05, "loss": 0.0988, "num_input_tokens_seen": 8357056, "step": 6435 }, { "epoch": 0.31466054283829675, "grad_norm": 0.7260459661483765, "learning_rate": 4.6870673362834096e-05, "loss": 0.1373, "num_input_tokens_seen": 8363360, "step": 6440 }, { "epoch": 0.3149048445019911, "grad_norm": 0.32399651408195496, "learning_rate": 4.6865915741369526e-05, "loss": 0.0904, "num_input_tokens_seen": 8370016, "step": 6445 }, { "epoch": 0.3151491461656854, "grad_norm": 0.3208819627761841, "learning_rate": 4.686115474790597e-05, "loss": 0.1068, "num_input_tokens_seen": 8376128, "step": 6450 }, { "epoch": 0.3153934478293797, "grad_norm": 0.22948595881462097, "learning_rate": 4.685639038317762e-05, "loss": 0.1169, "num_input_tokens_seen": 8382176, "step": 6455 }, { "epoch": 0.31563774949307405, "grad_norm": 0.20652920007705688, "learning_rate": 4.685162264791921e-05, "loss": 0.1009, "num_input_tokens_seen": 8388512, "step": 6460 }, { "epoch": 0.3158820511567684, "grad_norm": 0.3723239302635193, "learning_rate": 4.684685154286599e-05, "loss": 0.0776, "num_input_tokens_seen": 8394816, "step": 6465 }, { "epoch": 0.3161263528204627, "grad_norm": 0.9379007816314697, "learning_rate": 4.684207706875371e-05, "loss": 0.1007, "num_input_tokens_seen": 8400800, "step": 6470 }, { "epoch": 0.316370654484157, "grad_norm": 0.22900867462158203, "learning_rate": 4.683729922631866e-05, "loss": 0.0978, "num_input_tokens_seen": 8407072, "step": 6475 }, { "epoch": 0.31661495614785135, "grad_norm": 0.38279980421066284, "learning_rate": 4.683251801629765e-05, "loss": 0.1067, "num_input_tokens_seen": 8413632, "step": 6480 }, { "epoch": 0.31685925781154567, "grad_norm": 0.20542703568935394, "learning_rate": 4.6827733439428e-05, "loss": 0.0907, "num_input_tokens_seen": 8420128, "step": 6485 }, { "epoch": 0.31710355947524005, "grad_norm": 1.049893856048584, "learning_rate": 4.682294549644754e-05, "loss": 0.085, "num_input_tokens_seen": 8426816, "step": 6490 }, { "epoch": 0.3173478611389344, "grad_norm": 0.21514475345611572, "learning_rate": 4.681815418809464e-05, "loss": 0.1051, "num_input_tokens_seen": 8433088, "step": 6495 }, { "epoch": 0.3175921628026287, "grad_norm": 0.22200337052345276, "learning_rate": 4.681335951510819e-05, "loss": 0.0863, "num_input_tokens_seen": 8439616, "step": 6500 }, { "epoch": 0.317836464466323, "grad_norm": 0.20581814646720886, "learning_rate": 4.6808561478227576e-05, "loss": 0.0883, "num_input_tokens_seen": 8445952, "step": 6505 }, { "epoch": 0.31808076613001735, "grad_norm": 0.24307692050933838, "learning_rate": 4.680376007819271e-05, "loss": 0.0878, "num_input_tokens_seen": 8451968, "step": 6510 }, { "epoch": 0.31832506779371167, "grad_norm": 0.3898006081581116, "learning_rate": 4.679895531574405e-05, "loss": 0.0941, "num_input_tokens_seen": 8457984, "step": 6515 }, { "epoch": 0.318569369457406, "grad_norm": 0.7587398886680603, "learning_rate": 4.679414719162253e-05, "loss": 0.1064, "num_input_tokens_seen": 8464288, "step": 6520 }, { "epoch": 0.3188136711211003, "grad_norm": 0.17727145552635193, "learning_rate": 4.6789335706569635e-05, "loss": 0.0791, "num_input_tokens_seen": 8470656, "step": 6525 }, { "epoch": 0.31905797278479464, "grad_norm": 0.41929078102111816, "learning_rate": 4.678452086132734e-05, "loss": 0.1342, "num_input_tokens_seen": 8476672, "step": 6530 }, { "epoch": 0.319302274448489, "grad_norm": 0.4222988486289978, "learning_rate": 4.677970265663818e-05, "loss": 0.091, "num_input_tokens_seen": 8483168, "step": 6535 }, { "epoch": 0.31954657611218334, "grad_norm": 0.31732770800590515, "learning_rate": 4.677488109324517e-05, "loss": 0.1134, "num_input_tokens_seen": 8489472, "step": 6540 }, { "epoch": 0.31979087777587767, "grad_norm": 0.3334413468837738, "learning_rate": 4.6770056171891846e-05, "loss": 0.0853, "num_input_tokens_seen": 8496256, "step": 6545 }, { "epoch": 0.320035179439572, "grad_norm": 0.7095257639884949, "learning_rate": 4.6765227893322286e-05, "loss": 0.0783, "num_input_tokens_seen": 8503744, "step": 6550 }, { "epoch": 0.3202794811032663, "grad_norm": 0.3275570869445801, "learning_rate": 4.676039625828107e-05, "loss": 0.0968, "num_input_tokens_seen": 8510560, "step": 6555 }, { "epoch": 0.32052378276696064, "grad_norm": 0.17167100310325623, "learning_rate": 4.675556126751328e-05, "loss": 0.0824, "num_input_tokens_seen": 8517344, "step": 6560 }, { "epoch": 0.32076808443065497, "grad_norm": 0.3472211956977844, "learning_rate": 4.6750722921764556e-05, "loss": 0.0847, "num_input_tokens_seen": 8523616, "step": 6565 }, { "epoch": 0.3210123860943493, "grad_norm": 0.3127390146255493, "learning_rate": 4.674588122178102e-05, "loss": 0.1164, "num_input_tokens_seen": 8529920, "step": 6570 }, { "epoch": 0.3212566877580436, "grad_norm": 0.2203143835067749, "learning_rate": 4.674103616830931e-05, "loss": 0.0951, "num_input_tokens_seen": 8536160, "step": 6575 }, { "epoch": 0.32150098942173794, "grad_norm": 0.11770858615636826, "learning_rate": 4.673618776209663e-05, "loss": 0.0955, "num_input_tokens_seen": 8542464, "step": 6580 }, { "epoch": 0.3217452910854323, "grad_norm": 0.2054276466369629, "learning_rate": 4.673133600389063e-05, "loss": 0.0815, "num_input_tokens_seen": 8548576, "step": 6585 }, { "epoch": 0.32198959274912664, "grad_norm": 0.21968188881874084, "learning_rate": 4.672648089443953e-05, "loss": 0.1184, "num_input_tokens_seen": 8555552, "step": 6590 }, { "epoch": 0.32223389441282096, "grad_norm": 0.14749445021152496, "learning_rate": 4.672162243449204e-05, "loss": 0.0994, "num_input_tokens_seen": 8562016, "step": 6595 }, { "epoch": 0.3224781960765153, "grad_norm": 0.3071030080318451, "learning_rate": 4.67167606247974e-05, "loss": 0.1088, "num_input_tokens_seen": 8568416, "step": 6600 }, { "epoch": 0.3224781960765153, "eval_loss": 0.09665754437446594, "eval_runtime": 374.5563, "eval_samples_per_second": 97.142, "eval_steps_per_second": 24.287, "num_input_tokens_seen": 8568416, "step": 6600 }, { "epoch": 0.3227224977402096, "grad_norm": 0.26558318734169006, "learning_rate": 4.671189546610536e-05, "loss": 0.1083, "num_input_tokens_seen": 8574848, "step": 6605 }, { "epoch": 0.32296679940390394, "grad_norm": 1.266261339187622, "learning_rate": 4.67070269591662e-05, "loss": 0.1118, "num_input_tokens_seen": 8581696, "step": 6610 }, { "epoch": 0.32321110106759826, "grad_norm": 0.1399136632680893, "learning_rate": 4.670215510473068e-05, "loss": 0.0777, "num_input_tokens_seen": 8588480, "step": 6615 }, { "epoch": 0.3234554027312926, "grad_norm": 0.313291072845459, "learning_rate": 4.669727990355013e-05, "loss": 0.079, "num_input_tokens_seen": 8595200, "step": 6620 }, { "epoch": 0.3236997043949869, "grad_norm": 0.2375296950340271, "learning_rate": 4.669240135637635e-05, "loss": 0.0972, "num_input_tokens_seen": 8601824, "step": 6625 }, { "epoch": 0.32394400605868123, "grad_norm": 0.3401222825050354, "learning_rate": 4.6687519463961675e-05, "loss": 0.0996, "num_input_tokens_seen": 8608320, "step": 6630 }, { "epoch": 0.3241883077223756, "grad_norm": 0.3387232720851898, "learning_rate": 4.668263422705896e-05, "loss": 0.0905, "num_input_tokens_seen": 8615552, "step": 6635 }, { "epoch": 0.32443260938606994, "grad_norm": 0.7111276984214783, "learning_rate": 4.667774564642156e-05, "loss": 0.1079, "num_input_tokens_seen": 8622112, "step": 6640 }, { "epoch": 0.32467691104976426, "grad_norm": 0.28168144822120667, "learning_rate": 4.6672853722803365e-05, "loss": 0.1208, "num_input_tokens_seen": 8628544, "step": 6645 }, { "epoch": 0.3249212127134586, "grad_norm": 0.6079109907150269, "learning_rate": 4.666795845695877e-05, "loss": 0.0839, "num_input_tokens_seen": 8635200, "step": 6650 }, { "epoch": 0.3251655143771529, "grad_norm": 0.20271706581115723, "learning_rate": 4.666305984964269e-05, "loss": 0.066, "num_input_tokens_seen": 8641280, "step": 6655 }, { "epoch": 0.32540981604084723, "grad_norm": 0.2475515455007553, "learning_rate": 4.6658157901610535e-05, "loss": 0.1106, "num_input_tokens_seen": 8648160, "step": 6660 }, { "epoch": 0.32565411770454156, "grad_norm": 0.21975202858448029, "learning_rate": 4.665325261361826e-05, "loss": 0.1007, "num_input_tokens_seen": 8654400, "step": 6665 }, { "epoch": 0.3258984193682359, "grad_norm": 0.4545229375362396, "learning_rate": 4.664834398642232e-05, "loss": 0.0994, "num_input_tokens_seen": 8661408, "step": 6670 }, { "epoch": 0.3261427210319302, "grad_norm": 0.25218504667282104, "learning_rate": 4.6643432020779686e-05, "loss": 0.1049, "num_input_tokens_seen": 8667776, "step": 6675 }, { "epoch": 0.3263870226956246, "grad_norm": 0.4199483394622803, "learning_rate": 4.663851671744786e-05, "loss": 0.0825, "num_input_tokens_seen": 8674208, "step": 6680 }, { "epoch": 0.3266313243593189, "grad_norm": 0.4125681221485138, "learning_rate": 4.6633598077184815e-05, "loss": 0.081, "num_input_tokens_seen": 8680576, "step": 6685 }, { "epoch": 0.32687562602301323, "grad_norm": 0.22639521956443787, "learning_rate": 4.662867610074908e-05, "loss": 0.0875, "num_input_tokens_seen": 8687360, "step": 6690 }, { "epoch": 0.32711992768670756, "grad_norm": 0.22832851111888885, "learning_rate": 4.6623750788899696e-05, "loss": 0.0756, "num_input_tokens_seen": 8693728, "step": 6695 }, { "epoch": 0.3273642293504019, "grad_norm": 0.4847135543823242, "learning_rate": 4.6618822142396195e-05, "loss": 0.1238, "num_input_tokens_seen": 8700160, "step": 6700 }, { "epoch": 0.3276085310140962, "grad_norm": 0.15553246438503265, "learning_rate": 4.661389016199864e-05, "loss": 0.118, "num_input_tokens_seen": 8706560, "step": 6705 }, { "epoch": 0.32785283267779053, "grad_norm": 0.27314677834510803, "learning_rate": 4.660895484846761e-05, "loss": 0.0969, "num_input_tokens_seen": 8712672, "step": 6710 }, { "epoch": 0.32809713434148485, "grad_norm": 0.17922787368297577, "learning_rate": 4.660401620256418e-05, "loss": 0.1041, "num_input_tokens_seen": 8719584, "step": 6715 }, { "epoch": 0.3283414360051792, "grad_norm": 0.1568642109632492, "learning_rate": 4.659907422504997e-05, "loss": 0.1016, "num_input_tokens_seen": 8726688, "step": 6720 }, { "epoch": 0.3285857376688735, "grad_norm": 0.863193690776825, "learning_rate": 4.6594128916687074e-05, "loss": 0.1023, "num_input_tokens_seen": 8733440, "step": 6725 }, { "epoch": 0.3288300393325679, "grad_norm": 0.25032225251197815, "learning_rate": 4.658918027823813e-05, "loss": 0.1016, "num_input_tokens_seen": 8739872, "step": 6730 }, { "epoch": 0.3290743409962622, "grad_norm": 0.5146589279174805, "learning_rate": 4.658422831046628e-05, "loss": 0.1049, "num_input_tokens_seen": 8746528, "step": 6735 }, { "epoch": 0.32931864265995653, "grad_norm": 0.4492826461791992, "learning_rate": 4.657927301413518e-05, "loss": 0.1138, "num_input_tokens_seen": 8752864, "step": 6740 }, { "epoch": 0.32956294432365085, "grad_norm": 0.7462248206138611, "learning_rate": 4.657431439000901e-05, "loss": 0.1179, "num_input_tokens_seen": 8759744, "step": 6745 }, { "epoch": 0.3298072459873452, "grad_norm": 0.45156151056289673, "learning_rate": 4.656935243885243e-05, "loss": 0.0887, "num_input_tokens_seen": 8765664, "step": 6750 }, { "epoch": 0.3300515476510395, "grad_norm": 0.5316187143325806, "learning_rate": 4.656438716143066e-05, "loss": 0.0942, "num_input_tokens_seen": 8771680, "step": 6755 }, { "epoch": 0.3302958493147338, "grad_norm": 0.6096909642219543, "learning_rate": 4.6559418558509384e-05, "loss": 0.0789, "num_input_tokens_seen": 8778528, "step": 6760 }, { "epoch": 0.33054015097842815, "grad_norm": 0.2576163709163666, "learning_rate": 4.6554446630854833e-05, "loss": 0.1133, "num_input_tokens_seen": 8785088, "step": 6765 }, { "epoch": 0.3307844526421225, "grad_norm": 0.3542119562625885, "learning_rate": 4.654947137923374e-05, "loss": 0.081, "num_input_tokens_seen": 8791360, "step": 6770 }, { "epoch": 0.3310287543058168, "grad_norm": 0.4521823227405548, "learning_rate": 4.654449280441335e-05, "loss": 0.0836, "num_input_tokens_seen": 8797824, "step": 6775 }, { "epoch": 0.3312730559695112, "grad_norm": 1.2152637243270874, "learning_rate": 4.653951090716143e-05, "loss": 0.1038, "num_input_tokens_seen": 8804160, "step": 6780 }, { "epoch": 0.3315173576332055, "grad_norm": 0.3236430883407593, "learning_rate": 4.653452568824625e-05, "loss": 0.0841, "num_input_tokens_seen": 8810848, "step": 6785 }, { "epoch": 0.3317616592968998, "grad_norm": 1.2942639589309692, "learning_rate": 4.6529537148436585e-05, "loss": 0.0924, "num_input_tokens_seen": 8817312, "step": 6790 }, { "epoch": 0.33200596096059415, "grad_norm": 0.5791277289390564, "learning_rate": 4.6524545288501734e-05, "loss": 0.0962, "num_input_tokens_seen": 8824032, "step": 6795 }, { "epoch": 0.3322502626242885, "grad_norm": 0.1854354292154312, "learning_rate": 4.6519550109211506e-05, "loss": 0.0638, "num_input_tokens_seen": 8830400, "step": 6800 }, { "epoch": 0.3322502626242885, "eval_loss": 0.09633044898509979, "eval_runtime": 375.0898, "eval_samples_per_second": 97.003, "eval_steps_per_second": 24.253, "num_input_tokens_seen": 8830400, "step": 6800 }, { "epoch": 0.3324945642879828, "grad_norm": 0.22719000279903412, "learning_rate": 4.651455161133622e-05, "loss": 0.1059, "num_input_tokens_seen": 8836864, "step": 6805 }, { "epoch": 0.3327388659516771, "grad_norm": 0.3890743553638458, "learning_rate": 4.6509549795646704e-05, "loss": 0.0685, "num_input_tokens_seen": 8843520, "step": 6810 }, { "epoch": 0.33298316761537144, "grad_norm": 0.5180655121803284, "learning_rate": 4.6504544662914306e-05, "loss": 0.0899, "num_input_tokens_seen": 8850016, "step": 6815 }, { "epoch": 0.33322746927906577, "grad_norm": 0.4497598111629486, "learning_rate": 4.6499536213910876e-05, "loss": 0.0691, "num_input_tokens_seen": 8856384, "step": 6820 }, { "epoch": 0.3334717709427601, "grad_norm": 0.2348642647266388, "learning_rate": 4.6494524449408786e-05, "loss": 0.0888, "num_input_tokens_seen": 8863072, "step": 6825 }, { "epoch": 0.3337160726064545, "grad_norm": 0.1755627691745758, "learning_rate": 4.6489509370180903e-05, "loss": 0.0901, "num_input_tokens_seen": 8869568, "step": 6830 }, { "epoch": 0.3339603742701488, "grad_norm": 0.39013513922691345, "learning_rate": 4.648449097700063e-05, "loss": 0.098, "num_input_tokens_seen": 8876032, "step": 6835 }, { "epoch": 0.3342046759338431, "grad_norm": 0.164560467004776, "learning_rate": 4.647946927064185e-05, "loss": 0.1097, "num_input_tokens_seen": 8882208, "step": 6840 }, { "epoch": 0.33444897759753744, "grad_norm": 0.21302202343940735, "learning_rate": 4.647444425187898e-05, "loss": 0.0822, "num_input_tokens_seen": 8888608, "step": 6845 }, { "epoch": 0.33469327926123177, "grad_norm": 0.15892136096954346, "learning_rate": 4.646941592148695e-05, "loss": 0.1019, "num_input_tokens_seen": 8895872, "step": 6850 }, { "epoch": 0.3349375809249261, "grad_norm": 0.5791984796524048, "learning_rate": 4.646438428024117e-05, "loss": 0.1038, "num_input_tokens_seen": 8902496, "step": 6855 }, { "epoch": 0.3351818825886204, "grad_norm": 0.23105904459953308, "learning_rate": 4.64593493289176e-05, "loss": 0.1318, "num_input_tokens_seen": 8908832, "step": 6860 }, { "epoch": 0.33542618425231474, "grad_norm": 0.5279911160469055, "learning_rate": 4.64543110682927e-05, "loss": 0.1264, "num_input_tokens_seen": 8915008, "step": 6865 }, { "epoch": 0.33567048591600906, "grad_norm": 0.3584080934524536, "learning_rate": 4.644926949914341e-05, "loss": 0.0825, "num_input_tokens_seen": 8920992, "step": 6870 }, { "epoch": 0.33591478757970344, "grad_norm": 0.17046168446540833, "learning_rate": 4.644422462224722e-05, "loss": 0.0764, "num_input_tokens_seen": 8927744, "step": 6875 }, { "epoch": 0.33615908924339777, "grad_norm": 0.270063579082489, "learning_rate": 4.643917643838211e-05, "loss": 0.0951, "num_input_tokens_seen": 8934560, "step": 6880 }, { "epoch": 0.3364033909070921, "grad_norm": 0.5159220695495605, "learning_rate": 4.6434124948326564e-05, "loss": 0.1079, "num_input_tokens_seen": 8941344, "step": 6885 }, { "epoch": 0.3366476925707864, "grad_norm": 0.16833680868148804, "learning_rate": 4.6429070152859594e-05, "loss": 0.0992, "num_input_tokens_seen": 8947936, "step": 6890 }, { "epoch": 0.33689199423448074, "grad_norm": 0.23169288039207458, "learning_rate": 4.6424012052760714e-05, "loss": 0.1223, "num_input_tokens_seen": 8955136, "step": 6895 }, { "epoch": 0.33713629589817506, "grad_norm": 0.3539023697376251, "learning_rate": 4.6418950648809945e-05, "loss": 0.1304, "num_input_tokens_seen": 8961472, "step": 6900 }, { "epoch": 0.3373805975618694, "grad_norm": 0.261599063873291, "learning_rate": 4.641388594178782e-05, "loss": 0.0995, "num_input_tokens_seen": 8967808, "step": 6905 }, { "epoch": 0.3376248992255637, "grad_norm": 0.22506673634052277, "learning_rate": 4.640881793247538e-05, "loss": 0.0721, "num_input_tokens_seen": 8974304, "step": 6910 }, { "epoch": 0.33786920088925804, "grad_norm": 0.519288957118988, "learning_rate": 4.6403746621654173e-05, "loss": 0.0857, "num_input_tokens_seen": 8980928, "step": 6915 }, { "epoch": 0.33811350255295236, "grad_norm": 0.19205817580223083, "learning_rate": 4.639867201010626e-05, "loss": 0.0953, "num_input_tokens_seen": 8987264, "step": 6920 }, { "epoch": 0.33835780421664674, "grad_norm": 0.3876197934150696, "learning_rate": 4.6393594098614204e-05, "loss": 0.0931, "num_input_tokens_seen": 8993728, "step": 6925 }, { "epoch": 0.33860210588034106, "grad_norm": 0.29678305983543396, "learning_rate": 4.63885128879611e-05, "loss": 0.0929, "num_input_tokens_seen": 9000256, "step": 6930 }, { "epoch": 0.3388464075440354, "grad_norm": 0.5200182199478149, "learning_rate": 4.638342837893052e-05, "loss": 0.0948, "num_input_tokens_seen": 9006752, "step": 6935 }, { "epoch": 0.3390907092077297, "grad_norm": 0.24472558498382568, "learning_rate": 4.6378340572306565e-05, "loss": 0.0944, "num_input_tokens_seen": 9013856, "step": 6940 }, { "epoch": 0.33933501087142404, "grad_norm": 0.2699231803417206, "learning_rate": 4.6373249468873833e-05, "loss": 0.1056, "num_input_tokens_seen": 9020128, "step": 6945 }, { "epoch": 0.33957931253511836, "grad_norm": 0.5463542342185974, "learning_rate": 4.636815506941744e-05, "loss": 0.0764, "num_input_tokens_seen": 9026176, "step": 6950 }, { "epoch": 0.3398236141988127, "grad_norm": 0.5582578182220459, "learning_rate": 4.6363057374723004e-05, "loss": 0.0781, "num_input_tokens_seen": 9032576, "step": 6955 }, { "epoch": 0.340067915862507, "grad_norm": 0.5236517190933228, "learning_rate": 4.635795638557666e-05, "loss": 0.0904, "num_input_tokens_seen": 9038752, "step": 6960 }, { "epoch": 0.34031221752620133, "grad_norm": 0.5602813959121704, "learning_rate": 4.635285210276504e-05, "loss": 0.0701, "num_input_tokens_seen": 9044960, "step": 6965 }, { "epoch": 0.34055651918989566, "grad_norm": 0.21379488706588745, "learning_rate": 4.6347744527075295e-05, "loss": 0.1154, "num_input_tokens_seen": 9051552, "step": 6970 }, { "epoch": 0.34080082085359004, "grad_norm": 0.3556109666824341, "learning_rate": 4.634263365929506e-05, "loss": 0.0968, "num_input_tokens_seen": 9058048, "step": 6975 }, { "epoch": 0.34104512251728436, "grad_norm": 0.4515099823474884, "learning_rate": 4.6337519500212515e-05, "loss": 0.1065, "num_input_tokens_seen": 9064480, "step": 6980 }, { "epoch": 0.3412894241809787, "grad_norm": 0.3550862669944763, "learning_rate": 4.633240205061632e-05, "loss": 0.1162, "num_input_tokens_seen": 9071488, "step": 6985 }, { "epoch": 0.341533725844673, "grad_norm": 0.41133394837379456, "learning_rate": 4.632728131129565e-05, "loss": 0.1327, "num_input_tokens_seen": 9077920, "step": 6990 }, { "epoch": 0.34177802750836733, "grad_norm": 0.2531043589115143, "learning_rate": 4.632215728304018e-05, "loss": 0.0749, "num_input_tokens_seen": 9084672, "step": 6995 }, { "epoch": 0.34202232917206166, "grad_norm": 0.16680778563022614, "learning_rate": 4.63170299666401e-05, "loss": 0.114, "num_input_tokens_seen": 9091040, "step": 7000 }, { "epoch": 0.34202232917206166, "eval_loss": 0.09608318656682968, "eval_runtime": 375.1374, "eval_samples_per_second": 96.991, "eval_steps_per_second": 24.25, "num_input_tokens_seen": 9091040, "step": 7000 }, { "epoch": 0.342266630835756, "grad_norm": 0.23337538540363312, "learning_rate": 4.631189936288612e-05, "loss": 0.081, "num_input_tokens_seen": 9097312, "step": 7005 }, { "epoch": 0.3425109324994503, "grad_norm": 0.5350567698478699, "learning_rate": 4.630676547256944e-05, "loss": 0.1046, "num_input_tokens_seen": 9104000, "step": 7010 }, { "epoch": 0.34275523416314463, "grad_norm": 0.1471272110939026, "learning_rate": 4.630162829648176e-05, "loss": 0.1069, "num_input_tokens_seen": 9110304, "step": 7015 }, { "epoch": 0.342999535826839, "grad_norm": 1.0054364204406738, "learning_rate": 4.629648783541531e-05, "loss": 0.1057, "num_input_tokens_seen": 9116736, "step": 7020 }, { "epoch": 0.34324383749053333, "grad_norm": 0.16202262043952942, "learning_rate": 4.6291344090162804e-05, "loss": 0.1071, "num_input_tokens_seen": 9122912, "step": 7025 }, { "epoch": 0.34348813915422766, "grad_norm": 0.23545308411121368, "learning_rate": 4.628619706151748e-05, "loss": 0.0876, "num_input_tokens_seen": 9129440, "step": 7030 }, { "epoch": 0.343732440817922, "grad_norm": 0.44832131266593933, "learning_rate": 4.628104675027306e-05, "loss": 0.0914, "num_input_tokens_seen": 9135712, "step": 7035 }, { "epoch": 0.3439767424816163, "grad_norm": 0.31616097688674927, "learning_rate": 4.6275893157223805e-05, "loss": 0.0765, "num_input_tokens_seen": 9142528, "step": 7040 }, { "epoch": 0.34422104414531063, "grad_norm": 0.24060198664665222, "learning_rate": 4.627073628316445e-05, "loss": 0.0841, "num_input_tokens_seen": 9149472, "step": 7045 }, { "epoch": 0.34446534580900495, "grad_norm": 0.3735615015029907, "learning_rate": 4.626557612889026e-05, "loss": 0.092, "num_input_tokens_seen": 9155424, "step": 7050 }, { "epoch": 0.3447096474726993, "grad_norm": 0.3381032943725586, "learning_rate": 4.626041269519699e-05, "loss": 0.0908, "num_input_tokens_seen": 9162048, "step": 7055 }, { "epoch": 0.3449539491363936, "grad_norm": 0.6544587016105652, "learning_rate": 4.6255245982880905e-05, "loss": 0.0752, "num_input_tokens_seen": 9169056, "step": 7060 }, { "epoch": 0.3451982508000879, "grad_norm": 0.8924040794372559, "learning_rate": 4.625007599273879e-05, "loss": 0.1048, "num_input_tokens_seen": 9175424, "step": 7065 }, { "epoch": 0.3454425524637823, "grad_norm": 0.6964027881622314, "learning_rate": 4.6244902725567895e-05, "loss": 0.0795, "num_input_tokens_seen": 9181760, "step": 7070 }, { "epoch": 0.34568685412747663, "grad_norm": 0.25748410820961, "learning_rate": 4.6239726182166024e-05, "loss": 0.1097, "num_input_tokens_seen": 9188832, "step": 7075 }, { "epoch": 0.34593115579117095, "grad_norm": 0.5754373073577881, "learning_rate": 4.623454636333147e-05, "loss": 0.1159, "num_input_tokens_seen": 9194912, "step": 7080 }, { "epoch": 0.3461754574548653, "grad_norm": 0.37527069449424744, "learning_rate": 4.622936326986301e-05, "loss": 0.1065, "num_input_tokens_seen": 9201376, "step": 7085 }, { "epoch": 0.3464197591185596, "grad_norm": 0.7837941646575928, "learning_rate": 4.6224176902559946e-05, "loss": 0.1119, "num_input_tokens_seen": 9207840, "step": 7090 }, { "epoch": 0.3466640607822539, "grad_norm": 0.37711766362190247, "learning_rate": 4.621898726222209e-05, "loss": 0.1054, "num_input_tokens_seen": 9214400, "step": 7095 }, { "epoch": 0.34690836244594825, "grad_norm": 0.25164729356765747, "learning_rate": 4.6213794349649744e-05, "loss": 0.088, "num_input_tokens_seen": 9220768, "step": 7100 }, { "epoch": 0.3471526641096426, "grad_norm": 0.4564807116985321, "learning_rate": 4.6208598165643715e-05, "loss": 0.1117, "num_input_tokens_seen": 9227296, "step": 7105 }, { "epoch": 0.3473969657733369, "grad_norm": 0.23389315605163574, "learning_rate": 4.620339871100533e-05, "loss": 0.0828, "num_input_tokens_seen": 9233856, "step": 7110 }, { "epoch": 0.3476412674370312, "grad_norm": 0.34571751952171326, "learning_rate": 4.6198195986536394e-05, "loss": 0.077, "num_input_tokens_seen": 9239936, "step": 7115 }, { "epoch": 0.3478855691007256, "grad_norm": 0.26823511719703674, "learning_rate": 4.619298999303926e-05, "loss": 0.1074, "num_input_tokens_seen": 9246048, "step": 7120 }, { "epoch": 0.3481298707644199, "grad_norm": 0.24435152113437653, "learning_rate": 4.618778073131673e-05, "loss": 0.0905, "num_input_tokens_seen": 9252288, "step": 7125 }, { "epoch": 0.34837417242811425, "grad_norm": 0.8718639016151428, "learning_rate": 4.618256820217215e-05, "loss": 0.0942, "num_input_tokens_seen": 9259520, "step": 7130 }, { "epoch": 0.3486184740918086, "grad_norm": 0.24350665509700775, "learning_rate": 4.617735240640936e-05, "loss": 0.09, "num_input_tokens_seen": 9265952, "step": 7135 }, { "epoch": 0.3488627757555029, "grad_norm": 0.43934282660484314, "learning_rate": 4.6172133344832705e-05, "loss": 0.126, "num_input_tokens_seen": 9272096, "step": 7140 }, { "epoch": 0.3491070774191972, "grad_norm": 0.39389699697494507, "learning_rate": 4.6166911018247004e-05, "loss": 0.0885, "num_input_tokens_seen": 9278944, "step": 7145 }, { "epoch": 0.34935137908289154, "grad_norm": 0.22860507667064667, "learning_rate": 4.616168542745764e-05, "loss": 0.1069, "num_input_tokens_seen": 9284992, "step": 7150 }, { "epoch": 0.34959568074658587, "grad_norm": 0.3294461965560913, "learning_rate": 4.6156456573270446e-05, "loss": 0.1209, "num_input_tokens_seen": 9291520, "step": 7155 }, { "epoch": 0.3498399824102802, "grad_norm": 0.14749279618263245, "learning_rate": 4.615122445649177e-05, "loss": 0.081, "num_input_tokens_seen": 9298080, "step": 7160 }, { "epoch": 0.3500842840739745, "grad_norm": 0.6459203362464905, "learning_rate": 4.6145989077928486e-05, "loss": 0.1039, "num_input_tokens_seen": 9304192, "step": 7165 }, { "epoch": 0.3503285857376689, "grad_norm": 0.2079561948776245, "learning_rate": 4.6140750438387953e-05, "loss": 0.0977, "num_input_tokens_seen": 9310528, "step": 7170 }, { "epoch": 0.3505728874013632, "grad_norm": 0.5643799901008606, "learning_rate": 4.613550853867803e-05, "loss": 0.0865, "num_input_tokens_seen": 9317408, "step": 7175 }, { "epoch": 0.35081718906505754, "grad_norm": 0.446873277425766, "learning_rate": 4.613026337960708e-05, "loss": 0.0865, "num_input_tokens_seen": 9324000, "step": 7180 }, { "epoch": 0.35106149072875187, "grad_norm": 0.724429190158844, "learning_rate": 4.612501496198398e-05, "loss": 0.0722, "num_input_tokens_seen": 9330368, "step": 7185 }, { "epoch": 0.3513057923924462, "grad_norm": 0.48229730129241943, "learning_rate": 4.61197632866181e-05, "loss": 0.1081, "num_input_tokens_seen": 9336640, "step": 7190 }, { "epoch": 0.3515500940561405, "grad_norm": 0.40992647409439087, "learning_rate": 4.611450835431931e-05, "loss": 0.0912, "num_input_tokens_seen": 9343392, "step": 7195 }, { "epoch": 0.35179439571983484, "grad_norm": 0.8364034295082092, "learning_rate": 4.6109250165898e-05, "loss": 0.0982, "num_input_tokens_seen": 9350272, "step": 7200 }, { "epoch": 0.35179439571983484, "eval_loss": 0.09585875272750854, "eval_runtime": 374.9072, "eval_samples_per_second": 97.051, "eval_steps_per_second": 24.265, "num_input_tokens_seen": 9350272, "step": 7200 }, { "epoch": 0.35203869738352916, "grad_norm": 0.29683157801628113, "learning_rate": 4.610398872216503e-05, "loss": 0.1082, "num_input_tokens_seen": 9356576, "step": 7205 }, { "epoch": 0.3522829990472235, "grad_norm": 0.2395683377981186, "learning_rate": 4.6098724023931796e-05, "loss": 0.0952, "num_input_tokens_seen": 9362848, "step": 7210 }, { "epoch": 0.35252730071091787, "grad_norm": 0.2822047173976898, "learning_rate": 4.609345607201017e-05, "loss": 0.0979, "num_input_tokens_seen": 9369472, "step": 7215 }, { "epoch": 0.3527716023746122, "grad_norm": 0.6766539812088013, "learning_rate": 4.608818486721254e-05, "loss": 0.1057, "num_input_tokens_seen": 9375360, "step": 7220 }, { "epoch": 0.3530159040383065, "grad_norm": 0.44512808322906494, "learning_rate": 4.608291041035179e-05, "loss": 0.0917, "num_input_tokens_seen": 9382720, "step": 7225 }, { "epoch": 0.35326020570200084, "grad_norm": 0.610241174697876, "learning_rate": 4.607763270224132e-05, "loss": 0.084, "num_input_tokens_seen": 9389440, "step": 7230 }, { "epoch": 0.35350450736569516, "grad_norm": 0.16365033388137817, "learning_rate": 4.6072351743695e-05, "loss": 0.1095, "num_input_tokens_seen": 9395520, "step": 7235 }, { "epoch": 0.3537488090293895, "grad_norm": 0.15372923016548157, "learning_rate": 4.606706753552723e-05, "loss": 0.0842, "num_input_tokens_seen": 9401536, "step": 7240 }, { "epoch": 0.3539931106930838, "grad_norm": 0.18984533846378326, "learning_rate": 4.6061780078552906e-05, "loss": 0.0905, "num_input_tokens_seen": 9408320, "step": 7245 }, { "epoch": 0.35423741235677814, "grad_norm": 0.9000102877616882, "learning_rate": 4.605648937358742e-05, "loss": 0.0861, "num_input_tokens_seen": 9414688, "step": 7250 }, { "epoch": 0.35448171402047246, "grad_norm": 0.13787023723125458, "learning_rate": 4.605119542144665e-05, "loss": 0.0697, "num_input_tokens_seen": 9421536, "step": 7255 }, { "epoch": 0.3547260156841668, "grad_norm": 0.4291568398475647, "learning_rate": 4.604589822294701e-05, "loss": 0.0948, "num_input_tokens_seen": 9428032, "step": 7260 }, { "epoch": 0.35497031734786116, "grad_norm": 0.3419637382030487, "learning_rate": 4.604059777890537e-05, "loss": 0.0968, "num_input_tokens_seen": 9434272, "step": 7265 }, { "epoch": 0.3552146190115555, "grad_norm": 0.29854816198349, "learning_rate": 4.6035294090139145e-05, "loss": 0.0833, "num_input_tokens_seen": 9440768, "step": 7270 }, { "epoch": 0.3554589206752498, "grad_norm": 0.1672353744506836, "learning_rate": 4.6029987157466226e-05, "loss": 0.0998, "num_input_tokens_seen": 9447328, "step": 7275 }, { "epoch": 0.35570322233894414, "grad_norm": 0.9627863168716431, "learning_rate": 4.602467698170502e-05, "loss": 0.0994, "num_input_tokens_seen": 9454112, "step": 7280 }, { "epoch": 0.35594752400263846, "grad_norm": 0.15947259962558746, "learning_rate": 4.601936356367439e-05, "loss": 0.0901, "num_input_tokens_seen": 9460224, "step": 7285 }, { "epoch": 0.3561918256663328, "grad_norm": 0.1844579577445984, "learning_rate": 4.601404690419377e-05, "loss": 0.0823, "num_input_tokens_seen": 9466848, "step": 7290 }, { "epoch": 0.3564361273300271, "grad_norm": 0.272942453622818, "learning_rate": 4.600872700408303e-05, "loss": 0.1225, "num_input_tokens_seen": 9473536, "step": 7295 }, { "epoch": 0.35668042899372143, "grad_norm": 0.598596453666687, "learning_rate": 4.600340386416258e-05, "loss": 0.1026, "num_input_tokens_seen": 9480352, "step": 7300 }, { "epoch": 0.35692473065741576, "grad_norm": 0.3421473205089569, "learning_rate": 4.5998077485253296e-05, "loss": 0.0821, "num_input_tokens_seen": 9486656, "step": 7305 }, { "epoch": 0.3571690323211101, "grad_norm": 0.2250036597251892, "learning_rate": 4.59927478681766e-05, "loss": 0.0968, "num_input_tokens_seen": 9493280, "step": 7310 }, { "epoch": 0.35741333398480446, "grad_norm": 0.5540391802787781, "learning_rate": 4.5987415013754366e-05, "loss": 0.1081, "num_input_tokens_seen": 9499616, "step": 7315 }, { "epoch": 0.3576576356484988, "grad_norm": 0.24305523931980133, "learning_rate": 4.598207892280899e-05, "loss": 0.074, "num_input_tokens_seen": 9506432, "step": 7320 }, { "epoch": 0.3579019373121931, "grad_norm": 0.9592607617378235, "learning_rate": 4.597673959616337e-05, "loss": 0.1309, "num_input_tokens_seen": 9512608, "step": 7325 }, { "epoch": 0.35814623897588743, "grad_norm": 0.20522546768188477, "learning_rate": 4.597139703464089e-05, "loss": 0.0748, "num_input_tokens_seen": 9519104, "step": 7330 }, { "epoch": 0.35839054063958176, "grad_norm": 0.658508837223053, "learning_rate": 4.596605123906545e-05, "loss": 0.1037, "num_input_tokens_seen": 9525760, "step": 7335 }, { "epoch": 0.3586348423032761, "grad_norm": 0.2821878492832184, "learning_rate": 4.596070221026143e-05, "loss": 0.1001, "num_input_tokens_seen": 9531616, "step": 7340 }, { "epoch": 0.3588791439669704, "grad_norm": 0.19046127796173096, "learning_rate": 4.595534994905372e-05, "loss": 0.083, "num_input_tokens_seen": 9538016, "step": 7345 }, { "epoch": 0.35912344563066473, "grad_norm": 0.23124246299266815, "learning_rate": 4.594999445626771e-05, "loss": 0.1161, "num_input_tokens_seen": 9544352, "step": 7350 }, { "epoch": 0.35936774729435905, "grad_norm": 0.8199215531349182, "learning_rate": 4.5944635732729276e-05, "loss": 0.097, "num_input_tokens_seen": 9550816, "step": 7355 }, { "epoch": 0.35961204895805343, "grad_norm": 0.3270043134689331, "learning_rate": 4.5939273779264804e-05, "loss": 0.0839, "num_input_tokens_seen": 9557152, "step": 7360 }, { "epoch": 0.35985635062174776, "grad_norm": 0.6138167977333069, "learning_rate": 4.593390859670118e-05, "loss": 0.0677, "num_input_tokens_seen": 9563872, "step": 7365 }, { "epoch": 0.3601006522854421, "grad_norm": 0.3018108010292053, "learning_rate": 4.5928540185865776e-05, "loss": 0.0827, "num_input_tokens_seen": 9570464, "step": 7370 }, { "epoch": 0.3603449539491364, "grad_norm": 0.2651233375072479, "learning_rate": 4.592316854758648e-05, "loss": 0.1124, "num_input_tokens_seen": 9576864, "step": 7375 }, { "epoch": 0.36058925561283073, "grad_norm": 0.1556655317544937, "learning_rate": 4.5917793682691646e-05, "loss": 0.076, "num_input_tokens_seen": 9583616, "step": 7380 }, { "epoch": 0.36083355727652505, "grad_norm": 0.2546665668487549, "learning_rate": 4.5912415592010164e-05, "loss": 0.0863, "num_input_tokens_seen": 9590048, "step": 7385 }, { "epoch": 0.3610778589402194, "grad_norm": 0.29070353507995605, "learning_rate": 4.5907034276371386e-05, "loss": 0.1015, "num_input_tokens_seen": 9596480, "step": 7390 }, { "epoch": 0.3613221606039137, "grad_norm": 0.29489922523498535, "learning_rate": 4.5901649736605196e-05, "loss": 0.0958, "num_input_tokens_seen": 9602880, "step": 7395 }, { "epoch": 0.361566462267608, "grad_norm": 0.6577204465866089, "learning_rate": 4.589626197354195e-05, "loss": 0.1036, "num_input_tokens_seen": 9609312, "step": 7400 }, { "epoch": 0.361566462267608, "eval_loss": 0.09535429626703262, "eval_runtime": 375.5231, "eval_samples_per_second": 96.892, "eval_steps_per_second": 24.225, "num_input_tokens_seen": 9609312, "step": 7400 }, { "epoch": 0.36181076393130235, "grad_norm": 0.4312498867511749, "learning_rate": 4.5890870988012504e-05, "loss": 0.0957, "num_input_tokens_seen": 9615360, "step": 7405 }, { "epoch": 0.36205506559499673, "grad_norm": 0.14470405876636505, "learning_rate": 4.5885476780848226e-05, "loss": 0.0987, "num_input_tokens_seen": 9621888, "step": 7410 }, { "epoch": 0.36229936725869105, "grad_norm": 0.1626768857240677, "learning_rate": 4.5880079352880964e-05, "loss": 0.1048, "num_input_tokens_seen": 9628384, "step": 7415 }, { "epoch": 0.3625436689223854, "grad_norm": 0.6011003851890564, "learning_rate": 4.5874678704943065e-05, "loss": 0.1043, "num_input_tokens_seen": 9634816, "step": 7420 }, { "epoch": 0.3627879705860797, "grad_norm": 0.8495525121688843, "learning_rate": 4.5869274837867394e-05, "loss": 0.1227, "num_input_tokens_seen": 9640800, "step": 7425 }, { "epoch": 0.363032272249774, "grad_norm": 0.18628095090389252, "learning_rate": 4.5863867752487275e-05, "loss": 0.0999, "num_input_tokens_seen": 9646944, "step": 7430 }, { "epoch": 0.36327657391346835, "grad_norm": 0.4826263189315796, "learning_rate": 4.5858457449636554e-05, "loss": 0.084, "num_input_tokens_seen": 9653760, "step": 7435 }, { "epoch": 0.36352087557716267, "grad_norm": 0.2338653802871704, "learning_rate": 4.5853043930149574e-05, "loss": 0.0791, "num_input_tokens_seen": 9660480, "step": 7440 }, { "epoch": 0.363765177240857, "grad_norm": 0.20244251191616058, "learning_rate": 4.584762719486117e-05, "loss": 0.0907, "num_input_tokens_seen": 9666752, "step": 7445 }, { "epoch": 0.3640094789045513, "grad_norm": 0.14804324507713318, "learning_rate": 4.584220724460665e-05, "loss": 0.0918, "num_input_tokens_seen": 9673312, "step": 7450 }, { "epoch": 0.36425378056824564, "grad_norm": 0.4955236315727234, "learning_rate": 4.5836784080221865e-05, "loss": 0.0912, "num_input_tokens_seen": 9679872, "step": 7455 }, { "epoch": 0.36449808223194, "grad_norm": 0.3991062343120575, "learning_rate": 4.583135770254312e-05, "loss": 0.0994, "num_input_tokens_seen": 9686560, "step": 7460 }, { "epoch": 0.36474238389563435, "grad_norm": 0.16122642159461975, "learning_rate": 4.5825928112407236e-05, "loss": 0.0876, "num_input_tokens_seen": 9693056, "step": 7465 }, { "epoch": 0.36498668555932867, "grad_norm": 0.28037822246551514, "learning_rate": 4.582049531065152e-05, "loss": 0.1022, "num_input_tokens_seen": 9699264, "step": 7470 }, { "epoch": 0.365230987223023, "grad_norm": 0.38030681014060974, "learning_rate": 4.5815059298113783e-05, "loss": 0.092, "num_input_tokens_seen": 9705472, "step": 7475 }, { "epoch": 0.3654752888867173, "grad_norm": 0.37383216619491577, "learning_rate": 4.580962007563232e-05, "loss": 0.075, "num_input_tokens_seen": 9712672, "step": 7480 }, { "epoch": 0.36571959055041164, "grad_norm": 0.2224600464105606, "learning_rate": 4.5804177644045935e-05, "loss": 0.1087, "num_input_tokens_seen": 9719200, "step": 7485 }, { "epoch": 0.36596389221410597, "grad_norm": 0.22474658489227295, "learning_rate": 4.579873200419391e-05, "loss": 0.0885, "num_input_tokens_seen": 9725696, "step": 7490 }, { "epoch": 0.3662081938778003, "grad_norm": 0.32164913415908813, "learning_rate": 4.5793283156916046e-05, "loss": 0.1113, "num_input_tokens_seen": 9732064, "step": 7495 }, { "epoch": 0.3664524955414946, "grad_norm": 0.46172916889190674, "learning_rate": 4.578783110305261e-05, "loss": 0.1166, "num_input_tokens_seen": 9738432, "step": 7500 }, { "epoch": 0.36669679720518894, "grad_norm": 0.8490980267524719, "learning_rate": 4.578237584344438e-05, "loss": 0.1027, "num_input_tokens_seen": 9744832, "step": 7505 }, { "epoch": 0.3669410988688833, "grad_norm": 0.4996781051158905, "learning_rate": 4.577691737893263e-05, "loss": 0.1055, "num_input_tokens_seen": 9751616, "step": 7510 }, { "epoch": 0.36718540053257764, "grad_norm": 0.17678190767765045, "learning_rate": 4.577145571035912e-05, "loss": 0.0769, "num_input_tokens_seen": 9758176, "step": 7515 }, { "epoch": 0.36742970219627197, "grad_norm": 0.8000085353851318, "learning_rate": 4.576599083856611e-05, "loss": 0.1116, "num_input_tokens_seen": 9764672, "step": 7520 }, { "epoch": 0.3676740038599663, "grad_norm": 0.20434880256652832, "learning_rate": 4.576052276439635e-05, "loss": 0.1085, "num_input_tokens_seen": 9770880, "step": 7525 }, { "epoch": 0.3679183055236606, "grad_norm": 0.446663498878479, "learning_rate": 4.575505148869308e-05, "loss": 0.0834, "num_input_tokens_seen": 9777152, "step": 7530 }, { "epoch": 0.36816260718735494, "grad_norm": 0.1923208236694336, "learning_rate": 4.574957701230006e-05, "loss": 0.0922, "num_input_tokens_seen": 9783680, "step": 7535 }, { "epoch": 0.36840690885104926, "grad_norm": 0.41546696424484253, "learning_rate": 4.57440993360615e-05, "loss": 0.0954, "num_input_tokens_seen": 9790240, "step": 7540 }, { "epoch": 0.3686512105147436, "grad_norm": 0.19437837600708008, "learning_rate": 4.5738618460822134e-05, "loss": 0.0831, "num_input_tokens_seen": 9796576, "step": 7545 }, { "epoch": 0.3688955121784379, "grad_norm": 0.289424866437912, "learning_rate": 4.573313438742719e-05, "loss": 0.0937, "num_input_tokens_seen": 9802944, "step": 7550 }, { "epoch": 0.3691398138421323, "grad_norm": 0.7993665337562561, "learning_rate": 4.5727647116722374e-05, "loss": 0.0969, "num_input_tokens_seen": 9809760, "step": 7555 }, { "epoch": 0.3693841155058266, "grad_norm": 0.1709360033273697, "learning_rate": 4.5722156649553884e-05, "loss": 0.0756, "num_input_tokens_seen": 9816224, "step": 7560 }, { "epoch": 0.36962841716952094, "grad_norm": 0.1993180811405182, "learning_rate": 4.571666298676843e-05, "loss": 0.0954, "num_input_tokens_seen": 9822400, "step": 7565 }, { "epoch": 0.36987271883321526, "grad_norm": 0.1935500204563141, "learning_rate": 4.571116612921321e-05, "loss": 0.0985, "num_input_tokens_seen": 9829248, "step": 7570 }, { "epoch": 0.3701170204969096, "grad_norm": 0.6968016624450684, "learning_rate": 4.57056660777359e-05, "loss": 0.0906, "num_input_tokens_seen": 9835328, "step": 7575 }, { "epoch": 0.3703613221606039, "grad_norm": 0.5512961149215698, "learning_rate": 4.5700162833184666e-05, "loss": 0.0967, "num_input_tokens_seen": 9841952, "step": 7580 }, { "epoch": 0.37060562382429824, "grad_norm": 0.5486388802528381, "learning_rate": 4.5694656396408195e-05, "loss": 0.0934, "num_input_tokens_seen": 9848576, "step": 7585 }, { "epoch": 0.37084992548799256, "grad_norm": 0.26373687386512756, "learning_rate": 4.5689146768255646e-05, "loss": 0.1107, "num_input_tokens_seen": 9854880, "step": 7590 }, { "epoch": 0.3710942271516869, "grad_norm": 0.758677065372467, "learning_rate": 4.568363394957667e-05, "loss": 0.1065, "num_input_tokens_seen": 9861088, "step": 7595 }, { "epoch": 0.3713385288153812, "grad_norm": 0.4355042576789856, "learning_rate": 4.567811794122141e-05, "loss": 0.0818, "num_input_tokens_seen": 9867648, "step": 7600 }, { "epoch": 0.3713385288153812, "eval_loss": 0.0956866666674614, "eval_runtime": 375.9148, "eval_samples_per_second": 96.791, "eval_steps_per_second": 24.2, "num_input_tokens_seen": 9867648, "step": 7600 }, { "epoch": 0.3715828304790756, "grad_norm": 0.43515321612358093, "learning_rate": 4.56725987440405e-05, "loss": 0.0887, "num_input_tokens_seen": 9873856, "step": 7605 }, { "epoch": 0.3718271321427699, "grad_norm": 0.5253986120223999, "learning_rate": 4.566707635888508e-05, "loss": 0.1262, "num_input_tokens_seen": 9880352, "step": 7610 }, { "epoch": 0.37207143380646424, "grad_norm": 0.512972891330719, "learning_rate": 4.566155078660677e-05, "loss": 0.1229, "num_input_tokens_seen": 9886848, "step": 7615 }, { "epoch": 0.37231573547015856, "grad_norm": 0.7451463341712952, "learning_rate": 4.565602202805768e-05, "loss": 0.1223, "num_input_tokens_seen": 9893408, "step": 7620 }, { "epoch": 0.3725600371338529, "grad_norm": 0.32101404666900635, "learning_rate": 4.56504900840904e-05, "loss": 0.1146, "num_input_tokens_seen": 9899968, "step": 7625 }, { "epoch": 0.3728043387975472, "grad_norm": 0.47586753964424133, "learning_rate": 4.564495495555805e-05, "loss": 0.0973, "num_input_tokens_seen": 9906752, "step": 7630 }, { "epoch": 0.37304864046124153, "grad_norm": 0.15460646152496338, "learning_rate": 4.5639416643314204e-05, "loss": 0.0838, "num_input_tokens_seen": 9913632, "step": 7635 }, { "epoch": 0.37329294212493586, "grad_norm": 0.23528249561786652, "learning_rate": 4.5633875148212946e-05, "loss": 0.1077, "num_input_tokens_seen": 9920128, "step": 7640 }, { "epoch": 0.3735372437886302, "grad_norm": 0.336615651845932, "learning_rate": 4.562833047110883e-05, "loss": 0.0874, "num_input_tokens_seen": 9926496, "step": 7645 }, { "epoch": 0.3737815454523245, "grad_norm": 0.18891341984272003, "learning_rate": 4.5622782612856923e-05, "loss": 0.0778, "num_input_tokens_seen": 9932960, "step": 7650 }, { "epoch": 0.3740258471160189, "grad_norm": 0.3808456361293793, "learning_rate": 4.561723157431278e-05, "loss": 0.0947, "num_input_tokens_seen": 9939424, "step": 7655 }, { "epoch": 0.3742701487797132, "grad_norm": 0.15083634853363037, "learning_rate": 4.5611677356332435e-05, "loss": 0.0874, "num_input_tokens_seen": 9946400, "step": 7660 }, { "epoch": 0.37451445044340753, "grad_norm": 0.2464376986026764, "learning_rate": 4.560611995977242e-05, "loss": 0.1119, "num_input_tokens_seen": 9952736, "step": 7665 }, { "epoch": 0.37475875210710186, "grad_norm": 0.47794270515441895, "learning_rate": 4.560055938548975e-05, "loss": 0.0791, "num_input_tokens_seen": 9958816, "step": 7670 }, { "epoch": 0.3750030537707962, "grad_norm": 0.37807539105415344, "learning_rate": 4.5594995634341944e-05, "loss": 0.045, "num_input_tokens_seen": 9965504, "step": 7675 }, { "epoch": 0.3752473554344905, "grad_norm": 0.35169315338134766, "learning_rate": 4.5589428707187e-05, "loss": 0.1053, "num_input_tokens_seen": 9971680, "step": 7680 }, { "epoch": 0.37549165709818483, "grad_norm": 0.24330246448516846, "learning_rate": 4.55838586048834e-05, "loss": 0.1056, "num_input_tokens_seen": 9978080, "step": 7685 }, { "epoch": 0.37573595876187915, "grad_norm": 0.1670188158750534, "learning_rate": 4.557828532829013e-05, "loss": 0.0788, "num_input_tokens_seen": 9984672, "step": 7690 }, { "epoch": 0.3759802604255735, "grad_norm": 0.43509650230407715, "learning_rate": 4.557270887826667e-05, "loss": 0.0805, "num_input_tokens_seen": 9991136, "step": 7695 }, { "epoch": 0.37622456208926786, "grad_norm": 0.2142462134361267, "learning_rate": 4.556712925567296e-05, "loss": 0.0791, "num_input_tokens_seen": 9997632, "step": 7700 }, { "epoch": 0.3764688637529622, "grad_norm": 0.2139381319284439, "learning_rate": 4.5561546461369454e-05, "loss": 0.1181, "num_input_tokens_seen": 10004544, "step": 7705 }, { "epoch": 0.3767131654166565, "grad_norm": 0.3362012207508087, "learning_rate": 4.55559604962171e-05, "loss": 0.0911, "num_input_tokens_seen": 10011200, "step": 7710 }, { "epoch": 0.3769574670803508, "grad_norm": 0.4361678659915924, "learning_rate": 4.55503713610773e-05, "loss": 0.1149, "num_input_tokens_seen": 10017504, "step": 7715 }, { "epoch": 0.37720176874404515, "grad_norm": 0.31618303060531616, "learning_rate": 4.5544779056812e-05, "loss": 0.0892, "num_input_tokens_seen": 10023456, "step": 7720 }, { "epoch": 0.3774460704077395, "grad_norm": 0.5145931839942932, "learning_rate": 4.553918358428358e-05, "loss": 0.1078, "num_input_tokens_seen": 10030464, "step": 7725 }, { "epoch": 0.3776903720714338, "grad_norm": 0.22819802165031433, "learning_rate": 4.553358494435494e-05, "loss": 0.0541, "num_input_tokens_seen": 10037120, "step": 7730 }, { "epoch": 0.3779346737351281, "grad_norm": 0.233259379863739, "learning_rate": 4.5527983137889464e-05, "loss": 0.0719, "num_input_tokens_seen": 10043520, "step": 7735 }, { "epoch": 0.37817897539882245, "grad_norm": 0.38800638914108276, "learning_rate": 4.5522378165751015e-05, "loss": 0.0794, "num_input_tokens_seen": 10049728, "step": 7740 }, { "epoch": 0.37842327706251677, "grad_norm": 0.5711126327514648, "learning_rate": 4.5516770028803954e-05, "loss": 0.0941, "num_input_tokens_seen": 10056064, "step": 7745 }, { "epoch": 0.37866757872621115, "grad_norm": 0.9922067523002625, "learning_rate": 4.5511158727913116e-05, "loss": 0.0779, "num_input_tokens_seen": 10062464, "step": 7750 }, { "epoch": 0.3789118803899055, "grad_norm": 0.4866816997528076, "learning_rate": 4.5505544263943856e-05, "loss": 0.0974, "num_input_tokens_seen": 10068896, "step": 7755 }, { "epoch": 0.3791561820535998, "grad_norm": 0.35246092081069946, "learning_rate": 4.549992663776197e-05, "loss": 0.0796, "num_input_tokens_seen": 10075872, "step": 7760 }, { "epoch": 0.3794004837172941, "grad_norm": 0.37047138810157776, "learning_rate": 4.5494305850233786e-05, "loss": 0.0595, "num_input_tokens_seen": 10083104, "step": 7765 }, { "epoch": 0.37964478538098845, "grad_norm": 0.5479318499565125, "learning_rate": 4.5488681902226094e-05, "loss": 0.0846, "num_input_tokens_seen": 10089408, "step": 7770 }, { "epoch": 0.37988908704468277, "grad_norm": 0.225518599152565, "learning_rate": 4.5483054794606174e-05, "loss": 0.0842, "num_input_tokens_seen": 10095488, "step": 7775 }, { "epoch": 0.3801333887083771, "grad_norm": 0.6675151586532593, "learning_rate": 4.547742452824179e-05, "loss": 0.1096, "num_input_tokens_seen": 10101856, "step": 7780 }, { "epoch": 0.3803776903720714, "grad_norm": 0.9586536884307861, "learning_rate": 4.5471791104001215e-05, "loss": 0.0972, "num_input_tokens_seen": 10108320, "step": 7785 }, { "epoch": 0.38062199203576574, "grad_norm": 0.21296024322509766, "learning_rate": 4.546615452275319e-05, "loss": 0.0872, "num_input_tokens_seen": 10114496, "step": 7790 }, { "epoch": 0.38086629369946007, "grad_norm": 0.2058820128440857, "learning_rate": 4.5460514785366944e-05, "loss": 0.0943, "num_input_tokens_seen": 10120768, "step": 7795 }, { "epoch": 0.38111059536315445, "grad_norm": 0.9199239015579224, "learning_rate": 4.545487189271219e-05, "loss": 0.0938, "num_input_tokens_seen": 10127328, "step": 7800 }, { "epoch": 0.38111059536315445, "eval_loss": 0.09502172470092773, "eval_runtime": 375.5233, "eval_samples_per_second": 96.891, "eval_steps_per_second": 24.225, "num_input_tokens_seen": 10127328, "step": 7800 }, { "epoch": 0.38135489702684877, "grad_norm": 0.32027003169059753, "learning_rate": 4.544922584565914e-05, "loss": 0.1085, "num_input_tokens_seen": 10133760, "step": 7805 }, { "epoch": 0.3815991986905431, "grad_norm": 0.91648268699646, "learning_rate": 4.544357664507848e-05, "loss": 0.0851, "num_input_tokens_seen": 10140480, "step": 7810 }, { "epoch": 0.3818435003542374, "grad_norm": 0.16249676048755646, "learning_rate": 4.54379242918414e-05, "loss": 0.0809, "num_input_tokens_seen": 10147264, "step": 7815 }, { "epoch": 0.38208780201793174, "grad_norm": 0.5144299268722534, "learning_rate": 4.543226878681955e-05, "loss": 0.1105, "num_input_tokens_seen": 10153728, "step": 7820 }, { "epoch": 0.38233210368162607, "grad_norm": 0.2848314940929413, "learning_rate": 4.5426610130885087e-05, "loss": 0.0929, "num_input_tokens_seen": 10160288, "step": 7825 }, { "epoch": 0.3825764053453204, "grad_norm": 0.21551813185214996, "learning_rate": 4.542094832491064e-05, "loss": 0.1033, "num_input_tokens_seen": 10166848, "step": 7830 }, { "epoch": 0.3828207070090147, "grad_norm": 0.4925219714641571, "learning_rate": 4.541528336976934e-05, "loss": 0.0838, "num_input_tokens_seen": 10173632, "step": 7835 }, { "epoch": 0.38306500867270904, "grad_norm": 0.5760303139686584, "learning_rate": 4.540961526633479e-05, "loss": 0.1071, "num_input_tokens_seen": 10180096, "step": 7840 }, { "epoch": 0.38330931033640336, "grad_norm": 0.17418669164180756, "learning_rate": 4.540394401548108e-05, "loss": 0.0974, "num_input_tokens_seen": 10186528, "step": 7845 }, { "epoch": 0.38355361200009774, "grad_norm": 0.3773221969604492, "learning_rate": 4.539826961808279e-05, "loss": 0.0825, "num_input_tokens_seen": 10192992, "step": 7850 }, { "epoch": 0.38379791366379207, "grad_norm": 0.25431889295578003, "learning_rate": 4.5392592075014994e-05, "loss": 0.0949, "num_input_tokens_seen": 10198784, "step": 7855 }, { "epoch": 0.3840422153274864, "grad_norm": 0.19750288128852844, "learning_rate": 4.538691138715322e-05, "loss": 0.091, "num_input_tokens_seen": 10205344, "step": 7860 }, { "epoch": 0.3842865169911807, "grad_norm": 0.5481963157653809, "learning_rate": 4.5381227555373516e-05, "loss": 0.149, "num_input_tokens_seen": 10211840, "step": 7865 }, { "epoch": 0.38453081865487504, "grad_norm": 0.2319909632205963, "learning_rate": 4.537554058055239e-05, "loss": 0.0917, "num_input_tokens_seen": 10218496, "step": 7870 }, { "epoch": 0.38477512031856936, "grad_norm": 0.3848058879375458, "learning_rate": 4.5369850463566865e-05, "loss": 0.0804, "num_input_tokens_seen": 10225248, "step": 7875 }, { "epoch": 0.3850194219822637, "grad_norm": 0.4474436342716217, "learning_rate": 4.5364157205294404e-05, "loss": 0.122, "num_input_tokens_seen": 10231808, "step": 7880 }, { "epoch": 0.385263723645958, "grad_norm": 0.12610602378845215, "learning_rate": 4.5358460806612996e-05, "loss": 0.0667, "num_input_tokens_seen": 10238176, "step": 7885 }, { "epoch": 0.38550802530965234, "grad_norm": 0.4740665555000305, "learning_rate": 4.535276126840109e-05, "loss": 0.115, "num_input_tokens_seen": 10244704, "step": 7890 }, { "epoch": 0.3857523269733467, "grad_norm": 0.523306131362915, "learning_rate": 4.5347058591537626e-05, "loss": 0.1198, "num_input_tokens_seen": 10250688, "step": 7895 }, { "epoch": 0.38599662863704104, "grad_norm": 0.7374188899993896, "learning_rate": 4.534135277690203e-05, "loss": 0.1066, "num_input_tokens_seen": 10257088, "step": 7900 }, { "epoch": 0.38624093030073536, "grad_norm": 0.18992283940315247, "learning_rate": 4.533564382537421e-05, "loss": 0.1131, "num_input_tokens_seen": 10263488, "step": 7905 }, { "epoch": 0.3864852319644297, "grad_norm": 0.4271664321422577, "learning_rate": 4.532993173783456e-05, "loss": 0.0928, "num_input_tokens_seen": 10269984, "step": 7910 }, { "epoch": 0.386729533628124, "grad_norm": 0.3246254026889801, "learning_rate": 4.5324216515163954e-05, "loss": 0.0904, "num_input_tokens_seen": 10276576, "step": 7915 }, { "epoch": 0.38697383529181834, "grad_norm": 0.40088534355163574, "learning_rate": 4.531849815824375e-05, "loss": 0.0756, "num_input_tokens_seen": 10282656, "step": 7920 }, { "epoch": 0.38721813695551266, "grad_norm": 0.25249263644218445, "learning_rate": 4.5312776667955795e-05, "loss": 0.0946, "num_input_tokens_seen": 10288992, "step": 7925 }, { "epoch": 0.387462438619207, "grad_norm": 0.3541928827762604, "learning_rate": 4.5307052045182405e-05, "loss": 0.0866, "num_input_tokens_seen": 10295232, "step": 7930 }, { "epoch": 0.3877067402829013, "grad_norm": 0.23854270577430725, "learning_rate": 4.53013242908064e-05, "loss": 0.0931, "num_input_tokens_seen": 10301920, "step": 7935 }, { "epoch": 0.38795104194659563, "grad_norm": 0.6034083962440491, "learning_rate": 4.529559340571107e-05, "loss": 0.1049, "num_input_tokens_seen": 10307712, "step": 7940 }, { "epoch": 0.38819534361029, "grad_norm": 0.5347896218299866, "learning_rate": 4.528985939078018e-05, "loss": 0.099, "num_input_tokens_seen": 10314112, "step": 7945 }, { "epoch": 0.38843964527398434, "grad_norm": 0.2671723961830139, "learning_rate": 4.5284122246898e-05, "loss": 0.0834, "num_input_tokens_seen": 10320608, "step": 7950 }, { "epoch": 0.38868394693767866, "grad_norm": 0.4726446568965912, "learning_rate": 4.527838197494926e-05, "loss": 0.0871, "num_input_tokens_seen": 10326848, "step": 7955 }, { "epoch": 0.388928248601373, "grad_norm": 0.21795466542243958, "learning_rate": 4.527263857581918e-05, "loss": 0.0951, "num_input_tokens_seen": 10332864, "step": 7960 }, { "epoch": 0.3891725502650673, "grad_norm": 0.20819896459579468, "learning_rate": 4.526689205039347e-05, "loss": 0.114, "num_input_tokens_seen": 10339008, "step": 7965 }, { "epoch": 0.38941685192876163, "grad_norm": 0.3122056722640991, "learning_rate": 4.5261142399558324e-05, "loss": 0.0835, "num_input_tokens_seen": 10345504, "step": 7970 }, { "epoch": 0.38966115359245596, "grad_norm": 0.2433474212884903, "learning_rate": 4.525538962420041e-05, "loss": 0.1001, "num_input_tokens_seen": 10352064, "step": 7975 }, { "epoch": 0.3899054552561503, "grad_norm": 0.2426920235157013, "learning_rate": 4.524963372520685e-05, "loss": 0.1174, "num_input_tokens_seen": 10358496, "step": 7980 }, { "epoch": 0.3901497569198446, "grad_norm": 0.17681962251663208, "learning_rate": 4.524387470346531e-05, "loss": 0.0842, "num_input_tokens_seen": 10364864, "step": 7985 }, { "epoch": 0.3903940585835389, "grad_norm": 0.21558716893196106, "learning_rate": 4.5238112559863885e-05, "loss": 0.0774, "num_input_tokens_seen": 10371264, "step": 7990 }, { "epoch": 0.3906383602472333, "grad_norm": 0.6677268147468567, "learning_rate": 4.5232347295291175e-05, "loss": 0.1025, "num_input_tokens_seen": 10377568, "step": 7995 }, { "epoch": 0.39088266191092763, "grad_norm": 0.39364394545555115, "learning_rate": 4.522657891063626e-05, "loss": 0.0798, "num_input_tokens_seen": 10383808, "step": 8000 }, { "epoch": 0.39088266191092763, "eval_loss": 0.09487472474575043, "eval_runtime": 374.5461, "eval_samples_per_second": 97.144, "eval_steps_per_second": 24.288, "num_input_tokens_seen": 10383808, "step": 8000 }, { "epoch": 0.39112696357462196, "grad_norm": 0.24965660274028778, "learning_rate": 4.52208074067887e-05, "loss": 0.1189, "num_input_tokens_seen": 10390048, "step": 8005 }, { "epoch": 0.3913712652383163, "grad_norm": 0.7718707323074341, "learning_rate": 4.5215032784638516e-05, "loss": 0.0885, "num_input_tokens_seen": 10397248, "step": 8010 }, { "epoch": 0.3916155669020106, "grad_norm": 0.4689159393310547, "learning_rate": 4.5209255045076245e-05, "loss": 0.0906, "num_input_tokens_seen": 10404224, "step": 8015 }, { "epoch": 0.3918598685657049, "grad_norm": 0.40622955560684204, "learning_rate": 4.5203474188992875e-05, "loss": 0.1094, "num_input_tokens_seen": 10410272, "step": 8020 }, { "epoch": 0.39210417022939925, "grad_norm": 0.24850063025951385, "learning_rate": 4.51976902172799e-05, "loss": 0.0953, "num_input_tokens_seen": 10416320, "step": 8025 }, { "epoch": 0.3923484718930936, "grad_norm": 0.20960386097431183, "learning_rate": 4.519190313082927e-05, "loss": 0.0939, "num_input_tokens_seen": 10422912, "step": 8030 }, { "epoch": 0.3925927735567879, "grad_norm": 0.20571839809417725, "learning_rate": 4.518611293053343e-05, "loss": 0.1002, "num_input_tokens_seen": 10429664, "step": 8035 }, { "epoch": 0.3928370752204823, "grad_norm": 0.4781019389629364, "learning_rate": 4.51803196172853e-05, "loss": 0.0893, "num_input_tokens_seen": 10436224, "step": 8040 }, { "epoch": 0.3930813768841766, "grad_norm": 0.4978799819946289, "learning_rate": 4.517452319197828e-05, "loss": 0.0978, "num_input_tokens_seen": 10443168, "step": 8045 }, { "epoch": 0.3933256785478709, "grad_norm": 0.33419954776763916, "learning_rate": 4.5168723655506265e-05, "loss": 0.0714, "num_input_tokens_seen": 10449600, "step": 8050 }, { "epoch": 0.39356998021156525, "grad_norm": 0.1512090116739273, "learning_rate": 4.51629210087636e-05, "loss": 0.0747, "num_input_tokens_seen": 10455968, "step": 8055 }, { "epoch": 0.3938142818752596, "grad_norm": 0.43367308378219604, "learning_rate": 4.515711525264513e-05, "loss": 0.0643, "num_input_tokens_seen": 10462592, "step": 8060 }, { "epoch": 0.3940585835389539, "grad_norm": 0.3762434422969818, "learning_rate": 4.5151306388046175e-05, "loss": 0.1004, "num_input_tokens_seen": 10469024, "step": 8065 }, { "epoch": 0.3943028852026482, "grad_norm": 0.46777835488319397, "learning_rate": 4.514549441586255e-05, "loss": 0.0925, "num_input_tokens_seen": 10476320, "step": 8070 }, { "epoch": 0.39454718686634255, "grad_norm": 0.20981109142303467, "learning_rate": 4.513967933699051e-05, "loss": 0.0938, "num_input_tokens_seen": 10482976, "step": 8075 }, { "epoch": 0.39479148853003687, "grad_norm": 0.1886710673570633, "learning_rate": 4.513386115232684e-05, "loss": 0.0945, "num_input_tokens_seen": 10489632, "step": 8080 }, { "epoch": 0.3950357901937312, "grad_norm": 0.1378287822008133, "learning_rate": 4.5128039862768745e-05, "loss": 0.0654, "num_input_tokens_seen": 10496000, "step": 8085 }, { "epoch": 0.3952800918574256, "grad_norm": 0.29286158084869385, "learning_rate": 4.512221546921397e-05, "loss": 0.1078, "num_input_tokens_seen": 10502208, "step": 8090 }, { "epoch": 0.3955243935211199, "grad_norm": 0.10389220714569092, "learning_rate": 4.5116387972560694e-05, "loss": 0.0801, "num_input_tokens_seen": 10508544, "step": 8095 }, { "epoch": 0.3957686951848142, "grad_norm": 0.14186911284923553, "learning_rate": 4.511055737370759e-05, "loss": 0.0755, "num_input_tokens_seen": 10514752, "step": 8100 }, { "epoch": 0.39601299684850855, "grad_norm": 0.301164835691452, "learning_rate": 4.510472367355383e-05, "loss": 0.1312, "num_input_tokens_seen": 10520992, "step": 8105 }, { "epoch": 0.39625729851220287, "grad_norm": 0.18470588326454163, "learning_rate": 4.509888687299901e-05, "loss": 0.1032, "num_input_tokens_seen": 10527456, "step": 8110 }, { "epoch": 0.3965016001758972, "grad_norm": 0.5381827354431152, "learning_rate": 4.5093046972943266e-05, "loss": 0.0818, "num_input_tokens_seen": 10534080, "step": 8115 }, { "epoch": 0.3967459018395915, "grad_norm": 0.254205584526062, "learning_rate": 4.508720397428717e-05, "loss": 0.0974, "num_input_tokens_seen": 10540576, "step": 8120 }, { "epoch": 0.39699020350328584, "grad_norm": 0.25433966517448425, "learning_rate": 4.508135787793178e-05, "loss": 0.0877, "num_input_tokens_seen": 10547392, "step": 8125 }, { "epoch": 0.39723450516698017, "grad_norm": 0.17843613028526306, "learning_rate": 4.5075508684778664e-05, "loss": 0.0828, "num_input_tokens_seen": 10553632, "step": 8130 }, { "epoch": 0.3974788068306745, "grad_norm": 0.21498489379882812, "learning_rate": 4.506965639572982e-05, "loss": 0.0914, "num_input_tokens_seen": 10560064, "step": 8135 }, { "epoch": 0.39772310849436887, "grad_norm": 0.14336995780467987, "learning_rate": 4.506380101168774e-05, "loss": 0.0915, "num_input_tokens_seen": 10566944, "step": 8140 }, { "epoch": 0.3979674101580632, "grad_norm": 0.44079843163490295, "learning_rate": 4.505794253355542e-05, "loss": 0.1038, "num_input_tokens_seen": 10572992, "step": 8145 }, { "epoch": 0.3982117118217575, "grad_norm": 0.3894024193286896, "learning_rate": 4.5052080962236286e-05, "loss": 0.0936, "num_input_tokens_seen": 10579104, "step": 8150 }, { "epoch": 0.39845601348545184, "grad_norm": 0.5725772976875305, "learning_rate": 4.504621629863428e-05, "loss": 0.0772, "num_input_tokens_seen": 10585248, "step": 8155 }, { "epoch": 0.39870031514914617, "grad_norm": 0.23913495242595673, "learning_rate": 4.504034854365381e-05, "loss": 0.0953, "num_input_tokens_seen": 10592256, "step": 8160 }, { "epoch": 0.3989446168128405, "grad_norm": 0.2508503496646881, "learning_rate": 4.503447769819974e-05, "loss": 0.1014, "num_input_tokens_seen": 10598720, "step": 8165 }, { "epoch": 0.3991889184765348, "grad_norm": 0.5255420207977295, "learning_rate": 4.502860376317745e-05, "loss": 0.1169, "num_input_tokens_seen": 10605856, "step": 8170 }, { "epoch": 0.39943322014022914, "grad_norm": 0.17378942668437958, "learning_rate": 4.502272673949276e-05, "loss": 0.0869, "num_input_tokens_seen": 10612256, "step": 8175 }, { "epoch": 0.39967752180392346, "grad_norm": 0.644054114818573, "learning_rate": 4.501684662805199e-05, "loss": 0.1153, "num_input_tokens_seen": 10618144, "step": 8180 }, { "epoch": 0.3999218234676178, "grad_norm": 0.17658524215221405, "learning_rate": 4.5010963429761924e-05, "loss": 0.0919, "num_input_tokens_seen": 10624192, "step": 8185 }, { "epoch": 0.40016612513131217, "grad_norm": 0.45344191789627075, "learning_rate": 4.500507714552982e-05, "loss": 0.0671, "num_input_tokens_seen": 10630816, "step": 8190 }, { "epoch": 0.4004104267950065, "grad_norm": 0.28513798117637634, "learning_rate": 4.499918777626342e-05, "loss": 0.0958, "num_input_tokens_seen": 10637088, "step": 8195 }, { "epoch": 0.4006547284587008, "grad_norm": 0.555738091468811, "learning_rate": 4.499329532287093e-05, "loss": 0.0917, "num_input_tokens_seen": 10643424, "step": 8200 }, { "epoch": 0.4006547284587008, "eval_loss": 0.09489544481039047, "eval_runtime": 374.104, "eval_samples_per_second": 97.259, "eval_steps_per_second": 24.317, "num_input_tokens_seen": 10643424, "step": 8200 }, { "epoch": 0.40089903012239514, "grad_norm": 0.48781919479370117, "learning_rate": 4.4987399786261064e-05, "loss": 0.0874, "num_input_tokens_seen": 10650144, "step": 8205 }, { "epoch": 0.40114333178608946, "grad_norm": 0.31811556220054626, "learning_rate": 4.498150116734297e-05, "loss": 0.0893, "num_input_tokens_seen": 10656544, "step": 8210 }, { "epoch": 0.4013876334497838, "grad_norm": 0.30050212144851685, "learning_rate": 4.4975599467026294e-05, "loss": 0.0825, "num_input_tokens_seen": 10663040, "step": 8215 }, { "epoch": 0.4016319351134781, "grad_norm": 0.17791488766670227, "learning_rate": 4.496969468622114e-05, "loss": 0.1006, "num_input_tokens_seen": 10669280, "step": 8220 }, { "epoch": 0.40187623677717244, "grad_norm": 0.473021924495697, "learning_rate": 4.496378682583813e-05, "loss": 0.0905, "num_input_tokens_seen": 10675552, "step": 8225 }, { "epoch": 0.40212053844086676, "grad_norm": 0.1508665382862091, "learning_rate": 4.495787588678829e-05, "loss": 0.1447, "num_input_tokens_seen": 10681792, "step": 8230 }, { "epoch": 0.40236484010456114, "grad_norm": 0.23653998970985413, "learning_rate": 4.4951961869983196e-05, "loss": 0.0842, "num_input_tokens_seen": 10688320, "step": 8235 }, { "epoch": 0.40260914176825546, "grad_norm": 0.20208436250686646, "learning_rate": 4.494604477633485e-05, "loss": 0.0937, "num_input_tokens_seen": 10694112, "step": 8240 }, { "epoch": 0.4028534434319498, "grad_norm": 0.2553683817386627, "learning_rate": 4.4940124606755734e-05, "loss": 0.0913, "num_input_tokens_seen": 10700288, "step": 8245 }, { "epoch": 0.4030977450956441, "grad_norm": 0.3951612710952759, "learning_rate": 4.493420136215882e-05, "loss": 0.0659, "num_input_tokens_seen": 10706720, "step": 8250 }, { "epoch": 0.40334204675933844, "grad_norm": 0.8150582909584045, "learning_rate": 4.492827504345756e-05, "loss": 0.095, "num_input_tokens_seen": 10713440, "step": 8255 }, { "epoch": 0.40358634842303276, "grad_norm": 0.5856761336326599, "learning_rate": 4.492234565156584e-05, "loss": 0.1197, "num_input_tokens_seen": 10720128, "step": 8260 }, { "epoch": 0.4038306500867271, "grad_norm": 0.4472034275531769, "learning_rate": 4.491641318739807e-05, "loss": 0.056, "num_input_tokens_seen": 10726272, "step": 8265 }, { "epoch": 0.4040749517504214, "grad_norm": 0.5010238885879517, "learning_rate": 4.4910477651869096e-05, "loss": 0.0841, "num_input_tokens_seen": 10732192, "step": 8270 }, { "epoch": 0.40431925341411573, "grad_norm": 0.3190794587135315, "learning_rate": 4.4904539045894254e-05, "loss": 0.0806, "num_input_tokens_seen": 10738272, "step": 8275 }, { "epoch": 0.40456355507781006, "grad_norm": 0.511170506477356, "learning_rate": 4.4898597370389364e-05, "loss": 0.0954, "num_input_tokens_seen": 10745088, "step": 8280 }, { "epoch": 0.40480785674150443, "grad_norm": 0.2968136966228485, "learning_rate": 4.489265262627069e-05, "loss": 0.1016, "num_input_tokens_seen": 10751456, "step": 8285 }, { "epoch": 0.40505215840519876, "grad_norm": 0.26138070225715637, "learning_rate": 4.488670481445499e-05, "loss": 0.0951, "num_input_tokens_seen": 10758016, "step": 8290 }, { "epoch": 0.4052964600688931, "grad_norm": 0.3540462851524353, "learning_rate": 4.488075393585951e-05, "loss": 0.0998, "num_input_tokens_seen": 10764064, "step": 8295 }, { "epoch": 0.4055407617325874, "grad_norm": 0.5575839877128601, "learning_rate": 4.487479999140193e-05, "loss": 0.1268, "num_input_tokens_seen": 10770400, "step": 8300 }, { "epoch": 0.40578506339628173, "grad_norm": 0.15888452529907227, "learning_rate": 4.4868842982000425e-05, "loss": 0.0877, "num_input_tokens_seen": 10777312, "step": 8305 }, { "epoch": 0.40602936505997606, "grad_norm": 0.2508857548236847, "learning_rate": 4.486288290857365e-05, "loss": 0.0901, "num_input_tokens_seen": 10783840, "step": 8310 }, { "epoch": 0.4062736667236704, "grad_norm": 0.24971851706504822, "learning_rate": 4.4856919772040715e-05, "loss": 0.1063, "num_input_tokens_seen": 10790272, "step": 8315 }, { "epoch": 0.4065179683873647, "grad_norm": 0.47376853227615356, "learning_rate": 4.485095357332122e-05, "loss": 0.1222, "num_input_tokens_seen": 10796320, "step": 8320 }, { "epoch": 0.406762270051059, "grad_norm": 0.17590326070785522, "learning_rate": 4.484498431333521e-05, "loss": 0.0709, "num_input_tokens_seen": 10803072, "step": 8325 }, { "epoch": 0.40700657171475335, "grad_norm": 0.2635164260864258, "learning_rate": 4.4839011993003245e-05, "loss": 0.0939, "num_input_tokens_seen": 10809856, "step": 8330 }, { "epoch": 0.40725087337844773, "grad_norm": 0.42943137884140015, "learning_rate": 4.4833036613246305e-05, "loss": 0.0801, "num_input_tokens_seen": 10816064, "step": 8335 }, { "epoch": 0.40749517504214205, "grad_norm": 0.4809555411338806, "learning_rate": 4.482705817498589e-05, "loss": 0.0864, "num_input_tokens_seen": 10822976, "step": 8340 }, { "epoch": 0.4077394767058364, "grad_norm": 0.26542729139328003, "learning_rate": 4.4821076679143934e-05, "loss": 0.087, "num_input_tokens_seen": 10829312, "step": 8345 }, { "epoch": 0.4079837783695307, "grad_norm": 0.1757824420928955, "learning_rate": 4.481509212664288e-05, "loss": 0.0821, "num_input_tokens_seen": 10835584, "step": 8350 }, { "epoch": 0.408228080033225, "grad_norm": 0.17827895283699036, "learning_rate": 4.480910451840559e-05, "loss": 0.0811, "num_input_tokens_seen": 10842272, "step": 8355 }, { "epoch": 0.40847238169691935, "grad_norm": 0.20405225455760956, "learning_rate": 4.480311385535546e-05, "loss": 0.0982, "num_input_tokens_seen": 10848576, "step": 8360 }, { "epoch": 0.4087166833606137, "grad_norm": 0.42536839842796326, "learning_rate": 4.47971201384163e-05, "loss": 0.1109, "num_input_tokens_seen": 10856064, "step": 8365 }, { "epoch": 0.408960985024308, "grad_norm": 0.26540231704711914, "learning_rate": 4.4791123368512446e-05, "loss": 0.0751, "num_input_tokens_seen": 10863008, "step": 8370 }, { "epoch": 0.4092052866880023, "grad_norm": 0.4590410590171814, "learning_rate": 4.478512354656864e-05, "loss": 0.0755, "num_input_tokens_seen": 10869568, "step": 8375 }, { "epoch": 0.4094495883516967, "grad_norm": 0.16194167733192444, "learning_rate": 4.477912067351016e-05, "loss": 0.0839, "num_input_tokens_seen": 10876064, "step": 8380 }, { "epoch": 0.409693890015391, "grad_norm": 0.13731703162193298, "learning_rate": 4.477311475026271e-05, "loss": 0.0557, "num_input_tokens_seen": 10882656, "step": 8385 }, { "epoch": 0.40993819167908535, "grad_norm": 0.40008601546287537, "learning_rate": 4.476710577775248e-05, "loss": 0.0901, "num_input_tokens_seen": 10888832, "step": 8390 }, { "epoch": 0.4101824933427797, "grad_norm": 0.7918853163719177, "learning_rate": 4.476109375690612e-05, "loss": 0.0831, "num_input_tokens_seen": 10895328, "step": 8395 }, { "epoch": 0.410426795006474, "grad_norm": 0.19502566754817963, "learning_rate": 4.4755078688650784e-05, "loss": 0.1053, "num_input_tokens_seen": 10901760, "step": 8400 }, { "epoch": 0.410426795006474, "eval_loss": 0.09504133462905884, "eval_runtime": 375.1116, "eval_samples_per_second": 96.998, "eval_steps_per_second": 24.251, "num_input_tokens_seen": 10901760, "step": 8400 }, { "epoch": 0.4106710966701683, "grad_norm": 0.42181119322776794, "learning_rate": 4.474906057391406e-05, "loss": 0.0964, "num_input_tokens_seen": 10908224, "step": 8405 }, { "epoch": 0.41091539833386265, "grad_norm": 0.645475447177887, "learning_rate": 4.4743039413624e-05, "loss": 0.1086, "num_input_tokens_seen": 10914976, "step": 8410 }, { "epoch": 0.41115969999755697, "grad_norm": 0.3938223421573639, "learning_rate": 4.473701520870916e-05, "loss": 0.0981, "num_input_tokens_seen": 10920896, "step": 8415 }, { "epoch": 0.4114040016612513, "grad_norm": 0.32088422775268555, "learning_rate": 4.4730987960098544e-05, "loss": 0.1044, "num_input_tokens_seen": 10926976, "step": 8420 }, { "epoch": 0.4116483033249456, "grad_norm": 0.19330988824367523, "learning_rate": 4.4724957668721635e-05, "loss": 0.103, "num_input_tokens_seen": 10933568, "step": 8425 }, { "epoch": 0.41189260498864, "grad_norm": 0.3905136287212372, "learning_rate": 4.471892433550836e-05, "loss": 0.0954, "num_input_tokens_seen": 10939808, "step": 8430 }, { "epoch": 0.4121369066523343, "grad_norm": 0.6214794516563416, "learning_rate": 4.471288796138916e-05, "loss": 0.0968, "num_input_tokens_seen": 10946464, "step": 8435 }, { "epoch": 0.41238120831602865, "grad_norm": 0.24621014297008514, "learning_rate": 4.470684854729491e-05, "loss": 0.0885, "num_input_tokens_seen": 10952736, "step": 8440 }, { "epoch": 0.41262550997972297, "grad_norm": 0.8060742616653442, "learning_rate": 4.4700806094156955e-05, "loss": 0.1127, "num_input_tokens_seen": 10958848, "step": 8445 }, { "epoch": 0.4128698116434173, "grad_norm": 0.2607007622718811, "learning_rate": 4.469476060290713e-05, "loss": 0.0802, "num_input_tokens_seen": 10965568, "step": 8450 }, { "epoch": 0.4131141133071116, "grad_norm": 0.2655186951160431, "learning_rate": 4.468871207447772e-05, "loss": 0.0851, "num_input_tokens_seen": 10972160, "step": 8455 }, { "epoch": 0.41335841497080594, "grad_norm": 0.23851212859153748, "learning_rate": 4.4682660509801486e-05, "loss": 0.1038, "num_input_tokens_seen": 10978720, "step": 8460 }, { "epoch": 0.41360271663450027, "grad_norm": 0.22938422858715057, "learning_rate": 4.467660590981165e-05, "loss": 0.0973, "num_input_tokens_seen": 10985440, "step": 8465 }, { "epoch": 0.4138470182981946, "grad_norm": 0.6038106083869934, "learning_rate": 4.467054827544191e-05, "loss": 0.094, "num_input_tokens_seen": 10992320, "step": 8470 }, { "epoch": 0.4140913199618889, "grad_norm": 0.18325065076351166, "learning_rate": 4.4664487607626434e-05, "loss": 0.1084, "num_input_tokens_seen": 10998432, "step": 8475 }, { "epoch": 0.4143356216255833, "grad_norm": 0.18601582944393158, "learning_rate": 4.4658423907299845e-05, "loss": 0.0935, "num_input_tokens_seen": 11004896, "step": 8480 }, { "epoch": 0.4145799232892776, "grad_norm": 0.19416894018650055, "learning_rate": 4.465235717539725e-05, "loss": 0.0749, "num_input_tokens_seen": 11011168, "step": 8485 }, { "epoch": 0.41482422495297194, "grad_norm": 0.4848026931285858, "learning_rate": 4.464628741285421e-05, "loss": 0.0859, "num_input_tokens_seen": 11017472, "step": 8490 }, { "epoch": 0.41506852661666627, "grad_norm": 0.3247193694114685, "learning_rate": 4.4640214620606754e-05, "loss": 0.0922, "num_input_tokens_seen": 11023968, "step": 8495 }, { "epoch": 0.4153128282803606, "grad_norm": 0.17956769466400146, "learning_rate": 4.46341387995914e-05, "loss": 0.1193, "num_input_tokens_seen": 11030784, "step": 8500 }, { "epoch": 0.4155571299440549, "grad_norm": 0.21816162765026093, "learning_rate": 4.4628059950745106e-05, "loss": 0.0937, "num_input_tokens_seen": 11037472, "step": 8505 }, { "epoch": 0.41580143160774924, "grad_norm": 0.6354973316192627, "learning_rate": 4.4621978075005297e-05, "loss": 0.0814, "num_input_tokens_seen": 11043616, "step": 8510 }, { "epoch": 0.41604573327144356, "grad_norm": 0.3477128744125366, "learning_rate": 4.461589317330989e-05, "loss": 0.1241, "num_input_tokens_seen": 11049984, "step": 8515 }, { "epoch": 0.4162900349351379, "grad_norm": 0.5842606425285339, "learning_rate": 4.460980524659724e-05, "loss": 0.1043, "num_input_tokens_seen": 11056768, "step": 8520 }, { "epoch": 0.4165343365988322, "grad_norm": 0.19986127316951752, "learning_rate": 4.46037142958062e-05, "loss": 0.0872, "num_input_tokens_seen": 11062912, "step": 8525 }, { "epoch": 0.4167786382625266, "grad_norm": 0.18039830029010773, "learning_rate": 4.4597620321876046e-05, "loss": 0.0617, "num_input_tokens_seen": 11069856, "step": 8530 }, { "epoch": 0.4170229399262209, "grad_norm": 0.39578309655189514, "learning_rate": 4.459152332574656e-05, "loss": 0.0697, "num_input_tokens_seen": 11076288, "step": 8535 }, { "epoch": 0.41726724158991524, "grad_norm": 0.24819740653038025, "learning_rate": 4.4585423308357985e-05, "loss": 0.1015, "num_input_tokens_seen": 11082624, "step": 8540 }, { "epoch": 0.41751154325360956, "grad_norm": 0.24237766861915588, "learning_rate": 4.457932027065102e-05, "loss": 0.0957, "num_input_tokens_seen": 11088736, "step": 8545 }, { "epoch": 0.4177558449173039, "grad_norm": 0.4632520377635956, "learning_rate": 4.45732142135668e-05, "loss": 0.0848, "num_input_tokens_seen": 11095360, "step": 8550 }, { "epoch": 0.4180001465809982, "grad_norm": 0.17249050736427307, "learning_rate": 4.4567105138046986e-05, "loss": 0.1108, "num_input_tokens_seen": 11101600, "step": 8555 }, { "epoch": 0.41824444824469253, "grad_norm": 0.1874215304851532, "learning_rate": 4.456099304503365e-05, "loss": 0.081, "num_input_tokens_seen": 11107712, "step": 8560 }, { "epoch": 0.41848874990838686, "grad_norm": 0.18339915573596954, "learning_rate": 4.455487793546939e-05, "loss": 0.0725, "num_input_tokens_seen": 11114144, "step": 8565 }, { "epoch": 0.4187330515720812, "grad_norm": 0.20407813787460327, "learning_rate": 4.454875981029719e-05, "loss": 0.0816, "num_input_tokens_seen": 11121120, "step": 8570 }, { "epoch": 0.41897735323577556, "grad_norm": 0.4410685896873474, "learning_rate": 4.454263867046057e-05, "loss": 0.0977, "num_input_tokens_seen": 11127680, "step": 8575 }, { "epoch": 0.4192216548994699, "grad_norm": 0.1353081315755844, "learning_rate": 4.4536514516903484e-05, "loss": 0.0889, "num_input_tokens_seen": 11134144, "step": 8580 }, { "epoch": 0.4194659565631642, "grad_norm": 0.5105931758880615, "learning_rate": 4.453038735057034e-05, "loss": 0.085, "num_input_tokens_seen": 11140864, "step": 8585 }, { "epoch": 0.41971025822685853, "grad_norm": 0.31556299328804016, "learning_rate": 4.4524257172406034e-05, "loss": 0.108, "num_input_tokens_seen": 11147264, "step": 8590 }, { "epoch": 0.41995455989055286, "grad_norm": 0.3487129807472229, "learning_rate": 4.451812398335592e-05, "loss": 0.0959, "num_input_tokens_seen": 11153472, "step": 8595 }, { "epoch": 0.4201988615542472, "grad_norm": 0.2850344479084015, "learning_rate": 4.4511987784365805e-05, "loss": 0.1287, "num_input_tokens_seen": 11159584, "step": 8600 }, { "epoch": 0.4201988615542472, "eval_loss": 0.09471043199300766, "eval_runtime": 374.7139, "eval_samples_per_second": 97.101, "eval_steps_per_second": 24.277, "num_input_tokens_seen": 11159584, "step": 8600 }, { "epoch": 0.4204431632179415, "grad_norm": 0.2151356041431427, "learning_rate": 4.450584857638197e-05, "loss": 0.1031, "num_input_tokens_seen": 11166080, "step": 8605 }, { "epoch": 0.42068746488163583, "grad_norm": 0.7869364023208618, "learning_rate": 4.449970636035116e-05, "loss": 0.0866, "num_input_tokens_seen": 11172256, "step": 8610 }, { "epoch": 0.42093176654533015, "grad_norm": 0.4161292314529419, "learning_rate": 4.4493561137220574e-05, "loss": 0.0939, "num_input_tokens_seen": 11179136, "step": 8615 }, { "epoch": 0.4211760682090245, "grad_norm": 0.21215486526489258, "learning_rate": 4.44874129079379e-05, "loss": 0.1007, "num_input_tokens_seen": 11185568, "step": 8620 }, { "epoch": 0.42142036987271886, "grad_norm": 0.33423087000846863, "learning_rate": 4.4481261673451255e-05, "loss": 0.0984, "num_input_tokens_seen": 11192160, "step": 8625 }, { "epoch": 0.4216646715364132, "grad_norm": 0.16426096856594086, "learning_rate": 4.4475107434709245e-05, "loss": 0.0894, "num_input_tokens_seen": 11198336, "step": 8630 }, { "epoch": 0.4219089732001075, "grad_norm": 0.13835877180099487, "learning_rate": 4.446895019266093e-05, "loss": 0.0864, "num_input_tokens_seen": 11204416, "step": 8635 }, { "epoch": 0.42215327486380183, "grad_norm": 0.29693421721458435, "learning_rate": 4.446278994825583e-05, "loss": 0.0727, "num_input_tokens_seen": 11210688, "step": 8640 }, { "epoch": 0.42239757652749615, "grad_norm": 0.5148151516914368, "learning_rate": 4.445662670244394e-05, "loss": 0.0959, "num_input_tokens_seen": 11217248, "step": 8645 }, { "epoch": 0.4226418781911905, "grad_norm": 0.24974198639392853, "learning_rate": 4.44504604561757e-05, "loss": 0.0827, "num_input_tokens_seen": 11223648, "step": 8650 }, { "epoch": 0.4228861798548848, "grad_norm": 0.6390606164932251, "learning_rate": 4.4444291210402035e-05, "loss": 0.1027, "num_input_tokens_seen": 11230208, "step": 8655 }, { "epoch": 0.4231304815185791, "grad_norm": 0.3569570481777191, "learning_rate": 4.443811896607431e-05, "loss": 0.11, "num_input_tokens_seen": 11236576, "step": 8660 }, { "epoch": 0.42337478318227345, "grad_norm": 0.15696261823177338, "learning_rate": 4.443194372414436e-05, "loss": 0.0776, "num_input_tokens_seen": 11242816, "step": 8665 }, { "epoch": 0.4236190848459678, "grad_norm": 0.33578795194625854, "learning_rate": 4.442576548556449e-05, "loss": 0.0859, "num_input_tokens_seen": 11249504, "step": 8670 }, { "epoch": 0.42386338650966215, "grad_norm": 0.4556460976600647, "learning_rate": 4.441958425128747e-05, "loss": 0.101, "num_input_tokens_seen": 11256128, "step": 8675 }, { "epoch": 0.4241076881733565, "grad_norm": 0.3310289680957794, "learning_rate": 4.4413400022266515e-05, "loss": 0.1047, "num_input_tokens_seen": 11263200, "step": 8680 }, { "epoch": 0.4243519898370508, "grad_norm": 0.2889561057090759, "learning_rate": 4.4407212799455313e-05, "loss": 0.0643, "num_input_tokens_seen": 11270048, "step": 8685 }, { "epoch": 0.4245962915007451, "grad_norm": 0.1730625331401825, "learning_rate": 4.4401022583808003e-05, "loss": 0.095, "num_input_tokens_seen": 11276992, "step": 8690 }, { "epoch": 0.42484059316443945, "grad_norm": 0.18595315515995026, "learning_rate": 4.439482937627921e-05, "loss": 0.0932, "num_input_tokens_seen": 11283872, "step": 8695 }, { "epoch": 0.4250848948281338, "grad_norm": 0.6878445148468018, "learning_rate": 4.4388633177824004e-05, "loss": 0.1019, "num_input_tokens_seen": 11289824, "step": 8700 }, { "epoch": 0.4253291964918281, "grad_norm": 0.49487248063087463, "learning_rate": 4.4382433989397895e-05, "loss": 0.0866, "num_input_tokens_seen": 11296064, "step": 8705 }, { "epoch": 0.4255734981555224, "grad_norm": 0.24426932632923126, "learning_rate": 4.4376231811956895e-05, "loss": 0.096, "num_input_tokens_seen": 11302368, "step": 8710 }, { "epoch": 0.42581779981921675, "grad_norm": 0.338815838098526, "learning_rate": 4.437002664645745e-05, "loss": 0.0932, "num_input_tokens_seen": 11308704, "step": 8715 }, { "epoch": 0.42606210148291107, "grad_norm": 0.589390754699707, "learning_rate": 4.436381849385649e-05, "loss": 0.1025, "num_input_tokens_seen": 11315360, "step": 8720 }, { "epoch": 0.42630640314660545, "grad_norm": 0.6501173973083496, "learning_rate": 4.435760735511136e-05, "loss": 0.0809, "num_input_tokens_seen": 11321792, "step": 8725 }, { "epoch": 0.4265507048102998, "grad_norm": 0.3952428698539734, "learning_rate": 4.435139323117992e-05, "loss": 0.0871, "num_input_tokens_seen": 11328000, "step": 8730 }, { "epoch": 0.4267950064739941, "grad_norm": 0.34026217460632324, "learning_rate": 4.434517612302046e-05, "loss": 0.0749, "num_input_tokens_seen": 11334560, "step": 8735 }, { "epoch": 0.4270393081376884, "grad_norm": 0.9436513781547546, "learning_rate": 4.433895603159174e-05, "loss": 0.0987, "num_input_tokens_seen": 11340864, "step": 8740 }, { "epoch": 0.42728360980138275, "grad_norm": 0.16071435809135437, "learning_rate": 4.433273295785296e-05, "loss": 0.0648, "num_input_tokens_seen": 11347360, "step": 8745 }, { "epoch": 0.42752791146507707, "grad_norm": 0.5065325498580933, "learning_rate": 4.432650690276382e-05, "loss": 0.0903, "num_input_tokens_seen": 11353856, "step": 8750 }, { "epoch": 0.4277722131287714, "grad_norm": 0.5208543539047241, "learning_rate": 4.4320277867284435e-05, "loss": 0.0914, "num_input_tokens_seen": 11360288, "step": 8755 }, { "epoch": 0.4280165147924657, "grad_norm": 0.17532987892627716, "learning_rate": 4.431404585237541e-05, "loss": 0.0658, "num_input_tokens_seen": 11367040, "step": 8760 }, { "epoch": 0.42826081645616004, "grad_norm": 0.7774492502212524, "learning_rate": 4.43078108589978e-05, "loss": 0.0979, "num_input_tokens_seen": 11373536, "step": 8765 }, { "epoch": 0.4285051181198544, "grad_norm": 0.4482872784137726, "learning_rate": 4.4301572888113116e-05, "loss": 0.1015, "num_input_tokens_seen": 11379808, "step": 8770 }, { "epoch": 0.42874941978354875, "grad_norm": 0.3861401379108429, "learning_rate": 4.4295331940683337e-05, "loss": 0.0651, "num_input_tokens_seen": 11386784, "step": 8775 }, { "epoch": 0.42899372144724307, "grad_norm": 0.41124191880226135, "learning_rate": 4.428908801767089e-05, "loss": 0.0798, "num_input_tokens_seen": 11393536, "step": 8780 }, { "epoch": 0.4292380231109374, "grad_norm": 0.20715075731277466, "learning_rate": 4.428284112003868e-05, "loss": 0.1135, "num_input_tokens_seen": 11399808, "step": 8785 }, { "epoch": 0.4294823247746317, "grad_norm": 0.29643112421035767, "learning_rate": 4.4276591248750033e-05, "loss": 0.0574, "num_input_tokens_seen": 11407648, "step": 8790 }, { "epoch": 0.42972662643832604, "grad_norm": 0.519967257976532, "learning_rate": 4.4270338404768774e-05, "loss": 0.0824, "num_input_tokens_seen": 11414368, "step": 8795 }, { "epoch": 0.42997092810202037, "grad_norm": 0.2172907143831253, "learning_rate": 4.426408258905917e-05, "loss": 0.0949, "num_input_tokens_seen": 11420640, "step": 8800 }, { "epoch": 0.42997092810202037, "eval_loss": 0.09419868886470795, "eval_runtime": 374.8665, "eval_samples_per_second": 97.061, "eval_steps_per_second": 24.267, "num_input_tokens_seen": 11420640, "step": 8800 }, { "epoch": 0.4302152297657147, "grad_norm": 0.3958605229854584, "learning_rate": 4.425782380258594e-05, "loss": 0.0731, "num_input_tokens_seen": 11427168, "step": 8805 }, { "epoch": 0.430459531429409, "grad_norm": 0.15529663860797882, "learning_rate": 4.425156204631427e-05, "loss": 0.0982, "num_input_tokens_seen": 11433856, "step": 8810 }, { "epoch": 0.43070383309310334, "grad_norm": 0.37306979298591614, "learning_rate": 4.424529732120981e-05, "loss": 0.0926, "num_input_tokens_seen": 11440192, "step": 8815 }, { "epoch": 0.4309481347567977, "grad_norm": 0.3798374533653259, "learning_rate": 4.423902962823864e-05, "loss": 0.1302, "num_input_tokens_seen": 11446464, "step": 8820 }, { "epoch": 0.43119243642049204, "grad_norm": 0.35076117515563965, "learning_rate": 4.423275896836733e-05, "loss": 0.0874, "num_input_tokens_seen": 11453024, "step": 8825 }, { "epoch": 0.43143673808418637, "grad_norm": 0.10571426898241043, "learning_rate": 4.42264853425629e-05, "loss": 0.1091, "num_input_tokens_seen": 11459136, "step": 8830 }, { "epoch": 0.4316810397478807, "grad_norm": 0.3908601403236389, "learning_rate": 4.4220208751792816e-05, "loss": 0.1006, "num_input_tokens_seen": 11465664, "step": 8835 }, { "epoch": 0.431925341411575, "grad_norm": 0.33171728253364563, "learning_rate": 4.421392919702499e-05, "loss": 0.0962, "num_input_tokens_seen": 11472320, "step": 8840 }, { "epoch": 0.43216964307526934, "grad_norm": 0.5789251327514648, "learning_rate": 4.4207646679227846e-05, "loss": 0.1009, "num_input_tokens_seen": 11478944, "step": 8845 }, { "epoch": 0.43241394473896366, "grad_norm": 0.2617029845714569, "learning_rate": 4.42013611993702e-05, "loss": 0.1039, "num_input_tokens_seen": 11485568, "step": 8850 }, { "epoch": 0.432658246402658, "grad_norm": 0.7831550240516663, "learning_rate": 4.419507275842135e-05, "loss": 0.1054, "num_input_tokens_seen": 11492448, "step": 8855 }, { "epoch": 0.4329025480663523, "grad_norm": 0.40194278955459595, "learning_rate": 4.418878135735106e-05, "loss": 0.0924, "num_input_tokens_seen": 11498848, "step": 8860 }, { "epoch": 0.43314684973004663, "grad_norm": 0.2659052312374115, "learning_rate": 4.418248699712955e-05, "loss": 0.0799, "num_input_tokens_seen": 11505408, "step": 8865 }, { "epoch": 0.433391151393741, "grad_norm": 0.9959058165550232, "learning_rate": 4.417618967872748e-05, "loss": 0.0892, "num_input_tokens_seen": 11512128, "step": 8870 }, { "epoch": 0.43363545305743534, "grad_norm": 0.269513338804245, "learning_rate": 4.4169889403115985e-05, "loss": 0.08, "num_input_tokens_seen": 11518816, "step": 8875 }, { "epoch": 0.43387975472112966, "grad_norm": 0.3428349494934082, "learning_rate": 4.4163586171266627e-05, "loss": 0.0653, "num_input_tokens_seen": 11525632, "step": 8880 }, { "epoch": 0.434124056384824, "grad_norm": 0.13840103149414062, "learning_rate": 4.415727998415147e-05, "loss": 0.0875, "num_input_tokens_seen": 11531840, "step": 8885 }, { "epoch": 0.4343683580485183, "grad_norm": 0.2531660199165344, "learning_rate": 4.4150970842742985e-05, "loss": 0.09, "num_input_tokens_seen": 11538144, "step": 8890 }, { "epoch": 0.43461265971221263, "grad_norm": 0.2589581608772278, "learning_rate": 4.4144658748014134e-05, "loss": 0.1191, "num_input_tokens_seen": 11545280, "step": 8895 }, { "epoch": 0.43485696137590696, "grad_norm": 0.20739047229290009, "learning_rate": 4.413834370093831e-05, "loss": 0.0891, "num_input_tokens_seen": 11552032, "step": 8900 }, { "epoch": 0.4351012630396013, "grad_norm": 0.35160383582115173, "learning_rate": 4.413202570248939e-05, "loss": 0.078, "num_input_tokens_seen": 11558464, "step": 8905 }, { "epoch": 0.4353455647032956, "grad_norm": 0.3363826274871826, "learning_rate": 4.412570475364167e-05, "loss": 0.1118, "num_input_tokens_seen": 11565280, "step": 8910 }, { "epoch": 0.43558986636699, "grad_norm": 0.16016387939453125, "learning_rate": 4.411938085536994e-05, "loss": 0.0858, "num_input_tokens_seen": 11571680, "step": 8915 }, { "epoch": 0.4358341680306843, "grad_norm": 0.4387572407722473, "learning_rate": 4.41130540086494e-05, "loss": 0.0787, "num_input_tokens_seen": 11577856, "step": 8920 }, { "epoch": 0.43607846969437863, "grad_norm": 0.6249940395355225, "learning_rate": 4.4106724214455754e-05, "loss": 0.0864, "num_input_tokens_seen": 11584768, "step": 8925 }, { "epoch": 0.43632277135807296, "grad_norm": 0.40941357612609863, "learning_rate": 4.4100391473765115e-05, "loss": 0.0667, "num_input_tokens_seen": 11591136, "step": 8930 }, { "epoch": 0.4365670730217673, "grad_norm": 0.1713060587644577, "learning_rate": 4.409405578755408e-05, "loss": 0.0791, "num_input_tokens_seen": 11598176, "step": 8935 }, { "epoch": 0.4368113746854616, "grad_norm": 0.19710151851177216, "learning_rate": 4.4087717156799705e-05, "loss": 0.1015, "num_input_tokens_seen": 11604384, "step": 8940 }, { "epoch": 0.43705567634915593, "grad_norm": 0.6424387693405151, "learning_rate": 4.408137558247946e-05, "loss": 0.0844, "num_input_tokens_seen": 11611104, "step": 8945 }, { "epoch": 0.43729997801285025, "grad_norm": 0.16286930441856384, "learning_rate": 4.4075031065571306e-05, "loss": 0.0743, "num_input_tokens_seen": 11617248, "step": 8950 }, { "epoch": 0.4375442796765446, "grad_norm": 0.14837433397769928, "learning_rate": 4.406868360705366e-05, "loss": 0.0704, "num_input_tokens_seen": 11623936, "step": 8955 }, { "epoch": 0.4377885813402389, "grad_norm": 0.28739142417907715, "learning_rate": 4.406233320790536e-05, "loss": 0.109, "num_input_tokens_seen": 11630432, "step": 8960 }, { "epoch": 0.4380328830039333, "grad_norm": 0.11382412165403366, "learning_rate": 4.4055979869105734e-05, "loss": 0.0821, "num_input_tokens_seen": 11636992, "step": 8965 }, { "epoch": 0.4382771846676276, "grad_norm": 0.22713597118854523, "learning_rate": 4.404962359163454e-05, "loss": 0.1077, "num_input_tokens_seen": 11643200, "step": 8970 }, { "epoch": 0.43852148633132193, "grad_norm": 0.12877866625785828, "learning_rate": 4.404326437647199e-05, "loss": 0.0791, "num_input_tokens_seen": 11650240, "step": 8975 }, { "epoch": 0.43876578799501625, "grad_norm": 0.2296106517314911, "learning_rate": 4.403690222459877e-05, "loss": 0.09, "num_input_tokens_seen": 11657216, "step": 8980 }, { "epoch": 0.4390100896587106, "grad_norm": 0.222207173705101, "learning_rate": 4.4030537136995984e-05, "loss": 0.072, "num_input_tokens_seen": 11663808, "step": 8985 }, { "epoch": 0.4392543913224049, "grad_norm": 0.2286708950996399, "learning_rate": 4.402416911464523e-05, "loss": 0.0724, "num_input_tokens_seen": 11670048, "step": 8990 }, { "epoch": 0.4394986929860992, "grad_norm": 0.6435275673866272, "learning_rate": 4.4017798158528516e-05, "loss": 0.1083, "num_input_tokens_seen": 11676480, "step": 8995 }, { "epoch": 0.43974299464979355, "grad_norm": 0.5325417518615723, "learning_rate": 4.401142426962834e-05, "loss": 0.0993, "num_input_tokens_seen": 11683072, "step": 9000 }, { "epoch": 0.43974299464979355, "eval_loss": 0.09458503872156143, "eval_runtime": 375.3001, "eval_samples_per_second": 96.949, "eval_steps_per_second": 24.239, "num_input_tokens_seen": 11683072, "step": 9000 }, { "epoch": 0.4399872963134879, "grad_norm": 0.35652950406074524, "learning_rate": 4.400504744892763e-05, "loss": 0.0776, "num_input_tokens_seen": 11689920, "step": 9005 }, { "epoch": 0.4402315979771822, "grad_norm": 0.9690607786178589, "learning_rate": 4.399866769740975e-05, "loss": 0.1235, "num_input_tokens_seen": 11696448, "step": 9010 }, { "epoch": 0.4404758996408766, "grad_norm": 0.2537718117237091, "learning_rate": 4.399228501605859e-05, "loss": 0.0781, "num_input_tokens_seen": 11703200, "step": 9015 }, { "epoch": 0.4407202013045709, "grad_norm": 0.21395860612392426, "learning_rate": 4.398589940585839e-05, "loss": 0.1074, "num_input_tokens_seen": 11709504, "step": 9020 }, { "epoch": 0.4409645029682652, "grad_norm": 0.19123700261116028, "learning_rate": 4.3979510867793917e-05, "loss": 0.1162, "num_input_tokens_seen": 11716224, "step": 9025 }, { "epoch": 0.44120880463195955, "grad_norm": 0.5989947319030762, "learning_rate": 4.3973119402850346e-05, "loss": 0.0887, "num_input_tokens_seen": 11722496, "step": 9030 }, { "epoch": 0.4414531062956539, "grad_norm": 0.22670532763004303, "learning_rate": 4.396672501201334e-05, "loss": 0.1106, "num_input_tokens_seen": 11728768, "step": 9035 }, { "epoch": 0.4416974079593482, "grad_norm": 0.6787622570991516, "learning_rate": 4.396032769626899e-05, "loss": 0.1329, "num_input_tokens_seen": 11735136, "step": 9040 }, { "epoch": 0.4419417096230425, "grad_norm": 0.156978040933609, "learning_rate": 4.395392745660384e-05, "loss": 0.105, "num_input_tokens_seen": 11741664, "step": 9045 }, { "epoch": 0.44218601128673685, "grad_norm": 0.15469834208488464, "learning_rate": 4.394752429400488e-05, "loss": 0.0981, "num_input_tokens_seen": 11748256, "step": 9050 }, { "epoch": 0.44243031295043117, "grad_norm": 0.7161087393760681, "learning_rate": 4.394111820945957e-05, "loss": 0.0982, "num_input_tokens_seen": 11754752, "step": 9055 }, { "epoch": 0.4426746146141255, "grad_norm": 0.16857406497001648, "learning_rate": 4.393470920395579e-05, "loss": 0.0705, "num_input_tokens_seen": 11760992, "step": 9060 }, { "epoch": 0.4429189162778199, "grad_norm": 0.25324028730392456, "learning_rate": 4.392829727848192e-05, "loss": 0.1033, "num_input_tokens_seen": 11767616, "step": 9065 }, { "epoch": 0.4431632179415142, "grad_norm": 0.4407302439212799, "learning_rate": 4.392188243402673e-05, "loss": 0.0885, "num_input_tokens_seen": 11774368, "step": 9070 }, { "epoch": 0.4434075196052085, "grad_norm": 0.23045296967029572, "learning_rate": 4.391546467157949e-05, "loss": 0.0952, "num_input_tokens_seen": 11780736, "step": 9075 }, { "epoch": 0.44365182126890285, "grad_norm": 0.2843899428844452, "learning_rate": 4.390904399212988e-05, "loss": 0.1064, "num_input_tokens_seen": 11787040, "step": 9080 }, { "epoch": 0.44389612293259717, "grad_norm": 0.5719471573829651, "learning_rate": 4.390262039666807e-05, "loss": 0.0829, "num_input_tokens_seen": 11793376, "step": 9085 }, { "epoch": 0.4441404245962915, "grad_norm": 0.2075566202402115, "learning_rate": 4.389619388618464e-05, "loss": 0.0921, "num_input_tokens_seen": 11799904, "step": 9090 }, { "epoch": 0.4443847262599858, "grad_norm": 0.21379385888576508, "learning_rate": 4.3889764461670655e-05, "loss": 0.0506, "num_input_tokens_seen": 11806368, "step": 9095 }, { "epoch": 0.44462902792368014, "grad_norm": 0.21732303500175476, "learning_rate": 4.38833321241176e-05, "loss": 0.0859, "num_input_tokens_seen": 11812928, "step": 9100 }, { "epoch": 0.44487332958737447, "grad_norm": 0.23905523121356964, "learning_rate": 4.3876896874517434e-05, "loss": 0.0967, "num_input_tokens_seen": 11819072, "step": 9105 }, { "epoch": 0.44511763125106885, "grad_norm": 0.5037259459495544, "learning_rate": 4.3870458713862554e-05, "loss": 0.1062, "num_input_tokens_seen": 11825120, "step": 9110 }, { "epoch": 0.44536193291476317, "grad_norm": 0.20248618721961975, "learning_rate": 4.386401764314579e-05, "loss": 0.0905, "num_input_tokens_seen": 11831936, "step": 9115 }, { "epoch": 0.4456062345784575, "grad_norm": 0.4579582214355469, "learning_rate": 4.385757366336045e-05, "loss": 0.0731, "num_input_tokens_seen": 11838624, "step": 9120 }, { "epoch": 0.4458505362421518, "grad_norm": 0.1333591490983963, "learning_rate": 4.385112677550027e-05, "loss": 0.0781, "num_input_tokens_seen": 11844576, "step": 9125 }, { "epoch": 0.44609483790584614, "grad_norm": 0.46799561381340027, "learning_rate": 4.384467698055945e-05, "loss": 0.0919, "num_input_tokens_seen": 11851072, "step": 9130 }, { "epoch": 0.44633913956954047, "grad_norm": 0.3900231122970581, "learning_rate": 4.383822427953261e-05, "loss": 0.0748, "num_input_tokens_seen": 11856640, "step": 9135 }, { "epoch": 0.4465834412332348, "grad_norm": 0.24502617120742798, "learning_rate": 4.3831768673414864e-05, "loss": 0.0891, "num_input_tokens_seen": 11863392, "step": 9140 }, { "epoch": 0.4468277428969291, "grad_norm": 0.3335499167442322, "learning_rate": 4.382531016320173e-05, "loss": 0.0837, "num_input_tokens_seen": 11869984, "step": 9145 }, { "epoch": 0.44707204456062344, "grad_norm": 0.21495899558067322, "learning_rate": 4.3818848749889184e-05, "loss": 0.1111, "num_input_tokens_seen": 11877312, "step": 9150 }, { "epoch": 0.44731634622431776, "grad_norm": 0.3568015694618225, "learning_rate": 4.381238443447368e-05, "loss": 0.0867, "num_input_tokens_seen": 11883904, "step": 9155 }, { "epoch": 0.44756064788801214, "grad_norm": 0.22483213245868683, "learning_rate": 4.380591721795208e-05, "loss": 0.0967, "num_input_tokens_seen": 11890368, "step": 9160 }, { "epoch": 0.44780494955170647, "grad_norm": 0.14732231199741364, "learning_rate": 4.3799447101321723e-05, "loss": 0.1157, "num_input_tokens_seen": 11896640, "step": 9165 }, { "epoch": 0.4480492512154008, "grad_norm": 0.4774629473686218, "learning_rate": 4.379297408558036e-05, "loss": 0.0894, "num_input_tokens_seen": 11902752, "step": 9170 }, { "epoch": 0.4482935528790951, "grad_norm": 0.26864027976989746, "learning_rate": 4.378649817172624e-05, "loss": 0.0684, "num_input_tokens_seen": 11909376, "step": 9175 }, { "epoch": 0.44853785454278944, "grad_norm": 0.17696569859981537, "learning_rate": 4.378001936075801e-05, "loss": 0.0741, "num_input_tokens_seen": 11915680, "step": 9180 }, { "epoch": 0.44878215620648376, "grad_norm": 0.09855657070875168, "learning_rate": 4.377353765367479e-05, "loss": 0.082, "num_input_tokens_seen": 11922048, "step": 9185 }, { "epoch": 0.4490264578701781, "grad_norm": 0.1896914392709732, "learning_rate": 4.376705305147614e-05, "loss": 0.1113, "num_input_tokens_seen": 11928448, "step": 9190 }, { "epoch": 0.4492707595338724, "grad_norm": 0.7494770288467407, "learning_rate": 4.376056555516206e-05, "loss": 0.1075, "num_input_tokens_seen": 11934816, "step": 9195 }, { "epoch": 0.44951506119756673, "grad_norm": 0.1893697828054428, "learning_rate": 4.375407516573302e-05, "loss": 0.1095, "num_input_tokens_seen": 11941600, "step": 9200 }, { "epoch": 0.44951506119756673, "eval_loss": 0.09403134137392044, "eval_runtime": 375.4719, "eval_samples_per_second": 96.905, "eval_steps_per_second": 24.228, "num_input_tokens_seen": 11941600, "step": 9200 }, { "epoch": 0.44975936286126106, "grad_norm": 0.14122571051120758, "learning_rate": 4.3747581884189913e-05, "loss": 0.0893, "num_input_tokens_seen": 11948064, "step": 9205 }, { "epoch": 0.45000366452495544, "grad_norm": 0.5093544721603394, "learning_rate": 4.374108571153408e-05, "loss": 0.0764, "num_input_tokens_seen": 11954624, "step": 9210 }, { "epoch": 0.45024796618864976, "grad_norm": 0.7812740802764893, "learning_rate": 4.3734586648767316e-05, "loss": 0.0903, "num_input_tokens_seen": 11961184, "step": 9215 }, { "epoch": 0.4504922678523441, "grad_norm": 0.40034618973731995, "learning_rate": 4.372808469689186e-05, "loss": 0.0887, "num_input_tokens_seen": 11967136, "step": 9220 }, { "epoch": 0.4507365695160384, "grad_norm": 0.20365630090236664, "learning_rate": 4.372157985691039e-05, "loss": 0.1031, "num_input_tokens_seen": 11973408, "step": 9225 }, { "epoch": 0.45098087117973273, "grad_norm": 0.14554734528064728, "learning_rate": 4.371507212982603e-05, "loss": 0.0735, "num_input_tokens_seen": 11979840, "step": 9230 }, { "epoch": 0.45122517284342706, "grad_norm": 0.2758944034576416, "learning_rate": 4.370856151664236e-05, "loss": 0.0867, "num_input_tokens_seen": 11986272, "step": 9235 }, { "epoch": 0.4514694745071214, "grad_norm": 0.30172017216682434, "learning_rate": 4.3702048018363404e-05, "loss": 0.1061, "num_input_tokens_seen": 11992928, "step": 9240 }, { "epoch": 0.4517137761708157, "grad_norm": 0.17399288713932037, "learning_rate": 4.369553163599362e-05, "loss": 0.0748, "num_input_tokens_seen": 11999584, "step": 9245 }, { "epoch": 0.45195807783451003, "grad_norm": 0.15796343982219696, "learning_rate": 4.3689012370537904e-05, "loss": 0.119, "num_input_tokens_seen": 12005984, "step": 9250 }, { "epoch": 0.4522023794982044, "grad_norm": 0.5924127101898193, "learning_rate": 4.368249022300164e-05, "loss": 0.1224, "num_input_tokens_seen": 12012288, "step": 9255 }, { "epoch": 0.45244668116189873, "grad_norm": 0.8185988068580627, "learning_rate": 4.367596519439059e-05, "loss": 0.1132, "num_input_tokens_seen": 12018560, "step": 9260 }, { "epoch": 0.45269098282559306, "grad_norm": 0.48226943612098694, "learning_rate": 4.366943728571101e-05, "loss": 0.0975, "num_input_tokens_seen": 12024864, "step": 9265 }, { "epoch": 0.4529352844892874, "grad_norm": 0.3797328770160675, "learning_rate": 4.366290649796959e-05, "loss": 0.1049, "num_input_tokens_seen": 12031520, "step": 9270 }, { "epoch": 0.4531795861529817, "grad_norm": 0.2352394461631775, "learning_rate": 4.3656372832173456e-05, "loss": 0.0714, "num_input_tokens_seen": 12038048, "step": 9275 }, { "epoch": 0.45342388781667603, "grad_norm": 0.29331469535827637, "learning_rate": 4.364983628933017e-05, "loss": 0.0712, "num_input_tokens_seen": 12044512, "step": 9280 }, { "epoch": 0.45366818948037035, "grad_norm": 0.21693302690982819, "learning_rate": 4.364329687044777e-05, "loss": 0.0866, "num_input_tokens_seen": 12051072, "step": 9285 }, { "epoch": 0.4539124911440647, "grad_norm": 0.511748731136322, "learning_rate": 4.36367545765347e-05, "loss": 0.1102, "num_input_tokens_seen": 12057664, "step": 9290 }, { "epoch": 0.454156792807759, "grad_norm": 0.16272519528865814, "learning_rate": 4.363020940859988e-05, "loss": 0.0768, "num_input_tokens_seen": 12063776, "step": 9295 }, { "epoch": 0.4544010944714533, "grad_norm": 0.2834262251853943, "learning_rate": 4.362366136765263e-05, "loss": 0.0839, "num_input_tokens_seen": 12070080, "step": 9300 }, { "epoch": 0.4546453961351477, "grad_norm": 0.3144623637199402, "learning_rate": 4.361711045470278e-05, "loss": 0.1024, "num_input_tokens_seen": 12076192, "step": 9305 }, { "epoch": 0.45488969779884203, "grad_norm": 0.27242690324783325, "learning_rate": 4.3610556670760524e-05, "loss": 0.0885, "num_input_tokens_seen": 12082464, "step": 9310 }, { "epoch": 0.45513399946253635, "grad_norm": 0.778265655040741, "learning_rate": 4.360400001683657e-05, "loss": 0.1193, "num_input_tokens_seen": 12088672, "step": 9315 }, { "epoch": 0.4553783011262307, "grad_norm": 0.3023586869239807, "learning_rate": 4.3597440493942e-05, "loss": 0.0724, "num_input_tokens_seen": 12094816, "step": 9320 }, { "epoch": 0.455622602789925, "grad_norm": 0.41790464520454407, "learning_rate": 4.3590878103088405e-05, "loss": 0.0924, "num_input_tokens_seen": 12101152, "step": 9325 }, { "epoch": 0.4558669044536193, "grad_norm": 0.10458789020776749, "learning_rate": 4.358431284528779e-05, "loss": 0.0915, "num_input_tokens_seen": 12107712, "step": 9330 }, { "epoch": 0.45611120611731365, "grad_norm": 0.6143754124641418, "learning_rate": 4.357774472155257e-05, "loss": 0.1259, "num_input_tokens_seen": 12114048, "step": 9335 }, { "epoch": 0.456355507781008, "grad_norm": 0.25665560364723206, "learning_rate": 4.3571173732895664e-05, "loss": 0.1243, "num_input_tokens_seen": 12120160, "step": 9340 }, { "epoch": 0.4565998094447023, "grad_norm": 0.9375230669975281, "learning_rate": 4.356459988033039e-05, "loss": 0.0794, "num_input_tokens_seen": 12126304, "step": 9345 }, { "epoch": 0.4568441111083966, "grad_norm": 0.19583477079868317, "learning_rate": 4.355802316487051e-05, "loss": 0.088, "num_input_tokens_seen": 12132544, "step": 9350 }, { "epoch": 0.457088412772091, "grad_norm": 0.2685479521751404, "learning_rate": 4.355144358753025e-05, "loss": 0.0779, "num_input_tokens_seen": 12139168, "step": 9355 }, { "epoch": 0.4573327144357853, "grad_norm": 0.4453516900539398, "learning_rate": 4.354486114932425e-05, "loss": 0.0895, "num_input_tokens_seen": 12146240, "step": 9360 }, { "epoch": 0.45757701609947965, "grad_norm": 0.27941110730171204, "learning_rate": 4.353827585126762e-05, "loss": 0.1065, "num_input_tokens_seen": 12152768, "step": 9365 }, { "epoch": 0.457821317763174, "grad_norm": 0.32358694076538086, "learning_rate": 4.353168769437588e-05, "loss": 0.0911, "num_input_tokens_seen": 12159392, "step": 9370 }, { "epoch": 0.4580656194268683, "grad_norm": 0.17039921879768372, "learning_rate": 4.3525096679665014e-05, "loss": 0.0641, "num_input_tokens_seen": 12165600, "step": 9375 }, { "epoch": 0.4583099210905626, "grad_norm": 0.43061789870262146, "learning_rate": 4.351850280815144e-05, "loss": 0.0918, "num_input_tokens_seen": 12172320, "step": 9380 }, { "epoch": 0.45855422275425695, "grad_norm": 0.24483342468738556, "learning_rate": 4.3511906080852014e-05, "loss": 0.0733, "num_input_tokens_seen": 12178400, "step": 9385 }, { "epoch": 0.45879852441795127, "grad_norm": 0.09682965278625488, "learning_rate": 4.350530649878404e-05, "loss": 0.0799, "num_input_tokens_seen": 12185088, "step": 9390 }, { "epoch": 0.4590428260816456, "grad_norm": 0.2790645658969879, "learning_rate": 4.3498704062965246e-05, "loss": 0.0805, "num_input_tokens_seen": 12192032, "step": 9395 }, { "epoch": 0.4592871277453399, "grad_norm": 0.13583171367645264, "learning_rate": 4.3492098774413815e-05, "loss": 0.1012, "num_input_tokens_seen": 12198528, "step": 9400 }, { "epoch": 0.4592871277453399, "eval_loss": 0.09385968744754791, "eval_runtime": 374.9164, "eval_samples_per_second": 97.048, "eval_steps_per_second": 24.264, "num_input_tokens_seen": 12198528, "step": 9400 }, { "epoch": 0.4595314294090343, "grad_norm": 0.2074560523033142, "learning_rate": 4.3485490634148375e-05, "loss": 0.1012, "num_input_tokens_seen": 12204928, "step": 9405 }, { "epoch": 0.4597757310727286, "grad_norm": 0.3016214668750763, "learning_rate": 4.347887964318797e-05, "loss": 0.0876, "num_input_tokens_seen": 12211136, "step": 9410 }, { "epoch": 0.46002003273642295, "grad_norm": 0.3320539891719818, "learning_rate": 4.34722658025521e-05, "loss": 0.0882, "num_input_tokens_seen": 12217408, "step": 9415 }, { "epoch": 0.46026433440011727, "grad_norm": 0.33656972646713257, "learning_rate": 4.346564911326071e-05, "loss": 0.07, "num_input_tokens_seen": 12223392, "step": 9420 }, { "epoch": 0.4605086360638116, "grad_norm": 0.29659655690193176, "learning_rate": 4.345902957633418e-05, "loss": 0.0806, "num_input_tokens_seen": 12229824, "step": 9425 }, { "epoch": 0.4607529377275059, "grad_norm": 0.37560784816741943, "learning_rate": 4.345240719279331e-05, "loss": 0.0809, "num_input_tokens_seen": 12236128, "step": 9430 }, { "epoch": 0.46099723939120024, "grad_norm": 0.6014356017112732, "learning_rate": 4.3445781963659374e-05, "loss": 0.074, "num_input_tokens_seen": 12242816, "step": 9435 }, { "epoch": 0.46124154105489457, "grad_norm": 0.30074095726013184, "learning_rate": 4.3439153889954045e-05, "loss": 0.1108, "num_input_tokens_seen": 12249184, "step": 9440 }, { "epoch": 0.4614858427185889, "grad_norm": 0.6799278855323792, "learning_rate": 4.343252297269946e-05, "loss": 0.1035, "num_input_tokens_seen": 12255456, "step": 9445 }, { "epoch": 0.46173014438228327, "grad_norm": 0.1827142983675003, "learning_rate": 4.342588921291821e-05, "loss": 0.0845, "num_input_tokens_seen": 12261952, "step": 9450 }, { "epoch": 0.4619744460459776, "grad_norm": 0.8026697635650635, "learning_rate": 4.341925261163328e-05, "loss": 0.0809, "num_input_tokens_seen": 12268384, "step": 9455 }, { "epoch": 0.4622187477096719, "grad_norm": 0.34157794713974, "learning_rate": 4.341261316986813e-05, "loss": 0.0886, "num_input_tokens_seen": 12274720, "step": 9460 }, { "epoch": 0.46246304937336624, "grad_norm": 0.31060081720352173, "learning_rate": 4.340597088864664e-05, "loss": 0.1136, "num_input_tokens_seen": 12281120, "step": 9465 }, { "epoch": 0.46270735103706057, "grad_norm": 0.38054344058036804, "learning_rate": 4.339932576899313e-05, "loss": 0.0979, "num_input_tokens_seen": 12287456, "step": 9470 }, { "epoch": 0.4629516527007549, "grad_norm": 0.15743988752365112, "learning_rate": 4.3392677811932375e-05, "loss": 0.0946, "num_input_tokens_seen": 12294496, "step": 9475 }, { "epoch": 0.4631959543644492, "grad_norm": 0.26877492666244507, "learning_rate": 4.338602701848956e-05, "loss": 0.0851, "num_input_tokens_seen": 12301152, "step": 9480 }, { "epoch": 0.46344025602814354, "grad_norm": 0.1979747712612152, "learning_rate": 4.337937338969033e-05, "loss": 0.1013, "num_input_tokens_seen": 12307232, "step": 9485 }, { "epoch": 0.46368455769183786, "grad_norm": 0.18034245073795319, "learning_rate": 4.337271692656075e-05, "loss": 0.0785, "num_input_tokens_seen": 12313568, "step": 9490 }, { "epoch": 0.4639288593555322, "grad_norm": 0.47406333684921265, "learning_rate": 4.336605763012733e-05, "loss": 0.14, "num_input_tokens_seen": 12320416, "step": 9495 }, { "epoch": 0.46417316101922657, "grad_norm": 0.2875617444515228, "learning_rate": 4.3359395501417026e-05, "loss": 0.0693, "num_input_tokens_seen": 12327456, "step": 9500 }, { "epoch": 0.4644174626829209, "grad_norm": 0.23902471363544464, "learning_rate": 4.335273054145722e-05, "loss": 0.0628, "num_input_tokens_seen": 12334368, "step": 9505 }, { "epoch": 0.4646617643466152, "grad_norm": 0.37962692975997925, "learning_rate": 4.334606275127572e-05, "loss": 0.0892, "num_input_tokens_seen": 12340736, "step": 9510 }, { "epoch": 0.46490606601030954, "grad_norm": 0.278117299079895, "learning_rate": 4.33393921319008e-05, "loss": 0.1115, "num_input_tokens_seen": 12347296, "step": 9515 }, { "epoch": 0.46515036767400386, "grad_norm": 0.271131306886673, "learning_rate": 4.3332718684361146e-05, "loss": 0.0641, "num_input_tokens_seen": 12353792, "step": 9520 }, { "epoch": 0.4653946693376982, "grad_norm": 0.41413503885269165, "learning_rate": 4.332604240968588e-05, "loss": 0.1012, "num_input_tokens_seen": 12360480, "step": 9525 }, { "epoch": 0.4656389710013925, "grad_norm": 0.845486044883728, "learning_rate": 4.331936330890459e-05, "loss": 0.098, "num_input_tokens_seen": 12366688, "step": 9530 }, { "epoch": 0.46588327266508683, "grad_norm": 0.8518353700637817, "learning_rate": 4.331268138304725e-05, "loss": 0.1046, "num_input_tokens_seen": 12372736, "step": 9535 }, { "epoch": 0.46612757432878116, "grad_norm": 0.4696088135242462, "learning_rate": 4.330599663314431e-05, "loss": 0.0971, "num_input_tokens_seen": 12379328, "step": 9540 }, { "epoch": 0.4663718759924755, "grad_norm": 0.38906121253967285, "learning_rate": 4.329930906022665e-05, "loss": 0.0926, "num_input_tokens_seen": 12385888, "step": 9545 }, { "epoch": 0.46661617765616986, "grad_norm": 0.40728959441185, "learning_rate": 4.3292618665325564e-05, "loss": 0.0884, "num_input_tokens_seen": 12392192, "step": 9550 }, { "epoch": 0.4668604793198642, "grad_norm": 0.4328010678291321, "learning_rate": 4.3285925449472796e-05, "loss": 0.0835, "num_input_tokens_seen": 12398624, "step": 9555 }, { "epoch": 0.4671047809835585, "grad_norm": 0.5184311270713806, "learning_rate": 4.327922941370054e-05, "loss": 0.0766, "num_input_tokens_seen": 12404768, "step": 9560 }, { "epoch": 0.46734908264725283, "grad_norm": 0.22027291357517242, "learning_rate": 4.3272530559041384e-05, "loss": 0.0894, "num_input_tokens_seen": 12410880, "step": 9565 }, { "epoch": 0.46759338431094716, "grad_norm": 0.4873640239238739, "learning_rate": 4.32658288865284e-05, "loss": 0.0937, "num_input_tokens_seen": 12417280, "step": 9570 }, { "epoch": 0.4678376859746415, "grad_norm": 0.4025898873806, "learning_rate": 4.325912439719505e-05, "loss": 0.115, "num_input_tokens_seen": 12423808, "step": 9575 }, { "epoch": 0.4680819876383358, "grad_norm": 0.14928464591503143, "learning_rate": 4.3252417092075266e-05, "loss": 0.0905, "num_input_tokens_seen": 12430560, "step": 9580 }, { "epoch": 0.46832628930203013, "grad_norm": 0.2617211639881134, "learning_rate": 4.3245706972203385e-05, "loss": 0.1004, "num_input_tokens_seen": 12436992, "step": 9585 }, { "epoch": 0.46857059096572445, "grad_norm": 0.14092741906642914, "learning_rate": 4.323899403861421e-05, "loss": 0.0833, "num_input_tokens_seen": 12443136, "step": 9590 }, { "epoch": 0.46881489262941883, "grad_norm": 0.23320384323596954, "learning_rate": 4.3232278292342935e-05, "loss": 0.0918, "num_input_tokens_seen": 12449472, "step": 9595 }, { "epoch": 0.46905919429311316, "grad_norm": 0.34205394983291626, "learning_rate": 4.322555973442524e-05, "loss": 0.0905, "num_input_tokens_seen": 12455968, "step": 9600 }, { "epoch": 0.46905919429311316, "eval_loss": 0.09413769841194153, "eval_runtime": 375.0456, "eval_samples_per_second": 97.015, "eval_steps_per_second": 24.256, "num_input_tokens_seen": 12455968, "step": 9600 }, { "epoch": 0.4693034959568075, "grad_norm": 0.6816325187683105, "learning_rate": 4.3218838365897184e-05, "loss": 0.0886, "num_input_tokens_seen": 12462304, "step": 9605 }, { "epoch": 0.4695477976205018, "grad_norm": 0.30035367608070374, "learning_rate": 4.3212114187795306e-05, "loss": 0.0939, "num_input_tokens_seen": 12469088, "step": 9610 }, { "epoch": 0.46979209928419613, "grad_norm": 0.12900009751319885, "learning_rate": 4.320538720115656e-05, "loss": 0.0824, "num_input_tokens_seen": 12475296, "step": 9615 }, { "epoch": 0.47003640094789045, "grad_norm": 0.6328036189079285, "learning_rate": 4.319865740701831e-05, "loss": 0.0961, "num_input_tokens_seen": 12481824, "step": 9620 }, { "epoch": 0.4702807026115848, "grad_norm": 0.14382144808769226, "learning_rate": 4.3191924806418396e-05, "loss": 0.0736, "num_input_tokens_seen": 12488928, "step": 9625 }, { "epoch": 0.4705250042752791, "grad_norm": 0.6057449579238892, "learning_rate": 4.318518940039507e-05, "loss": 0.1137, "num_input_tokens_seen": 12495584, "step": 9630 }, { "epoch": 0.4707693059389734, "grad_norm": 0.4363597333431244, "learning_rate": 4.3178451189987e-05, "loss": 0.0834, "num_input_tokens_seen": 12502336, "step": 9635 }, { "epoch": 0.47101360760266775, "grad_norm": 0.7874302268028259, "learning_rate": 4.3171710176233315e-05, "loss": 0.109, "num_input_tokens_seen": 12509120, "step": 9640 }, { "epoch": 0.47125790926636213, "grad_norm": 0.16362763941287994, "learning_rate": 4.316496636017355e-05, "loss": 0.0831, "num_input_tokens_seen": 12515296, "step": 9645 }, { "epoch": 0.47150221093005645, "grad_norm": 0.4915786683559418, "learning_rate": 4.315821974284771e-05, "loss": 0.0929, "num_input_tokens_seen": 12521792, "step": 9650 }, { "epoch": 0.4717465125937508, "grad_norm": 0.485515296459198, "learning_rate": 4.315147032529619e-05, "loss": 0.1096, "num_input_tokens_seen": 12528064, "step": 9655 }, { "epoch": 0.4719908142574451, "grad_norm": 0.17619530856609344, "learning_rate": 4.3144718108559845e-05, "loss": 0.1078, "num_input_tokens_seen": 12535328, "step": 9660 }, { "epoch": 0.4722351159211394, "grad_norm": 0.2919452488422394, "learning_rate": 4.3137963093679945e-05, "loss": 0.0888, "num_input_tokens_seen": 12542400, "step": 9665 }, { "epoch": 0.47247941758483375, "grad_norm": 0.15698130428791046, "learning_rate": 4.31312052816982e-05, "loss": 0.1174, "num_input_tokens_seen": 12548832, "step": 9670 }, { "epoch": 0.4727237192485281, "grad_norm": 0.3048890233039856, "learning_rate": 4.312444467365675e-05, "loss": 0.1131, "num_input_tokens_seen": 12555264, "step": 9675 }, { "epoch": 0.4729680209122224, "grad_norm": 0.1535702645778656, "learning_rate": 4.311768127059816e-05, "loss": 0.0878, "num_input_tokens_seen": 12561312, "step": 9680 }, { "epoch": 0.4732123225759167, "grad_norm": 0.3153226673603058, "learning_rate": 4.3110915073565444e-05, "loss": 0.1111, "num_input_tokens_seen": 12567616, "step": 9685 }, { "epoch": 0.47345662423961105, "grad_norm": 1.1279014348983765, "learning_rate": 4.310414608360203e-05, "loss": 0.0918, "num_input_tokens_seen": 12574048, "step": 9690 }, { "epoch": 0.4737009259033054, "grad_norm": 0.30228251218795776, "learning_rate": 4.309737430175177e-05, "loss": 0.0732, "num_input_tokens_seen": 12580896, "step": 9695 }, { "epoch": 0.47394522756699975, "grad_norm": 0.20101754367351532, "learning_rate": 4.309059972905897e-05, "loss": 0.0579, "num_input_tokens_seen": 12587552, "step": 9700 }, { "epoch": 0.4741895292306941, "grad_norm": 0.3497324585914612, "learning_rate": 4.308382236656836e-05, "loss": 0.0771, "num_input_tokens_seen": 12593696, "step": 9705 }, { "epoch": 0.4744338308943884, "grad_norm": 0.4566214382648468, "learning_rate": 4.307704221532507e-05, "loss": 0.0852, "num_input_tokens_seen": 12600576, "step": 9710 }, { "epoch": 0.4746781325580827, "grad_norm": 0.48624253273010254, "learning_rate": 4.307025927637471e-05, "loss": 0.1123, "num_input_tokens_seen": 12606912, "step": 9715 }, { "epoch": 0.47492243422177705, "grad_norm": 0.5886005163192749, "learning_rate": 4.306347355076328e-05, "loss": 0.0678, "num_input_tokens_seen": 12613312, "step": 9720 }, { "epoch": 0.47516673588547137, "grad_norm": 0.5364214777946472, "learning_rate": 4.305668503953724e-05, "loss": 0.1184, "num_input_tokens_seen": 12620128, "step": 9725 }, { "epoch": 0.4754110375491657, "grad_norm": 0.35428422689437866, "learning_rate": 4.3049893743743436e-05, "loss": 0.083, "num_input_tokens_seen": 12627040, "step": 9730 }, { "epoch": 0.47565533921286, "grad_norm": 0.6503781080245972, "learning_rate": 4.304309966442919e-05, "loss": 0.1081, "num_input_tokens_seen": 12634112, "step": 9735 }, { "epoch": 0.47589964087655434, "grad_norm": 0.203430637717247, "learning_rate": 4.303630280264224e-05, "loss": 0.0702, "num_input_tokens_seen": 12640864, "step": 9740 }, { "epoch": 0.4761439425402487, "grad_norm": 0.223591610789299, "learning_rate": 4.302950315943074e-05, "loss": 0.0695, "num_input_tokens_seen": 12647296, "step": 9745 }, { "epoch": 0.47638824420394305, "grad_norm": 0.5738632678985596, "learning_rate": 4.3022700735843275e-05, "loss": 0.0936, "num_input_tokens_seen": 12653632, "step": 9750 }, { "epoch": 0.47663254586763737, "grad_norm": 0.6361128091812134, "learning_rate": 4.301589553292887e-05, "loss": 0.1004, "num_input_tokens_seen": 12659872, "step": 9755 }, { "epoch": 0.4768768475313317, "grad_norm": 0.3009033799171448, "learning_rate": 4.300908755173697e-05, "loss": 0.0914, "num_input_tokens_seen": 12666272, "step": 9760 }, { "epoch": 0.477121149195026, "grad_norm": 0.6232930421829224, "learning_rate": 4.300227679331745e-05, "loss": 0.0957, "num_input_tokens_seen": 12672544, "step": 9765 }, { "epoch": 0.47736545085872034, "grad_norm": 0.312147319316864, "learning_rate": 4.299546325872063e-05, "loss": 0.0737, "num_input_tokens_seen": 12678944, "step": 9770 }, { "epoch": 0.47760975252241467, "grad_norm": 0.27800989151000977, "learning_rate": 4.2988646948997225e-05, "loss": 0.0831, "num_input_tokens_seen": 12685568, "step": 9775 }, { "epoch": 0.477854054186109, "grad_norm": 0.25603047013282776, "learning_rate": 4.29818278651984e-05, "loss": 0.0858, "num_input_tokens_seen": 12691648, "step": 9780 }, { "epoch": 0.4780983558498033, "grad_norm": 0.7319769263267517, "learning_rate": 4.297500600837574e-05, "loss": 0.0896, "num_input_tokens_seen": 12698048, "step": 9785 }, { "epoch": 0.4783426575134977, "grad_norm": 0.6008042097091675, "learning_rate": 4.2968181379581276e-05, "loss": 0.0684, "num_input_tokens_seen": 12704256, "step": 9790 }, { "epoch": 0.478586959177192, "grad_norm": 0.19206684827804565, "learning_rate": 4.296135397986743e-05, "loss": 0.107, "num_input_tokens_seen": 12710560, "step": 9795 }, { "epoch": 0.47883126084088634, "grad_norm": 0.6373012661933899, "learning_rate": 4.295452381028709e-05, "loss": 0.102, "num_input_tokens_seen": 12716992, "step": 9800 }, { "epoch": 0.47883126084088634, "eval_loss": 0.09356200695037842, "eval_runtime": 374.7527, "eval_samples_per_second": 97.091, "eval_steps_per_second": 24.275, "num_input_tokens_seen": 12716992, "step": 9800 }, { "epoch": 0.47907556250458067, "grad_norm": 0.5252261161804199, "learning_rate": 4.294769087189354e-05, "loss": 0.1031, "num_input_tokens_seen": 12722976, "step": 9805 }, { "epoch": 0.479319864168275, "grad_norm": 0.8096954822540283, "learning_rate": 4.294085516574052e-05, "loss": 0.0897, "num_input_tokens_seen": 12729088, "step": 9810 }, { "epoch": 0.4795641658319693, "grad_norm": 0.23290322721004486, "learning_rate": 4.2934016692882176e-05, "loss": 0.0738, "num_input_tokens_seen": 12735456, "step": 9815 }, { "epoch": 0.47980846749566364, "grad_norm": 0.15550784766674042, "learning_rate": 4.292717545437308e-05, "loss": 0.0883, "num_input_tokens_seen": 12742208, "step": 9820 }, { "epoch": 0.48005276915935796, "grad_norm": 0.27070537209510803, "learning_rate": 4.292033145126825e-05, "loss": 0.1085, "num_input_tokens_seen": 12748704, "step": 9825 }, { "epoch": 0.4802970708230523, "grad_norm": 0.4185120761394501, "learning_rate": 4.29134846846231e-05, "loss": 0.0759, "num_input_tokens_seen": 12754784, "step": 9830 }, { "epoch": 0.4805413724867466, "grad_norm": 0.20937614142894745, "learning_rate": 4.29066351554935e-05, "loss": 0.0905, "num_input_tokens_seen": 12761024, "step": 9835 }, { "epoch": 0.480785674150441, "grad_norm": 0.4572712481021881, "learning_rate": 4.289978286493574e-05, "loss": 0.1079, "num_input_tokens_seen": 12767616, "step": 9840 }, { "epoch": 0.4810299758141353, "grad_norm": 0.3417256772518158, "learning_rate": 4.28929278140065e-05, "loss": 0.0878, "num_input_tokens_seen": 12774080, "step": 9845 }, { "epoch": 0.48127427747782964, "grad_norm": 0.5459171533584595, "learning_rate": 4.288607000376295e-05, "loss": 0.0695, "num_input_tokens_seen": 12780352, "step": 9850 }, { "epoch": 0.48151857914152396, "grad_norm": 0.7492088079452515, "learning_rate": 4.2879209435262624e-05, "loss": 0.0951, "num_input_tokens_seen": 12786816, "step": 9855 }, { "epoch": 0.4817628808052183, "grad_norm": 0.2989334464073181, "learning_rate": 4.287234610956353e-05, "loss": 0.0839, "num_input_tokens_seen": 12793696, "step": 9860 }, { "epoch": 0.4820071824689126, "grad_norm": 0.4919036328792572, "learning_rate": 4.2865480027724056e-05, "loss": 0.101, "num_input_tokens_seen": 12799904, "step": 9865 }, { "epoch": 0.48225148413260693, "grad_norm": 0.3942774832248688, "learning_rate": 4.285861119080306e-05, "loss": 0.1131, "num_input_tokens_seen": 12806848, "step": 9870 }, { "epoch": 0.48249578579630126, "grad_norm": 0.1723613291978836, "learning_rate": 4.2851739599859784e-05, "loss": 0.084, "num_input_tokens_seen": 12813440, "step": 9875 }, { "epoch": 0.4827400874599956, "grad_norm": 0.7141793966293335, "learning_rate": 4.2844865255953934e-05, "loss": 0.1234, "num_input_tokens_seen": 12819648, "step": 9880 }, { "epoch": 0.4829843891236899, "grad_norm": 0.304801344871521, "learning_rate": 4.2837988160145605e-05, "loss": 0.0876, "num_input_tokens_seen": 12825760, "step": 9885 }, { "epoch": 0.4832286907873843, "grad_norm": 0.1995321363210678, "learning_rate": 4.2831108313495336e-05, "loss": 0.099, "num_input_tokens_seen": 12832768, "step": 9890 }, { "epoch": 0.4834729924510786, "grad_norm": 0.6157315373420715, "learning_rate": 4.282422571706408e-05, "loss": 0.0838, "num_input_tokens_seen": 12838944, "step": 9895 }, { "epoch": 0.48371729411477293, "grad_norm": 0.38434284925460815, "learning_rate": 4.281734037191323e-05, "loss": 0.1132, "num_input_tokens_seen": 12844992, "step": 9900 }, { "epoch": 0.48396159577846726, "grad_norm": 0.23871107399463654, "learning_rate": 4.281045227910459e-05, "loss": 0.0733, "num_input_tokens_seen": 12851296, "step": 9905 }, { "epoch": 0.4842058974421616, "grad_norm": 0.20925547182559967, "learning_rate": 4.280356143970038e-05, "loss": 0.1015, "num_input_tokens_seen": 12857472, "step": 9910 }, { "epoch": 0.4844501991058559, "grad_norm": 0.17818550765514374, "learning_rate": 4.279666785476327e-05, "loss": 0.0781, "num_input_tokens_seen": 12863904, "step": 9915 }, { "epoch": 0.48469450076955023, "grad_norm": 0.19844995439052582, "learning_rate": 4.2789771525356325e-05, "loss": 0.1, "num_input_tokens_seen": 12870016, "step": 9920 }, { "epoch": 0.48493880243324455, "grad_norm": 0.21806077659130096, "learning_rate": 4.2782872452543056e-05, "loss": 0.1046, "num_input_tokens_seen": 12876800, "step": 9925 }, { "epoch": 0.4851831040969389, "grad_norm": 0.5236464142799377, "learning_rate": 4.2775970637387376e-05, "loss": 0.1034, "num_input_tokens_seen": 12883072, "step": 9930 }, { "epoch": 0.48542740576063326, "grad_norm": 0.40286150574684143, "learning_rate": 4.276906608095363e-05, "loss": 0.0882, "num_input_tokens_seen": 12889440, "step": 9935 }, { "epoch": 0.4856717074243276, "grad_norm": 0.4697713255882263, "learning_rate": 4.276215878430661e-05, "loss": 0.1078, "num_input_tokens_seen": 12895744, "step": 9940 }, { "epoch": 0.4859160090880219, "grad_norm": 0.3075747489929199, "learning_rate": 4.275524874851149e-05, "loss": 0.1158, "num_input_tokens_seen": 12902016, "step": 9945 }, { "epoch": 0.48616031075171623, "grad_norm": 0.3676363229751587, "learning_rate": 4.274833597463388e-05, "loss": 0.0824, "num_input_tokens_seen": 12908576, "step": 9950 }, { "epoch": 0.48640461241541055, "grad_norm": 0.7787530422210693, "learning_rate": 4.2741420463739824e-05, "loss": 0.0981, "num_input_tokens_seen": 12914688, "step": 9955 }, { "epoch": 0.4866489140791049, "grad_norm": 0.2927566468715668, "learning_rate": 4.273450221689578e-05, "loss": 0.1302, "num_input_tokens_seen": 12921024, "step": 9960 }, { "epoch": 0.4868932157427992, "grad_norm": 0.3238849639892578, "learning_rate": 4.272758123516863e-05, "loss": 0.0789, "num_input_tokens_seen": 12927872, "step": 9965 }, { "epoch": 0.4871375174064935, "grad_norm": 0.1518145054578781, "learning_rate": 4.272065751962567e-05, "loss": 0.0933, "num_input_tokens_seen": 12934208, "step": 9970 }, { "epoch": 0.48738181907018785, "grad_norm": 0.2792324721813202, "learning_rate": 4.271373107133464e-05, "loss": 0.0995, "num_input_tokens_seen": 12940736, "step": 9975 }, { "epoch": 0.4876261207338822, "grad_norm": 0.31354182958602905, "learning_rate": 4.270680189136366e-05, "loss": 0.1098, "num_input_tokens_seen": 12947360, "step": 9980 }, { "epoch": 0.48787042239757655, "grad_norm": 0.2795947194099426, "learning_rate": 4.269986998078132e-05, "loss": 0.0956, "num_input_tokens_seen": 12954176, "step": 9985 }, { "epoch": 0.4881147240612709, "grad_norm": 0.4441078007221222, "learning_rate": 4.2692935340656595e-05, "loss": 0.1041, "num_input_tokens_seen": 12960928, "step": 9990 }, { "epoch": 0.4883590257249652, "grad_norm": 0.7011984586715698, "learning_rate": 4.26859979720589e-05, "loss": 0.1155, "num_input_tokens_seen": 12967648, "step": 9995 }, { "epoch": 0.4886033273886595, "grad_norm": 0.600551426410675, "learning_rate": 4.267905787605806e-05, "loss": 0.0917, "num_input_tokens_seen": 12974048, "step": 10000 }, { "epoch": 0.4886033273886595, "eval_loss": 0.0936049222946167, "eval_runtime": 374.9651, "eval_samples_per_second": 97.036, "eval_steps_per_second": 24.261, "num_input_tokens_seen": 12974048, "step": 10000 }, { "epoch": 0.48884762905235385, "grad_norm": 0.21122337877750397, "learning_rate": 4.267211505372433e-05, "loss": 0.1137, "num_input_tokens_seen": 12980960, "step": 10005 }, { "epoch": 0.4890919307160482, "grad_norm": 0.44049105048179626, "learning_rate": 4.266516950612837e-05, "loss": 0.1359, "num_input_tokens_seen": 12987456, "step": 10010 }, { "epoch": 0.4893362323797425, "grad_norm": 0.2224460244178772, "learning_rate": 4.265822123434128e-05, "loss": 0.0947, "num_input_tokens_seen": 12994016, "step": 10015 }, { "epoch": 0.4895805340434368, "grad_norm": 0.6208474040031433, "learning_rate": 4.265127023943457e-05, "loss": 0.0879, "num_input_tokens_seen": 13000256, "step": 10020 }, { "epoch": 0.48982483570713115, "grad_norm": 0.2073742151260376, "learning_rate": 4.2644316522480176e-05, "loss": 0.1009, "num_input_tokens_seen": 13006368, "step": 10025 }, { "epoch": 0.49006913737082547, "grad_norm": 0.47753068804740906, "learning_rate": 4.263736008455044e-05, "loss": 0.0956, "num_input_tokens_seen": 13012736, "step": 10030 }, { "epoch": 0.49031343903451985, "grad_norm": 0.38614290952682495, "learning_rate": 4.2630400926718125e-05, "loss": 0.0919, "num_input_tokens_seen": 13019072, "step": 10035 }, { "epoch": 0.4905577406982142, "grad_norm": 0.4886200428009033, "learning_rate": 4.262343905005644e-05, "loss": 0.0753, "num_input_tokens_seen": 13025472, "step": 10040 }, { "epoch": 0.4908020423619085, "grad_norm": 0.1106780394911766, "learning_rate": 4.261647445563897e-05, "loss": 0.0684, "num_input_tokens_seen": 13031840, "step": 10045 }, { "epoch": 0.4910463440256028, "grad_norm": 0.4909670352935791, "learning_rate": 4.260950714453976e-05, "loss": 0.0892, "num_input_tokens_seen": 13038272, "step": 10050 }, { "epoch": 0.49129064568929715, "grad_norm": 0.21369346976280212, "learning_rate": 4.2602537117833266e-05, "loss": 0.089, "num_input_tokens_seen": 13044544, "step": 10055 }, { "epoch": 0.49153494735299147, "grad_norm": 0.2265106737613678, "learning_rate": 4.259556437659433e-05, "loss": 0.1105, "num_input_tokens_seen": 13050848, "step": 10060 }, { "epoch": 0.4917792490166858, "grad_norm": 0.1656748503446579, "learning_rate": 4.258858892189825e-05, "loss": 0.1166, "num_input_tokens_seen": 13057280, "step": 10065 }, { "epoch": 0.4920235506803801, "grad_norm": 0.3740430772304535, "learning_rate": 4.2581610754820725e-05, "loss": 0.1081, "num_input_tokens_seen": 13063744, "step": 10070 }, { "epoch": 0.49226785234407444, "grad_norm": 0.49709323048591614, "learning_rate": 4.2574629876437876e-05, "loss": 0.0856, "num_input_tokens_seen": 13070304, "step": 10075 }, { "epoch": 0.49251215400776877, "grad_norm": 0.13967399299144745, "learning_rate": 4.256764628782625e-05, "loss": 0.0826, "num_input_tokens_seen": 13076416, "step": 10080 }, { "epoch": 0.49275645567146314, "grad_norm": 0.5018734335899353, "learning_rate": 4.256065999006279e-05, "loss": 0.0969, "num_input_tokens_seen": 13082816, "step": 10085 }, { "epoch": 0.49300075733515747, "grad_norm": 0.1240691989660263, "learning_rate": 4.2553670984224885e-05, "loss": 0.068, "num_input_tokens_seen": 13089280, "step": 10090 }, { "epoch": 0.4932450589988518, "grad_norm": 0.2407962828874588, "learning_rate": 4.254667927139032e-05, "loss": 0.1081, "num_input_tokens_seen": 13095680, "step": 10095 }, { "epoch": 0.4934893606625461, "grad_norm": 0.22777333855628967, "learning_rate": 4.2539684852637295e-05, "loss": 0.0942, "num_input_tokens_seen": 13102112, "step": 10100 }, { "epoch": 0.49373366232624044, "grad_norm": 0.4781443178653717, "learning_rate": 4.253268772904446e-05, "loss": 0.0811, "num_input_tokens_seen": 13108640, "step": 10105 }, { "epoch": 0.49397796398993477, "grad_norm": 1.2083666324615479, "learning_rate": 4.252568790169085e-05, "loss": 0.1073, "num_input_tokens_seen": 13115136, "step": 10110 }, { "epoch": 0.4942222656536291, "grad_norm": 0.42576366662979126, "learning_rate": 4.251868537165592e-05, "loss": 0.0868, "num_input_tokens_seen": 13121600, "step": 10115 }, { "epoch": 0.4944665673173234, "grad_norm": 0.19144012033939362, "learning_rate": 4.251168014001955e-05, "loss": 0.0675, "num_input_tokens_seen": 13128288, "step": 10120 }, { "epoch": 0.49471086898101774, "grad_norm": 0.22539496421813965, "learning_rate": 4.250467220786204e-05, "loss": 0.0943, "num_input_tokens_seen": 13134272, "step": 10125 }, { "epoch": 0.4949551706447121, "grad_norm": 0.30700334906578064, "learning_rate": 4.249766157626409e-05, "loss": 0.0929, "num_input_tokens_seen": 13140576, "step": 10130 }, { "epoch": 0.49519947230840644, "grad_norm": 0.25884875655174255, "learning_rate": 4.249064824630684e-05, "loss": 0.083, "num_input_tokens_seen": 13147040, "step": 10135 }, { "epoch": 0.49544377397210076, "grad_norm": 0.2132061868906021, "learning_rate": 4.248363221907183e-05, "loss": 0.0662, "num_input_tokens_seen": 13154016, "step": 10140 }, { "epoch": 0.4956880756357951, "grad_norm": 0.5937870144844055, "learning_rate": 4.2476613495641026e-05, "loss": 0.0851, "num_input_tokens_seen": 13160288, "step": 10145 }, { "epoch": 0.4959323772994894, "grad_norm": 0.27000099420547485, "learning_rate": 4.246959207709679e-05, "loss": 0.0845, "num_input_tokens_seen": 13166752, "step": 10150 }, { "epoch": 0.49617667896318374, "grad_norm": 0.43536534905433655, "learning_rate": 4.246256796452192e-05, "loss": 0.0872, "num_input_tokens_seen": 13173632, "step": 10155 }, { "epoch": 0.49642098062687806, "grad_norm": 0.22728580236434937, "learning_rate": 4.245554115899962e-05, "loss": 0.103, "num_input_tokens_seen": 13180512, "step": 10160 }, { "epoch": 0.4966652822905724, "grad_norm": 0.2925064265727997, "learning_rate": 4.2448511661613514e-05, "loss": 0.0783, "num_input_tokens_seen": 13186752, "step": 10165 }, { "epoch": 0.4969095839542667, "grad_norm": 0.19624380767345428, "learning_rate": 4.2441479473447635e-05, "loss": 0.0978, "num_input_tokens_seen": 13193152, "step": 10170 }, { "epoch": 0.49715388561796103, "grad_norm": 0.2194245159626007, "learning_rate": 4.243444459558644e-05, "loss": 0.1011, "num_input_tokens_seen": 13199456, "step": 10175 }, { "epoch": 0.4973981872816554, "grad_norm": 0.27635353803634644, "learning_rate": 4.24274070291148e-05, "loss": 0.1013, "num_input_tokens_seen": 13205728, "step": 10180 }, { "epoch": 0.49764248894534974, "grad_norm": 0.2255016416311264, "learning_rate": 4.242036677511798e-05, "loss": 0.0706, "num_input_tokens_seen": 13212000, "step": 10185 }, { "epoch": 0.49788679060904406, "grad_norm": 0.6583988666534424, "learning_rate": 4.241332383468169e-05, "loss": 0.1184, "num_input_tokens_seen": 13218048, "step": 10190 }, { "epoch": 0.4981310922727384, "grad_norm": 0.5318723320960999, "learning_rate": 4.2406278208892034e-05, "loss": 0.1001, "num_input_tokens_seen": 13224640, "step": 10195 }, { "epoch": 0.4983753939364327, "grad_norm": 0.5570424795150757, "learning_rate": 4.2399229898835536e-05, "loss": 0.0785, "num_input_tokens_seen": 13231360, "step": 10200 }, { "epoch": 0.4983753939364327, "eval_loss": 0.09344924241304398, "eval_runtime": 375.387, "eval_samples_per_second": 96.927, "eval_steps_per_second": 24.234, "num_input_tokens_seen": 13231360, "step": 10200 }, { "epoch": 0.49861969560012703, "grad_norm": 0.3778485059738159, "learning_rate": 4.239217890559914e-05, "loss": 0.1073, "num_input_tokens_seen": 13237600, "step": 10205 }, { "epoch": 0.49886399726382136, "grad_norm": 0.16686733067035675, "learning_rate": 4.238512523027019e-05, "loss": 0.0972, "num_input_tokens_seen": 13244256, "step": 10210 }, { "epoch": 0.4991082989275157, "grad_norm": 0.34980377554893494, "learning_rate": 4.237806887393645e-05, "loss": 0.0803, "num_input_tokens_seen": 13251136, "step": 10215 }, { "epoch": 0.49935260059121, "grad_norm": 0.4166311025619507, "learning_rate": 4.237100983768611e-05, "loss": 0.0854, "num_input_tokens_seen": 13257536, "step": 10220 }, { "epoch": 0.49959690225490433, "grad_norm": 0.15603716671466827, "learning_rate": 4.2363948122607756e-05, "loss": 0.0969, "num_input_tokens_seen": 13264000, "step": 10225 }, { "epoch": 0.4998412039185987, "grad_norm": 0.3983118236064911, "learning_rate": 4.235688372979039e-05, "loss": 0.0856, "num_input_tokens_seen": 13270560, "step": 10230 }, { "epoch": 0.500085505582293, "grad_norm": 0.3592761754989624, "learning_rate": 4.234981666032343e-05, "loss": 0.0869, "num_input_tokens_seen": 13276864, "step": 10235 }, { "epoch": 0.5003298072459873, "grad_norm": 0.20105214416980743, "learning_rate": 4.2342746915296704e-05, "loss": 0.1181, "num_input_tokens_seen": 13283360, "step": 10240 }, { "epoch": 0.5005741089096817, "grad_norm": 0.2702498137950897, "learning_rate": 4.233567449580047e-05, "loss": 0.0749, "num_input_tokens_seen": 13290048, "step": 10245 }, { "epoch": 0.500818410573376, "grad_norm": 0.4845500886440277, "learning_rate": 4.232859940292537e-05, "loss": 0.1073, "num_input_tokens_seen": 13296032, "step": 10250 }, { "epoch": 0.5010627122370703, "grad_norm": 0.3689948618412018, "learning_rate": 4.232152163776248e-05, "loss": 0.1017, "num_input_tokens_seen": 13303200, "step": 10255 }, { "epoch": 0.5013070139007647, "grad_norm": 0.22100898623466492, "learning_rate": 4.231444120140328e-05, "loss": 0.1041, "num_input_tokens_seen": 13309504, "step": 10260 }, { "epoch": 0.501551315564459, "grad_norm": 0.2999191880226135, "learning_rate": 4.230735809493967e-05, "loss": 0.1273, "num_input_tokens_seen": 13315712, "step": 10265 }, { "epoch": 0.5017956172281534, "grad_norm": 0.2901536524295807, "learning_rate": 4.2300272319463926e-05, "loss": 0.0962, "num_input_tokens_seen": 13322272, "step": 10270 }, { "epoch": 0.5020399188918476, "grad_norm": 0.07775488495826721, "learning_rate": 4.2293183876068786e-05, "loss": 0.0808, "num_input_tokens_seen": 13328416, "step": 10275 }, { "epoch": 0.502284220555542, "grad_norm": 0.21148736774921417, "learning_rate": 4.228609276584737e-05, "loss": 0.0797, "num_input_tokens_seen": 13335296, "step": 10280 }, { "epoch": 0.5025285222192363, "grad_norm": 0.2514142096042633, "learning_rate": 4.227899898989323e-05, "loss": 0.0827, "num_input_tokens_seen": 13342496, "step": 10285 }, { "epoch": 0.5027728238829307, "grad_norm": 0.3076842129230499, "learning_rate": 4.2271902549300293e-05, "loss": 0.0716, "num_input_tokens_seen": 13348576, "step": 10290 }, { "epoch": 0.5030171255466249, "grad_norm": 0.202505961060524, "learning_rate": 4.226480344516294e-05, "loss": 0.0886, "num_input_tokens_seen": 13355008, "step": 10295 }, { "epoch": 0.5032614272103193, "grad_norm": 0.2959975600242615, "learning_rate": 4.2257701678575925e-05, "loss": 0.0888, "num_input_tokens_seen": 13361728, "step": 10300 }, { "epoch": 0.5035057288740137, "grad_norm": 0.16230130195617676, "learning_rate": 4.225059725063444e-05, "loss": 0.0995, "num_input_tokens_seen": 13368032, "step": 10305 }, { "epoch": 0.503750030537708, "grad_norm": 0.29194116592407227, "learning_rate": 4.2243490162434074e-05, "loss": 0.0973, "num_input_tokens_seen": 13374752, "step": 10310 }, { "epoch": 0.5039943322014023, "grad_norm": 0.1442253738641739, "learning_rate": 4.223638041507083e-05, "loss": 0.0741, "num_input_tokens_seen": 13380800, "step": 10315 }, { "epoch": 0.5042386338650966, "grad_norm": 0.12347207963466644, "learning_rate": 4.2229268009641124e-05, "loss": 0.0814, "num_input_tokens_seen": 13387200, "step": 10320 }, { "epoch": 0.504482935528791, "grad_norm": 0.42031097412109375, "learning_rate": 4.222215294724177e-05, "loss": 0.1033, "num_input_tokens_seen": 13393920, "step": 10325 }, { "epoch": 0.5047272371924852, "grad_norm": 0.38308417797088623, "learning_rate": 4.2215035228970005e-05, "loss": 0.0923, "num_input_tokens_seen": 13400736, "step": 10330 }, { "epoch": 0.5049715388561796, "grad_norm": 0.44037532806396484, "learning_rate": 4.2207914855923464e-05, "loss": 0.1276, "num_input_tokens_seen": 13407424, "step": 10335 }, { "epoch": 0.5052158405198739, "grad_norm": 0.2891870141029358, "learning_rate": 4.220079182920021e-05, "loss": 0.1021, "num_input_tokens_seen": 13414432, "step": 10340 }, { "epoch": 0.5054601421835683, "grad_norm": 0.23073194921016693, "learning_rate": 4.2193666149898705e-05, "loss": 0.0737, "num_input_tokens_seen": 13420960, "step": 10345 }, { "epoch": 0.5057044438472627, "grad_norm": 0.18326066434383392, "learning_rate": 4.21865378191178e-05, "loss": 0.143, "num_input_tokens_seen": 13427168, "step": 10350 }, { "epoch": 0.5059487455109569, "grad_norm": 0.12741072475910187, "learning_rate": 4.217940683795678e-05, "loss": 0.0701, "num_input_tokens_seen": 13433568, "step": 10355 }, { "epoch": 0.5061930471746513, "grad_norm": 0.3001338243484497, "learning_rate": 4.217227320751534e-05, "loss": 0.0957, "num_input_tokens_seen": 13439648, "step": 10360 }, { "epoch": 0.5064373488383456, "grad_norm": 0.14977987110614777, "learning_rate": 4.216513692889358e-05, "loss": 0.0857, "num_input_tokens_seen": 13446272, "step": 10365 }, { "epoch": 0.50668165050204, "grad_norm": 0.3913434147834778, "learning_rate": 4.215799800319199e-05, "loss": 0.0759, "num_input_tokens_seen": 13452672, "step": 10370 }, { "epoch": 0.5069259521657342, "grad_norm": 0.3866802453994751, "learning_rate": 4.2150856431511485e-05, "loss": 0.0993, "num_input_tokens_seen": 13459040, "step": 10375 }, { "epoch": 0.5071702538294286, "grad_norm": 0.1975667029619217, "learning_rate": 4.214371221495339e-05, "loss": 0.0755, "num_input_tokens_seen": 13465824, "step": 10380 }, { "epoch": 0.5074145554931229, "grad_norm": 0.29508763551712036, "learning_rate": 4.213656535461942e-05, "loss": 0.1134, "num_input_tokens_seen": 13471904, "step": 10385 }, { "epoch": 0.5076588571568172, "grad_norm": 0.6374650001525879, "learning_rate": 4.2129415851611734e-05, "loss": 0.1029, "num_input_tokens_seen": 13477856, "step": 10390 }, { "epoch": 0.5079031588205115, "grad_norm": 0.9439045190811157, "learning_rate": 4.2122263707032855e-05, "loss": 0.1176, "num_input_tokens_seen": 13483552, "step": 10395 }, { "epoch": 0.5081474604842059, "grad_norm": 0.11875420808792114, "learning_rate": 4.211510892198574e-05, "loss": 0.0688, "num_input_tokens_seen": 13489760, "step": 10400 }, { "epoch": 0.5081474604842059, "eval_loss": 0.09367989748716354, "eval_runtime": 375.1946, "eval_samples_per_second": 96.976, "eval_steps_per_second": 24.246, "num_input_tokens_seen": 13489760, "step": 10400 }, { "epoch": 0.5083917621479003, "grad_norm": 0.46006524562835693, "learning_rate": 4.210795149757375e-05, "loss": 0.0913, "num_input_tokens_seen": 13496160, "step": 10405 }, { "epoch": 0.5086360638115945, "grad_norm": 0.15087682008743286, "learning_rate": 4.210079143490065e-05, "loss": 0.064, "num_input_tokens_seen": 13503072, "step": 10410 }, { "epoch": 0.5088803654752889, "grad_norm": 0.4283623993396759, "learning_rate": 4.2093628735070604e-05, "loss": 0.107, "num_input_tokens_seen": 13509120, "step": 10415 }, { "epoch": 0.5091246671389832, "grad_norm": 0.164418026804924, "learning_rate": 4.208646339918819e-05, "loss": 0.0789, "num_input_tokens_seen": 13515616, "step": 10420 }, { "epoch": 0.5093689688026776, "grad_norm": 0.37582072615623474, "learning_rate": 4.2079295428358414e-05, "loss": 0.1024, "num_input_tokens_seen": 13522240, "step": 10425 }, { "epoch": 0.5096132704663718, "grad_norm": 0.17374710738658905, "learning_rate": 4.207212482368664e-05, "loss": 0.0882, "num_input_tokens_seen": 13528512, "step": 10430 }, { "epoch": 0.5098575721300662, "grad_norm": 0.787684440612793, "learning_rate": 4.206495158627867e-05, "loss": 0.131, "num_input_tokens_seen": 13534528, "step": 10435 }, { "epoch": 0.5101018737937605, "grad_norm": 0.17978434264659882, "learning_rate": 4.205777571724073e-05, "loss": 0.0744, "num_input_tokens_seen": 13541120, "step": 10440 }, { "epoch": 0.5103461754574549, "grad_norm": 0.4742504954338074, "learning_rate": 4.20505972176794e-05, "loss": 0.1051, "num_input_tokens_seen": 13547904, "step": 10445 }, { "epoch": 0.5105904771211492, "grad_norm": 0.19655387103557587, "learning_rate": 4.204341608870171e-05, "loss": 0.0953, "num_input_tokens_seen": 13554400, "step": 10450 }, { "epoch": 0.5108347787848435, "grad_norm": 0.42384085059165955, "learning_rate": 4.203623233141508e-05, "loss": 0.0714, "num_input_tokens_seen": 13560864, "step": 10455 }, { "epoch": 0.5110790804485379, "grad_norm": 0.41217041015625, "learning_rate": 4.2029045946927334e-05, "loss": 0.1099, "num_input_tokens_seen": 13567264, "step": 10460 }, { "epoch": 0.5113233821122322, "grad_norm": 0.15557676553726196, "learning_rate": 4.20218569363467e-05, "loss": 0.0869, "num_input_tokens_seen": 13573568, "step": 10465 }, { "epoch": 0.5115676837759265, "grad_norm": 0.3383007049560547, "learning_rate": 4.2014665300781834e-05, "loss": 0.0796, "num_input_tokens_seen": 13579552, "step": 10470 }, { "epoch": 0.5118119854396208, "grad_norm": 0.2098289579153061, "learning_rate": 4.200747104134174e-05, "loss": 0.103, "num_input_tokens_seen": 13586080, "step": 10475 }, { "epoch": 0.5120562871033152, "grad_norm": 0.30448129773139954, "learning_rate": 4.200027415913588e-05, "loss": 0.0816, "num_input_tokens_seen": 13592928, "step": 10480 }, { "epoch": 0.5123005887670095, "grad_norm": 0.4644514322280884, "learning_rate": 4.1993074655274126e-05, "loss": 0.0822, "num_input_tokens_seen": 13600064, "step": 10485 }, { "epoch": 0.5125448904307038, "grad_norm": 0.44283097982406616, "learning_rate": 4.198587253086669e-05, "loss": 0.1135, "num_input_tokens_seen": 13606880, "step": 10490 }, { "epoch": 0.5127891920943982, "grad_norm": 0.6517175436019897, "learning_rate": 4.197866778702426e-05, "loss": 0.083, "num_input_tokens_seen": 13613216, "step": 10495 }, { "epoch": 0.5130334937580925, "grad_norm": 0.12171350419521332, "learning_rate": 4.197146042485789e-05, "loss": 0.0661, "num_input_tokens_seen": 13619776, "step": 10500 }, { "epoch": 0.5132777954217869, "grad_norm": 0.3777287006378174, "learning_rate": 4.1964250445479046e-05, "loss": 0.0745, "num_input_tokens_seen": 13626400, "step": 10505 }, { "epoch": 0.5135220970854811, "grad_norm": 0.43098026514053345, "learning_rate": 4.19570378499996e-05, "loss": 0.0823, "num_input_tokens_seen": 13632480, "step": 10510 }, { "epoch": 0.5137663987491755, "grad_norm": 0.22120021283626556, "learning_rate": 4.194982263953182e-05, "loss": 0.1007, "num_input_tokens_seen": 13639456, "step": 10515 }, { "epoch": 0.5140107004128698, "grad_norm": 0.16194121539592743, "learning_rate": 4.194260481518838e-05, "loss": 0.0963, "num_input_tokens_seen": 13645600, "step": 10520 }, { "epoch": 0.5142550020765642, "grad_norm": 0.26760828495025635, "learning_rate": 4.1935384378082366e-05, "loss": 0.0719, "num_input_tokens_seen": 13651904, "step": 10525 }, { "epoch": 0.5144993037402584, "grad_norm": 0.3831353783607483, "learning_rate": 4.1928161329327267e-05, "loss": 0.0858, "num_input_tokens_seen": 13658336, "step": 10530 }, { "epoch": 0.5147436054039528, "grad_norm": 0.7448642253875732, "learning_rate": 4.1920935670036945e-05, "loss": 0.106, "num_input_tokens_seen": 13664480, "step": 10535 }, { "epoch": 0.5149879070676471, "grad_norm": 0.2156420350074768, "learning_rate": 4.1913707401325705e-05, "loss": 0.1003, "num_input_tokens_seen": 13670720, "step": 10540 }, { "epoch": 0.5152322087313415, "grad_norm": 0.13368447124958038, "learning_rate": 4.1906476524308235e-05, "loss": 0.1005, "num_input_tokens_seen": 13677152, "step": 10545 }, { "epoch": 0.5154765103950358, "grad_norm": 0.5343707799911499, "learning_rate": 4.189924304009962e-05, "loss": 0.1203, "num_input_tokens_seen": 13683616, "step": 10550 }, { "epoch": 0.5157208120587301, "grad_norm": 0.20330122113227844, "learning_rate": 4.189200694981537e-05, "loss": 0.0937, "num_input_tokens_seen": 13689696, "step": 10555 }, { "epoch": 0.5159651137224245, "grad_norm": 0.21063467860221863, "learning_rate": 4.188476825457136e-05, "loss": 0.1122, "num_input_tokens_seen": 13696416, "step": 10560 }, { "epoch": 0.5162094153861188, "grad_norm": 0.3971911668777466, "learning_rate": 4.18775269554839e-05, "loss": 0.1085, "num_input_tokens_seen": 13703136, "step": 10565 }, { "epoch": 0.5164537170498131, "grad_norm": 0.44575726985931396, "learning_rate": 4.187028305366969e-05, "loss": 0.0859, "num_input_tokens_seen": 13709216, "step": 10570 }, { "epoch": 0.5166980187135074, "grad_norm": 2.993875741958618, "learning_rate": 4.1863036550245824e-05, "loss": 0.1131, "num_input_tokens_seen": 13716480, "step": 10575 }, { "epoch": 0.5169423203772018, "grad_norm": 0.1847839057445526, "learning_rate": 4.1855787446329806e-05, "loss": 0.0732, "num_input_tokens_seen": 13723936, "step": 10580 }, { "epoch": 0.517186622040896, "grad_norm": 0.1580183506011963, "learning_rate": 4.184853574303955e-05, "loss": 0.1198, "num_input_tokens_seen": 13730592, "step": 10585 }, { "epoch": 0.5174309237045904, "grad_norm": 0.16050492227077484, "learning_rate": 4.184128144149334e-05, "loss": 0.1067, "num_input_tokens_seen": 13737152, "step": 10590 }, { "epoch": 0.5176752253682848, "grad_norm": 0.24482347071170807, "learning_rate": 4.1834024542809896e-05, "loss": 0.103, "num_input_tokens_seen": 13743808, "step": 10595 }, { "epoch": 0.5179195270319791, "grad_norm": 0.3342719078063965, "learning_rate": 4.1826765048108315e-05, "loss": 0.083, "num_input_tokens_seen": 13750592, "step": 10600 }, { "epoch": 0.5179195270319791, "eval_loss": 0.09333345293998718, "eval_runtime": 375.6308, "eval_samples_per_second": 96.864, "eval_steps_per_second": 24.218, "num_input_tokens_seen": 13750592, "step": 10600 }, { "epoch": 0.5181638286956735, "grad_norm": 0.25275689363479614, "learning_rate": 4.181950295850811e-05, "loss": 0.0864, "num_input_tokens_seen": 13756896, "step": 10605 }, { "epoch": 0.5184081303593677, "grad_norm": 0.4827510714530945, "learning_rate": 4.181223827512918e-05, "loss": 0.115, "num_input_tokens_seen": 13763328, "step": 10610 }, { "epoch": 0.5186524320230621, "grad_norm": 0.4642776846885681, "learning_rate": 4.180497099909183e-05, "loss": 0.0846, "num_input_tokens_seen": 13769696, "step": 10615 }, { "epoch": 0.5188967336867564, "grad_norm": 0.22699744999408722, "learning_rate": 4.179770113151677e-05, "loss": 0.0868, "num_input_tokens_seen": 13775808, "step": 10620 }, { "epoch": 0.5191410353504508, "grad_norm": 0.2951979637145996, "learning_rate": 4.179042867352511e-05, "loss": 0.0957, "num_input_tokens_seen": 13781952, "step": 10625 }, { "epoch": 0.519385337014145, "grad_norm": 0.25118088722229004, "learning_rate": 4.1783153626238334e-05, "loss": 0.0934, "num_input_tokens_seen": 13788064, "step": 10630 }, { "epoch": 0.5196296386778394, "grad_norm": 0.16465625166893005, "learning_rate": 4.177587599077836e-05, "loss": 0.0836, "num_input_tokens_seen": 13794112, "step": 10635 }, { "epoch": 0.5198739403415338, "grad_norm": 0.6097593903541565, "learning_rate": 4.1768595768267494e-05, "loss": 0.0893, "num_input_tokens_seen": 13800736, "step": 10640 }, { "epoch": 0.520118242005228, "grad_norm": 0.24734041094779968, "learning_rate": 4.176131295982843e-05, "loss": 0.0818, "num_input_tokens_seen": 13807008, "step": 10645 }, { "epoch": 0.5203625436689224, "grad_norm": 0.1366853415966034, "learning_rate": 4.1754027566584276e-05, "loss": 0.0866, "num_input_tokens_seen": 13813760, "step": 10650 }, { "epoch": 0.5206068453326167, "grad_norm": 0.2725757956504822, "learning_rate": 4.174673958965852e-05, "loss": 0.0695, "num_input_tokens_seen": 13820224, "step": 10655 }, { "epoch": 0.5208511469963111, "grad_norm": 0.203365758061409, "learning_rate": 4.173944903017507e-05, "loss": 0.115, "num_input_tokens_seen": 13826976, "step": 10660 }, { "epoch": 0.5210954486600053, "grad_norm": 0.24005752801895142, "learning_rate": 4.173215588925822e-05, "loss": 0.0899, "num_input_tokens_seen": 13833184, "step": 10665 }, { "epoch": 0.5213397503236997, "grad_norm": 0.17574109137058258, "learning_rate": 4.172486016803266e-05, "loss": 0.1022, "num_input_tokens_seen": 13839936, "step": 10670 }, { "epoch": 0.521584051987394, "grad_norm": 0.46165820956230164, "learning_rate": 4.171756186762349e-05, "loss": 0.0831, "num_input_tokens_seen": 13846144, "step": 10675 }, { "epoch": 0.5218283536510884, "grad_norm": 0.41700562834739685, "learning_rate": 4.171026098915619e-05, "loss": 0.0816, "num_input_tokens_seen": 13852736, "step": 10680 }, { "epoch": 0.5220726553147826, "grad_norm": 0.2526512145996094, "learning_rate": 4.170295753375665e-05, "loss": 0.0732, "num_input_tokens_seen": 13859552, "step": 10685 }, { "epoch": 0.522316956978477, "grad_norm": 0.1935499608516693, "learning_rate": 4.169565150255117e-05, "loss": 0.0885, "num_input_tokens_seen": 13866016, "step": 10690 }, { "epoch": 0.5225612586421714, "grad_norm": 0.16316191852092743, "learning_rate": 4.16883428966664e-05, "loss": 0.0774, "num_input_tokens_seen": 13872160, "step": 10695 }, { "epoch": 0.5228055603058657, "grad_norm": 0.291038453578949, "learning_rate": 4.168103171722944e-05, "loss": 0.1058, "num_input_tokens_seen": 13878368, "step": 10700 }, { "epoch": 0.52304986196956, "grad_norm": 0.19341324269771576, "learning_rate": 4.167371796536777e-05, "loss": 0.0879, "num_input_tokens_seen": 13884928, "step": 10705 }, { "epoch": 0.5232941636332543, "grad_norm": 0.38451725244522095, "learning_rate": 4.166640164220924e-05, "loss": 0.099, "num_input_tokens_seen": 13891360, "step": 10710 }, { "epoch": 0.5235384652969487, "grad_norm": 0.33223941922187805, "learning_rate": 4.1659082748882144e-05, "loss": 0.0999, "num_input_tokens_seen": 13898272, "step": 10715 }, { "epoch": 0.523782766960643, "grad_norm": 0.2195337563753128, "learning_rate": 4.1651761286515135e-05, "loss": 0.0748, "num_input_tokens_seen": 13904992, "step": 10720 }, { "epoch": 0.5240270686243373, "grad_norm": 0.37428662180900574, "learning_rate": 4.164443725623728e-05, "loss": 0.0875, "num_input_tokens_seen": 13911744, "step": 10725 }, { "epoch": 0.5242713702880316, "grad_norm": 0.27855271100997925, "learning_rate": 4.163711065917802e-05, "loss": 0.0965, "num_input_tokens_seen": 13918112, "step": 10730 }, { "epoch": 0.524515671951726, "grad_norm": 0.810528039932251, "learning_rate": 4.1629781496467234e-05, "loss": 0.0924, "num_input_tokens_seen": 13924224, "step": 10735 }, { "epoch": 0.5247599736154204, "grad_norm": 0.8225935697555542, "learning_rate": 4.1622449769235164e-05, "loss": 0.1237, "num_input_tokens_seen": 13930624, "step": 10740 }, { "epoch": 0.5250042752791146, "grad_norm": 0.20385393500328064, "learning_rate": 4.161511547861243e-05, "loss": 0.0841, "num_input_tokens_seen": 13936992, "step": 10745 }, { "epoch": 0.525248576942809, "grad_norm": 0.17294315993785858, "learning_rate": 4.1607778625730104e-05, "loss": 0.1177, "num_input_tokens_seen": 13943360, "step": 10750 }, { "epoch": 0.5254928786065033, "grad_norm": 0.4402948021888733, "learning_rate": 4.160043921171961e-05, "loss": 0.101, "num_input_tokens_seen": 13949792, "step": 10755 }, { "epoch": 0.5257371802701977, "grad_norm": 0.42175811529159546, "learning_rate": 4.159309723771276e-05, "loss": 0.0884, "num_input_tokens_seen": 13956480, "step": 10760 }, { "epoch": 0.5259814819338919, "grad_norm": 0.2721147835254669, "learning_rate": 4.158575270484181e-05, "loss": 0.0852, "num_input_tokens_seen": 13962688, "step": 10765 }, { "epoch": 0.5262257835975863, "grad_norm": 0.16704389452934265, "learning_rate": 4.157840561423936e-05, "loss": 0.1055, "num_input_tokens_seen": 13969888, "step": 10770 }, { "epoch": 0.5264700852612806, "grad_norm": 0.1397521048784256, "learning_rate": 4.1571055967038416e-05, "loss": 0.0759, "num_input_tokens_seen": 13976768, "step": 10775 }, { "epoch": 0.526714386924975, "grad_norm": 0.5234789252281189, "learning_rate": 4.156370376437241e-05, "loss": 0.0986, "num_input_tokens_seen": 13983360, "step": 10780 }, { "epoch": 0.5269586885886693, "grad_norm": 0.17798008024692535, "learning_rate": 4.155634900737513e-05, "loss": 0.1092, "num_input_tokens_seen": 13989504, "step": 10785 }, { "epoch": 0.5272029902523636, "grad_norm": 0.2402227222919464, "learning_rate": 4.1548991697180764e-05, "loss": 0.0808, "num_input_tokens_seen": 13996320, "step": 10790 }, { "epoch": 0.527447291916058, "grad_norm": 0.24523070454597473, "learning_rate": 4.1541631834923914e-05, "loss": 0.0822, "num_input_tokens_seen": 14003040, "step": 10795 }, { "epoch": 0.5276915935797523, "grad_norm": 0.14810673892498016, "learning_rate": 4.153426942173956e-05, "loss": 0.0866, "num_input_tokens_seen": 14009088, "step": 10800 }, { "epoch": 0.5276915935797523, "eval_loss": 0.0925857201218605, "eval_runtime": 374.6771, "eval_samples_per_second": 97.11, "eval_steps_per_second": 24.28, "num_input_tokens_seen": 14009088, "step": 10800 }, { "epoch": 0.5279358952434466, "grad_norm": 0.25152388215065, "learning_rate": 4.152690445876308e-05, "loss": 0.0781, "num_input_tokens_seen": 14015776, "step": 10805 }, { "epoch": 0.5281801969071409, "grad_norm": 0.15968918800354004, "learning_rate": 4.1519536947130245e-05, "loss": 0.0754, "num_input_tokens_seen": 14022144, "step": 10810 }, { "epoch": 0.5284244985708353, "grad_norm": 0.21062858402729034, "learning_rate": 4.151216688797722e-05, "loss": 0.0806, "num_input_tokens_seen": 14028864, "step": 10815 }, { "epoch": 0.5286688002345296, "grad_norm": 0.278475284576416, "learning_rate": 4.150479428244054e-05, "loss": 0.0851, "num_input_tokens_seen": 14035392, "step": 10820 }, { "epoch": 0.5289131018982239, "grad_norm": 0.43620890378952026, "learning_rate": 4.1497419131657176e-05, "loss": 0.071, "num_input_tokens_seen": 14041888, "step": 10825 }, { "epoch": 0.5291574035619182, "grad_norm": 0.11318830400705338, "learning_rate": 4.149004143676447e-05, "loss": 0.0844, "num_input_tokens_seen": 14048768, "step": 10830 }, { "epoch": 0.5294017052256126, "grad_norm": 0.22160857915878296, "learning_rate": 4.148266119890015e-05, "loss": 0.1096, "num_input_tokens_seen": 14054944, "step": 10835 }, { "epoch": 0.529646006889307, "grad_norm": 0.49474212527275085, "learning_rate": 4.1475278419202324e-05, "loss": 0.0988, "num_input_tokens_seen": 14061568, "step": 10840 }, { "epoch": 0.5298903085530012, "grad_norm": 0.3187461793422699, "learning_rate": 4.146789309880953e-05, "loss": 0.0786, "num_input_tokens_seen": 14067872, "step": 10845 }, { "epoch": 0.5301346102166956, "grad_norm": 0.16401275992393494, "learning_rate": 4.146050523886068e-05, "loss": 0.1062, "num_input_tokens_seen": 14073824, "step": 10850 }, { "epoch": 0.5303789118803899, "grad_norm": 0.5855420231819153, "learning_rate": 4.1453114840495055e-05, "loss": 0.0958, "num_input_tokens_seen": 14079968, "step": 10855 }, { "epoch": 0.5306232135440843, "grad_norm": 0.4398351311683655, "learning_rate": 4.1445721904852364e-05, "loss": 0.0885, "num_input_tokens_seen": 14087040, "step": 10860 }, { "epoch": 0.5308675152077785, "grad_norm": 0.5937902927398682, "learning_rate": 4.143832643307269e-05, "loss": 0.1094, "num_input_tokens_seen": 14094432, "step": 10865 }, { "epoch": 0.5311118168714729, "grad_norm": 0.6075465083122253, "learning_rate": 4.1430928426296503e-05, "loss": 0.1045, "num_input_tokens_seen": 14100480, "step": 10870 }, { "epoch": 0.5313561185351672, "grad_norm": 0.6342405676841736, "learning_rate": 4.142352788566466e-05, "loss": 0.0918, "num_input_tokens_seen": 14106848, "step": 10875 }, { "epoch": 0.5316004201988616, "grad_norm": 0.24179932475090027, "learning_rate": 4.1416124812318424e-05, "loss": 0.0828, "num_input_tokens_seen": 14112800, "step": 10880 }, { "epoch": 0.5318447218625559, "grad_norm": 0.3297145664691925, "learning_rate": 4.1408719207399453e-05, "loss": 0.0904, "num_input_tokens_seen": 14119328, "step": 10885 }, { "epoch": 0.5320890235262502, "grad_norm": 0.3997986912727356, "learning_rate": 4.140131107204978e-05, "loss": 0.112, "num_input_tokens_seen": 14126080, "step": 10890 }, { "epoch": 0.5323333251899446, "grad_norm": 0.41833773255348206, "learning_rate": 4.139390040741182e-05, "loss": 0.0709, "num_input_tokens_seen": 14132320, "step": 10895 }, { "epoch": 0.5325776268536389, "grad_norm": 0.13428843021392822, "learning_rate": 4.1386487214628396e-05, "loss": 0.0605, "num_input_tokens_seen": 14138464, "step": 10900 }, { "epoch": 0.5328219285173332, "grad_norm": 0.6000746488571167, "learning_rate": 4.137907149484272e-05, "loss": 0.0901, "num_input_tokens_seen": 14144768, "step": 10905 }, { "epoch": 0.5330662301810275, "grad_norm": 0.21440526843070984, "learning_rate": 4.137165324919839e-05, "loss": 0.0869, "num_input_tokens_seen": 14151584, "step": 10910 }, { "epoch": 0.5333105318447219, "grad_norm": 0.2066512554883957, "learning_rate": 4.136423247883939e-05, "loss": 0.0814, "num_input_tokens_seen": 14158400, "step": 10915 }, { "epoch": 0.5335548335084161, "grad_norm": 0.3579169809818268, "learning_rate": 4.135680918491009e-05, "loss": 0.1128, "num_input_tokens_seen": 14164608, "step": 10920 }, { "epoch": 0.5337991351721105, "grad_norm": 0.14876990020275116, "learning_rate": 4.1349383368555265e-05, "loss": 0.098, "num_input_tokens_seen": 14171232, "step": 10925 }, { "epoch": 0.5340434368358048, "grad_norm": 0.35173624753952026, "learning_rate": 4.1341955030920065e-05, "loss": 0.0768, "num_input_tokens_seen": 14177696, "step": 10930 }, { "epoch": 0.5342877384994992, "grad_norm": 0.25140032172203064, "learning_rate": 4.1334524173150036e-05, "loss": 0.0936, "num_input_tokens_seen": 14184544, "step": 10935 }, { "epoch": 0.5345320401631936, "grad_norm": 0.4708119332790375, "learning_rate": 4.13270907963911e-05, "loss": 0.076, "num_input_tokens_seen": 14190784, "step": 10940 }, { "epoch": 0.5347763418268878, "grad_norm": 0.21016691625118256, "learning_rate": 4.131965490178959e-05, "loss": 0.0992, "num_input_tokens_seen": 14196896, "step": 10945 }, { "epoch": 0.5350206434905822, "grad_norm": 0.4209407567977905, "learning_rate": 4.131221649049222e-05, "loss": 0.069, "num_input_tokens_seen": 14203424, "step": 10950 }, { "epoch": 0.5352649451542765, "grad_norm": 0.2625023424625397, "learning_rate": 4.130477556364606e-05, "loss": 0.0582, "num_input_tokens_seen": 14210080, "step": 10955 }, { "epoch": 0.5355092468179709, "grad_norm": 0.17516709864139557, "learning_rate": 4.129733212239861e-05, "loss": 0.0901, "num_input_tokens_seen": 14216160, "step": 10960 }, { "epoch": 0.5357535484816651, "grad_norm": 0.12092161178588867, "learning_rate": 4.128988616789774e-05, "loss": 0.0773, "num_input_tokens_seen": 14222592, "step": 10965 }, { "epoch": 0.5359978501453595, "grad_norm": 0.11239384114742279, "learning_rate": 4.1282437701291724e-05, "loss": 0.0735, "num_input_tokens_seen": 14229664, "step": 10970 }, { "epoch": 0.5362421518090538, "grad_norm": 0.2616156339645386, "learning_rate": 4.1274986723729184e-05, "loss": 0.0743, "num_input_tokens_seen": 14235968, "step": 10975 }, { "epoch": 0.5364864534727481, "grad_norm": 0.5123366117477417, "learning_rate": 4.126753323635917e-05, "loss": 0.0996, "num_input_tokens_seen": 14242592, "step": 10980 }, { "epoch": 0.5367307551364425, "grad_norm": 0.21469774842262268, "learning_rate": 4.12600772403311e-05, "loss": 0.0852, "num_input_tokens_seen": 14249056, "step": 10985 }, { "epoch": 0.5369750568001368, "grad_norm": 0.2216307520866394, "learning_rate": 4.125261873679479e-05, "loss": 0.0891, "num_input_tokens_seen": 14255488, "step": 10990 }, { "epoch": 0.5372193584638312, "grad_norm": 0.27127641439437866, "learning_rate": 4.124515772690042e-05, "loss": 0.0903, "num_input_tokens_seen": 14262080, "step": 10995 }, { "epoch": 0.5374636601275254, "grad_norm": 0.2076992243528366, "learning_rate": 4.123769421179858e-05, "loss": 0.08, "num_input_tokens_seen": 14268352, "step": 11000 }, { "epoch": 0.5374636601275254, "eval_loss": 0.09249327331781387, "eval_runtime": 375.1552, "eval_samples_per_second": 96.987, "eval_steps_per_second": 24.249, "num_input_tokens_seen": 14268352, "step": 11000 }, { "epoch": 0.5377079617912198, "grad_norm": 0.531017005443573, "learning_rate": 4.1230228192640236e-05, "loss": 0.0737, "num_input_tokens_seen": 14274656, "step": 11005 }, { "epoch": 0.5379522634549141, "grad_norm": 0.26887866854667664, "learning_rate": 4.122275967057675e-05, "loss": 0.1049, "num_input_tokens_seen": 14280672, "step": 11010 }, { "epoch": 0.5381965651186085, "grad_norm": 0.4108617901802063, "learning_rate": 4.1215288646759846e-05, "loss": 0.122, "num_input_tokens_seen": 14287104, "step": 11015 }, { "epoch": 0.5384408667823027, "grad_norm": 0.3317992389202118, "learning_rate": 4.120781512234166e-05, "loss": 0.0797, "num_input_tokens_seen": 14293504, "step": 11020 }, { "epoch": 0.5386851684459971, "grad_norm": 0.5340105891227722, "learning_rate": 4.120033909847471e-05, "loss": 0.1135, "num_input_tokens_seen": 14299328, "step": 11025 }, { "epoch": 0.5389294701096915, "grad_norm": 0.3470867872238159, "learning_rate": 4.119286057631187e-05, "loss": 0.0948, "num_input_tokens_seen": 14305664, "step": 11030 }, { "epoch": 0.5391737717733858, "grad_norm": 0.19283176958560944, "learning_rate": 4.118537955700646e-05, "loss": 0.0858, "num_input_tokens_seen": 14312096, "step": 11035 }, { "epoch": 0.5394180734370801, "grad_norm": 0.4583074152469635, "learning_rate": 4.11778960417121e-05, "loss": 0.0794, "num_input_tokens_seen": 14318368, "step": 11040 }, { "epoch": 0.5396623751007744, "grad_norm": 0.24581393599510193, "learning_rate": 4.117041003158288e-05, "loss": 0.087, "num_input_tokens_seen": 14324448, "step": 11045 }, { "epoch": 0.5399066767644688, "grad_norm": 0.2831425964832306, "learning_rate": 4.1162921527773215e-05, "loss": 0.0842, "num_input_tokens_seen": 14331200, "step": 11050 }, { "epoch": 0.5401509784281631, "grad_norm": 0.21427392959594727, "learning_rate": 4.115543053143794e-05, "loss": 0.0923, "num_input_tokens_seen": 14337472, "step": 11055 }, { "epoch": 0.5403952800918574, "grad_norm": 0.23416051268577576, "learning_rate": 4.114793704373226e-05, "loss": 0.0908, "num_input_tokens_seen": 14343872, "step": 11060 }, { "epoch": 0.5406395817555517, "grad_norm": 0.3012680113315582, "learning_rate": 4.114044106581175e-05, "loss": 0.0815, "num_input_tokens_seen": 14350720, "step": 11065 }, { "epoch": 0.5408838834192461, "grad_norm": 0.7018107771873474, "learning_rate": 4.11329425988324e-05, "loss": 0.0944, "num_input_tokens_seen": 14357280, "step": 11070 }, { "epoch": 0.5411281850829404, "grad_norm": 0.38820213079452515, "learning_rate": 4.112544164395056e-05, "loss": 0.1108, "num_input_tokens_seen": 14363872, "step": 11075 }, { "epoch": 0.5413724867466347, "grad_norm": 0.2058093398809433, "learning_rate": 4.111793820232297e-05, "loss": 0.0845, "num_input_tokens_seen": 14370048, "step": 11080 }, { "epoch": 0.5416167884103291, "grad_norm": 0.22434037923812866, "learning_rate": 4.1110432275106767e-05, "loss": 0.0769, "num_input_tokens_seen": 14376416, "step": 11085 }, { "epoch": 0.5418610900740234, "grad_norm": 0.6212311387062073, "learning_rate": 4.110292386345944e-05, "loss": 0.0884, "num_input_tokens_seen": 14382816, "step": 11090 }, { "epoch": 0.5421053917377178, "grad_norm": 0.33293718099594116, "learning_rate": 4.109541296853891e-05, "loss": 0.0941, "num_input_tokens_seen": 14389888, "step": 11095 }, { "epoch": 0.542349693401412, "grad_norm": 0.14369632303714752, "learning_rate": 4.108789959150341e-05, "loss": 0.0836, "num_input_tokens_seen": 14396448, "step": 11100 }, { "epoch": 0.5425939950651064, "grad_norm": 0.21923016011714935, "learning_rate": 4.108038373351163e-05, "loss": 0.0917, "num_input_tokens_seen": 14402688, "step": 11105 }, { "epoch": 0.5428382967288007, "grad_norm": 0.6952000856399536, "learning_rate": 4.10728653957226e-05, "loss": 0.0759, "num_input_tokens_seen": 14409472, "step": 11110 }, { "epoch": 0.5430825983924951, "grad_norm": 0.1812397688627243, "learning_rate": 4.106534457929575e-05, "loss": 0.0815, "num_input_tokens_seen": 14415968, "step": 11115 }, { "epoch": 0.5433269000561893, "grad_norm": 0.1642857789993286, "learning_rate": 4.105782128539086e-05, "loss": 0.0738, "num_input_tokens_seen": 14422688, "step": 11120 }, { "epoch": 0.5435712017198837, "grad_norm": 0.21126589179039001, "learning_rate": 4.1050295515168144e-05, "loss": 0.0788, "num_input_tokens_seen": 14429312, "step": 11125 }, { "epoch": 0.5438155033835781, "grad_norm": 0.20996759831905365, "learning_rate": 4.1042767269788155e-05, "loss": 0.0765, "num_input_tokens_seen": 14435968, "step": 11130 }, { "epoch": 0.5440598050472724, "grad_norm": 0.2945360243320465, "learning_rate": 4.103523655041185e-05, "loss": 0.0747, "num_input_tokens_seen": 14442592, "step": 11135 }, { "epoch": 0.5443041067109667, "grad_norm": 0.3858847916126251, "learning_rate": 4.102770335820055e-05, "loss": 0.1042, "num_input_tokens_seen": 14448832, "step": 11140 }, { "epoch": 0.544548408374661, "grad_norm": 0.3541830778121948, "learning_rate": 4.1020167694315984e-05, "loss": 0.0763, "num_input_tokens_seen": 14455328, "step": 11145 }, { "epoch": 0.5447927100383554, "grad_norm": 0.16054685413837433, "learning_rate": 4.101262955992023e-05, "loss": 0.0684, "num_input_tokens_seen": 14461664, "step": 11150 }, { "epoch": 0.5450370117020497, "grad_norm": 0.49088963866233826, "learning_rate": 4.100508895617578e-05, "loss": 0.1132, "num_input_tokens_seen": 14467968, "step": 11155 }, { "epoch": 0.545281313365744, "grad_norm": 0.16138598322868347, "learning_rate": 4.099754588424547e-05, "loss": 0.0749, "num_input_tokens_seen": 14474368, "step": 11160 }, { "epoch": 0.5455256150294383, "grad_norm": 0.2515026032924652, "learning_rate": 4.0990000345292546e-05, "loss": 0.0923, "num_input_tokens_seen": 14481216, "step": 11165 }, { "epoch": 0.5457699166931327, "grad_norm": 0.2291174829006195, "learning_rate": 4.098245234048064e-05, "loss": 0.1068, "num_input_tokens_seen": 14487776, "step": 11170 }, { "epoch": 0.5460142183568271, "grad_norm": 0.41968443989753723, "learning_rate": 4.0974901870973726e-05, "loss": 0.0942, "num_input_tokens_seen": 14494304, "step": 11175 }, { "epoch": 0.5462585200205213, "grad_norm": 0.27717912197113037, "learning_rate": 4.096734893793619e-05, "loss": 0.0747, "num_input_tokens_seen": 14501120, "step": 11180 }, { "epoch": 0.5465028216842157, "grad_norm": 0.1608569622039795, "learning_rate": 4.095979354253279e-05, "loss": 0.0939, "num_input_tokens_seen": 14507424, "step": 11185 }, { "epoch": 0.54674712334791, "grad_norm": 0.2834719717502594, "learning_rate": 4.0952235685928656e-05, "loss": 0.0899, "num_input_tokens_seen": 14513728, "step": 11190 }, { "epoch": 0.5469914250116044, "grad_norm": 0.34239065647125244, "learning_rate": 4.094467536928932e-05, "loss": 0.1123, "num_input_tokens_seen": 14520480, "step": 11195 }, { "epoch": 0.5472357266752986, "grad_norm": 0.3917379677295685, "learning_rate": 4.093711259378067e-05, "loss": 0.0721, "num_input_tokens_seen": 14527072, "step": 11200 }, { "epoch": 0.5472357266752986, "eval_loss": 0.0926986113190651, "eval_runtime": 375.0507, "eval_samples_per_second": 97.014, "eval_steps_per_second": 24.255, "num_input_tokens_seen": 14527072, "step": 11200 }, { "epoch": 0.547480028338993, "grad_norm": 0.3655013144016266, "learning_rate": 4.092954736056897e-05, "loss": 0.1101, "num_input_tokens_seen": 14533888, "step": 11205 }, { "epoch": 0.5477243300026873, "grad_norm": 0.16818389296531677, "learning_rate": 4.09219796708209e-05, "loss": 0.0814, "num_input_tokens_seen": 14540224, "step": 11210 }, { "epoch": 0.5479686316663817, "grad_norm": 0.10685382783412933, "learning_rate": 4.0914409525703464e-05, "loss": 0.0804, "num_input_tokens_seen": 14547264, "step": 11215 }, { "epoch": 0.5482129333300759, "grad_norm": 0.5131166577339172, "learning_rate": 4.090683692638408e-05, "loss": 0.0997, "num_input_tokens_seen": 14553952, "step": 11220 }, { "epoch": 0.5484572349937703, "grad_norm": 0.3428378701210022, "learning_rate": 4.089926187403056e-05, "loss": 0.0881, "num_input_tokens_seen": 14560128, "step": 11225 }, { "epoch": 0.5487015366574647, "grad_norm": 0.25034576654434204, "learning_rate": 4.0891684369811044e-05, "loss": 0.1068, "num_input_tokens_seen": 14566368, "step": 11230 }, { "epoch": 0.548945838321159, "grad_norm": 0.19330435991287231, "learning_rate": 4.0884104414894107e-05, "loss": 0.0728, "num_input_tokens_seen": 14572960, "step": 11235 }, { "epoch": 0.5491901399848533, "grad_norm": 0.2052890658378601, "learning_rate": 4.087652201044864e-05, "loss": 0.0917, "num_input_tokens_seen": 14579424, "step": 11240 }, { "epoch": 0.5494344416485476, "grad_norm": 0.8793042302131653, "learning_rate": 4.086893715764397e-05, "loss": 0.1132, "num_input_tokens_seen": 14585696, "step": 11245 }, { "epoch": 0.549678743312242, "grad_norm": 0.2051493227481842, "learning_rate": 4.086134985764977e-05, "loss": 0.0847, "num_input_tokens_seen": 14591968, "step": 11250 }, { "epoch": 0.5499230449759362, "grad_norm": 0.413018137216568, "learning_rate": 4.0853760111636085e-05, "loss": 0.1069, "num_input_tokens_seen": 14598144, "step": 11255 }, { "epoch": 0.5501673466396306, "grad_norm": 0.2688762843608856, "learning_rate": 4.084616792077337e-05, "loss": 0.0785, "num_input_tokens_seen": 14605024, "step": 11260 }, { "epoch": 0.5504116483033249, "grad_norm": 1.0607370138168335, "learning_rate": 4.083857328623243e-05, "loss": 0.0882, "num_input_tokens_seen": 14611232, "step": 11265 }, { "epoch": 0.5506559499670193, "grad_norm": 0.1938750147819519, "learning_rate": 4.083097620918444e-05, "loss": 0.0709, "num_input_tokens_seen": 14618048, "step": 11270 }, { "epoch": 0.5509002516307137, "grad_norm": 0.243669793009758, "learning_rate": 4.082337669080097e-05, "loss": 0.0772, "num_input_tokens_seen": 14624416, "step": 11275 }, { "epoch": 0.5511445532944079, "grad_norm": 0.41518840193748474, "learning_rate": 4.081577473225398e-05, "loss": 0.0907, "num_input_tokens_seen": 14630432, "step": 11280 }, { "epoch": 0.5513888549581023, "grad_norm": 0.21813629567623138, "learning_rate": 4.080817033471577e-05, "loss": 0.0949, "num_input_tokens_seen": 14637696, "step": 11285 }, { "epoch": 0.5516331566217966, "grad_norm": 0.432980477809906, "learning_rate": 4.080056349935903e-05, "loss": 0.101, "num_input_tokens_seen": 14644224, "step": 11290 }, { "epoch": 0.551877458285491, "grad_norm": 0.4453399181365967, "learning_rate": 4.079295422735684e-05, "loss": 0.1008, "num_input_tokens_seen": 14650560, "step": 11295 }, { "epoch": 0.5521217599491852, "grad_norm": 0.27068889141082764, "learning_rate": 4.078534251988264e-05, "loss": 0.0774, "num_input_tokens_seen": 14657312, "step": 11300 }, { "epoch": 0.5523660616128796, "grad_norm": 0.6905425786972046, "learning_rate": 4.077772837811025e-05, "loss": 0.1024, "num_input_tokens_seen": 14663520, "step": 11305 }, { "epoch": 0.5526103632765739, "grad_norm": 0.28445035219192505, "learning_rate": 4.0770111803213874e-05, "loss": 0.0615, "num_input_tokens_seen": 14670304, "step": 11310 }, { "epoch": 0.5528546649402682, "grad_norm": 0.6675089001655579, "learning_rate": 4.076249279636807e-05, "loss": 0.0597, "num_input_tokens_seen": 14676480, "step": 11315 }, { "epoch": 0.5530989666039626, "grad_norm": 0.4094609022140503, "learning_rate": 4.075487135874781e-05, "loss": 0.0879, "num_input_tokens_seen": 14682944, "step": 11320 }, { "epoch": 0.5533432682676569, "grad_norm": 0.25270548462867737, "learning_rate": 4.074724749152837e-05, "loss": 0.0828, "num_input_tokens_seen": 14689568, "step": 11325 }, { "epoch": 0.5535875699313513, "grad_norm": 0.420410692691803, "learning_rate": 4.07396211958855e-05, "loss": 0.0936, "num_input_tokens_seen": 14696064, "step": 11330 }, { "epoch": 0.5538318715950455, "grad_norm": 0.4818675220012665, "learning_rate": 4.073199247299523e-05, "loss": 0.0782, "num_input_tokens_seen": 14702464, "step": 11335 }, { "epoch": 0.5540761732587399, "grad_norm": 0.20726259052753448, "learning_rate": 4.072436132403403e-05, "loss": 0.0965, "num_input_tokens_seen": 14709184, "step": 11340 }, { "epoch": 0.5543204749224342, "grad_norm": 0.20149728655815125, "learning_rate": 4.0716727750178704e-05, "loss": 0.1055, "num_input_tokens_seen": 14716096, "step": 11345 }, { "epoch": 0.5545647765861286, "grad_norm": 0.5376307368278503, "learning_rate": 4.0709091752606455e-05, "loss": 0.0931, "num_input_tokens_seen": 14722752, "step": 11350 }, { "epoch": 0.5548090782498228, "grad_norm": 0.29042649269104004, "learning_rate": 4.070145333249484e-05, "loss": 0.0814, "num_input_tokens_seen": 14729216, "step": 11355 }, { "epoch": 0.5550533799135172, "grad_norm": 0.19498181343078613, "learning_rate": 4.069381249102181e-05, "loss": 0.1245, "num_input_tokens_seen": 14735744, "step": 11360 }, { "epoch": 0.5552976815772115, "grad_norm": 0.29522666335105896, "learning_rate": 4.0686169229365665e-05, "loss": 0.1203, "num_input_tokens_seen": 14742112, "step": 11365 }, { "epoch": 0.5555419832409059, "grad_norm": 0.24193300306797028, "learning_rate": 4.067852354870511e-05, "loss": 0.089, "num_input_tokens_seen": 14748192, "step": 11370 }, { "epoch": 0.5557862849046002, "grad_norm": 0.25476011633872986, "learning_rate": 4.067087545021919e-05, "loss": 0.1136, "num_input_tokens_seen": 14754432, "step": 11375 }, { "epoch": 0.5560305865682945, "grad_norm": 0.1477353870868683, "learning_rate": 4.066322493508734e-05, "loss": 0.0773, "num_input_tokens_seen": 14760896, "step": 11380 }, { "epoch": 0.5562748882319889, "grad_norm": 0.20779676735401154, "learning_rate": 4.065557200448937e-05, "loss": 0.0957, "num_input_tokens_seen": 14767680, "step": 11385 }, { "epoch": 0.5565191898956832, "grad_norm": 0.4830362796783447, "learning_rate": 4.064791665960546e-05, "loss": 0.0723, "num_input_tokens_seen": 14773824, "step": 11390 }, { "epoch": 0.5567634915593775, "grad_norm": 0.36705443263053894, "learning_rate": 4.064025890161615e-05, "loss": 0.0804, "num_input_tokens_seen": 14780544, "step": 11395 }, { "epoch": 0.5570077932230718, "grad_norm": 1.075465202331543, "learning_rate": 4.0632598731702373e-05, "loss": 0.114, "num_input_tokens_seen": 14787040, "step": 11400 }, { "epoch": 0.5570077932230718, "eval_loss": 0.09272349625825882, "eval_runtime": 373.9899, "eval_samples_per_second": 97.289, "eval_steps_per_second": 24.324, "num_input_tokens_seen": 14787040, "step": 11400 }, { "epoch": 0.5572520948867662, "grad_norm": 0.3276066184043884, "learning_rate": 4.0624936151045426e-05, "loss": 0.103, "num_input_tokens_seen": 14793568, "step": 11405 }, { "epoch": 0.5574963965504605, "grad_norm": 0.44127634167671204, "learning_rate": 4.061727116082696e-05, "loss": 0.0921, "num_input_tokens_seen": 14800096, "step": 11410 }, { "epoch": 0.5577406982141548, "grad_norm": 0.28696805238723755, "learning_rate": 4.060960376222903e-05, "loss": 0.083, "num_input_tokens_seen": 14806304, "step": 11415 }, { "epoch": 0.5579849998778492, "grad_norm": 0.20911386609077454, "learning_rate": 4.0601933956434034e-05, "loss": 0.0901, "num_input_tokens_seen": 14812576, "step": 11420 }, { "epoch": 0.5582293015415435, "grad_norm": 0.16307279467582703, "learning_rate": 4.059426174462476e-05, "loss": 0.0948, "num_input_tokens_seen": 14819264, "step": 11425 }, { "epoch": 0.5584736032052379, "grad_norm": 0.25213536620140076, "learning_rate": 4.058658712798435e-05, "loss": 0.0825, "num_input_tokens_seen": 14825888, "step": 11430 }, { "epoch": 0.5587179048689321, "grad_norm": 0.27931901812553406, "learning_rate": 4.0578910107696336e-05, "loss": 0.0525, "num_input_tokens_seen": 14832864, "step": 11435 }, { "epoch": 0.5589622065326265, "grad_norm": 0.4028266668319702, "learning_rate": 4.05712306849446e-05, "loss": 0.0728, "num_input_tokens_seen": 14839392, "step": 11440 }, { "epoch": 0.5592065081963208, "grad_norm": 0.6421599984169006, "learning_rate": 4.0563548860913415e-05, "loss": 0.0871, "num_input_tokens_seen": 14845952, "step": 11445 }, { "epoch": 0.5594508098600152, "grad_norm": 0.12734316289424896, "learning_rate": 4.0555864636787414e-05, "loss": 0.1047, "num_input_tokens_seen": 14852832, "step": 11450 }, { "epoch": 0.5596951115237094, "grad_norm": 0.5387681722640991, "learning_rate": 4.054817801375159e-05, "loss": 0.0943, "num_input_tokens_seen": 14858976, "step": 11455 }, { "epoch": 0.5599394131874038, "grad_norm": 0.2587452530860901, "learning_rate": 4.054048899299134e-05, "loss": 0.0976, "num_input_tokens_seen": 14865696, "step": 11460 }, { "epoch": 0.5601837148510982, "grad_norm": 0.3403545618057251, "learning_rate": 4.0532797575692385e-05, "loss": 0.0967, "num_input_tokens_seen": 14872192, "step": 11465 }, { "epoch": 0.5604280165147925, "grad_norm": 0.35157299041748047, "learning_rate": 4.052510376304085e-05, "loss": 0.0938, "num_input_tokens_seen": 14878464, "step": 11470 }, { "epoch": 0.5606723181784868, "grad_norm": 0.3169492483139038, "learning_rate": 4.051740755622321e-05, "loss": 0.0926, "num_input_tokens_seen": 14884672, "step": 11475 }, { "epoch": 0.5609166198421811, "grad_norm": 0.38208264112472534, "learning_rate": 4.050970895642632e-05, "loss": 0.0891, "num_input_tokens_seen": 14891136, "step": 11480 }, { "epoch": 0.5611609215058755, "grad_norm": 0.33915653824806213, "learning_rate": 4.050200796483741e-05, "loss": 0.0819, "num_input_tokens_seen": 14897312, "step": 11485 }, { "epoch": 0.5614052231695698, "grad_norm": 0.5638338923454285, "learning_rate": 4.049430458264405e-05, "loss": 0.0647, "num_input_tokens_seen": 14903360, "step": 11490 }, { "epoch": 0.5616495248332641, "grad_norm": 0.27263882756233215, "learning_rate": 4.048659881103422e-05, "loss": 0.099, "num_input_tokens_seen": 14909824, "step": 11495 }, { "epoch": 0.5618938264969584, "grad_norm": 0.5121637582778931, "learning_rate": 4.0478890651196235e-05, "loss": 0.1192, "num_input_tokens_seen": 14916128, "step": 11500 }, { "epoch": 0.5621381281606528, "grad_norm": 0.3975006341934204, "learning_rate": 4.047118010431879e-05, "loss": 0.0799, "num_input_tokens_seen": 14922656, "step": 11505 }, { "epoch": 0.562382429824347, "grad_norm": 0.131761372089386, "learning_rate": 4.046346717159094e-05, "loss": 0.0885, "num_input_tokens_seen": 14928896, "step": 11510 }, { "epoch": 0.5626267314880414, "grad_norm": 0.3731580376625061, "learning_rate": 4.045575185420214e-05, "loss": 0.0835, "num_input_tokens_seen": 14935328, "step": 11515 }, { "epoch": 0.5628710331517358, "grad_norm": 0.4731961190700531, "learning_rate": 4.0448034153342165e-05, "loss": 0.1094, "num_input_tokens_seen": 14942112, "step": 11520 }, { "epoch": 0.5631153348154301, "grad_norm": 0.3175733685493469, "learning_rate": 4.0440314070201194e-05, "loss": 0.0744, "num_input_tokens_seen": 14949216, "step": 11525 }, { "epoch": 0.5633596364791245, "grad_norm": 0.28578484058380127, "learning_rate": 4.043259160596976e-05, "loss": 0.0749, "num_input_tokens_seen": 14955392, "step": 11530 }, { "epoch": 0.5636039381428187, "grad_norm": 0.2681703567504883, "learning_rate": 4.0424866761838767e-05, "loss": 0.0856, "num_input_tokens_seen": 14961856, "step": 11535 }, { "epoch": 0.5638482398065131, "grad_norm": 0.1671198308467865, "learning_rate": 4.041713953899948e-05, "loss": 0.0739, "num_input_tokens_seen": 14968448, "step": 11540 }, { "epoch": 0.5640925414702074, "grad_norm": 0.31003186106681824, "learning_rate": 4.0409409938643515e-05, "loss": 0.0923, "num_input_tokens_seen": 14975040, "step": 11545 }, { "epoch": 0.5643368431339018, "grad_norm": 0.26579156517982483, "learning_rate": 4.0401677961962904e-05, "loss": 0.0878, "num_input_tokens_seen": 14981344, "step": 11550 }, { "epoch": 0.564581144797596, "grad_norm": 0.1797543317079544, "learning_rate": 4.039394361015001e-05, "loss": 0.0682, "num_input_tokens_seen": 14987552, "step": 11555 }, { "epoch": 0.5648254464612904, "grad_norm": 0.7676941752433777, "learning_rate": 4.038620688439755e-05, "loss": 0.0951, "num_input_tokens_seen": 14993824, "step": 11560 }, { "epoch": 0.5650697481249848, "grad_norm": 0.23949190974235535, "learning_rate": 4.037846778589862e-05, "loss": 0.0795, "num_input_tokens_seen": 15000384, "step": 11565 }, { "epoch": 0.565314049788679, "grad_norm": 0.8821595907211304, "learning_rate": 4.0370726315846715e-05, "loss": 0.0981, "num_input_tokens_seen": 15006848, "step": 11570 }, { "epoch": 0.5655583514523734, "grad_norm": 0.35820984840393066, "learning_rate": 4.036298247543565e-05, "loss": 0.1147, "num_input_tokens_seen": 15013312, "step": 11575 }, { "epoch": 0.5658026531160677, "grad_norm": 0.5085839629173279, "learning_rate": 4.035523626585962e-05, "loss": 0.0735, "num_input_tokens_seen": 15019488, "step": 11580 }, { "epoch": 0.5660469547797621, "grad_norm": 0.35520055890083313, "learning_rate": 4.0347487688313194e-05, "loss": 0.1086, "num_input_tokens_seen": 15025888, "step": 11585 }, { "epoch": 0.5662912564434563, "grad_norm": 0.49896150827407837, "learning_rate": 4.0339736743991296e-05, "loss": 0.0939, "num_input_tokens_seen": 15032768, "step": 11590 }, { "epoch": 0.5665355581071507, "grad_norm": 0.31287074089050293, "learning_rate": 4.0331983434089227e-05, "loss": 0.082, "num_input_tokens_seen": 15039424, "step": 11595 }, { "epoch": 0.566779859770845, "grad_norm": 0.11482471227645874, "learning_rate": 4.032422775980264e-05, "loss": 0.0787, "num_input_tokens_seen": 15045600, "step": 11600 }, { "epoch": 0.566779859770845, "eval_loss": 0.09227412939071655, "eval_runtime": 375.4597, "eval_samples_per_second": 96.908, "eval_steps_per_second": 24.229, "num_input_tokens_seen": 15045600, "step": 11600 }, { "epoch": 0.5670241614345394, "grad_norm": 0.3887866139411926, "learning_rate": 4.031646972232754e-05, "loss": 0.1106, "num_input_tokens_seen": 15051872, "step": 11605 }, { "epoch": 0.5672684630982336, "grad_norm": 0.4172067642211914, "learning_rate": 4.0308709322860344e-05, "loss": 0.0778, "num_input_tokens_seen": 15058400, "step": 11610 }, { "epoch": 0.567512764761928, "grad_norm": 0.28366819024086, "learning_rate": 4.0300946562597784e-05, "loss": 0.0668, "num_input_tokens_seen": 15064896, "step": 11615 }, { "epoch": 0.5677570664256224, "grad_norm": 0.6159065365791321, "learning_rate": 4.029318144273698e-05, "loss": 0.0691, "num_input_tokens_seen": 15071584, "step": 11620 }, { "epoch": 0.5680013680893167, "grad_norm": 0.1353152096271515, "learning_rate": 4.0285413964475415e-05, "loss": 0.0939, "num_input_tokens_seen": 15077824, "step": 11625 }, { "epoch": 0.568245669753011, "grad_norm": 0.6654989123344421, "learning_rate": 4.0277644129010927e-05, "loss": 0.0984, "num_input_tokens_seen": 15084160, "step": 11630 }, { "epoch": 0.5684899714167053, "grad_norm": 0.26029452681541443, "learning_rate": 4.0269871937541724e-05, "loss": 0.0839, "num_input_tokens_seen": 15090880, "step": 11635 }, { "epoch": 0.5687342730803997, "grad_norm": 0.31914111971855164, "learning_rate": 4.026209739126637e-05, "loss": 0.1229, "num_input_tokens_seen": 15097184, "step": 11640 }, { "epoch": 0.568978574744094, "grad_norm": 0.3136904239654541, "learning_rate": 4.025432049138381e-05, "loss": 0.1115, "num_input_tokens_seen": 15103552, "step": 11645 }, { "epoch": 0.5692228764077883, "grad_norm": 0.44040921330451965, "learning_rate": 4.0246541239093325e-05, "loss": 0.0932, "num_input_tokens_seen": 15110176, "step": 11650 }, { "epoch": 0.5694671780714826, "grad_norm": 0.5917658805847168, "learning_rate": 4.023875963559459e-05, "loss": 0.0813, "num_input_tokens_seen": 15116736, "step": 11655 }, { "epoch": 0.569711479735177, "grad_norm": 0.24947790801525116, "learning_rate": 4.023097568208761e-05, "loss": 0.0949, "num_input_tokens_seen": 15123936, "step": 11660 }, { "epoch": 0.5699557813988714, "grad_norm": 0.29144740104675293, "learning_rate": 4.022318937977277e-05, "loss": 0.0748, "num_input_tokens_seen": 15130368, "step": 11665 }, { "epoch": 0.5702000830625656, "grad_norm": 0.5808823108673096, "learning_rate": 4.021540072985084e-05, "loss": 0.0881, "num_input_tokens_seen": 15136928, "step": 11670 }, { "epoch": 0.57044438472626, "grad_norm": 0.5930858850479126, "learning_rate": 4.020760973352289e-05, "loss": 0.07, "num_input_tokens_seen": 15144032, "step": 11675 }, { "epoch": 0.5706886863899543, "grad_norm": 0.26073983311653137, "learning_rate": 4.019981639199042e-05, "loss": 0.0759, "num_input_tokens_seen": 15150528, "step": 11680 }, { "epoch": 0.5709329880536487, "grad_norm": 0.5097990036010742, "learning_rate": 4.0192020706455245e-05, "loss": 0.0845, "num_input_tokens_seen": 15156800, "step": 11685 }, { "epoch": 0.5711772897173429, "grad_norm": 0.6638206839561462, "learning_rate": 4.018422267811956e-05, "loss": 0.1012, "num_input_tokens_seen": 15163072, "step": 11690 }, { "epoch": 0.5714215913810373, "grad_norm": 0.4406627118587494, "learning_rate": 4.017642230818592e-05, "loss": 0.0801, "num_input_tokens_seen": 15169568, "step": 11695 }, { "epoch": 0.5716658930447316, "grad_norm": 0.3179217278957367, "learning_rate": 4.0168619597857246e-05, "loss": 0.1217, "num_input_tokens_seen": 15175904, "step": 11700 }, { "epoch": 0.571910194708426, "grad_norm": 0.7055352330207825, "learning_rate": 4.016081454833681e-05, "loss": 0.1115, "num_input_tokens_seen": 15181728, "step": 11705 }, { "epoch": 0.5721544963721203, "grad_norm": 0.3661091923713684, "learning_rate": 4.0153007160828245e-05, "loss": 0.0948, "num_input_tokens_seen": 15188288, "step": 11710 }, { "epoch": 0.5723987980358146, "grad_norm": 0.1472560465335846, "learning_rate": 4.0145197436535555e-05, "loss": 0.0832, "num_input_tokens_seen": 15194848, "step": 11715 }, { "epoch": 0.572643099699509, "grad_norm": 0.410995215177536, "learning_rate": 4.0137385376663095e-05, "loss": 0.1088, "num_input_tokens_seen": 15201536, "step": 11720 }, { "epoch": 0.5728874013632033, "grad_norm": 0.16191023588180542, "learning_rate": 4.012957098241558e-05, "loss": 0.0775, "num_input_tokens_seen": 15208032, "step": 11725 }, { "epoch": 0.5731317030268976, "grad_norm": 0.20592908561229706, "learning_rate": 4.0121754254998076e-05, "loss": 0.1136, "num_input_tokens_seen": 15214272, "step": 11730 }, { "epoch": 0.5733760046905919, "grad_norm": 0.22682705521583557, "learning_rate": 4.011393519561606e-05, "loss": 0.0844, "num_input_tokens_seen": 15220672, "step": 11735 }, { "epoch": 0.5736203063542863, "grad_norm": 1.0481839179992676, "learning_rate": 4.010611380547529e-05, "loss": 0.1026, "num_input_tokens_seen": 15227136, "step": 11740 }, { "epoch": 0.5738646080179806, "grad_norm": 0.5424748063087463, "learning_rate": 4.009829008578192e-05, "loss": 0.1036, "num_input_tokens_seen": 15233536, "step": 11745 }, { "epoch": 0.5741089096816749, "grad_norm": 1.344347357749939, "learning_rate": 4.00904640377425e-05, "loss": 0.1115, "num_input_tokens_seen": 15240512, "step": 11750 }, { "epoch": 0.5743532113453692, "grad_norm": 0.2639642059803009, "learning_rate": 4.0082635662563886e-05, "loss": 0.0745, "num_input_tokens_seen": 15246784, "step": 11755 }, { "epoch": 0.5745975130090636, "grad_norm": 0.24097278714179993, "learning_rate": 4.007480496145331e-05, "loss": 0.1108, "num_input_tokens_seen": 15252896, "step": 11760 }, { "epoch": 0.574841814672758, "grad_norm": 0.45698559284210205, "learning_rate": 4.006697193561837e-05, "loss": 0.1081, "num_input_tokens_seen": 15259232, "step": 11765 }, { "epoch": 0.5750861163364522, "grad_norm": 0.43721896409988403, "learning_rate": 4.005913658626701e-05, "loss": 0.08, "num_input_tokens_seen": 15265760, "step": 11770 }, { "epoch": 0.5753304180001466, "grad_norm": 0.1981433629989624, "learning_rate": 4.005129891460754e-05, "loss": 0.0789, "num_input_tokens_seen": 15272416, "step": 11775 }, { "epoch": 0.5755747196638409, "grad_norm": 0.29872065782546997, "learning_rate": 4.004345892184864e-05, "loss": 0.0857, "num_input_tokens_seen": 15278624, "step": 11780 }, { "epoch": 0.5758190213275353, "grad_norm": 0.29297876358032227, "learning_rate": 4.003561660919932e-05, "loss": 0.104, "num_input_tokens_seen": 15284960, "step": 11785 }, { "epoch": 0.5760633229912295, "grad_norm": 0.24440865218639374, "learning_rate": 4.002777197786897e-05, "loss": 0.111, "num_input_tokens_seen": 15292960, "step": 11790 }, { "epoch": 0.5763076246549239, "grad_norm": 0.29052501916885376, "learning_rate": 4.0019925029067326e-05, "loss": 0.0836, "num_input_tokens_seen": 15299648, "step": 11795 }, { "epoch": 0.5765519263186182, "grad_norm": 0.30603066086769104, "learning_rate": 4.0012075764004495e-05, "loss": 0.0709, "num_input_tokens_seen": 15306176, "step": 11800 }, { "epoch": 0.5765519263186182, "eval_loss": 0.0924813374876976, "eval_runtime": 375.3035, "eval_samples_per_second": 96.948, "eval_steps_per_second": 24.239, "num_input_tokens_seen": 15306176, "step": 11800 }, { "epoch": 0.5767962279823126, "grad_norm": 0.21672400832176208, "learning_rate": 4.000422418389094e-05, "loss": 0.0972, "num_input_tokens_seen": 15312416, "step": 11805 }, { "epoch": 0.5770405296460069, "grad_norm": 0.27566686272621155, "learning_rate": 3.999637028993744e-05, "loss": 0.0787, "num_input_tokens_seen": 15318656, "step": 11810 }, { "epoch": 0.5772848313097012, "grad_norm": 0.543626606464386, "learning_rate": 3.99885140833552e-05, "loss": 0.0969, "num_input_tokens_seen": 15325376, "step": 11815 }, { "epoch": 0.5775291329733956, "grad_norm": 0.242306649684906, "learning_rate": 3.998065556535572e-05, "loss": 0.09, "num_input_tokens_seen": 15331872, "step": 11820 }, { "epoch": 0.5777734346370899, "grad_norm": 0.12866969406604767, "learning_rate": 3.9972794737150895e-05, "loss": 0.0914, "num_input_tokens_seen": 15338240, "step": 11825 }, { "epoch": 0.5780177363007842, "grad_norm": 0.25388064980506897, "learning_rate": 3.996493159995297e-05, "loss": 0.0743, "num_input_tokens_seen": 15345184, "step": 11830 }, { "epoch": 0.5782620379644785, "grad_norm": 0.37954798340797424, "learning_rate": 3.995706615497453e-05, "loss": 0.0926, "num_input_tokens_seen": 15351744, "step": 11835 }, { "epoch": 0.5785063396281729, "grad_norm": 0.5607308745384216, "learning_rate": 3.994919840342852e-05, "loss": 0.0821, "num_input_tokens_seen": 15358048, "step": 11840 }, { "epoch": 0.5787506412918672, "grad_norm": 0.18328624963760376, "learning_rate": 3.994132834652825e-05, "loss": 0.0895, "num_input_tokens_seen": 15364288, "step": 11845 }, { "epoch": 0.5789949429555615, "grad_norm": 0.25944259762763977, "learning_rate": 3.99334559854874e-05, "loss": 0.0989, "num_input_tokens_seen": 15370848, "step": 11850 }, { "epoch": 0.5792392446192559, "grad_norm": 0.18832071125507355, "learning_rate": 3.9925581321519955e-05, "loss": 0.0634, "num_input_tokens_seen": 15377088, "step": 11855 }, { "epoch": 0.5794835462829502, "grad_norm": 0.27709686756134033, "learning_rate": 3.991770435584031e-05, "loss": 0.0827, "num_input_tokens_seen": 15383808, "step": 11860 }, { "epoch": 0.5797278479466446, "grad_norm": 0.35490626096725464, "learning_rate": 3.990982508966319e-05, "loss": 0.0827, "num_input_tokens_seen": 15390624, "step": 11865 }, { "epoch": 0.5799721496103388, "grad_norm": 0.29492971301078796, "learning_rate": 3.990194352420367e-05, "loss": 0.1028, "num_input_tokens_seen": 15397280, "step": 11870 }, { "epoch": 0.5802164512740332, "grad_norm": 0.29083314538002014, "learning_rate": 3.9894059660677184e-05, "loss": 0.0981, "num_input_tokens_seen": 15404096, "step": 11875 }, { "epoch": 0.5804607529377275, "grad_norm": 0.18774396181106567, "learning_rate": 3.9886173500299526e-05, "loss": 0.1077, "num_input_tokens_seen": 15410464, "step": 11880 }, { "epoch": 0.5807050546014219, "grad_norm": 0.21484555304050446, "learning_rate": 3.987828504428685e-05, "loss": 0.0944, "num_input_tokens_seen": 15416736, "step": 11885 }, { "epoch": 0.5809493562651161, "grad_norm": 0.22514761984348297, "learning_rate": 3.987039429385565e-05, "loss": 0.0808, "num_input_tokens_seen": 15422720, "step": 11890 }, { "epoch": 0.5811936579288105, "grad_norm": 0.1731557548046112, "learning_rate": 3.986250125022277e-05, "loss": 0.0876, "num_input_tokens_seen": 15429184, "step": 11895 }, { "epoch": 0.5814379595925048, "grad_norm": 0.23112773895263672, "learning_rate": 3.985460591460544e-05, "loss": 0.0776, "num_input_tokens_seen": 15435584, "step": 11900 }, { "epoch": 0.5816822612561992, "grad_norm": 0.3014093041419983, "learning_rate": 3.984670828822118e-05, "loss": 0.0694, "num_input_tokens_seen": 15442176, "step": 11905 }, { "epoch": 0.5819265629198935, "grad_norm": 0.19248858094215393, "learning_rate": 3.983880837228794e-05, "loss": 0.1251, "num_input_tokens_seen": 15448384, "step": 11910 }, { "epoch": 0.5821708645835878, "grad_norm": 0.2515658140182495, "learning_rate": 3.983090616802396e-05, "loss": 0.0775, "num_input_tokens_seen": 15454848, "step": 11915 }, { "epoch": 0.5824151662472822, "grad_norm": 0.41815245151519775, "learning_rate": 3.982300167664788e-05, "loss": 0.0954, "num_input_tokens_seen": 15461376, "step": 11920 }, { "epoch": 0.5826594679109764, "grad_norm": 0.14944913983345032, "learning_rate": 3.981509489937868e-05, "loss": 0.0722, "num_input_tokens_seen": 15468448, "step": 11925 }, { "epoch": 0.5829037695746708, "grad_norm": 0.3426343500614166, "learning_rate": 3.9807185837435643e-05, "loss": 0.11, "num_input_tokens_seen": 15474880, "step": 11930 }, { "epoch": 0.5831480712383651, "grad_norm": 0.9040818810462952, "learning_rate": 3.9799274492038484e-05, "loss": 0.1086, "num_input_tokens_seen": 15480928, "step": 11935 }, { "epoch": 0.5833923729020595, "grad_norm": 0.5609897375106812, "learning_rate": 3.979136086440722e-05, "loss": 0.0982, "num_input_tokens_seen": 15486976, "step": 11940 }, { "epoch": 0.5836366745657537, "grad_norm": 0.2578250765800476, "learning_rate": 3.9783444955762226e-05, "loss": 0.1145, "num_input_tokens_seen": 15493696, "step": 11945 }, { "epoch": 0.5838809762294481, "grad_norm": 0.657230019569397, "learning_rate": 3.977552676732424e-05, "loss": 0.106, "num_input_tokens_seen": 15500352, "step": 11950 }, { "epoch": 0.5841252778931425, "grad_norm": 0.14265140891075134, "learning_rate": 3.976760630031435e-05, "loss": 0.1002, "num_input_tokens_seen": 15506304, "step": 11955 }, { "epoch": 0.5843695795568368, "grad_norm": 0.2962074279785156, "learning_rate": 3.975968355595398e-05, "loss": 0.0806, "num_input_tokens_seen": 15512512, "step": 11960 }, { "epoch": 0.5846138812205312, "grad_norm": 0.1383495032787323, "learning_rate": 3.9751758535464935e-05, "loss": 0.0795, "num_input_tokens_seen": 15519136, "step": 11965 }, { "epoch": 0.5848581828842254, "grad_norm": 0.15405896306037903, "learning_rate": 3.9743831240069326e-05, "loss": 0.0772, "num_input_tokens_seen": 15525664, "step": 11970 }, { "epoch": 0.5851024845479198, "grad_norm": 0.183787539601326, "learning_rate": 3.9735901670989675e-05, "loss": 0.0874, "num_input_tokens_seen": 15532064, "step": 11975 }, { "epoch": 0.5853467862116141, "grad_norm": 0.15051864087581635, "learning_rate": 3.97279698294488e-05, "loss": 0.0819, "num_input_tokens_seen": 15538688, "step": 11980 }, { "epoch": 0.5855910878753084, "grad_norm": 0.19054274260997772, "learning_rate": 3.9720035716669876e-05, "loss": 0.0991, "num_input_tokens_seen": 15545344, "step": 11985 }, { "epoch": 0.5858353895390027, "grad_norm": 0.28477826714515686, "learning_rate": 3.9712099333876474e-05, "loss": 0.0932, "num_input_tokens_seen": 15551904, "step": 11990 }, { "epoch": 0.5860796912026971, "grad_norm": 0.4778057336807251, "learning_rate": 3.9704160682292475e-05, "loss": 0.1021, "num_input_tokens_seen": 15558400, "step": 11995 }, { "epoch": 0.5863239928663915, "grad_norm": 0.7212212681770325, "learning_rate": 3.9696219763142106e-05, "loss": 0.0965, "num_input_tokens_seen": 15565184, "step": 12000 }, { "epoch": 0.5863239928663915, "eval_loss": 0.09267397224903107, "eval_runtime": 374.9478, "eval_samples_per_second": 97.04, "eval_steps_per_second": 24.262, "num_input_tokens_seen": 15565184, "step": 12000 }, { "epoch": 0.5865682945300857, "grad_norm": 0.7905503511428833, "learning_rate": 3.968827657764997e-05, "loss": 0.0988, "num_input_tokens_seen": 15572096, "step": 12005 }, { "epoch": 0.5868125961937801, "grad_norm": 0.4832771122455597, "learning_rate": 3.9680331127041e-05, "loss": 0.0803, "num_input_tokens_seen": 15578688, "step": 12010 }, { "epoch": 0.5870568978574744, "grad_norm": 0.35747453570365906, "learning_rate": 3.9672383412540495e-05, "loss": 0.0907, "num_input_tokens_seen": 15584896, "step": 12015 }, { "epoch": 0.5873011995211688, "grad_norm": 0.17623093724250793, "learning_rate": 3.966443343537407e-05, "loss": 0.1166, "num_input_tokens_seen": 15591392, "step": 12020 }, { "epoch": 0.587545501184863, "grad_norm": 0.2647101879119873, "learning_rate": 3.965648119676772e-05, "loss": 0.0724, "num_input_tokens_seen": 15597792, "step": 12025 }, { "epoch": 0.5877898028485574, "grad_norm": 0.5593610405921936, "learning_rate": 3.96485266979478e-05, "loss": 0.1306, "num_input_tokens_seen": 15603968, "step": 12030 }, { "epoch": 0.5880341045122517, "grad_norm": 0.3718784749507904, "learning_rate": 3.9640569940140974e-05, "loss": 0.1, "num_input_tokens_seen": 15610400, "step": 12035 }, { "epoch": 0.5882784061759461, "grad_norm": 0.6256546974182129, "learning_rate": 3.963261092457428e-05, "loss": 0.0939, "num_input_tokens_seen": 15617056, "step": 12040 }, { "epoch": 0.5885227078396403, "grad_norm": 0.6325628757476807, "learning_rate": 3.962464965247509e-05, "loss": 0.0898, "num_input_tokens_seen": 15624096, "step": 12045 }, { "epoch": 0.5887670095033347, "grad_norm": 0.22285215556621552, "learning_rate": 3.9616686125071135e-05, "loss": 0.1112, "num_input_tokens_seen": 15630208, "step": 12050 }, { "epoch": 0.5890113111670291, "grad_norm": 0.12283826619386673, "learning_rate": 3.9608720343590506e-05, "loss": 0.0742, "num_input_tokens_seen": 15637184, "step": 12055 }, { "epoch": 0.5892556128307234, "grad_norm": 0.13977321982383728, "learning_rate": 3.960075230926161e-05, "loss": 0.0714, "num_input_tokens_seen": 15643552, "step": 12060 }, { "epoch": 0.5894999144944177, "grad_norm": 0.6943238377571106, "learning_rate": 3.959278202331322e-05, "loss": 0.1029, "num_input_tokens_seen": 15649984, "step": 12065 }, { "epoch": 0.589744216158112, "grad_norm": 0.33683180809020996, "learning_rate": 3.958480948697446e-05, "loss": 0.0941, "num_input_tokens_seen": 15656448, "step": 12070 }, { "epoch": 0.5899885178218064, "grad_norm": 0.9194881916046143, "learning_rate": 3.95768347014748e-05, "loss": 0.088, "num_input_tokens_seen": 15662912, "step": 12075 }, { "epoch": 0.5902328194855007, "grad_norm": 0.2500481605529785, "learning_rate": 3.956885766804404e-05, "loss": 0.0792, "num_input_tokens_seen": 15669024, "step": 12080 }, { "epoch": 0.590477121149195, "grad_norm": 0.13035139441490173, "learning_rate": 3.956087838791235e-05, "loss": 0.1036, "num_input_tokens_seen": 15675424, "step": 12085 }, { "epoch": 0.5907214228128893, "grad_norm": 0.26389098167419434, "learning_rate": 3.955289686231022e-05, "loss": 0.0941, "num_input_tokens_seen": 15681760, "step": 12090 }, { "epoch": 0.5909657244765837, "grad_norm": 0.5971890687942505, "learning_rate": 3.9544913092468504e-05, "loss": 0.1244, "num_input_tokens_seen": 15688416, "step": 12095 }, { "epoch": 0.5912100261402781, "grad_norm": 0.9304025173187256, "learning_rate": 3.9536927079618425e-05, "loss": 0.095, "num_input_tokens_seen": 15694880, "step": 12100 }, { "epoch": 0.5914543278039723, "grad_norm": 0.17429868876934052, "learning_rate": 3.9528938824991494e-05, "loss": 0.0868, "num_input_tokens_seen": 15701760, "step": 12105 }, { "epoch": 0.5916986294676667, "grad_norm": 0.18168805539608002, "learning_rate": 3.952094832981962e-05, "loss": 0.0962, "num_input_tokens_seen": 15708768, "step": 12110 }, { "epoch": 0.591942931131361, "grad_norm": 0.40679457783699036, "learning_rate": 3.951295559533503e-05, "loss": 0.0918, "num_input_tokens_seen": 15715136, "step": 12115 }, { "epoch": 0.5921872327950554, "grad_norm": 0.1987023800611496, "learning_rate": 3.95049606227703e-05, "loss": 0.084, "num_input_tokens_seen": 15721760, "step": 12120 }, { "epoch": 0.5924315344587496, "grad_norm": 0.21283994615077972, "learning_rate": 3.949696341335838e-05, "loss": 0.1099, "num_input_tokens_seen": 15727840, "step": 12125 }, { "epoch": 0.592675836122444, "grad_norm": 0.4501434862613678, "learning_rate": 3.9488963968332503e-05, "loss": 0.0796, "num_input_tokens_seen": 15734240, "step": 12130 }, { "epoch": 0.5929201377861383, "grad_norm": 0.2434212863445282, "learning_rate": 3.948096228892631e-05, "loss": 0.1087, "num_input_tokens_seen": 15740640, "step": 12135 }, { "epoch": 0.5931644394498327, "grad_norm": 0.31868797540664673, "learning_rate": 3.947295837637375e-05, "loss": 0.0968, "num_input_tokens_seen": 15746848, "step": 12140 }, { "epoch": 0.593408741113527, "grad_norm": 0.8966848254203796, "learning_rate": 3.9464952231909135e-05, "loss": 0.127, "num_input_tokens_seen": 15753344, "step": 12145 }, { "epoch": 0.5936530427772213, "grad_norm": 0.22218184173107147, "learning_rate": 3.945694385676711e-05, "loss": 0.0982, "num_input_tokens_seen": 15759648, "step": 12150 }, { "epoch": 0.5938973444409157, "grad_norm": 0.2016710489988327, "learning_rate": 3.944893325218265e-05, "loss": 0.0687, "num_input_tokens_seen": 15765760, "step": 12155 }, { "epoch": 0.59414164610461, "grad_norm": 0.2067076861858368, "learning_rate": 3.944092041939112e-05, "loss": 0.0889, "num_input_tokens_seen": 15772032, "step": 12160 }, { "epoch": 0.5943859477683043, "grad_norm": 0.09972681105136871, "learning_rate": 3.943290535962818e-05, "loss": 0.078, "num_input_tokens_seen": 15778400, "step": 12165 }, { "epoch": 0.5946302494319986, "grad_norm": 0.43038174510002136, "learning_rate": 3.942488807412985e-05, "loss": 0.0666, "num_input_tokens_seen": 15785056, "step": 12170 }, { "epoch": 0.594874551095693, "grad_norm": 0.4305626153945923, "learning_rate": 3.941686856413251e-05, "loss": 0.1027, "num_input_tokens_seen": 15791552, "step": 12175 }, { "epoch": 0.5951188527593873, "grad_norm": 0.3403131365776062, "learning_rate": 3.9408846830872874e-05, "loss": 0.0718, "num_input_tokens_seen": 15797632, "step": 12180 }, { "epoch": 0.5953631544230816, "grad_norm": 0.5085770487785339, "learning_rate": 3.940082287558798e-05, "loss": 0.1271, "num_input_tokens_seen": 15804448, "step": 12185 }, { "epoch": 0.5956074560867759, "grad_norm": 0.15173476934432983, "learning_rate": 3.939279669951522e-05, "loss": 0.0744, "num_input_tokens_seen": 15811264, "step": 12190 }, { "epoch": 0.5958517577504703, "grad_norm": 0.654689610004425, "learning_rate": 3.938476830389234e-05, "loss": 0.1077, "num_input_tokens_seen": 15817600, "step": 12195 }, { "epoch": 0.5960960594141647, "grad_norm": 0.1501881182193756, "learning_rate": 3.937673768995742e-05, "loss": 0.1025, "num_input_tokens_seen": 15824576, "step": 12200 }, { "epoch": 0.5960960594141647, "eval_loss": 0.09168458729982376, "eval_runtime": 375.1716, "eval_samples_per_second": 96.982, "eval_steps_per_second": 24.248, "num_input_tokens_seen": 15824576, "step": 12200 }, { "epoch": 0.5963403610778589, "grad_norm": 0.9294490814208984, "learning_rate": 3.936870485894888e-05, "loss": 0.1132, "num_input_tokens_seen": 15831360, "step": 12205 }, { "epoch": 0.5965846627415533, "grad_norm": 0.8025923371315002, "learning_rate": 3.9360669812105475e-05, "loss": 0.118, "num_input_tokens_seen": 15837856, "step": 12210 }, { "epoch": 0.5968289644052476, "grad_norm": 0.2542835772037506, "learning_rate": 3.9352632550666325e-05, "loss": 0.0826, "num_input_tokens_seen": 15844448, "step": 12215 }, { "epoch": 0.597073266068942, "grad_norm": 0.30764511227607727, "learning_rate": 3.9344593075870866e-05, "loss": 0.0825, "num_input_tokens_seen": 15850976, "step": 12220 }, { "epoch": 0.5973175677326362, "grad_norm": 0.14888215065002441, "learning_rate": 3.933655138895889e-05, "loss": 0.082, "num_input_tokens_seen": 15857472, "step": 12225 }, { "epoch": 0.5975618693963306, "grad_norm": 0.43498438596725464, "learning_rate": 3.932850749117053e-05, "loss": 0.1023, "num_input_tokens_seen": 15863744, "step": 12230 }, { "epoch": 0.5978061710600249, "grad_norm": 0.137593075633049, "learning_rate": 3.932046138374624e-05, "loss": 0.1122, "num_input_tokens_seen": 15870400, "step": 12235 }, { "epoch": 0.5980504727237193, "grad_norm": 0.3217637240886688, "learning_rate": 3.9312413067926854e-05, "loss": 0.1019, "num_input_tokens_seen": 15877280, "step": 12240 }, { "epoch": 0.5982947743874136, "grad_norm": 0.2537616193294525, "learning_rate": 3.9304362544953506e-05, "loss": 0.1121, "num_input_tokens_seen": 15883520, "step": 12245 }, { "epoch": 0.5985390760511079, "grad_norm": 0.6901839375495911, "learning_rate": 3.929630981606769e-05, "loss": 0.0681, "num_input_tokens_seen": 15889888, "step": 12250 }, { "epoch": 0.5987833777148023, "grad_norm": 0.39980995655059814, "learning_rate": 3.928825488251124e-05, "loss": 0.0866, "num_input_tokens_seen": 15896448, "step": 12255 }, { "epoch": 0.5990276793784965, "grad_norm": 0.6403158903121948, "learning_rate": 3.9280197745526344e-05, "loss": 0.1029, "num_input_tokens_seen": 15902432, "step": 12260 }, { "epoch": 0.5992719810421909, "grad_norm": 0.22509890794754028, "learning_rate": 3.9272138406355495e-05, "loss": 0.0699, "num_input_tokens_seen": 15908992, "step": 12265 }, { "epoch": 0.5995162827058852, "grad_norm": 0.7639783024787903, "learning_rate": 3.926407686624154e-05, "loss": 0.1013, "num_input_tokens_seen": 15915392, "step": 12270 }, { "epoch": 0.5997605843695796, "grad_norm": 0.3692246675491333, "learning_rate": 3.9256013126427684e-05, "loss": 0.106, "num_input_tokens_seen": 15921728, "step": 12275 }, { "epoch": 0.6000048860332738, "grad_norm": 0.87227463722229, "learning_rate": 3.9247947188157455e-05, "loss": 0.1334, "num_input_tokens_seen": 15928160, "step": 12280 }, { "epoch": 0.6002491876969682, "grad_norm": 0.6376311182975769, "learning_rate": 3.9239879052674715e-05, "loss": 0.0898, "num_input_tokens_seen": 15934464, "step": 12285 }, { "epoch": 0.6004934893606625, "grad_norm": 0.5167179107666016, "learning_rate": 3.9231808721223673e-05, "loss": 0.1023, "num_input_tokens_seen": 15940800, "step": 12290 }, { "epoch": 0.6007377910243569, "grad_norm": 0.24256843328475952, "learning_rate": 3.9223736195048886e-05, "loss": 0.111, "num_input_tokens_seen": 15947296, "step": 12295 }, { "epoch": 0.6009820926880513, "grad_norm": 1.1805012226104736, "learning_rate": 3.921566147539523e-05, "loss": 0.0994, "num_input_tokens_seen": 15953664, "step": 12300 }, { "epoch": 0.6012263943517455, "grad_norm": 0.297425776720047, "learning_rate": 3.920758456350792e-05, "loss": 0.1045, "num_input_tokens_seen": 15960608, "step": 12305 }, { "epoch": 0.6014706960154399, "grad_norm": 0.12519590556621552, "learning_rate": 3.919950546063253e-05, "loss": 0.0833, "num_input_tokens_seen": 15966912, "step": 12310 }, { "epoch": 0.6017149976791342, "grad_norm": 0.1443617194890976, "learning_rate": 3.919142416801496e-05, "loss": 0.1025, "num_input_tokens_seen": 15973120, "step": 12315 }, { "epoch": 0.6019592993428285, "grad_norm": 0.22512434422969818, "learning_rate": 3.918334068690144e-05, "loss": 0.0879, "num_input_tokens_seen": 15979360, "step": 12320 }, { "epoch": 0.6022036010065228, "grad_norm": 0.4305414855480194, "learning_rate": 3.917525501853855e-05, "loss": 0.0665, "num_input_tokens_seen": 15986400, "step": 12325 }, { "epoch": 0.6024479026702172, "grad_norm": 0.3615410327911377, "learning_rate": 3.916716716417319e-05, "loss": 0.0816, "num_input_tokens_seen": 15992992, "step": 12330 }, { "epoch": 0.6026922043339115, "grad_norm": 0.1860709935426712, "learning_rate": 3.915907712505263e-05, "loss": 0.0985, "num_input_tokens_seen": 15999040, "step": 12335 }, { "epoch": 0.6029365059976058, "grad_norm": 0.2682226002216339, "learning_rate": 3.915098490242444e-05, "loss": 0.0856, "num_input_tokens_seen": 16006048, "step": 12340 }, { "epoch": 0.6031808076613002, "grad_norm": 0.9268876910209656, "learning_rate": 3.914289049753654e-05, "loss": 0.0942, "num_input_tokens_seen": 16012224, "step": 12345 }, { "epoch": 0.6034251093249945, "grad_norm": 0.3085937798023224, "learning_rate": 3.913479391163719e-05, "loss": 0.0878, "num_input_tokens_seen": 16018816, "step": 12350 }, { "epoch": 0.6036694109886889, "grad_norm": 0.22755753993988037, "learning_rate": 3.9126695145975e-05, "loss": 0.1031, "num_input_tokens_seen": 16025248, "step": 12355 }, { "epoch": 0.6039137126523831, "grad_norm": 0.24096308648586273, "learning_rate": 3.911859420179889e-05, "loss": 0.1179, "num_input_tokens_seen": 16031744, "step": 12360 }, { "epoch": 0.6041580143160775, "grad_norm": 0.18788263201713562, "learning_rate": 3.911049108035813e-05, "loss": 0.1004, "num_input_tokens_seen": 16038944, "step": 12365 }, { "epoch": 0.6044023159797718, "grad_norm": 0.1944054365158081, "learning_rate": 3.910238578290232e-05, "loss": 0.0977, "num_input_tokens_seen": 16045216, "step": 12370 }, { "epoch": 0.6046466176434662, "grad_norm": 0.34498894214630127, "learning_rate": 3.90942783106814e-05, "loss": 0.0617, "num_input_tokens_seen": 16051584, "step": 12375 }, { "epoch": 0.6048909193071604, "grad_norm": 0.22888635098934174, "learning_rate": 3.908616866494564e-05, "loss": 0.0794, "num_input_tokens_seen": 16057664, "step": 12380 }, { "epoch": 0.6051352209708548, "grad_norm": 0.1802472025156021, "learning_rate": 3.907805684694566e-05, "loss": 0.0753, "num_input_tokens_seen": 16064032, "step": 12385 }, { "epoch": 0.6053795226345492, "grad_norm": 0.6095444560050964, "learning_rate": 3.90699428579324e-05, "loss": 0.0877, "num_input_tokens_seen": 16070528, "step": 12390 }, { "epoch": 0.6056238242982435, "grad_norm": 0.6045458316802979, "learning_rate": 3.906182669915713e-05, "loss": 0.101, "num_input_tokens_seen": 16076960, "step": 12395 }, { "epoch": 0.6058681259619378, "grad_norm": 0.22751376032829285, "learning_rate": 3.9053708371871476e-05, "loss": 0.1073, "num_input_tokens_seen": 16083104, "step": 12400 }, { "epoch": 0.6058681259619378, "eval_loss": 0.09226745367050171, "eval_runtime": 374.1568, "eval_samples_per_second": 97.245, "eval_steps_per_second": 24.313, "num_input_tokens_seen": 16083104, "step": 12400 }, { "epoch": 0.6061124276256321, "grad_norm": 0.209528386592865, "learning_rate": 3.904558787732738e-05, "loss": 0.089, "num_input_tokens_seen": 16089376, "step": 12405 }, { "epoch": 0.6063567292893265, "grad_norm": 0.2464532107114792, "learning_rate": 3.9037465216777135e-05, "loss": 0.0732, "num_input_tokens_seen": 16095808, "step": 12410 }, { "epoch": 0.6066010309530208, "grad_norm": 0.24705667793750763, "learning_rate": 3.902934039147334e-05, "loss": 0.0913, "num_input_tokens_seen": 16102112, "step": 12415 }, { "epoch": 0.6068453326167151, "grad_norm": 0.31995829939842224, "learning_rate": 3.902121340266894e-05, "loss": 0.08, "num_input_tokens_seen": 16108960, "step": 12420 }, { "epoch": 0.6070896342804094, "grad_norm": 0.1541624516248703, "learning_rate": 3.9013084251617246e-05, "loss": 0.0944, "num_input_tokens_seen": 16115488, "step": 12425 }, { "epoch": 0.6073339359441038, "grad_norm": 0.45704206824302673, "learning_rate": 3.9004952939571865e-05, "loss": 0.0826, "num_input_tokens_seen": 16121824, "step": 12430 }, { "epoch": 0.607578237607798, "grad_norm": 0.629039466381073, "learning_rate": 3.899681946778673e-05, "loss": 0.1024, "num_input_tokens_seen": 16128512, "step": 12435 }, { "epoch": 0.6078225392714924, "grad_norm": 0.6600524187088013, "learning_rate": 3.898868383751615e-05, "loss": 0.0827, "num_input_tokens_seen": 16135264, "step": 12440 }, { "epoch": 0.6080668409351868, "grad_norm": 0.25892460346221924, "learning_rate": 3.8980546050014724e-05, "loss": 0.0823, "num_input_tokens_seen": 16142176, "step": 12445 }, { "epoch": 0.6083111425988811, "grad_norm": 0.170340895652771, "learning_rate": 3.897240610653741e-05, "loss": 0.1035, "num_input_tokens_seen": 16148832, "step": 12450 }, { "epoch": 0.6085554442625755, "grad_norm": 0.19041474163532257, "learning_rate": 3.896426400833948e-05, "loss": 0.0713, "num_input_tokens_seen": 16154752, "step": 12455 }, { "epoch": 0.6087997459262697, "grad_norm": 0.14198502898216248, "learning_rate": 3.895611975667656e-05, "loss": 0.0906, "num_input_tokens_seen": 16161344, "step": 12460 }, { "epoch": 0.6090440475899641, "grad_norm": 0.7291654348373413, "learning_rate": 3.8947973352804584e-05, "loss": 0.0845, "num_input_tokens_seen": 16167424, "step": 12465 }, { "epoch": 0.6092883492536584, "grad_norm": 0.4434713125228882, "learning_rate": 3.893982479797984e-05, "loss": 0.0753, "num_input_tokens_seen": 16174112, "step": 12470 }, { "epoch": 0.6095326509173528, "grad_norm": 0.6162723898887634, "learning_rate": 3.8931674093458926e-05, "loss": 0.118, "num_input_tokens_seen": 16180640, "step": 12475 }, { "epoch": 0.609776952581047, "grad_norm": 0.14745227992534637, "learning_rate": 3.89235212404988e-05, "loss": 0.0899, "num_input_tokens_seen": 16187104, "step": 12480 }, { "epoch": 0.6100212542447414, "grad_norm": 0.309672474861145, "learning_rate": 3.891536624035672e-05, "loss": 0.0922, "num_input_tokens_seen": 16193280, "step": 12485 }, { "epoch": 0.6102655559084358, "grad_norm": 0.5635393261909485, "learning_rate": 3.8907209094290295e-05, "loss": 0.1118, "num_input_tokens_seen": 16199840, "step": 12490 }, { "epoch": 0.61050985757213, "grad_norm": 0.7906203269958496, "learning_rate": 3.8899049803557466e-05, "loss": 0.0931, "num_input_tokens_seen": 16206080, "step": 12495 }, { "epoch": 0.6107541592358244, "grad_norm": 0.6973445415496826, "learning_rate": 3.889088836941648e-05, "loss": 0.1029, "num_input_tokens_seen": 16213152, "step": 12500 }, { "epoch": 0.6109984608995187, "grad_norm": 0.29850509762763977, "learning_rate": 3.8882724793125946e-05, "loss": 0.0716, "num_input_tokens_seen": 16219296, "step": 12505 }, { "epoch": 0.6112427625632131, "grad_norm": 0.6751330494880676, "learning_rate": 3.8874559075944794e-05, "loss": 0.1116, "num_input_tokens_seen": 16225792, "step": 12510 }, { "epoch": 0.6114870642269074, "grad_norm": 0.20806682109832764, "learning_rate": 3.886639121913227e-05, "loss": 0.0816, "num_input_tokens_seen": 16232576, "step": 12515 }, { "epoch": 0.6117313658906017, "grad_norm": 0.3186153173446655, "learning_rate": 3.885822122394797e-05, "loss": 0.0891, "num_input_tokens_seen": 16239328, "step": 12520 }, { "epoch": 0.611975667554296, "grad_norm": 0.2319234311580658, "learning_rate": 3.8850049091651794e-05, "loss": 0.0832, "num_input_tokens_seen": 16245728, "step": 12525 }, { "epoch": 0.6122199692179904, "grad_norm": 0.1984570473432541, "learning_rate": 3.8841874823504e-05, "loss": 0.0859, "num_input_tokens_seen": 16252288, "step": 12530 }, { "epoch": 0.6124642708816848, "grad_norm": 0.23969027400016785, "learning_rate": 3.8833698420765157e-05, "loss": 0.1094, "num_input_tokens_seen": 16258528, "step": 12535 }, { "epoch": 0.612708572545379, "grad_norm": 0.556422770023346, "learning_rate": 3.882551988469618e-05, "loss": 0.082, "num_input_tokens_seen": 16264992, "step": 12540 }, { "epoch": 0.6129528742090734, "grad_norm": 0.11791528016328812, "learning_rate": 3.881733921655829e-05, "loss": 0.0917, "num_input_tokens_seen": 16271296, "step": 12545 }, { "epoch": 0.6131971758727677, "grad_norm": 0.4621013402938843, "learning_rate": 3.8809156417613054e-05, "loss": 0.123, "num_input_tokens_seen": 16277280, "step": 12550 }, { "epoch": 0.613441477536462, "grad_norm": 0.1234906017780304, "learning_rate": 3.8800971489122364e-05, "loss": 0.0896, "num_input_tokens_seen": 16283840, "step": 12555 }, { "epoch": 0.6136857792001563, "grad_norm": 0.42022469639778137, "learning_rate": 3.8792784432348434e-05, "loss": 0.0827, "num_input_tokens_seen": 16290528, "step": 12560 }, { "epoch": 0.6139300808638507, "grad_norm": 0.8562715649604797, "learning_rate": 3.878459524855381e-05, "loss": 0.1069, "num_input_tokens_seen": 16296928, "step": 12565 }, { "epoch": 0.614174382527545, "grad_norm": 0.22351230680942535, "learning_rate": 3.8776403939001384e-05, "loss": 0.1032, "num_input_tokens_seen": 16303904, "step": 12570 }, { "epoch": 0.6144186841912394, "grad_norm": 0.3422057628631592, "learning_rate": 3.876821050495433e-05, "loss": 0.1094, "num_input_tokens_seen": 16310496, "step": 12575 }, { "epoch": 0.6146629858549336, "grad_norm": 0.30471929907798767, "learning_rate": 3.87600149476762e-05, "loss": 0.0946, "num_input_tokens_seen": 16316960, "step": 12580 }, { "epoch": 0.614907287518628, "grad_norm": 0.30641886591911316, "learning_rate": 3.8751817268430843e-05, "loss": 0.094, "num_input_tokens_seen": 16323456, "step": 12585 }, { "epoch": 0.6151515891823224, "grad_norm": 0.3273295760154724, "learning_rate": 3.8743617468482464e-05, "loss": 0.0936, "num_input_tokens_seen": 16329952, "step": 12590 }, { "epoch": 0.6153958908460166, "grad_norm": 0.5431983470916748, "learning_rate": 3.8735415549095535e-05, "loss": 0.0832, "num_input_tokens_seen": 16336384, "step": 12595 }, { "epoch": 0.615640192509711, "grad_norm": 0.16992658376693726, "learning_rate": 3.8727211511534934e-05, "loss": 0.0747, "num_input_tokens_seen": 16342784, "step": 12600 }, { "epoch": 0.615640192509711, "eval_loss": 0.09166111797094345, "eval_runtime": 374.7091, "eval_samples_per_second": 97.102, "eval_steps_per_second": 24.277, "num_input_tokens_seen": 16342784, "step": 12600 }, { "epoch": 0.6158844941734053, "grad_norm": 0.6655529737472534, "learning_rate": 3.8719005357065804e-05, "loss": 0.1083, "num_input_tokens_seen": 16349088, "step": 12605 }, { "epoch": 0.6161287958370997, "grad_norm": 0.311154305934906, "learning_rate": 3.8710797086953645e-05, "loss": 0.0849, "num_input_tokens_seen": 16355456, "step": 12610 }, { "epoch": 0.6163730975007939, "grad_norm": 0.287954717874527, "learning_rate": 3.870258670246427e-05, "loss": 0.091, "num_input_tokens_seen": 16361984, "step": 12615 }, { "epoch": 0.6166173991644883, "grad_norm": 0.13947848975658417, "learning_rate": 3.869437420486384e-05, "loss": 0.0729, "num_input_tokens_seen": 16368128, "step": 12620 }, { "epoch": 0.6168617008281826, "grad_norm": 0.2130604237318039, "learning_rate": 3.8686159595418805e-05, "loss": 0.0777, "num_input_tokens_seen": 16374720, "step": 12625 }, { "epoch": 0.617106002491877, "grad_norm": 0.48382309079170227, "learning_rate": 3.867794287539597e-05, "loss": 0.1069, "num_input_tokens_seen": 16380896, "step": 12630 }, { "epoch": 0.6173503041555713, "grad_norm": 0.2106802612543106, "learning_rate": 3.866972404606245e-05, "loss": 0.1205, "num_input_tokens_seen": 16387296, "step": 12635 }, { "epoch": 0.6175946058192656, "grad_norm": 0.15757234394550323, "learning_rate": 3.866150310868571e-05, "loss": 0.0863, "num_input_tokens_seen": 16394400, "step": 12640 }, { "epoch": 0.61783890748296, "grad_norm": 0.13507656753063202, "learning_rate": 3.8653280064533506e-05, "loss": 0.1051, "num_input_tokens_seen": 16400896, "step": 12645 }, { "epoch": 0.6180832091466543, "grad_norm": 0.30368858575820923, "learning_rate": 3.864505491487394e-05, "loss": 0.0856, "num_input_tokens_seen": 16407616, "step": 12650 }, { "epoch": 0.6183275108103486, "grad_norm": 0.16521385312080383, "learning_rate": 3.8636827660975414e-05, "loss": 0.1066, "num_input_tokens_seen": 16414272, "step": 12655 }, { "epoch": 0.6185718124740429, "grad_norm": 0.584850549697876, "learning_rate": 3.862859830410671e-05, "loss": 0.0798, "num_input_tokens_seen": 16421088, "step": 12660 }, { "epoch": 0.6188161141377373, "grad_norm": 0.535429835319519, "learning_rate": 3.862036684553688e-05, "loss": 0.1006, "num_input_tokens_seen": 16427392, "step": 12665 }, { "epoch": 0.6190604158014316, "grad_norm": 0.2887762486934662, "learning_rate": 3.8612133286535314e-05, "loss": 0.1005, "num_input_tokens_seen": 16433440, "step": 12670 }, { "epoch": 0.6193047174651259, "grad_norm": 0.15402893722057343, "learning_rate": 3.860389762837173e-05, "loss": 0.088, "num_input_tokens_seen": 16440032, "step": 12675 }, { "epoch": 0.6195490191288203, "grad_norm": 0.4091295003890991, "learning_rate": 3.859565987231618e-05, "loss": 0.0767, "num_input_tokens_seen": 16446688, "step": 12680 }, { "epoch": 0.6197933207925146, "grad_norm": 0.5242190957069397, "learning_rate": 3.858742001963902e-05, "loss": 0.0876, "num_input_tokens_seen": 16453184, "step": 12685 }, { "epoch": 0.620037622456209, "grad_norm": 0.22940169274806976, "learning_rate": 3.857917807161094e-05, "loss": 0.0822, "num_input_tokens_seen": 16459936, "step": 12690 }, { "epoch": 0.6202819241199032, "grad_norm": 0.15002425014972687, "learning_rate": 3.857093402950296e-05, "loss": 0.0601, "num_input_tokens_seen": 16466240, "step": 12695 }, { "epoch": 0.6205262257835976, "grad_norm": 0.15781185030937195, "learning_rate": 3.8562687894586414e-05, "loss": 0.0891, "num_input_tokens_seen": 16472896, "step": 12700 }, { "epoch": 0.6207705274472919, "grad_norm": 0.17996856570243835, "learning_rate": 3.8554439668132946e-05, "loss": 0.0806, "num_input_tokens_seen": 16478848, "step": 12705 }, { "epoch": 0.6210148291109863, "grad_norm": 0.5540077090263367, "learning_rate": 3.854618935141455e-05, "loss": 0.105, "num_input_tokens_seen": 16485312, "step": 12710 }, { "epoch": 0.6212591307746805, "grad_norm": 0.14914070069789886, "learning_rate": 3.8537936945703525e-05, "loss": 0.0973, "num_input_tokens_seen": 16491552, "step": 12715 }, { "epoch": 0.6215034324383749, "grad_norm": 0.43102937936782837, "learning_rate": 3.852968245227249e-05, "loss": 0.0837, "num_input_tokens_seen": 16498144, "step": 12720 }, { "epoch": 0.6217477341020692, "grad_norm": 0.5699617862701416, "learning_rate": 3.85214258723944e-05, "loss": 0.0829, "num_input_tokens_seen": 16504096, "step": 12725 }, { "epoch": 0.6219920357657636, "grad_norm": 0.4511668086051941, "learning_rate": 3.8513167207342524e-05, "loss": 0.098, "num_input_tokens_seen": 16510656, "step": 12730 }, { "epoch": 0.6222363374294579, "grad_norm": 0.22578531503677368, "learning_rate": 3.850490645839044e-05, "loss": 0.0966, "num_input_tokens_seen": 16517120, "step": 12735 }, { "epoch": 0.6224806390931522, "grad_norm": 0.0948222279548645, "learning_rate": 3.849664362681207e-05, "loss": 0.0929, "num_input_tokens_seen": 16523616, "step": 12740 }, { "epoch": 0.6227249407568466, "grad_norm": 0.1808909922838211, "learning_rate": 3.848837871388165e-05, "loss": 0.0901, "num_input_tokens_seen": 16529696, "step": 12745 }, { "epoch": 0.6229692424205409, "grad_norm": 0.09755673259496689, "learning_rate": 3.848011172087371e-05, "loss": 0.0931, "num_input_tokens_seen": 16536064, "step": 12750 }, { "epoch": 0.6232135440842352, "grad_norm": 0.6575576066970825, "learning_rate": 3.847184264906315e-05, "loss": 0.1099, "num_input_tokens_seen": 16542848, "step": 12755 }, { "epoch": 0.6234578457479295, "grad_norm": 0.44585201144218445, "learning_rate": 3.846357149972516e-05, "loss": 0.0751, "num_input_tokens_seen": 16549440, "step": 12760 }, { "epoch": 0.6237021474116239, "grad_norm": 0.15959005057811737, "learning_rate": 3.8455298274135246e-05, "loss": 0.1037, "num_input_tokens_seen": 16556160, "step": 12765 }, { "epoch": 0.6239464490753182, "grad_norm": 0.2968493700027466, "learning_rate": 3.8447022973569254e-05, "loss": 0.0731, "num_input_tokens_seen": 16562976, "step": 12770 }, { "epoch": 0.6241907507390125, "grad_norm": 0.4754473567008972, "learning_rate": 3.843874559930332e-05, "loss": 0.1094, "num_input_tokens_seen": 16569632, "step": 12775 }, { "epoch": 0.6244350524027069, "grad_norm": 0.13031668961048126, "learning_rate": 3.843046615261394e-05, "loss": 0.077, "num_input_tokens_seen": 16575840, "step": 12780 }, { "epoch": 0.6246793540664012, "grad_norm": 0.20322288572788239, "learning_rate": 3.842218463477791e-05, "loss": 0.1098, "num_input_tokens_seen": 16582240, "step": 12785 }, { "epoch": 0.6249236557300956, "grad_norm": 0.2494005709886551, "learning_rate": 3.841390104707233e-05, "loss": 0.0814, "num_input_tokens_seen": 16588608, "step": 12790 }, { "epoch": 0.6251679573937898, "grad_norm": 0.1577315628528595, "learning_rate": 3.8405615390774643e-05, "loss": 0.1024, "num_input_tokens_seen": 16595168, "step": 12795 }, { "epoch": 0.6254122590574842, "grad_norm": 0.1680424064397812, "learning_rate": 3.839732766716259e-05, "loss": 0.0969, "num_input_tokens_seen": 16601824, "step": 12800 }, { "epoch": 0.6254122590574842, "eval_loss": 0.09168284386396408, "eval_runtime": 375.0394, "eval_samples_per_second": 97.016, "eval_steps_per_second": 24.256, "num_input_tokens_seen": 16601824, "step": 12800 }, { "epoch": 0.6256565607211785, "grad_norm": 0.5110305547714233, "learning_rate": 3.838903787751425e-05, "loss": 0.0864, "num_input_tokens_seen": 16607968, "step": 12805 }, { "epoch": 0.6259008623848729, "grad_norm": 0.3108300268650055, "learning_rate": 3.838074602310802e-05, "loss": 0.0889, "num_input_tokens_seen": 16613984, "step": 12810 }, { "epoch": 0.6261451640485671, "grad_norm": 0.5571780204772949, "learning_rate": 3.837245210522258e-05, "loss": 0.0996, "num_input_tokens_seen": 16620160, "step": 12815 }, { "epoch": 0.6263894657122615, "grad_norm": 0.314265638589859, "learning_rate": 3.8364156125136996e-05, "loss": 0.1178, "num_input_tokens_seen": 16626912, "step": 12820 }, { "epoch": 0.6266337673759559, "grad_norm": 0.25960573554039, "learning_rate": 3.835585808413059e-05, "loss": 0.1031, "num_input_tokens_seen": 16633472, "step": 12825 }, { "epoch": 0.6268780690396502, "grad_norm": 0.22053369879722595, "learning_rate": 3.8347557983483024e-05, "loss": 0.1087, "num_input_tokens_seen": 16639616, "step": 12830 }, { "epoch": 0.6271223707033445, "grad_norm": 0.24432966113090515, "learning_rate": 3.833925582447428e-05, "loss": 0.1148, "num_input_tokens_seen": 16645792, "step": 12835 }, { "epoch": 0.6273666723670388, "grad_norm": 0.1807572841644287, "learning_rate": 3.8330951608384656e-05, "loss": 0.0894, "num_input_tokens_seen": 16652256, "step": 12840 }, { "epoch": 0.6276109740307332, "grad_norm": 0.3334534168243408, "learning_rate": 3.832264533649477e-05, "loss": 0.0892, "num_input_tokens_seen": 16658336, "step": 12845 }, { "epoch": 0.6278552756944275, "grad_norm": 0.18132387101650238, "learning_rate": 3.8314337010085555e-05, "loss": 0.1042, "num_input_tokens_seen": 16664640, "step": 12850 }, { "epoch": 0.6280995773581218, "grad_norm": 0.30001771450042725, "learning_rate": 3.830602663043824e-05, "loss": 0.1127, "num_input_tokens_seen": 16671424, "step": 12855 }, { "epoch": 0.6283438790218161, "grad_norm": 0.3135262429714203, "learning_rate": 3.8297714198834414e-05, "loss": 0.1015, "num_input_tokens_seen": 16677856, "step": 12860 }, { "epoch": 0.6285881806855105, "grad_norm": 0.37816697359085083, "learning_rate": 3.828939971655595e-05, "loss": 0.1167, "num_input_tokens_seen": 16684256, "step": 12865 }, { "epoch": 0.6288324823492047, "grad_norm": 0.39070892333984375, "learning_rate": 3.828108318488505e-05, "loss": 0.061, "num_input_tokens_seen": 16690688, "step": 12870 }, { "epoch": 0.6290767840128991, "grad_norm": 0.1642666757106781, "learning_rate": 3.8272764605104216e-05, "loss": 0.0783, "num_input_tokens_seen": 16697312, "step": 12875 }, { "epoch": 0.6293210856765935, "grad_norm": 0.19160225987434387, "learning_rate": 3.826444397849628e-05, "loss": 0.0999, "num_input_tokens_seen": 16703360, "step": 12880 }, { "epoch": 0.6295653873402878, "grad_norm": 0.7718209028244019, "learning_rate": 3.825612130634439e-05, "loss": 0.0947, "num_input_tokens_seen": 16709824, "step": 12885 }, { "epoch": 0.6298096890039822, "grad_norm": 0.43919840455055237, "learning_rate": 3.824779658993202e-05, "loss": 0.092, "num_input_tokens_seen": 16716256, "step": 12890 }, { "epoch": 0.6300539906676764, "grad_norm": 0.9774385094642639, "learning_rate": 3.823946983054292e-05, "loss": 0.1022, "num_input_tokens_seen": 16722720, "step": 12895 }, { "epoch": 0.6302982923313708, "grad_norm": 0.19761453568935394, "learning_rate": 3.82311410294612e-05, "loss": 0.0945, "num_input_tokens_seen": 16729504, "step": 12900 }, { "epoch": 0.6305425939950651, "grad_norm": 0.137911856174469, "learning_rate": 3.822281018797127e-05, "loss": 0.088, "num_input_tokens_seen": 16735808, "step": 12905 }, { "epoch": 0.6307868956587595, "grad_norm": 0.6000447273254395, "learning_rate": 3.821447730735783e-05, "loss": 0.1019, "num_input_tokens_seen": 16742880, "step": 12910 }, { "epoch": 0.6310311973224537, "grad_norm": 0.1545678675174713, "learning_rate": 3.820614238890592e-05, "loss": 0.067, "num_input_tokens_seen": 16749152, "step": 12915 }, { "epoch": 0.6312754989861481, "grad_norm": 0.7676807045936584, "learning_rate": 3.819780543390091e-05, "loss": 0.0857, "num_input_tokens_seen": 16755712, "step": 12920 }, { "epoch": 0.6315198006498425, "grad_norm": 0.09326354414224625, "learning_rate": 3.818946644362844e-05, "loss": 0.0687, "num_input_tokens_seen": 16762176, "step": 12925 }, { "epoch": 0.6317641023135367, "grad_norm": 0.4150613248348236, "learning_rate": 3.81811254193745e-05, "loss": 0.0956, "num_input_tokens_seen": 16768672, "step": 12930 }, { "epoch": 0.6320084039772311, "grad_norm": 0.6420544981956482, "learning_rate": 3.8172782362425366e-05, "loss": 0.1068, "num_input_tokens_seen": 16774976, "step": 12935 }, { "epoch": 0.6322527056409254, "grad_norm": 0.1338803470134735, "learning_rate": 3.816443727406765e-05, "loss": 0.0885, "num_input_tokens_seen": 16781408, "step": 12940 }, { "epoch": 0.6324970073046198, "grad_norm": 0.5520262122154236, "learning_rate": 3.815609015558829e-05, "loss": 0.0962, "num_input_tokens_seen": 16788064, "step": 12945 }, { "epoch": 0.632741308968314, "grad_norm": 0.39080291986465454, "learning_rate": 3.814774100827448e-05, "loss": 0.0933, "num_input_tokens_seen": 16794336, "step": 12950 }, { "epoch": 0.6329856106320084, "grad_norm": 0.4075186252593994, "learning_rate": 3.813938983341379e-05, "loss": 0.0723, "num_input_tokens_seen": 16800992, "step": 12955 }, { "epoch": 0.6332299122957027, "grad_norm": 0.2542790174484253, "learning_rate": 3.813103663229407e-05, "loss": 0.1179, "num_input_tokens_seen": 16807104, "step": 12960 }, { "epoch": 0.6334742139593971, "grad_norm": 0.4988100230693817, "learning_rate": 3.812268140620349e-05, "loss": 0.0873, "num_input_tokens_seen": 16813664, "step": 12965 }, { "epoch": 0.6337185156230913, "grad_norm": 0.13067732751369476, "learning_rate": 3.811432415643051e-05, "loss": 0.0788, "num_input_tokens_seen": 16819936, "step": 12970 }, { "epoch": 0.6339628172867857, "grad_norm": 0.6855308413505554, "learning_rate": 3.8105964884263954e-05, "loss": 0.072, "num_input_tokens_seen": 16825792, "step": 12975 }, { "epoch": 0.6342071189504801, "grad_norm": 0.4257001578807831, "learning_rate": 3.809760359099291e-05, "loss": 0.0881, "num_input_tokens_seen": 16832704, "step": 12980 }, { "epoch": 0.6344514206141744, "grad_norm": 0.6819767951965332, "learning_rate": 3.8089240277906804e-05, "loss": 0.0937, "num_input_tokens_seen": 16839456, "step": 12985 }, { "epoch": 0.6346957222778687, "grad_norm": 0.22256027162075043, "learning_rate": 3.808087494629535e-05, "loss": 0.1093, "num_input_tokens_seen": 16846752, "step": 12990 }, { "epoch": 0.634940023941563, "grad_norm": 0.20687082409858704, "learning_rate": 3.8072507597448595e-05, "loss": 0.0901, "num_input_tokens_seen": 16853600, "step": 12995 }, { "epoch": 0.6351843256052574, "grad_norm": 0.21211811900138855, "learning_rate": 3.806413823265689e-05, "loss": 0.1037, "num_input_tokens_seen": 16860320, "step": 13000 }, { "epoch": 0.6351843256052574, "eval_loss": 0.0914095863699913, "eval_runtime": 374.8652, "eval_samples_per_second": 97.062, "eval_steps_per_second": 24.267, "num_input_tokens_seen": 16860320, "step": 13000 }, { "epoch": 0.6354286272689517, "grad_norm": 0.3025308847427368, "learning_rate": 3.805576685321089e-05, "loss": 0.0795, "num_input_tokens_seen": 16867168, "step": 13005 }, { "epoch": 0.635672928932646, "grad_norm": 0.19717144966125488, "learning_rate": 3.804739346040158e-05, "loss": 0.1071, "num_input_tokens_seen": 16874080, "step": 13010 }, { "epoch": 0.6359172305963403, "grad_norm": 0.8871094584465027, "learning_rate": 3.8039018055520234e-05, "loss": 0.0984, "num_input_tokens_seen": 16880544, "step": 13015 }, { "epoch": 0.6361615322600347, "grad_norm": 0.5290582776069641, "learning_rate": 3.803064063985844e-05, "loss": 0.1418, "num_input_tokens_seen": 16886944, "step": 13020 }, { "epoch": 0.6364058339237291, "grad_norm": 0.14247602224349976, "learning_rate": 3.802226121470811e-05, "loss": 0.0991, "num_input_tokens_seen": 16893280, "step": 13025 }, { "epoch": 0.6366501355874233, "grad_norm": 0.157369926571846, "learning_rate": 3.801387978136145e-05, "loss": 0.1006, "num_input_tokens_seen": 16899456, "step": 13030 }, { "epoch": 0.6368944372511177, "grad_norm": 0.3260760009288788, "learning_rate": 3.800549634111099e-05, "loss": 0.0776, "num_input_tokens_seen": 16905856, "step": 13035 }, { "epoch": 0.637138738914812, "grad_norm": 0.2783550024032593, "learning_rate": 3.799711089524955e-05, "loss": 0.0775, "num_input_tokens_seen": 16912064, "step": 13040 }, { "epoch": 0.6373830405785064, "grad_norm": 0.16734884679317474, "learning_rate": 3.7988723445070285e-05, "loss": 0.0525, "num_input_tokens_seen": 16919008, "step": 13045 }, { "epoch": 0.6376273422422006, "grad_norm": 0.30986669659614563, "learning_rate": 3.798033399186663e-05, "loss": 0.1183, "num_input_tokens_seen": 16925152, "step": 13050 }, { "epoch": 0.637871643905895, "grad_norm": 0.3461357355117798, "learning_rate": 3.797194253693237e-05, "loss": 0.0895, "num_input_tokens_seen": 16931520, "step": 13055 }, { "epoch": 0.6381159455695893, "grad_norm": 0.28050097823143005, "learning_rate": 3.796354908156153e-05, "loss": 0.0992, "num_input_tokens_seen": 16938208, "step": 13060 }, { "epoch": 0.6383602472332837, "grad_norm": 0.2840797007083893, "learning_rate": 3.795515362704853e-05, "loss": 0.0958, "num_input_tokens_seen": 16944704, "step": 13065 }, { "epoch": 0.638604548896978, "grad_norm": 0.16139103472232819, "learning_rate": 3.794675617468803e-05, "loss": 0.0681, "num_input_tokens_seen": 16951360, "step": 13070 }, { "epoch": 0.6388488505606723, "grad_norm": 0.356642484664917, "learning_rate": 3.793835672577503e-05, "loss": 0.0827, "num_input_tokens_seen": 16957856, "step": 13075 }, { "epoch": 0.6390931522243667, "grad_norm": 0.1885504573583603, "learning_rate": 3.7929955281604826e-05, "loss": 0.0979, "num_input_tokens_seen": 16963968, "step": 13080 }, { "epoch": 0.639337453888061, "grad_norm": 0.6439441442489624, "learning_rate": 3.7921551843473036e-05, "loss": 0.1001, "num_input_tokens_seen": 16970304, "step": 13085 }, { "epoch": 0.6395817555517553, "grad_norm": 0.5494210720062256, "learning_rate": 3.791314641267557e-05, "loss": 0.081, "num_input_tokens_seen": 16976448, "step": 13090 }, { "epoch": 0.6398260572154496, "grad_norm": 0.26195284724235535, "learning_rate": 3.790473899050864e-05, "loss": 0.0704, "num_input_tokens_seen": 16983104, "step": 13095 }, { "epoch": 0.640070358879144, "grad_norm": 0.1290615200996399, "learning_rate": 3.7896329578268794e-05, "loss": 0.0599, "num_input_tokens_seen": 16989888, "step": 13100 }, { "epoch": 0.6403146605428383, "grad_norm": 0.32096487283706665, "learning_rate": 3.7887918177252855e-05, "loss": 0.0815, "num_input_tokens_seen": 16996192, "step": 13105 }, { "epoch": 0.6405589622065326, "grad_norm": 0.25007325410842896, "learning_rate": 3.787950478875798e-05, "loss": 0.1143, "num_input_tokens_seen": 17002400, "step": 13110 }, { "epoch": 0.6408032638702269, "grad_norm": 0.16266098618507385, "learning_rate": 3.787108941408162e-05, "loss": 0.0887, "num_input_tokens_seen": 17008544, "step": 13115 }, { "epoch": 0.6410475655339213, "grad_norm": 0.3113645911216736, "learning_rate": 3.786267205452151e-05, "loss": 0.1048, "num_input_tokens_seen": 17014880, "step": 13120 }, { "epoch": 0.6412918671976157, "grad_norm": 0.5568233728408813, "learning_rate": 3.785425271137573e-05, "loss": 0.0897, "num_input_tokens_seen": 17021728, "step": 13125 }, { "epoch": 0.6415361688613099, "grad_norm": 0.4425252676010132, "learning_rate": 3.7845831385942655e-05, "loss": 0.0807, "num_input_tokens_seen": 17028000, "step": 13130 }, { "epoch": 0.6417804705250043, "grad_norm": 0.5690284371376038, "learning_rate": 3.7837408079520944e-05, "loss": 0.0871, "num_input_tokens_seen": 17033952, "step": 13135 }, { "epoch": 0.6420247721886986, "grad_norm": 0.6908925175666809, "learning_rate": 3.782898279340957e-05, "loss": 0.0809, "num_input_tokens_seen": 17040448, "step": 13140 }, { "epoch": 0.642269073852393, "grad_norm": 0.12892325222492218, "learning_rate": 3.782055552890784e-05, "loss": 0.0746, "num_input_tokens_seen": 17046464, "step": 13145 }, { "epoch": 0.6425133755160872, "grad_norm": 0.39195457100868225, "learning_rate": 3.781212628731534e-05, "loss": 0.0876, "num_input_tokens_seen": 17053248, "step": 13150 }, { "epoch": 0.6427576771797816, "grad_norm": 0.6459270119667053, "learning_rate": 3.7803695069931946e-05, "loss": 0.0838, "num_input_tokens_seen": 17060352, "step": 13155 }, { "epoch": 0.6430019788434759, "grad_norm": 0.2990695536136627, "learning_rate": 3.779526187805789e-05, "loss": 0.0951, "num_input_tokens_seen": 17066592, "step": 13160 }, { "epoch": 0.6432462805071703, "grad_norm": 0.1765921711921692, "learning_rate": 3.778682671299364e-05, "loss": 0.1067, "num_input_tokens_seen": 17073408, "step": 13165 }, { "epoch": 0.6434905821708646, "grad_norm": 0.14913052320480347, "learning_rate": 3.777838957604003e-05, "loss": 0.0702, "num_input_tokens_seen": 17080064, "step": 13170 }, { "epoch": 0.6437348838345589, "grad_norm": 0.37214887142181396, "learning_rate": 3.776995046849816e-05, "loss": 0.0951, "num_input_tokens_seen": 17086144, "step": 13175 }, { "epoch": 0.6439791854982533, "grad_norm": 0.4317687153816223, "learning_rate": 3.776150939166945e-05, "loss": 0.0689, "num_input_tokens_seen": 17092672, "step": 13180 }, { "epoch": 0.6442234871619476, "grad_norm": 0.30357155203819275, "learning_rate": 3.775306634685562e-05, "loss": 0.1069, "num_input_tokens_seen": 17099008, "step": 13185 }, { "epoch": 0.6444677888256419, "grad_norm": 0.24657060205936432, "learning_rate": 3.7744621335358696e-05, "loss": 0.0913, "num_input_tokens_seen": 17105632, "step": 13190 }, { "epoch": 0.6447120904893362, "grad_norm": 0.42941153049468994, "learning_rate": 3.7736174358481e-05, "loss": 0.0878, "num_input_tokens_seen": 17112192, "step": 13195 }, { "epoch": 0.6449563921530306, "grad_norm": 0.550466001033783, "learning_rate": 3.7727725417525175e-05, "loss": 0.094, "num_input_tokens_seen": 17118528, "step": 13200 }, { "epoch": 0.6449563921530306, "eval_loss": 0.09316424280405045, "eval_runtime": 374.8102, "eval_samples_per_second": 97.076, "eval_steps_per_second": 24.271, "num_input_tokens_seen": 17118528, "step": 13200 }, { "epoch": 0.6452006938167248, "grad_norm": 0.21675170958042145, "learning_rate": 3.771927451379414e-05, "loss": 0.0866, "num_input_tokens_seen": 17124896, "step": 13205 }, { "epoch": 0.6454449954804192, "grad_norm": 0.7382897734642029, "learning_rate": 3.7710821648591135e-05, "loss": 0.0751, "num_input_tokens_seen": 17131520, "step": 13210 }, { "epoch": 0.6456892971441136, "grad_norm": 0.15012700855731964, "learning_rate": 3.7702366823219694e-05, "loss": 0.0718, "num_input_tokens_seen": 17137920, "step": 13215 }, { "epoch": 0.6459335988078079, "grad_norm": 0.5015268921852112, "learning_rate": 3.769391003898366e-05, "loss": 0.0892, "num_input_tokens_seen": 17144288, "step": 13220 }, { "epoch": 0.6461779004715023, "grad_norm": 0.5168539881706238, "learning_rate": 3.768545129718718e-05, "loss": 0.0745, "num_input_tokens_seen": 17150624, "step": 13225 }, { "epoch": 0.6464222021351965, "grad_norm": 0.13585211336612701, "learning_rate": 3.7676990599134686e-05, "loss": 0.1007, "num_input_tokens_seen": 17157344, "step": 13230 }, { "epoch": 0.6466665037988909, "grad_norm": 0.1357048600912094, "learning_rate": 3.766852794613095e-05, "loss": 0.0693, "num_input_tokens_seen": 17164000, "step": 13235 }, { "epoch": 0.6469108054625852, "grad_norm": 0.5546624064445496, "learning_rate": 3.766006333948099e-05, "loss": 0.0898, "num_input_tokens_seen": 17170304, "step": 13240 }, { "epoch": 0.6471551071262795, "grad_norm": 0.2213313728570938, "learning_rate": 3.765159678049017e-05, "loss": 0.0911, "num_input_tokens_seen": 17177056, "step": 13245 }, { "epoch": 0.6473994087899738, "grad_norm": 0.2467799037694931, "learning_rate": 3.7643128270464134e-05, "loss": 0.0717, "num_input_tokens_seen": 17183680, "step": 13250 }, { "epoch": 0.6476437104536682, "grad_norm": 0.6307225227355957, "learning_rate": 3.763465781070884e-05, "loss": 0.0984, "num_input_tokens_seen": 17190112, "step": 13255 }, { "epoch": 0.6478880121173625, "grad_norm": 0.1761166900396347, "learning_rate": 3.762618540253052e-05, "loss": 0.0974, "num_input_tokens_seen": 17196576, "step": 13260 }, { "epoch": 0.6481323137810568, "grad_norm": 0.2252129316329956, "learning_rate": 3.761771104723576e-05, "loss": 0.0996, "num_input_tokens_seen": 17202528, "step": 13265 }, { "epoch": 0.6483766154447512, "grad_norm": 0.16342508792877197, "learning_rate": 3.7609234746131386e-05, "loss": 0.0945, "num_input_tokens_seen": 17209024, "step": 13270 }, { "epoch": 0.6486209171084455, "grad_norm": 0.5850774645805359, "learning_rate": 3.7600756500524556e-05, "loss": 0.0774, "num_input_tokens_seen": 17215424, "step": 13275 }, { "epoch": 0.6488652187721399, "grad_norm": 0.3311155438423157, "learning_rate": 3.759227631172271e-05, "loss": 0.1233, "num_input_tokens_seen": 17222176, "step": 13280 }, { "epoch": 0.6491095204358341, "grad_norm": 0.23856449127197266, "learning_rate": 3.758379418103363e-05, "loss": 0.0909, "num_input_tokens_seen": 17229408, "step": 13285 }, { "epoch": 0.6493538220995285, "grad_norm": 0.444612056016922, "learning_rate": 3.757531010976534e-05, "loss": 0.0976, "num_input_tokens_seen": 17235936, "step": 13290 }, { "epoch": 0.6495981237632228, "grad_norm": 0.25381800532341003, "learning_rate": 3.75668240992262e-05, "loss": 0.0976, "num_input_tokens_seen": 17242848, "step": 13295 }, { "epoch": 0.6498424254269172, "grad_norm": 0.3440360724925995, "learning_rate": 3.7558336150724865e-05, "loss": 0.0746, "num_input_tokens_seen": 17249408, "step": 13300 }, { "epoch": 0.6500867270906114, "grad_norm": 0.1465148627758026, "learning_rate": 3.754984626557028e-05, "loss": 0.0834, "num_input_tokens_seen": 17255936, "step": 13305 }, { "epoch": 0.6503310287543058, "grad_norm": 0.36427557468414307, "learning_rate": 3.754135444507168e-05, "loss": 0.0991, "num_input_tokens_seen": 17262624, "step": 13310 }, { "epoch": 0.6505753304180002, "grad_norm": 0.379330575466156, "learning_rate": 3.753286069053863e-05, "loss": 0.0849, "num_input_tokens_seen": 17269152, "step": 13315 }, { "epoch": 0.6508196320816945, "grad_norm": 0.4018440544605255, "learning_rate": 3.7524365003280945e-05, "loss": 0.0979, "num_input_tokens_seen": 17275808, "step": 13320 }, { "epoch": 0.6510639337453888, "grad_norm": 0.19800329208374023, "learning_rate": 3.75158673846088e-05, "loss": 0.1086, "num_input_tokens_seen": 17282368, "step": 13325 }, { "epoch": 0.6513082354090831, "grad_norm": 0.2462194710969925, "learning_rate": 3.750736783583262e-05, "loss": 0.0852, "num_input_tokens_seen": 17288864, "step": 13330 }, { "epoch": 0.6515525370727775, "grad_norm": 0.5398600697517395, "learning_rate": 3.7498866358263144e-05, "loss": 0.0847, "num_input_tokens_seen": 17294976, "step": 13335 }, { "epoch": 0.6517968387364718, "grad_norm": 0.4459443986415863, "learning_rate": 3.74903629532114e-05, "loss": 0.0805, "num_input_tokens_seen": 17301632, "step": 13340 }, { "epoch": 0.6520411404001661, "grad_norm": 0.11975347250699997, "learning_rate": 3.748185762198873e-05, "loss": 0.1083, "num_input_tokens_seen": 17308000, "step": 13345 }, { "epoch": 0.6522854420638604, "grad_norm": 0.3129933774471283, "learning_rate": 3.747335036590676e-05, "loss": 0.085, "num_input_tokens_seen": 17314208, "step": 13350 }, { "epoch": 0.6525297437275548, "grad_norm": 0.3971010148525238, "learning_rate": 3.7464841186277405e-05, "loss": 0.0915, "num_input_tokens_seen": 17321216, "step": 13355 }, { "epoch": 0.6527740453912492, "grad_norm": 0.3000963032245636, "learning_rate": 3.7456330084412896e-05, "loss": 0.0796, "num_input_tokens_seen": 17327840, "step": 13360 }, { "epoch": 0.6530183470549434, "grad_norm": 0.30097660422325134, "learning_rate": 3.744781706162576e-05, "loss": 0.076, "num_input_tokens_seen": 17334144, "step": 13365 }, { "epoch": 0.6532626487186378, "grad_norm": 0.4029676616191864, "learning_rate": 3.743930211922879e-05, "loss": 0.0836, "num_input_tokens_seen": 17340448, "step": 13370 }, { "epoch": 0.6535069503823321, "grad_norm": 0.13987791538238525, "learning_rate": 3.743078525853513e-05, "loss": 0.0988, "num_input_tokens_seen": 17346848, "step": 13375 }, { "epoch": 0.6537512520460265, "grad_norm": 0.6524109244346619, "learning_rate": 3.7422266480858154e-05, "loss": 0.0958, "num_input_tokens_seen": 17353216, "step": 13380 }, { "epoch": 0.6539955537097207, "grad_norm": 0.2365979701280594, "learning_rate": 3.741374578751158e-05, "loss": 0.0769, "num_input_tokens_seen": 17359840, "step": 13385 }, { "epoch": 0.6542398553734151, "grad_norm": 0.12652714550495148, "learning_rate": 3.740522317980941e-05, "loss": 0.1111, "num_input_tokens_seen": 17365376, "step": 13390 }, { "epoch": 0.6544841570371094, "grad_norm": 0.5391887426376343, "learning_rate": 3.739669865906593e-05, "loss": 0.0807, "num_input_tokens_seen": 17372096, "step": 13395 }, { "epoch": 0.6547284587008038, "grad_norm": 0.23741842806339264, "learning_rate": 3.738817222659573e-05, "loss": 0.0959, "num_input_tokens_seen": 17378528, "step": 13400 }, { "epoch": 0.6547284587008038, "eval_loss": 0.09149932861328125, "eval_runtime": 375.1556, "eval_samples_per_second": 96.986, "eval_steps_per_second": 24.249, "num_input_tokens_seen": 17378528, "step": 13400 }, { "epoch": 0.654972760364498, "grad_norm": 0.7261647582054138, "learning_rate": 3.73796438837137e-05, "loss": 0.0705, "num_input_tokens_seen": 17384832, "step": 13405 }, { "epoch": 0.6552170620281924, "grad_norm": 0.19420339167118073, "learning_rate": 3.7371113631735e-05, "loss": 0.0866, "num_input_tokens_seen": 17391360, "step": 13410 }, { "epoch": 0.6554613636918868, "grad_norm": 0.11665099114179611, "learning_rate": 3.736258147197512e-05, "loss": 0.0799, "num_input_tokens_seen": 17397632, "step": 13415 }, { "epoch": 0.6557056653555811, "grad_norm": 0.34081804752349854, "learning_rate": 3.735404740574981e-05, "loss": 0.091, "num_input_tokens_seen": 17404160, "step": 13420 }, { "epoch": 0.6559499670192754, "grad_norm": 0.33065974712371826, "learning_rate": 3.7345511434375145e-05, "loss": 0.0855, "num_input_tokens_seen": 17410656, "step": 13425 }, { "epoch": 0.6561942686829697, "grad_norm": 0.1834324449300766, "learning_rate": 3.733697355916748e-05, "loss": 0.0982, "num_input_tokens_seen": 17416832, "step": 13430 }, { "epoch": 0.6564385703466641, "grad_norm": 0.19426120817661285, "learning_rate": 3.732843378144345e-05, "loss": 0.0694, "num_input_tokens_seen": 17423104, "step": 13435 }, { "epoch": 0.6566828720103584, "grad_norm": 0.17684213817119598, "learning_rate": 3.7319892102519995e-05, "loss": 0.1002, "num_input_tokens_seen": 17429664, "step": 13440 }, { "epoch": 0.6569271736740527, "grad_norm": 0.4362488090991974, "learning_rate": 3.731134852371436e-05, "loss": 0.0709, "num_input_tokens_seen": 17436096, "step": 13445 }, { "epoch": 0.657171475337747, "grad_norm": 0.38274338841438293, "learning_rate": 3.730280304634408e-05, "loss": 0.0892, "num_input_tokens_seen": 17442688, "step": 13450 }, { "epoch": 0.6574157770014414, "grad_norm": 0.2792942523956299, "learning_rate": 3.729425567172696e-05, "loss": 0.0923, "num_input_tokens_seen": 17449024, "step": 13455 }, { "epoch": 0.6576600786651358, "grad_norm": 0.3775821924209595, "learning_rate": 3.728570640118111e-05, "loss": 0.0965, "num_input_tokens_seen": 17455360, "step": 13460 }, { "epoch": 0.65790438032883, "grad_norm": 0.20083549618721008, "learning_rate": 3.727715523602494e-05, "loss": 0.1069, "num_input_tokens_seen": 17461632, "step": 13465 }, { "epoch": 0.6581486819925244, "grad_norm": 0.20377883315086365, "learning_rate": 3.726860217757715e-05, "loss": 0.0801, "num_input_tokens_seen": 17468416, "step": 13470 }, { "epoch": 0.6583929836562187, "grad_norm": 0.21586334705352783, "learning_rate": 3.726004722715673e-05, "loss": 0.0915, "num_input_tokens_seen": 17474720, "step": 13475 }, { "epoch": 0.6586372853199131, "grad_norm": 0.3967972695827484, "learning_rate": 3.725149038608296e-05, "loss": 0.0843, "num_input_tokens_seen": 17481280, "step": 13480 }, { "epoch": 0.6588815869836073, "grad_norm": 0.11854197829961777, "learning_rate": 3.7242931655675404e-05, "loss": 0.0902, "num_input_tokens_seen": 17488032, "step": 13485 }, { "epoch": 0.6591258886473017, "grad_norm": 0.22518350183963776, "learning_rate": 3.7234371037253937e-05, "loss": 0.1301, "num_input_tokens_seen": 17494272, "step": 13490 }, { "epoch": 0.659370190310996, "grad_norm": 0.15271109342575073, "learning_rate": 3.7225808532138705e-05, "loss": 0.0843, "num_input_tokens_seen": 17500544, "step": 13495 }, { "epoch": 0.6596144919746904, "grad_norm": 0.3049396276473999, "learning_rate": 3.721724414165016e-05, "loss": 0.0895, "num_input_tokens_seen": 17506816, "step": 13500 }, { "epoch": 0.6598587936383846, "grad_norm": 0.43541958928108215, "learning_rate": 3.720867786710904e-05, "loss": 0.0928, "num_input_tokens_seen": 17513536, "step": 13505 }, { "epoch": 0.660103095302079, "grad_norm": 0.2108832150697708, "learning_rate": 3.7200109709836366e-05, "loss": 0.1078, "num_input_tokens_seen": 17520512, "step": 13510 }, { "epoch": 0.6603473969657734, "grad_norm": 0.20485278964042664, "learning_rate": 3.7191539671153465e-05, "loss": 0.0844, "num_input_tokens_seen": 17526944, "step": 13515 }, { "epoch": 0.6605916986294676, "grad_norm": 0.20135021209716797, "learning_rate": 3.718296775238193e-05, "loss": 0.0895, "num_input_tokens_seen": 17533920, "step": 13520 }, { "epoch": 0.660836000293162, "grad_norm": 0.42099905014038086, "learning_rate": 3.7174393954843675e-05, "loss": 0.0864, "num_input_tokens_seen": 17540544, "step": 13525 }, { "epoch": 0.6610803019568563, "grad_norm": 0.7034563422203064, "learning_rate": 3.716581827986087e-05, "loss": 0.0897, "num_input_tokens_seen": 17546880, "step": 13530 }, { "epoch": 0.6613246036205507, "grad_norm": 0.22490549087524414, "learning_rate": 3.7157240728756004e-05, "loss": 0.0571, "num_input_tokens_seen": 17554080, "step": 13535 }, { "epoch": 0.661568905284245, "grad_norm": 0.13131584227085114, "learning_rate": 3.714866130285184e-05, "loss": 0.0824, "num_input_tokens_seen": 17560000, "step": 13540 }, { "epoch": 0.6618132069479393, "grad_norm": 0.44588255882263184, "learning_rate": 3.714008000347143e-05, "loss": 0.0823, "num_input_tokens_seen": 17566976, "step": 13545 }, { "epoch": 0.6620575086116336, "grad_norm": 0.5887045860290527, "learning_rate": 3.7131496831938126e-05, "loss": 0.0793, "num_input_tokens_seen": 17573024, "step": 13550 }, { "epoch": 0.662301810275328, "grad_norm": 0.23817835748195648, "learning_rate": 3.7122911789575565e-05, "loss": 0.0829, "num_input_tokens_seen": 17579840, "step": 13555 }, { "epoch": 0.6625461119390224, "grad_norm": 0.23353496193885803, "learning_rate": 3.711432487770765e-05, "loss": 0.0903, "num_input_tokens_seen": 17586432, "step": 13560 }, { "epoch": 0.6627904136027166, "grad_norm": 0.2114877998828888, "learning_rate": 3.710573609765861e-05, "loss": 0.094, "num_input_tokens_seen": 17592640, "step": 13565 }, { "epoch": 0.663034715266411, "grad_norm": 0.1485747992992401, "learning_rate": 3.709714545075292e-05, "loss": 0.0695, "num_input_tokens_seen": 17598976, "step": 13570 }, { "epoch": 0.6632790169301053, "grad_norm": 0.17074623703956604, "learning_rate": 3.708855293831538e-05, "loss": 0.0715, "num_input_tokens_seen": 17605920, "step": 13575 }, { "epoch": 0.6635233185937996, "grad_norm": 0.8698530197143555, "learning_rate": 3.707995856167107e-05, "loss": 0.1348, "num_input_tokens_seen": 17612544, "step": 13580 }, { "epoch": 0.6637676202574939, "grad_norm": 0.26659196615219116, "learning_rate": 3.707136232214534e-05, "loss": 0.082, "num_input_tokens_seen": 17618944, "step": 13585 }, { "epoch": 0.6640119219211883, "grad_norm": 0.33232006430625916, "learning_rate": 3.7062764221063844e-05, "loss": 0.0736, "num_input_tokens_seen": 17625696, "step": 13590 }, { "epoch": 0.6642562235848826, "grad_norm": 0.2281588464975357, "learning_rate": 3.705416425975252e-05, "loss": 0.0823, "num_input_tokens_seen": 17632160, "step": 13595 }, { "epoch": 0.664500525248577, "grad_norm": 0.33457720279693604, "learning_rate": 3.704556243953758e-05, "loss": 0.0789, "num_input_tokens_seen": 17638400, "step": 13600 }, { "epoch": 0.664500525248577, "eval_loss": 0.09151474386453629, "eval_runtime": 375.1958, "eval_samples_per_second": 96.976, "eval_steps_per_second": 24.246, "num_input_tokens_seen": 17638400, "step": 13600 }, { "epoch": 0.6647448269122713, "grad_norm": 0.20642180740833282, "learning_rate": 3.7036958761745535e-05, "loss": 0.1247, "num_input_tokens_seen": 17644832, "step": 13605 }, { "epoch": 0.6649891285759656, "grad_norm": 0.5054627060890198, "learning_rate": 3.702835322770318e-05, "loss": 0.1202, "num_input_tokens_seen": 17651424, "step": 13610 }, { "epoch": 0.66523343023966, "grad_norm": 0.23646168410778046, "learning_rate": 3.701974583873761e-05, "loss": 0.0929, "num_input_tokens_seen": 17657728, "step": 13615 }, { "epoch": 0.6654777319033542, "grad_norm": 0.7947539687156677, "learning_rate": 3.701113659617618e-05, "loss": 0.0865, "num_input_tokens_seen": 17664064, "step": 13620 }, { "epoch": 0.6657220335670486, "grad_norm": 0.4131574034690857, "learning_rate": 3.7002525501346535e-05, "loss": 0.1165, "num_input_tokens_seen": 17671104, "step": 13625 }, { "epoch": 0.6659663352307429, "grad_norm": 0.47595709562301636, "learning_rate": 3.699391255557664e-05, "loss": 0.0738, "num_input_tokens_seen": 17677696, "step": 13630 }, { "epoch": 0.6662106368944373, "grad_norm": 0.3991321325302124, "learning_rate": 3.69852977601947e-05, "loss": 0.1247, "num_input_tokens_seen": 17684000, "step": 13635 }, { "epoch": 0.6664549385581315, "grad_norm": 0.3851282298564911, "learning_rate": 3.697668111652922e-05, "loss": 0.0891, "num_input_tokens_seen": 17690624, "step": 13640 }, { "epoch": 0.6666992402218259, "grad_norm": 0.33704426884651184, "learning_rate": 3.6968062625909005e-05, "loss": 0.0702, "num_input_tokens_seen": 17696992, "step": 13645 }, { "epoch": 0.6669435418855202, "grad_norm": 0.5164620280265808, "learning_rate": 3.6959442289663135e-05, "loss": 0.0909, "num_input_tokens_seen": 17703296, "step": 13650 }, { "epoch": 0.6671878435492146, "grad_norm": 0.2782554030418396, "learning_rate": 3.695082010912098e-05, "loss": 0.1039, "num_input_tokens_seen": 17709888, "step": 13655 }, { "epoch": 0.667432145212909, "grad_norm": 0.41910940408706665, "learning_rate": 3.694219608561217e-05, "loss": 0.0924, "num_input_tokens_seen": 17716416, "step": 13660 }, { "epoch": 0.6676764468766032, "grad_norm": 0.09225679188966751, "learning_rate": 3.693357022046665e-05, "loss": 0.0755, "num_input_tokens_seen": 17722752, "step": 13665 }, { "epoch": 0.6679207485402976, "grad_norm": 0.7355268597602844, "learning_rate": 3.6924942515014644e-05, "loss": 0.0984, "num_input_tokens_seen": 17728992, "step": 13670 }, { "epoch": 0.6681650502039919, "grad_norm": 0.28308990597724915, "learning_rate": 3.691631297058664e-05, "loss": 0.0868, "num_input_tokens_seen": 17735456, "step": 13675 }, { "epoch": 0.6684093518676862, "grad_norm": 0.1436704695224762, "learning_rate": 3.6907681588513424e-05, "loss": 0.064, "num_input_tokens_seen": 17742176, "step": 13680 }, { "epoch": 0.6686536535313805, "grad_norm": 0.40694332122802734, "learning_rate": 3.689904837012606e-05, "loss": 0.0862, "num_input_tokens_seen": 17748672, "step": 13685 }, { "epoch": 0.6688979551950749, "grad_norm": 0.1865580528974533, "learning_rate": 3.689041331675591e-05, "loss": 0.0774, "num_input_tokens_seen": 17754880, "step": 13690 }, { "epoch": 0.6691422568587692, "grad_norm": 0.3478519022464752, "learning_rate": 3.688177642973461e-05, "loss": 0.1032, "num_input_tokens_seen": 17761344, "step": 13695 }, { "epoch": 0.6693865585224635, "grad_norm": 0.2946246266365051, "learning_rate": 3.687313771039406e-05, "loss": 0.0949, "num_input_tokens_seen": 17767616, "step": 13700 }, { "epoch": 0.6696308601861579, "grad_norm": 0.24723418056964874, "learning_rate": 3.686449716006647e-05, "loss": 0.0609, "num_input_tokens_seen": 17774048, "step": 13705 }, { "epoch": 0.6698751618498522, "grad_norm": 0.7139374017715454, "learning_rate": 3.685585478008432e-05, "loss": 0.0836, "num_input_tokens_seen": 17780576, "step": 13710 }, { "epoch": 0.6701194635135466, "grad_norm": 0.7951187491416931, "learning_rate": 3.6847210571780364e-05, "loss": 0.0778, "num_input_tokens_seen": 17787456, "step": 13715 }, { "epoch": 0.6703637651772408, "grad_norm": 0.2695467472076416, "learning_rate": 3.683856453648767e-05, "loss": 0.1033, "num_input_tokens_seen": 17794272, "step": 13720 }, { "epoch": 0.6706080668409352, "grad_norm": 0.28834211826324463, "learning_rate": 3.682991667553954e-05, "loss": 0.0916, "num_input_tokens_seen": 17801120, "step": 13725 }, { "epoch": 0.6708523685046295, "grad_norm": 0.34814226627349854, "learning_rate": 3.6821266990269606e-05, "loss": 0.0818, "num_input_tokens_seen": 17807680, "step": 13730 }, { "epoch": 0.6710966701683239, "grad_norm": 0.20115046203136444, "learning_rate": 3.681261548201174e-05, "loss": 0.0854, "num_input_tokens_seen": 17814400, "step": 13735 }, { "epoch": 0.6713409718320181, "grad_norm": 0.46516984701156616, "learning_rate": 3.6803962152100125e-05, "loss": 0.0849, "num_input_tokens_seen": 17820960, "step": 13740 }, { "epoch": 0.6715852734957125, "grad_norm": 0.24181291460990906, "learning_rate": 3.67953070018692e-05, "loss": 0.0827, "num_input_tokens_seen": 17827200, "step": 13745 }, { "epoch": 0.6718295751594069, "grad_norm": 0.6584663391113281, "learning_rate": 3.678665003265371e-05, "loss": 0.0866, "num_input_tokens_seen": 17833824, "step": 13750 }, { "epoch": 0.6720738768231012, "grad_norm": 0.31225094199180603, "learning_rate": 3.677799124578867e-05, "loss": 0.085, "num_input_tokens_seen": 17840640, "step": 13755 }, { "epoch": 0.6723181784867955, "grad_norm": 0.5168839693069458, "learning_rate": 3.676933064260937e-05, "loss": 0.1, "num_input_tokens_seen": 17847264, "step": 13760 }, { "epoch": 0.6725624801504898, "grad_norm": 0.34474217891693115, "learning_rate": 3.6760668224451365e-05, "loss": 0.0857, "num_input_tokens_seen": 17853888, "step": 13765 }, { "epoch": 0.6728067818141842, "grad_norm": 0.17055214941501617, "learning_rate": 3.675200399265054e-05, "loss": 0.0516, "num_input_tokens_seen": 17860544, "step": 13770 }, { "epoch": 0.6730510834778785, "grad_norm": 0.7492336630821228, "learning_rate": 3.6743337948543014e-05, "loss": 0.0905, "num_input_tokens_seen": 17866720, "step": 13775 }, { "epoch": 0.6732953851415728, "grad_norm": 0.5107656717300415, "learning_rate": 3.6734670093465204e-05, "loss": 0.0957, "num_input_tokens_seen": 17873248, "step": 13780 }, { "epoch": 0.6735396868052671, "grad_norm": 0.757636308670044, "learning_rate": 3.672600042875379e-05, "loss": 0.1024, "num_input_tokens_seen": 17879616, "step": 13785 }, { "epoch": 0.6737839884689615, "grad_norm": 0.2579326927661896, "learning_rate": 3.671732895574575e-05, "loss": 0.1013, "num_input_tokens_seen": 17885856, "step": 13790 }, { "epoch": 0.6740282901326557, "grad_norm": 0.18815292418003082, "learning_rate": 3.670865567577834e-05, "loss": 0.1231, "num_input_tokens_seen": 17892352, "step": 13795 }, { "epoch": 0.6742725917963501, "grad_norm": 0.19103500247001648, "learning_rate": 3.669998059018909e-05, "loss": 0.0712, "num_input_tokens_seen": 17898336, "step": 13800 }, { "epoch": 0.6742725917963501, "eval_loss": 0.0909789577126503, "eval_runtime": 375.7709, "eval_samples_per_second": 96.828, "eval_steps_per_second": 24.209, "num_input_tokens_seen": 17898336, "step": 13800 }, { "epoch": 0.6745168934600445, "grad_norm": 0.3266575336456299, "learning_rate": 3.6691303700315796e-05, "loss": 0.0924, "num_input_tokens_seen": 17905184, "step": 13805 }, { "epoch": 0.6747611951237388, "grad_norm": 0.2474757730960846, "learning_rate": 3.668262500749655e-05, "loss": 0.1062, "num_input_tokens_seen": 17911488, "step": 13810 }, { "epoch": 0.6750054967874332, "grad_norm": 0.5566434264183044, "learning_rate": 3.667394451306971e-05, "loss": 0.0857, "num_input_tokens_seen": 17918048, "step": 13815 }, { "epoch": 0.6752497984511274, "grad_norm": 0.5366958975791931, "learning_rate": 3.666526221837393e-05, "loss": 0.1057, "num_input_tokens_seen": 17924736, "step": 13820 }, { "epoch": 0.6754941001148218, "grad_norm": 0.5638468861579895, "learning_rate": 3.665657812474812e-05, "loss": 0.0775, "num_input_tokens_seen": 17931072, "step": 13825 }, { "epoch": 0.6757384017785161, "grad_norm": 0.13964639604091644, "learning_rate": 3.664789223353147e-05, "loss": 0.0803, "num_input_tokens_seen": 17937280, "step": 13830 }, { "epoch": 0.6759827034422105, "grad_norm": 0.19442877173423767, "learning_rate": 3.663920454606347e-05, "loss": 0.0778, "num_input_tokens_seen": 17943776, "step": 13835 }, { "epoch": 0.6762270051059047, "grad_norm": 0.2579020857810974, "learning_rate": 3.6630515063683856e-05, "loss": 0.0817, "num_input_tokens_seen": 17950496, "step": 13840 }, { "epoch": 0.6764713067695991, "grad_norm": 0.25397223234176636, "learning_rate": 3.662182378773267e-05, "loss": 0.0762, "num_input_tokens_seen": 17956704, "step": 13845 }, { "epoch": 0.6767156084332935, "grad_norm": 0.22204312682151794, "learning_rate": 3.66131307195502e-05, "loss": 0.0947, "num_input_tokens_seen": 17963200, "step": 13850 }, { "epoch": 0.6769599100969877, "grad_norm": 0.15394088625907898, "learning_rate": 3.6604435860477034e-05, "loss": 0.0822, "num_input_tokens_seen": 17970272, "step": 13855 }, { "epoch": 0.6772042117606821, "grad_norm": 0.16768690943717957, "learning_rate": 3.6595739211854025e-05, "loss": 0.0876, "num_input_tokens_seen": 17977088, "step": 13860 }, { "epoch": 0.6774485134243764, "grad_norm": 0.22794368863105774, "learning_rate": 3.658704077502231e-05, "loss": 0.1055, "num_input_tokens_seen": 17983552, "step": 13865 }, { "epoch": 0.6776928150880708, "grad_norm": 0.17109490931034088, "learning_rate": 3.65783405513233e-05, "loss": 0.0734, "num_input_tokens_seen": 17990272, "step": 13870 }, { "epoch": 0.677937116751765, "grad_norm": 0.19843000173568726, "learning_rate": 3.656963854209867e-05, "loss": 0.0824, "num_input_tokens_seen": 17996832, "step": 13875 }, { "epoch": 0.6781814184154594, "grad_norm": 0.3117630183696747, "learning_rate": 3.656093474869038e-05, "loss": 0.1178, "num_input_tokens_seen": 18003360, "step": 13880 }, { "epoch": 0.6784257200791537, "grad_norm": 0.28296035528182983, "learning_rate": 3.655222917244068e-05, "loss": 0.0904, "num_input_tokens_seen": 18009984, "step": 13885 }, { "epoch": 0.6786700217428481, "grad_norm": 0.20409132540225983, "learning_rate": 3.6543521814692054e-05, "loss": 0.1111, "num_input_tokens_seen": 18016448, "step": 13890 }, { "epoch": 0.6789143234065425, "grad_norm": 0.2211407721042633, "learning_rate": 3.653481267678731e-05, "loss": 0.09, "num_input_tokens_seen": 18023232, "step": 13895 }, { "epoch": 0.6791586250702367, "grad_norm": 0.1822749376296997, "learning_rate": 3.652610176006949e-05, "loss": 0.0758, "num_input_tokens_seen": 18029408, "step": 13900 }, { "epoch": 0.6794029267339311, "grad_norm": 0.3055925667285919, "learning_rate": 3.6517389065881925e-05, "loss": 0.0948, "num_input_tokens_seen": 18035808, "step": 13905 }, { "epoch": 0.6796472283976254, "grad_norm": 0.22040928900241852, "learning_rate": 3.650867459556824e-05, "loss": 0.0653, "num_input_tokens_seen": 18042784, "step": 13910 }, { "epoch": 0.6798915300613197, "grad_norm": 0.3332790732383728, "learning_rate": 3.64999583504723e-05, "loss": 0.0804, "num_input_tokens_seen": 18049184, "step": 13915 }, { "epoch": 0.680135831725014, "grad_norm": 0.26554253697395325, "learning_rate": 3.649124033193827e-05, "loss": 0.1081, "num_input_tokens_seen": 18055360, "step": 13920 }, { "epoch": 0.6803801333887084, "grad_norm": 0.32013970613479614, "learning_rate": 3.648252054131057e-05, "loss": 0.0771, "num_input_tokens_seen": 18061856, "step": 13925 }, { "epoch": 0.6806244350524027, "grad_norm": 0.4788517951965332, "learning_rate": 3.647379897993391e-05, "loss": 0.091, "num_input_tokens_seen": 18068544, "step": 13930 }, { "epoch": 0.680868736716097, "grad_norm": 0.41597244143486023, "learning_rate": 3.646507564915325e-05, "loss": 0.0864, "num_input_tokens_seen": 18074944, "step": 13935 }, { "epoch": 0.6811130383797913, "grad_norm": 0.45164451003074646, "learning_rate": 3.645635055031385e-05, "loss": 0.1004, "num_input_tokens_seen": 18081248, "step": 13940 }, { "epoch": 0.6813573400434857, "grad_norm": 0.8120595812797546, "learning_rate": 3.6447623684761224e-05, "loss": 0.1183, "num_input_tokens_seen": 18087552, "step": 13945 }, { "epoch": 0.6816016417071801, "grad_norm": 0.5767908096313477, "learning_rate": 3.643889505384117e-05, "loss": 0.1042, "num_input_tokens_seen": 18093632, "step": 13950 }, { "epoch": 0.6818459433708743, "grad_norm": 0.22189800441265106, "learning_rate": 3.6430164658899744e-05, "loss": 0.0953, "num_input_tokens_seen": 18099904, "step": 13955 }, { "epoch": 0.6820902450345687, "grad_norm": 0.6417896151542664, "learning_rate": 3.642143250128329e-05, "loss": 0.0851, "num_input_tokens_seen": 18106400, "step": 13960 }, { "epoch": 0.682334546698263, "grad_norm": 0.3787507712841034, "learning_rate": 3.641269858233841e-05, "loss": 0.0997, "num_input_tokens_seen": 18113440, "step": 13965 }, { "epoch": 0.6825788483619574, "grad_norm": 0.3887023627758026, "learning_rate": 3.640396290341199e-05, "loss": 0.0961, "num_input_tokens_seen": 18120064, "step": 13970 }, { "epoch": 0.6828231500256516, "grad_norm": 0.13544847071170807, "learning_rate": 3.639522546585118e-05, "loss": 0.0912, "num_input_tokens_seen": 18126464, "step": 13975 }, { "epoch": 0.683067451689346, "grad_norm": 0.40716874599456787, "learning_rate": 3.6386486271003404e-05, "loss": 0.1154, "num_input_tokens_seen": 18133184, "step": 13980 }, { "epoch": 0.6833117533530403, "grad_norm": 0.21533383429050446, "learning_rate": 3.6377745320216346e-05, "loss": 0.137, "num_input_tokens_seen": 18139264, "step": 13985 }, { "epoch": 0.6835560550167347, "grad_norm": 0.2456529438495636, "learning_rate": 3.636900261483798e-05, "loss": 0.0934, "num_input_tokens_seen": 18145728, "step": 13990 }, { "epoch": 0.683800356680429, "grad_norm": 0.46813318133354187, "learning_rate": 3.636025815621654e-05, "loss": 0.0998, "num_input_tokens_seen": 18152032, "step": 13995 }, { "epoch": 0.6840446583441233, "grad_norm": 0.21410417556762695, "learning_rate": 3.635151194570054e-05, "loss": 0.1007, "num_input_tokens_seen": 18158528, "step": 14000 }, { "epoch": 0.6840446583441233, "eval_loss": 0.09177669137716293, "eval_runtime": 374.8766, "eval_samples_per_second": 97.059, "eval_steps_per_second": 24.267, "num_input_tokens_seen": 18158528, "step": 14000 }, { "epoch": 0.6842889600078177, "grad_norm": 0.8299993276596069, "learning_rate": 3.634276398463873e-05, "loss": 0.1169, "num_input_tokens_seen": 18164768, "step": 14005 }, { "epoch": 0.684533261671512, "grad_norm": 0.21373413503170013, "learning_rate": 3.633401427438018e-05, "loss": 0.088, "num_input_tokens_seen": 18171648, "step": 14010 }, { "epoch": 0.6847775633352063, "grad_norm": 0.12998953461647034, "learning_rate": 3.63252628162742e-05, "loss": 0.0653, "num_input_tokens_seen": 18178016, "step": 14015 }, { "epoch": 0.6850218649989006, "grad_norm": 0.14578045904636383, "learning_rate": 3.6316509611670364e-05, "loss": 0.0858, "num_input_tokens_seen": 18184768, "step": 14020 }, { "epoch": 0.685266166662595, "grad_norm": 0.5632151365280151, "learning_rate": 3.630775466191854e-05, "loss": 0.057, "num_input_tokens_seen": 18191328, "step": 14025 }, { "epoch": 0.6855104683262893, "grad_norm": 0.26060280203819275, "learning_rate": 3.629899796836884e-05, "loss": 0.1162, "num_input_tokens_seen": 18197760, "step": 14030 }, { "epoch": 0.6857547699899836, "grad_norm": 0.5500631332397461, "learning_rate": 3.6290239532371666e-05, "loss": 0.1136, "num_input_tokens_seen": 18204128, "step": 14035 }, { "epoch": 0.685999071653678, "grad_norm": 0.10304605960845947, "learning_rate": 3.628147935527767e-05, "loss": 0.0731, "num_input_tokens_seen": 18210336, "step": 14040 }, { "epoch": 0.6862433733173723, "grad_norm": 0.3926004469394684, "learning_rate": 3.627271743843779e-05, "loss": 0.0943, "num_input_tokens_seen": 18217376, "step": 14045 }, { "epoch": 0.6864876749810667, "grad_norm": 0.2627641260623932, "learning_rate": 3.626395378320321e-05, "loss": 0.0943, "num_input_tokens_seen": 18223616, "step": 14050 }, { "epoch": 0.6867319766447609, "grad_norm": 0.780266284942627, "learning_rate": 3.625518839092541e-05, "loss": 0.0864, "num_input_tokens_seen": 18230368, "step": 14055 }, { "epoch": 0.6869762783084553, "grad_norm": 0.27274373173713684, "learning_rate": 3.624642126295612e-05, "loss": 0.0999, "num_input_tokens_seen": 18236544, "step": 14060 }, { "epoch": 0.6872205799721496, "grad_norm": 0.7125009298324585, "learning_rate": 3.6237652400647345e-05, "loss": 0.08, "num_input_tokens_seen": 18243264, "step": 14065 }, { "epoch": 0.687464881635844, "grad_norm": 0.4364245533943176, "learning_rate": 3.622888180535134e-05, "loss": 0.0915, "num_input_tokens_seen": 18249632, "step": 14070 }, { "epoch": 0.6877091832995382, "grad_norm": 0.3229767680168152, "learning_rate": 3.6220109478420655e-05, "loss": 0.0836, "num_input_tokens_seen": 18256800, "step": 14075 }, { "epoch": 0.6879534849632326, "grad_norm": 0.22679594159126282, "learning_rate": 3.6211335421208084e-05, "loss": 0.1263, "num_input_tokens_seen": 18262656, "step": 14080 }, { "epoch": 0.6881977866269269, "grad_norm": 0.19915197789669037, "learning_rate": 3.62025596350667e-05, "loss": 0.0744, "num_input_tokens_seen": 18269216, "step": 14085 }, { "epoch": 0.6884420882906213, "grad_norm": 0.14340561628341675, "learning_rate": 3.619378212134984e-05, "loss": 0.0808, "num_input_tokens_seen": 18275808, "step": 14090 }, { "epoch": 0.6886863899543156, "grad_norm": 0.5791017413139343, "learning_rate": 3.618500288141111e-05, "loss": 0.0987, "num_input_tokens_seen": 18282432, "step": 14095 }, { "epoch": 0.6889306916180099, "grad_norm": 0.1727171093225479, "learning_rate": 3.617622191660438e-05, "loss": 0.0851, "num_input_tokens_seen": 18288672, "step": 14100 }, { "epoch": 0.6891749932817043, "grad_norm": 0.26686641573905945, "learning_rate": 3.616743922828377e-05, "loss": 0.0791, "num_input_tokens_seen": 18295360, "step": 14105 }, { "epoch": 0.6894192949453986, "grad_norm": 0.21802599728107452, "learning_rate": 3.615865481780371e-05, "loss": 0.0832, "num_input_tokens_seen": 18301728, "step": 14110 }, { "epoch": 0.6896635966090929, "grad_norm": 1.5025238990783691, "learning_rate": 3.614986868651883e-05, "loss": 0.1116, "num_input_tokens_seen": 18308544, "step": 14115 }, { "epoch": 0.6899078982727872, "grad_norm": 0.20308546721935272, "learning_rate": 3.614108083578409e-05, "loss": 0.1047, "num_input_tokens_seen": 18314688, "step": 14120 }, { "epoch": 0.6901521999364816, "grad_norm": 0.47644752264022827, "learning_rate": 3.613229126695467e-05, "loss": 0.0865, "num_input_tokens_seen": 18321312, "step": 14125 }, { "epoch": 0.6903965016001758, "grad_norm": 0.32885152101516724, "learning_rate": 3.612349998138605e-05, "loss": 0.0896, "num_input_tokens_seen": 18327872, "step": 14130 }, { "epoch": 0.6906408032638702, "grad_norm": 0.23787306249141693, "learning_rate": 3.6114706980433946e-05, "loss": 0.0804, "num_input_tokens_seen": 18334848, "step": 14135 }, { "epoch": 0.6908851049275646, "grad_norm": 0.2915501296520233, "learning_rate": 3.610591226545435e-05, "loss": 0.0923, "num_input_tokens_seen": 18341184, "step": 14140 }, { "epoch": 0.6911294065912589, "grad_norm": 0.24943643808364868, "learning_rate": 3.6097115837803505e-05, "loss": 0.0799, "num_input_tokens_seen": 18348224, "step": 14145 }, { "epoch": 0.6913737082549533, "grad_norm": 0.1354483664035797, "learning_rate": 3.608831769883795e-05, "loss": 0.0839, "num_input_tokens_seen": 18354944, "step": 14150 }, { "epoch": 0.6916180099186475, "grad_norm": 0.4628617465496063, "learning_rate": 3.607951784991446e-05, "loss": 0.0824, "num_input_tokens_seen": 18361248, "step": 14155 }, { "epoch": 0.6918623115823419, "grad_norm": 0.524030864238739, "learning_rate": 3.6070716292390085e-05, "loss": 0.09, "num_input_tokens_seen": 18367488, "step": 14160 }, { "epoch": 0.6921066132460362, "grad_norm": 0.3342891335487366, "learning_rate": 3.606191302762213e-05, "loss": 0.0908, "num_input_tokens_seen": 18373600, "step": 14165 }, { "epoch": 0.6923509149097306, "grad_norm": 0.29659613966941833, "learning_rate": 3.605310805696818e-05, "loss": 0.0933, "num_input_tokens_seen": 18380128, "step": 14170 }, { "epoch": 0.6925952165734248, "grad_norm": 0.41838154196739197, "learning_rate": 3.6044301381786067e-05, "loss": 0.0775, "num_input_tokens_seen": 18386816, "step": 14175 }, { "epoch": 0.6928395182371192, "grad_norm": 0.24583500623703003, "learning_rate": 3.6035493003433883e-05, "loss": 0.0856, "num_input_tokens_seen": 18393120, "step": 14180 }, { "epoch": 0.6930838199008135, "grad_norm": 0.3442663848400116, "learning_rate": 3.6026682923269994e-05, "loss": 0.081, "num_input_tokens_seen": 18399648, "step": 14185 }, { "epoch": 0.6933281215645078, "grad_norm": 0.2876844108104706, "learning_rate": 3.6017871142653034e-05, "loss": 0.0981, "num_input_tokens_seen": 18405856, "step": 14190 }, { "epoch": 0.6935724232282022, "grad_norm": 0.48288780450820923, "learning_rate": 3.600905766294189e-05, "loss": 0.1036, "num_input_tokens_seen": 18411936, "step": 14195 }, { "epoch": 0.6938167248918965, "grad_norm": 0.2624213695526123, "learning_rate": 3.60002424854957e-05, "loss": 0.1135, "num_input_tokens_seen": 18418528, "step": 14200 }, { "epoch": 0.6938167248918965, "eval_loss": 0.09073621779680252, "eval_runtime": 374.9403, "eval_samples_per_second": 97.042, "eval_steps_per_second": 24.263, "num_input_tokens_seen": 18418528, "step": 14200 }, { "epoch": 0.6940610265555909, "grad_norm": 0.5207003355026245, "learning_rate": 3.5991425611673876e-05, "loss": 0.0908, "num_input_tokens_seen": 18424896, "step": 14205 }, { "epoch": 0.6943053282192851, "grad_norm": 0.15403670072555542, "learning_rate": 3.5982607042836105e-05, "loss": 0.0787, "num_input_tokens_seen": 18431104, "step": 14210 }, { "epoch": 0.6945496298829795, "grad_norm": 0.13773365318775177, "learning_rate": 3.597378678034231e-05, "loss": 0.0768, "num_input_tokens_seen": 18437248, "step": 14215 }, { "epoch": 0.6947939315466738, "grad_norm": 0.7642780542373657, "learning_rate": 3.596496482555269e-05, "loss": 0.1085, "num_input_tokens_seen": 18443680, "step": 14220 }, { "epoch": 0.6950382332103682, "grad_norm": 0.3874877393245697, "learning_rate": 3.595614117982769e-05, "loss": 0.0842, "num_input_tokens_seen": 18449728, "step": 14225 }, { "epoch": 0.6952825348740624, "grad_norm": 0.11337012052536011, "learning_rate": 3.594731584452805e-05, "loss": 0.0842, "num_input_tokens_seen": 18456000, "step": 14230 }, { "epoch": 0.6955268365377568, "grad_norm": 0.250203400850296, "learning_rate": 3.593848882101472e-05, "loss": 0.0847, "num_input_tokens_seen": 18462784, "step": 14235 }, { "epoch": 0.6957711382014512, "grad_norm": 0.41607651114463806, "learning_rate": 3.592966011064896e-05, "loss": 0.0836, "num_input_tokens_seen": 18469376, "step": 14240 }, { "epoch": 0.6960154398651455, "grad_norm": 0.2743295729160309, "learning_rate": 3.592082971479226e-05, "loss": 0.0915, "num_input_tokens_seen": 18475872, "step": 14245 }, { "epoch": 0.6962597415288398, "grad_norm": 0.12882061302661896, "learning_rate": 3.5911997634806385e-05, "loss": 0.0922, "num_input_tokens_seen": 18482944, "step": 14250 }, { "epoch": 0.6965040431925341, "grad_norm": 0.708173394203186, "learning_rate": 3.5903163872053336e-05, "loss": 0.0935, "num_input_tokens_seen": 18489568, "step": 14255 }, { "epoch": 0.6967483448562285, "grad_norm": 0.21492870151996613, "learning_rate": 3.58943284278954e-05, "loss": 0.0644, "num_input_tokens_seen": 18495904, "step": 14260 }, { "epoch": 0.6969926465199228, "grad_norm": 0.4083978235721588, "learning_rate": 3.588549130369512e-05, "loss": 0.1023, "num_input_tokens_seen": 18502784, "step": 14265 }, { "epoch": 0.6972369481836171, "grad_norm": 0.14657841622829437, "learning_rate": 3.5876652500815274e-05, "loss": 0.0653, "num_input_tokens_seen": 18509376, "step": 14270 }, { "epoch": 0.6974812498473114, "grad_norm": 0.550206184387207, "learning_rate": 3.586781202061894e-05, "loss": 0.0856, "num_input_tokens_seen": 18515648, "step": 14275 }, { "epoch": 0.6977255515110058, "grad_norm": 0.13027940690517426, "learning_rate": 3.585896986446942e-05, "loss": 0.0919, "num_input_tokens_seen": 18522272, "step": 14280 }, { "epoch": 0.6979698531747002, "grad_norm": 0.38900989294052124, "learning_rate": 3.585012603373028e-05, "loss": 0.0668, "num_input_tokens_seen": 18528544, "step": 14285 }, { "epoch": 0.6982141548383944, "grad_norm": 0.19152337312698364, "learning_rate": 3.584128052976535e-05, "loss": 0.0881, "num_input_tokens_seen": 18535104, "step": 14290 }, { "epoch": 0.6984584565020888, "grad_norm": 0.544426441192627, "learning_rate": 3.5832433353938724e-05, "loss": 0.1018, "num_input_tokens_seen": 18541632, "step": 14295 }, { "epoch": 0.6987027581657831, "grad_norm": 0.18553180992603302, "learning_rate": 3.5823584507614746e-05, "loss": 0.0767, "num_input_tokens_seen": 18548320, "step": 14300 }, { "epoch": 0.6989470598294775, "grad_norm": 0.47763895988464355, "learning_rate": 3.581473399215802e-05, "loss": 0.09, "num_input_tokens_seen": 18554912, "step": 14305 }, { "epoch": 0.6991913614931717, "grad_norm": 0.7452356815338135, "learning_rate": 3.580588180893341e-05, "loss": 0.0779, "num_input_tokens_seen": 18562688, "step": 14310 }, { "epoch": 0.6994356631568661, "grad_norm": 0.19284643232822418, "learning_rate": 3.579702795930602e-05, "loss": 0.0962, "num_input_tokens_seen": 18569568, "step": 14315 }, { "epoch": 0.6996799648205604, "grad_norm": 0.20440183579921722, "learning_rate": 3.578817244464125e-05, "loss": 0.1038, "num_input_tokens_seen": 18575616, "step": 14320 }, { "epoch": 0.6999242664842548, "grad_norm": 0.29988402128219604, "learning_rate": 3.577931526630471e-05, "loss": 0.1077, "num_input_tokens_seen": 18581696, "step": 14325 }, { "epoch": 0.700168568147949, "grad_norm": 0.41035452485084534, "learning_rate": 3.577045642566229e-05, "loss": 0.0837, "num_input_tokens_seen": 18588160, "step": 14330 }, { "epoch": 0.7004128698116434, "grad_norm": 0.7128488421440125, "learning_rate": 3.576159592408014e-05, "loss": 0.1046, "num_input_tokens_seen": 18594624, "step": 14335 }, { "epoch": 0.7006571714753378, "grad_norm": 0.1877528876066208, "learning_rate": 3.575273376292466e-05, "loss": 0.0628, "num_input_tokens_seen": 18600928, "step": 14340 }, { "epoch": 0.7009014731390321, "grad_norm": 0.15594615042209625, "learning_rate": 3.574386994356251e-05, "loss": 0.0882, "num_input_tokens_seen": 18607488, "step": 14345 }, { "epoch": 0.7011457748027264, "grad_norm": 0.13989654183387756, "learning_rate": 3.573500446736059e-05, "loss": 0.0767, "num_input_tokens_seen": 18614432, "step": 14350 }, { "epoch": 0.7013900764664207, "grad_norm": 0.15573377907276154, "learning_rate": 3.5726137335686094e-05, "loss": 0.0973, "num_input_tokens_seen": 18620160, "step": 14355 }, { "epoch": 0.7016343781301151, "grad_norm": 0.4068516492843628, "learning_rate": 3.571726854990642e-05, "loss": 0.089, "num_input_tokens_seen": 18626656, "step": 14360 }, { "epoch": 0.7018786797938094, "grad_norm": 0.1101914495229721, "learning_rate": 3.570839811138925e-05, "loss": 0.0737, "num_input_tokens_seen": 18633152, "step": 14365 }, { "epoch": 0.7021229814575037, "grad_norm": 0.2676566243171692, "learning_rate": 3.569952602150252e-05, "loss": 0.0945, "num_input_tokens_seen": 18639936, "step": 14370 }, { "epoch": 0.702367283121198, "grad_norm": 0.43610504269599915, "learning_rate": 3.569065228161442e-05, "loss": 0.0786, "num_input_tokens_seen": 18646528, "step": 14375 }, { "epoch": 0.7026115847848924, "grad_norm": 0.4014550447463989, "learning_rate": 3.5681776893093395e-05, "loss": 0.0622, "num_input_tokens_seen": 18653184, "step": 14380 }, { "epoch": 0.7028558864485868, "grad_norm": 0.3166661858558655, "learning_rate": 3.5672899857308134e-05, "loss": 0.0857, "num_input_tokens_seen": 18659584, "step": 14385 }, { "epoch": 0.703100188112281, "grad_norm": 0.2371310442686081, "learning_rate": 3.566402117562759e-05, "loss": 0.0735, "num_input_tokens_seen": 18666368, "step": 14390 }, { "epoch": 0.7033444897759754, "grad_norm": 0.6200421452522278, "learning_rate": 3.565514084942097e-05, "loss": 0.0868, "num_input_tokens_seen": 18672928, "step": 14395 }, { "epoch": 0.7035887914396697, "grad_norm": 0.2432146668434143, "learning_rate": 3.564625888005773e-05, "loss": 0.0853, "num_input_tokens_seen": 18679264, "step": 14400 }, { "epoch": 0.7035887914396697, "eval_loss": 0.09159460663795471, "eval_runtime": 375.3907, "eval_samples_per_second": 96.926, "eval_steps_per_second": 24.233, "num_input_tokens_seen": 18679264, "step": 14400 }, { "epoch": 0.7038330931033641, "grad_norm": 0.14037223160266876, "learning_rate": 3.563737526890759e-05, "loss": 0.0863, "num_input_tokens_seen": 18685664, "step": 14405 }, { "epoch": 0.7040773947670583, "grad_norm": 0.21554359793663025, "learning_rate": 3.562849001734049e-05, "loss": 0.0677, "num_input_tokens_seen": 18691968, "step": 14410 }, { "epoch": 0.7043216964307527, "grad_norm": 0.24946162104606628, "learning_rate": 3.561960312672667e-05, "loss": 0.0935, "num_input_tokens_seen": 18698432, "step": 14415 }, { "epoch": 0.704565998094447, "grad_norm": 0.4650286138057709, "learning_rate": 3.5610714598436596e-05, "loss": 0.0949, "num_input_tokens_seen": 18704832, "step": 14420 }, { "epoch": 0.7048102997581414, "grad_norm": 0.3393504023551941, "learning_rate": 3.5601824433840986e-05, "loss": 0.0824, "num_input_tokens_seen": 18711392, "step": 14425 }, { "epoch": 0.7050546014218357, "grad_norm": 0.34301820397377014, "learning_rate": 3.559293263431082e-05, "loss": 0.1045, "num_input_tokens_seen": 18717824, "step": 14430 }, { "epoch": 0.70529890308553, "grad_norm": 0.09386276453733444, "learning_rate": 3.558403920121732e-05, "loss": 0.0653, "num_input_tokens_seen": 18724576, "step": 14435 }, { "epoch": 0.7055432047492244, "grad_norm": 0.14230933785438538, "learning_rate": 3.557514413593197e-05, "loss": 0.0689, "num_input_tokens_seen": 18731168, "step": 14440 }, { "epoch": 0.7057875064129187, "grad_norm": 0.28852254152297974, "learning_rate": 3.55662474398265e-05, "loss": 0.0932, "num_input_tokens_seen": 18737664, "step": 14445 }, { "epoch": 0.706031808076613, "grad_norm": 0.9113146662712097, "learning_rate": 3.555734911427288e-05, "loss": 0.1323, "num_input_tokens_seen": 18744000, "step": 14450 }, { "epoch": 0.7062761097403073, "grad_norm": 0.15202678740024567, "learning_rate": 3.5548449160643363e-05, "loss": 0.0795, "num_input_tokens_seen": 18750336, "step": 14455 }, { "epoch": 0.7065204114040017, "grad_norm": 0.8903259038925171, "learning_rate": 3.553954758031043e-05, "loss": 0.1346, "num_input_tokens_seen": 18757248, "step": 14460 }, { "epoch": 0.706764713067696, "grad_norm": 0.31850603222846985, "learning_rate": 3.5530644374646815e-05, "loss": 0.0871, "num_input_tokens_seen": 18763648, "step": 14465 }, { "epoch": 0.7070090147313903, "grad_norm": 0.24040290713310242, "learning_rate": 3.552173954502549e-05, "loss": 0.0734, "num_input_tokens_seen": 18770560, "step": 14470 }, { "epoch": 0.7072533163950846, "grad_norm": 0.30089858174324036, "learning_rate": 3.55128330928197e-05, "loss": 0.0856, "num_input_tokens_seen": 18776864, "step": 14475 }, { "epoch": 0.707497618058779, "grad_norm": 0.28489962220191956, "learning_rate": 3.550392501940294e-05, "loss": 0.0884, "num_input_tokens_seen": 18783968, "step": 14480 }, { "epoch": 0.7077419197224734, "grad_norm": 0.21086376905441284, "learning_rate": 3.5495015326148945e-05, "loss": 0.0833, "num_input_tokens_seen": 18790368, "step": 14485 }, { "epoch": 0.7079862213861676, "grad_norm": 0.21053342521190643, "learning_rate": 3.548610401443169e-05, "loss": 0.0983, "num_input_tokens_seen": 18796896, "step": 14490 }, { "epoch": 0.708230523049862, "grad_norm": 0.27012690901756287, "learning_rate": 3.547719108562543e-05, "loss": 0.0936, "num_input_tokens_seen": 18802976, "step": 14495 }, { "epoch": 0.7084748247135563, "grad_norm": 0.1910875290632248, "learning_rate": 3.546827654110464e-05, "loss": 0.1079, "num_input_tokens_seen": 18809536, "step": 14500 }, { "epoch": 0.7087191263772507, "grad_norm": 0.6340268850326538, "learning_rate": 3.545936038224405e-05, "loss": 0.0982, "num_input_tokens_seen": 18816000, "step": 14505 }, { "epoch": 0.7089634280409449, "grad_norm": 0.2324526011943817, "learning_rate": 3.545044261041864e-05, "loss": 0.0681, "num_input_tokens_seen": 18822432, "step": 14510 }, { "epoch": 0.7092077297046393, "grad_norm": 0.3180524706840515, "learning_rate": 3.5441523227003657e-05, "loss": 0.0896, "num_input_tokens_seen": 18828864, "step": 14515 }, { "epoch": 0.7094520313683336, "grad_norm": 0.24851655960083008, "learning_rate": 3.543260223337459e-05, "loss": 0.068, "num_input_tokens_seen": 18835744, "step": 14520 }, { "epoch": 0.709696333032028, "grad_norm": 0.7802987098693848, "learning_rate": 3.542367963090714e-05, "loss": 0.1177, "num_input_tokens_seen": 18842336, "step": 14525 }, { "epoch": 0.7099406346957223, "grad_norm": 0.32220062613487244, "learning_rate": 3.5414755420977295e-05, "loss": 0.0897, "num_input_tokens_seen": 18848736, "step": 14530 }, { "epoch": 0.7101849363594166, "grad_norm": 0.18270887434482574, "learning_rate": 3.54058296049613e-05, "loss": 0.0918, "num_input_tokens_seen": 18855136, "step": 14535 }, { "epoch": 0.710429238023111, "grad_norm": 0.7996596693992615, "learning_rate": 3.53969021842356e-05, "loss": 0.0804, "num_input_tokens_seen": 18861664, "step": 14540 }, { "epoch": 0.7106735396868052, "grad_norm": 0.20563597977161407, "learning_rate": 3.5387973160176926e-05, "loss": 0.1011, "num_input_tokens_seen": 18868000, "step": 14545 }, { "epoch": 0.7109178413504996, "grad_norm": 0.43481990694999695, "learning_rate": 3.537904253416224e-05, "loss": 0.0944, "num_input_tokens_seen": 18874496, "step": 14550 }, { "epoch": 0.7111621430141939, "grad_norm": 0.25879085063934326, "learning_rate": 3.537011030756878e-05, "loss": 0.1109, "num_input_tokens_seen": 18880864, "step": 14555 }, { "epoch": 0.7114064446778883, "grad_norm": 0.29855436086654663, "learning_rate": 3.536117648177399e-05, "loss": 0.0948, "num_input_tokens_seen": 18887584, "step": 14560 }, { "epoch": 0.7116507463415825, "grad_norm": 0.2842102646827698, "learning_rate": 3.535224105815558e-05, "loss": 0.0791, "num_input_tokens_seen": 18894272, "step": 14565 }, { "epoch": 0.7118950480052769, "grad_norm": 0.436079204082489, "learning_rate": 3.5343304038091494e-05, "loss": 0.09, "num_input_tokens_seen": 18900768, "step": 14570 }, { "epoch": 0.7121393496689713, "grad_norm": 0.37906819581985474, "learning_rate": 3.5334365422959955e-05, "loss": 0.1157, "num_input_tokens_seen": 18907520, "step": 14575 }, { "epoch": 0.7123836513326656, "grad_norm": 0.25497743487358093, "learning_rate": 3.5325425214139396e-05, "loss": 0.0747, "num_input_tokens_seen": 18914240, "step": 14580 }, { "epoch": 0.71262795299636, "grad_norm": 0.3867287337779999, "learning_rate": 3.531648341300851e-05, "loss": 0.0754, "num_input_tokens_seen": 18920576, "step": 14585 }, { "epoch": 0.7128722546600542, "grad_norm": 0.3310532569885254, "learning_rate": 3.530754002094623e-05, "loss": 0.0775, "num_input_tokens_seen": 18926816, "step": 14590 }, { "epoch": 0.7131165563237486, "grad_norm": 0.7494600415229797, "learning_rate": 3.529859503933175e-05, "loss": 0.0941, "num_input_tokens_seen": 18933376, "step": 14595 }, { "epoch": 0.7133608579874429, "grad_norm": 0.284038245677948, "learning_rate": 3.52896484695445e-05, "loss": 0.0988, "num_input_tokens_seen": 18940320, "step": 14600 }, { "epoch": 0.7133608579874429, "eval_loss": 0.09232643246650696, "eval_runtime": 374.4607, "eval_samples_per_second": 97.166, "eval_steps_per_second": 24.294, "num_input_tokens_seen": 18940320, "step": 14600 }, { "epoch": 0.7136051596511372, "grad_norm": 0.3210459053516388, "learning_rate": 3.528070031296414e-05, "loss": 0.1008, "num_input_tokens_seen": 18946816, "step": 14605 }, { "epoch": 0.7138494613148315, "grad_norm": 0.1744426190853119, "learning_rate": 3.5271750570970605e-05, "loss": 0.0716, "num_input_tokens_seen": 18953408, "step": 14610 }, { "epoch": 0.7140937629785259, "grad_norm": 0.47676795721054077, "learning_rate": 3.526279924494405e-05, "loss": 0.0843, "num_input_tokens_seen": 18959872, "step": 14615 }, { "epoch": 0.7143380646422202, "grad_norm": 0.23077401518821716, "learning_rate": 3.5253846336264874e-05, "loss": 0.1097, "num_input_tokens_seen": 18966624, "step": 14620 }, { "epoch": 0.7145823663059145, "grad_norm": 0.4007891118526459, "learning_rate": 3.5244891846313736e-05, "loss": 0.126, "num_input_tokens_seen": 18972768, "step": 14625 }, { "epoch": 0.7148266679696089, "grad_norm": 0.22930213809013367, "learning_rate": 3.5235935776471527e-05, "loss": 0.0897, "num_input_tokens_seen": 18979040, "step": 14630 }, { "epoch": 0.7150709696333032, "grad_norm": 0.32377496361732483, "learning_rate": 3.522697812811939e-05, "loss": 0.1027, "num_input_tokens_seen": 18985408, "step": 14635 }, { "epoch": 0.7153152712969976, "grad_norm": 0.22442826628684998, "learning_rate": 3.521801890263871e-05, "loss": 0.1052, "num_input_tokens_seen": 18991872, "step": 14640 }, { "epoch": 0.7155595729606918, "grad_norm": 0.3464365303516388, "learning_rate": 3.5209058101411114e-05, "loss": 0.0874, "num_input_tokens_seen": 18998208, "step": 14645 }, { "epoch": 0.7158038746243862, "grad_norm": 0.45641669631004333, "learning_rate": 3.520009572581845e-05, "loss": 0.0968, "num_input_tokens_seen": 19005024, "step": 14650 }, { "epoch": 0.7160481762880805, "grad_norm": 0.26582950353622437, "learning_rate": 3.519113177724285e-05, "loss": 0.1113, "num_input_tokens_seen": 19011296, "step": 14655 }, { "epoch": 0.7162924779517749, "grad_norm": 0.1872589886188507, "learning_rate": 3.5182166257066656e-05, "loss": 0.1053, "num_input_tokens_seen": 19017888, "step": 14660 }, { "epoch": 0.7165367796154691, "grad_norm": 0.47249478101730347, "learning_rate": 3.517319916667247e-05, "loss": 0.0958, "num_input_tokens_seen": 19024096, "step": 14665 }, { "epoch": 0.7167810812791635, "grad_norm": 0.2750479280948639, "learning_rate": 3.516423050744313e-05, "loss": 0.0934, "num_input_tokens_seen": 19030848, "step": 14670 }, { "epoch": 0.7170253829428579, "grad_norm": 0.4630361795425415, "learning_rate": 3.5155260280761704e-05, "loss": 0.0818, "num_input_tokens_seen": 19037312, "step": 14675 }, { "epoch": 0.7172696846065522, "grad_norm": 0.23540042340755463, "learning_rate": 3.514628848801154e-05, "loss": 0.1038, "num_input_tokens_seen": 19043360, "step": 14680 }, { "epoch": 0.7175139862702465, "grad_norm": 0.13261832296848297, "learning_rate": 3.5137315130576174e-05, "loss": 0.1068, "num_input_tokens_seen": 19050240, "step": 14685 }, { "epoch": 0.7177582879339408, "grad_norm": 0.28686898946762085, "learning_rate": 3.512834020983942e-05, "loss": 0.1008, "num_input_tokens_seen": 19056544, "step": 14690 }, { "epoch": 0.7180025895976352, "grad_norm": 0.21018792688846588, "learning_rate": 3.5119363727185334e-05, "loss": 0.1066, "num_input_tokens_seen": 19062528, "step": 14695 }, { "epoch": 0.7182468912613295, "grad_norm": 0.447069376707077, "learning_rate": 3.511038568399819e-05, "loss": 0.0703, "num_input_tokens_seen": 19068800, "step": 14700 }, { "epoch": 0.7184911929250238, "grad_norm": 0.36040234565734863, "learning_rate": 3.510140608166251e-05, "loss": 0.1151, "num_input_tokens_seen": 19075296, "step": 14705 }, { "epoch": 0.7187354945887181, "grad_norm": 0.34894615411758423, "learning_rate": 3.509242492156308e-05, "loss": 0.09, "num_input_tokens_seen": 19081536, "step": 14710 }, { "epoch": 0.7189797962524125, "grad_norm": 0.2080090194940567, "learning_rate": 3.5083442205084896e-05, "loss": 0.0872, "num_input_tokens_seen": 19087904, "step": 14715 }, { "epoch": 0.7192240979161069, "grad_norm": 0.34738484025001526, "learning_rate": 3.507445793361321e-05, "loss": 0.0977, "num_input_tokens_seen": 19094592, "step": 14720 }, { "epoch": 0.7194683995798011, "grad_norm": 0.14758948981761932, "learning_rate": 3.5065472108533505e-05, "loss": 0.0954, "num_input_tokens_seen": 19100672, "step": 14725 }, { "epoch": 0.7197127012434955, "grad_norm": 0.3791683614253998, "learning_rate": 3.5056484731231504e-05, "loss": 0.0912, "num_input_tokens_seen": 19107296, "step": 14730 }, { "epoch": 0.7199570029071898, "grad_norm": 0.22254247963428497, "learning_rate": 3.504749580309319e-05, "loss": 0.0976, "num_input_tokens_seen": 19113088, "step": 14735 }, { "epoch": 0.7202013045708842, "grad_norm": 0.6711651086807251, "learning_rate": 3.5038505325504753e-05, "loss": 0.0959, "num_input_tokens_seen": 19119552, "step": 14740 }, { "epoch": 0.7204456062345784, "grad_norm": 0.2299167662858963, "learning_rate": 3.502951329985264e-05, "loss": 0.1233, "num_input_tokens_seen": 19125856, "step": 14745 }, { "epoch": 0.7206899078982728, "grad_norm": 0.35474467277526855, "learning_rate": 3.502051972752354e-05, "loss": 0.0822, "num_input_tokens_seen": 19131904, "step": 14750 }, { "epoch": 0.7209342095619671, "grad_norm": 0.8187388777732849, "learning_rate": 3.5011524609904374e-05, "loss": 0.1293, "num_input_tokens_seen": 19138560, "step": 14755 }, { "epoch": 0.7211785112256615, "grad_norm": 1.0293930768966675, "learning_rate": 3.50025279483823e-05, "loss": 0.1113, "num_input_tokens_seen": 19144736, "step": 14760 }, { "epoch": 0.7214228128893557, "grad_norm": 0.17147469520568848, "learning_rate": 3.499352974434472e-05, "loss": 0.0801, "num_input_tokens_seen": 19151616, "step": 14765 }, { "epoch": 0.7216671145530501, "grad_norm": 0.18696635961532593, "learning_rate": 3.498452999917926e-05, "loss": 0.1021, "num_input_tokens_seen": 19157696, "step": 14770 }, { "epoch": 0.7219114162167445, "grad_norm": 0.6123788356781006, "learning_rate": 3.4975528714273795e-05, "loss": 0.097, "num_input_tokens_seen": 19163968, "step": 14775 }, { "epoch": 0.7221557178804388, "grad_norm": 0.27435779571533203, "learning_rate": 3.4966525891016454e-05, "loss": 0.1, "num_input_tokens_seen": 19170080, "step": 14780 }, { "epoch": 0.7224000195441331, "grad_norm": 0.5306727886199951, "learning_rate": 3.495752153079557e-05, "loss": 0.0937, "num_input_tokens_seen": 19176608, "step": 14785 }, { "epoch": 0.7226443212078274, "grad_norm": 0.1571110337972641, "learning_rate": 3.494851563499974e-05, "loss": 0.0708, "num_input_tokens_seen": 19183360, "step": 14790 }, { "epoch": 0.7228886228715218, "grad_norm": 0.1781570464372635, "learning_rate": 3.493950820501777e-05, "loss": 0.0978, "num_input_tokens_seen": 19189728, "step": 14795 }, { "epoch": 0.723132924535216, "grad_norm": 0.32257208228111267, "learning_rate": 3.493049924223872e-05, "loss": 0.0816, "num_input_tokens_seen": 19196416, "step": 14800 }, { "epoch": 0.723132924535216, "eval_loss": 0.0919724851846695, "eval_runtime": 374.8321, "eval_samples_per_second": 97.07, "eval_steps_per_second": 24.27, "num_input_tokens_seen": 19196416, "step": 14800 }, { "epoch": 0.7233772261989104, "grad_norm": 0.27696701884269714, "learning_rate": 3.49214887480519e-05, "loss": 0.0907, "num_input_tokens_seen": 19202560, "step": 14805 }, { "epoch": 0.7236215278626047, "grad_norm": 0.12701351940631866, "learning_rate": 3.4912476723846834e-05, "loss": 0.0838, "num_input_tokens_seen": 19209408, "step": 14810 }, { "epoch": 0.7238658295262991, "grad_norm": 0.15902164578437805, "learning_rate": 3.490346317101328e-05, "loss": 0.1203, "num_input_tokens_seen": 19215840, "step": 14815 }, { "epoch": 0.7241101311899935, "grad_norm": 0.3158629536628723, "learning_rate": 3.4894448090941266e-05, "loss": 0.0933, "num_input_tokens_seen": 19222304, "step": 14820 }, { "epoch": 0.7243544328536877, "grad_norm": 0.23828889429569244, "learning_rate": 3.488543148502101e-05, "loss": 0.067, "num_input_tokens_seen": 19229088, "step": 14825 }, { "epoch": 0.7245987345173821, "grad_norm": 0.7522140145301819, "learning_rate": 3.487641335464299e-05, "loss": 0.0811, "num_input_tokens_seen": 19236160, "step": 14830 }, { "epoch": 0.7248430361810764, "grad_norm": 0.2728089392185211, "learning_rate": 3.4867393701197914e-05, "loss": 0.1059, "num_input_tokens_seen": 19242240, "step": 14835 }, { "epoch": 0.7250873378447708, "grad_norm": 0.25189408659935, "learning_rate": 3.485837252607673e-05, "loss": 0.0775, "num_input_tokens_seen": 19249184, "step": 14840 }, { "epoch": 0.725331639508465, "grad_norm": 0.27855536341667175, "learning_rate": 3.4849349830670615e-05, "loss": 0.1128, "num_input_tokens_seen": 19255520, "step": 14845 }, { "epoch": 0.7255759411721594, "grad_norm": 0.4098506569862366, "learning_rate": 3.4840325616370976e-05, "loss": 0.0919, "num_input_tokens_seen": 19261536, "step": 14850 }, { "epoch": 0.7258202428358537, "grad_norm": 0.5162977576255798, "learning_rate": 3.483129988456947e-05, "loss": 0.0931, "num_input_tokens_seen": 19267808, "step": 14855 }, { "epoch": 0.726064544499548, "grad_norm": 0.2953129708766937, "learning_rate": 3.482227263665797e-05, "loss": 0.0726, "num_input_tokens_seen": 19273920, "step": 14860 }, { "epoch": 0.7263088461632423, "grad_norm": 0.11062305420637131, "learning_rate": 3.48132438740286e-05, "loss": 0.0759, "num_input_tokens_seen": 19280416, "step": 14865 }, { "epoch": 0.7265531478269367, "grad_norm": 0.20807002484798431, "learning_rate": 3.48042135980737e-05, "loss": 0.0783, "num_input_tokens_seen": 19286784, "step": 14870 }, { "epoch": 0.7267974494906311, "grad_norm": 0.18435703217983246, "learning_rate": 3.479518181018586e-05, "loss": 0.0884, "num_input_tokens_seen": 19293536, "step": 14875 }, { "epoch": 0.7270417511543253, "grad_norm": 0.25772416591644287, "learning_rate": 3.4786148511757886e-05, "loss": 0.0755, "num_input_tokens_seen": 19299968, "step": 14880 }, { "epoch": 0.7272860528180197, "grad_norm": 0.40237051248550415, "learning_rate": 3.477711370418284e-05, "loss": 0.084, "num_input_tokens_seen": 19306496, "step": 14885 }, { "epoch": 0.727530354481714, "grad_norm": 0.11457622051239014, "learning_rate": 3.476807738885399e-05, "loss": 0.0857, "num_input_tokens_seen": 19312928, "step": 14890 }, { "epoch": 0.7277746561454084, "grad_norm": 0.19890688359737396, "learning_rate": 3.475903956716485e-05, "loss": 0.0856, "num_input_tokens_seen": 19319232, "step": 14895 }, { "epoch": 0.7280189578091026, "grad_norm": 0.14500261843204498, "learning_rate": 3.475000024050917e-05, "loss": 0.0788, "num_input_tokens_seen": 19325984, "step": 14900 }, { "epoch": 0.728263259472797, "grad_norm": 0.5453290939331055, "learning_rate": 3.4740959410280926e-05, "loss": 0.0798, "num_input_tokens_seen": 19332000, "step": 14905 }, { "epoch": 0.7285075611364913, "grad_norm": 0.44879719614982605, "learning_rate": 3.4731917077874324e-05, "loss": 0.1145, "num_input_tokens_seen": 19338240, "step": 14910 }, { "epoch": 0.7287518628001857, "grad_norm": 0.5332766175270081, "learning_rate": 3.4722873244683816e-05, "loss": 0.094, "num_input_tokens_seen": 19345056, "step": 14915 }, { "epoch": 0.72899616446388, "grad_norm": 0.35015198588371277, "learning_rate": 3.4713827912104065e-05, "loss": 0.064, "num_input_tokens_seen": 19351744, "step": 14920 }, { "epoch": 0.7292404661275743, "grad_norm": 0.7860801219940186, "learning_rate": 3.470478108152998e-05, "loss": 0.1066, "num_input_tokens_seen": 19358144, "step": 14925 }, { "epoch": 0.7294847677912687, "grad_norm": 0.5169457793235779, "learning_rate": 3.4695732754356695e-05, "loss": 0.068, "num_input_tokens_seen": 19364192, "step": 14930 }, { "epoch": 0.729729069454963, "grad_norm": 0.4130229353904724, "learning_rate": 3.4686682931979576e-05, "loss": 0.0728, "num_input_tokens_seen": 19370080, "step": 14935 }, { "epoch": 0.7299733711186573, "grad_norm": 0.33190202713012695, "learning_rate": 3.467763161579422e-05, "loss": 0.0899, "num_input_tokens_seen": 19376800, "step": 14940 }, { "epoch": 0.7302176727823516, "grad_norm": 0.30252012610435486, "learning_rate": 3.466857880719645e-05, "loss": 0.0777, "num_input_tokens_seen": 19383136, "step": 14945 }, { "epoch": 0.730461974446046, "grad_norm": 0.27662065625190735, "learning_rate": 3.465952450758233e-05, "loss": 0.0942, "num_input_tokens_seen": 19389440, "step": 14950 }, { "epoch": 0.7307062761097403, "grad_norm": 0.49818816781044006, "learning_rate": 3.4650468718348126e-05, "loss": 0.0943, "num_input_tokens_seen": 19395904, "step": 14955 }, { "epoch": 0.7309505777734346, "grad_norm": 0.36025938391685486, "learning_rate": 3.464141144089038e-05, "loss": 0.0644, "num_input_tokens_seen": 19402528, "step": 14960 }, { "epoch": 0.731194879437129, "grad_norm": 0.3154584467411041, "learning_rate": 3.463235267660583e-05, "loss": 0.1032, "num_input_tokens_seen": 19409056, "step": 14965 }, { "epoch": 0.7314391811008233, "grad_norm": 0.24595554172992706, "learning_rate": 3.462329242689145e-05, "loss": 0.111, "num_input_tokens_seen": 19415680, "step": 14970 }, { "epoch": 0.7316834827645177, "grad_norm": 0.24741727113723755, "learning_rate": 3.461423069314444e-05, "loss": 0.0867, "num_input_tokens_seen": 19421984, "step": 14975 }, { "epoch": 0.7319277844282119, "grad_norm": 0.2057204693555832, "learning_rate": 3.460516747676224e-05, "loss": 0.0532, "num_input_tokens_seen": 19428480, "step": 14980 }, { "epoch": 0.7321720860919063, "grad_norm": 0.3409254848957062, "learning_rate": 3.459610277914251e-05, "loss": 0.102, "num_input_tokens_seen": 19435392, "step": 14985 }, { "epoch": 0.7324163877556006, "grad_norm": 0.26561981439590454, "learning_rate": 3.458703660168314e-05, "loss": 0.1216, "num_input_tokens_seen": 19441696, "step": 14990 }, { "epoch": 0.732660689419295, "grad_norm": 0.4383693039417267, "learning_rate": 3.457796894578224e-05, "loss": 0.0983, "num_input_tokens_seen": 19448896, "step": 14995 }, { "epoch": 0.7329049910829892, "grad_norm": 0.44499385356903076, "learning_rate": 3.456889981283817e-05, "loss": 0.0937, "num_input_tokens_seen": 19454912, "step": 15000 }, { "epoch": 0.7329049910829892, "eval_loss": 0.09056052565574646, "eval_runtime": 374.3418, "eval_samples_per_second": 97.197, "eval_steps_per_second": 24.301, "num_input_tokens_seen": 19454912, "step": 15000 }, { "epoch": 0.7331492927466836, "grad_norm": 0.24695658683776855, "learning_rate": 3.45598292042495e-05, "loss": 0.0953, "num_input_tokens_seen": 19461568, "step": 15005 }, { "epoch": 0.7333935944103779, "grad_norm": 0.13703520596027374, "learning_rate": 3.4550757121415035e-05, "loss": 0.0731, "num_input_tokens_seen": 19468736, "step": 15010 }, { "epoch": 0.7336378960740723, "grad_norm": 0.3530654013156891, "learning_rate": 3.454168356573378e-05, "loss": 0.0927, "num_input_tokens_seen": 19475168, "step": 15015 }, { "epoch": 0.7338821977377666, "grad_norm": 0.2881513237953186, "learning_rate": 3.453260853860503e-05, "loss": 0.0847, "num_input_tokens_seen": 19481728, "step": 15020 }, { "epoch": 0.7341264994014609, "grad_norm": 0.638748049736023, "learning_rate": 3.452353204142824e-05, "loss": 0.1024, "num_input_tokens_seen": 19488256, "step": 15025 }, { "epoch": 0.7343708010651553, "grad_norm": 0.20915889739990234, "learning_rate": 3.4514454075603136e-05, "loss": 0.0798, "num_input_tokens_seen": 19494528, "step": 15030 }, { "epoch": 0.7346151027288496, "grad_norm": 0.1594918668270111, "learning_rate": 3.450537464252964e-05, "loss": 0.0744, "num_input_tokens_seen": 19501088, "step": 15035 }, { "epoch": 0.7348594043925439, "grad_norm": 0.294597327709198, "learning_rate": 3.4496293743607925e-05, "loss": 0.1118, "num_input_tokens_seen": 19507424, "step": 15040 }, { "epoch": 0.7351037060562382, "grad_norm": 0.4090389013290405, "learning_rate": 3.448721138023838e-05, "loss": 0.0876, "num_input_tokens_seen": 19514080, "step": 15045 }, { "epoch": 0.7353480077199326, "grad_norm": 0.26160621643066406, "learning_rate": 3.447812755382162e-05, "loss": 0.0901, "num_input_tokens_seen": 19521056, "step": 15050 }, { "epoch": 0.7355923093836269, "grad_norm": 0.4982796907424927, "learning_rate": 3.446904226575847e-05, "loss": 0.1018, "num_input_tokens_seen": 19527584, "step": 15055 }, { "epoch": 0.7358366110473212, "grad_norm": 0.20144954323768616, "learning_rate": 3.445995551745002e-05, "loss": 0.0709, "num_input_tokens_seen": 19534368, "step": 15060 }, { "epoch": 0.7360809127110156, "grad_norm": 0.34916579723358154, "learning_rate": 3.445086731029753e-05, "loss": 0.0804, "num_input_tokens_seen": 19540416, "step": 15065 }, { "epoch": 0.7363252143747099, "grad_norm": 0.16161853075027466, "learning_rate": 3.444177764570255e-05, "loss": 0.065, "num_input_tokens_seen": 19547296, "step": 15070 }, { "epoch": 0.7365695160384043, "grad_norm": 0.19589589536190033, "learning_rate": 3.44326865250668e-05, "loss": 0.0905, "num_input_tokens_seen": 19553536, "step": 15075 }, { "epoch": 0.7368138177020985, "grad_norm": 0.5554328560829163, "learning_rate": 3.442359394979225e-05, "loss": 0.0894, "num_input_tokens_seen": 19559648, "step": 15080 }, { "epoch": 0.7370581193657929, "grad_norm": 0.584901750087738, "learning_rate": 3.441449992128108e-05, "loss": 0.0846, "num_input_tokens_seen": 19565984, "step": 15085 }, { "epoch": 0.7373024210294872, "grad_norm": 0.51506507396698, "learning_rate": 3.440540444093573e-05, "loss": 0.0831, "num_input_tokens_seen": 19573024, "step": 15090 }, { "epoch": 0.7375467226931816, "grad_norm": 0.286956787109375, "learning_rate": 3.43963075101588e-05, "loss": 0.0943, "num_input_tokens_seen": 19579360, "step": 15095 }, { "epoch": 0.7377910243568758, "grad_norm": 0.19370126724243164, "learning_rate": 3.438720913035318e-05, "loss": 0.0959, "num_input_tokens_seen": 19585920, "step": 15100 }, { "epoch": 0.7380353260205702, "grad_norm": 0.5792253017425537, "learning_rate": 3.437810930292195e-05, "loss": 0.1055, "num_input_tokens_seen": 19592544, "step": 15105 }, { "epoch": 0.7382796276842646, "grad_norm": 0.5783951878547668, "learning_rate": 3.43690080292684e-05, "loss": 0.109, "num_input_tokens_seen": 19598720, "step": 15110 }, { "epoch": 0.7385239293479589, "grad_norm": 0.2033645212650299, "learning_rate": 3.435990531079608e-05, "loss": 0.0927, "num_input_tokens_seen": 19605184, "step": 15115 }, { "epoch": 0.7387682310116532, "grad_norm": 0.2183029055595398, "learning_rate": 3.435080114890874e-05, "loss": 0.0867, "num_input_tokens_seen": 19611584, "step": 15120 }, { "epoch": 0.7390125326753475, "grad_norm": 0.259520024061203, "learning_rate": 3.434169554501035e-05, "loss": 0.0936, "num_input_tokens_seen": 19618048, "step": 15125 }, { "epoch": 0.7392568343390419, "grad_norm": 0.41293075680732727, "learning_rate": 3.433258850050511e-05, "loss": 0.071, "num_input_tokens_seen": 19624832, "step": 15130 }, { "epoch": 0.7395011360027361, "grad_norm": 0.185689777135849, "learning_rate": 3.4323480016797446e-05, "loss": 0.0618, "num_input_tokens_seen": 19631200, "step": 15135 }, { "epoch": 0.7397454376664305, "grad_norm": 0.33117789030075073, "learning_rate": 3.4314370095291995e-05, "loss": 0.1431, "num_input_tokens_seen": 19638464, "step": 15140 }, { "epoch": 0.7399897393301248, "grad_norm": 0.20248955488204956, "learning_rate": 3.430525873739363e-05, "loss": 0.0781, "num_input_tokens_seen": 19644576, "step": 15145 }, { "epoch": 0.7402340409938192, "grad_norm": 0.6296222805976868, "learning_rate": 3.429614594450743e-05, "loss": 0.1051, "num_input_tokens_seen": 19651136, "step": 15150 }, { "epoch": 0.7404783426575134, "grad_norm": 0.2394062727689743, "learning_rate": 3.428703171803869e-05, "loss": 0.0637, "num_input_tokens_seen": 19658112, "step": 15155 }, { "epoch": 0.7407226443212078, "grad_norm": 0.17971058189868927, "learning_rate": 3.4277916059392964e-05, "loss": 0.0881, "num_input_tokens_seen": 19664288, "step": 15160 }, { "epoch": 0.7409669459849022, "grad_norm": 0.3788207173347473, "learning_rate": 3.426879896997598e-05, "loss": 0.0992, "num_input_tokens_seen": 19670688, "step": 15165 }, { "epoch": 0.7412112476485965, "grad_norm": 0.4390468895435333, "learning_rate": 3.425968045119372e-05, "loss": 0.0738, "num_input_tokens_seen": 19676800, "step": 15170 }, { "epoch": 0.7414555493122909, "grad_norm": 1.1118052005767822, "learning_rate": 3.425056050445237e-05, "loss": 0.0887, "num_input_tokens_seen": 19683008, "step": 15175 }, { "epoch": 0.7416998509759851, "grad_norm": 0.24859705567359924, "learning_rate": 3.4241439131158336e-05, "loss": 0.0887, "num_input_tokens_seen": 19689696, "step": 15180 }, { "epoch": 0.7419441526396795, "grad_norm": 0.30889421701431274, "learning_rate": 3.423231633271825e-05, "loss": 0.0875, "num_input_tokens_seen": 19696384, "step": 15185 }, { "epoch": 0.7421884543033738, "grad_norm": 0.33794400095939636, "learning_rate": 3.4223192110538985e-05, "loss": 0.0718, "num_input_tokens_seen": 19702944, "step": 15190 }, { "epoch": 0.7424327559670681, "grad_norm": 0.22359444200992584, "learning_rate": 3.4214066466027575e-05, "loss": 0.0774, "num_input_tokens_seen": 19709344, "step": 15195 }, { "epoch": 0.7426770576307624, "grad_norm": 0.3004677891731262, "learning_rate": 3.4204939400591325e-05, "loss": 0.0987, "num_input_tokens_seen": 19715616, "step": 15200 }, { "epoch": 0.7426770576307624, "eval_loss": 0.0904901772737503, "eval_runtime": 374.3775, "eval_samples_per_second": 97.188, "eval_steps_per_second": 24.299, "num_input_tokens_seen": 19715616, "step": 15200 }, { "epoch": 0.7429213592944568, "grad_norm": 0.35675302147865295, "learning_rate": 3.419581091563775e-05, "loss": 0.1076, "num_input_tokens_seen": 19722208, "step": 15205 }, { "epoch": 0.7431656609581512, "grad_norm": 0.1952403336763382, "learning_rate": 3.418668101257456e-05, "loss": 0.1108, "num_input_tokens_seen": 19728256, "step": 15210 }, { "epoch": 0.7434099626218454, "grad_norm": 0.3448551893234253, "learning_rate": 3.417754969280971e-05, "loss": 0.0931, "num_input_tokens_seen": 19734912, "step": 15215 }, { "epoch": 0.7436542642855398, "grad_norm": 0.3114347755908966, "learning_rate": 3.416841695775137e-05, "loss": 0.0907, "num_input_tokens_seen": 19741568, "step": 15220 }, { "epoch": 0.7438985659492341, "grad_norm": 0.39120566844940186, "learning_rate": 3.415928280880792e-05, "loss": 0.1033, "num_input_tokens_seen": 19748192, "step": 15225 }, { "epoch": 0.7441428676129285, "grad_norm": 0.12095727026462555, "learning_rate": 3.4150147247387965e-05, "loss": 0.0575, "num_input_tokens_seen": 19754848, "step": 15230 }, { "epoch": 0.7443871692766227, "grad_norm": 0.6929975748062134, "learning_rate": 3.4141010274900306e-05, "loss": 0.1063, "num_input_tokens_seen": 19761056, "step": 15235 }, { "epoch": 0.7446314709403171, "grad_norm": 0.3223626911640167, "learning_rate": 3.413187189275399e-05, "loss": 0.0699, "num_input_tokens_seen": 19767200, "step": 15240 }, { "epoch": 0.7448757726040114, "grad_norm": 0.2727805972099304, "learning_rate": 3.4122732102358265e-05, "loss": 0.0681, "num_input_tokens_seen": 19774336, "step": 15245 }, { "epoch": 0.7451200742677058, "grad_norm": 0.2685346305370331, "learning_rate": 3.411359090512261e-05, "loss": 0.0746, "num_input_tokens_seen": 19781056, "step": 15250 }, { "epoch": 0.7453643759314001, "grad_norm": 0.5408146977424622, "learning_rate": 3.410444830245672e-05, "loss": 0.1041, "num_input_tokens_seen": 19787232, "step": 15255 }, { "epoch": 0.7456086775950944, "grad_norm": 0.23115339875221252, "learning_rate": 3.409530429577048e-05, "loss": 0.0969, "num_input_tokens_seen": 19794112, "step": 15260 }, { "epoch": 0.7458529792587888, "grad_norm": 0.3939475417137146, "learning_rate": 3.408615888647402e-05, "loss": 0.0829, "num_input_tokens_seen": 19800608, "step": 15265 }, { "epoch": 0.7460972809224831, "grad_norm": 0.694732129573822, "learning_rate": 3.4077012075977675e-05, "loss": 0.0719, "num_input_tokens_seen": 19806720, "step": 15270 }, { "epoch": 0.7463415825861774, "grad_norm": 0.6039611101150513, "learning_rate": 3.4067863865692e-05, "loss": 0.0905, "num_input_tokens_seen": 19812896, "step": 15275 }, { "epoch": 0.7465858842498717, "grad_norm": 0.3019779324531555, "learning_rate": 3.4058714257027755e-05, "loss": 0.1038, "num_input_tokens_seen": 19819680, "step": 15280 }, { "epoch": 0.7468301859135661, "grad_norm": 0.32639962434768677, "learning_rate": 3.404956325139594e-05, "loss": 0.0883, "num_input_tokens_seen": 19826624, "step": 15285 }, { "epoch": 0.7470744875772604, "grad_norm": 0.5737325549125671, "learning_rate": 3.404041085020775e-05, "loss": 0.0756, "num_input_tokens_seen": 19833504, "step": 15290 }, { "epoch": 0.7473187892409547, "grad_norm": 0.39019840955734253, "learning_rate": 3.403125705487459e-05, "loss": 0.0567, "num_input_tokens_seen": 19840320, "step": 15295 }, { "epoch": 0.747563090904649, "grad_norm": 0.3923274576663971, "learning_rate": 3.402210186680811e-05, "loss": 0.0918, "num_input_tokens_seen": 19847072, "step": 15300 }, { "epoch": 0.7478073925683434, "grad_norm": 0.38677331805229187, "learning_rate": 3.4012945287420137e-05, "loss": 0.0835, "num_input_tokens_seen": 19853056, "step": 15305 }, { "epoch": 0.7480516942320378, "grad_norm": 0.15734753012657166, "learning_rate": 3.400378731812274e-05, "loss": 0.0739, "num_input_tokens_seen": 19859680, "step": 15310 }, { "epoch": 0.748295995895732, "grad_norm": 0.1501723676919937, "learning_rate": 3.399462796032817e-05, "loss": 0.1081, "num_input_tokens_seen": 19865824, "step": 15315 }, { "epoch": 0.7485402975594264, "grad_norm": 0.20340092480182648, "learning_rate": 3.3985467215448954e-05, "loss": 0.0731, "num_input_tokens_seen": 19872256, "step": 15320 }, { "epoch": 0.7487845992231207, "grad_norm": 0.14953692257404327, "learning_rate": 3.3976305084897776e-05, "loss": 0.0992, "num_input_tokens_seen": 19878944, "step": 15325 }, { "epoch": 0.7490289008868151, "grad_norm": 0.8940133452415466, "learning_rate": 3.3967141570087544e-05, "loss": 0.0854, "num_input_tokens_seen": 19884992, "step": 15330 }, { "epoch": 0.7492732025505093, "grad_norm": 0.18268170952796936, "learning_rate": 3.39579766724314e-05, "loss": 0.1013, "num_input_tokens_seen": 19891232, "step": 15335 }, { "epoch": 0.7495175042142037, "grad_norm": 0.4334729313850403, "learning_rate": 3.3948810393342677e-05, "loss": 0.0946, "num_input_tokens_seen": 19897568, "step": 15340 }, { "epoch": 0.749761805877898, "grad_norm": 0.23391152918338776, "learning_rate": 3.3939642734234936e-05, "loss": 0.0962, "num_input_tokens_seen": 19903328, "step": 15345 }, { "epoch": 0.7500061075415924, "grad_norm": 0.15486371517181396, "learning_rate": 3.393047369652194e-05, "loss": 0.102, "num_input_tokens_seen": 19910080, "step": 15350 }, { "epoch": 0.7502504092052867, "grad_norm": 0.8478649258613586, "learning_rate": 3.3921303281617664e-05, "loss": 0.089, "num_input_tokens_seen": 19917088, "step": 15355 }, { "epoch": 0.750494710868981, "grad_norm": 0.3435886800289154, "learning_rate": 3.391213149093632e-05, "loss": 0.1051, "num_input_tokens_seen": 19923552, "step": 15360 }, { "epoch": 0.7507390125326754, "grad_norm": 0.6735595464706421, "learning_rate": 3.3902958325892303e-05, "loss": 0.0795, "num_input_tokens_seen": 19929824, "step": 15365 }, { "epoch": 0.7509833141963697, "grad_norm": 0.25880560278892517, "learning_rate": 3.389378378790023e-05, "loss": 0.081, "num_input_tokens_seen": 19936512, "step": 15370 }, { "epoch": 0.751227615860064, "grad_norm": 0.13905714452266693, "learning_rate": 3.388460787837493e-05, "loss": 0.0787, "num_input_tokens_seen": 19942816, "step": 15375 }, { "epoch": 0.7514719175237583, "grad_norm": 1.1034233570098877, "learning_rate": 3.387543059873145e-05, "loss": 0.11, "num_input_tokens_seen": 19949216, "step": 15380 }, { "epoch": 0.7517162191874527, "grad_norm": 0.5477410554885864, "learning_rate": 3.386625195038503e-05, "loss": 0.0842, "num_input_tokens_seen": 19956992, "step": 15385 }, { "epoch": 0.751960520851147, "grad_norm": 0.17100852727890015, "learning_rate": 3.3857071934751136e-05, "loss": 0.0645, "num_input_tokens_seen": 19963712, "step": 15390 }, { "epoch": 0.7522048225148413, "grad_norm": 0.16038203239440918, "learning_rate": 3.384789055324544e-05, "loss": 0.0744, "num_input_tokens_seen": 19970336, "step": 15395 }, { "epoch": 0.7524491241785357, "grad_norm": 0.61720210313797, "learning_rate": 3.3838707807283843e-05, "loss": 0.1072, "num_input_tokens_seen": 19976768, "step": 15400 }, { "epoch": 0.7524491241785357, "eval_loss": 0.09043679386377335, "eval_runtime": 374.9114, "eval_samples_per_second": 97.05, "eval_steps_per_second": 24.264, "num_input_tokens_seen": 19976768, "step": 15400 }, { "epoch": 0.75269342584223, "grad_norm": 0.15084105730056763, "learning_rate": 3.382952369828243e-05, "loss": 0.07, "num_input_tokens_seen": 19983008, "step": 15405 }, { "epoch": 0.7529377275059244, "grad_norm": 0.24862416088581085, "learning_rate": 3.38203382276575e-05, "loss": 0.0632, "num_input_tokens_seen": 19989280, "step": 15410 }, { "epoch": 0.7531820291696186, "grad_norm": 0.23303453624248505, "learning_rate": 3.381115139682557e-05, "loss": 0.1057, "num_input_tokens_seen": 19996064, "step": 15415 }, { "epoch": 0.753426330833313, "grad_norm": 0.32796576619148254, "learning_rate": 3.3801963207203366e-05, "loss": 0.0969, "num_input_tokens_seen": 20002752, "step": 15420 }, { "epoch": 0.7536706324970073, "grad_norm": 0.08774710446596146, "learning_rate": 3.379277366020782e-05, "loss": 0.0637, "num_input_tokens_seen": 20009664, "step": 15425 }, { "epoch": 0.7539149341607017, "grad_norm": 0.4744296669960022, "learning_rate": 3.3783582757256085e-05, "loss": 0.0979, "num_input_tokens_seen": 20015776, "step": 15430 }, { "epoch": 0.7541592358243959, "grad_norm": 0.19738052785396576, "learning_rate": 3.3774390499765504e-05, "loss": 0.0759, "num_input_tokens_seen": 20022272, "step": 15435 }, { "epoch": 0.7544035374880903, "grad_norm": 0.28042492270469666, "learning_rate": 3.376519688915364e-05, "loss": 0.0897, "num_input_tokens_seen": 20028608, "step": 15440 }, { "epoch": 0.7546478391517846, "grad_norm": 0.3847235143184662, "learning_rate": 3.3756001926838273e-05, "loss": 0.1027, "num_input_tokens_seen": 20035424, "step": 15445 }, { "epoch": 0.754892140815479, "grad_norm": 0.50521320104599, "learning_rate": 3.374680561423737e-05, "loss": 0.1004, "num_input_tokens_seen": 20041632, "step": 15450 }, { "epoch": 0.7551364424791733, "grad_norm": 1.2221832275390625, "learning_rate": 3.373760795276912e-05, "loss": 0.108, "num_input_tokens_seen": 20048000, "step": 15455 }, { "epoch": 0.7553807441428676, "grad_norm": 0.18004253506660461, "learning_rate": 3.372840894385192e-05, "loss": 0.0997, "num_input_tokens_seen": 20054368, "step": 15460 }, { "epoch": 0.755625045806562, "grad_norm": 0.972909688949585, "learning_rate": 3.3719208588904375e-05, "loss": 0.1158, "num_input_tokens_seen": 20060512, "step": 15465 }, { "epoch": 0.7558693474702562, "grad_norm": 0.17356222867965698, "learning_rate": 3.371000688934529e-05, "loss": 0.0975, "num_input_tokens_seen": 20066880, "step": 15470 }, { "epoch": 0.7561136491339506, "grad_norm": 0.357625275850296, "learning_rate": 3.370080384659369e-05, "loss": 0.0915, "num_input_tokens_seen": 20073280, "step": 15475 }, { "epoch": 0.7563579507976449, "grad_norm": 0.29080867767333984, "learning_rate": 3.36915994620688e-05, "loss": 0.0723, "num_input_tokens_seen": 20079616, "step": 15480 }, { "epoch": 0.7566022524613393, "grad_norm": 0.15443938970565796, "learning_rate": 3.3682393737190035e-05, "loss": 0.085, "num_input_tokens_seen": 20086496, "step": 15485 }, { "epoch": 0.7568465541250335, "grad_norm": 0.27520859241485596, "learning_rate": 3.3673186673377054e-05, "loss": 0.0864, "num_input_tokens_seen": 20092448, "step": 15490 }, { "epoch": 0.7570908557887279, "grad_norm": 0.5188062787055969, "learning_rate": 3.366397827204969e-05, "loss": 0.0919, "num_input_tokens_seen": 20098816, "step": 15495 }, { "epoch": 0.7573351574524223, "grad_norm": 0.576525866985321, "learning_rate": 3.3654768534628e-05, "loss": 0.0706, "num_input_tokens_seen": 20104800, "step": 15500 }, { "epoch": 0.7575794591161166, "grad_norm": 0.45583376288414, "learning_rate": 3.3645557462532245e-05, "loss": 0.0784, "num_input_tokens_seen": 20111264, "step": 15505 }, { "epoch": 0.757823760779811, "grad_norm": 0.23314149677753448, "learning_rate": 3.363634505718288e-05, "loss": 0.0607, "num_input_tokens_seen": 20117824, "step": 15510 }, { "epoch": 0.7580680624435052, "grad_norm": 0.15961578488349915, "learning_rate": 3.362713132000057e-05, "loss": 0.0793, "num_input_tokens_seen": 20124576, "step": 15515 }, { "epoch": 0.7583123641071996, "grad_norm": 0.36288580298423767, "learning_rate": 3.36179162524062e-05, "loss": 0.0619, "num_input_tokens_seen": 20130752, "step": 15520 }, { "epoch": 0.7585566657708939, "grad_norm": 0.12011334300041199, "learning_rate": 3.3608699855820846e-05, "loss": 0.0971, "num_input_tokens_seen": 20137088, "step": 15525 }, { "epoch": 0.7588009674345882, "grad_norm": 0.14369021356105804, "learning_rate": 3.359948213166578e-05, "loss": 0.0705, "num_input_tokens_seen": 20143328, "step": 15530 }, { "epoch": 0.7590452690982825, "grad_norm": 0.18700186908245087, "learning_rate": 3.359026308136252e-05, "loss": 0.0722, "num_input_tokens_seen": 20149440, "step": 15535 }, { "epoch": 0.7592895707619769, "grad_norm": 0.15876203775405884, "learning_rate": 3.358104270633272e-05, "loss": 0.1191, "num_input_tokens_seen": 20155520, "step": 15540 }, { "epoch": 0.7595338724256712, "grad_norm": 0.18415573239326477, "learning_rate": 3.357182100799831e-05, "loss": 0.0973, "num_input_tokens_seen": 20162304, "step": 15545 }, { "epoch": 0.7597781740893655, "grad_norm": 0.4587118625640869, "learning_rate": 3.3562597987781384e-05, "loss": 0.0994, "num_input_tokens_seen": 20169408, "step": 15550 }, { "epoch": 0.7600224757530599, "grad_norm": 0.2806416153907776, "learning_rate": 3.355337364710424e-05, "loss": 0.1086, "num_input_tokens_seen": 20175872, "step": 15555 }, { "epoch": 0.7602667774167542, "grad_norm": 0.3135741353034973, "learning_rate": 3.354414798738939e-05, "loss": 0.0865, "num_input_tokens_seen": 20182080, "step": 15560 }, { "epoch": 0.7605110790804486, "grad_norm": 0.2264910638332367, "learning_rate": 3.353492101005955e-05, "loss": 0.0779, "num_input_tokens_seen": 20189056, "step": 15565 }, { "epoch": 0.7607553807441428, "grad_norm": 0.28039655089378357, "learning_rate": 3.352569271653763e-05, "loss": 0.1032, "num_input_tokens_seen": 20195392, "step": 15570 }, { "epoch": 0.7609996824078372, "grad_norm": 0.5230692625045776, "learning_rate": 3.351646310824675e-05, "loss": 0.1027, "num_input_tokens_seen": 20202112, "step": 15575 }, { "epoch": 0.7612439840715315, "grad_norm": 0.5736832618713379, "learning_rate": 3.350723218661023e-05, "loss": 0.0763, "num_input_tokens_seen": 20208480, "step": 15580 }, { "epoch": 0.7614882857352259, "grad_norm": 0.8474494814872742, "learning_rate": 3.349799995305162e-05, "loss": 0.0905, "num_input_tokens_seen": 20215616, "step": 15585 }, { "epoch": 0.7617325873989201, "grad_norm": 0.21045082807540894, "learning_rate": 3.348876640899461e-05, "loss": 0.0704, "num_input_tokens_seen": 20222144, "step": 15590 }, { "epoch": 0.7619768890626145, "grad_norm": 0.23818182945251465, "learning_rate": 3.3479531555863144e-05, "loss": 0.0747, "num_input_tokens_seen": 20228192, "step": 15595 }, { "epoch": 0.7622211907263089, "grad_norm": 0.1582682579755783, "learning_rate": 3.3470295395081344e-05, "loss": 0.1297, "num_input_tokens_seen": 20234592, "step": 15600 }, { "epoch": 0.7622211907263089, "eval_loss": 0.09070826321840286, "eval_runtime": 374.2123, "eval_samples_per_second": 97.231, "eval_steps_per_second": 24.31, "num_input_tokens_seen": 20234592, "step": 15600 }, { "epoch": 0.7624654923900032, "grad_norm": 0.36363399028778076, "learning_rate": 3.3461057928073556e-05, "loss": 0.0836, "num_input_tokens_seen": 20241312, "step": 15605 }, { "epoch": 0.7627097940536975, "grad_norm": 1.0569547414779663, "learning_rate": 3.345181915626431e-05, "loss": 0.1387, "num_input_tokens_seen": 20247648, "step": 15610 }, { "epoch": 0.7629540957173918, "grad_norm": 0.48232921957969666, "learning_rate": 3.344257908107834e-05, "loss": 0.0926, "num_input_tokens_seen": 20254112, "step": 15615 }, { "epoch": 0.7631983973810862, "grad_norm": 0.33527249097824097, "learning_rate": 3.343333770394058e-05, "loss": 0.1033, "num_input_tokens_seen": 20260864, "step": 15620 }, { "epoch": 0.7634426990447805, "grad_norm": 0.27401357889175415, "learning_rate": 3.342409502627616e-05, "loss": 0.1093, "num_input_tokens_seen": 20267136, "step": 15625 }, { "epoch": 0.7636870007084748, "grad_norm": 0.2676611542701721, "learning_rate": 3.341485104951043e-05, "loss": 0.0757, "num_input_tokens_seen": 20273536, "step": 15630 }, { "epoch": 0.7639313023721691, "grad_norm": 0.7231329083442688, "learning_rate": 3.340560577506892e-05, "loss": 0.0765, "num_input_tokens_seen": 20279968, "step": 15635 }, { "epoch": 0.7641756040358635, "grad_norm": 0.1560540497303009, "learning_rate": 3.339635920437735e-05, "loss": 0.076, "num_input_tokens_seen": 20286336, "step": 15640 }, { "epoch": 0.7644199056995579, "grad_norm": 0.2054242491722107, "learning_rate": 3.338711133886169e-05, "loss": 0.0785, "num_input_tokens_seen": 20292448, "step": 15645 }, { "epoch": 0.7646642073632521, "grad_norm": 0.2308696210384369, "learning_rate": 3.3377862179948064e-05, "loss": 0.1191, "num_input_tokens_seen": 20298592, "step": 15650 }, { "epoch": 0.7649085090269465, "grad_norm": 0.20084238052368164, "learning_rate": 3.336861172906281e-05, "loss": 0.0754, "num_input_tokens_seen": 20305120, "step": 15655 }, { "epoch": 0.7651528106906408, "grad_norm": 0.1664259135723114, "learning_rate": 3.335935998763245e-05, "loss": 0.0651, "num_input_tokens_seen": 20311424, "step": 15660 }, { "epoch": 0.7653971123543352, "grad_norm": 0.13762089610099792, "learning_rate": 3.3350106957083744e-05, "loss": 0.0673, "num_input_tokens_seen": 20317952, "step": 15665 }, { "epoch": 0.7656414140180294, "grad_norm": 0.6393171548843384, "learning_rate": 3.33408526388436e-05, "loss": 0.1177, "num_input_tokens_seen": 20324288, "step": 15670 }, { "epoch": 0.7658857156817238, "grad_norm": 0.2160787731409073, "learning_rate": 3.3331597034339166e-05, "loss": 0.1019, "num_input_tokens_seen": 20330496, "step": 15675 }, { "epoch": 0.7661300173454181, "grad_norm": 0.25364625453948975, "learning_rate": 3.3322340144997764e-05, "loss": 0.0894, "num_input_tokens_seen": 20336608, "step": 15680 }, { "epoch": 0.7663743190091125, "grad_norm": 0.6801276206970215, "learning_rate": 3.331308197224693e-05, "loss": 0.1203, "num_input_tokens_seen": 20344064, "step": 15685 }, { "epoch": 0.7666186206728067, "grad_norm": 0.5827445387840271, "learning_rate": 3.330382251751438e-05, "loss": 0.0985, "num_input_tokens_seen": 20350432, "step": 15690 }, { "epoch": 0.7668629223365011, "grad_norm": 0.21954667568206787, "learning_rate": 3.3294561782228054e-05, "loss": 0.1008, "num_input_tokens_seen": 20357056, "step": 15695 }, { "epoch": 0.7671072240001955, "grad_norm": 0.3377029597759247, "learning_rate": 3.328529976781607e-05, "loss": 0.0715, "num_input_tokens_seen": 20363520, "step": 15700 }, { "epoch": 0.7673515256638898, "grad_norm": 0.4205717146396637, "learning_rate": 3.327603647570673e-05, "loss": 0.0772, "num_input_tokens_seen": 20369856, "step": 15705 }, { "epoch": 0.7675958273275841, "grad_norm": 0.2008480727672577, "learning_rate": 3.326677190732857e-05, "loss": 0.1006, "num_input_tokens_seen": 20376192, "step": 15710 }, { "epoch": 0.7678401289912784, "grad_norm": 0.622800886631012, "learning_rate": 3.325750606411029e-05, "loss": 0.0832, "num_input_tokens_seen": 20383360, "step": 15715 }, { "epoch": 0.7680844306549728, "grad_norm": 0.13500791788101196, "learning_rate": 3.3248238947480804e-05, "loss": 0.0871, "num_input_tokens_seen": 20390016, "step": 15720 }, { "epoch": 0.768328732318667, "grad_norm": 0.2939295172691345, "learning_rate": 3.323897055886922e-05, "loss": 0.0852, "num_input_tokens_seen": 20396224, "step": 15725 }, { "epoch": 0.7685730339823614, "grad_norm": 0.24057120084762573, "learning_rate": 3.322970089970484e-05, "loss": 0.0954, "num_input_tokens_seen": 20402592, "step": 15730 }, { "epoch": 0.7688173356460557, "grad_norm": 0.5854883790016174, "learning_rate": 3.3220429971417165e-05, "loss": 0.0771, "num_input_tokens_seen": 20408896, "step": 15735 }, { "epoch": 0.7690616373097501, "grad_norm": 0.1716734915971756, "learning_rate": 3.321115777543588e-05, "loss": 0.0803, "num_input_tokens_seen": 20415808, "step": 15740 }, { "epoch": 0.7693059389734445, "grad_norm": 0.10567527264356613, "learning_rate": 3.320188431319088e-05, "loss": 0.0732, "num_input_tokens_seen": 20422336, "step": 15745 }, { "epoch": 0.7695502406371387, "grad_norm": 0.6831982135772705, "learning_rate": 3.319260958611224e-05, "loss": 0.0879, "num_input_tokens_seen": 20428960, "step": 15750 }, { "epoch": 0.7697945423008331, "grad_norm": 0.21643786132335663, "learning_rate": 3.3183333595630256e-05, "loss": 0.1032, "num_input_tokens_seen": 20435712, "step": 15755 }, { "epoch": 0.7700388439645274, "grad_norm": 0.22789497673511505, "learning_rate": 3.317405634317538e-05, "loss": 0.092, "num_input_tokens_seen": 20441856, "step": 15760 }, { "epoch": 0.7702831456282218, "grad_norm": 0.26058682799339294, "learning_rate": 3.3164777830178315e-05, "loss": 0.0897, "num_input_tokens_seen": 20447936, "step": 15765 }, { "epoch": 0.770527447291916, "grad_norm": 0.32216182351112366, "learning_rate": 3.315549805806989e-05, "loss": 0.0783, "num_input_tokens_seen": 20454432, "step": 15770 }, { "epoch": 0.7707717489556104, "grad_norm": 0.3011501431465149, "learning_rate": 3.314621702828118e-05, "loss": 0.0733, "num_input_tokens_seen": 20460960, "step": 15775 }, { "epoch": 0.7710160506193047, "grad_norm": 0.17970073223114014, "learning_rate": 3.313693474224342e-05, "loss": 0.08, "num_input_tokens_seen": 20467392, "step": 15780 }, { "epoch": 0.771260352282999, "grad_norm": 0.44660404324531555, "learning_rate": 3.312765120138809e-05, "loss": 0.0909, "num_input_tokens_seen": 20473856, "step": 15785 }, { "epoch": 0.7715046539466934, "grad_norm": 0.19726650416851044, "learning_rate": 3.311836640714679e-05, "loss": 0.102, "num_input_tokens_seen": 20480192, "step": 15790 }, { "epoch": 0.7717489556103877, "grad_norm": 0.24625058472156525, "learning_rate": 3.310908036095137e-05, "loss": 0.09, "num_input_tokens_seen": 20486720, "step": 15795 }, { "epoch": 0.7719932572740821, "grad_norm": 0.16987064480781555, "learning_rate": 3.309979306423386e-05, "loss": 0.0795, "num_input_tokens_seen": 20493056, "step": 15800 }, { "epoch": 0.7719932572740821, "eval_loss": 0.09032799303531647, "eval_runtime": 373.7304, "eval_samples_per_second": 97.356, "eval_steps_per_second": 24.341, "num_input_tokens_seen": 20493056, "step": 15800 }, { "epoch": 0.7722375589377763, "grad_norm": 0.7245461344718933, "learning_rate": 3.309050451842647e-05, "loss": 0.0907, "num_input_tokens_seen": 20498880, "step": 15805 }, { "epoch": 0.7724818606014707, "grad_norm": 0.5405999422073364, "learning_rate": 3.3081214724961604e-05, "loss": 0.0778, "num_input_tokens_seen": 20505152, "step": 15810 }, { "epoch": 0.772726162265165, "grad_norm": 0.31857335567474365, "learning_rate": 3.307192368527188e-05, "loss": 0.0859, "num_input_tokens_seen": 20511232, "step": 15815 }, { "epoch": 0.7729704639288594, "grad_norm": 0.7869847416877747, "learning_rate": 3.306263140079008e-05, "loss": 0.1061, "num_input_tokens_seen": 20517824, "step": 15820 }, { "epoch": 0.7732147655925536, "grad_norm": 0.16376420855522156, "learning_rate": 3.30533378729492e-05, "loss": 0.0744, "num_input_tokens_seen": 20524448, "step": 15825 }, { "epoch": 0.773459067256248, "grad_norm": 0.15147343277931213, "learning_rate": 3.304404310318242e-05, "loss": 0.0889, "num_input_tokens_seen": 20530624, "step": 15830 }, { "epoch": 0.7737033689199423, "grad_norm": 0.5425155162811279, "learning_rate": 3.3034747092923105e-05, "loss": 0.0821, "num_input_tokens_seen": 20537568, "step": 15835 }, { "epoch": 0.7739476705836367, "grad_norm": 0.2370752990245819, "learning_rate": 3.3025449843604806e-05, "loss": 0.1034, "num_input_tokens_seen": 20544032, "step": 15840 }, { "epoch": 0.774191972247331, "grad_norm": 0.5597333908081055, "learning_rate": 3.30161513566613e-05, "loss": 0.0886, "num_input_tokens_seen": 20550336, "step": 15845 }, { "epoch": 0.7744362739110253, "grad_norm": 0.21963255107402802, "learning_rate": 3.3006851633526506e-05, "loss": 0.1078, "num_input_tokens_seen": 20556832, "step": 15850 }, { "epoch": 0.7746805755747197, "grad_norm": 0.1592438817024231, "learning_rate": 3.2997550675634584e-05, "loss": 0.0883, "num_input_tokens_seen": 20562944, "step": 15855 }, { "epoch": 0.774924877238414, "grad_norm": 0.3076557517051697, "learning_rate": 3.2988248484419825e-05, "loss": 0.087, "num_input_tokens_seen": 20569888, "step": 15860 }, { "epoch": 0.7751691789021083, "grad_norm": 0.4141273498535156, "learning_rate": 3.2978945061316776e-05, "loss": 0.0694, "num_input_tokens_seen": 20576736, "step": 15865 }, { "epoch": 0.7754134805658026, "grad_norm": 0.5801581740379333, "learning_rate": 3.296964040776013e-05, "loss": 0.0803, "num_input_tokens_seen": 20582752, "step": 15870 }, { "epoch": 0.775657782229497, "grad_norm": 0.5067822337150574, "learning_rate": 3.296033452518478e-05, "loss": 0.1004, "num_input_tokens_seen": 20589024, "step": 15875 }, { "epoch": 0.7759020838931913, "grad_norm": 0.16847997903823853, "learning_rate": 3.2951027415025806e-05, "loss": 0.0901, "num_input_tokens_seen": 20595520, "step": 15880 }, { "epoch": 0.7761463855568856, "grad_norm": 0.1456044465303421, "learning_rate": 3.294171907871849e-05, "loss": 0.0978, "num_input_tokens_seen": 20601504, "step": 15885 }, { "epoch": 0.77639068722058, "grad_norm": 0.5908177495002747, "learning_rate": 3.293240951769828e-05, "loss": 0.0772, "num_input_tokens_seen": 20607744, "step": 15890 }, { "epoch": 0.7766349888842743, "grad_norm": 0.26280805468559265, "learning_rate": 3.2923098733400846e-05, "loss": 0.0997, "num_input_tokens_seen": 20613952, "step": 15895 }, { "epoch": 0.7768792905479687, "grad_norm": 0.5007971525192261, "learning_rate": 3.291378672726202e-05, "loss": 0.0868, "num_input_tokens_seen": 20620800, "step": 15900 }, { "epoch": 0.7771235922116629, "grad_norm": 0.23507219552993774, "learning_rate": 3.2904473500717824e-05, "loss": 0.0653, "num_input_tokens_seen": 20627328, "step": 15905 }, { "epoch": 0.7773678938753573, "grad_norm": 0.31318363547325134, "learning_rate": 3.289515905520449e-05, "loss": 0.0772, "num_input_tokens_seen": 20633824, "step": 15910 }, { "epoch": 0.7776121955390516, "grad_norm": 0.5659288167953491, "learning_rate": 3.288584339215841e-05, "loss": 0.0886, "num_input_tokens_seen": 20640512, "step": 15915 }, { "epoch": 0.777856497202746, "grad_norm": 0.3963339626789093, "learning_rate": 3.287652651301617e-05, "loss": 0.0606, "num_input_tokens_seen": 20647040, "step": 15920 }, { "epoch": 0.7781007988664402, "grad_norm": 0.7084959745407104, "learning_rate": 3.286720841921457e-05, "loss": 0.09, "num_input_tokens_seen": 20653888, "step": 15925 }, { "epoch": 0.7783451005301346, "grad_norm": 0.44870227575302124, "learning_rate": 3.285788911219056e-05, "loss": 0.0712, "num_input_tokens_seen": 20660448, "step": 15930 }, { "epoch": 0.778589402193829, "grad_norm": 0.23321373760700226, "learning_rate": 3.284856859338131e-05, "loss": 0.1076, "num_input_tokens_seen": 20666208, "step": 15935 }, { "epoch": 0.7788337038575233, "grad_norm": 0.4133414328098297, "learning_rate": 3.283924686422414e-05, "loss": 0.0931, "num_input_tokens_seen": 20672352, "step": 15940 }, { "epoch": 0.7790780055212176, "grad_norm": 0.27067577838897705, "learning_rate": 3.282992392615659e-05, "loss": 0.09, "num_input_tokens_seen": 20678592, "step": 15945 }, { "epoch": 0.7793223071849119, "grad_norm": 0.3155580163002014, "learning_rate": 3.282059978061638e-05, "loss": 0.0888, "num_input_tokens_seen": 20685952, "step": 15950 }, { "epoch": 0.7795666088486063, "grad_norm": 0.4747217893600464, "learning_rate": 3.28112744290414e-05, "loss": 0.1048, "num_input_tokens_seen": 20692224, "step": 15955 }, { "epoch": 0.7798109105123006, "grad_norm": 0.2904408872127533, "learning_rate": 3.280194787286974e-05, "loss": 0.0951, "num_input_tokens_seen": 20698592, "step": 15960 }, { "epoch": 0.7800552121759949, "grad_norm": 0.14023855328559875, "learning_rate": 3.2792620113539674e-05, "loss": 0.0836, "num_input_tokens_seen": 20705344, "step": 15965 }, { "epoch": 0.7802995138396892, "grad_norm": 0.3207884728908539, "learning_rate": 3.278329115248966e-05, "loss": 0.0873, "num_input_tokens_seen": 20711936, "step": 15970 }, { "epoch": 0.7805438155033836, "grad_norm": 0.2163049876689911, "learning_rate": 3.277396099115834e-05, "loss": 0.0802, "num_input_tokens_seen": 20718112, "step": 15975 }, { "epoch": 0.7807881171670779, "grad_norm": 0.4113171100616455, "learning_rate": 3.276462963098454e-05, "loss": 0.0853, "num_input_tokens_seen": 20724288, "step": 15980 }, { "epoch": 0.7810324188307722, "grad_norm": 0.22360636293888092, "learning_rate": 3.275529707340728e-05, "loss": 0.077, "num_input_tokens_seen": 20730592, "step": 15985 }, { "epoch": 0.7812767204944666, "grad_norm": 0.36338016390800476, "learning_rate": 3.274596331986574e-05, "loss": 0.0766, "num_input_tokens_seen": 20737280, "step": 15990 }, { "epoch": 0.7815210221581609, "grad_norm": 0.11758249253034592, "learning_rate": 3.273662837179932e-05, "loss": 0.0764, "num_input_tokens_seen": 20743584, "step": 15995 }, { "epoch": 0.7817653238218553, "grad_norm": 0.2773174047470093, "learning_rate": 3.272729223064758e-05, "loss": 0.0911, "num_input_tokens_seen": 20750368, "step": 16000 }, { "epoch": 0.7817653238218553, "eval_loss": 0.09007633477449417, "eval_runtime": 374.2505, "eval_samples_per_second": 97.221, "eval_steps_per_second": 24.307, "num_input_tokens_seen": 20750368, "step": 16000 }, { "epoch": 0.7820096254855495, "grad_norm": 0.38131749629974365, "learning_rate": 3.2717954897850264e-05, "loss": 0.0998, "num_input_tokens_seen": 20756736, "step": 16005 }, { "epoch": 0.7822539271492439, "grad_norm": 0.45304617285728455, "learning_rate": 3.270861637484733e-05, "loss": 0.0726, "num_input_tokens_seen": 20763136, "step": 16010 }, { "epoch": 0.7824982288129382, "grad_norm": 0.19977276027202606, "learning_rate": 3.2699276663078867e-05, "loss": 0.1086, "num_input_tokens_seen": 20769344, "step": 16015 }, { "epoch": 0.7827425304766326, "grad_norm": 0.17315085232257843, "learning_rate": 3.268993576398519e-05, "loss": 0.0991, "num_input_tokens_seen": 20776256, "step": 16020 }, { "epoch": 0.7829868321403268, "grad_norm": 1.041510820388794, "learning_rate": 3.268059367900678e-05, "loss": 0.0931, "num_input_tokens_seen": 20782592, "step": 16025 }, { "epoch": 0.7832311338040212, "grad_norm": 0.4767925441265106, "learning_rate": 3.26712504095843e-05, "loss": 0.0682, "num_input_tokens_seen": 20789376, "step": 16030 }, { "epoch": 0.7834754354677156, "grad_norm": 0.6064233183860779, "learning_rate": 3.2661905957158615e-05, "loss": 0.0916, "num_input_tokens_seen": 20795840, "step": 16035 }, { "epoch": 0.7837197371314099, "grad_norm": 0.43465864658355713, "learning_rate": 3.2652560323170734e-05, "loss": 0.0999, "num_input_tokens_seen": 20802624, "step": 16040 }, { "epoch": 0.7839640387951042, "grad_norm": 0.43521833419799805, "learning_rate": 3.264321350906189e-05, "loss": 0.0823, "num_input_tokens_seen": 20808992, "step": 16045 }, { "epoch": 0.7842083404587985, "grad_norm": 0.8363173604011536, "learning_rate": 3.263386551627346e-05, "loss": 0.0951, "num_input_tokens_seen": 20815456, "step": 16050 }, { "epoch": 0.7844526421224929, "grad_norm": 0.7292890548706055, "learning_rate": 3.2624516346247055e-05, "loss": 0.0909, "num_input_tokens_seen": 20822432, "step": 16055 }, { "epoch": 0.7846969437861872, "grad_norm": 0.405486524105072, "learning_rate": 3.2615166000424404e-05, "loss": 0.1042, "num_input_tokens_seen": 20828608, "step": 16060 }, { "epoch": 0.7849412454498815, "grad_norm": 0.19354330003261566, "learning_rate": 3.260581448024745e-05, "loss": 0.1012, "num_input_tokens_seen": 20835360, "step": 16065 }, { "epoch": 0.7851855471135758, "grad_norm": 0.7793437838554382, "learning_rate": 3.2596461787158335e-05, "loss": 0.1221, "num_input_tokens_seen": 20842144, "step": 16070 }, { "epoch": 0.7854298487772702, "grad_norm": 0.17763392627239227, "learning_rate": 3.258710792259934e-05, "loss": 0.086, "num_input_tokens_seen": 20848032, "step": 16075 }, { "epoch": 0.7856741504409646, "grad_norm": 0.365969717502594, "learning_rate": 3.257775288801296e-05, "loss": 0.0757, "num_input_tokens_seen": 20854400, "step": 16080 }, { "epoch": 0.7859184521046588, "grad_norm": 0.3156670331954956, "learning_rate": 3.256839668484186e-05, "loss": 0.0885, "num_input_tokens_seen": 20861088, "step": 16085 }, { "epoch": 0.7861627537683532, "grad_norm": 0.23380668461322784, "learning_rate": 3.255903931452888e-05, "loss": 0.0748, "num_input_tokens_seen": 20867168, "step": 16090 }, { "epoch": 0.7864070554320475, "grad_norm": 0.5751817226409912, "learning_rate": 3.2549680778517045e-05, "loss": 0.0895, "num_input_tokens_seen": 20874208, "step": 16095 }, { "epoch": 0.7866513570957419, "grad_norm": 0.6495119333267212, "learning_rate": 3.2540321078249556e-05, "loss": 0.1109, "num_input_tokens_seen": 20881088, "step": 16100 }, { "epoch": 0.7868956587594361, "grad_norm": 0.7671195864677429, "learning_rate": 3.2530960215169795e-05, "loss": 0.1035, "num_input_tokens_seen": 20887424, "step": 16105 }, { "epoch": 0.7871399604231305, "grad_norm": 0.29511508345603943, "learning_rate": 3.2521598190721345e-05, "loss": 0.0869, "num_input_tokens_seen": 20893792, "step": 16110 }, { "epoch": 0.7873842620868248, "grad_norm": 0.34207212924957275, "learning_rate": 3.251223500634792e-05, "loss": 0.0948, "num_input_tokens_seen": 20899872, "step": 16115 }, { "epoch": 0.7876285637505192, "grad_norm": 0.21313707530498505, "learning_rate": 3.2502870663493445e-05, "loss": 0.0913, "num_input_tokens_seen": 20906208, "step": 16120 }, { "epoch": 0.7878728654142134, "grad_norm": 0.4783528447151184, "learning_rate": 3.249350516360203e-05, "loss": 0.1027, "num_input_tokens_seen": 20912416, "step": 16125 }, { "epoch": 0.7881171670779078, "grad_norm": 0.11148922145366669, "learning_rate": 3.248413850811797e-05, "loss": 0.08, "num_input_tokens_seen": 20919136, "step": 16130 }, { "epoch": 0.7883614687416022, "grad_norm": 0.23832370340824127, "learning_rate": 3.2474770698485677e-05, "loss": 0.1149, "num_input_tokens_seen": 20925568, "step": 16135 }, { "epoch": 0.7886057704052964, "grad_norm": 0.1318652331829071, "learning_rate": 3.246540173614983e-05, "loss": 0.0679, "num_input_tokens_seen": 20932064, "step": 16140 }, { "epoch": 0.7888500720689908, "grad_norm": 0.21414028108119965, "learning_rate": 3.2456031622555197e-05, "loss": 0.0782, "num_input_tokens_seen": 20938592, "step": 16145 }, { "epoch": 0.7890943737326851, "grad_norm": 0.25109416246414185, "learning_rate": 3.2446660359146794e-05, "loss": 0.076, "num_input_tokens_seen": 20944832, "step": 16150 }, { "epoch": 0.7893386753963795, "grad_norm": 0.46320194005966187, "learning_rate": 3.2437287947369786e-05, "loss": 0.0806, "num_input_tokens_seen": 20951584, "step": 16155 }, { "epoch": 0.7895829770600737, "grad_norm": 0.1495831161737442, "learning_rate": 3.2427914388669525e-05, "loss": 0.0648, "num_input_tokens_seen": 20958336, "step": 16160 }, { "epoch": 0.7898272787237681, "grad_norm": 0.15675799548625946, "learning_rate": 3.241853968449151e-05, "loss": 0.0948, "num_input_tokens_seen": 20964512, "step": 16165 }, { "epoch": 0.7900715803874624, "grad_norm": 0.21098695695400238, "learning_rate": 3.240916383628144e-05, "loss": 0.0975, "num_input_tokens_seen": 20970912, "step": 16170 }, { "epoch": 0.7903158820511568, "grad_norm": 0.3168441653251648, "learning_rate": 3.239978684548521e-05, "loss": 0.0638, "num_input_tokens_seen": 20977408, "step": 16175 }, { "epoch": 0.7905601837148512, "grad_norm": 0.5496974587440491, "learning_rate": 3.239040871354885e-05, "loss": 0.103, "num_input_tokens_seen": 20983648, "step": 16180 }, { "epoch": 0.7908044853785454, "grad_norm": 0.2846753001213074, "learning_rate": 3.2381029441918596e-05, "loss": 0.0741, "num_input_tokens_seen": 20990432, "step": 16185 }, { "epoch": 0.7910487870422398, "grad_norm": 0.11257338523864746, "learning_rate": 3.2371649032040845e-05, "loss": 0.065, "num_input_tokens_seen": 20997152, "step": 16190 }, { "epoch": 0.7912930887059341, "grad_norm": 0.17917214334011078, "learning_rate": 3.2362267485362174e-05, "loss": 0.083, "num_input_tokens_seen": 21003776, "step": 16195 }, { "epoch": 0.7915373903696284, "grad_norm": 0.21409712731838226, "learning_rate": 3.235288480332934e-05, "loss": 0.0956, "num_input_tokens_seen": 21010432, "step": 16200 }, { "epoch": 0.7915373903696284, "eval_loss": 0.09002770483493805, "eval_runtime": 374.6967, "eval_samples_per_second": 97.105, "eval_steps_per_second": 24.278, "num_input_tokens_seen": 21010432, "step": 16200 }, { "epoch": 0.7917816920333227, "grad_norm": 0.1966903656721115, "learning_rate": 3.234350098738927e-05, "loss": 0.088, "num_input_tokens_seen": 21016736, "step": 16205 }, { "epoch": 0.7920259936970171, "grad_norm": 0.2998768985271454, "learning_rate": 3.233411603898906e-05, "loss": 0.1016, "num_input_tokens_seen": 21023776, "step": 16210 }, { "epoch": 0.7922702953607114, "grad_norm": 0.1670360416173935, "learning_rate": 3.232472995957599e-05, "loss": 0.0764, "num_input_tokens_seen": 21030016, "step": 16215 }, { "epoch": 0.7925145970244057, "grad_norm": 0.2041340321302414, "learning_rate": 3.231534275059751e-05, "loss": 0.0548, "num_input_tokens_seen": 21036832, "step": 16220 }, { "epoch": 0.7927588986881, "grad_norm": 0.5878769159317017, "learning_rate": 3.230595441350125e-05, "loss": 0.0925, "num_input_tokens_seen": 21043264, "step": 16225 }, { "epoch": 0.7930032003517944, "grad_norm": 0.24251492321491241, "learning_rate": 3.2296564949735e-05, "loss": 0.0607, "num_input_tokens_seen": 21049824, "step": 16230 }, { "epoch": 0.7932475020154888, "grad_norm": 0.37280505895614624, "learning_rate": 3.228717436074675e-05, "loss": 0.0892, "num_input_tokens_seen": 21056640, "step": 16235 }, { "epoch": 0.793491803679183, "grad_norm": 0.5235351324081421, "learning_rate": 3.227778264798463e-05, "loss": 0.1149, "num_input_tokens_seen": 21063360, "step": 16240 }, { "epoch": 0.7937361053428774, "grad_norm": 0.47078943252563477, "learning_rate": 3.226838981289698e-05, "loss": 0.0988, "num_input_tokens_seen": 21069824, "step": 16245 }, { "epoch": 0.7939804070065717, "grad_norm": 0.17220379412174225, "learning_rate": 3.225899585693227e-05, "loss": 0.0748, "num_input_tokens_seen": 21076416, "step": 16250 }, { "epoch": 0.7942247086702661, "grad_norm": 0.3375694453716278, "learning_rate": 3.224960078153918e-05, "loss": 0.0734, "num_input_tokens_seen": 21082496, "step": 16255 }, { "epoch": 0.7944690103339603, "grad_norm": 0.2186184674501419, "learning_rate": 3.224020458816655e-05, "loss": 0.079, "num_input_tokens_seen": 21088992, "step": 16260 }, { "epoch": 0.7947133119976547, "grad_norm": 0.2882148027420044, "learning_rate": 3.223080727826337e-05, "loss": 0.0863, "num_input_tokens_seen": 21095328, "step": 16265 }, { "epoch": 0.794957613661349, "grad_norm": 0.5236883759498596, "learning_rate": 3.222140885327885e-05, "loss": 0.0683, "num_input_tokens_seen": 21101440, "step": 16270 }, { "epoch": 0.7952019153250434, "grad_norm": 0.407326340675354, "learning_rate": 3.221200931466234e-05, "loss": 0.0875, "num_input_tokens_seen": 21108064, "step": 16275 }, { "epoch": 0.7954462169887377, "grad_norm": 1.09868323802948, "learning_rate": 3.220260866386336e-05, "loss": 0.1025, "num_input_tokens_seen": 21114464, "step": 16280 }, { "epoch": 0.795690518652432, "grad_norm": 0.2643915116786957, "learning_rate": 3.21932069023316e-05, "loss": 0.0594, "num_input_tokens_seen": 21121568, "step": 16285 }, { "epoch": 0.7959348203161264, "grad_norm": 0.722817063331604, "learning_rate": 3.218380403151695e-05, "loss": 0.0662, "num_input_tokens_seen": 21127968, "step": 16290 }, { "epoch": 0.7961791219798207, "grad_norm": 0.13945432007312775, "learning_rate": 3.217440005286943e-05, "loss": 0.0791, "num_input_tokens_seen": 21133920, "step": 16295 }, { "epoch": 0.796423423643515, "grad_norm": 0.4055236279964447, "learning_rate": 3.216499496783928e-05, "loss": 0.0782, "num_input_tokens_seen": 21140672, "step": 16300 }, { "epoch": 0.7966677253072093, "grad_norm": 0.37047192454338074, "learning_rate": 3.2155588777876856e-05, "loss": 0.0913, "num_input_tokens_seen": 21147168, "step": 16305 }, { "epoch": 0.7969120269709037, "grad_norm": 0.5153141617774963, "learning_rate": 3.214618148443273e-05, "loss": 0.0862, "num_input_tokens_seen": 21153664, "step": 16310 }, { "epoch": 0.797156328634598, "grad_norm": 0.1620597094297409, "learning_rate": 3.2136773088957595e-05, "loss": 0.0602, "num_input_tokens_seen": 21160320, "step": 16315 }, { "epoch": 0.7974006302982923, "grad_norm": 0.23809976875782013, "learning_rate": 3.2127363592902374e-05, "loss": 0.07, "num_input_tokens_seen": 21166816, "step": 16320 }, { "epoch": 0.7976449319619867, "grad_norm": 0.22767113149166107, "learning_rate": 3.211795299771812e-05, "loss": 0.0946, "num_input_tokens_seen": 21173504, "step": 16325 }, { "epoch": 0.797889233625681, "grad_norm": 0.13772326707839966, "learning_rate": 3.210854130485605e-05, "loss": 0.053, "num_input_tokens_seen": 21179936, "step": 16330 }, { "epoch": 0.7981335352893754, "grad_norm": 0.8970298767089844, "learning_rate": 3.209912851576759e-05, "loss": 0.0896, "num_input_tokens_seen": 21186368, "step": 16335 }, { "epoch": 0.7983778369530696, "grad_norm": 0.5695586800575256, "learning_rate": 3.208971463190431e-05, "loss": 0.0634, "num_input_tokens_seen": 21192704, "step": 16340 }, { "epoch": 0.798622138616764, "grad_norm": 0.32216134667396545, "learning_rate": 3.208029965471793e-05, "loss": 0.0758, "num_input_tokens_seen": 21199200, "step": 16345 }, { "epoch": 0.7988664402804583, "grad_norm": 0.6614288687705994, "learning_rate": 3.2070883585660364e-05, "loss": 0.1159, "num_input_tokens_seen": 21205088, "step": 16350 }, { "epoch": 0.7991107419441527, "grad_norm": 0.4191703796386719, "learning_rate": 3.20614664261837e-05, "loss": 0.1127, "num_input_tokens_seen": 21211040, "step": 16355 }, { "epoch": 0.7993550436078469, "grad_norm": 0.2983381748199463, "learning_rate": 3.205204817774016e-05, "loss": 0.1159, "num_input_tokens_seen": 21217216, "step": 16360 }, { "epoch": 0.7995993452715413, "grad_norm": 0.188340425491333, "learning_rate": 3.204262884178218e-05, "loss": 0.0874, "num_input_tokens_seen": 21223936, "step": 16365 }, { "epoch": 0.7998436469352356, "grad_norm": 0.547640860080719, "learning_rate": 3.2033208419762314e-05, "loss": 0.0661, "num_input_tokens_seen": 21230496, "step": 16370 }, { "epoch": 0.80008794859893, "grad_norm": 0.13594357669353485, "learning_rate": 3.2023786913133344e-05, "loss": 0.0718, "num_input_tokens_seen": 21236672, "step": 16375 }, { "epoch": 0.8003322502626243, "grad_norm": 0.5695081949234009, "learning_rate": 3.201436432334816e-05, "loss": 0.0751, "num_input_tokens_seen": 21243744, "step": 16380 }, { "epoch": 0.8005765519263186, "grad_norm": 0.40831342339515686, "learning_rate": 3.2004940651859844e-05, "loss": 0.0922, "num_input_tokens_seen": 21250304, "step": 16385 }, { "epoch": 0.800820853590013, "grad_norm": 0.44466015696525574, "learning_rate": 3.1995515900121655e-05, "loss": 0.1274, "num_input_tokens_seen": 21256864, "step": 16390 }, { "epoch": 0.8010651552537073, "grad_norm": 0.49338626861572266, "learning_rate": 3.1986090069587e-05, "loss": 0.0974, "num_input_tokens_seen": 21263456, "step": 16395 }, { "epoch": 0.8013094569174016, "grad_norm": 0.26017460227012634, "learning_rate": 3.1976663161709466e-05, "loss": 0.0735, "num_input_tokens_seen": 21270112, "step": 16400 }, { "epoch": 0.8013094569174016, "eval_loss": 0.09003161638975143, "eval_runtime": 374.3888, "eval_samples_per_second": 97.185, "eval_steps_per_second": 24.298, "num_input_tokens_seen": 21270112, "step": 16400 }, { "epoch": 0.8015537585810959, "grad_norm": 0.2305414378643036, "learning_rate": 3.196723517794279e-05, "loss": 0.0927, "num_input_tokens_seen": 21276864, "step": 16405 }, { "epoch": 0.8017980602447903, "grad_norm": 0.3394778370857239, "learning_rate": 3.19578061197409e-05, "loss": 0.1035, "num_input_tokens_seen": 21283392, "step": 16410 }, { "epoch": 0.8020423619084845, "grad_norm": 0.27324944734573364, "learning_rate": 3.194837598855787e-05, "loss": 0.0661, "num_input_tokens_seen": 21290048, "step": 16415 }, { "epoch": 0.8022866635721789, "grad_norm": 0.608992874622345, "learning_rate": 3.193894478584794e-05, "loss": 0.0922, "num_input_tokens_seen": 21296704, "step": 16420 }, { "epoch": 0.8025309652358733, "grad_norm": 0.5174852609634399, "learning_rate": 3.192951251306553e-05, "loss": 0.0844, "num_input_tokens_seen": 21303456, "step": 16425 }, { "epoch": 0.8027752668995676, "grad_norm": 0.12355522066354752, "learning_rate": 3.192007917166521e-05, "loss": 0.0783, "num_input_tokens_seen": 21309792, "step": 16430 }, { "epoch": 0.803019568563262, "grad_norm": 0.33863064646720886, "learning_rate": 3.191064476310171e-05, "loss": 0.0738, "num_input_tokens_seen": 21316544, "step": 16435 }, { "epoch": 0.8032638702269562, "grad_norm": 0.627816915512085, "learning_rate": 3.1901209288829944e-05, "loss": 0.1167, "num_input_tokens_seen": 21322816, "step": 16440 }, { "epoch": 0.8035081718906506, "grad_norm": 0.25382542610168457, "learning_rate": 3.1891772750304985e-05, "loss": 0.098, "num_input_tokens_seen": 21329120, "step": 16445 }, { "epoch": 0.8037524735543449, "grad_norm": 0.11463594436645508, "learning_rate": 3.188233514898206e-05, "loss": 0.0889, "num_input_tokens_seen": 21335904, "step": 16450 }, { "epoch": 0.8039967752180393, "grad_norm": 0.6263303160667419, "learning_rate": 3.187289648631657e-05, "loss": 0.0976, "num_input_tokens_seen": 21342368, "step": 16455 }, { "epoch": 0.8042410768817335, "grad_norm": 0.22263690829277039, "learning_rate": 3.186345676376406e-05, "loss": 0.0938, "num_input_tokens_seen": 21348896, "step": 16460 }, { "epoch": 0.8044853785454279, "grad_norm": 0.5023384094238281, "learning_rate": 3.1854015982780275e-05, "loss": 0.1094, "num_input_tokens_seen": 21355168, "step": 16465 }, { "epoch": 0.8047296802091223, "grad_norm": 0.20235604047775269, "learning_rate": 3.1844574144821084e-05, "loss": 0.0818, "num_input_tokens_seen": 21361952, "step": 16470 }, { "epoch": 0.8049739818728165, "grad_norm": 0.4643576741218567, "learning_rate": 3.1835131251342554e-05, "loss": 0.1054, "num_input_tokens_seen": 21368000, "step": 16475 }, { "epoch": 0.8052182835365109, "grad_norm": 0.1672973930835724, "learning_rate": 3.182568730380089e-05, "loss": 0.0833, "num_input_tokens_seen": 21374784, "step": 16480 }, { "epoch": 0.8054625852002052, "grad_norm": 0.13738568127155304, "learning_rate": 3.181624230365245e-05, "loss": 0.1008, "num_input_tokens_seen": 21381568, "step": 16485 }, { "epoch": 0.8057068868638996, "grad_norm": 0.34201136231422424, "learning_rate": 3.180679625235381e-05, "loss": 0.1065, "num_input_tokens_seen": 21387904, "step": 16490 }, { "epoch": 0.8059511885275938, "grad_norm": 0.18863776326179504, "learning_rate": 3.1797349151361646e-05, "loss": 0.0813, "num_input_tokens_seen": 21394208, "step": 16495 }, { "epoch": 0.8061954901912882, "grad_norm": 0.2991012632846832, "learning_rate": 3.178790100213281e-05, "loss": 0.1138, "num_input_tokens_seen": 21400576, "step": 16500 }, { "epoch": 0.8064397918549825, "grad_norm": 0.2918361723423004, "learning_rate": 3.1778451806124346e-05, "loss": 0.0886, "num_input_tokens_seen": 21406944, "step": 16505 }, { "epoch": 0.8066840935186769, "grad_norm": 0.19832010567188263, "learning_rate": 3.176900156479342e-05, "loss": 0.061, "num_input_tokens_seen": 21413408, "step": 16510 }, { "epoch": 0.8069283951823711, "grad_norm": 0.25838011503219604, "learning_rate": 3.17595502795974e-05, "loss": 0.0813, "num_input_tokens_seen": 21419488, "step": 16515 }, { "epoch": 0.8071726968460655, "grad_norm": 0.2581910789012909, "learning_rate": 3.175009795199377e-05, "loss": 0.0716, "num_input_tokens_seen": 21425984, "step": 16520 }, { "epoch": 0.8074169985097599, "grad_norm": 0.21066027879714966, "learning_rate": 3.1740644583440224e-05, "loss": 0.0664, "num_input_tokens_seen": 21432544, "step": 16525 }, { "epoch": 0.8076613001734542, "grad_norm": 0.46053269505500793, "learning_rate": 3.173119017539457e-05, "loss": 0.1128, "num_input_tokens_seen": 21439200, "step": 16530 }, { "epoch": 0.8079056018371485, "grad_norm": 0.24051208794116974, "learning_rate": 3.172173472931479e-05, "loss": 0.1005, "num_input_tokens_seen": 21446272, "step": 16535 }, { "epoch": 0.8081499035008428, "grad_norm": 0.8008976578712463, "learning_rate": 3.1712278246659055e-05, "loss": 0.0865, "num_input_tokens_seen": 21452800, "step": 16540 }, { "epoch": 0.8083942051645372, "grad_norm": 0.4829369783401489, "learning_rate": 3.170282072888566e-05, "loss": 0.0795, "num_input_tokens_seen": 21459328, "step": 16545 }, { "epoch": 0.8086385068282315, "grad_norm": 0.3180612623691559, "learning_rate": 3.169336217745307e-05, "loss": 0.074, "num_input_tokens_seen": 21465408, "step": 16550 }, { "epoch": 0.8088828084919258, "grad_norm": 0.6453426480293274, "learning_rate": 3.1683902593819924e-05, "loss": 0.0984, "num_input_tokens_seen": 21472224, "step": 16555 }, { "epoch": 0.8091271101556201, "grad_norm": 0.161174014210701, "learning_rate": 3.1674441979445e-05, "loss": 0.0864, "num_input_tokens_seen": 21479456, "step": 16560 }, { "epoch": 0.8093714118193145, "grad_norm": 0.15104661881923676, "learning_rate": 3.166498033578725e-05, "loss": 0.0816, "num_input_tokens_seen": 21485728, "step": 16565 }, { "epoch": 0.8096157134830089, "grad_norm": 0.5785391926765442, "learning_rate": 3.165551766430578e-05, "loss": 0.0822, "num_input_tokens_seen": 21492320, "step": 16570 }, { "epoch": 0.8098600151467031, "grad_norm": 0.24244458973407745, "learning_rate": 3.164605396645984e-05, "loss": 0.1127, "num_input_tokens_seen": 21498720, "step": 16575 }, { "epoch": 0.8101043168103975, "grad_norm": 0.23552216589450836, "learning_rate": 3.163658924370886e-05, "loss": 0.0863, "num_input_tokens_seen": 21505568, "step": 16580 }, { "epoch": 0.8103486184740918, "grad_norm": 0.16243407130241394, "learning_rate": 3.1627123497512415e-05, "loss": 0.0981, "num_input_tokens_seen": 21512096, "step": 16585 }, { "epoch": 0.8105929201377862, "grad_norm": 0.14440470933914185, "learning_rate": 3.1617656729330245e-05, "loss": 0.0629, "num_input_tokens_seen": 21518784, "step": 16590 }, { "epoch": 0.8108372218014804, "grad_norm": 0.8911522626876831, "learning_rate": 3.1608188940622255e-05, "loss": 0.0915, "num_input_tokens_seen": 21525024, "step": 16595 }, { "epoch": 0.8110815234651748, "grad_norm": 0.588506817817688, "learning_rate": 3.159872013284847e-05, "loss": 0.131, "num_input_tokens_seen": 21531456, "step": 16600 }, { "epoch": 0.8110815234651748, "eval_loss": 0.0899258404970169, "eval_runtime": 374.436, "eval_samples_per_second": 97.173, "eval_steps_per_second": 24.295, "num_input_tokens_seen": 21531456, "step": 16600 }, { "epoch": 0.8113258251288691, "grad_norm": 0.5760505199432373, "learning_rate": 3.1589250307469134e-05, "loss": 0.0826, "num_input_tokens_seen": 21537888, "step": 16605 }, { "epoch": 0.8115701267925635, "grad_norm": 0.2738734185695648, "learning_rate": 3.1579779465944586e-05, "loss": 0.1053, "num_input_tokens_seen": 21544288, "step": 16610 }, { "epoch": 0.8118144284562578, "grad_norm": 0.24786977469921112, "learning_rate": 3.1570307609735363e-05, "loss": 0.0808, "num_input_tokens_seen": 21550784, "step": 16615 }, { "epoch": 0.8120587301199521, "grad_norm": 0.171997532248497, "learning_rate": 3.156083474030213e-05, "loss": 0.1059, "num_input_tokens_seen": 21557280, "step": 16620 }, { "epoch": 0.8123030317836465, "grad_norm": 0.7549643516540527, "learning_rate": 3.155136085910573e-05, "loss": 0.0793, "num_input_tokens_seen": 21563808, "step": 16625 }, { "epoch": 0.8125473334473408, "grad_norm": 0.24193868041038513, "learning_rate": 3.154188596760717e-05, "loss": 0.0851, "num_input_tokens_seen": 21570656, "step": 16630 }, { "epoch": 0.8127916351110351, "grad_norm": 0.1345985233783722, "learning_rate": 3.153241006726757e-05, "loss": 0.076, "num_input_tokens_seen": 21576736, "step": 16635 }, { "epoch": 0.8130359367747294, "grad_norm": 0.6716510653495789, "learning_rate": 3.152293315954825e-05, "loss": 0.0984, "num_input_tokens_seen": 21583424, "step": 16640 }, { "epoch": 0.8132802384384238, "grad_norm": 0.553682267665863, "learning_rate": 3.1513455245910666e-05, "loss": 0.0637, "num_input_tokens_seen": 21590048, "step": 16645 }, { "epoch": 0.813524540102118, "grad_norm": 0.16470417380332947, "learning_rate": 3.150397632781643e-05, "loss": 0.0703, "num_input_tokens_seen": 21596288, "step": 16650 }, { "epoch": 0.8137688417658124, "grad_norm": 0.717474639415741, "learning_rate": 3.149449640672731e-05, "loss": 0.0862, "num_input_tokens_seen": 21603008, "step": 16655 }, { "epoch": 0.8140131434295067, "grad_norm": 0.1572810411453247, "learning_rate": 3.148501548410523e-05, "loss": 0.0913, "num_input_tokens_seen": 21609248, "step": 16660 }, { "epoch": 0.8142574450932011, "grad_norm": 0.2648058235645294, "learning_rate": 3.1475533561412256e-05, "loss": 0.1405, "num_input_tokens_seen": 21615424, "step": 16665 }, { "epoch": 0.8145017467568955, "grad_norm": 0.4460708498954773, "learning_rate": 3.146605064011065e-05, "loss": 0.0965, "num_input_tokens_seen": 21621760, "step": 16670 }, { "epoch": 0.8147460484205897, "grad_norm": 0.3181137442588806, "learning_rate": 3.145656672166277e-05, "loss": 0.1003, "num_input_tokens_seen": 21627872, "step": 16675 }, { "epoch": 0.8149903500842841, "grad_norm": 0.1597660779953003, "learning_rate": 3.144708180753116e-05, "loss": 0.0764, "num_input_tokens_seen": 21634560, "step": 16680 }, { "epoch": 0.8152346517479784, "grad_norm": 0.9131814241409302, "learning_rate": 3.143759589917851e-05, "loss": 0.1024, "num_input_tokens_seen": 21640960, "step": 16685 }, { "epoch": 0.8154789534116728, "grad_norm": 0.5121044516563416, "learning_rate": 3.142810899806768e-05, "loss": 0.1156, "num_input_tokens_seen": 21646976, "step": 16690 }, { "epoch": 0.815723255075367, "grad_norm": 0.4616883397102356, "learning_rate": 3.141862110566166e-05, "loss": 0.0795, "num_input_tokens_seen": 21653440, "step": 16695 }, { "epoch": 0.8159675567390614, "grad_norm": 0.15890032052993774, "learning_rate": 3.1409132223423606e-05, "loss": 0.0737, "num_input_tokens_seen": 21659776, "step": 16700 }, { "epoch": 0.8162118584027557, "grad_norm": 0.11279349774122238, "learning_rate": 3.139964235281682e-05, "loss": 0.0696, "num_input_tokens_seen": 21666272, "step": 16705 }, { "epoch": 0.81645616006645, "grad_norm": 0.5619996190071106, "learning_rate": 3.139015149530476e-05, "loss": 0.0697, "num_input_tokens_seen": 21672640, "step": 16710 }, { "epoch": 0.8167004617301444, "grad_norm": 0.38340941071510315, "learning_rate": 3.1380659652351034e-05, "loss": 0.1093, "num_input_tokens_seen": 21678848, "step": 16715 }, { "epoch": 0.8169447633938387, "grad_norm": 0.47160834074020386, "learning_rate": 3.137116682541941e-05, "loss": 0.1026, "num_input_tokens_seen": 21685152, "step": 16720 }, { "epoch": 0.8171890650575331, "grad_norm": 0.14006511867046356, "learning_rate": 3.136167301597379e-05, "loss": 0.068, "num_input_tokens_seen": 21692064, "step": 16725 }, { "epoch": 0.8174333667212274, "grad_norm": 0.3258401155471802, "learning_rate": 3.1352178225478254e-05, "loss": 0.102, "num_input_tokens_seen": 21698432, "step": 16730 }, { "epoch": 0.8176776683849217, "grad_norm": 0.40133702754974365, "learning_rate": 3.1342682455396996e-05, "loss": 0.0812, "num_input_tokens_seen": 21705088, "step": 16735 }, { "epoch": 0.817921970048616, "grad_norm": 0.18908390402793884, "learning_rate": 3.133318570719441e-05, "loss": 0.0978, "num_input_tokens_seen": 21711712, "step": 16740 }, { "epoch": 0.8181662717123104, "grad_norm": 0.1481991857290268, "learning_rate": 3.132368798233499e-05, "loss": 0.088, "num_input_tokens_seen": 21718432, "step": 16745 }, { "epoch": 0.8184105733760046, "grad_norm": 0.4062485098838806, "learning_rate": 3.131418928228342e-05, "loss": 0.1001, "num_input_tokens_seen": 21724800, "step": 16750 }, { "epoch": 0.818654875039699, "grad_norm": 0.15153011679649353, "learning_rate": 3.1304689608504514e-05, "loss": 0.0995, "num_input_tokens_seen": 21731264, "step": 16755 }, { "epoch": 0.8188991767033934, "grad_norm": 0.28922146558761597, "learning_rate": 3.129518896246324e-05, "loss": 0.0979, "num_input_tokens_seen": 21737568, "step": 16760 }, { "epoch": 0.8191434783670877, "grad_norm": 0.2200266271829605, "learning_rate": 3.128568734562472e-05, "loss": 0.0795, "num_input_tokens_seen": 21743936, "step": 16765 }, { "epoch": 0.819387780030782, "grad_norm": 0.3555949628353119, "learning_rate": 3.127618475945421e-05, "loss": 0.0751, "num_input_tokens_seen": 21751040, "step": 16770 }, { "epoch": 0.8196320816944763, "grad_norm": 0.4327670931816101, "learning_rate": 3.126668120541715e-05, "loss": 0.1043, "num_input_tokens_seen": 21757312, "step": 16775 }, { "epoch": 0.8198763833581707, "grad_norm": 0.24282532930374146, "learning_rate": 3.1257176684979096e-05, "loss": 0.1017, "num_input_tokens_seen": 21763424, "step": 16780 }, { "epoch": 0.820120685021865, "grad_norm": 0.3799704909324646, "learning_rate": 3.124767119960576e-05, "loss": 0.0935, "num_input_tokens_seen": 21769504, "step": 16785 }, { "epoch": 0.8203649866855593, "grad_norm": 0.2929203510284424, "learning_rate": 3.123816475076301e-05, "loss": 0.0748, "num_input_tokens_seen": 21775936, "step": 16790 }, { "epoch": 0.8206092883492536, "grad_norm": 0.5101704597473145, "learning_rate": 3.122865733991687e-05, "loss": 0.076, "num_input_tokens_seen": 21782560, "step": 16795 }, { "epoch": 0.820853590012948, "grad_norm": 0.3014085292816162, "learning_rate": 3.1219148968533486e-05, "loss": 0.0908, "num_input_tokens_seen": 21788384, "step": 16800 }, { "epoch": 0.820853590012948, "eval_loss": 0.08982601016759872, "eval_runtime": 374.9925, "eval_samples_per_second": 97.029, "eval_steps_per_second": 24.259, "num_input_tokens_seen": 21788384, "step": 16800 }, { "epoch": 0.8210978916766423, "grad_norm": 0.5648805499076843, "learning_rate": 3.120963963807918e-05, "loss": 0.0919, "num_input_tokens_seen": 21794944, "step": 16805 }, { "epoch": 0.8213421933403366, "grad_norm": 0.24981187283992767, "learning_rate": 3.12001293500204e-05, "loss": 0.0605, "num_input_tokens_seen": 21801664, "step": 16810 }, { "epoch": 0.821586495004031, "grad_norm": 0.49200719594955444, "learning_rate": 3.1190618105823765e-05, "loss": 0.0676, "num_input_tokens_seen": 21807872, "step": 16815 }, { "epoch": 0.8218307966677253, "grad_norm": 0.2131631076335907, "learning_rate": 3.118110590695603e-05, "loss": 0.0925, "num_input_tokens_seen": 21814432, "step": 16820 }, { "epoch": 0.8220750983314197, "grad_norm": 0.6383135318756104, "learning_rate": 3.117159275488407e-05, "loss": 0.1154, "num_input_tokens_seen": 21820896, "step": 16825 }, { "epoch": 0.8223193999951139, "grad_norm": 0.2709878087043762, "learning_rate": 3.1162078651074956e-05, "loss": 0.0787, "num_input_tokens_seen": 21827168, "step": 16830 }, { "epoch": 0.8225637016588083, "grad_norm": 0.2695354223251343, "learning_rate": 3.1152563596995885e-05, "loss": 0.0875, "num_input_tokens_seen": 21833728, "step": 16835 }, { "epoch": 0.8228080033225026, "grad_norm": 0.26347512006759644, "learning_rate": 3.1143047594114186e-05, "loss": 0.114, "num_input_tokens_seen": 21839968, "step": 16840 }, { "epoch": 0.823052304986197, "grad_norm": 0.3256318271160126, "learning_rate": 3.113353064389734e-05, "loss": 0.0601, "num_input_tokens_seen": 21846432, "step": 16845 }, { "epoch": 0.8232966066498912, "grad_norm": 0.33883845806121826, "learning_rate": 3.1124012747812993e-05, "loss": 0.0832, "num_input_tokens_seen": 21852608, "step": 16850 }, { "epoch": 0.8235409083135856, "grad_norm": 0.2941320836544037, "learning_rate": 3.1114493907328936e-05, "loss": 0.1133, "num_input_tokens_seen": 21858912, "step": 16855 }, { "epoch": 0.82378520997728, "grad_norm": 0.5745192170143127, "learning_rate": 3.110497412391306e-05, "loss": 0.076, "num_input_tokens_seen": 21865440, "step": 16860 }, { "epoch": 0.8240295116409743, "grad_norm": 0.21800673007965088, "learning_rate": 3.1095453399033466e-05, "loss": 0.0858, "num_input_tokens_seen": 21871776, "step": 16865 }, { "epoch": 0.8242738133046686, "grad_norm": 0.546445369720459, "learning_rate": 3.108593173415835e-05, "loss": 0.0856, "num_input_tokens_seen": 21877856, "step": 16870 }, { "epoch": 0.8245181149683629, "grad_norm": 0.2388652116060257, "learning_rate": 3.107640913075609e-05, "loss": 0.0821, "num_input_tokens_seen": 21884192, "step": 16875 }, { "epoch": 0.8247624166320573, "grad_norm": 0.26216214895248413, "learning_rate": 3.106688559029517e-05, "loss": 0.1175, "num_input_tokens_seen": 21890944, "step": 16880 }, { "epoch": 0.8250067182957516, "grad_norm": 0.8030115365982056, "learning_rate": 3.105736111424425e-05, "loss": 0.1272, "num_input_tokens_seen": 21897536, "step": 16885 }, { "epoch": 0.8252510199594459, "grad_norm": 0.136929452419281, "learning_rate": 3.1047835704072136e-05, "loss": 0.0827, "num_input_tokens_seen": 21904192, "step": 16890 }, { "epoch": 0.8254953216231402, "grad_norm": 0.3317766487598419, "learning_rate": 3.103830936124775e-05, "loss": 0.123, "num_input_tokens_seen": 21910560, "step": 16895 }, { "epoch": 0.8257396232868346, "grad_norm": 0.45496827363967896, "learning_rate": 3.102878208724018e-05, "loss": 0.1033, "num_input_tokens_seen": 21916928, "step": 16900 }, { "epoch": 0.8259839249505289, "grad_norm": 0.13777026534080505, "learning_rate": 3.101925388351865e-05, "loss": 0.0844, "num_input_tokens_seen": 21923264, "step": 16905 }, { "epoch": 0.8262282266142232, "grad_norm": 0.32592254877090454, "learning_rate": 3.1009724751552515e-05, "loss": 0.0791, "num_input_tokens_seen": 21929888, "step": 16910 }, { "epoch": 0.8264725282779176, "grad_norm": 0.43317294120788574, "learning_rate": 3.100019469281131e-05, "loss": 0.0778, "num_input_tokens_seen": 21936576, "step": 16915 }, { "epoch": 0.8267168299416119, "grad_norm": 0.8109589219093323, "learning_rate": 3.0990663708764685e-05, "loss": 0.0946, "num_input_tokens_seen": 21942848, "step": 16920 }, { "epoch": 0.8269611316053063, "grad_norm": 0.24220432341098785, "learning_rate": 3.098113180088243e-05, "loss": 0.0726, "num_input_tokens_seen": 21949184, "step": 16925 }, { "epoch": 0.8272054332690005, "grad_norm": 0.32664328813552856, "learning_rate": 3.097159897063448e-05, "loss": 0.0773, "num_input_tokens_seen": 21956096, "step": 16930 }, { "epoch": 0.8274497349326949, "grad_norm": 0.3490127623081207, "learning_rate": 3.096206521949094e-05, "loss": 0.1264, "num_input_tokens_seen": 21962752, "step": 16935 }, { "epoch": 0.8276940365963892, "grad_norm": 0.4787880778312683, "learning_rate": 3.0952530548922006e-05, "loss": 0.1068, "num_input_tokens_seen": 21968832, "step": 16940 }, { "epoch": 0.8279383382600836, "grad_norm": 0.7480826377868652, "learning_rate": 3.0942994960398064e-05, "loss": 0.0704, "num_input_tokens_seen": 21974944, "step": 16945 }, { "epoch": 0.8281826399237778, "grad_norm": 0.2332380712032318, "learning_rate": 3.093345845538961e-05, "loss": 0.0915, "num_input_tokens_seen": 21981568, "step": 16950 }, { "epoch": 0.8284269415874722, "grad_norm": 0.14618125557899475, "learning_rate": 3.09239210353673e-05, "loss": 0.0578, "num_input_tokens_seen": 21988640, "step": 16955 }, { "epoch": 0.8286712432511666, "grad_norm": 0.17480309307575226, "learning_rate": 3.0914382701801926e-05, "loss": 0.0769, "num_input_tokens_seen": 21995264, "step": 16960 }, { "epoch": 0.8289155449148609, "grad_norm": 0.6042906045913696, "learning_rate": 3.090484345616441e-05, "loss": 0.086, "num_input_tokens_seen": 22001248, "step": 16965 }, { "epoch": 0.8291598465785552, "grad_norm": 0.15710695087909698, "learning_rate": 3.0895303299925825e-05, "loss": 0.098, "num_input_tokens_seen": 22007328, "step": 16970 }, { "epoch": 0.8294041482422495, "grad_norm": 0.25493836402893066, "learning_rate": 3.0885762234557393e-05, "loss": 0.1045, "num_input_tokens_seen": 22013760, "step": 16975 }, { "epoch": 0.8296484499059439, "grad_norm": 0.14305268228054047, "learning_rate": 3.087622026153045e-05, "loss": 0.0742, "num_input_tokens_seen": 22020256, "step": 16980 }, { "epoch": 0.8298927515696382, "grad_norm": 0.3098505139350891, "learning_rate": 3.086667738231651e-05, "loss": 0.0794, "num_input_tokens_seen": 22026624, "step": 16985 }, { "epoch": 0.8301370532333325, "grad_norm": 0.22186432778835297, "learning_rate": 3.085713359838718e-05, "loss": 0.1164, "num_input_tokens_seen": 22032800, "step": 16990 }, { "epoch": 0.8303813548970268, "grad_norm": 0.3860965371131897, "learning_rate": 3.084758891121425e-05, "loss": 0.1079, "num_input_tokens_seen": 22039264, "step": 16995 }, { "epoch": 0.8306256565607212, "grad_norm": 0.4619118869304657, "learning_rate": 3.083804332226963e-05, "loss": 0.0853, "num_input_tokens_seen": 22045600, "step": 17000 }, { "epoch": 0.8306256565607212, "eval_loss": 0.0897131860256195, "eval_runtime": 374.0322, "eval_samples_per_second": 97.278, "eval_steps_per_second": 24.321, "num_input_tokens_seen": 22045600, "step": 17000 }, { "epoch": 0.8308699582244156, "grad_norm": 0.6039373278617859, "learning_rate": 3.082849683302536e-05, "loss": 0.101, "num_input_tokens_seen": 22051904, "step": 17005 }, { "epoch": 0.8311142598881098, "grad_norm": 0.4700530767440796, "learning_rate": 3.081894944495363e-05, "loss": 0.0979, "num_input_tokens_seen": 22058528, "step": 17010 }, { "epoch": 0.8313585615518042, "grad_norm": 0.3616744875907898, "learning_rate": 3.080940115952677e-05, "loss": 0.0822, "num_input_tokens_seen": 22065408, "step": 17015 }, { "epoch": 0.8316028632154985, "grad_norm": 0.17675817012786865, "learning_rate": 3.0799851978217245e-05, "loss": 0.084, "num_input_tokens_seen": 22071680, "step": 17020 }, { "epoch": 0.8318471648791929, "grad_norm": 0.2230127602815628, "learning_rate": 3.0790301902497666e-05, "loss": 0.0858, "num_input_tokens_seen": 22078144, "step": 17025 }, { "epoch": 0.8320914665428871, "grad_norm": 0.22452829778194427, "learning_rate": 3.078075093384076e-05, "loss": 0.0843, "num_input_tokens_seen": 22084512, "step": 17030 }, { "epoch": 0.8323357682065815, "grad_norm": 0.3540613651275635, "learning_rate": 3.077119907371942e-05, "loss": 0.0872, "num_input_tokens_seen": 22090816, "step": 17035 }, { "epoch": 0.8325800698702758, "grad_norm": 0.17257794737815857, "learning_rate": 3.076164632360666e-05, "loss": 0.101, "num_input_tokens_seen": 22097248, "step": 17040 }, { "epoch": 0.8328243715339702, "grad_norm": 0.19133970141410828, "learning_rate": 3.075209268497563e-05, "loss": 0.089, "num_input_tokens_seen": 22103360, "step": 17045 }, { "epoch": 0.8330686731976644, "grad_norm": 0.34024062752723694, "learning_rate": 3.074253815929961e-05, "loss": 0.0805, "num_input_tokens_seen": 22109568, "step": 17050 }, { "epoch": 0.8333129748613588, "grad_norm": 0.22828665375709534, "learning_rate": 3.0732982748052054e-05, "loss": 0.0746, "num_input_tokens_seen": 22115968, "step": 17055 }, { "epoch": 0.8335572765250532, "grad_norm": 0.2205199897289276, "learning_rate": 3.072342645270651e-05, "loss": 0.0724, "num_input_tokens_seen": 22122240, "step": 17060 }, { "epoch": 0.8338015781887474, "grad_norm": 0.36554136872291565, "learning_rate": 3.071386927473668e-05, "loss": 0.0829, "num_input_tokens_seen": 22129280, "step": 17065 }, { "epoch": 0.8340458798524418, "grad_norm": 0.3644207715988159, "learning_rate": 3.0704311215616404e-05, "loss": 0.0978, "num_input_tokens_seen": 22135264, "step": 17070 }, { "epoch": 0.8342901815161361, "grad_norm": 0.38585177063941956, "learning_rate": 3.0694752276819656e-05, "loss": 0.0942, "num_input_tokens_seen": 22141824, "step": 17075 }, { "epoch": 0.8345344831798305, "grad_norm": 0.42264801263809204, "learning_rate": 3.068519245982054e-05, "loss": 0.0924, "num_input_tokens_seen": 22147840, "step": 17080 }, { "epoch": 0.8347787848435247, "grad_norm": 0.3079555928707123, "learning_rate": 3.0675631766093304e-05, "loss": 0.0719, "num_input_tokens_seen": 22154496, "step": 17085 }, { "epoch": 0.8350230865072191, "grad_norm": 0.17355524003505707, "learning_rate": 3.066607019711232e-05, "loss": 0.0772, "num_input_tokens_seen": 22161184, "step": 17090 }, { "epoch": 0.8352673881709134, "grad_norm": 0.1583855301141739, "learning_rate": 3.065650775435211e-05, "loss": 0.0808, "num_input_tokens_seen": 22167840, "step": 17095 }, { "epoch": 0.8355116898346078, "grad_norm": 0.211776465177536, "learning_rate": 3.0646944439287326e-05, "loss": 0.1116, "num_input_tokens_seen": 22173824, "step": 17100 }, { "epoch": 0.8357559914983022, "grad_norm": 0.45921382308006287, "learning_rate": 3.0637380253392736e-05, "loss": 0.0931, "num_input_tokens_seen": 22180576, "step": 17105 }, { "epoch": 0.8360002931619964, "grad_norm": 0.23808717727661133, "learning_rate": 3.062781519814327e-05, "loss": 0.0814, "num_input_tokens_seen": 22187424, "step": 17110 }, { "epoch": 0.8362445948256908, "grad_norm": 0.2967003285884857, "learning_rate": 3.0618249275013985e-05, "loss": 0.1111, "num_input_tokens_seen": 22193568, "step": 17115 }, { "epoch": 0.8364888964893851, "grad_norm": 0.22563020884990692, "learning_rate": 3.060868248548005e-05, "loss": 0.0878, "num_input_tokens_seen": 22199968, "step": 17120 }, { "epoch": 0.8367331981530794, "grad_norm": 0.24947775900363922, "learning_rate": 3.0599114831016796e-05, "loss": 0.0907, "num_input_tokens_seen": 22206496, "step": 17125 }, { "epoch": 0.8369774998167737, "grad_norm": 0.7324777841567993, "learning_rate": 3.0589546313099666e-05, "loss": 0.1164, "num_input_tokens_seen": 22212928, "step": 17130 }, { "epoch": 0.8372218014804681, "grad_norm": 0.5646961331367493, "learning_rate": 3.0579976933204255e-05, "loss": 0.0863, "num_input_tokens_seen": 22220032, "step": 17135 }, { "epoch": 0.8374661031441624, "grad_norm": 0.29201459884643555, "learning_rate": 3.0570406692806284e-05, "loss": 0.0886, "num_input_tokens_seen": 22226016, "step": 17140 }, { "epoch": 0.8377104048078567, "grad_norm": 0.15977485477924347, "learning_rate": 3.05608355933816e-05, "loss": 0.0872, "num_input_tokens_seen": 22232224, "step": 17145 }, { "epoch": 0.8379547064715511, "grad_norm": 0.4455246925354004, "learning_rate": 3.055126363640618e-05, "loss": 0.0795, "num_input_tokens_seen": 22238464, "step": 17150 }, { "epoch": 0.8381990081352454, "grad_norm": 0.39198818802833557, "learning_rate": 3.0541690823356146e-05, "loss": 0.0854, "num_input_tokens_seen": 22244512, "step": 17155 }, { "epoch": 0.8384433097989398, "grad_norm": 1.577189564704895, "learning_rate": 3.053211715570775e-05, "loss": 0.0938, "num_input_tokens_seen": 22251328, "step": 17160 }, { "epoch": 0.838687611462634, "grad_norm": 0.32406485080718994, "learning_rate": 3.052254263493736e-05, "loss": 0.083, "num_input_tokens_seen": 22257888, "step": 17165 }, { "epoch": 0.8389319131263284, "grad_norm": 0.3261761963367462, "learning_rate": 3.0512967262521498e-05, "loss": 0.0871, "num_input_tokens_seen": 22264544, "step": 17170 }, { "epoch": 0.8391762147900227, "grad_norm": 0.41712501645088196, "learning_rate": 3.0503391039936803e-05, "loss": 0.1124, "num_input_tokens_seen": 22271328, "step": 17175 }, { "epoch": 0.8394205164537171, "grad_norm": 0.2723931670188904, "learning_rate": 3.0493813968660056e-05, "loss": 0.0921, "num_input_tokens_seen": 22278016, "step": 17180 }, { "epoch": 0.8396648181174113, "grad_norm": 0.46017348766326904, "learning_rate": 3.0484236050168153e-05, "loss": 0.1183, "num_input_tokens_seen": 22284544, "step": 17185 }, { "epoch": 0.8399091197811057, "grad_norm": 0.2186369150876999, "learning_rate": 3.0474657285938123e-05, "loss": 0.085, "num_input_tokens_seen": 22291072, "step": 17190 }, { "epoch": 0.8401534214448, "grad_norm": 0.2149355262517929, "learning_rate": 3.046507767744715e-05, "loss": 0.0991, "num_input_tokens_seen": 22297216, "step": 17195 }, { "epoch": 0.8403977231084944, "grad_norm": 0.15166203677654266, "learning_rate": 3.045549722617252e-05, "loss": 0.0804, "num_input_tokens_seen": 22303808, "step": 17200 }, { "epoch": 0.8403977231084944, "eval_loss": 0.09070292860269547, "eval_runtime": 374.4058, "eval_samples_per_second": 97.181, "eval_steps_per_second": 24.297, "num_input_tokens_seen": 22303808, "step": 17200 }, { "epoch": 0.8406420247721887, "grad_norm": 0.18938742578029633, "learning_rate": 3.0445915933591658e-05, "loss": 0.0968, "num_input_tokens_seen": 22310272, "step": 17205 }, { "epoch": 0.840886326435883, "grad_norm": 0.22331397235393524, "learning_rate": 3.0436333801182114e-05, "loss": 0.088, "num_input_tokens_seen": 22316640, "step": 17210 }, { "epoch": 0.8411306280995774, "grad_norm": 0.17740795016288757, "learning_rate": 3.0426750830421596e-05, "loss": 0.0998, "num_input_tokens_seen": 22323872, "step": 17215 }, { "epoch": 0.8413749297632717, "grad_norm": 0.22116824984550476, "learning_rate": 3.0417167022787897e-05, "loss": 0.0864, "num_input_tokens_seen": 22330080, "step": 17220 }, { "epoch": 0.841619231426966, "grad_norm": 0.14979560673236847, "learning_rate": 3.0407582379758966e-05, "loss": 0.0893, "num_input_tokens_seen": 22336320, "step": 17225 }, { "epoch": 0.8418635330906603, "grad_norm": 0.1847561150789261, "learning_rate": 3.039799690281287e-05, "loss": 0.071, "num_input_tokens_seen": 22342688, "step": 17230 }, { "epoch": 0.8421078347543547, "grad_norm": 0.494182825088501, "learning_rate": 3.0388410593427823e-05, "loss": 0.1289, "num_input_tokens_seen": 22348992, "step": 17235 }, { "epoch": 0.842352136418049, "grad_norm": 0.29772305488586426, "learning_rate": 3.0378823453082146e-05, "loss": 0.0908, "num_input_tokens_seen": 22355552, "step": 17240 }, { "epoch": 0.8425964380817433, "grad_norm": 0.1828470230102539, "learning_rate": 3.03692354832543e-05, "loss": 0.0926, "num_input_tokens_seen": 22362528, "step": 17245 }, { "epoch": 0.8428407397454377, "grad_norm": 0.3277040123939514, "learning_rate": 3.0359646685422865e-05, "loss": 0.1175, "num_input_tokens_seen": 22369280, "step": 17250 }, { "epoch": 0.843085041409132, "grad_norm": 0.25194355845451355, "learning_rate": 3.035005706106656e-05, "loss": 0.076, "num_input_tokens_seen": 22376096, "step": 17255 }, { "epoch": 0.8433293430728264, "grad_norm": 0.5997493267059326, "learning_rate": 3.034046661166422e-05, "loss": 0.0994, "num_input_tokens_seen": 22382528, "step": 17260 }, { "epoch": 0.8435736447365206, "grad_norm": 0.7983303666114807, "learning_rate": 3.033087533869482e-05, "loss": 0.0909, "num_input_tokens_seen": 22388768, "step": 17265 }, { "epoch": 0.843817946400215, "grad_norm": 0.34005269408226013, "learning_rate": 3.0321283243637444e-05, "loss": 0.0814, "num_input_tokens_seen": 22395328, "step": 17270 }, { "epoch": 0.8440622480639093, "grad_norm": 0.21846984326839447, "learning_rate": 3.0311690327971326e-05, "loss": 0.0596, "num_input_tokens_seen": 22401792, "step": 17275 }, { "epoch": 0.8443065497276037, "grad_norm": 0.17984825372695923, "learning_rate": 3.030209659317581e-05, "loss": 0.0869, "num_input_tokens_seen": 22408832, "step": 17280 }, { "epoch": 0.8445508513912979, "grad_norm": 0.1281953603029251, "learning_rate": 3.0292502040730362e-05, "loss": 0.1138, "num_input_tokens_seen": 22415264, "step": 17285 }, { "epoch": 0.8447951530549923, "grad_norm": 0.5242812633514404, "learning_rate": 3.0282906672114597e-05, "loss": 0.0841, "num_input_tokens_seen": 22421600, "step": 17290 }, { "epoch": 0.8450394547186867, "grad_norm": 0.25346943736076355, "learning_rate": 3.027331048880823e-05, "loss": 0.1126, "num_input_tokens_seen": 22428096, "step": 17295 }, { "epoch": 0.845283756382381, "grad_norm": 0.3013734221458435, "learning_rate": 3.0263713492291123e-05, "loss": 0.1022, "num_input_tokens_seen": 22434560, "step": 17300 }, { "epoch": 0.8455280580460753, "grad_norm": 0.22334952652454376, "learning_rate": 3.0254115684043242e-05, "loss": 0.11, "num_input_tokens_seen": 22440576, "step": 17305 }, { "epoch": 0.8457723597097696, "grad_norm": 0.16783444583415985, "learning_rate": 3.024451706554469e-05, "loss": 0.0835, "num_input_tokens_seen": 22447104, "step": 17310 }, { "epoch": 0.846016661373464, "grad_norm": 0.5160782337188721, "learning_rate": 3.0234917638275705e-05, "loss": 0.0825, "num_input_tokens_seen": 22453440, "step": 17315 }, { "epoch": 0.8462609630371583, "grad_norm": 0.2154405266046524, "learning_rate": 3.0225317403716635e-05, "loss": 0.0764, "num_input_tokens_seen": 22459584, "step": 17320 }, { "epoch": 0.8465052647008526, "grad_norm": 0.3381101191043854, "learning_rate": 3.0215716363347956e-05, "loss": 0.0882, "num_input_tokens_seen": 22466112, "step": 17325 }, { "epoch": 0.8467495663645469, "grad_norm": 0.15667957067489624, "learning_rate": 3.0206114518650275e-05, "loss": 0.1001, "num_input_tokens_seen": 22472768, "step": 17330 }, { "epoch": 0.8469938680282413, "grad_norm": 0.8335277438163757, "learning_rate": 3.0196511871104304e-05, "loss": 0.1026, "num_input_tokens_seen": 22479232, "step": 17335 }, { "epoch": 0.8472381696919355, "grad_norm": 0.523216962814331, "learning_rate": 3.01869084221909e-05, "loss": 0.0889, "num_input_tokens_seen": 22485888, "step": 17340 }, { "epoch": 0.8474824713556299, "grad_norm": 0.37835678458213806, "learning_rate": 3.0177304173391037e-05, "loss": 0.0924, "num_input_tokens_seen": 22491904, "step": 17345 }, { "epoch": 0.8477267730193243, "grad_norm": 0.38496455550193787, "learning_rate": 3.01676991261858e-05, "loss": 0.0966, "num_input_tokens_seen": 22498016, "step": 17350 }, { "epoch": 0.8479710746830186, "grad_norm": 0.4230721592903137, "learning_rate": 3.015809328205642e-05, "loss": 0.0919, "num_input_tokens_seen": 22504192, "step": 17355 }, { "epoch": 0.848215376346713, "grad_norm": 0.3048405051231384, "learning_rate": 3.0148486642484248e-05, "loss": 0.0818, "num_input_tokens_seen": 22510496, "step": 17360 }, { "epoch": 0.8484596780104072, "grad_norm": 0.5615572333335876, "learning_rate": 3.0138879208950722e-05, "loss": 0.0726, "num_input_tokens_seen": 22516800, "step": 17365 }, { "epoch": 0.8487039796741016, "grad_norm": 0.3906906545162201, "learning_rate": 3.012927098293744e-05, "loss": 0.0787, "num_input_tokens_seen": 22523168, "step": 17370 }, { "epoch": 0.8489482813377959, "grad_norm": 0.13837017118930817, "learning_rate": 3.0119661965926123e-05, "loss": 0.096, "num_input_tokens_seen": 22529856, "step": 17375 }, { "epoch": 0.8491925830014903, "grad_norm": 0.4297938048839569, "learning_rate": 3.0110052159398587e-05, "loss": 0.1013, "num_input_tokens_seen": 22536192, "step": 17380 }, { "epoch": 0.8494368846651845, "grad_norm": 0.5027980804443359, "learning_rate": 3.0100441564836802e-05, "loss": 0.0773, "num_input_tokens_seen": 22542848, "step": 17385 }, { "epoch": 0.8496811863288789, "grad_norm": 0.14513884484767914, "learning_rate": 3.0090830183722817e-05, "loss": 0.1079, "num_input_tokens_seen": 22549312, "step": 17390 }, { "epoch": 0.8499254879925733, "grad_norm": 0.1429821401834488, "learning_rate": 3.0081218017538852e-05, "loss": 0.0913, "num_input_tokens_seen": 22556032, "step": 17395 }, { "epoch": 0.8501697896562675, "grad_norm": 0.3200609087944031, "learning_rate": 3.0071605067767212e-05, "loss": 0.066, "num_input_tokens_seen": 22562496, "step": 17400 }, { "epoch": 0.8501697896562675, "eval_loss": 0.09036116302013397, "eval_runtime": 374.071, "eval_samples_per_second": 97.268, "eval_steps_per_second": 24.319, "num_input_tokens_seen": 22562496, "step": 17400 }, { "epoch": 0.8504140913199619, "grad_norm": 0.15780438482761383, "learning_rate": 3.006199133589034e-05, "loss": 0.0885, "num_input_tokens_seen": 22568864, "step": 17405 }, { "epoch": 0.8506583929836562, "grad_norm": 0.15189464390277863, "learning_rate": 3.005237682339079e-05, "loss": 0.0769, "num_input_tokens_seen": 22575328, "step": 17410 }, { "epoch": 0.8509026946473506, "grad_norm": 0.37595874071121216, "learning_rate": 3.0042761531751228e-05, "loss": 0.0868, "num_input_tokens_seen": 22582048, "step": 17415 }, { "epoch": 0.8511469963110448, "grad_norm": 0.2656625509262085, "learning_rate": 3.0033145462454482e-05, "loss": 0.0945, "num_input_tokens_seen": 22588256, "step": 17420 }, { "epoch": 0.8513912979747392, "grad_norm": 0.6617624759674072, "learning_rate": 3.002352861698345e-05, "loss": 0.0902, "num_input_tokens_seen": 22594880, "step": 17425 }, { "epoch": 0.8516355996384335, "grad_norm": 0.355001300573349, "learning_rate": 3.0013910996821178e-05, "loss": 0.0642, "num_input_tokens_seen": 22601376, "step": 17430 }, { "epoch": 0.8518799013021279, "grad_norm": 0.18622341752052307, "learning_rate": 3.0004292603450817e-05, "loss": 0.0678, "num_input_tokens_seen": 22608032, "step": 17435 }, { "epoch": 0.8521242029658221, "grad_norm": 0.14756129682064056, "learning_rate": 2.9994673438355653e-05, "loss": 0.0856, "num_input_tokens_seen": 22614752, "step": 17440 }, { "epoch": 0.8523685046295165, "grad_norm": 0.20952297747135162, "learning_rate": 2.9985053503019078e-05, "loss": 0.0685, "num_input_tokens_seen": 22621056, "step": 17445 }, { "epoch": 0.8526128062932109, "grad_norm": 0.3203575611114502, "learning_rate": 2.99754327989246e-05, "loss": 0.1212, "num_input_tokens_seen": 22627584, "step": 17450 }, { "epoch": 0.8528571079569052, "grad_norm": 0.21986360847949982, "learning_rate": 2.9965811327555864e-05, "loss": 0.0725, "num_input_tokens_seen": 22633888, "step": 17455 }, { "epoch": 0.8531014096205995, "grad_norm": 0.3206116855144501, "learning_rate": 2.995618909039662e-05, "loss": 0.0668, "num_input_tokens_seen": 22640480, "step": 17460 }, { "epoch": 0.8533457112842938, "grad_norm": 0.26477110385894775, "learning_rate": 2.9946566088930727e-05, "loss": 0.0798, "num_input_tokens_seen": 22647200, "step": 17465 }, { "epoch": 0.8535900129479882, "grad_norm": 0.20388227701187134, "learning_rate": 2.9936942324642192e-05, "loss": 0.0519, "num_input_tokens_seen": 22653280, "step": 17470 }, { "epoch": 0.8538343146116825, "grad_norm": 0.4166276454925537, "learning_rate": 2.9927317799015097e-05, "loss": 0.1119, "num_input_tokens_seen": 22659552, "step": 17475 }, { "epoch": 0.8540786162753768, "grad_norm": 0.26837900280952454, "learning_rate": 2.9917692513533685e-05, "loss": 0.0865, "num_input_tokens_seen": 22666688, "step": 17480 }, { "epoch": 0.8543229179390711, "grad_norm": 0.2045954465866089, "learning_rate": 2.990806646968229e-05, "loss": 0.1061, "num_input_tokens_seen": 22672960, "step": 17485 }, { "epoch": 0.8545672196027655, "grad_norm": 0.15812604129314423, "learning_rate": 2.989843966894536e-05, "loss": 0.0975, "num_input_tokens_seen": 22679264, "step": 17490 }, { "epoch": 0.8548115212664599, "grad_norm": 0.3998697102069855, "learning_rate": 2.9888812112807472e-05, "loss": 0.1049, "num_input_tokens_seen": 22686528, "step": 17495 }, { "epoch": 0.8550558229301541, "grad_norm": 0.1358167678117752, "learning_rate": 2.987918380275333e-05, "loss": 0.0716, "num_input_tokens_seen": 22692640, "step": 17500 }, { "epoch": 0.8553001245938485, "grad_norm": 0.14821526408195496, "learning_rate": 2.9869554740267724e-05, "loss": 0.1029, "num_input_tokens_seen": 22699392, "step": 17505 }, { "epoch": 0.8555444262575428, "grad_norm": 0.1958092451095581, "learning_rate": 2.9859924926835585e-05, "loss": 0.1109, "num_input_tokens_seen": 22705824, "step": 17510 }, { "epoch": 0.8557887279212372, "grad_norm": 0.21446067094802856, "learning_rate": 2.9850294363941944e-05, "loss": 0.0794, "num_input_tokens_seen": 22712320, "step": 17515 }, { "epoch": 0.8560330295849314, "grad_norm": 0.46193188428878784, "learning_rate": 2.9840663053071967e-05, "loss": 0.0928, "num_input_tokens_seen": 22718720, "step": 17520 }, { "epoch": 0.8562773312486258, "grad_norm": 0.19208045303821564, "learning_rate": 2.983103099571091e-05, "loss": 0.0947, "num_input_tokens_seen": 22725024, "step": 17525 }, { "epoch": 0.8565216329123201, "grad_norm": 0.2915829122066498, "learning_rate": 2.9821398193344164e-05, "loss": 0.0896, "num_input_tokens_seen": 22731488, "step": 17530 }, { "epoch": 0.8567659345760145, "grad_norm": 0.26269224286079407, "learning_rate": 2.9811764647457226e-05, "loss": 0.08, "num_input_tokens_seen": 22737952, "step": 17535 }, { "epoch": 0.8570102362397088, "grad_norm": 0.16315214335918427, "learning_rate": 2.9802130359535714e-05, "loss": 0.0753, "num_input_tokens_seen": 22744256, "step": 17540 }, { "epoch": 0.8572545379034031, "grad_norm": 0.4325839877128601, "learning_rate": 2.979249533106535e-05, "loss": 0.0887, "num_input_tokens_seen": 22750688, "step": 17545 }, { "epoch": 0.8574988395670975, "grad_norm": 0.3008135259151459, "learning_rate": 2.9782859563531986e-05, "loss": 0.0923, "num_input_tokens_seen": 22756704, "step": 17550 }, { "epoch": 0.8577431412307918, "grad_norm": 0.1195087879896164, "learning_rate": 2.977322305842156e-05, "loss": 0.0959, "num_input_tokens_seen": 22762848, "step": 17555 }, { "epoch": 0.8579874428944861, "grad_norm": 0.12926027178764343, "learning_rate": 2.9763585817220162e-05, "loss": 0.0838, "num_input_tokens_seen": 22769216, "step": 17560 }, { "epoch": 0.8582317445581804, "grad_norm": 0.4750674366950989, "learning_rate": 2.975394784141397e-05, "loss": 0.0961, "num_input_tokens_seen": 22775648, "step": 17565 }, { "epoch": 0.8584760462218748, "grad_norm": 0.14611878991127014, "learning_rate": 2.974430913248928e-05, "loss": 0.1156, "num_input_tokens_seen": 22782016, "step": 17570 }, { "epoch": 0.8587203478855691, "grad_norm": 0.17196862399578094, "learning_rate": 2.9734669691932497e-05, "loss": 0.0843, "num_input_tokens_seen": 22788448, "step": 17575 }, { "epoch": 0.8589646495492634, "grad_norm": 0.38835838437080383, "learning_rate": 2.9725029521230147e-05, "loss": 0.1019, "num_input_tokens_seen": 22794816, "step": 17580 }, { "epoch": 0.8592089512129577, "grad_norm": 0.1705804169178009, "learning_rate": 2.9715388621868873e-05, "loss": 0.0888, "num_input_tokens_seen": 22801536, "step": 17585 }, { "epoch": 0.8594532528766521, "grad_norm": 0.22721366584300995, "learning_rate": 2.970574699533541e-05, "loss": 0.124, "num_input_tokens_seen": 22808224, "step": 17590 }, { "epoch": 0.8596975545403465, "grad_norm": 0.1758061945438385, "learning_rate": 2.969610464311662e-05, "loss": 0.088, "num_input_tokens_seen": 22814400, "step": 17595 }, { "epoch": 0.8599418562040407, "grad_norm": 0.8747115135192871, "learning_rate": 2.9686461566699487e-05, "loss": 0.0843, "num_input_tokens_seen": 22821376, "step": 17600 }, { "epoch": 0.8599418562040407, "eval_loss": 0.08986954391002655, "eval_runtime": 374.9967, "eval_samples_per_second": 97.028, "eval_steps_per_second": 24.259, "num_input_tokens_seen": 22821376, "step": 17600 }, { "epoch": 0.8601861578677351, "grad_norm": 0.2934255599975586, "learning_rate": 2.9676817767571086e-05, "loss": 0.0772, "num_input_tokens_seen": 22827840, "step": 17605 }, { "epoch": 0.8604304595314294, "grad_norm": 0.29167717695236206, "learning_rate": 2.966717324721861e-05, "loss": 0.0855, "num_input_tokens_seen": 22834144, "step": 17610 }, { "epoch": 0.8606747611951238, "grad_norm": 0.241676464676857, "learning_rate": 2.9657528007129366e-05, "loss": 0.1068, "num_input_tokens_seen": 22840864, "step": 17615 }, { "epoch": 0.860919062858818, "grad_norm": 0.11819951981306076, "learning_rate": 2.9647882048790777e-05, "loss": 0.0965, "num_input_tokens_seen": 22847360, "step": 17620 }, { "epoch": 0.8611633645225124, "grad_norm": 0.25793761014938354, "learning_rate": 2.963823537369037e-05, "loss": 0.0576, "num_input_tokens_seen": 22854048, "step": 17625 }, { "epoch": 0.8614076661862067, "grad_norm": 0.14896588027477264, "learning_rate": 2.9628587983315775e-05, "loss": 0.0827, "num_input_tokens_seen": 22860576, "step": 17630 }, { "epoch": 0.8616519678499011, "grad_norm": 1.51243257522583, "learning_rate": 2.9618939879154746e-05, "loss": 0.1134, "num_input_tokens_seen": 22867168, "step": 17635 }, { "epoch": 0.8618962695135954, "grad_norm": 0.23827920854091644, "learning_rate": 2.9609291062695143e-05, "loss": 0.0762, "num_input_tokens_seen": 22873312, "step": 17640 }, { "epoch": 0.8621405711772897, "grad_norm": 0.1963055580854416, "learning_rate": 2.9599641535424938e-05, "loss": 0.1185, "num_input_tokens_seen": 22879840, "step": 17645 }, { "epoch": 0.8623848728409841, "grad_norm": 0.34496915340423584, "learning_rate": 2.9589991298832202e-05, "loss": 0.0876, "num_input_tokens_seen": 22886528, "step": 17650 }, { "epoch": 0.8626291745046784, "grad_norm": 0.479964941740036, "learning_rate": 2.958034035440513e-05, "loss": 0.105, "num_input_tokens_seen": 22892704, "step": 17655 }, { "epoch": 0.8628734761683727, "grad_norm": 0.1268651783466339, "learning_rate": 2.957068870363201e-05, "loss": 0.067, "num_input_tokens_seen": 22899104, "step": 17660 }, { "epoch": 0.863117777832067, "grad_norm": 0.48359403014183044, "learning_rate": 2.956103634800126e-05, "loss": 0.1044, "num_input_tokens_seen": 22905728, "step": 17665 }, { "epoch": 0.8633620794957614, "grad_norm": 0.5506693720817566, "learning_rate": 2.9551383289001384e-05, "loss": 0.0804, "num_input_tokens_seen": 22912096, "step": 17670 }, { "epoch": 0.8636063811594556, "grad_norm": 1.5816967487335205, "learning_rate": 2.9541729528121005e-05, "loss": 0.1455, "num_input_tokens_seen": 22918336, "step": 17675 }, { "epoch": 0.86385068282315, "grad_norm": 0.22288860380649567, "learning_rate": 2.9532075066848856e-05, "loss": 0.099, "num_input_tokens_seen": 22925024, "step": 17680 }, { "epoch": 0.8640949844868444, "grad_norm": 0.08796606957912445, "learning_rate": 2.9522419906673786e-05, "loss": 0.057, "num_input_tokens_seen": 22931584, "step": 17685 }, { "epoch": 0.8643392861505387, "grad_norm": 0.2253311574459076, "learning_rate": 2.951276404908474e-05, "loss": 0.0764, "num_input_tokens_seen": 22938368, "step": 17690 }, { "epoch": 0.8645835878142331, "grad_norm": 0.12499465048313141, "learning_rate": 2.9503107495570752e-05, "loss": 0.0961, "num_input_tokens_seen": 22945088, "step": 17695 }, { "epoch": 0.8648278894779273, "grad_norm": 0.12226108461618423, "learning_rate": 2.9493450247621003e-05, "loss": 0.0799, "num_input_tokens_seen": 22951488, "step": 17700 }, { "epoch": 0.8650721911416217, "grad_norm": 0.3956070840358734, "learning_rate": 2.948379230672476e-05, "loss": 0.1072, "num_input_tokens_seen": 22957888, "step": 17705 }, { "epoch": 0.865316492805316, "grad_norm": 0.13805216550827026, "learning_rate": 2.9474133674371396e-05, "loss": 0.0894, "num_input_tokens_seen": 22964544, "step": 17710 }, { "epoch": 0.8655607944690104, "grad_norm": 0.34115394949913025, "learning_rate": 2.9464474352050387e-05, "loss": 0.0988, "num_input_tokens_seen": 22971328, "step": 17715 }, { "epoch": 0.8658050961327046, "grad_norm": 0.16830144822597504, "learning_rate": 2.9454814341251336e-05, "loss": 0.0848, "num_input_tokens_seen": 22977792, "step": 17720 }, { "epoch": 0.866049397796399, "grad_norm": 0.4663233458995819, "learning_rate": 2.9445153643463942e-05, "loss": 0.0667, "num_input_tokens_seen": 22984128, "step": 17725 }, { "epoch": 0.8662936994600933, "grad_norm": 0.47122862935066223, "learning_rate": 2.943549226017798e-05, "loss": 0.0954, "num_input_tokens_seen": 22990304, "step": 17730 }, { "epoch": 0.8665380011237876, "grad_norm": 0.35122501850128174, "learning_rate": 2.942583019288337e-05, "loss": 0.0915, "num_input_tokens_seen": 22996800, "step": 17735 }, { "epoch": 0.866782302787482, "grad_norm": 0.1804543286561966, "learning_rate": 2.9416167443070132e-05, "loss": 0.0676, "num_input_tokens_seen": 23003040, "step": 17740 }, { "epoch": 0.8670266044511763, "grad_norm": 0.19929249584674835, "learning_rate": 2.9406504012228375e-05, "loss": 0.0864, "num_input_tokens_seen": 23009440, "step": 17745 }, { "epoch": 0.8672709061148707, "grad_norm": 0.3161903917789459, "learning_rate": 2.939683990184832e-05, "loss": 0.0861, "num_input_tokens_seen": 23015616, "step": 17750 }, { "epoch": 0.867515207778565, "grad_norm": 0.26081591844558716, "learning_rate": 2.93871751134203e-05, "loss": 0.0966, "num_input_tokens_seen": 23021824, "step": 17755 }, { "epoch": 0.8677595094422593, "grad_norm": 0.29863664507865906, "learning_rate": 2.9377509648434752e-05, "loss": 0.0763, "num_input_tokens_seen": 23028448, "step": 17760 }, { "epoch": 0.8680038111059536, "grad_norm": 0.8593093752861023, "learning_rate": 2.9367843508382203e-05, "loss": 0.0962, "num_input_tokens_seen": 23035168, "step": 17765 }, { "epoch": 0.868248112769648, "grad_norm": 0.33109042048454285, "learning_rate": 2.9358176694753293e-05, "loss": 0.0811, "num_input_tokens_seen": 23041184, "step": 17770 }, { "epoch": 0.8684924144333422, "grad_norm": 0.5191903114318848, "learning_rate": 2.9348509209038766e-05, "loss": 0.0644, "num_input_tokens_seen": 23047968, "step": 17775 }, { "epoch": 0.8687367160970366, "grad_norm": 0.17059986293315887, "learning_rate": 2.933884105272947e-05, "loss": 0.1064, "num_input_tokens_seen": 23054688, "step": 17780 }, { "epoch": 0.868981017760731, "grad_norm": 0.33254024386405945, "learning_rate": 2.9329172227316366e-05, "loss": 0.0739, "num_input_tokens_seen": 23060960, "step": 17785 }, { "epoch": 0.8692253194244253, "grad_norm": 0.29885873198509216, "learning_rate": 2.93195027342905e-05, "loss": 0.095, "num_input_tokens_seen": 23067648, "step": 17790 }, { "epoch": 0.8694696210881196, "grad_norm": 0.55640709400177, "learning_rate": 2.9309832575143024e-05, "loss": 0.0964, "num_input_tokens_seen": 23074144, "step": 17795 }, { "epoch": 0.8697139227518139, "grad_norm": 0.14116008579730988, "learning_rate": 2.930016175136521e-05, "loss": 0.1056, "num_input_tokens_seen": 23080448, "step": 17800 }, { "epoch": 0.8697139227518139, "eval_loss": 0.09007331728935242, "eval_runtime": 374.4807, "eval_samples_per_second": 97.161, "eval_steps_per_second": 24.292, "num_input_tokens_seen": 23080448, "step": 17800 }, { "epoch": 0.8699582244155083, "grad_norm": 0.41600143909454346, "learning_rate": 2.9290490264448412e-05, "loss": 0.1038, "num_input_tokens_seen": 23086944, "step": 17805 }, { "epoch": 0.8702025260792026, "grad_norm": 0.41281574964523315, "learning_rate": 2.9280818115884094e-05, "loss": 0.095, "num_input_tokens_seen": 23093280, "step": 17810 }, { "epoch": 0.870446827742897, "grad_norm": 0.13422782719135284, "learning_rate": 2.9271145307163828e-05, "loss": 0.0816, "num_input_tokens_seen": 23099680, "step": 17815 }, { "epoch": 0.8706911294065912, "grad_norm": 0.19066046178340912, "learning_rate": 2.9261471839779287e-05, "loss": 0.1173, "num_input_tokens_seen": 23106080, "step": 17820 }, { "epoch": 0.8709354310702856, "grad_norm": 0.5603346228599548, "learning_rate": 2.925179771522223e-05, "loss": 0.0754, "num_input_tokens_seen": 23112576, "step": 17825 }, { "epoch": 0.87117973273398, "grad_norm": 0.17984382808208466, "learning_rate": 2.9242122934984535e-05, "loss": 0.0689, "num_input_tokens_seen": 23118784, "step": 17830 }, { "epoch": 0.8714240343976742, "grad_norm": 0.6116885542869568, "learning_rate": 2.9232447500558176e-05, "loss": 0.0779, "num_input_tokens_seen": 23125504, "step": 17835 }, { "epoch": 0.8716683360613686, "grad_norm": 0.6625697016716003, "learning_rate": 2.9222771413435225e-05, "loss": 0.0859, "num_input_tokens_seen": 23132288, "step": 17840 }, { "epoch": 0.8719126377250629, "grad_norm": 0.12628573179244995, "learning_rate": 2.9213094675107848e-05, "loss": 0.0748, "num_input_tokens_seen": 23138592, "step": 17845 }, { "epoch": 0.8721569393887573, "grad_norm": 0.207814559340477, "learning_rate": 2.9203417287068335e-05, "loss": 0.0767, "num_input_tokens_seen": 23144960, "step": 17850 }, { "epoch": 0.8724012410524515, "grad_norm": 0.25968238711357117, "learning_rate": 2.9193739250809042e-05, "loss": 0.0889, "num_input_tokens_seen": 23151136, "step": 17855 }, { "epoch": 0.8726455427161459, "grad_norm": 0.5135070085525513, "learning_rate": 2.9184060567822463e-05, "loss": 0.0866, "num_input_tokens_seen": 23157536, "step": 17860 }, { "epoch": 0.8728898443798402, "grad_norm": 0.16186366975307465, "learning_rate": 2.9174381239601166e-05, "loss": 0.0816, "num_input_tokens_seen": 23163968, "step": 17865 }, { "epoch": 0.8731341460435346, "grad_norm": 0.3245939612388611, "learning_rate": 2.916470126763783e-05, "loss": 0.0966, "num_input_tokens_seen": 23170688, "step": 17870 }, { "epoch": 0.8733784477072288, "grad_norm": 0.6210430264472961, "learning_rate": 2.9155020653425203e-05, "loss": 0.078, "num_input_tokens_seen": 23177088, "step": 17875 }, { "epoch": 0.8736227493709232, "grad_norm": 0.12264757603406906, "learning_rate": 2.9145339398456184e-05, "loss": 0.0712, "num_input_tokens_seen": 23183360, "step": 17880 }, { "epoch": 0.8738670510346176, "grad_norm": 0.19218415021896362, "learning_rate": 2.913565750422374e-05, "loss": 0.1086, "num_input_tokens_seen": 23189216, "step": 17885 }, { "epoch": 0.8741113526983119, "grad_norm": 0.2929335832595825, "learning_rate": 2.9125974972220938e-05, "loss": 0.0683, "num_input_tokens_seen": 23196704, "step": 17890 }, { "epoch": 0.8743556543620062, "grad_norm": 0.3041479289531708, "learning_rate": 2.9116291803940932e-05, "loss": 0.1021, "num_input_tokens_seen": 23202944, "step": 17895 }, { "epoch": 0.8745999560257005, "grad_norm": 0.39147108793258667, "learning_rate": 2.910660800087701e-05, "loss": 0.1144, "num_input_tokens_seen": 23209184, "step": 17900 }, { "epoch": 0.8748442576893949, "grad_norm": 0.27165380120277405, "learning_rate": 2.909692356452254e-05, "loss": 0.0885, "num_input_tokens_seen": 23215360, "step": 17905 }, { "epoch": 0.8750885593530892, "grad_norm": 0.699313759803772, "learning_rate": 2.9087238496370962e-05, "loss": 0.0953, "num_input_tokens_seen": 23222048, "step": 17910 }, { "epoch": 0.8753328610167835, "grad_norm": 0.49722763895988464, "learning_rate": 2.907755279791583e-05, "loss": 0.0992, "num_input_tokens_seen": 23228608, "step": 17915 }, { "epoch": 0.8755771626804778, "grad_norm": 0.6658222079277039, "learning_rate": 2.906786647065083e-05, "loss": 0.0723, "num_input_tokens_seen": 23235072, "step": 17920 }, { "epoch": 0.8758214643441722, "grad_norm": 0.16629073023796082, "learning_rate": 2.9058179516069695e-05, "loss": 0.0636, "num_input_tokens_seen": 23241472, "step": 17925 }, { "epoch": 0.8760657660078666, "grad_norm": 0.49550822377204895, "learning_rate": 2.9048491935666282e-05, "loss": 0.0622, "num_input_tokens_seen": 23248160, "step": 17930 }, { "epoch": 0.8763100676715608, "grad_norm": 0.13368822634220123, "learning_rate": 2.9038803730934534e-05, "loss": 0.0897, "num_input_tokens_seen": 23254336, "step": 17935 }, { "epoch": 0.8765543693352552, "grad_norm": 0.3492453396320343, "learning_rate": 2.9029114903368503e-05, "loss": 0.0932, "num_input_tokens_seen": 23260960, "step": 17940 }, { "epoch": 0.8767986709989495, "grad_norm": 0.15997011959552765, "learning_rate": 2.9019425454462318e-05, "loss": 0.0996, "num_input_tokens_seen": 23267488, "step": 17945 }, { "epoch": 0.8770429726626439, "grad_norm": 0.5128812193870544, "learning_rate": 2.9009735385710212e-05, "loss": 0.0903, "num_input_tokens_seen": 23273824, "step": 17950 }, { "epoch": 0.8772872743263381, "grad_norm": 0.2652057707309723, "learning_rate": 2.900004469860652e-05, "loss": 0.0877, "num_input_tokens_seen": 23280384, "step": 17955 }, { "epoch": 0.8775315759900325, "grad_norm": 0.4811704158782959, "learning_rate": 2.8990353394645668e-05, "loss": 0.0857, "num_input_tokens_seen": 23286624, "step": 17960 }, { "epoch": 0.8777758776537268, "grad_norm": 0.2688468396663666, "learning_rate": 2.8980661475322186e-05, "loss": 0.1008, "num_input_tokens_seen": 23292736, "step": 17965 }, { "epoch": 0.8780201793174212, "grad_norm": 0.3442624509334564, "learning_rate": 2.897096894213067e-05, "loss": 0.0813, "num_input_tokens_seen": 23299392, "step": 17970 }, { "epoch": 0.8782644809811155, "grad_norm": 0.376981258392334, "learning_rate": 2.8961275796565845e-05, "loss": 0.105, "num_input_tokens_seen": 23305920, "step": 17975 }, { "epoch": 0.8785087826448098, "grad_norm": 0.19998958706855774, "learning_rate": 2.8951582040122517e-05, "loss": 0.0883, "num_input_tokens_seen": 23312416, "step": 17980 }, { "epoch": 0.8787530843085042, "grad_norm": 0.362074077129364, "learning_rate": 2.894188767429557e-05, "loss": 0.0739, "num_input_tokens_seen": 23318752, "step": 17985 }, { "epoch": 0.8789973859721985, "grad_norm": 0.29471856355667114, "learning_rate": 2.8932192700580014e-05, "loss": 0.0851, "num_input_tokens_seen": 23324960, "step": 17990 }, { "epoch": 0.8792416876358928, "grad_norm": 0.17022575438022614, "learning_rate": 2.8922497120470916e-05, "loss": 0.0954, "num_input_tokens_seen": 23331520, "step": 17995 }, { "epoch": 0.8794859892995871, "grad_norm": 0.27712467312812805, "learning_rate": 2.891280093546348e-05, "loss": 0.1112, "num_input_tokens_seen": 23338016, "step": 18000 }, { "epoch": 0.8794859892995871, "eval_loss": 0.08935371786355972, "eval_runtime": 374.3197, "eval_samples_per_second": 97.203, "eval_steps_per_second": 24.303, "num_input_tokens_seen": 23338016, "step": 18000 }, { "epoch": 0.8797302909632815, "grad_norm": 0.5262339115142822, "learning_rate": 2.890310414705297e-05, "loss": 0.0591, "num_input_tokens_seen": 23344544, "step": 18005 }, { "epoch": 0.8799745926269757, "grad_norm": 0.34690868854522705, "learning_rate": 2.8893406756734742e-05, "loss": 0.0929, "num_input_tokens_seen": 23350848, "step": 18010 }, { "epoch": 0.8802188942906701, "grad_norm": 0.31060031056404114, "learning_rate": 2.888370876600427e-05, "loss": 0.1053, "num_input_tokens_seen": 23357248, "step": 18015 }, { "epoch": 0.8804631959543644, "grad_norm": 0.18808382749557495, "learning_rate": 2.8874010176357104e-05, "loss": 0.0969, "num_input_tokens_seen": 23363584, "step": 18020 }, { "epoch": 0.8807074976180588, "grad_norm": 0.23026956617832184, "learning_rate": 2.886431098928888e-05, "loss": 0.0687, "num_input_tokens_seen": 23369632, "step": 18025 }, { "epoch": 0.8809517992817532, "grad_norm": 0.5480344891548157, "learning_rate": 2.885461120629534e-05, "loss": 0.0706, "num_input_tokens_seen": 23376640, "step": 18030 }, { "epoch": 0.8811961009454474, "grad_norm": 0.2932250499725342, "learning_rate": 2.8844910828872317e-05, "loss": 0.0863, "num_input_tokens_seen": 23383424, "step": 18035 }, { "epoch": 0.8814404026091418, "grad_norm": 0.3285616338253021, "learning_rate": 2.8835209858515715e-05, "loss": 0.0718, "num_input_tokens_seen": 23389824, "step": 18040 }, { "epoch": 0.8816847042728361, "grad_norm": 0.393588125705719, "learning_rate": 2.8825508296721566e-05, "loss": 0.0788, "num_input_tokens_seen": 23396928, "step": 18045 }, { "epoch": 0.8819290059365305, "grad_norm": 0.17610865831375122, "learning_rate": 2.881580614498596e-05, "loss": 0.0781, "num_input_tokens_seen": 23403264, "step": 18050 }, { "epoch": 0.8821733076002247, "grad_norm": 0.18532267212867737, "learning_rate": 2.8806103404805103e-05, "loss": 0.067, "num_input_tokens_seen": 23410784, "step": 18055 }, { "epoch": 0.8824176092639191, "grad_norm": 0.13501639664173126, "learning_rate": 2.8796400077675257e-05, "loss": 0.0627, "num_input_tokens_seen": 23416864, "step": 18060 }, { "epoch": 0.8826619109276134, "grad_norm": 0.2069971114397049, "learning_rate": 2.8786696165092812e-05, "loss": 0.0783, "num_input_tokens_seen": 23423392, "step": 18065 }, { "epoch": 0.8829062125913077, "grad_norm": 0.3232601583003998, "learning_rate": 2.8776991668554236e-05, "loss": 0.0768, "num_input_tokens_seen": 23429632, "step": 18070 }, { "epoch": 0.8831505142550021, "grad_norm": 0.6066786646842957, "learning_rate": 2.876728658955608e-05, "loss": 0.0731, "num_input_tokens_seen": 23436416, "step": 18075 }, { "epoch": 0.8833948159186964, "grad_norm": 0.31345972418785095, "learning_rate": 2.8757580929594986e-05, "loss": 0.0984, "num_input_tokens_seen": 23442240, "step": 18080 }, { "epoch": 0.8836391175823908, "grad_norm": 0.3988233208656311, "learning_rate": 2.87478746901677e-05, "loss": 0.0904, "num_input_tokens_seen": 23448352, "step": 18085 }, { "epoch": 0.883883419246085, "grad_norm": 0.3525289297103882, "learning_rate": 2.873816787277103e-05, "loss": 0.0949, "num_input_tokens_seen": 23454528, "step": 18090 }, { "epoch": 0.8841277209097794, "grad_norm": 0.5145946741104126, "learning_rate": 2.8728460478901903e-05, "loss": 0.1062, "num_input_tokens_seen": 23460672, "step": 18095 }, { "epoch": 0.8843720225734737, "grad_norm": 0.7407499551773071, "learning_rate": 2.8718752510057307e-05, "loss": 0.1043, "num_input_tokens_seen": 23467200, "step": 18100 }, { "epoch": 0.8846163242371681, "grad_norm": 0.3354305624961853, "learning_rate": 2.870904396773435e-05, "loss": 0.0921, "num_input_tokens_seen": 23473760, "step": 18105 }, { "epoch": 0.8848606259008623, "grad_norm": 0.13561880588531494, "learning_rate": 2.86993348534302e-05, "loss": 0.059, "num_input_tokens_seen": 23479968, "step": 18110 }, { "epoch": 0.8851049275645567, "grad_norm": 0.17650875449180603, "learning_rate": 2.868962516864212e-05, "loss": 0.0999, "num_input_tokens_seen": 23486272, "step": 18115 }, { "epoch": 0.885349229228251, "grad_norm": 0.29027631878852844, "learning_rate": 2.8679914914867477e-05, "loss": 0.0799, "num_input_tokens_seen": 23493312, "step": 18120 }, { "epoch": 0.8855935308919454, "grad_norm": 0.19302977621555328, "learning_rate": 2.8670204093603713e-05, "loss": 0.0909, "num_input_tokens_seen": 23500128, "step": 18125 }, { "epoch": 0.8858378325556397, "grad_norm": 0.16494719684123993, "learning_rate": 2.8660492706348357e-05, "loss": 0.0816, "num_input_tokens_seen": 23506624, "step": 18130 }, { "epoch": 0.886082134219334, "grad_norm": 0.3132178485393524, "learning_rate": 2.8650780754599022e-05, "loss": 0.0891, "num_input_tokens_seen": 23512864, "step": 18135 }, { "epoch": 0.8863264358830284, "grad_norm": 0.31071940064430237, "learning_rate": 2.8641068239853407e-05, "loss": 0.0667, "num_input_tokens_seen": 23519232, "step": 18140 }, { "epoch": 0.8865707375467227, "grad_norm": 0.5971336960792542, "learning_rate": 2.863135516360932e-05, "loss": 0.0941, "num_input_tokens_seen": 23526336, "step": 18145 }, { "epoch": 0.886815039210417, "grad_norm": 0.20479370653629303, "learning_rate": 2.8621641527364633e-05, "loss": 0.0735, "num_input_tokens_seen": 23532992, "step": 18150 }, { "epoch": 0.8870593408741113, "grad_norm": 0.3925824463367462, "learning_rate": 2.8611927332617313e-05, "loss": 0.071, "num_input_tokens_seen": 23539360, "step": 18155 }, { "epoch": 0.8873036425378057, "grad_norm": 0.2449013888835907, "learning_rate": 2.8602212580865405e-05, "loss": 0.0741, "num_input_tokens_seen": 23545856, "step": 18160 }, { "epoch": 0.8875479442015, "grad_norm": 0.5615682005882263, "learning_rate": 2.859249727360705e-05, "loss": 0.0899, "num_input_tokens_seen": 23552384, "step": 18165 }, { "epoch": 0.8877922458651943, "grad_norm": 0.5315673351287842, "learning_rate": 2.8582781412340465e-05, "loss": 0.0906, "num_input_tokens_seen": 23558560, "step": 18170 }, { "epoch": 0.8880365475288887, "grad_norm": 0.2648983299732208, "learning_rate": 2.857306499856397e-05, "loss": 0.1114, "num_input_tokens_seen": 23564928, "step": 18175 }, { "epoch": 0.888280849192583, "grad_norm": 0.2779281437397003, "learning_rate": 2.856334803377594e-05, "loss": 0.0807, "num_input_tokens_seen": 23571296, "step": 18180 }, { "epoch": 0.8885251508562774, "grad_norm": 0.20126797258853912, "learning_rate": 2.8553630519474867e-05, "loss": 0.0826, "num_input_tokens_seen": 23578080, "step": 18185 }, { "epoch": 0.8887694525199716, "grad_norm": 0.1102561354637146, "learning_rate": 2.8543912457159317e-05, "loss": 0.0615, "num_input_tokens_seen": 23584672, "step": 18190 }, { "epoch": 0.889013754183666, "grad_norm": 0.29926663637161255, "learning_rate": 2.853419384832792e-05, "loss": 0.1066, "num_input_tokens_seen": 23591712, "step": 18195 }, { "epoch": 0.8892580558473603, "grad_norm": 1.0234917402267456, "learning_rate": 2.8524474694479423e-05, "loss": 0.1136, "num_input_tokens_seen": 23598208, "step": 18200 }, { "epoch": 0.8892580558473603, "eval_loss": 0.08968717604875565, "eval_runtime": 375.1115, "eval_samples_per_second": 96.998, "eval_steps_per_second": 24.251, "num_input_tokens_seen": 23598208, "step": 18200 }, { "epoch": 0.8895023575110547, "grad_norm": 0.1181454285979271, "learning_rate": 2.851475499711264e-05, "loss": 0.0975, "num_input_tokens_seen": 23604864, "step": 18205 }, { "epoch": 0.8897466591747489, "grad_norm": 0.3005370497703552, "learning_rate": 2.8505034757726468e-05, "loss": 0.1016, "num_input_tokens_seen": 23610880, "step": 18210 }, { "epoch": 0.8899909608384433, "grad_norm": 0.2371739000082016, "learning_rate": 2.8495313977819886e-05, "loss": 0.1071, "num_input_tokens_seen": 23617408, "step": 18215 }, { "epoch": 0.8902352625021377, "grad_norm": 0.14311620593070984, "learning_rate": 2.8485592658891956e-05, "loss": 0.0771, "num_input_tokens_seen": 23623488, "step": 18220 }, { "epoch": 0.890479564165832, "grad_norm": 0.17126630246639252, "learning_rate": 2.8475870802441844e-05, "loss": 0.0885, "num_input_tokens_seen": 23630368, "step": 18225 }, { "epoch": 0.8907238658295263, "grad_norm": 0.21971753239631653, "learning_rate": 2.8466148409968774e-05, "loss": 0.0898, "num_input_tokens_seen": 23636800, "step": 18230 }, { "epoch": 0.8909681674932206, "grad_norm": 0.16386130452156067, "learning_rate": 2.8456425482972067e-05, "loss": 0.0783, "num_input_tokens_seen": 23643072, "step": 18235 }, { "epoch": 0.891212469156915, "grad_norm": 0.2038736790418625, "learning_rate": 2.84467020229511e-05, "loss": 0.0857, "num_input_tokens_seen": 23649568, "step": 18240 }, { "epoch": 0.8914567708206093, "grad_norm": 0.2339845895767212, "learning_rate": 2.8436978031405375e-05, "loss": 0.1044, "num_input_tokens_seen": 23655712, "step": 18245 }, { "epoch": 0.8917010724843036, "grad_norm": 0.10625656694173813, "learning_rate": 2.842725350983445e-05, "loss": 0.082, "num_input_tokens_seen": 23661920, "step": 18250 }, { "epoch": 0.8919453741479979, "grad_norm": 0.19155161082744598, "learning_rate": 2.8417528459737957e-05, "loss": 0.0705, "num_input_tokens_seen": 23668608, "step": 18255 }, { "epoch": 0.8921896758116923, "grad_norm": 0.16559500992298126, "learning_rate": 2.8407802882615624e-05, "loss": 0.0671, "num_input_tokens_seen": 23675168, "step": 18260 }, { "epoch": 0.8924339774753866, "grad_norm": 0.22591069340705872, "learning_rate": 2.8398076779967277e-05, "loss": 0.0969, "num_input_tokens_seen": 23681344, "step": 18265 }, { "epoch": 0.8926782791390809, "grad_norm": 0.5535298585891724, "learning_rate": 2.8388350153292774e-05, "loss": 0.1241, "num_input_tokens_seen": 23688128, "step": 18270 }, { "epoch": 0.8929225808027753, "grad_norm": 0.3773252069950104, "learning_rate": 2.8378623004092103e-05, "loss": 0.0848, "num_input_tokens_seen": 23694560, "step": 18275 }, { "epoch": 0.8931668824664696, "grad_norm": 0.16177648305892944, "learning_rate": 2.8368895333865302e-05, "loss": 0.07, "num_input_tokens_seen": 23700960, "step": 18280 }, { "epoch": 0.893411184130164, "grad_norm": 0.21684448421001434, "learning_rate": 2.835916714411251e-05, "loss": 0.0814, "num_input_tokens_seen": 23707680, "step": 18285 }, { "epoch": 0.8936554857938582, "grad_norm": 0.2805804908275604, "learning_rate": 2.8349438436333926e-05, "loss": 0.0771, "num_input_tokens_seen": 23714720, "step": 18290 }, { "epoch": 0.8938997874575526, "grad_norm": 0.3821087181568146, "learning_rate": 2.833970921202984e-05, "loss": 0.0897, "num_input_tokens_seen": 23721088, "step": 18295 }, { "epoch": 0.8941440891212469, "grad_norm": 0.16066300868988037, "learning_rate": 2.8329979472700628e-05, "loss": 0.0628, "num_input_tokens_seen": 23727840, "step": 18300 }, { "epoch": 0.8943883907849413, "grad_norm": 0.17202377319335938, "learning_rate": 2.832024921984674e-05, "loss": 0.077, "num_input_tokens_seen": 23734304, "step": 18305 }, { "epoch": 0.8946326924486355, "grad_norm": 0.41301921010017395, "learning_rate": 2.8310518454968693e-05, "loss": 0.1379, "num_input_tokens_seen": 23740864, "step": 18310 }, { "epoch": 0.8948769941123299, "grad_norm": 0.6366181969642639, "learning_rate": 2.8300787179567095e-05, "loss": 0.0895, "num_input_tokens_seen": 23747328, "step": 18315 }, { "epoch": 0.8951212957760243, "grad_norm": 0.3795630931854248, "learning_rate": 2.8291055395142636e-05, "loss": 0.0759, "num_input_tokens_seen": 23754176, "step": 18320 }, { "epoch": 0.8953655974397186, "grad_norm": 0.35972025990486145, "learning_rate": 2.8281323103196073e-05, "loss": 0.0857, "num_input_tokens_seen": 23760160, "step": 18325 }, { "epoch": 0.8956098991034129, "grad_norm": 0.16973064839839935, "learning_rate": 2.8271590305228256e-05, "loss": 0.0788, "num_input_tokens_seen": 23766848, "step": 18330 }, { "epoch": 0.8958542007671072, "grad_norm": 0.3788314461708069, "learning_rate": 2.82618570027401e-05, "loss": 0.0929, "num_input_tokens_seen": 23773056, "step": 18335 }, { "epoch": 0.8960985024308016, "grad_norm": 0.495332807302475, "learning_rate": 2.8252123197232604e-05, "loss": 0.0896, "num_input_tokens_seen": 23779584, "step": 18340 }, { "epoch": 0.8963428040944958, "grad_norm": 0.46904754638671875, "learning_rate": 2.8242388890206843e-05, "loss": 0.1004, "num_input_tokens_seen": 23785728, "step": 18345 }, { "epoch": 0.8965871057581902, "grad_norm": 0.2710905075073242, "learning_rate": 2.8232654083163967e-05, "loss": 0.0758, "num_input_tokens_seen": 23792096, "step": 18350 }, { "epoch": 0.8968314074218845, "grad_norm": 0.6168007254600525, "learning_rate": 2.822291877760521e-05, "loss": 0.0863, "num_input_tokens_seen": 23798944, "step": 18355 }, { "epoch": 0.8970757090855789, "grad_norm": 0.1395745873451233, "learning_rate": 2.8213182975031864e-05, "loss": 0.0969, "num_input_tokens_seen": 23805120, "step": 18360 }, { "epoch": 0.8973200107492733, "grad_norm": 0.24542613327503204, "learning_rate": 2.8203446676945337e-05, "loss": 0.0915, "num_input_tokens_seen": 23811424, "step": 18365 }, { "epoch": 0.8975643124129675, "grad_norm": 0.1231754943728447, "learning_rate": 2.8193709884847075e-05, "loss": 0.0924, "num_input_tokens_seen": 23818016, "step": 18370 }, { "epoch": 0.8978086140766619, "grad_norm": 0.3409535884857178, "learning_rate": 2.8183972600238605e-05, "loss": 0.0965, "num_input_tokens_seen": 23824512, "step": 18375 }, { "epoch": 0.8980529157403562, "grad_norm": 0.5827006697654724, "learning_rate": 2.817423482462156e-05, "loss": 0.099, "num_input_tokens_seen": 23830784, "step": 18380 }, { "epoch": 0.8982972174040506, "grad_norm": 0.09947306662797928, "learning_rate": 2.8164496559497605e-05, "loss": 0.0769, "num_input_tokens_seen": 23837888, "step": 18385 }, { "epoch": 0.8985415190677448, "grad_norm": 0.5161409974098206, "learning_rate": 2.815475780636852e-05, "loss": 0.113, "num_input_tokens_seen": 23844416, "step": 18390 }, { "epoch": 0.8987858207314392, "grad_norm": 0.2833668887615204, "learning_rate": 2.814501856673613e-05, "loss": 0.0899, "num_input_tokens_seen": 23851296, "step": 18395 }, { "epoch": 0.8990301223951335, "grad_norm": 0.2156926840543747, "learning_rate": 2.8135278842102353e-05, "loss": 0.0664, "num_input_tokens_seen": 23857824, "step": 18400 }, { "epoch": 0.8990301223951335, "eval_loss": 0.09000683575868607, "eval_runtime": 374.2351, "eval_samples_per_second": 97.225, "eval_steps_per_second": 24.308, "num_input_tokens_seen": 23857824, "step": 18400 }, { "epoch": 0.8992744240588278, "grad_norm": 0.19635185599327087, "learning_rate": 2.8125538633969183e-05, "loss": 0.0876, "num_input_tokens_seen": 23864480, "step": 18405 }, { "epoch": 0.8995187257225221, "grad_norm": 0.9454399347305298, "learning_rate": 2.8115797943838677e-05, "loss": 0.128, "num_input_tokens_seen": 23871104, "step": 18410 }, { "epoch": 0.8997630273862165, "grad_norm": 0.48535746335983276, "learning_rate": 2.810605677321298e-05, "loss": 0.0699, "num_input_tokens_seen": 23877440, "step": 18415 }, { "epoch": 0.9000073290499109, "grad_norm": 0.15241964161396027, "learning_rate": 2.809631512359428e-05, "loss": 0.0964, "num_input_tokens_seen": 23883776, "step": 18420 }, { "epoch": 0.9002516307136051, "grad_norm": 0.3591036796569824, "learning_rate": 2.8086572996484884e-05, "loss": 0.0998, "num_input_tokens_seen": 23890400, "step": 18425 }, { "epoch": 0.9004959323772995, "grad_norm": 0.2231893241405487, "learning_rate": 2.8076830393387143e-05, "loss": 0.061, "num_input_tokens_seen": 23896832, "step": 18430 }, { "epoch": 0.9007402340409938, "grad_norm": 0.15624740719795227, "learning_rate": 2.8067087315803497e-05, "loss": 0.0951, "num_input_tokens_seen": 23903360, "step": 18435 }, { "epoch": 0.9009845357046882, "grad_norm": 0.4854942560195923, "learning_rate": 2.8057343765236433e-05, "loss": 0.0816, "num_input_tokens_seen": 23909984, "step": 18440 }, { "epoch": 0.9012288373683824, "grad_norm": 0.3051069676876068, "learning_rate": 2.804759974318854e-05, "loss": 0.0893, "num_input_tokens_seen": 23916800, "step": 18445 }, { "epoch": 0.9014731390320768, "grad_norm": 0.1815921813249588, "learning_rate": 2.8037855251162482e-05, "loss": 0.0961, "num_input_tokens_seen": 23923552, "step": 18450 }, { "epoch": 0.9017174406957711, "grad_norm": 0.1928400695323944, "learning_rate": 2.802811029066096e-05, "loss": 0.0814, "num_input_tokens_seen": 23930176, "step": 18455 }, { "epoch": 0.9019617423594655, "grad_norm": 0.5513299107551575, "learning_rate": 2.8018364863186764e-05, "loss": 0.0963, "num_input_tokens_seen": 23936768, "step": 18460 }, { "epoch": 0.9022060440231598, "grad_norm": 0.6882994771003723, "learning_rate": 2.800861897024279e-05, "loss": 0.0701, "num_input_tokens_seen": 23943424, "step": 18465 }, { "epoch": 0.9024503456868541, "grad_norm": 0.4078698456287384, "learning_rate": 2.799887261333196e-05, "loss": 0.1012, "num_input_tokens_seen": 23949376, "step": 18470 }, { "epoch": 0.9026946473505485, "grad_norm": 0.3367801606655121, "learning_rate": 2.798912579395728e-05, "loss": 0.0948, "num_input_tokens_seen": 23956192, "step": 18475 }, { "epoch": 0.9029389490142428, "grad_norm": 0.28776121139526367, "learning_rate": 2.797937851362185e-05, "loss": 0.0926, "num_input_tokens_seen": 23962528, "step": 18480 }, { "epoch": 0.9031832506779371, "grad_norm": 0.4738346040248871, "learning_rate": 2.7969630773828802e-05, "loss": 0.1111, "num_input_tokens_seen": 23969280, "step": 18485 }, { "epoch": 0.9034275523416314, "grad_norm": 0.09871172159910202, "learning_rate": 2.7959882576081382e-05, "loss": 0.0788, "num_input_tokens_seen": 23975712, "step": 18490 }, { "epoch": 0.9036718540053258, "grad_norm": 0.2622159421443939, "learning_rate": 2.795013392188286e-05, "loss": 0.0791, "num_input_tokens_seen": 23982048, "step": 18495 }, { "epoch": 0.9039161556690201, "grad_norm": 0.17691679298877716, "learning_rate": 2.7940384812736614e-05, "loss": 0.0809, "num_input_tokens_seen": 23988416, "step": 18500 }, { "epoch": 0.9041604573327144, "grad_norm": 0.11957360804080963, "learning_rate": 2.7930635250146087e-05, "loss": 0.0604, "num_input_tokens_seen": 23994624, "step": 18505 }, { "epoch": 0.9044047589964088, "grad_norm": 0.16198068857192993, "learning_rate": 2.792088523561477e-05, "loss": 0.1091, "num_input_tokens_seen": 24001056, "step": 18510 }, { "epoch": 0.9046490606601031, "grad_norm": 0.4794202148914337, "learning_rate": 2.7911134770646246e-05, "loss": 0.1026, "num_input_tokens_seen": 24007264, "step": 18515 }, { "epoch": 0.9048933623237975, "grad_norm": 0.5806252360343933, "learning_rate": 2.7901383856744157e-05, "loss": 0.1025, "num_input_tokens_seen": 24013728, "step": 18520 }, { "epoch": 0.9051376639874917, "grad_norm": 0.2020082026720047, "learning_rate": 2.7891632495412217e-05, "loss": 0.0856, "num_input_tokens_seen": 24020640, "step": 18525 }, { "epoch": 0.9053819656511861, "grad_norm": 0.1951412558555603, "learning_rate": 2.7881880688154205e-05, "loss": 0.0861, "num_input_tokens_seen": 24027488, "step": 18530 }, { "epoch": 0.9056262673148804, "grad_norm": 0.32649165391921997, "learning_rate": 2.7872128436473977e-05, "loss": 0.0943, "num_input_tokens_seen": 24034240, "step": 18535 }, { "epoch": 0.9058705689785748, "grad_norm": 0.3095437288284302, "learning_rate": 2.7862375741875448e-05, "loss": 0.0731, "num_input_tokens_seen": 24040832, "step": 18540 }, { "epoch": 0.906114870642269, "grad_norm": 0.12351945042610168, "learning_rate": 2.785262260586261e-05, "loss": 0.0974, "num_input_tokens_seen": 24047072, "step": 18545 }, { "epoch": 0.9063591723059634, "grad_norm": 0.18757542967796326, "learning_rate": 2.7842869029939517e-05, "loss": 0.0721, "num_input_tokens_seen": 24053600, "step": 18550 }, { "epoch": 0.9066034739696577, "grad_norm": 0.30250558257102966, "learning_rate": 2.7833115015610296e-05, "loss": 0.1062, "num_input_tokens_seen": 24059904, "step": 18555 }, { "epoch": 0.9068477756333521, "grad_norm": 0.29374057054519653, "learning_rate": 2.7823360564379136e-05, "loss": 0.1002, "num_input_tokens_seen": 24066112, "step": 18560 }, { "epoch": 0.9070920772970464, "grad_norm": 0.29185184836387634, "learning_rate": 2.7813605677750297e-05, "loss": 0.0807, "num_input_tokens_seen": 24072416, "step": 18565 }, { "epoch": 0.9073363789607407, "grad_norm": 0.3638104200363159, "learning_rate": 2.7803850357228102e-05, "loss": 0.0635, "num_input_tokens_seen": 24079168, "step": 18570 }, { "epoch": 0.9075806806244351, "grad_norm": 0.5562418699264526, "learning_rate": 2.779409460431695e-05, "loss": 0.0955, "num_input_tokens_seen": 24085632, "step": 18575 }, { "epoch": 0.9078249822881294, "grad_norm": 0.16471222043037415, "learning_rate": 2.778433842052129e-05, "loss": 0.0862, "num_input_tokens_seen": 24091808, "step": 18580 }, { "epoch": 0.9080692839518237, "grad_norm": 0.2286747545003891, "learning_rate": 2.7774581807345664e-05, "loss": 0.0777, "num_input_tokens_seen": 24098208, "step": 18585 }, { "epoch": 0.908313585615518, "grad_norm": 0.3724447190761566, "learning_rate": 2.776482476629465e-05, "loss": 0.0743, "num_input_tokens_seen": 24104352, "step": 18590 }, { "epoch": 0.9085578872792124, "grad_norm": 0.46105629205703735, "learning_rate": 2.7755067298872924e-05, "loss": 0.1028, "num_input_tokens_seen": 24110496, "step": 18595 }, { "epoch": 0.9088021889429067, "grad_norm": 0.2834215462207794, "learning_rate": 2.774530940658518e-05, "loss": 0.0703, "num_input_tokens_seen": 24117056, "step": 18600 }, { "epoch": 0.9088021889429067, "eval_loss": 0.08918669819831848, "eval_runtime": 374.7988, "eval_samples_per_second": 97.079, "eval_steps_per_second": 24.272, "num_input_tokens_seen": 24117056, "step": 18600 }, { "epoch": 0.909046490606601, "grad_norm": 0.2959657907485962, "learning_rate": 2.7735551090936236e-05, "loss": 0.0957, "num_input_tokens_seen": 24123392, "step": 18605 }, { "epoch": 0.9092907922702954, "grad_norm": 0.22979776561260223, "learning_rate": 2.7725792353430934e-05, "loss": 0.0681, "num_input_tokens_seen": 24129472, "step": 18610 }, { "epoch": 0.9095350939339897, "grad_norm": 0.5003346800804138, "learning_rate": 2.77160331955742e-05, "loss": 0.0801, "num_input_tokens_seen": 24136128, "step": 18615 }, { "epoch": 0.9097793955976841, "grad_norm": 0.20952247083187103, "learning_rate": 2.7706273618871008e-05, "loss": 0.1168, "num_input_tokens_seen": 24142848, "step": 18620 }, { "epoch": 0.9100236972613783, "grad_norm": 0.22049996256828308, "learning_rate": 2.769651362482642e-05, "loss": 0.0633, "num_input_tokens_seen": 24149312, "step": 18625 }, { "epoch": 0.9102679989250727, "grad_norm": 0.6202604174613953, "learning_rate": 2.768675321494555e-05, "loss": 0.1026, "num_input_tokens_seen": 24155520, "step": 18630 }, { "epoch": 0.910512300588767, "grad_norm": 0.15054070949554443, "learning_rate": 2.7676992390733565e-05, "loss": 0.0915, "num_input_tokens_seen": 24162592, "step": 18635 }, { "epoch": 0.9107566022524614, "grad_norm": 0.31681862473487854, "learning_rate": 2.766723115369571e-05, "loss": 0.0812, "num_input_tokens_seen": 24169120, "step": 18640 }, { "epoch": 0.9110009039161556, "grad_norm": 0.16439735889434814, "learning_rate": 2.765746950533729e-05, "loss": 0.1026, "num_input_tokens_seen": 24175904, "step": 18645 }, { "epoch": 0.91124520557985, "grad_norm": 0.45854806900024414, "learning_rate": 2.7647707447163684e-05, "loss": 0.0886, "num_input_tokens_seen": 24182400, "step": 18650 }, { "epoch": 0.9114895072435444, "grad_norm": 0.2203211635351181, "learning_rate": 2.7637944980680315e-05, "loss": 0.0936, "num_input_tokens_seen": 24188768, "step": 18655 }, { "epoch": 0.9117338089072387, "grad_norm": 0.14891588687896729, "learning_rate": 2.762818210739268e-05, "loss": 0.0815, "num_input_tokens_seen": 24195200, "step": 18660 }, { "epoch": 0.911978110570933, "grad_norm": 0.4684690833091736, "learning_rate": 2.7618418828806332e-05, "loss": 0.096, "num_input_tokens_seen": 24201792, "step": 18665 }, { "epoch": 0.9122224122346273, "grad_norm": 0.23849685490131378, "learning_rate": 2.76086551464269e-05, "loss": 0.104, "num_input_tokens_seen": 24208384, "step": 18670 }, { "epoch": 0.9124667138983217, "grad_norm": 0.17853224277496338, "learning_rate": 2.759889106176006e-05, "loss": 0.0778, "num_input_tokens_seen": 24214656, "step": 18675 }, { "epoch": 0.912711015562016, "grad_norm": 0.1320127248764038, "learning_rate": 2.758912657631156e-05, "loss": 0.0988, "num_input_tokens_seen": 24221056, "step": 18680 }, { "epoch": 0.9129553172257103, "grad_norm": 0.13438478112220764, "learning_rate": 2.7579361691587198e-05, "loss": 0.0858, "num_input_tokens_seen": 24227200, "step": 18685 }, { "epoch": 0.9131996188894046, "grad_norm": 0.6589659452438354, "learning_rate": 2.756959640909285e-05, "loss": 0.1177, "num_input_tokens_seen": 24233184, "step": 18690 }, { "epoch": 0.913443920553099, "grad_norm": 0.20282405614852905, "learning_rate": 2.7559830730334452e-05, "loss": 0.0917, "num_input_tokens_seen": 24239488, "step": 18695 }, { "epoch": 0.9136882222167932, "grad_norm": 0.40042561292648315, "learning_rate": 2.7550064656817988e-05, "loss": 0.0956, "num_input_tokens_seen": 24246048, "step": 18700 }, { "epoch": 0.9139325238804876, "grad_norm": 0.23250967264175415, "learning_rate": 2.7540298190049503e-05, "loss": 0.0941, "num_input_tokens_seen": 24252256, "step": 18705 }, { "epoch": 0.914176825544182, "grad_norm": 0.3419518768787384, "learning_rate": 2.7530531331535107e-05, "loss": 0.0569, "num_input_tokens_seen": 24258528, "step": 18710 }, { "epoch": 0.9144211272078763, "grad_norm": 0.2704160809516907, "learning_rate": 2.752076408278099e-05, "loss": 0.0851, "num_input_tokens_seen": 24264736, "step": 18715 }, { "epoch": 0.9146654288715707, "grad_norm": 0.18581075966358185, "learning_rate": 2.751099644529337e-05, "loss": 0.0909, "num_input_tokens_seen": 24271584, "step": 18720 }, { "epoch": 0.9149097305352649, "grad_norm": 0.2002429962158203, "learning_rate": 2.7501228420578533e-05, "loss": 0.0933, "num_input_tokens_seen": 24278432, "step": 18725 }, { "epoch": 0.9151540321989593, "grad_norm": 0.2136254757642746, "learning_rate": 2.7491460010142857e-05, "loss": 0.0848, "num_input_tokens_seen": 24284704, "step": 18730 }, { "epoch": 0.9153983338626536, "grad_norm": 0.2338852733373642, "learning_rate": 2.7481691215492727e-05, "loss": 0.0876, "num_input_tokens_seen": 24291392, "step": 18735 }, { "epoch": 0.915642635526348, "grad_norm": 0.2529701888561249, "learning_rate": 2.747192203813463e-05, "loss": 0.061, "num_input_tokens_seen": 24298176, "step": 18740 }, { "epoch": 0.9158869371900422, "grad_norm": 0.7238819599151611, "learning_rate": 2.7462152479575087e-05, "loss": 0.1101, "num_input_tokens_seen": 24304640, "step": 18745 }, { "epoch": 0.9161312388537366, "grad_norm": 0.3024289608001709, "learning_rate": 2.7452382541320697e-05, "loss": 0.0825, "num_input_tokens_seen": 24310816, "step": 18750 }, { "epoch": 0.916375540517431, "grad_norm": 0.161226287484169, "learning_rate": 2.7442612224878096e-05, "loss": 0.0801, "num_input_tokens_seen": 24317120, "step": 18755 }, { "epoch": 0.9166198421811252, "grad_norm": 0.362202912569046, "learning_rate": 2.7432841531753994e-05, "loss": 0.0872, "num_input_tokens_seen": 24323648, "step": 18760 }, { "epoch": 0.9168641438448196, "grad_norm": 0.5028149485588074, "learning_rate": 2.7423070463455147e-05, "loss": 0.0594, "num_input_tokens_seen": 24330848, "step": 18765 }, { "epoch": 0.9171084455085139, "grad_norm": 0.5807539224624634, "learning_rate": 2.7413299021488397e-05, "loss": 0.0963, "num_input_tokens_seen": 24337312, "step": 18770 }, { "epoch": 0.9173527471722083, "grad_norm": 0.3709031939506531, "learning_rate": 2.7403527207360615e-05, "loss": 0.1245, "num_input_tokens_seen": 24343904, "step": 18775 }, { "epoch": 0.9175970488359025, "grad_norm": 0.7842275500297546, "learning_rate": 2.7393755022578722e-05, "loss": 0.0856, "num_input_tokens_seen": 24350496, "step": 18780 }, { "epoch": 0.9178413504995969, "grad_norm": 0.1645662933588028, "learning_rate": 2.7383982468649714e-05, "loss": 0.0822, "num_input_tokens_seen": 24356512, "step": 18785 }, { "epoch": 0.9180856521632912, "grad_norm": 0.2289493978023529, "learning_rate": 2.7374209547080665e-05, "loss": 0.0812, "num_input_tokens_seen": 24362688, "step": 18790 }, { "epoch": 0.9183299538269856, "grad_norm": 0.5979043245315552, "learning_rate": 2.7364436259378663e-05, "loss": 0.0861, "num_input_tokens_seen": 24368992, "step": 18795 }, { "epoch": 0.9185742554906798, "grad_norm": 0.22755488753318787, "learning_rate": 2.735466260705088e-05, "loss": 0.1125, "num_input_tokens_seen": 24375456, "step": 18800 }, { "epoch": 0.9185742554906798, "eval_loss": 0.08907565474510193, "eval_runtime": 374.0046, "eval_samples_per_second": 97.285, "eval_steps_per_second": 24.323, "num_input_tokens_seen": 24375456, "step": 18800 }, { "epoch": 0.9188185571543742, "grad_norm": 0.6189422607421875, "learning_rate": 2.7344888591604524e-05, "loss": 0.0678, "num_input_tokens_seen": 24381888, "step": 18805 }, { "epoch": 0.9190628588180686, "grad_norm": 0.5734397172927856, "learning_rate": 2.7335114214546893e-05, "loss": 0.0711, "num_input_tokens_seen": 24388384, "step": 18810 }, { "epoch": 0.9193071604817629, "grad_norm": 0.4722439646720886, "learning_rate": 2.7325339477385293e-05, "loss": 0.1169, "num_input_tokens_seen": 24394880, "step": 18815 }, { "epoch": 0.9195514621454572, "grad_norm": 0.424305260181427, "learning_rate": 2.7315564381627128e-05, "loss": 0.0915, "num_input_tokens_seen": 24401184, "step": 18820 }, { "epoch": 0.9197957638091515, "grad_norm": 0.9461786150932312, "learning_rate": 2.7305788928779835e-05, "loss": 0.1031, "num_input_tokens_seen": 24407520, "step": 18825 }, { "epoch": 0.9200400654728459, "grad_norm": 0.1831243634223938, "learning_rate": 2.729601312035091e-05, "loss": 0.0941, "num_input_tokens_seen": 24414080, "step": 18830 }, { "epoch": 0.9202843671365402, "grad_norm": 0.26612114906311035, "learning_rate": 2.7286236957847915e-05, "loss": 0.074, "num_input_tokens_seen": 24420224, "step": 18835 }, { "epoch": 0.9205286688002345, "grad_norm": 0.26618492603302, "learning_rate": 2.7276460442778446e-05, "loss": 0.099, "num_input_tokens_seen": 24426304, "step": 18840 }, { "epoch": 0.9207729704639288, "grad_norm": 0.15495245158672333, "learning_rate": 2.726668357665017e-05, "loss": 0.0903, "num_input_tokens_seen": 24433216, "step": 18845 }, { "epoch": 0.9210172721276232, "grad_norm": 0.21513913571834564, "learning_rate": 2.7256906360970808e-05, "loss": 0.07, "num_input_tokens_seen": 24440000, "step": 18850 }, { "epoch": 0.9212615737913176, "grad_norm": 0.4052892327308655, "learning_rate": 2.7247128797248117e-05, "loss": 0.0913, "num_input_tokens_seen": 24446496, "step": 18855 }, { "epoch": 0.9215058754550118, "grad_norm": 0.5014392733573914, "learning_rate": 2.7237350886989925e-05, "loss": 0.0813, "num_input_tokens_seen": 24453088, "step": 18860 }, { "epoch": 0.9217501771187062, "grad_norm": 0.15917079150676727, "learning_rate": 2.7227572631704107e-05, "loss": 0.0963, "num_input_tokens_seen": 24460128, "step": 18865 }, { "epoch": 0.9219944787824005, "grad_norm": 0.40197068452835083, "learning_rate": 2.7217794032898596e-05, "loss": 0.0861, "num_input_tokens_seen": 24466336, "step": 18870 }, { "epoch": 0.9222387804460949, "grad_norm": 0.2629924714565277, "learning_rate": 2.7208015092081384e-05, "loss": 0.1367, "num_input_tokens_seen": 24472672, "step": 18875 }, { "epoch": 0.9224830821097891, "grad_norm": 0.13836370408535004, "learning_rate": 2.719823581076049e-05, "loss": 0.0728, "num_input_tokens_seen": 24479488, "step": 18880 }, { "epoch": 0.9227273837734835, "grad_norm": 0.31984415650367737, "learning_rate": 2.718845619044401e-05, "loss": 0.0921, "num_input_tokens_seen": 24485632, "step": 18885 }, { "epoch": 0.9229716854371778, "grad_norm": 0.17181473970413208, "learning_rate": 2.7178676232640088e-05, "loss": 0.076, "num_input_tokens_seen": 24492224, "step": 18890 }, { "epoch": 0.9232159871008722, "grad_norm": 0.12524822354316711, "learning_rate": 2.716889593885691e-05, "loss": 0.0962, "num_input_tokens_seen": 24499008, "step": 18895 }, { "epoch": 0.9234602887645665, "grad_norm": 0.45810627937316895, "learning_rate": 2.7159115310602716e-05, "loss": 0.0909, "num_input_tokens_seen": 24505568, "step": 18900 }, { "epoch": 0.9237045904282608, "grad_norm": 0.36905282735824585, "learning_rate": 2.7149334349385814e-05, "loss": 0.0832, "num_input_tokens_seen": 24512224, "step": 18905 }, { "epoch": 0.9239488920919552, "grad_norm": 0.3638761043548584, "learning_rate": 2.713955305671454e-05, "loss": 0.0749, "num_input_tokens_seen": 24518944, "step": 18910 }, { "epoch": 0.9241931937556495, "grad_norm": 0.692743718624115, "learning_rate": 2.71297714340973e-05, "loss": 0.0883, "num_input_tokens_seen": 24525408, "step": 18915 }, { "epoch": 0.9244374954193438, "grad_norm": 0.15802772343158722, "learning_rate": 2.7119989483042545e-05, "loss": 0.0772, "num_input_tokens_seen": 24531712, "step": 18920 }, { "epoch": 0.9246817970830381, "grad_norm": 0.19167020916938782, "learning_rate": 2.7110207205058768e-05, "loss": 0.0902, "num_input_tokens_seen": 24538304, "step": 18925 }, { "epoch": 0.9249260987467325, "grad_norm": 0.39523380994796753, "learning_rate": 2.7100424601654517e-05, "loss": 0.1217, "num_input_tokens_seen": 24544352, "step": 18930 }, { "epoch": 0.9251704004104268, "grad_norm": 0.10654401779174805, "learning_rate": 2.7090641674338403e-05, "loss": 0.0873, "num_input_tokens_seen": 24551168, "step": 18935 }, { "epoch": 0.9254147020741211, "grad_norm": 0.21582676470279694, "learning_rate": 2.7080858424619072e-05, "loss": 0.097, "num_input_tokens_seen": 24557536, "step": 18940 }, { "epoch": 0.9256590037378154, "grad_norm": 0.3746437430381775, "learning_rate": 2.707107485400521e-05, "loss": 0.0913, "num_input_tokens_seen": 24563680, "step": 18945 }, { "epoch": 0.9259033054015098, "grad_norm": 0.199446439743042, "learning_rate": 2.7061290964005586e-05, "loss": 0.0901, "num_input_tokens_seen": 24570336, "step": 18950 }, { "epoch": 0.9261476070652042, "grad_norm": 0.36616429686546326, "learning_rate": 2.7051506756129e-05, "loss": 0.0674, "num_input_tokens_seen": 24576704, "step": 18955 }, { "epoch": 0.9263919087288984, "grad_norm": 0.21355414390563965, "learning_rate": 2.704172223188428e-05, "loss": 0.1135, "num_input_tokens_seen": 24583072, "step": 18960 }, { "epoch": 0.9266362103925928, "grad_norm": 0.14952188730239868, "learning_rate": 2.7031937392780334e-05, "loss": 0.0633, "num_input_tokens_seen": 24589504, "step": 18965 }, { "epoch": 0.9268805120562871, "grad_norm": 0.32380568981170654, "learning_rate": 2.702215224032611e-05, "loss": 0.0837, "num_input_tokens_seen": 24595936, "step": 18970 }, { "epoch": 0.9271248137199815, "grad_norm": 0.645194947719574, "learning_rate": 2.70123667760306e-05, "loss": 0.0996, "num_input_tokens_seen": 24602560, "step": 18975 }, { "epoch": 0.9273691153836757, "grad_norm": 0.15498235821723938, "learning_rate": 2.7002581001402845e-05, "loss": 0.121, "num_input_tokens_seen": 24609440, "step": 18980 }, { "epoch": 0.9276134170473701, "grad_norm": 0.38611188530921936, "learning_rate": 2.6992794917951923e-05, "loss": 0.0901, "num_input_tokens_seen": 24616384, "step": 18985 }, { "epoch": 0.9278577187110644, "grad_norm": 0.35774001479148865, "learning_rate": 2.6983008527187e-05, "loss": 0.0868, "num_input_tokens_seen": 24622784, "step": 18990 }, { "epoch": 0.9281020203747588, "grad_norm": 0.25695672631263733, "learning_rate": 2.697322183061723e-05, "loss": 0.089, "num_input_tokens_seen": 24629344, "step": 18995 }, { "epoch": 0.9283463220384531, "grad_norm": 0.15359468758106232, "learning_rate": 2.696343482975186e-05, "loss": 0.0953, "num_input_tokens_seen": 24635712, "step": 19000 }, { "epoch": 0.9283463220384531, "eval_loss": 0.0890296921133995, "eval_runtime": 373.9314, "eval_samples_per_second": 97.304, "eval_steps_per_second": 24.328, "num_input_tokens_seen": 24635712, "step": 19000 }, { "epoch": 0.9285906237021474, "grad_norm": 0.29417580366134644, "learning_rate": 2.695364752610016e-05, "loss": 0.1036, "num_input_tokens_seen": 24642432, "step": 19005 }, { "epoch": 0.9288349253658418, "grad_norm": 0.2200874239206314, "learning_rate": 2.6943859921171467e-05, "loss": 0.0775, "num_input_tokens_seen": 24648896, "step": 19010 }, { "epoch": 0.929079227029536, "grad_norm": 0.16316738724708557, "learning_rate": 2.6934072016475143e-05, "loss": 0.0846, "num_input_tokens_seen": 24655360, "step": 19015 }, { "epoch": 0.9293235286932304, "grad_norm": 0.28268298506736755, "learning_rate": 2.6924283813520606e-05, "loss": 0.0622, "num_input_tokens_seen": 24662336, "step": 19020 }, { "epoch": 0.9295678303569247, "grad_norm": 0.12683719396591187, "learning_rate": 2.691449531381733e-05, "loss": 0.0847, "num_input_tokens_seen": 24668640, "step": 19025 }, { "epoch": 0.9298121320206191, "grad_norm": 0.662926435470581, "learning_rate": 2.6904706518874816e-05, "loss": 0.0858, "num_input_tokens_seen": 24675520, "step": 19030 }, { "epoch": 0.9300564336843133, "grad_norm": 0.2252766489982605, "learning_rate": 2.6894917430202615e-05, "loss": 0.0881, "num_input_tokens_seen": 24681632, "step": 19035 }, { "epoch": 0.9303007353480077, "grad_norm": 0.614877462387085, "learning_rate": 2.6885128049310343e-05, "loss": 0.0867, "num_input_tokens_seen": 24688032, "step": 19040 }, { "epoch": 0.9305450370117021, "grad_norm": 0.5955201387405396, "learning_rate": 2.687533837770762e-05, "loss": 0.0685, "num_input_tokens_seen": 24694208, "step": 19045 }, { "epoch": 0.9307893386753964, "grad_norm": 0.4499031603336334, "learning_rate": 2.6865548416904162e-05, "loss": 0.0802, "num_input_tokens_seen": 24701184, "step": 19050 }, { "epoch": 0.9310336403390908, "grad_norm": 0.4778059422969818, "learning_rate": 2.68557581684097e-05, "loss": 0.092, "num_input_tokens_seen": 24707616, "step": 19055 }, { "epoch": 0.931277942002785, "grad_norm": 0.23361244797706604, "learning_rate": 2.6845967633733998e-05, "loss": 0.0952, "num_input_tokens_seen": 24713984, "step": 19060 }, { "epoch": 0.9315222436664794, "grad_norm": 0.1934223771095276, "learning_rate": 2.683617681438689e-05, "loss": 0.0669, "num_input_tokens_seen": 24721024, "step": 19065 }, { "epoch": 0.9317665453301737, "grad_norm": 0.4968211054801941, "learning_rate": 2.682638571187825e-05, "loss": 0.0758, "num_input_tokens_seen": 24727712, "step": 19070 }, { "epoch": 0.932010846993868, "grad_norm": 0.28380659222602844, "learning_rate": 2.6816594327717976e-05, "loss": 0.0979, "num_input_tokens_seen": 24733952, "step": 19075 }, { "epoch": 0.9322551486575623, "grad_norm": 0.2667348086833954, "learning_rate": 2.680680266341603e-05, "loss": 0.0656, "num_input_tokens_seen": 24740576, "step": 19080 }, { "epoch": 0.9324994503212567, "grad_norm": 0.7428613305091858, "learning_rate": 2.67970107204824e-05, "loss": 0.0866, "num_input_tokens_seen": 24747264, "step": 19085 }, { "epoch": 0.932743751984951, "grad_norm": 0.5109148025512695, "learning_rate": 2.6787218500427142e-05, "loss": 0.1262, "num_input_tokens_seen": 24753568, "step": 19090 }, { "epoch": 0.9329880536486453, "grad_norm": 0.1478530466556549, "learning_rate": 2.6777426004760332e-05, "loss": 0.0829, "num_input_tokens_seen": 24760096, "step": 19095 }, { "epoch": 0.9332323553123397, "grad_norm": 0.25819918513298035, "learning_rate": 2.6767633234992094e-05, "loss": 0.1062, "num_input_tokens_seen": 24766240, "step": 19100 }, { "epoch": 0.933476656976034, "grad_norm": 0.538183331489563, "learning_rate": 2.6757840192632598e-05, "loss": 0.1067, "num_input_tokens_seen": 24772608, "step": 19105 }, { "epoch": 0.9337209586397284, "grad_norm": 0.2640734016895294, "learning_rate": 2.6748046879192052e-05, "loss": 0.0703, "num_input_tokens_seen": 24779072, "step": 19110 }, { "epoch": 0.9339652603034226, "grad_norm": 0.4217391014099121, "learning_rate": 2.673825329618071e-05, "loss": 0.0854, "num_input_tokens_seen": 24785792, "step": 19115 }, { "epoch": 0.934209561967117, "grad_norm": 0.27998456358909607, "learning_rate": 2.6728459445108866e-05, "loss": 0.0795, "num_input_tokens_seen": 24792320, "step": 19120 }, { "epoch": 0.9344538636308113, "grad_norm": 0.43936485052108765, "learning_rate": 2.6718665327486854e-05, "loss": 0.0759, "num_input_tokens_seen": 24798784, "step": 19125 }, { "epoch": 0.9346981652945057, "grad_norm": 0.6251252293586731, "learning_rate": 2.6708870944825048e-05, "loss": 0.0804, "num_input_tokens_seen": 24805408, "step": 19130 }, { "epoch": 0.9349424669581999, "grad_norm": 0.1770763099193573, "learning_rate": 2.6699076298633874e-05, "loss": 0.0933, "num_input_tokens_seen": 24811264, "step": 19135 }, { "epoch": 0.9351867686218943, "grad_norm": 0.3858553171157837, "learning_rate": 2.6689281390423788e-05, "loss": 0.0912, "num_input_tokens_seen": 24817536, "step": 19140 }, { "epoch": 0.9354310702855887, "grad_norm": 0.27459123730659485, "learning_rate": 2.667948622170527e-05, "loss": 0.1071, "num_input_tokens_seen": 24823936, "step": 19145 }, { "epoch": 0.935675371949283, "grad_norm": 0.11473389714956284, "learning_rate": 2.6669690793988873e-05, "loss": 0.0661, "num_input_tokens_seen": 24829792, "step": 19150 }, { "epoch": 0.9359196736129773, "grad_norm": 0.476998507976532, "learning_rate": 2.665989510878518e-05, "loss": 0.0829, "num_input_tokens_seen": 24836448, "step": 19155 }, { "epoch": 0.9361639752766716, "grad_norm": 0.14497298002243042, "learning_rate": 2.6650099167604793e-05, "loss": 0.0858, "num_input_tokens_seen": 24843488, "step": 19160 }, { "epoch": 0.936408276940366, "grad_norm": 0.7456052303314209, "learning_rate": 2.6640302971958376e-05, "loss": 0.091, "num_input_tokens_seen": 24849568, "step": 19165 }, { "epoch": 0.9366525786040603, "grad_norm": 0.5625641942024231, "learning_rate": 2.6630506523356635e-05, "loss": 0.0856, "num_input_tokens_seen": 24856608, "step": 19170 }, { "epoch": 0.9368968802677546, "grad_norm": 0.1944843828678131, "learning_rate": 2.6620709823310297e-05, "loss": 0.1123, "num_input_tokens_seen": 24863136, "step": 19175 }, { "epoch": 0.9371411819314489, "grad_norm": 0.42925044894218445, "learning_rate": 2.661091287333014e-05, "loss": 0.0957, "num_input_tokens_seen": 24869472, "step": 19180 }, { "epoch": 0.9373854835951433, "grad_norm": 0.45036962628364563, "learning_rate": 2.660111567492696e-05, "loss": 0.0886, "num_input_tokens_seen": 24875584, "step": 19185 }, { "epoch": 0.9376297852588377, "grad_norm": 0.4038768410682678, "learning_rate": 2.6591318229611635e-05, "loss": 0.0798, "num_input_tokens_seen": 24881952, "step": 19190 }, { "epoch": 0.9378740869225319, "grad_norm": 0.6621960401535034, "learning_rate": 2.6581520538895037e-05, "loss": 0.089, "num_input_tokens_seen": 24888928, "step": 19195 }, { "epoch": 0.9381183885862263, "grad_norm": 0.4495975971221924, "learning_rate": 2.6571722604288102e-05, "loss": 0.0729, "num_input_tokens_seen": 24895360, "step": 19200 }, { "epoch": 0.9381183885862263, "eval_loss": 0.08913805335760117, "eval_runtime": 374.7485, "eval_samples_per_second": 97.092, "eval_steps_per_second": 24.275, "num_input_tokens_seen": 24895360, "step": 19200 }, { "epoch": 0.9383626902499206, "grad_norm": 0.3338392674922943, "learning_rate": 2.656192442730179e-05, "loss": 0.0811, "num_input_tokens_seen": 24902208, "step": 19205 }, { "epoch": 0.938606991913615, "grad_norm": 0.6114447116851807, "learning_rate": 2.6552126009447098e-05, "loss": 0.0683, "num_input_tokens_seen": 24908576, "step": 19210 }, { "epoch": 0.9388512935773092, "grad_norm": 0.47490835189819336, "learning_rate": 2.654232735223507e-05, "loss": 0.0686, "num_input_tokens_seen": 24914912, "step": 19215 }, { "epoch": 0.9390955952410036, "grad_norm": 0.19589565694332123, "learning_rate": 2.6532528457176787e-05, "loss": 0.0756, "num_input_tokens_seen": 24921664, "step": 19220 }, { "epoch": 0.9393398969046979, "grad_norm": 0.3463967442512512, "learning_rate": 2.6522729325783348e-05, "loss": 0.0854, "num_input_tokens_seen": 24928576, "step": 19225 }, { "epoch": 0.9395841985683923, "grad_norm": 0.136136993765831, "learning_rate": 2.6512929959565914e-05, "loss": 0.0829, "num_input_tokens_seen": 24934912, "step": 19230 }, { "epoch": 0.9398285002320865, "grad_norm": 0.21703487634658813, "learning_rate": 2.6503130360035673e-05, "loss": 0.0982, "num_input_tokens_seen": 24940992, "step": 19235 }, { "epoch": 0.9400728018957809, "grad_norm": 0.18275684118270874, "learning_rate": 2.6493330528703835e-05, "loss": 0.0665, "num_input_tokens_seen": 24947968, "step": 19240 }, { "epoch": 0.9403171035594753, "grad_norm": 1.635873556137085, "learning_rate": 2.648353046708167e-05, "loss": 0.0906, "num_input_tokens_seen": 24954784, "step": 19245 }, { "epoch": 0.9405614052231696, "grad_norm": 0.20230285823345184, "learning_rate": 2.647373017668046e-05, "loss": 0.091, "num_input_tokens_seen": 24961440, "step": 19250 }, { "epoch": 0.9408057068868639, "grad_norm": 0.5069946050643921, "learning_rate": 2.6463929659011537e-05, "loss": 0.063, "num_input_tokens_seen": 24967680, "step": 19255 }, { "epoch": 0.9410500085505582, "grad_norm": 0.11691563576459885, "learning_rate": 2.6454128915586262e-05, "loss": 0.0708, "num_input_tokens_seen": 24974368, "step": 19260 }, { "epoch": 0.9412943102142526, "grad_norm": 0.24950063228607178, "learning_rate": 2.6444327947916036e-05, "loss": 0.0886, "num_input_tokens_seen": 24981056, "step": 19265 }, { "epoch": 0.9415386118779469, "grad_norm": 0.48501521348953247, "learning_rate": 2.6434526757512292e-05, "loss": 0.0915, "num_input_tokens_seen": 24987424, "step": 19270 }, { "epoch": 0.9417829135416412, "grad_norm": 0.5464374423027039, "learning_rate": 2.6424725345886486e-05, "loss": 0.1291, "num_input_tokens_seen": 24994048, "step": 19275 }, { "epoch": 0.9420272152053355, "grad_norm": 0.183528333902359, "learning_rate": 2.641492371455014e-05, "loss": 0.1037, "num_input_tokens_seen": 25000736, "step": 19280 }, { "epoch": 0.9422715168690299, "grad_norm": 0.4858938753604889, "learning_rate": 2.640512186501477e-05, "loss": 0.1203, "num_input_tokens_seen": 25007104, "step": 19285 }, { "epoch": 0.9425158185327243, "grad_norm": 0.2035517543554306, "learning_rate": 2.639531979879195e-05, "loss": 0.0493, "num_input_tokens_seen": 25013760, "step": 19290 }, { "epoch": 0.9427601201964185, "grad_norm": 0.15474426746368408, "learning_rate": 2.638551751739328e-05, "loss": 0.0974, "num_input_tokens_seen": 25020672, "step": 19295 }, { "epoch": 0.9430044218601129, "grad_norm": 0.23425310850143433, "learning_rate": 2.6375715022330404e-05, "loss": 0.062, "num_input_tokens_seen": 25027136, "step": 19300 }, { "epoch": 0.9432487235238072, "grad_norm": 0.1946316808462143, "learning_rate": 2.6365912315114976e-05, "loss": 0.088, "num_input_tokens_seen": 25033696, "step": 19305 }, { "epoch": 0.9434930251875016, "grad_norm": 0.3884798586368561, "learning_rate": 2.6356109397258704e-05, "loss": 0.113, "num_input_tokens_seen": 25040448, "step": 19310 }, { "epoch": 0.9437373268511958, "grad_norm": 0.33458682894706726, "learning_rate": 2.6346306270273325e-05, "loss": 0.0932, "num_input_tokens_seen": 25046976, "step": 19315 }, { "epoch": 0.9439816285148902, "grad_norm": 0.19744253158569336, "learning_rate": 2.6336502935670608e-05, "loss": 0.0836, "num_input_tokens_seen": 25053728, "step": 19320 }, { "epoch": 0.9442259301785845, "grad_norm": 0.15224100649356842, "learning_rate": 2.6326699394962333e-05, "loss": 0.1001, "num_input_tokens_seen": 25060416, "step": 19325 }, { "epoch": 0.9444702318422789, "grad_norm": 0.33016273379325867, "learning_rate": 2.6316895649660334e-05, "loss": 0.0808, "num_input_tokens_seen": 25066656, "step": 19330 }, { "epoch": 0.9447145335059732, "grad_norm": 0.5323377847671509, "learning_rate": 2.6307091701276486e-05, "loss": 0.0891, "num_input_tokens_seen": 25072768, "step": 19335 }, { "epoch": 0.9449588351696675, "grad_norm": 0.734731137752533, "learning_rate": 2.629728755132267e-05, "loss": 0.1025, "num_input_tokens_seen": 25079104, "step": 19340 }, { "epoch": 0.9452031368333619, "grad_norm": 0.2983050048351288, "learning_rate": 2.628748320131081e-05, "loss": 0.0729, "num_input_tokens_seen": 25085984, "step": 19345 }, { "epoch": 0.9454474384970561, "grad_norm": 0.22297325730323792, "learning_rate": 2.6277678652752856e-05, "loss": 0.0748, "num_input_tokens_seen": 25092640, "step": 19350 }, { "epoch": 0.9456917401607505, "grad_norm": 0.5759766101837158, "learning_rate": 2.6267873907160807e-05, "loss": 0.0808, "num_input_tokens_seen": 25099136, "step": 19355 }, { "epoch": 0.9459360418244448, "grad_norm": 0.2497042566537857, "learning_rate": 2.6258068966046668e-05, "loss": 0.0954, "num_input_tokens_seen": 25105600, "step": 19360 }, { "epoch": 0.9461803434881392, "grad_norm": 0.10406335443258286, "learning_rate": 2.6248263830922475e-05, "loss": 0.0747, "num_input_tokens_seen": 25111872, "step": 19365 }, { "epoch": 0.9464246451518334, "grad_norm": 0.5115606784820557, "learning_rate": 2.6238458503300318e-05, "loss": 0.099, "num_input_tokens_seen": 25118464, "step": 19370 }, { "epoch": 0.9466689468155278, "grad_norm": 0.4060019552707672, "learning_rate": 2.6228652984692292e-05, "loss": 0.1112, "num_input_tokens_seen": 25124512, "step": 19375 }, { "epoch": 0.9469132484792221, "grad_norm": 0.16827890276908875, "learning_rate": 2.621884727661054e-05, "loss": 0.0752, "num_input_tokens_seen": 25130752, "step": 19380 }, { "epoch": 0.9471575501429165, "grad_norm": 0.24417367577552795, "learning_rate": 2.6209041380567222e-05, "loss": 0.0889, "num_input_tokens_seen": 25137216, "step": 19385 }, { "epoch": 0.9474018518066109, "grad_norm": 0.1777062863111496, "learning_rate": 2.6199235298074527e-05, "loss": 0.0878, "num_input_tokens_seen": 25143808, "step": 19390 }, { "epoch": 0.9476461534703051, "grad_norm": 0.2553309500217438, "learning_rate": 2.618942903064468e-05, "loss": 0.0847, "num_input_tokens_seen": 25150112, "step": 19395 }, { "epoch": 0.9478904551339995, "grad_norm": 0.4473471939563751, "learning_rate": 2.6179622579789932e-05, "loss": 0.087, "num_input_tokens_seen": 25156480, "step": 19400 }, { "epoch": 0.9478904551339995, "eval_loss": 0.08898431062698364, "eval_runtime": 375.0238, "eval_samples_per_second": 97.021, "eval_steps_per_second": 24.257, "num_input_tokens_seen": 25156480, "step": 19400 }, { "epoch": 0.9481347567976938, "grad_norm": 0.31207042932510376, "learning_rate": 2.6169815947022553e-05, "loss": 0.0728, "num_input_tokens_seen": 25162400, "step": 19405 }, { "epoch": 0.9483790584613881, "grad_norm": 0.32481706142425537, "learning_rate": 2.6160009133854853e-05, "loss": 0.0636, "num_input_tokens_seen": 25168608, "step": 19410 }, { "epoch": 0.9486233601250824, "grad_norm": 0.1642649918794632, "learning_rate": 2.6150202141799168e-05, "loss": 0.069, "num_input_tokens_seen": 25175104, "step": 19415 }, { "epoch": 0.9488676617887768, "grad_norm": 0.5628874897956848, "learning_rate": 2.614039497236786e-05, "loss": 0.0912, "num_input_tokens_seen": 25181536, "step": 19420 }, { "epoch": 0.9491119634524711, "grad_norm": 0.30286794900894165, "learning_rate": 2.6130587627073315e-05, "loss": 0.0869, "num_input_tokens_seen": 25187808, "step": 19425 }, { "epoch": 0.9493562651161654, "grad_norm": 0.146489217877388, "learning_rate": 2.6120780107427956e-05, "loss": 0.0778, "num_input_tokens_seen": 25193888, "step": 19430 }, { "epoch": 0.9496005667798598, "grad_norm": 1.300654649734497, "learning_rate": 2.6110972414944214e-05, "loss": 0.087, "num_input_tokens_seen": 25200288, "step": 19435 }, { "epoch": 0.9498448684435541, "grad_norm": 0.2706489562988281, "learning_rate": 2.6101164551134565e-05, "loss": 0.0761, "num_input_tokens_seen": 25206848, "step": 19440 }, { "epoch": 0.9500891701072485, "grad_norm": 1.061658263206482, "learning_rate": 2.6091356517511505e-05, "loss": 0.1187, "num_input_tokens_seen": 25212864, "step": 19445 }, { "epoch": 0.9503334717709427, "grad_norm": 0.7186448574066162, "learning_rate": 2.608154831558755e-05, "loss": 0.1076, "num_input_tokens_seen": 25219200, "step": 19450 }, { "epoch": 0.9505777734346371, "grad_norm": 0.18022367358207703, "learning_rate": 2.607173994687526e-05, "loss": 0.073, "num_input_tokens_seen": 25226240, "step": 19455 }, { "epoch": 0.9508220750983314, "grad_norm": 0.29375186562538147, "learning_rate": 2.6061931412887196e-05, "loss": 0.104, "num_input_tokens_seen": 25232576, "step": 19460 }, { "epoch": 0.9510663767620258, "grad_norm": 0.39451268315315247, "learning_rate": 2.6052122715135973e-05, "loss": 0.1042, "num_input_tokens_seen": 25239136, "step": 19465 }, { "epoch": 0.95131067842572, "grad_norm": 0.2068338841199875, "learning_rate": 2.60423138551342e-05, "loss": 0.0744, "num_input_tokens_seen": 25245632, "step": 19470 }, { "epoch": 0.9515549800894144, "grad_norm": 0.23799151182174683, "learning_rate": 2.6032504834394527e-05, "loss": 0.0817, "num_input_tokens_seen": 25252000, "step": 19475 }, { "epoch": 0.9517992817531087, "grad_norm": 0.44194120168685913, "learning_rate": 2.602269565442964e-05, "loss": 0.1001, "num_input_tokens_seen": 25258080, "step": 19480 }, { "epoch": 0.9520435834168031, "grad_norm": 0.6800147294998169, "learning_rate": 2.6012886316752227e-05, "loss": 0.0932, "num_input_tokens_seen": 25264256, "step": 19485 }, { "epoch": 0.9522878850804974, "grad_norm": 0.19742479920387268, "learning_rate": 2.6003076822875018e-05, "loss": 0.0914, "num_input_tokens_seen": 25270496, "step": 19490 }, { "epoch": 0.9525321867441917, "grad_norm": 0.34610405564308167, "learning_rate": 2.5993267174310755e-05, "loss": 0.1072, "num_input_tokens_seen": 25276864, "step": 19495 }, { "epoch": 0.9527764884078861, "grad_norm": 0.22099654376506805, "learning_rate": 2.5983457372572218e-05, "loss": 0.0674, "num_input_tokens_seen": 25283520, "step": 19500 }, { "epoch": 0.9530207900715804, "grad_norm": 0.25810524821281433, "learning_rate": 2.597364741917219e-05, "loss": 0.1044, "num_input_tokens_seen": 25289600, "step": 19505 }, { "epoch": 0.9532650917352747, "grad_norm": 1.0813772678375244, "learning_rate": 2.5963837315623492e-05, "loss": 0.0987, "num_input_tokens_seen": 25296416, "step": 19510 }, { "epoch": 0.953509393398969, "grad_norm": 0.3274948000907898, "learning_rate": 2.595402706343897e-05, "loss": 0.0797, "num_input_tokens_seen": 25303136, "step": 19515 }, { "epoch": 0.9537536950626634, "grad_norm": 0.15196490287780762, "learning_rate": 2.594421666413148e-05, "loss": 0.0757, "num_input_tokens_seen": 25309920, "step": 19520 }, { "epoch": 0.9539979967263577, "grad_norm": 0.22533893585205078, "learning_rate": 2.5934406119213928e-05, "loss": 0.0908, "num_input_tokens_seen": 25316768, "step": 19525 }, { "epoch": 0.954242298390052, "grad_norm": 0.43226751685142517, "learning_rate": 2.5924595430199193e-05, "loss": 0.0822, "num_input_tokens_seen": 25323552, "step": 19530 }, { "epoch": 0.9544866000537464, "grad_norm": 0.4180562198162079, "learning_rate": 2.5914784598600238e-05, "loss": 0.0708, "num_input_tokens_seen": 25330112, "step": 19535 }, { "epoch": 0.9547309017174407, "grad_norm": 0.42959827184677124, "learning_rate": 2.5904973625930002e-05, "loss": 0.0694, "num_input_tokens_seen": 25336416, "step": 19540 }, { "epoch": 0.9549752033811351, "grad_norm": 0.3724403977394104, "learning_rate": 2.5895162513701456e-05, "loss": 0.085, "num_input_tokens_seen": 25342880, "step": 19545 }, { "epoch": 0.9552195050448293, "grad_norm": 0.1855461746454239, "learning_rate": 2.5885351263427593e-05, "loss": 0.0986, "num_input_tokens_seen": 25349408, "step": 19550 }, { "epoch": 0.9554638067085237, "grad_norm": 0.257829874753952, "learning_rate": 2.5875539876621448e-05, "loss": 0.0745, "num_input_tokens_seen": 25355744, "step": 19555 }, { "epoch": 0.955708108372218, "grad_norm": 0.5556387305259705, "learning_rate": 2.586572835479605e-05, "loss": 0.0854, "num_input_tokens_seen": 25362496, "step": 19560 }, { "epoch": 0.9559524100359124, "grad_norm": 0.3568178415298462, "learning_rate": 2.585591669946446e-05, "loss": 0.0978, "num_input_tokens_seen": 25369088, "step": 19565 }, { "epoch": 0.9561967116996066, "grad_norm": 0.2168034166097641, "learning_rate": 2.5846104912139756e-05, "loss": 0.0746, "num_input_tokens_seen": 25376160, "step": 19570 }, { "epoch": 0.956441013363301, "grad_norm": 0.13085174560546875, "learning_rate": 2.583629299433505e-05, "loss": 0.0894, "num_input_tokens_seen": 25382528, "step": 19575 }, { "epoch": 0.9566853150269954, "grad_norm": 0.22150561213493347, "learning_rate": 2.582648094756345e-05, "loss": 0.1016, "num_input_tokens_seen": 25389632, "step": 19580 }, { "epoch": 0.9569296166906897, "grad_norm": 0.14871981739997864, "learning_rate": 2.5816668773338098e-05, "loss": 0.0526, "num_input_tokens_seen": 25396608, "step": 19585 }, { "epoch": 0.957173918354384, "grad_norm": 0.37701189517974854, "learning_rate": 2.580685647317216e-05, "loss": 0.0752, "num_input_tokens_seen": 25403264, "step": 19590 }, { "epoch": 0.9574182200180783, "grad_norm": 0.6227290034294128, "learning_rate": 2.5797044048578818e-05, "loss": 0.0893, "num_input_tokens_seen": 25409824, "step": 19595 }, { "epoch": 0.9576625216817727, "grad_norm": 1.4035674333572388, "learning_rate": 2.5787231501071262e-05, "loss": 0.1061, "num_input_tokens_seen": 25415936, "step": 19600 }, { "epoch": 0.9576625216817727, "eval_loss": 0.08914730697870255, "eval_runtime": 374.0637, "eval_samples_per_second": 97.27, "eval_steps_per_second": 24.319, "num_input_tokens_seen": 25415936, "step": 19600 }, { "epoch": 0.957906823345467, "grad_norm": 0.17018991708755493, "learning_rate": 2.577741883216272e-05, "loss": 0.085, "num_input_tokens_seen": 25422240, "step": 19605 }, { "epoch": 0.9581511250091613, "grad_norm": 0.3244796395301819, "learning_rate": 2.576760604336642e-05, "loss": 0.0905, "num_input_tokens_seen": 25428992, "step": 19610 }, { "epoch": 0.9583954266728556, "grad_norm": 0.31706109642982483, "learning_rate": 2.575779313619563e-05, "loss": 0.088, "num_input_tokens_seen": 25435488, "step": 19615 }, { "epoch": 0.95863972833655, "grad_norm": 0.1436462700366974, "learning_rate": 2.5747980112163605e-05, "loss": 0.0727, "num_input_tokens_seen": 25441664, "step": 19620 }, { "epoch": 0.9588840300002442, "grad_norm": 0.32762831449508667, "learning_rate": 2.5738166972783656e-05, "loss": 0.0691, "num_input_tokens_seen": 25448608, "step": 19625 }, { "epoch": 0.9591283316639386, "grad_norm": 0.09423831105232239, "learning_rate": 2.5728353719569075e-05, "loss": 0.0533, "num_input_tokens_seen": 25455232, "step": 19630 }, { "epoch": 0.959372633327633, "grad_norm": 0.16332310438156128, "learning_rate": 2.57185403540332e-05, "loss": 0.0764, "num_input_tokens_seen": 25461440, "step": 19635 }, { "epoch": 0.9596169349913273, "grad_norm": 0.16142651438713074, "learning_rate": 2.5708726877689375e-05, "loss": 0.0881, "num_input_tokens_seen": 25467776, "step": 19640 }, { "epoch": 0.9598612366550217, "grad_norm": 0.21102963387966156, "learning_rate": 2.5698913292050964e-05, "loss": 0.0743, "num_input_tokens_seen": 25474080, "step": 19645 }, { "epoch": 0.9601055383187159, "grad_norm": 0.4211382269859314, "learning_rate": 2.568909959863133e-05, "loss": 0.0831, "num_input_tokens_seen": 25480768, "step": 19650 }, { "epoch": 0.9603498399824103, "grad_norm": 0.24415428936481476, "learning_rate": 2.5679285798943887e-05, "loss": 0.0889, "num_input_tokens_seen": 25487520, "step": 19655 }, { "epoch": 0.9605941416461046, "grad_norm": 0.14496996998786926, "learning_rate": 2.5669471894502035e-05, "loss": 0.0662, "num_input_tokens_seen": 25494016, "step": 19660 }, { "epoch": 0.960838443309799, "grad_norm": 0.2113446146249771, "learning_rate": 2.56596578868192e-05, "loss": 0.0858, "num_input_tokens_seen": 25500800, "step": 19665 }, { "epoch": 0.9610827449734932, "grad_norm": 0.15742236375808716, "learning_rate": 2.564984377740883e-05, "loss": 0.0756, "num_input_tokens_seen": 25506848, "step": 19670 }, { "epoch": 0.9613270466371876, "grad_norm": 0.4266643226146698, "learning_rate": 2.564002956778438e-05, "loss": 0.0826, "num_input_tokens_seen": 25513184, "step": 19675 }, { "epoch": 0.961571348300882, "grad_norm": 0.5330446362495422, "learning_rate": 2.563021525945934e-05, "loss": 0.1213, "num_input_tokens_seen": 25519552, "step": 19680 }, { "epoch": 0.9618156499645762, "grad_norm": 0.2196696698665619, "learning_rate": 2.562040085394718e-05, "loss": 0.0791, "num_input_tokens_seen": 25525632, "step": 19685 }, { "epoch": 0.9620599516282706, "grad_norm": 0.37191516160964966, "learning_rate": 2.56105863527614e-05, "loss": 0.0776, "num_input_tokens_seen": 25532032, "step": 19690 }, { "epoch": 0.9623042532919649, "grad_norm": 0.4522491693496704, "learning_rate": 2.5600771757415548e-05, "loss": 0.106, "num_input_tokens_seen": 25538272, "step": 19695 }, { "epoch": 0.9625485549556593, "grad_norm": 0.37088507413864136, "learning_rate": 2.5590957069423134e-05, "loss": 0.1017, "num_input_tokens_seen": 25544640, "step": 19700 }, { "epoch": 0.9627928566193535, "grad_norm": 0.13749925792217255, "learning_rate": 2.5581142290297716e-05, "loss": 0.1162, "num_input_tokens_seen": 25551456, "step": 19705 }, { "epoch": 0.9630371582830479, "grad_norm": 0.21916097402572632, "learning_rate": 2.557132742155285e-05, "loss": 0.0883, "num_input_tokens_seen": 25558016, "step": 19710 }, { "epoch": 0.9632814599467422, "grad_norm": 0.22186493873596191, "learning_rate": 2.556151246470212e-05, "loss": 0.0907, "num_input_tokens_seen": 25564544, "step": 19715 }, { "epoch": 0.9635257616104366, "grad_norm": 0.5271334648132324, "learning_rate": 2.5551697421259114e-05, "loss": 0.0911, "num_input_tokens_seen": 25570912, "step": 19720 }, { "epoch": 0.963770063274131, "grad_norm": 0.3502485752105713, "learning_rate": 2.554188229273743e-05, "loss": 0.0861, "num_input_tokens_seen": 25577632, "step": 19725 }, { "epoch": 0.9640143649378252, "grad_norm": 0.15906348824501038, "learning_rate": 2.5532067080650678e-05, "loss": 0.1165, "num_input_tokens_seen": 25583936, "step": 19730 }, { "epoch": 0.9642586666015196, "grad_norm": 0.30591827630996704, "learning_rate": 2.55222517865125e-05, "loss": 0.0993, "num_input_tokens_seen": 25590656, "step": 19735 }, { "epoch": 0.9645029682652139, "grad_norm": 0.35356366634368896, "learning_rate": 2.5512436411836538e-05, "loss": 0.098, "num_input_tokens_seen": 25596864, "step": 19740 }, { "epoch": 0.9647472699289082, "grad_norm": 0.5095844864845276, "learning_rate": 2.5502620958136443e-05, "loss": 0.0951, "num_input_tokens_seen": 25603328, "step": 19745 }, { "epoch": 0.9649915715926025, "grad_norm": 0.11390718817710876, "learning_rate": 2.5492805426925874e-05, "loss": 0.0807, "num_input_tokens_seen": 25610272, "step": 19750 }, { "epoch": 0.9652358732562969, "grad_norm": 0.34081798791885376, "learning_rate": 2.5482989819718523e-05, "loss": 0.1033, "num_input_tokens_seen": 25616896, "step": 19755 }, { "epoch": 0.9654801749199912, "grad_norm": 0.5327046513557434, "learning_rate": 2.5473174138028065e-05, "loss": 0.0975, "num_input_tokens_seen": 25623456, "step": 19760 }, { "epoch": 0.9657244765836855, "grad_norm": 0.23735204339027405, "learning_rate": 2.5463358383368212e-05, "loss": 0.076, "num_input_tokens_seen": 25629504, "step": 19765 }, { "epoch": 0.9659687782473798, "grad_norm": 0.35302695631980896, "learning_rate": 2.545354255725267e-05, "loss": 0.0938, "num_input_tokens_seen": 25635776, "step": 19770 }, { "epoch": 0.9662130799110742, "grad_norm": 0.18998123705387115, "learning_rate": 2.5443726661195165e-05, "loss": 0.0719, "num_input_tokens_seen": 25642272, "step": 19775 }, { "epoch": 0.9664573815747686, "grad_norm": 0.6697708964347839, "learning_rate": 2.543391069670944e-05, "loss": 0.072, "num_input_tokens_seen": 25648864, "step": 19780 }, { "epoch": 0.9667016832384628, "grad_norm": 0.1811734139919281, "learning_rate": 2.5424094665309228e-05, "loss": 0.0686, "num_input_tokens_seen": 25655968, "step": 19785 }, { "epoch": 0.9669459849021572, "grad_norm": 1.2140882015228271, "learning_rate": 2.5414278568508292e-05, "loss": 0.0963, "num_input_tokens_seen": 25663456, "step": 19790 }, { "epoch": 0.9671902865658515, "grad_norm": 0.36001119017601013, "learning_rate": 2.540446240782039e-05, "loss": 0.1031, "num_input_tokens_seen": 25670464, "step": 19795 }, { "epoch": 0.9674345882295459, "grad_norm": 0.33917832374572754, "learning_rate": 2.5394646184759307e-05, "loss": 0.093, "num_input_tokens_seen": 25677472, "step": 19800 }, { "epoch": 0.9674345882295459, "eval_loss": 0.0890367180109024, "eval_runtime": 374.3272, "eval_samples_per_second": 97.201, "eval_steps_per_second": 24.302, "num_input_tokens_seen": 25677472, "step": 19800 }, { "epoch": 0.9676788898932401, "grad_norm": 0.20454271137714386, "learning_rate": 2.538482990083882e-05, "loss": 0.0901, "num_input_tokens_seen": 25683584, "step": 19805 }, { "epoch": 0.9679231915569345, "grad_norm": 0.614475429058075, "learning_rate": 2.5375013557572725e-05, "loss": 0.0894, "num_input_tokens_seen": 25689984, "step": 19810 }, { "epoch": 0.9681674932206288, "grad_norm": 0.4380057156085968, "learning_rate": 2.536519715647483e-05, "loss": 0.0726, "num_input_tokens_seen": 25696608, "step": 19815 }, { "epoch": 0.9684117948843232, "grad_norm": 0.2736043632030487, "learning_rate": 2.535538069905894e-05, "loss": 0.0765, "num_input_tokens_seen": 25703488, "step": 19820 }, { "epoch": 0.9686560965480175, "grad_norm": 0.37765800952911377, "learning_rate": 2.534556418683888e-05, "loss": 0.1016, "num_input_tokens_seen": 25709728, "step": 19825 }, { "epoch": 0.9689003982117118, "grad_norm": 0.9340927600860596, "learning_rate": 2.5335747621328486e-05, "loss": 0.0832, "num_input_tokens_seen": 25716320, "step": 19830 }, { "epoch": 0.9691446998754062, "grad_norm": 0.24286481738090515, "learning_rate": 2.5325931004041586e-05, "loss": 0.1026, "num_input_tokens_seen": 25722528, "step": 19835 }, { "epoch": 0.9693890015391005, "grad_norm": 0.15853877365589142, "learning_rate": 2.5316114336492032e-05, "loss": 0.0654, "num_input_tokens_seen": 25728864, "step": 19840 }, { "epoch": 0.9696333032027948, "grad_norm": 0.19566915929317474, "learning_rate": 2.530629762019367e-05, "loss": 0.0661, "num_input_tokens_seen": 25735264, "step": 19845 }, { "epoch": 0.9698776048664891, "grad_norm": 0.14605973660945892, "learning_rate": 2.5296480856660364e-05, "loss": 0.0706, "num_input_tokens_seen": 25741536, "step": 19850 }, { "epoch": 0.9701219065301835, "grad_norm": 0.2853865623474121, "learning_rate": 2.528666404740599e-05, "loss": 0.0783, "num_input_tokens_seen": 25748000, "step": 19855 }, { "epoch": 0.9703662081938778, "grad_norm": 0.14705686271190643, "learning_rate": 2.527684719394442e-05, "loss": 0.0974, "num_input_tokens_seen": 25754304, "step": 19860 }, { "epoch": 0.9706105098575721, "grad_norm": 0.7201052308082581, "learning_rate": 2.526703029778953e-05, "loss": 0.1022, "num_input_tokens_seen": 25760640, "step": 19865 }, { "epoch": 0.9708548115212665, "grad_norm": 0.2875422537326813, "learning_rate": 2.5257213360455208e-05, "loss": 0.0735, "num_input_tokens_seen": 25767136, "step": 19870 }, { "epoch": 0.9710991131849608, "grad_norm": 0.26493147015571594, "learning_rate": 2.5247396383455353e-05, "loss": 0.0691, "num_input_tokens_seen": 25773984, "step": 19875 }, { "epoch": 0.9713434148486552, "grad_norm": 0.2551213800907135, "learning_rate": 2.523757936830387e-05, "loss": 0.125, "num_input_tokens_seen": 25780128, "step": 19880 }, { "epoch": 0.9715877165123494, "grad_norm": 0.7943810224533081, "learning_rate": 2.5227762316514662e-05, "loss": 0.0865, "num_input_tokens_seen": 25786336, "step": 19885 }, { "epoch": 0.9718320181760438, "grad_norm": 0.1599266529083252, "learning_rate": 2.5217945229601648e-05, "loss": 0.1034, "num_input_tokens_seen": 25792704, "step": 19890 }, { "epoch": 0.9720763198397381, "grad_norm": 0.19081561267375946, "learning_rate": 2.5208128109078738e-05, "loss": 0.0941, "num_input_tokens_seen": 25799136, "step": 19895 }, { "epoch": 0.9723206215034325, "grad_norm": 0.3539240062236786, "learning_rate": 2.5198310956459853e-05, "loss": 0.0743, "num_input_tokens_seen": 25805888, "step": 19900 }, { "epoch": 0.9725649231671267, "grad_norm": 0.15501344203948975, "learning_rate": 2.518849377325893e-05, "loss": 0.0794, "num_input_tokens_seen": 25812000, "step": 19905 }, { "epoch": 0.9728092248308211, "grad_norm": 0.47270065546035767, "learning_rate": 2.51786765609899e-05, "loss": 0.0749, "num_input_tokens_seen": 25818880, "step": 19910 }, { "epoch": 0.9730535264945154, "grad_norm": 0.35710084438323975, "learning_rate": 2.5168859321166694e-05, "loss": 0.0865, "num_input_tokens_seen": 25825600, "step": 19915 }, { "epoch": 0.9732978281582098, "grad_norm": 0.3556209206581116, "learning_rate": 2.515904205530326e-05, "loss": 0.0844, "num_input_tokens_seen": 25831840, "step": 19920 }, { "epoch": 0.9735421298219041, "grad_norm": 0.261748343706131, "learning_rate": 2.514922476491355e-05, "loss": 0.109, "num_input_tokens_seen": 25838528, "step": 19925 }, { "epoch": 0.9737864314855984, "grad_norm": 0.522487461566925, "learning_rate": 2.51394074515115e-05, "loss": 0.0777, "num_input_tokens_seen": 25845120, "step": 19930 }, { "epoch": 0.9740307331492928, "grad_norm": 0.34182536602020264, "learning_rate": 2.5129590116611067e-05, "loss": 0.1033, "num_input_tokens_seen": 25851424, "step": 19935 }, { "epoch": 0.974275034812987, "grad_norm": 0.211613729596138, "learning_rate": 2.5119772761726212e-05, "loss": 0.0915, "num_input_tokens_seen": 25858176, "step": 19940 }, { "epoch": 0.9745193364766814, "grad_norm": 0.4021447002887726, "learning_rate": 2.5109955388370893e-05, "loss": 0.0785, "num_input_tokens_seen": 25864672, "step": 19945 }, { "epoch": 0.9747636381403757, "grad_norm": 0.6277337670326233, "learning_rate": 2.510013799805907e-05, "loss": 0.0863, "num_input_tokens_seen": 25871136, "step": 19950 }, { "epoch": 0.9750079398040701, "grad_norm": 0.2254858762025833, "learning_rate": 2.5090320592304706e-05, "loss": 0.079, "num_input_tokens_seen": 25877664, "step": 19955 }, { "epoch": 0.9752522414677643, "grad_norm": 0.18532483279705048, "learning_rate": 2.5080503172621777e-05, "loss": 0.0789, "num_input_tokens_seen": 25884256, "step": 19960 }, { "epoch": 0.9754965431314587, "grad_norm": 0.22555847465991974, "learning_rate": 2.5070685740524246e-05, "loss": 0.0676, "num_input_tokens_seen": 25890464, "step": 19965 }, { "epoch": 0.9757408447951531, "grad_norm": 0.20481540262699127, "learning_rate": 2.5060868297526084e-05, "loss": 0.0707, "num_input_tokens_seen": 25896800, "step": 19970 }, { "epoch": 0.9759851464588474, "grad_norm": 0.27331599593162537, "learning_rate": 2.5051050845141267e-05, "loss": 0.0942, "num_input_tokens_seen": 25902752, "step": 19975 }, { "epoch": 0.9762294481225418, "grad_norm": 0.21060827374458313, "learning_rate": 2.5041233384883765e-05, "loss": 0.1056, "num_input_tokens_seen": 25908992, "step": 19980 }, { "epoch": 0.976473749786236, "grad_norm": 0.2812557518482208, "learning_rate": 2.5031415918267564e-05, "loss": 0.0796, "num_input_tokens_seen": 25915392, "step": 19985 }, { "epoch": 0.9767180514499304, "grad_norm": 0.2520727813243866, "learning_rate": 2.5021598446806626e-05, "loss": 0.0768, "num_input_tokens_seen": 25921632, "step": 19990 }, { "epoch": 0.9769623531136247, "grad_norm": 0.2572614550590515, "learning_rate": 2.5011780972014937e-05, "loss": 0.0998, "num_input_tokens_seen": 25928032, "step": 19995 }, { "epoch": 0.977206654777319, "grad_norm": 0.3053305745124817, "learning_rate": 2.5001963495406478e-05, "loss": 0.0704, "num_input_tokens_seen": 25934656, "step": 20000 }, { "epoch": 0.977206654777319, "eval_loss": 0.08882079273462296, "eval_runtime": 374.6206, "eval_samples_per_second": 97.125, "eval_steps_per_second": 24.283, "num_input_tokens_seen": 25934656, "step": 20000 }, { "epoch": 0.9774509564410133, "grad_norm": 0.8008792400360107, "learning_rate": 2.499214601849522e-05, "loss": 0.0877, "num_input_tokens_seen": 25941312, "step": 20005 }, { "epoch": 0.9776952581047077, "grad_norm": 0.5698226094245911, "learning_rate": 2.4982328542795148e-05, "loss": 0.1119, "num_input_tokens_seen": 25947520, "step": 20010 }, { "epoch": 0.9779395597684021, "grad_norm": 0.22439315915107727, "learning_rate": 2.497251106982024e-05, "loss": 0.0722, "num_input_tokens_seen": 25953856, "step": 20015 }, { "epoch": 0.9781838614320963, "grad_norm": 0.1626497060060501, "learning_rate": 2.4962693601084458e-05, "loss": 0.095, "num_input_tokens_seen": 25961120, "step": 20020 }, { "epoch": 0.9784281630957907, "grad_norm": 0.1751680225133896, "learning_rate": 2.4952876138101794e-05, "loss": 0.063, "num_input_tokens_seen": 25967584, "step": 20025 }, { "epoch": 0.978672464759485, "grad_norm": 0.25142061710357666, "learning_rate": 2.4943058682386233e-05, "loss": 0.0629, "num_input_tokens_seen": 25973824, "step": 20030 }, { "epoch": 0.9789167664231794, "grad_norm": 0.44291412830352783, "learning_rate": 2.493324123545173e-05, "loss": 0.0848, "num_input_tokens_seen": 25980288, "step": 20035 }, { "epoch": 0.9791610680868736, "grad_norm": 0.6044988632202148, "learning_rate": 2.4923423798812272e-05, "loss": 0.0755, "num_input_tokens_seen": 25986720, "step": 20040 }, { "epoch": 0.979405369750568, "grad_norm": 0.27301889657974243, "learning_rate": 2.4913606373981825e-05, "loss": 0.083, "num_input_tokens_seen": 25993216, "step": 20045 }, { "epoch": 0.9796496714142623, "grad_norm": 0.12432216107845306, "learning_rate": 2.4903788962474357e-05, "loss": 0.1079, "num_input_tokens_seen": 25999552, "step": 20050 }, { "epoch": 0.9798939730779567, "grad_norm": 0.4696910083293915, "learning_rate": 2.489397156580385e-05, "loss": 0.0834, "num_input_tokens_seen": 26005664, "step": 20055 }, { "epoch": 0.9801382747416509, "grad_norm": 0.3402862548828125, "learning_rate": 2.4884154185484246e-05, "loss": 0.0758, "num_input_tokens_seen": 26012704, "step": 20060 }, { "epoch": 0.9803825764053453, "grad_norm": 0.43721944093704224, "learning_rate": 2.4874336823029526e-05, "loss": 0.069, "num_input_tokens_seen": 26019488, "step": 20065 }, { "epoch": 0.9806268780690397, "grad_norm": 0.23451705276966095, "learning_rate": 2.4864519479953656e-05, "loss": 0.0881, "num_input_tokens_seen": 26025792, "step": 20070 }, { "epoch": 0.980871179732734, "grad_norm": 0.14065423607826233, "learning_rate": 2.485470215777058e-05, "loss": 0.0742, "num_input_tokens_seen": 26031648, "step": 20075 }, { "epoch": 0.9811154813964283, "grad_norm": 0.31152939796447754, "learning_rate": 2.4844884857994258e-05, "loss": 0.0732, "num_input_tokens_seen": 26038048, "step": 20080 }, { "epoch": 0.9813597830601226, "grad_norm": 0.21083560585975647, "learning_rate": 2.4835067582138638e-05, "loss": 0.1213, "num_input_tokens_seen": 26044608, "step": 20085 }, { "epoch": 0.981604084723817, "grad_norm": 0.3791220188140869, "learning_rate": 2.4825250331717666e-05, "loss": 0.0739, "num_input_tokens_seen": 26051264, "step": 20090 }, { "epoch": 0.9818483863875113, "grad_norm": 0.18050611019134521, "learning_rate": 2.4815433108245298e-05, "loss": 0.1019, "num_input_tokens_seen": 26058112, "step": 20095 }, { "epoch": 0.9820926880512056, "grad_norm": 0.1804398000240326, "learning_rate": 2.4805615913235456e-05, "loss": 0.0723, "num_input_tokens_seen": 26064160, "step": 20100 }, { "epoch": 0.9823369897148999, "grad_norm": 0.15451423823833466, "learning_rate": 2.479579874820208e-05, "loss": 0.0692, "num_input_tokens_seen": 26070272, "step": 20105 }, { "epoch": 0.9825812913785943, "grad_norm": 0.16312481462955475, "learning_rate": 2.4785981614659115e-05, "loss": 0.092, "num_input_tokens_seen": 26077120, "step": 20110 }, { "epoch": 0.9828255930422887, "grad_norm": 0.19060328602790833, "learning_rate": 2.477616451412047e-05, "loss": 0.0715, "num_input_tokens_seen": 26083776, "step": 20115 }, { "epoch": 0.9830698947059829, "grad_norm": 0.29512715339660645, "learning_rate": 2.476634744810007e-05, "loss": 0.1002, "num_input_tokens_seen": 26090016, "step": 20120 }, { "epoch": 0.9833141963696773, "grad_norm": 0.4115443527698517, "learning_rate": 2.475653041811183e-05, "loss": 0.088, "num_input_tokens_seen": 26096608, "step": 20125 }, { "epoch": 0.9835584980333716, "grad_norm": 0.18970555067062378, "learning_rate": 2.4746713425669652e-05, "loss": 0.0847, "num_input_tokens_seen": 26102688, "step": 20130 }, { "epoch": 0.983802799697066, "grad_norm": 0.2448626607656479, "learning_rate": 2.4736896472287458e-05, "loss": 0.0845, "num_input_tokens_seen": 26108704, "step": 20135 }, { "epoch": 0.9840471013607602, "grad_norm": 0.2267584651708603, "learning_rate": 2.4727079559479124e-05, "loss": 0.0708, "num_input_tokens_seen": 26115360, "step": 20140 }, { "epoch": 0.9842914030244546, "grad_norm": 0.3499455749988556, "learning_rate": 2.4717262688758557e-05, "loss": 0.1013, "num_input_tokens_seen": 26121728, "step": 20145 }, { "epoch": 0.9845357046881489, "grad_norm": 0.2009107917547226, "learning_rate": 2.4707445861639637e-05, "loss": 0.0937, "num_input_tokens_seen": 26128256, "step": 20150 }, { "epoch": 0.9847800063518433, "grad_norm": 0.20178170502185822, "learning_rate": 2.4697629079636244e-05, "loss": 0.0863, "num_input_tokens_seen": 26135200, "step": 20155 }, { "epoch": 0.9850243080155375, "grad_norm": 0.21069355309009552, "learning_rate": 2.4687812344262244e-05, "loss": 0.0548, "num_input_tokens_seen": 26141472, "step": 20160 }, { "epoch": 0.9852686096792319, "grad_norm": 0.37003427743911743, "learning_rate": 2.46779956570315e-05, "loss": 0.0937, "num_input_tokens_seen": 26148064, "step": 20165 }, { "epoch": 0.9855129113429263, "grad_norm": 0.5270837545394897, "learning_rate": 2.466817901945787e-05, "loss": 0.0924, "num_input_tokens_seen": 26154400, "step": 20170 }, { "epoch": 0.9857572130066206, "grad_norm": 0.49082455039024353, "learning_rate": 2.4658362433055217e-05, "loss": 0.0956, "num_input_tokens_seen": 26160640, "step": 20175 }, { "epoch": 0.9860015146703149, "grad_norm": 0.14573264122009277, "learning_rate": 2.4648545899337356e-05, "loss": 0.0698, "num_input_tokens_seen": 26167456, "step": 20180 }, { "epoch": 0.9862458163340092, "grad_norm": 0.29297760128974915, "learning_rate": 2.4638729419818143e-05, "loss": 0.0914, "num_input_tokens_seen": 26174112, "step": 20185 }, { "epoch": 0.9864901179977036, "grad_norm": 0.5646803379058838, "learning_rate": 2.46289129960114e-05, "loss": 0.0739, "num_input_tokens_seen": 26180320, "step": 20190 }, { "epoch": 0.9867344196613979, "grad_norm": 0.47817185521125793, "learning_rate": 2.4619096629430924e-05, "loss": 0.0874, "num_input_tokens_seen": 26186464, "step": 20195 }, { "epoch": 0.9869787213250922, "grad_norm": 0.5546550750732422, "learning_rate": 2.4609280321590543e-05, "loss": 0.1025, "num_input_tokens_seen": 26193248, "step": 20200 }, { "epoch": 0.9869787213250922, "eval_loss": 0.08915680646896362, "eval_runtime": 374.2306, "eval_samples_per_second": 97.226, "eval_steps_per_second": 24.309, "num_input_tokens_seen": 26193248, "step": 20200 }, { "epoch": 0.9872230229887865, "grad_norm": 0.6588925719261169, "learning_rate": 2.4599464074004037e-05, "loss": 0.0821, "num_input_tokens_seen": 26199136, "step": 20205 }, { "epoch": 0.9874673246524809, "grad_norm": 0.418319970369339, "learning_rate": 2.4589647888185204e-05, "loss": 0.0738, "num_input_tokens_seen": 26205376, "step": 20210 }, { "epoch": 0.9877116263161753, "grad_norm": 0.3522476553916931, "learning_rate": 2.4579831765647836e-05, "loss": 0.0752, "num_input_tokens_seen": 26212064, "step": 20215 }, { "epoch": 0.9879559279798695, "grad_norm": 0.7388630509376526, "learning_rate": 2.4570015707905676e-05, "loss": 0.0983, "num_input_tokens_seen": 26218144, "step": 20220 }, { "epoch": 0.9882002296435639, "grad_norm": 0.3849160671234131, "learning_rate": 2.4560199716472508e-05, "loss": 0.0914, "num_input_tokens_seen": 26224672, "step": 20225 }, { "epoch": 0.9884445313072582, "grad_norm": 0.23769566416740417, "learning_rate": 2.455038379286207e-05, "loss": 0.0915, "num_input_tokens_seen": 26230688, "step": 20230 }, { "epoch": 0.9886888329709526, "grad_norm": 0.2810097634792328, "learning_rate": 2.4540567938588095e-05, "loss": 0.1023, "num_input_tokens_seen": 26236736, "step": 20235 }, { "epoch": 0.9889331346346468, "grad_norm": 0.16423030197620392, "learning_rate": 2.4530752155164328e-05, "loss": 0.1062, "num_input_tokens_seen": 26242688, "step": 20240 }, { "epoch": 0.9891774362983412, "grad_norm": 0.6208972334861755, "learning_rate": 2.4520936444104463e-05, "loss": 0.1112, "num_input_tokens_seen": 26249600, "step": 20245 }, { "epoch": 0.9894217379620355, "grad_norm": 0.4444115161895752, "learning_rate": 2.4511120806922218e-05, "loss": 0.0632, "num_input_tokens_seen": 26256032, "step": 20250 }, { "epoch": 0.9896660396257299, "grad_norm": 0.21790587902069092, "learning_rate": 2.45013052451313e-05, "loss": 0.106, "num_input_tokens_seen": 26262272, "step": 20255 }, { "epoch": 0.9899103412894242, "grad_norm": 0.40946176648139954, "learning_rate": 2.4491489760245376e-05, "loss": 0.0974, "num_input_tokens_seen": 26269120, "step": 20260 }, { "epoch": 0.9901546429531185, "grad_norm": 0.15204013884067535, "learning_rate": 2.4481674353778115e-05, "loss": 0.0616, "num_input_tokens_seen": 26275776, "step": 20265 }, { "epoch": 0.9903989446168129, "grad_norm": 0.1855192333459854, "learning_rate": 2.447185902724319e-05, "loss": 0.0969, "num_input_tokens_seen": 26282272, "step": 20270 }, { "epoch": 0.9906432462805072, "grad_norm": 0.2589610517024994, "learning_rate": 2.4462043782154233e-05, "loss": 0.1183, "num_input_tokens_seen": 26288416, "step": 20275 }, { "epoch": 0.9908875479442015, "grad_norm": 0.31362292170524597, "learning_rate": 2.4452228620024895e-05, "loss": 0.0762, "num_input_tokens_seen": 26294240, "step": 20280 }, { "epoch": 0.9911318496078958, "grad_norm": 0.16564348340034485, "learning_rate": 2.4442413542368776e-05, "loss": 0.0906, "num_input_tokens_seen": 26300288, "step": 20285 }, { "epoch": 0.9913761512715902, "grad_norm": 0.14914019405841827, "learning_rate": 2.4432598550699502e-05, "loss": 0.0641, "num_input_tokens_seen": 26306624, "step": 20290 }, { "epoch": 0.9916204529352844, "grad_norm": 0.29469025135040283, "learning_rate": 2.4422783646530663e-05, "loss": 0.0977, "num_input_tokens_seen": 26312832, "step": 20295 }, { "epoch": 0.9918647545989788, "grad_norm": 0.308658629655838, "learning_rate": 2.441296883137584e-05, "loss": 0.1013, "num_input_tokens_seen": 26318752, "step": 20300 }, { "epoch": 0.9921090562626731, "grad_norm": 0.2819124162197113, "learning_rate": 2.4403154106748592e-05, "loss": 0.0884, "num_input_tokens_seen": 26325120, "step": 20305 }, { "epoch": 0.9923533579263675, "grad_norm": 0.4984522759914398, "learning_rate": 2.4393339474162494e-05, "loss": 0.0927, "num_input_tokens_seen": 26331744, "step": 20310 }, { "epoch": 0.9925976595900619, "grad_norm": 0.12410425394773483, "learning_rate": 2.4383524935131062e-05, "loss": 0.0738, "num_input_tokens_seen": 26338112, "step": 20315 }, { "epoch": 0.9928419612537561, "grad_norm": 0.7004914283752441, "learning_rate": 2.437371049116784e-05, "loss": 0.0929, "num_input_tokens_seen": 26344800, "step": 20320 }, { "epoch": 0.9930862629174505, "grad_norm": 0.6611570119857788, "learning_rate": 2.436389614378632e-05, "loss": 0.0687, "num_input_tokens_seen": 26351200, "step": 20325 }, { "epoch": 0.9933305645811448, "grad_norm": 0.18662993609905243, "learning_rate": 2.435408189450002e-05, "loss": 0.0623, "num_input_tokens_seen": 26357760, "step": 20330 }, { "epoch": 0.9935748662448391, "grad_norm": 0.3999994993209839, "learning_rate": 2.4344267744822406e-05, "loss": 0.0651, "num_input_tokens_seen": 26364160, "step": 20335 }, { "epoch": 0.9938191679085334, "grad_norm": 0.1801261156797409, "learning_rate": 2.4334453696266944e-05, "loss": 0.102, "num_input_tokens_seen": 26370432, "step": 20340 }, { "epoch": 0.9940634695722278, "grad_norm": 0.7045138478279114, "learning_rate": 2.432463975034708e-05, "loss": 0.0941, "num_input_tokens_seen": 26378272, "step": 20345 }, { "epoch": 0.9943077712359221, "grad_norm": 0.23720310628414154, "learning_rate": 2.4314825908576265e-05, "loss": 0.0855, "num_input_tokens_seen": 26384800, "step": 20350 }, { "epoch": 0.9945520728996164, "grad_norm": 0.19450926780700684, "learning_rate": 2.4305012172467897e-05, "loss": 0.1058, "num_input_tokens_seen": 26391296, "step": 20355 }, { "epoch": 0.9947963745633108, "grad_norm": 0.37583738565444946, "learning_rate": 2.4295198543535393e-05, "loss": 0.0876, "num_input_tokens_seen": 26397824, "step": 20360 }, { "epoch": 0.9950406762270051, "grad_norm": 0.6599127650260925, "learning_rate": 2.4285385023292124e-05, "loss": 0.1031, "num_input_tokens_seen": 26404832, "step": 20365 }, { "epoch": 0.9952849778906995, "grad_norm": 0.38213008642196655, "learning_rate": 2.427557161325147e-05, "loss": 0.0809, "num_input_tokens_seen": 26411712, "step": 20370 }, { "epoch": 0.9955292795543937, "grad_norm": 0.14970748126506805, "learning_rate": 2.4265758314926778e-05, "loss": 0.0825, "num_input_tokens_seen": 26417600, "step": 20375 }, { "epoch": 0.9957735812180881, "grad_norm": 0.31379956007003784, "learning_rate": 2.4255945129831373e-05, "loss": 0.0648, "num_input_tokens_seen": 26423648, "step": 20380 }, { "epoch": 0.9960178828817824, "grad_norm": 0.4775267243385315, "learning_rate": 2.4246132059478578e-05, "loss": 0.0894, "num_input_tokens_seen": 26429632, "step": 20385 }, { "epoch": 0.9962621845454768, "grad_norm": 0.4820932447910309, "learning_rate": 2.4236319105381706e-05, "loss": 0.1038, "num_input_tokens_seen": 26436064, "step": 20390 }, { "epoch": 0.996506486209171, "grad_norm": 0.29095321893692017, "learning_rate": 2.422650626905401e-05, "loss": 0.0846, "num_input_tokens_seen": 26443008, "step": 20395 }, { "epoch": 0.9967507878728654, "grad_norm": 0.1694071739912033, "learning_rate": 2.4216693552008785e-05, "loss": 0.0833, "num_input_tokens_seen": 26449184, "step": 20400 }, { "epoch": 0.9967507878728654, "eval_loss": 0.0888579934835434, "eval_runtime": 374.7669, "eval_samples_per_second": 97.087, "eval_steps_per_second": 24.274, "num_input_tokens_seen": 26449184, "step": 20400 }, { "epoch": 0.9969950895365598, "grad_norm": 1.0045535564422607, "learning_rate": 2.4206880955759247e-05, "loss": 0.0767, "num_input_tokens_seen": 26455616, "step": 20405 }, { "epoch": 0.9972393912002541, "grad_norm": 0.35976335406303406, "learning_rate": 2.419706848181863e-05, "loss": 0.0999, "num_input_tokens_seen": 26461920, "step": 20410 }, { "epoch": 0.9974836928639484, "grad_norm": 0.4966786205768585, "learning_rate": 2.4187256131700153e-05, "loss": 0.0918, "num_input_tokens_seen": 26468192, "step": 20415 }, { "epoch": 0.9977279945276427, "grad_norm": 0.17595922946929932, "learning_rate": 2.4177443906916985e-05, "loss": 0.0838, "num_input_tokens_seen": 26474848, "step": 20420 }, { "epoch": 0.9979722961913371, "grad_norm": 0.21546515822410583, "learning_rate": 2.4167631808982303e-05, "loss": 0.0876, "num_input_tokens_seen": 26480960, "step": 20425 }, { "epoch": 0.9982165978550314, "grad_norm": 0.3606940805912018, "learning_rate": 2.4157819839409264e-05, "loss": 0.1269, "num_input_tokens_seen": 26486944, "step": 20430 }, { "epoch": 0.9984608995187257, "grad_norm": 0.44370296597480774, "learning_rate": 2.414800799971098e-05, "loss": 0.0765, "num_input_tokens_seen": 26493728, "step": 20435 }, { "epoch": 0.99870520118242, "grad_norm": 0.6243374347686768, "learning_rate": 2.4138196291400582e-05, "loss": 0.0604, "num_input_tokens_seen": 26500640, "step": 20440 }, { "epoch": 0.9989495028461144, "grad_norm": 0.3333088159561157, "learning_rate": 2.412838471599114e-05, "loss": 0.072, "num_input_tokens_seen": 26507008, "step": 20445 }, { "epoch": 0.9991938045098087, "grad_norm": 0.3853881061077118, "learning_rate": 2.411857327499572e-05, "loss": 0.0732, "num_input_tokens_seen": 26513312, "step": 20450 }, { "epoch": 0.999438106173503, "grad_norm": 0.1721297800540924, "learning_rate": 2.410876196992739e-05, "loss": 0.0612, "num_input_tokens_seen": 26519776, "step": 20455 }, { "epoch": 0.9996824078371974, "grad_norm": 0.6848729848861694, "learning_rate": 2.4098950802299156e-05, "loss": 0.102, "num_input_tokens_seen": 26525888, "step": 20460 }, { "epoch": 0.9999267095008917, "grad_norm": 0.357361763715744, "learning_rate": 2.4089139773624027e-05, "loss": 0.0497, "num_input_tokens_seen": 26532384, "step": 20465 }, { "epoch": 1.0001954413309555, "grad_norm": 0.30891141295433044, "learning_rate": 2.4079328885415007e-05, "loss": 0.1148, "num_input_tokens_seen": 26538976, "step": 20470 }, { "epoch": 1.0004397429946499, "grad_norm": 0.12242733687162399, "learning_rate": 2.4069518139185036e-05, "loss": 0.0999, "num_input_tokens_seen": 26545568, "step": 20475 }, { "epoch": 1.000684044658344, "grad_norm": 0.21364285051822662, "learning_rate": 2.405970753644706e-05, "loss": 0.0957, "num_input_tokens_seen": 26552448, "step": 20480 }, { "epoch": 1.0009283463220384, "grad_norm": 0.34672868251800537, "learning_rate": 2.4049897078714e-05, "loss": 0.0984, "num_input_tokens_seen": 26558464, "step": 20485 }, { "epoch": 1.0011726479857328, "grad_norm": 0.3276509940624237, "learning_rate": 2.404008676749874e-05, "loss": 0.0976, "num_input_tokens_seen": 26566528, "step": 20490 }, { "epoch": 1.0014169496494272, "grad_norm": 0.18632757663726807, "learning_rate": 2.403027660431418e-05, "loss": 0.0794, "num_input_tokens_seen": 26572960, "step": 20495 }, { "epoch": 1.0016612513131213, "grad_norm": 0.1906147003173828, "learning_rate": 2.402046659067314e-05, "loss": 0.0848, "num_input_tokens_seen": 26579296, "step": 20500 }, { "epoch": 1.0019055529768157, "grad_norm": 0.2651177942752838, "learning_rate": 2.401065672808847e-05, "loss": 0.109, "num_input_tokens_seen": 26585664, "step": 20505 }, { "epoch": 1.00214985464051, "grad_norm": 0.18921248614788055, "learning_rate": 2.400084701807296e-05, "loss": 0.1024, "num_input_tokens_seen": 26592800, "step": 20510 }, { "epoch": 1.0023941563042045, "grad_norm": 0.7345209121704102, "learning_rate": 2.39910374621394e-05, "loss": 0.0954, "num_input_tokens_seen": 26599456, "step": 20515 }, { "epoch": 1.0026384579678989, "grad_norm": 0.1451474279165268, "learning_rate": 2.3981228061800544e-05, "loss": 0.0861, "num_input_tokens_seen": 26605760, "step": 20520 }, { "epoch": 1.002882759631593, "grad_norm": 0.38906773924827576, "learning_rate": 2.3971418818569115e-05, "loss": 0.0921, "num_input_tokens_seen": 26612544, "step": 20525 }, { "epoch": 1.0031270612952874, "grad_norm": 0.2976262867450714, "learning_rate": 2.3961609733957832e-05, "loss": 0.0882, "num_input_tokens_seen": 26619488, "step": 20530 }, { "epoch": 1.0033713629589818, "grad_norm": 0.525398850440979, "learning_rate": 2.395180080947939e-05, "loss": 0.0832, "num_input_tokens_seen": 26626144, "step": 20535 }, { "epoch": 1.0036156646226762, "grad_norm": 0.23437131941318512, "learning_rate": 2.394199204664642e-05, "loss": 0.0809, "num_input_tokens_seen": 26632384, "step": 20540 }, { "epoch": 1.0038599662863703, "grad_norm": 0.13375703990459442, "learning_rate": 2.3932183446971583e-05, "loss": 0.087, "num_input_tokens_seen": 26639168, "step": 20545 }, { "epoch": 1.0041042679500647, "grad_norm": 0.18388783931732178, "learning_rate": 2.3922375011967473e-05, "loss": 0.0749, "num_input_tokens_seen": 26646112, "step": 20550 }, { "epoch": 1.004348569613759, "grad_norm": 0.3260902762413025, "learning_rate": 2.3912566743146676e-05, "loss": 0.0734, "num_input_tokens_seen": 26652384, "step": 20555 }, { "epoch": 1.0045928712774534, "grad_norm": 0.5951550006866455, "learning_rate": 2.390275864202176e-05, "loss": 0.0826, "num_input_tokens_seen": 26658912, "step": 20560 }, { "epoch": 1.0048371729411478, "grad_norm": 0.16348524391651154, "learning_rate": 2.3892950710105243e-05, "loss": 0.0602, "num_input_tokens_seen": 26665024, "step": 20565 }, { "epoch": 1.005081474604842, "grad_norm": 0.19663357734680176, "learning_rate": 2.3883142948909635e-05, "loss": 0.0763, "num_input_tokens_seen": 26671200, "step": 20570 }, { "epoch": 1.0053257762685364, "grad_norm": 0.14395473897457123, "learning_rate": 2.3873335359947433e-05, "loss": 0.0575, "num_input_tokens_seen": 26677792, "step": 20575 }, { "epoch": 1.0055700779322307, "grad_norm": 0.626596987247467, "learning_rate": 2.3863527944731066e-05, "loss": 0.0934, "num_input_tokens_seen": 26684512, "step": 20580 }, { "epoch": 1.0058143795959251, "grad_norm": 0.15724121034145355, "learning_rate": 2.385372070477298e-05, "loss": 0.0833, "num_input_tokens_seen": 26690880, "step": 20585 }, { "epoch": 1.0060586812596193, "grad_norm": 0.23277783393859863, "learning_rate": 2.384391364158556e-05, "loss": 0.0899, "num_input_tokens_seen": 26697504, "step": 20590 }, { "epoch": 1.0063029829233137, "grad_norm": 0.7076415419578552, "learning_rate": 2.3834106756681185e-05, "loss": 0.0873, "num_input_tokens_seen": 26703808, "step": 20595 }, { "epoch": 1.006547284587008, "grad_norm": 0.25805768370628357, "learning_rate": 2.3824300051572206e-05, "loss": 0.1009, "num_input_tokens_seen": 26710048, "step": 20600 }, { "epoch": 1.006547284587008, "eval_loss": 0.08877930790185928, "eval_runtime": 374.9438, "eval_samples_per_second": 97.041, "eval_steps_per_second": 24.262, "num_input_tokens_seen": 26710048, "step": 20600 }, { "epoch": 1.0067915862507024, "grad_norm": 0.16051648557186127, "learning_rate": 2.3814493527770923e-05, "loss": 0.0958, "num_input_tokens_seen": 26716544, "step": 20605 }, { "epoch": 1.0070358879143968, "grad_norm": 0.621579110622406, "learning_rate": 2.3804687186789637e-05, "loss": 0.0795, "num_input_tokens_seen": 26722880, "step": 20610 }, { "epoch": 1.007280189578091, "grad_norm": 0.23721496760845184, "learning_rate": 2.379488103014062e-05, "loss": 0.105, "num_input_tokens_seen": 26729600, "step": 20615 }, { "epoch": 1.0075244912417853, "grad_norm": 0.27530038356781006, "learning_rate": 2.3785075059336086e-05, "loss": 0.0883, "num_input_tokens_seen": 26735968, "step": 20620 }, { "epoch": 1.0077687929054797, "grad_norm": 0.22741059958934784, "learning_rate": 2.3775269275888248e-05, "loss": 0.0792, "num_input_tokens_seen": 26742304, "step": 20625 }, { "epoch": 1.008013094569174, "grad_norm": 0.268917977809906, "learning_rate": 2.3765463681309274e-05, "loss": 0.0528, "num_input_tokens_seen": 26749024, "step": 20630 }, { "epoch": 1.0082573962328683, "grad_norm": 0.19811317324638367, "learning_rate": 2.3755658277111313e-05, "loss": 0.0922, "num_input_tokens_seen": 26755328, "step": 20635 }, { "epoch": 1.0085016978965626, "grad_norm": 0.31349337100982666, "learning_rate": 2.374585306480649e-05, "loss": 0.1033, "num_input_tokens_seen": 26761760, "step": 20640 }, { "epoch": 1.008745999560257, "grad_norm": 0.6017816066741943, "learning_rate": 2.3736048045906877e-05, "loss": 0.1225, "num_input_tokens_seen": 26767872, "step": 20645 }, { "epoch": 1.0089903012239514, "grad_norm": 0.20059135556221008, "learning_rate": 2.372624322192454e-05, "loss": 0.0921, "num_input_tokens_seen": 26774112, "step": 20650 }, { "epoch": 1.0092346028876458, "grad_norm": 0.2436361312866211, "learning_rate": 2.3716438594371516e-05, "loss": 0.0751, "num_input_tokens_seen": 26780832, "step": 20655 }, { "epoch": 1.00947890455134, "grad_norm": 0.39254626631736755, "learning_rate": 2.3706634164759784e-05, "loss": 0.0599, "num_input_tokens_seen": 26787264, "step": 20660 }, { "epoch": 1.0097232062150343, "grad_norm": 0.23911753296852112, "learning_rate": 2.3696829934601323e-05, "loss": 0.0703, "num_input_tokens_seen": 26793312, "step": 20665 }, { "epoch": 1.0099675078787287, "grad_norm": 0.23160356283187866, "learning_rate": 2.3687025905408053e-05, "loss": 0.0928, "num_input_tokens_seen": 26799840, "step": 20670 }, { "epoch": 1.010211809542423, "grad_norm": 0.15810784697532654, "learning_rate": 2.3677222078691886e-05, "loss": 0.0678, "num_input_tokens_seen": 26806816, "step": 20675 }, { "epoch": 1.0104561112061172, "grad_norm": 0.26207032799720764, "learning_rate": 2.366741845596471e-05, "loss": 0.1268, "num_input_tokens_seen": 26813344, "step": 20680 }, { "epoch": 1.0107004128698116, "grad_norm": 0.5236732959747314, "learning_rate": 2.3657615038738343e-05, "loss": 0.067, "num_input_tokens_seen": 26819616, "step": 20685 }, { "epoch": 1.010944714533506, "grad_norm": 0.4135574698448181, "learning_rate": 2.3647811828524614e-05, "loss": 0.1229, "num_input_tokens_seen": 26826272, "step": 20690 }, { "epoch": 1.0111890161972004, "grad_norm": 0.300971120595932, "learning_rate": 2.363800882683529e-05, "loss": 0.0982, "num_input_tokens_seen": 26832704, "step": 20695 }, { "epoch": 1.0114333178608947, "grad_norm": 0.6431025266647339, "learning_rate": 2.3628206035182125e-05, "loss": 0.0853, "num_input_tokens_seen": 26839424, "step": 20700 }, { "epoch": 1.011677619524589, "grad_norm": 0.30535030364990234, "learning_rate": 2.361840345507683e-05, "loss": 0.0997, "num_input_tokens_seen": 26845568, "step": 20705 }, { "epoch": 1.0119219211882833, "grad_norm": 0.19493244588375092, "learning_rate": 2.3608601088031073e-05, "loss": 0.0928, "num_input_tokens_seen": 26852032, "step": 20710 }, { "epoch": 1.0121662228519777, "grad_norm": 0.1916520744562149, "learning_rate": 2.3598798935556516e-05, "loss": 0.0711, "num_input_tokens_seen": 26858304, "step": 20715 }, { "epoch": 1.012410524515672, "grad_norm": 0.5520507097244263, "learning_rate": 2.3588996999164784e-05, "loss": 0.1031, "num_input_tokens_seen": 26865504, "step": 20720 }, { "epoch": 1.0126548261793662, "grad_norm": 0.17561855912208557, "learning_rate": 2.3579195280367434e-05, "loss": 0.0875, "num_input_tokens_seen": 26872160, "step": 20725 }, { "epoch": 1.0128991278430606, "grad_norm": 0.3010266423225403, "learning_rate": 2.356939378067603e-05, "loss": 0.0882, "num_input_tokens_seen": 26878368, "step": 20730 }, { "epoch": 1.013143429506755, "grad_norm": 0.4484022259712219, "learning_rate": 2.3559592501602092e-05, "loss": 0.0776, "num_input_tokens_seen": 26885088, "step": 20735 }, { "epoch": 1.0133877311704493, "grad_norm": 0.1895541250705719, "learning_rate": 2.3549791444657076e-05, "loss": 0.0854, "num_input_tokens_seen": 26891584, "step": 20740 }, { "epoch": 1.0136320328341435, "grad_norm": 0.5122219920158386, "learning_rate": 2.353999061135246e-05, "loss": 0.1001, "num_input_tokens_seen": 26898112, "step": 20745 }, { "epoch": 1.0138763344978379, "grad_norm": 0.6531121730804443, "learning_rate": 2.3530190003199626e-05, "loss": 0.0679, "num_input_tokens_seen": 26905088, "step": 20750 }, { "epoch": 1.0141206361615323, "grad_norm": 0.4468840956687927, "learning_rate": 2.3520389621709965e-05, "loss": 0.085, "num_input_tokens_seen": 26911968, "step": 20755 }, { "epoch": 1.0143649378252266, "grad_norm": 0.35489127039909363, "learning_rate": 2.351058946839483e-05, "loss": 0.0717, "num_input_tokens_seen": 26918272, "step": 20760 }, { "epoch": 1.014609239488921, "grad_norm": 0.3548562228679657, "learning_rate": 2.350078954476551e-05, "loss": 0.1036, "num_input_tokens_seen": 26924672, "step": 20765 }, { "epoch": 1.0148535411526152, "grad_norm": 0.3561871647834778, "learning_rate": 2.3490989852333272e-05, "loss": 0.1118, "num_input_tokens_seen": 26930688, "step": 20770 }, { "epoch": 1.0150978428163095, "grad_norm": 0.09152190387248993, "learning_rate": 2.3481190392609377e-05, "loss": 0.0626, "num_input_tokens_seen": 26937024, "step": 20775 }, { "epoch": 1.015342144480004, "grad_norm": 0.5741194486618042, "learning_rate": 2.3471391167105e-05, "loss": 0.0598, "num_input_tokens_seen": 26943616, "step": 20780 }, { "epoch": 1.0155864461436983, "grad_norm": 0.23617075383663177, "learning_rate": 2.3461592177331325e-05, "loss": 0.0775, "num_input_tokens_seen": 26949952, "step": 20785 }, { "epoch": 1.0158307478073925, "grad_norm": 0.2649257779121399, "learning_rate": 2.345179342479946e-05, "loss": 0.0829, "num_input_tokens_seen": 26956192, "step": 20790 }, { "epoch": 1.0160750494710868, "grad_norm": 0.4404158592224121, "learning_rate": 2.3441994911020503e-05, "loss": 0.0777, "num_input_tokens_seen": 26962464, "step": 20795 }, { "epoch": 1.0163193511347812, "grad_norm": 0.3724573850631714, "learning_rate": 2.3432196637505522e-05, "loss": 0.1114, "num_input_tokens_seen": 26968800, "step": 20800 }, { "epoch": 1.0163193511347812, "eval_loss": 0.08912566304206848, "eval_runtime": 374.1697, "eval_samples_per_second": 97.242, "eval_steps_per_second": 24.313, "num_input_tokens_seen": 26968800, "step": 20800 }, { "epoch": 1.0165636527984756, "grad_norm": 0.4525831341743469, "learning_rate": 2.3422398605765515e-05, "loss": 0.0748, "num_input_tokens_seen": 26975936, "step": 20805 }, { "epoch": 1.01680795446217, "grad_norm": 0.1921437829732895, "learning_rate": 2.3412600817311462e-05, "loss": 0.0845, "num_input_tokens_seen": 26982656, "step": 20810 }, { "epoch": 1.0170522561258641, "grad_norm": 0.2860223650932312, "learning_rate": 2.3402803273654326e-05, "loss": 0.0957, "num_input_tokens_seen": 26989760, "step": 20815 }, { "epoch": 1.0172965577895585, "grad_norm": 0.17419792711734772, "learning_rate": 2.3393005976304983e-05, "loss": 0.0779, "num_input_tokens_seen": 26996576, "step": 20820 }, { "epoch": 1.017540859453253, "grad_norm": 0.18156471848487854, "learning_rate": 2.338320892677432e-05, "loss": 0.08, "num_input_tokens_seen": 27003232, "step": 20825 }, { "epoch": 1.0177851611169473, "grad_norm": 0.2601892948150635, "learning_rate": 2.3373412126573155e-05, "loss": 0.0574, "num_input_tokens_seen": 27009312, "step": 20830 }, { "epoch": 1.0180294627806414, "grad_norm": 0.1771131008863449, "learning_rate": 2.3363615577212285e-05, "loss": 0.0654, "num_input_tokens_seen": 27015552, "step": 20835 }, { "epoch": 1.0182737644443358, "grad_norm": 0.19709326326847076, "learning_rate": 2.3353819280202455e-05, "loss": 0.0817, "num_input_tokens_seen": 27022336, "step": 20840 }, { "epoch": 1.0185180661080302, "grad_norm": 0.15243031084537506, "learning_rate": 2.334402323705438e-05, "loss": 0.0756, "num_input_tokens_seen": 27028800, "step": 20845 }, { "epoch": 1.0187623677717246, "grad_norm": 0.09610342979431152, "learning_rate": 2.3334227449278725e-05, "loss": 0.0549, "num_input_tokens_seen": 27035584, "step": 20850 }, { "epoch": 1.019006669435419, "grad_norm": 0.9366940259933472, "learning_rate": 2.3324431918386143e-05, "loss": 0.0943, "num_input_tokens_seen": 27042400, "step": 20855 }, { "epoch": 1.0192509710991131, "grad_norm": 0.22421877086162567, "learning_rate": 2.3314636645887207e-05, "loss": 0.1024, "num_input_tokens_seen": 27048704, "step": 20860 }, { "epoch": 1.0194952727628075, "grad_norm": 0.2890772819519043, "learning_rate": 2.3304841633292487e-05, "loss": 0.0844, "num_input_tokens_seen": 27055296, "step": 20865 }, { "epoch": 1.0197395744265019, "grad_norm": 0.2024518996477127, "learning_rate": 2.329504688211248e-05, "loss": 0.094, "num_input_tokens_seen": 27061824, "step": 20870 }, { "epoch": 1.0199838760901963, "grad_norm": 0.2386631816625595, "learning_rate": 2.3285252393857677e-05, "loss": 0.0883, "num_input_tokens_seen": 27068352, "step": 20875 }, { "epoch": 1.0202281777538904, "grad_norm": 0.39749911427497864, "learning_rate": 2.327545817003851e-05, "loss": 0.1087, "num_input_tokens_seen": 27074816, "step": 20880 }, { "epoch": 1.0204724794175848, "grad_norm": 0.7277222871780396, "learning_rate": 2.326566421216535e-05, "loss": 0.0934, "num_input_tokens_seen": 27081504, "step": 20885 }, { "epoch": 1.0207167810812792, "grad_norm": 0.12705877423286438, "learning_rate": 2.3255870521748565e-05, "loss": 0.0696, "num_input_tokens_seen": 27087744, "step": 20890 }, { "epoch": 1.0209610827449735, "grad_norm": 0.15233711898326874, "learning_rate": 2.3246077100298474e-05, "loss": 0.0905, "num_input_tokens_seen": 27094464, "step": 20895 }, { "epoch": 1.021205384408668, "grad_norm": 0.24929115176200867, "learning_rate": 2.3236283949325328e-05, "loss": 0.1077, "num_input_tokens_seen": 27100512, "step": 20900 }, { "epoch": 1.021449686072362, "grad_norm": 0.41574233770370483, "learning_rate": 2.3226491070339368e-05, "loss": 0.0995, "num_input_tokens_seen": 27107008, "step": 20905 }, { "epoch": 1.0216939877360565, "grad_norm": 0.31033584475517273, "learning_rate": 2.3216698464850762e-05, "loss": 0.0852, "num_input_tokens_seen": 27113472, "step": 20910 }, { "epoch": 1.0219382893997508, "grad_norm": 0.13084399700164795, "learning_rate": 2.320690613436967e-05, "loss": 0.0966, "num_input_tokens_seen": 27119808, "step": 20915 }, { "epoch": 1.0221825910634452, "grad_norm": 1.0038052797317505, "learning_rate": 2.3197114080406192e-05, "loss": 0.1003, "num_input_tokens_seen": 27125920, "step": 20920 }, { "epoch": 1.0224268927271394, "grad_norm": 0.11612729728221893, "learning_rate": 2.3187322304470365e-05, "loss": 0.0683, "num_input_tokens_seen": 27132960, "step": 20925 }, { "epoch": 1.0226711943908338, "grad_norm": 0.1666315495967865, "learning_rate": 2.3177530808072222e-05, "loss": 0.0772, "num_input_tokens_seen": 27140000, "step": 20930 }, { "epoch": 1.0229154960545281, "grad_norm": 0.4064238369464874, "learning_rate": 2.316773959272174e-05, "loss": 0.0704, "num_input_tokens_seen": 27146080, "step": 20935 }, { "epoch": 1.0231597977182225, "grad_norm": 0.6138691306114197, "learning_rate": 2.3157948659928823e-05, "loss": 0.0765, "num_input_tokens_seen": 27152416, "step": 20940 }, { "epoch": 1.023404099381917, "grad_norm": 0.2454817146062851, "learning_rate": 2.3148158011203388e-05, "loss": 0.0919, "num_input_tokens_seen": 27159200, "step": 20945 }, { "epoch": 1.023648401045611, "grad_norm": 0.5479267239570618, "learning_rate": 2.3138367648055253e-05, "loss": 0.0721, "num_input_tokens_seen": 27165472, "step": 20950 }, { "epoch": 1.0238927027093054, "grad_norm": 0.26615744829177856, "learning_rate": 2.312857757199422e-05, "loss": 0.103, "num_input_tokens_seen": 27172064, "step": 20955 }, { "epoch": 1.0241370043729998, "grad_norm": 0.40804728865623474, "learning_rate": 2.3118787784530048e-05, "loss": 0.0754, "num_input_tokens_seen": 27178432, "step": 20960 }, { "epoch": 1.0243813060366942, "grad_norm": 0.15693281590938568, "learning_rate": 2.310899828717243e-05, "loss": 0.0637, "num_input_tokens_seen": 27184992, "step": 20965 }, { "epoch": 1.0246256077003884, "grad_norm": 0.16429954767227173, "learning_rate": 2.309920908143104e-05, "loss": 0.0664, "num_input_tokens_seen": 27191712, "step": 20970 }, { "epoch": 1.0248699093640827, "grad_norm": 0.2802380919456482, "learning_rate": 2.308942016881551e-05, "loss": 0.0821, "num_input_tokens_seen": 27198048, "step": 20975 }, { "epoch": 1.0251142110277771, "grad_norm": 0.6041950583457947, "learning_rate": 2.307963155083539e-05, "loss": 0.0685, "num_input_tokens_seen": 27204384, "step": 20980 }, { "epoch": 1.0253585126914715, "grad_norm": 0.26141154766082764, "learning_rate": 2.306984322900022e-05, "loss": 0.1152, "num_input_tokens_seen": 27210592, "step": 20985 }, { "epoch": 1.0256028143551656, "grad_norm": 0.5496932864189148, "learning_rate": 2.3060055204819482e-05, "loss": 0.1129, "num_input_tokens_seen": 27216960, "step": 20990 }, { "epoch": 1.02584711601886, "grad_norm": 0.2555514872074127, "learning_rate": 2.3050267479802604e-05, "loss": 0.0912, "num_input_tokens_seen": 27223552, "step": 20995 }, { "epoch": 1.0260914176825544, "grad_norm": 0.2519477307796478, "learning_rate": 2.304048005545899e-05, "loss": 0.0707, "num_input_tokens_seen": 27230240, "step": 21000 }, { "epoch": 1.0260914176825544, "eval_loss": 0.08859981596469879, "eval_runtime": 375.3705, "eval_samples_per_second": 96.931, "eval_steps_per_second": 24.235, "num_input_tokens_seen": 27230240, "step": 21000 }, { "epoch": 1.0263357193462488, "grad_norm": 0.1805913895368576, "learning_rate": 2.3030692933297972e-05, "loss": 0.0835, "num_input_tokens_seen": 27236704, "step": 21005 }, { "epoch": 1.0265800210099432, "grad_norm": 0.3022463917732239, "learning_rate": 2.3020906114828843e-05, "loss": 0.0724, "num_input_tokens_seen": 27242688, "step": 21010 }, { "epoch": 1.0268243226736373, "grad_norm": 0.3328514099121094, "learning_rate": 2.301111960156088e-05, "loss": 0.0874, "num_input_tokens_seen": 27249408, "step": 21015 }, { "epoch": 1.0270686243373317, "grad_norm": 0.16150714457035065, "learning_rate": 2.300133339500326e-05, "loss": 0.0961, "num_input_tokens_seen": 27255872, "step": 21020 }, { "epoch": 1.027312926001026, "grad_norm": 0.16734862327575684, "learning_rate": 2.2991547496665148e-05, "loss": 0.046, "num_input_tokens_seen": 27262240, "step": 21025 }, { "epoch": 1.0275572276647205, "grad_norm": 0.18653450906276703, "learning_rate": 2.298176190805565e-05, "loss": 0.0945, "num_input_tokens_seen": 27268384, "step": 21030 }, { "epoch": 1.0278015293284146, "grad_norm": 0.5003401637077332, "learning_rate": 2.2971976630683826e-05, "loss": 0.062, "num_input_tokens_seen": 27274912, "step": 21035 }, { "epoch": 1.028045830992109, "grad_norm": 0.25242358446121216, "learning_rate": 2.29621916660587e-05, "loss": 0.1099, "num_input_tokens_seen": 27281280, "step": 21040 }, { "epoch": 1.0282901326558034, "grad_norm": 0.22883166372776031, "learning_rate": 2.295240701568922e-05, "loss": 0.0961, "num_input_tokens_seen": 27287520, "step": 21045 }, { "epoch": 1.0285344343194978, "grad_norm": 0.3100943863391876, "learning_rate": 2.2942622681084312e-05, "loss": 0.1179, "num_input_tokens_seen": 27294464, "step": 21050 }, { "epoch": 1.0287787359831921, "grad_norm": 0.2588374614715576, "learning_rate": 2.293283866375284e-05, "loss": 0.0998, "num_input_tokens_seen": 27301024, "step": 21055 }, { "epoch": 1.0290230376468863, "grad_norm": 0.19568808376789093, "learning_rate": 2.2923054965203627e-05, "loss": 0.1305, "num_input_tokens_seen": 27307808, "step": 21060 }, { "epoch": 1.0292673393105807, "grad_norm": 0.20387206971645355, "learning_rate": 2.2913271586945443e-05, "loss": 0.0741, "num_input_tokens_seen": 27314272, "step": 21065 }, { "epoch": 1.029511640974275, "grad_norm": 0.7057787775993347, "learning_rate": 2.290348853048699e-05, "loss": 0.0967, "num_input_tokens_seen": 27320992, "step": 21070 }, { "epoch": 1.0297559426379694, "grad_norm": 0.44313785433769226, "learning_rate": 2.2893705797336956e-05, "loss": 0.0988, "num_input_tokens_seen": 27327488, "step": 21075 }, { "epoch": 1.0300002443016636, "grad_norm": 0.14582647383213043, "learning_rate": 2.288392338900397e-05, "loss": 0.0946, "num_input_tokens_seen": 27333728, "step": 21080 }, { "epoch": 1.030244545965358, "grad_norm": 0.18740354478359222, "learning_rate": 2.2874141306996576e-05, "loss": 0.0851, "num_input_tokens_seen": 27340192, "step": 21085 }, { "epoch": 1.0304888476290524, "grad_norm": 0.3749724328517914, "learning_rate": 2.2864359552823312e-05, "loss": 0.1149, "num_input_tokens_seen": 27346528, "step": 21090 }, { "epoch": 1.0307331492927467, "grad_norm": 0.21226602792739868, "learning_rate": 2.2854578127992648e-05, "loss": 0.0931, "num_input_tokens_seen": 27352832, "step": 21095 }, { "epoch": 1.030977450956441, "grad_norm": 0.24708080291748047, "learning_rate": 2.2844797034012988e-05, "loss": 0.074, "num_input_tokens_seen": 27359328, "step": 21100 }, { "epoch": 1.0312217526201353, "grad_norm": 0.17115028202533722, "learning_rate": 2.2835016272392722e-05, "loss": 0.0765, "num_input_tokens_seen": 27366080, "step": 21105 }, { "epoch": 1.0314660542838296, "grad_norm": 0.17906180024147034, "learning_rate": 2.2825235844640142e-05, "loss": 0.1282, "num_input_tokens_seen": 27372736, "step": 21110 }, { "epoch": 1.031710355947524, "grad_norm": 0.6428399085998535, "learning_rate": 2.2815455752263522e-05, "loss": 0.0794, "num_input_tokens_seen": 27379328, "step": 21115 }, { "epoch": 1.0319546576112184, "grad_norm": 0.9005890488624573, "learning_rate": 2.2805675996771092e-05, "loss": 0.0974, "num_input_tokens_seen": 27385888, "step": 21120 }, { "epoch": 1.0321989592749126, "grad_norm": 0.3301650881767273, "learning_rate": 2.2795896579670987e-05, "loss": 0.103, "num_input_tokens_seen": 27391904, "step": 21125 }, { "epoch": 1.032443260938607, "grad_norm": 0.734485924243927, "learning_rate": 2.2786117502471337e-05, "loss": 0.0853, "num_input_tokens_seen": 27398176, "step": 21130 }, { "epoch": 1.0326875626023013, "grad_norm": 0.198449969291687, "learning_rate": 2.2776338766680185e-05, "loss": 0.1006, "num_input_tokens_seen": 27404512, "step": 21135 }, { "epoch": 1.0329318642659957, "grad_norm": 0.20648522675037384, "learning_rate": 2.2766560373805533e-05, "loss": 0.0885, "num_input_tokens_seen": 27411136, "step": 21140 }, { "epoch": 1.03317616592969, "grad_norm": 0.14846420288085938, "learning_rate": 2.2756782325355353e-05, "loss": 0.082, "num_input_tokens_seen": 27417344, "step": 21145 }, { "epoch": 1.0334204675933842, "grad_norm": 0.16616670787334442, "learning_rate": 2.2747004622837514e-05, "loss": 0.0934, "num_input_tokens_seen": 27424064, "step": 21150 }, { "epoch": 1.0336647692570786, "grad_norm": 0.28140485286712646, "learning_rate": 2.2737227267759878e-05, "loss": 0.0813, "num_input_tokens_seen": 27430304, "step": 21155 }, { "epoch": 1.033909070920773, "grad_norm": 0.6260281801223755, "learning_rate": 2.272745026163024e-05, "loss": 0.1163, "num_input_tokens_seen": 27436736, "step": 21160 }, { "epoch": 1.0341533725844674, "grad_norm": 0.34750357270240784, "learning_rate": 2.271767360595633e-05, "loss": 0.0864, "num_input_tokens_seen": 27443232, "step": 21165 }, { "epoch": 1.0343976742481615, "grad_norm": 0.23513391613960266, "learning_rate": 2.270789730224583e-05, "loss": 0.0817, "num_input_tokens_seen": 27449600, "step": 21170 }, { "epoch": 1.034641975911856, "grad_norm": 0.19606347382068634, "learning_rate": 2.2698121352006367e-05, "loss": 0.0657, "num_input_tokens_seen": 27456032, "step": 21175 }, { "epoch": 1.0348862775755503, "grad_norm": 0.3947094678878784, "learning_rate": 2.2688345756745517e-05, "loss": 0.0724, "num_input_tokens_seen": 27462432, "step": 21180 }, { "epoch": 1.0351305792392447, "grad_norm": 0.48234373331069946, "learning_rate": 2.267857051797081e-05, "loss": 0.1073, "num_input_tokens_seen": 27469440, "step": 21185 }, { "epoch": 1.035374880902939, "grad_norm": 0.28418970108032227, "learning_rate": 2.2668795637189695e-05, "loss": 0.062, "num_input_tokens_seen": 27476096, "step": 21190 }, { "epoch": 1.0356191825666332, "grad_norm": 0.3023475706577301, "learning_rate": 2.2659021115909586e-05, "loss": 0.0892, "num_input_tokens_seen": 27482560, "step": 21195 }, { "epoch": 1.0358634842303276, "grad_norm": 0.354014128446579, "learning_rate": 2.2649246955637847e-05, "loss": 0.0742, "num_input_tokens_seen": 27489152, "step": 21200 }, { "epoch": 1.0358634842303276, "eval_loss": 0.08883202821016312, "eval_runtime": 374.6934, "eval_samples_per_second": 97.106, "eval_steps_per_second": 24.279, "num_input_tokens_seen": 27489152, "step": 21200 }, { "epoch": 1.036107785894022, "grad_norm": 0.2760128080844879, "learning_rate": 2.2639473157881766e-05, "loss": 0.0889, "num_input_tokens_seen": 27495008, "step": 21205 }, { "epoch": 1.0363520875577164, "grad_norm": 0.5436946749687195, "learning_rate": 2.2629699724148594e-05, "loss": 0.0973, "num_input_tokens_seen": 27501568, "step": 21210 }, { "epoch": 1.0365963892214105, "grad_norm": 0.3492698073387146, "learning_rate": 2.26199266559455e-05, "loss": 0.0935, "num_input_tokens_seen": 27508032, "step": 21215 }, { "epoch": 1.0368406908851049, "grad_norm": 0.38006123900413513, "learning_rate": 2.2610153954779625e-05, "loss": 0.0992, "num_input_tokens_seen": 27514656, "step": 21220 }, { "epoch": 1.0370849925487993, "grad_norm": 0.12900105118751526, "learning_rate": 2.2600381622158056e-05, "loss": 0.104, "num_input_tokens_seen": 27520608, "step": 21225 }, { "epoch": 1.0373292942124936, "grad_norm": 0.1698274463415146, "learning_rate": 2.2590609659587783e-05, "loss": 0.0638, "num_input_tokens_seen": 27527456, "step": 21230 }, { "epoch": 1.037573595876188, "grad_norm": 0.3193780481815338, "learning_rate": 2.2580838068575787e-05, "loss": 0.0838, "num_input_tokens_seen": 27533888, "step": 21235 }, { "epoch": 1.0378178975398822, "grad_norm": 0.3888212740421295, "learning_rate": 2.257106685062896e-05, "loss": 0.0775, "num_input_tokens_seen": 27540544, "step": 21240 }, { "epoch": 1.0380621992035766, "grad_norm": 0.48625585436820984, "learning_rate": 2.256129600725415e-05, "loss": 0.1081, "num_input_tokens_seen": 27546816, "step": 21245 }, { "epoch": 1.038306500867271, "grad_norm": 0.3134099543094635, "learning_rate": 2.2551525539958145e-05, "loss": 0.0957, "num_input_tokens_seen": 27553408, "step": 21250 }, { "epoch": 1.0385508025309653, "grad_norm": 0.31460443139076233, "learning_rate": 2.2541755450247663e-05, "loss": 0.0855, "num_input_tokens_seen": 27559808, "step": 21255 }, { "epoch": 1.0387951041946595, "grad_norm": 0.22094187140464783, "learning_rate": 2.2531985739629382e-05, "loss": 0.0822, "num_input_tokens_seen": 27566080, "step": 21260 }, { "epoch": 1.0390394058583539, "grad_norm": 0.32232582569122314, "learning_rate": 2.2522216409609924e-05, "loss": 0.0691, "num_input_tokens_seen": 27572672, "step": 21265 }, { "epoch": 1.0392837075220482, "grad_norm": 0.2472890019416809, "learning_rate": 2.2512447461695826e-05, "loss": 0.0833, "num_input_tokens_seen": 27579136, "step": 21270 }, { "epoch": 1.0395280091857426, "grad_norm": 0.3004603981971741, "learning_rate": 2.2502678897393593e-05, "loss": 0.0667, "num_input_tokens_seen": 27585632, "step": 21275 }, { "epoch": 1.039772310849437, "grad_norm": 0.17667947709560394, "learning_rate": 2.2492910718209665e-05, "loss": 0.0743, "num_input_tokens_seen": 27591904, "step": 21280 }, { "epoch": 1.0400166125131312, "grad_norm": 0.26670530438423157, "learning_rate": 2.2483142925650398e-05, "loss": 0.0599, "num_input_tokens_seen": 27598720, "step": 21285 }, { "epoch": 1.0402609141768255, "grad_norm": 0.1728384792804718, "learning_rate": 2.247337552122213e-05, "loss": 0.0694, "num_input_tokens_seen": 27605216, "step": 21290 }, { "epoch": 1.04050521584052, "grad_norm": 0.2496589720249176, "learning_rate": 2.24636085064311e-05, "loss": 0.0697, "num_input_tokens_seen": 27612192, "step": 21295 }, { "epoch": 1.0407495175042143, "grad_norm": 0.1450144350528717, "learning_rate": 2.245384188278351e-05, "loss": 0.1113, "num_input_tokens_seen": 27618720, "step": 21300 }, { "epoch": 1.0409938191679085, "grad_norm": 0.40535587072372437, "learning_rate": 2.2444075651785513e-05, "loss": 0.0705, "num_input_tokens_seen": 27624832, "step": 21305 }, { "epoch": 1.0412381208316028, "grad_norm": 0.4487711787223816, "learning_rate": 2.243430981494316e-05, "loss": 0.093, "num_input_tokens_seen": 27630880, "step": 21310 }, { "epoch": 1.0414824224952972, "grad_norm": 0.16623914241790771, "learning_rate": 2.2424544373762475e-05, "loss": 0.077, "num_input_tokens_seen": 27637664, "step": 21315 }, { "epoch": 1.0417267241589916, "grad_norm": 0.47474217414855957, "learning_rate": 2.2414779329749418e-05, "loss": 0.1022, "num_input_tokens_seen": 27644224, "step": 21320 }, { "epoch": 1.0419710258226857, "grad_norm": 0.2711011469364166, "learning_rate": 2.2405014684409873e-05, "loss": 0.1212, "num_input_tokens_seen": 27650304, "step": 21325 }, { "epoch": 1.0422153274863801, "grad_norm": 0.13555683195590973, "learning_rate": 2.239525043924968e-05, "loss": 0.083, "num_input_tokens_seen": 27657024, "step": 21330 }, { "epoch": 1.0424596291500745, "grad_norm": 0.20340928435325623, "learning_rate": 2.2385486595774592e-05, "loss": 0.0738, "num_input_tokens_seen": 27663360, "step": 21335 }, { "epoch": 1.0427039308137689, "grad_norm": 0.32798853516578674, "learning_rate": 2.237572315549033e-05, "loss": 0.0834, "num_input_tokens_seen": 27670304, "step": 21340 }, { "epoch": 1.0429482324774633, "grad_norm": 0.2637314796447754, "learning_rate": 2.2365960119902545e-05, "loss": 0.1042, "num_input_tokens_seen": 27676416, "step": 21345 }, { "epoch": 1.0431925341411574, "grad_norm": 0.21998363733291626, "learning_rate": 2.2356197490516806e-05, "loss": 0.0807, "num_input_tokens_seen": 27683008, "step": 21350 }, { "epoch": 1.0434368358048518, "grad_norm": 0.45125851035118103, "learning_rate": 2.234643526883863e-05, "loss": 0.0798, "num_input_tokens_seen": 27689536, "step": 21355 }, { "epoch": 1.0436811374685462, "grad_norm": 0.2547217011451721, "learning_rate": 2.2336673456373497e-05, "loss": 0.0883, "num_input_tokens_seen": 27695776, "step": 21360 }, { "epoch": 1.0439254391322406, "grad_norm": 0.17216989398002625, "learning_rate": 2.2326912054626772e-05, "loss": 0.0973, "num_input_tokens_seen": 27702048, "step": 21365 }, { "epoch": 1.0441697407959347, "grad_norm": 0.3143247663974762, "learning_rate": 2.2317151065103813e-05, "loss": 0.0882, "num_input_tokens_seen": 27708640, "step": 21370 }, { "epoch": 1.044414042459629, "grad_norm": 0.5187466740608215, "learning_rate": 2.2307390489309865e-05, "loss": 0.0914, "num_input_tokens_seen": 27714496, "step": 21375 }, { "epoch": 1.0446583441233235, "grad_norm": 0.5000181198120117, "learning_rate": 2.2297630328750146e-05, "loss": 0.104, "num_input_tokens_seen": 27721024, "step": 21380 }, { "epoch": 1.0449026457870179, "grad_norm": 0.2738091051578522, "learning_rate": 2.228787058492979e-05, "loss": 0.1014, "num_input_tokens_seen": 27727584, "step": 21385 }, { "epoch": 1.0451469474507122, "grad_norm": 0.3923007845878601, "learning_rate": 2.2278111259353875e-05, "loss": 0.0674, "num_input_tokens_seen": 27733760, "step": 21390 }, { "epoch": 1.0453912491144064, "grad_norm": 0.2108922302722931, "learning_rate": 2.2268352353527395e-05, "loss": 0.0815, "num_input_tokens_seen": 27740192, "step": 21395 }, { "epoch": 1.0456355507781008, "grad_norm": 0.25527945160865784, "learning_rate": 2.225859386895533e-05, "loss": 0.1052, "num_input_tokens_seen": 27746528, "step": 21400 }, { "epoch": 1.0456355507781008, "eval_loss": 0.08917230367660522, "eval_runtime": 374.8267, "eval_samples_per_second": 97.072, "eval_steps_per_second": 24.27, "num_input_tokens_seen": 27746528, "step": 21400 }, { "epoch": 1.0458798524417952, "grad_norm": 0.49304670095443726, "learning_rate": 2.2248835807142525e-05, "loss": 0.0885, "num_input_tokens_seen": 27753216, "step": 21405 }, { "epoch": 1.0461241541054895, "grad_norm": 0.21920377016067505, "learning_rate": 2.2239078169593826e-05, "loss": 0.1006, "num_input_tokens_seen": 27759936, "step": 21410 }, { "epoch": 1.0463684557691837, "grad_norm": 0.45498228073120117, "learning_rate": 2.222932095781396e-05, "loss": 0.0907, "num_input_tokens_seen": 27766400, "step": 21415 }, { "epoch": 1.046612757432878, "grad_norm": 0.16881738603115082, "learning_rate": 2.221956417330762e-05, "loss": 0.072, "num_input_tokens_seen": 27772704, "step": 21420 }, { "epoch": 1.0468570590965725, "grad_norm": 0.2192085236310959, "learning_rate": 2.2209807817579438e-05, "loss": 0.0997, "num_input_tokens_seen": 27779296, "step": 21425 }, { "epoch": 1.0471013607602668, "grad_norm": 0.13612952828407288, "learning_rate": 2.220005189213394e-05, "loss": 0.0731, "num_input_tokens_seen": 27785408, "step": 21430 }, { "epoch": 1.0473456624239612, "grad_norm": 0.1263706088066101, "learning_rate": 2.2190296398475624e-05, "loss": 0.0782, "num_input_tokens_seen": 27791520, "step": 21435 }, { "epoch": 1.0475899640876554, "grad_norm": 0.19994725286960602, "learning_rate": 2.2180541338108926e-05, "loss": 0.0732, "num_input_tokens_seen": 27798144, "step": 21440 }, { "epoch": 1.0478342657513497, "grad_norm": 0.18471916019916534, "learning_rate": 2.2170786712538176e-05, "loss": 0.0862, "num_input_tokens_seen": 27804768, "step": 21445 }, { "epoch": 1.0480785674150441, "grad_norm": 0.1854570358991623, "learning_rate": 2.216103252326768e-05, "loss": 0.099, "num_input_tokens_seen": 27811264, "step": 21450 }, { "epoch": 1.0483228690787385, "grad_norm": 0.49103015661239624, "learning_rate": 2.2151278771801635e-05, "loss": 0.1203, "num_input_tokens_seen": 27817664, "step": 21455 }, { "epoch": 1.0485671707424327, "grad_norm": 0.2157897800207138, "learning_rate": 2.21415254596442e-05, "loss": 0.0803, "num_input_tokens_seen": 27824384, "step": 21460 }, { "epoch": 1.048811472406127, "grad_norm": 0.3223787546157837, "learning_rate": 2.213177258829947e-05, "loss": 0.0825, "num_input_tokens_seen": 27830848, "step": 21465 }, { "epoch": 1.0490557740698214, "grad_norm": 0.19161203503608704, "learning_rate": 2.2122020159271445e-05, "loss": 0.09, "num_input_tokens_seen": 27837568, "step": 21470 }, { "epoch": 1.0493000757335158, "grad_norm": 0.15355296432971954, "learning_rate": 2.2112268174064075e-05, "loss": 0.0919, "num_input_tokens_seen": 27844224, "step": 21475 }, { "epoch": 1.0495443773972102, "grad_norm": 0.2118130624294281, "learning_rate": 2.2102516634181253e-05, "loss": 0.0674, "num_input_tokens_seen": 27850592, "step": 21480 }, { "epoch": 1.0497886790609043, "grad_norm": 0.3190227746963501, "learning_rate": 2.209276554112677e-05, "loss": 0.0902, "num_input_tokens_seen": 27857376, "step": 21485 }, { "epoch": 1.0500329807245987, "grad_norm": 0.3725196123123169, "learning_rate": 2.2083014896404384e-05, "loss": 0.0964, "num_input_tokens_seen": 27863872, "step": 21490 }, { "epoch": 1.050277282388293, "grad_norm": 0.45315682888031006, "learning_rate": 2.207326470151775e-05, "loss": 0.1046, "num_input_tokens_seen": 27870272, "step": 21495 }, { "epoch": 1.0505215840519875, "grad_norm": 0.15807323157787323, "learning_rate": 2.2063514957970477e-05, "loss": 0.0805, "num_input_tokens_seen": 27876800, "step": 21500 }, { "epoch": 1.0507658857156816, "grad_norm": 0.2871948480606079, "learning_rate": 2.205376566726611e-05, "loss": 0.0812, "num_input_tokens_seen": 27883584, "step": 21505 }, { "epoch": 1.051010187379376, "grad_norm": 0.09779278934001923, "learning_rate": 2.204401683090809e-05, "loss": 0.1092, "num_input_tokens_seen": 27891072, "step": 21510 }, { "epoch": 1.0512544890430704, "grad_norm": 0.5916316509246826, "learning_rate": 2.203426845039982e-05, "loss": 0.0755, "num_input_tokens_seen": 27898080, "step": 21515 }, { "epoch": 1.0514987907067648, "grad_norm": 0.2675863206386566, "learning_rate": 2.202452052724464e-05, "loss": 0.0767, "num_input_tokens_seen": 27904768, "step": 21520 }, { "epoch": 1.051743092370459, "grad_norm": 1.3166316747665405, "learning_rate": 2.2014773062945777e-05, "loss": 0.123, "num_input_tokens_seen": 27911104, "step": 21525 }, { "epoch": 1.0519873940341533, "grad_norm": 0.24292226135730743, "learning_rate": 2.2005026059006427e-05, "loss": 0.0766, "num_input_tokens_seen": 27917376, "step": 21530 }, { "epoch": 1.0522316956978477, "grad_norm": 0.39117178320884705, "learning_rate": 2.1995279516929695e-05, "loss": 0.0916, "num_input_tokens_seen": 27924064, "step": 21535 }, { "epoch": 1.052475997361542, "grad_norm": 0.11561793833971024, "learning_rate": 2.1985533438218613e-05, "loss": 0.0871, "num_input_tokens_seen": 27931264, "step": 21540 }, { "epoch": 1.0527202990252365, "grad_norm": 0.17804086208343506, "learning_rate": 2.197578782437617e-05, "loss": 0.0869, "num_input_tokens_seen": 27937568, "step": 21545 }, { "epoch": 1.0529646006889306, "grad_norm": 0.18314409255981445, "learning_rate": 2.196604267690524e-05, "loss": 0.0926, "num_input_tokens_seen": 27944128, "step": 21550 }, { "epoch": 1.053208902352625, "grad_norm": 0.6189883351325989, "learning_rate": 2.195629799730865e-05, "loss": 0.1056, "num_input_tokens_seen": 27950560, "step": 21555 }, { "epoch": 1.0534532040163194, "grad_norm": 0.3800612688064575, "learning_rate": 2.1946553787089173e-05, "loss": 0.0926, "num_input_tokens_seen": 27957152, "step": 21560 }, { "epoch": 1.0536975056800137, "grad_norm": 0.1351679563522339, "learning_rate": 2.193681004774947e-05, "loss": 0.0784, "num_input_tokens_seen": 27963776, "step": 21565 }, { "epoch": 1.053941807343708, "grad_norm": 0.32011500000953674, "learning_rate": 2.1927066780792154e-05, "loss": 0.0992, "num_input_tokens_seen": 27970272, "step": 21570 }, { "epoch": 1.0541861090074023, "grad_norm": 0.24438033998012543, "learning_rate": 2.191732398771975e-05, "loss": 0.0769, "num_input_tokens_seen": 27976544, "step": 21575 }, { "epoch": 1.0544304106710967, "grad_norm": 0.20201453566551208, "learning_rate": 2.1907581670034725e-05, "loss": 0.0775, "num_input_tokens_seen": 27982688, "step": 21580 }, { "epoch": 1.054674712334791, "grad_norm": 0.2936769723892212, "learning_rate": 2.189783982923948e-05, "loss": 0.0818, "num_input_tokens_seen": 27989216, "step": 21585 }, { "epoch": 1.0549190139984854, "grad_norm": 0.40865299105644226, "learning_rate": 2.1888098466836303e-05, "loss": 0.0725, "num_input_tokens_seen": 27996032, "step": 21590 }, { "epoch": 1.0551633156621796, "grad_norm": 0.3171561062335968, "learning_rate": 2.1878357584327457e-05, "loss": 0.1096, "num_input_tokens_seen": 28002528, "step": 21595 }, { "epoch": 1.055407617325874, "grad_norm": 0.24529777467250824, "learning_rate": 2.1868617183215103e-05, "loss": 0.0898, "num_input_tokens_seen": 28009568, "step": 21600 }, { "epoch": 1.055407617325874, "eval_loss": 0.08843624591827393, "eval_runtime": 374.5321, "eval_samples_per_second": 97.148, "eval_steps_per_second": 24.289, "num_input_tokens_seen": 28009568, "step": 21600 }, { "epoch": 1.0556519189895683, "grad_norm": 0.39267024397850037, "learning_rate": 2.1858877265001327e-05, "loss": 0.0867, "num_input_tokens_seen": 28016320, "step": 21605 }, { "epoch": 1.0558962206532627, "grad_norm": 0.37235406041145325, "learning_rate": 2.184913783118816e-05, "loss": 0.0707, "num_input_tokens_seen": 28023168, "step": 21610 }, { "epoch": 1.0561405223169569, "grad_norm": 0.18303769826889038, "learning_rate": 2.1839398883277522e-05, "loss": 0.0898, "num_input_tokens_seen": 28029408, "step": 21615 }, { "epoch": 1.0563848239806513, "grad_norm": 0.22163330018520355, "learning_rate": 2.182966042277129e-05, "loss": 0.0819, "num_input_tokens_seen": 28035744, "step": 21620 }, { "epoch": 1.0566291256443456, "grad_norm": 0.19708897173404694, "learning_rate": 2.181992245117128e-05, "loss": 0.0816, "num_input_tokens_seen": 28042784, "step": 21625 }, { "epoch": 1.05687342730804, "grad_norm": 0.18999414145946503, "learning_rate": 2.181018496997918e-05, "loss": 0.0828, "num_input_tokens_seen": 28049504, "step": 21630 }, { "epoch": 1.0571177289717344, "grad_norm": 0.25935783982276917, "learning_rate": 2.1800447980696648e-05, "loss": 0.0995, "num_input_tokens_seen": 28055936, "step": 21635 }, { "epoch": 1.0573620306354286, "grad_norm": 0.2266811728477478, "learning_rate": 2.1790711484825248e-05, "loss": 0.0924, "num_input_tokens_seen": 28062496, "step": 21640 }, { "epoch": 1.057606332299123, "grad_norm": 0.3300866484642029, "learning_rate": 2.178097548386646e-05, "loss": 0.0763, "num_input_tokens_seen": 28069632, "step": 21645 }, { "epoch": 1.0578506339628173, "grad_norm": 0.10738052427768707, "learning_rate": 2.1771239979321712e-05, "loss": 0.0835, "num_input_tokens_seen": 28075552, "step": 21650 }, { "epoch": 1.0580949356265117, "grad_norm": 0.6716892719268799, "learning_rate": 2.1761504972692327e-05, "loss": 0.0901, "num_input_tokens_seen": 28082016, "step": 21655 }, { "epoch": 1.0583392372902058, "grad_norm": 0.20878008008003235, "learning_rate": 2.1751770465479572e-05, "loss": 0.0885, "num_input_tokens_seen": 28088704, "step": 21660 }, { "epoch": 1.0585835389539002, "grad_norm": 0.4624391496181488, "learning_rate": 2.174203645918464e-05, "loss": 0.0831, "num_input_tokens_seen": 28094752, "step": 21665 }, { "epoch": 1.0588278406175946, "grad_norm": 0.5655032992362976, "learning_rate": 2.1732302955308624e-05, "loss": 0.0869, "num_input_tokens_seen": 28101312, "step": 21670 }, { "epoch": 1.059072142281289, "grad_norm": 0.1875850409269333, "learning_rate": 2.172256995535255e-05, "loss": 0.1011, "num_input_tokens_seen": 28107744, "step": 21675 }, { "epoch": 1.0593164439449834, "grad_norm": 0.4783015847206116, "learning_rate": 2.171283746081739e-05, "loss": 0.0811, "num_input_tokens_seen": 28114368, "step": 21680 }, { "epoch": 1.0595607456086775, "grad_norm": 0.13355202972888947, "learning_rate": 2.1703105473203988e-05, "loss": 0.0615, "num_input_tokens_seen": 28120640, "step": 21685 }, { "epoch": 1.059805047272372, "grad_norm": 0.1872178167104721, "learning_rate": 2.1693373994013168e-05, "loss": 0.0578, "num_input_tokens_seen": 28127232, "step": 21690 }, { "epoch": 1.0600493489360663, "grad_norm": 0.4565977156162262, "learning_rate": 2.168364302474562e-05, "loss": 0.0742, "num_input_tokens_seen": 28133664, "step": 21695 }, { "epoch": 1.0602936505997607, "grad_norm": 0.2071342170238495, "learning_rate": 2.167391256690199e-05, "loss": 0.0948, "num_input_tokens_seen": 28140576, "step": 21700 }, { "epoch": 1.0605379522634548, "grad_norm": 0.43504270911216736, "learning_rate": 2.1664182621982855e-05, "loss": 0.0754, "num_input_tokens_seen": 28146848, "step": 21705 }, { "epoch": 1.0607822539271492, "grad_norm": 0.28970324993133545, "learning_rate": 2.1654453191488673e-05, "loss": 0.1089, "num_input_tokens_seen": 28153152, "step": 21710 }, { "epoch": 1.0610265555908436, "grad_norm": 0.22637200355529785, "learning_rate": 2.1644724276919846e-05, "loss": 0.0695, "num_input_tokens_seen": 28160000, "step": 21715 }, { "epoch": 1.061270857254538, "grad_norm": 0.5109673142433167, "learning_rate": 2.1634995879776715e-05, "loss": 0.0869, "num_input_tokens_seen": 28166944, "step": 21720 }, { "epoch": 1.0615151589182323, "grad_norm": 0.20392580330371857, "learning_rate": 2.162526800155949e-05, "loss": 0.0689, "num_input_tokens_seen": 28173824, "step": 21725 }, { "epoch": 1.0617594605819265, "grad_norm": 0.3977251648902893, "learning_rate": 2.1615540643768363e-05, "loss": 0.1063, "num_input_tokens_seen": 28180064, "step": 21730 }, { "epoch": 1.0620037622456209, "grad_norm": 0.14110714197158813, "learning_rate": 2.160581380790339e-05, "loss": 0.1015, "num_input_tokens_seen": 28186816, "step": 21735 }, { "epoch": 1.0622480639093153, "grad_norm": 0.15808701515197754, "learning_rate": 2.1596087495464586e-05, "loss": 0.0494, "num_input_tokens_seen": 28193440, "step": 21740 }, { "epoch": 1.0624923655730096, "grad_norm": 0.2680657207965851, "learning_rate": 2.1586361707951866e-05, "loss": 0.086, "num_input_tokens_seen": 28200032, "step": 21745 }, { "epoch": 1.0627366672367038, "grad_norm": 0.4477308988571167, "learning_rate": 2.157663644686507e-05, "loss": 0.0931, "num_input_tokens_seen": 28206304, "step": 21750 }, { "epoch": 1.0629809689003982, "grad_norm": 0.28219330310821533, "learning_rate": 2.156691171370396e-05, "loss": 0.0764, "num_input_tokens_seen": 28212800, "step": 21755 }, { "epoch": 1.0632252705640926, "grad_norm": 0.15799224376678467, "learning_rate": 2.1557187509968195e-05, "loss": 0.1086, "num_input_tokens_seen": 28219200, "step": 21760 }, { "epoch": 1.063469572227787, "grad_norm": 0.11284436285495758, "learning_rate": 2.1547463837157382e-05, "loss": 0.0768, "num_input_tokens_seen": 28225472, "step": 21765 }, { "epoch": 1.0637138738914813, "grad_norm": 0.5406661033630371, "learning_rate": 2.1537740696771045e-05, "loss": 0.1105, "num_input_tokens_seen": 28232064, "step": 21770 }, { "epoch": 1.0639581755551755, "grad_norm": 0.24233438074588776, "learning_rate": 2.1528018090308587e-05, "loss": 0.1039, "num_input_tokens_seen": 28238528, "step": 21775 }, { "epoch": 1.0642024772188698, "grad_norm": 0.17851752042770386, "learning_rate": 2.151829601926938e-05, "loss": 0.0918, "num_input_tokens_seen": 28245120, "step": 21780 }, { "epoch": 1.0644467788825642, "grad_norm": 0.4257485866546631, "learning_rate": 2.1508574485152684e-05, "loss": 0.0825, "num_input_tokens_seen": 28251328, "step": 21785 }, { "epoch": 1.0646910805462586, "grad_norm": 0.6733453869819641, "learning_rate": 2.1498853489457667e-05, "loss": 0.1045, "num_input_tokens_seen": 28257696, "step": 21790 }, { "epoch": 1.0649353822099528, "grad_norm": 0.6580775380134583, "learning_rate": 2.1489133033683455e-05, "loss": 0.0948, "num_input_tokens_seen": 28264256, "step": 21795 }, { "epoch": 1.0651796838736471, "grad_norm": 0.16439837217330933, "learning_rate": 2.1479413119329038e-05, "loss": 0.0782, "num_input_tokens_seen": 28270592, "step": 21800 }, { "epoch": 1.0651796838736471, "eval_loss": 0.08862791955471039, "eval_runtime": 375.2125, "eval_samples_per_second": 96.972, "eval_steps_per_second": 24.245, "num_input_tokens_seen": 28270592, "step": 21800 }, { "epoch": 1.0654239855373415, "grad_norm": 0.25483036041259766, "learning_rate": 2.1469693747893355e-05, "loss": 0.086, "num_input_tokens_seen": 28277632, "step": 21805 }, { "epoch": 1.065668287201036, "grad_norm": 0.3828031122684479, "learning_rate": 2.1459974920875274e-05, "loss": 0.0768, "num_input_tokens_seen": 28283872, "step": 21810 }, { "epoch": 1.0659125888647303, "grad_norm": 0.14066748321056366, "learning_rate": 2.145025663977354e-05, "loss": 0.0718, "num_input_tokens_seen": 28290112, "step": 21815 }, { "epoch": 1.0661568905284244, "grad_norm": 0.2760370969772339, "learning_rate": 2.1440538906086844e-05, "loss": 0.0971, "num_input_tokens_seen": 28296640, "step": 21820 }, { "epoch": 1.0664011921921188, "grad_norm": 0.14014586806297302, "learning_rate": 2.1430821721313782e-05, "loss": 0.0862, "num_input_tokens_seen": 28303520, "step": 21825 }, { "epoch": 1.0666454938558132, "grad_norm": 0.4913872182369232, "learning_rate": 2.142110508695286e-05, "loss": 0.0868, "num_input_tokens_seen": 28309824, "step": 21830 }, { "epoch": 1.0668897955195076, "grad_norm": 0.1435028314590454, "learning_rate": 2.1411389004502515e-05, "loss": 0.1121, "num_input_tokens_seen": 28316256, "step": 21835 }, { "epoch": 1.0671340971832017, "grad_norm": 0.8010074496269226, "learning_rate": 2.140167347546107e-05, "loss": 0.0753, "num_input_tokens_seen": 28323040, "step": 21840 }, { "epoch": 1.0673783988468961, "grad_norm": 0.24892540276050568, "learning_rate": 2.1391958501326793e-05, "loss": 0.0846, "num_input_tokens_seen": 28329824, "step": 21845 }, { "epoch": 1.0676227005105905, "grad_norm": 0.2956691086292267, "learning_rate": 2.1382244083597873e-05, "loss": 0.0915, "num_input_tokens_seen": 28335968, "step": 21850 }, { "epoch": 1.0678670021742849, "grad_norm": 0.3910670876502991, "learning_rate": 2.137253022377237e-05, "loss": 0.1031, "num_input_tokens_seen": 28343008, "step": 21855 }, { "epoch": 1.0681113038379793, "grad_norm": 0.3522753119468689, "learning_rate": 2.136281692334829e-05, "loss": 0.0884, "num_input_tokens_seen": 28349760, "step": 21860 }, { "epoch": 1.0683556055016734, "grad_norm": 0.6314899325370789, "learning_rate": 2.135310418382356e-05, "loss": 0.0813, "num_input_tokens_seen": 28355840, "step": 21865 }, { "epoch": 1.0685999071653678, "grad_norm": 0.15495635569095612, "learning_rate": 2.134339200669598e-05, "loss": 0.0729, "num_input_tokens_seen": 28362016, "step": 21870 }, { "epoch": 1.0688442088290622, "grad_norm": 0.37343692779541016, "learning_rate": 2.133368039346331e-05, "loss": 0.0852, "num_input_tokens_seen": 28368512, "step": 21875 }, { "epoch": 1.0690885104927565, "grad_norm": 0.45633628964424133, "learning_rate": 2.1323969345623195e-05, "loss": 0.0759, "num_input_tokens_seen": 28374976, "step": 21880 }, { "epoch": 1.0693328121564507, "grad_norm": 0.20756034553050995, "learning_rate": 2.1314258864673207e-05, "loss": 0.0873, "num_input_tokens_seen": 28381792, "step": 21885 }, { "epoch": 1.069577113820145, "grad_norm": 0.49969977140426636, "learning_rate": 2.130454895211082e-05, "loss": 0.0928, "num_input_tokens_seen": 28388128, "step": 21890 }, { "epoch": 1.0698214154838395, "grad_norm": 0.4395589828491211, "learning_rate": 2.129483960943342e-05, "loss": 0.0696, "num_input_tokens_seen": 28394720, "step": 21895 }, { "epoch": 1.0700657171475338, "grad_norm": 0.5667276978492737, "learning_rate": 2.128513083813831e-05, "loss": 0.1011, "num_input_tokens_seen": 28401600, "step": 21900 }, { "epoch": 1.070310018811228, "grad_norm": 0.41021376848220825, "learning_rate": 2.1275422639722724e-05, "loss": 0.0848, "num_input_tokens_seen": 28408160, "step": 21905 }, { "epoch": 1.0705543204749224, "grad_norm": 0.34558144211769104, "learning_rate": 2.126571501568376e-05, "loss": 0.0914, "num_input_tokens_seen": 28414528, "step": 21910 }, { "epoch": 1.0707986221386168, "grad_norm": 0.2132580280303955, "learning_rate": 2.1256007967518478e-05, "loss": 0.0928, "num_input_tokens_seen": 28420864, "step": 21915 }, { "epoch": 1.0710429238023111, "grad_norm": 0.2800518572330475, "learning_rate": 2.124630149672381e-05, "loss": 0.101, "num_input_tokens_seen": 28427776, "step": 21920 }, { "epoch": 1.0712872254660055, "grad_norm": 0.3312361240386963, "learning_rate": 2.1236595604796624e-05, "loss": 0.1016, "num_input_tokens_seen": 28434176, "step": 21925 }, { "epoch": 1.0715315271296997, "grad_norm": 0.40045028924942017, "learning_rate": 2.1226890293233693e-05, "loss": 0.1043, "num_input_tokens_seen": 28440896, "step": 21930 }, { "epoch": 1.071775828793394, "grad_norm": 0.4138928949832916, "learning_rate": 2.1217185563531694e-05, "loss": 0.091, "num_input_tokens_seen": 28447584, "step": 21935 }, { "epoch": 1.0720201304570884, "grad_norm": 0.5143356919288635, "learning_rate": 2.120748141718721e-05, "loss": 0.0821, "num_input_tokens_seen": 28454016, "step": 21940 }, { "epoch": 1.0722644321207828, "grad_norm": 0.3923596739768982, "learning_rate": 2.1197777855696765e-05, "loss": 0.1066, "num_input_tokens_seen": 28460736, "step": 21945 }, { "epoch": 1.072508733784477, "grad_norm": 0.17845825850963593, "learning_rate": 2.1188074880556746e-05, "loss": 0.0774, "num_input_tokens_seen": 28467424, "step": 21950 }, { "epoch": 1.0727530354481714, "grad_norm": 1.3065555095672607, "learning_rate": 2.1178372493263495e-05, "loss": 0.1094, "num_input_tokens_seen": 28473920, "step": 21955 }, { "epoch": 1.0729973371118657, "grad_norm": 0.19873648881912231, "learning_rate": 2.116867069531322e-05, "loss": 0.1006, "num_input_tokens_seen": 28480576, "step": 21960 }, { "epoch": 1.0732416387755601, "grad_norm": 0.22857235372066498, "learning_rate": 2.1158969488202073e-05, "loss": 0.0886, "num_input_tokens_seen": 28486720, "step": 21965 }, { "epoch": 1.0734859404392545, "grad_norm": 0.32495996356010437, "learning_rate": 2.114926887342611e-05, "loss": 0.0925, "num_input_tokens_seen": 28493696, "step": 21970 }, { "epoch": 1.0737302421029487, "grad_norm": 0.4152905344963074, "learning_rate": 2.113956885248127e-05, "loss": 0.096, "num_input_tokens_seen": 28499968, "step": 21975 }, { "epoch": 1.073974543766643, "grad_norm": 0.4973906874656677, "learning_rate": 2.112986942686342e-05, "loss": 0.0616, "num_input_tokens_seen": 28506976, "step": 21980 }, { "epoch": 1.0742188454303374, "grad_norm": 0.2735450267791748, "learning_rate": 2.112017059806835e-05, "loss": 0.0652, "num_input_tokens_seen": 28513792, "step": 21985 }, { "epoch": 1.0744631470940318, "grad_norm": 0.2877542972564697, "learning_rate": 2.1110472367591724e-05, "loss": 0.082, "num_input_tokens_seen": 28520608, "step": 21990 }, { "epoch": 1.074707448757726, "grad_norm": 0.21561555564403534, "learning_rate": 2.1100774736929145e-05, "loss": 0.0849, "num_input_tokens_seen": 28526944, "step": 21995 }, { "epoch": 1.0749517504214203, "grad_norm": 0.2480669766664505, "learning_rate": 2.10910777075761e-05, "loss": 0.0578, "num_input_tokens_seen": 28533952, "step": 22000 }, { "epoch": 1.0749517504214203, "eval_loss": 0.08870303630828857, "eval_runtime": 374.9351, "eval_samples_per_second": 97.043, "eval_steps_per_second": 24.263, "num_input_tokens_seen": 28533952, "step": 22000 }, { "epoch": 1.0751960520851147, "grad_norm": 0.4086638391017914, "learning_rate": 2.108138128102799e-05, "loss": 0.0637, "num_input_tokens_seen": 28540576, "step": 22005 }, { "epoch": 1.075440353748809, "grad_norm": 0.35270360112190247, "learning_rate": 2.107168545878014e-05, "loss": 0.1034, "num_input_tokens_seen": 28547264, "step": 22010 }, { "epoch": 1.0756846554125032, "grad_norm": 0.281549334526062, "learning_rate": 2.106199024232775e-05, "loss": 0.072, "num_input_tokens_seen": 28553120, "step": 22015 }, { "epoch": 1.0759289570761976, "grad_norm": 0.2720888555049896, "learning_rate": 2.105229563316595e-05, "loss": 0.0846, "num_input_tokens_seen": 28559104, "step": 22020 }, { "epoch": 1.076173258739892, "grad_norm": 0.41770878434181213, "learning_rate": 2.1042601632789784e-05, "loss": 0.0748, "num_input_tokens_seen": 28565440, "step": 22025 }, { "epoch": 1.0764175604035864, "grad_norm": 0.16354134678840637, "learning_rate": 2.103290824269417e-05, "loss": 0.0828, "num_input_tokens_seen": 28572000, "step": 22030 }, { "epoch": 1.0766618620672808, "grad_norm": 0.22635917365550995, "learning_rate": 2.1023215464373965e-05, "loss": 0.071, "num_input_tokens_seen": 28578784, "step": 22035 }, { "epoch": 1.076906163730975, "grad_norm": 0.5372456312179565, "learning_rate": 2.1013523299323908e-05, "loss": 0.082, "num_input_tokens_seen": 28585120, "step": 22040 }, { "epoch": 1.0771504653946693, "grad_norm": 0.39445924758911133, "learning_rate": 2.1003831749038654e-05, "loss": 0.0739, "num_input_tokens_seen": 28591584, "step": 22045 }, { "epoch": 1.0773947670583637, "grad_norm": 0.16020257771015167, "learning_rate": 2.099414081501277e-05, "loss": 0.0879, "num_input_tokens_seen": 28597792, "step": 22050 }, { "epoch": 1.077639068722058, "grad_norm": 0.17771969735622406, "learning_rate": 2.09844504987407e-05, "loss": 0.0909, "num_input_tokens_seen": 28604064, "step": 22055 }, { "epoch": 1.0778833703857522, "grad_norm": 0.1981746256351471, "learning_rate": 2.097476080171683e-05, "loss": 0.0824, "num_input_tokens_seen": 28610336, "step": 22060 }, { "epoch": 1.0781276720494466, "grad_norm": 0.42490825057029724, "learning_rate": 2.0965071725435436e-05, "loss": 0.083, "num_input_tokens_seen": 28617344, "step": 22065 }, { "epoch": 1.078371973713141, "grad_norm": 0.5582310557365417, "learning_rate": 2.0955383271390684e-05, "loss": 0.084, "num_input_tokens_seen": 28623296, "step": 22070 }, { "epoch": 1.0786162753768354, "grad_norm": 0.19147932529449463, "learning_rate": 2.094569544107666e-05, "loss": 0.0677, "num_input_tokens_seen": 28629952, "step": 22075 }, { "epoch": 1.0788605770405297, "grad_norm": 0.21318936347961426, "learning_rate": 2.093600823598735e-05, "loss": 0.1056, "num_input_tokens_seen": 28636192, "step": 22080 }, { "epoch": 1.079104878704224, "grad_norm": 0.5448675751686096, "learning_rate": 2.092632165761663e-05, "loss": 0.0852, "num_input_tokens_seen": 28642816, "step": 22085 }, { "epoch": 1.0793491803679183, "grad_norm": 0.15587414801120758, "learning_rate": 2.091663570745832e-05, "loss": 0.0812, "num_input_tokens_seen": 28648672, "step": 22090 }, { "epoch": 1.0795934820316127, "grad_norm": 0.1428956389427185, "learning_rate": 2.0906950387006086e-05, "loss": 0.1242, "num_input_tokens_seen": 28654784, "step": 22095 }, { "epoch": 1.079837783695307, "grad_norm": 0.1572774052619934, "learning_rate": 2.0897265697753543e-05, "loss": 0.0907, "num_input_tokens_seen": 28661376, "step": 22100 }, { "epoch": 1.0800820853590012, "grad_norm": 0.2568388283252716, "learning_rate": 2.088758164119419e-05, "loss": 0.0883, "num_input_tokens_seen": 28667712, "step": 22105 }, { "epoch": 1.0803263870226956, "grad_norm": 0.1809830665588379, "learning_rate": 2.0877898218821428e-05, "loss": 0.0848, "num_input_tokens_seen": 28674208, "step": 22110 }, { "epoch": 1.08057068868639, "grad_norm": 0.47123217582702637, "learning_rate": 2.0868215432128565e-05, "loss": 0.0925, "num_input_tokens_seen": 28680736, "step": 22115 }, { "epoch": 1.0808149903500843, "grad_norm": 0.2559930086135864, "learning_rate": 2.0858533282608796e-05, "loss": 0.0795, "num_input_tokens_seen": 28686848, "step": 22120 }, { "epoch": 1.0810592920137787, "grad_norm": 0.2065904140472412, "learning_rate": 2.084885177175524e-05, "loss": 0.1066, "num_input_tokens_seen": 28692768, "step": 22125 }, { "epoch": 1.0813035936774729, "grad_norm": 0.27943921089172363, "learning_rate": 2.0839170901060917e-05, "loss": 0.0767, "num_input_tokens_seen": 28699072, "step": 22130 }, { "epoch": 1.0815478953411672, "grad_norm": 0.5748248100280762, "learning_rate": 2.082949067201872e-05, "loss": 0.0856, "num_input_tokens_seen": 28705376, "step": 22135 }, { "epoch": 1.0817921970048616, "grad_norm": 0.11612728983163834, "learning_rate": 2.0819811086121475e-05, "loss": 0.0917, "num_input_tokens_seen": 28711808, "step": 22140 }, { "epoch": 1.082036498668556, "grad_norm": 0.18058396875858307, "learning_rate": 2.08101321448619e-05, "loss": 0.0987, "num_input_tokens_seen": 28717920, "step": 22145 }, { "epoch": 1.0822808003322502, "grad_norm": 0.6127630472183228, "learning_rate": 2.080045384973259e-05, "loss": 0.1145, "num_input_tokens_seen": 28724128, "step": 22150 }, { "epoch": 1.0825251019959445, "grad_norm": 0.39687803387641907, "learning_rate": 2.0790776202226082e-05, "loss": 0.0955, "num_input_tokens_seen": 28730304, "step": 22155 }, { "epoch": 1.082769403659639, "grad_norm": 0.3624953627586365, "learning_rate": 2.078109920383477e-05, "loss": 0.081, "num_input_tokens_seen": 28737440, "step": 22160 }, { "epoch": 1.0830137053233333, "grad_norm": 0.20047296583652496, "learning_rate": 2.0771422856050978e-05, "loss": 0.0776, "num_input_tokens_seen": 28743840, "step": 22165 }, { "epoch": 1.0832580069870277, "grad_norm": 0.27752965688705444, "learning_rate": 2.076174716036693e-05, "loss": 0.09, "num_input_tokens_seen": 28750048, "step": 22170 }, { "epoch": 1.0835023086507218, "grad_norm": 0.14754453301429749, "learning_rate": 2.075207211827472e-05, "loss": 0.0887, "num_input_tokens_seen": 28756704, "step": 22175 }, { "epoch": 1.0837466103144162, "grad_norm": 0.7635356783866882, "learning_rate": 2.074239773126638e-05, "loss": 0.0983, "num_input_tokens_seen": 28762912, "step": 22180 }, { "epoch": 1.0839909119781106, "grad_norm": 0.46855565905570984, "learning_rate": 2.073272400083382e-05, "loss": 0.099, "num_input_tokens_seen": 28769152, "step": 22185 }, { "epoch": 1.084235213641805, "grad_norm": 0.42233046889305115, "learning_rate": 2.072305092846883e-05, "loss": 0.0567, "num_input_tokens_seen": 28775744, "step": 22190 }, { "epoch": 1.0844795153054991, "grad_norm": 0.1587917059659958, "learning_rate": 2.0713378515663152e-05, "loss": 0.0947, "num_input_tokens_seen": 28782080, "step": 22195 }, { "epoch": 1.0847238169691935, "grad_norm": 0.38974276185035706, "learning_rate": 2.070370676390836e-05, "loss": 0.0975, "num_input_tokens_seen": 28788352, "step": 22200 }, { "epoch": 1.0847238169691935, "eval_loss": 0.08833634108304977, "eval_runtime": 374.4758, "eval_samples_per_second": 97.162, "eval_steps_per_second": 24.293, "num_input_tokens_seen": 28788352, "step": 22200 }, { "epoch": 1.084968118632888, "grad_norm": 0.25074759125709534, "learning_rate": 2.0694035674695974e-05, "loss": 0.0728, "num_input_tokens_seen": 28795072, "step": 22205 }, { "epoch": 1.0852124202965823, "grad_norm": 0.12843671441078186, "learning_rate": 2.0684365249517416e-05, "loss": 0.0848, "num_input_tokens_seen": 28801824, "step": 22210 }, { "epoch": 1.0854567219602766, "grad_norm": 0.247460275888443, "learning_rate": 2.067469548986396e-05, "loss": 0.1032, "num_input_tokens_seen": 28808480, "step": 22215 }, { "epoch": 1.0857010236239708, "grad_norm": 0.29682278633117676, "learning_rate": 2.066502639722681e-05, "loss": 0.0681, "num_input_tokens_seen": 28815328, "step": 22220 }, { "epoch": 1.0859453252876652, "grad_norm": 0.43275606632232666, "learning_rate": 2.065535797309708e-05, "loss": 0.106, "num_input_tokens_seen": 28821728, "step": 22225 }, { "epoch": 1.0861896269513596, "grad_norm": 1.563659906387329, "learning_rate": 2.0645690218965736e-05, "loss": 0.1057, "num_input_tokens_seen": 28828192, "step": 22230 }, { "epoch": 1.086433928615054, "grad_norm": 0.19085481762886047, "learning_rate": 2.063602313632369e-05, "loss": 0.0795, "num_input_tokens_seen": 28834272, "step": 22235 }, { "epoch": 1.086678230278748, "grad_norm": 0.8001002073287964, "learning_rate": 2.0626356726661704e-05, "loss": 0.0924, "num_input_tokens_seen": 28840384, "step": 22240 }, { "epoch": 1.0869225319424425, "grad_norm": 0.21310482919216156, "learning_rate": 2.0616690991470477e-05, "loss": 0.0928, "num_input_tokens_seen": 28846816, "step": 22245 }, { "epoch": 1.0871668336061369, "grad_norm": 0.2938551604747772, "learning_rate": 2.0607025932240595e-05, "loss": 0.0674, "num_input_tokens_seen": 28853024, "step": 22250 }, { "epoch": 1.0874111352698312, "grad_norm": 0.4396660625934601, "learning_rate": 2.059736155046251e-05, "loss": 0.0842, "num_input_tokens_seen": 28859552, "step": 22255 }, { "epoch": 1.0876554369335256, "grad_norm": 0.43241599202156067, "learning_rate": 2.0587697847626603e-05, "loss": 0.1155, "num_input_tokens_seen": 28866720, "step": 22260 }, { "epoch": 1.0878997385972198, "grad_norm": 0.19146504998207092, "learning_rate": 2.057803482522314e-05, "loss": 0.0622, "num_input_tokens_seen": 28873280, "step": 22265 }, { "epoch": 1.0881440402609142, "grad_norm": 0.6990851163864136, "learning_rate": 2.056837248474227e-05, "loss": 0.0718, "num_input_tokens_seen": 28880128, "step": 22270 }, { "epoch": 1.0883883419246085, "grad_norm": 0.27386629581451416, "learning_rate": 2.0558710827674064e-05, "loss": 0.0845, "num_input_tokens_seen": 28886720, "step": 22275 }, { "epoch": 1.088632643588303, "grad_norm": 0.2527279555797577, "learning_rate": 2.054904985550845e-05, "loss": 0.0858, "num_input_tokens_seen": 28893088, "step": 22280 }, { "epoch": 1.088876945251997, "grad_norm": 0.162306010723114, "learning_rate": 2.0539389569735287e-05, "loss": 0.071, "num_input_tokens_seen": 28899904, "step": 22285 }, { "epoch": 1.0891212469156915, "grad_norm": 0.42290931940078735, "learning_rate": 2.052972997184431e-05, "loss": 0.1082, "num_input_tokens_seen": 28906176, "step": 22290 }, { "epoch": 1.0893655485793858, "grad_norm": 0.3036128878593445, "learning_rate": 2.0520071063325146e-05, "loss": 0.0842, "num_input_tokens_seen": 28912416, "step": 22295 }, { "epoch": 1.0896098502430802, "grad_norm": 0.22108592092990875, "learning_rate": 2.051041284566732e-05, "loss": 0.0877, "num_input_tokens_seen": 28918976, "step": 22300 }, { "epoch": 1.0898541519067746, "grad_norm": 0.3491162061691284, "learning_rate": 2.050075532036026e-05, "loss": 0.0743, "num_input_tokens_seen": 28925056, "step": 22305 }, { "epoch": 1.0900984535704688, "grad_norm": 0.7739551067352295, "learning_rate": 2.0491098488893264e-05, "loss": 0.1076, "num_input_tokens_seen": 28931520, "step": 22310 }, { "epoch": 1.0903427552341631, "grad_norm": 0.32782992720603943, "learning_rate": 2.0481442352755546e-05, "loss": 0.0765, "num_input_tokens_seen": 28937760, "step": 22315 }, { "epoch": 1.0905870568978575, "grad_norm": 0.16854310035705566, "learning_rate": 2.0471786913436198e-05, "loss": 0.0959, "num_input_tokens_seen": 28943872, "step": 22320 }, { "epoch": 1.090831358561552, "grad_norm": 0.7802807092666626, "learning_rate": 2.0462132172424218e-05, "loss": 0.1106, "num_input_tokens_seen": 28950112, "step": 22325 }, { "epoch": 1.091075660225246, "grad_norm": 0.4176125228404999, "learning_rate": 2.0452478131208484e-05, "loss": 0.0796, "num_input_tokens_seen": 28956640, "step": 22330 }, { "epoch": 1.0913199618889404, "grad_norm": 0.1436525583267212, "learning_rate": 2.0442824791277765e-05, "loss": 0.0645, "num_input_tokens_seen": 28963136, "step": 22335 }, { "epoch": 1.0915642635526348, "grad_norm": 0.19698646664619446, "learning_rate": 2.0433172154120727e-05, "loss": 0.0878, "num_input_tokens_seen": 28969376, "step": 22340 }, { "epoch": 1.0918085652163292, "grad_norm": 0.2389107495546341, "learning_rate": 2.0423520221225947e-05, "loss": 0.054, "num_input_tokens_seen": 28975968, "step": 22345 }, { "epoch": 1.0920528668800236, "grad_norm": 0.28861889243125916, "learning_rate": 2.0413868994081848e-05, "loss": 0.0642, "num_input_tokens_seen": 28982368, "step": 22350 }, { "epoch": 1.0922971685437177, "grad_norm": 0.385797917842865, "learning_rate": 2.0404218474176795e-05, "loss": 0.0798, "num_input_tokens_seen": 28989216, "step": 22355 }, { "epoch": 1.092541470207412, "grad_norm": 0.15530937910079956, "learning_rate": 2.0394568662999002e-05, "loss": 0.0736, "num_input_tokens_seen": 28995872, "step": 22360 }, { "epoch": 1.0927857718711065, "grad_norm": 0.1927560716867447, "learning_rate": 2.0384919562036593e-05, "loss": 0.0608, "num_input_tokens_seen": 29002624, "step": 22365 }, { "epoch": 1.0930300735348009, "grad_norm": 0.46045902371406555, "learning_rate": 2.0375271172777593e-05, "loss": 0.0997, "num_input_tokens_seen": 29009184, "step": 22370 }, { "epoch": 1.093274375198495, "grad_norm": 0.3620081841945648, "learning_rate": 2.0365623496709885e-05, "loss": 0.0992, "num_input_tokens_seen": 29015648, "step": 22375 }, { "epoch": 1.0935186768621894, "grad_norm": 0.603955090045929, "learning_rate": 2.0355976535321283e-05, "loss": 0.088, "num_input_tokens_seen": 29022144, "step": 22380 }, { "epoch": 1.0937629785258838, "grad_norm": 0.17187584936618805, "learning_rate": 2.034633029009945e-05, "loss": 0.0757, "num_input_tokens_seen": 29028032, "step": 22385 }, { "epoch": 1.0940072801895782, "grad_norm": 0.2324189394712448, "learning_rate": 2.0336684762531972e-05, "loss": 0.1012, "num_input_tokens_seen": 29034720, "step": 22390 }, { "epoch": 1.0942515818532725, "grad_norm": 0.1877123862504959, "learning_rate": 2.032703995410631e-05, "loss": 0.0853, "num_input_tokens_seen": 29040864, "step": 22395 }, { "epoch": 1.0944958835169667, "grad_norm": 0.32033517956733704, "learning_rate": 2.031739586630981e-05, "loss": 0.0986, "num_input_tokens_seen": 29047328, "step": 22400 }, { "epoch": 1.0944958835169667, "eval_loss": 0.08838493376970291, "eval_runtime": 374.4305, "eval_samples_per_second": 97.174, "eval_steps_per_second": 24.296, "num_input_tokens_seen": 29047328, "step": 22400 }, { "epoch": 1.094740185180661, "grad_norm": 0.32950928807258606, "learning_rate": 2.0307752500629707e-05, "loss": 0.0888, "num_input_tokens_seen": 29053504, "step": 22405 }, { "epoch": 1.0949844868443555, "grad_norm": 0.23027293384075165, "learning_rate": 2.0298109858553144e-05, "loss": 0.0962, "num_input_tokens_seen": 29059872, "step": 22410 }, { "epoch": 1.0952287885080498, "grad_norm": 0.10870584100484848, "learning_rate": 2.028846794156712e-05, "loss": 0.0653, "num_input_tokens_seen": 29066176, "step": 22415 }, { "epoch": 1.095473090171744, "grad_norm": 0.14942197501659393, "learning_rate": 2.027882675115856e-05, "loss": 0.0838, "num_input_tokens_seen": 29072480, "step": 22420 }, { "epoch": 1.0957173918354384, "grad_norm": 0.3136413097381592, "learning_rate": 2.026918628881423e-05, "loss": 0.0716, "num_input_tokens_seen": 29078880, "step": 22425 }, { "epoch": 1.0959616934991327, "grad_norm": 0.8230021595954895, "learning_rate": 2.0259546556020833e-05, "loss": 0.0958, "num_input_tokens_seen": 29084928, "step": 22430 }, { "epoch": 1.0962059951628271, "grad_norm": 0.20439676940441132, "learning_rate": 2.024990755426493e-05, "loss": 0.0931, "num_input_tokens_seen": 29091072, "step": 22435 }, { "epoch": 1.0964502968265213, "grad_norm": 1.086944580078125, "learning_rate": 2.0240269285032975e-05, "loss": 0.1136, "num_input_tokens_seen": 29097280, "step": 22440 }, { "epoch": 1.0966945984902157, "grad_norm": 0.18866471946239471, "learning_rate": 2.0230631749811306e-05, "loss": 0.0938, "num_input_tokens_seen": 29103872, "step": 22445 }, { "epoch": 1.09693890015391, "grad_norm": 0.541763961315155, "learning_rate": 2.0220994950086162e-05, "loss": 0.0966, "num_input_tokens_seen": 29110688, "step": 22450 }, { "epoch": 1.0971832018176044, "grad_norm": 0.3836635947227478, "learning_rate": 2.021135888734365e-05, "loss": 0.0804, "num_input_tokens_seen": 29117376, "step": 22455 }, { "epoch": 1.0974275034812988, "grad_norm": 0.4390811622142792, "learning_rate": 2.0201723563069783e-05, "loss": 0.0736, "num_input_tokens_seen": 29123808, "step": 22460 }, { "epoch": 1.097671805144993, "grad_norm": 0.23405379056930542, "learning_rate": 2.0192088978750433e-05, "loss": 0.0898, "num_input_tokens_seen": 29130528, "step": 22465 }, { "epoch": 1.0979161068086873, "grad_norm": 0.3960021138191223, "learning_rate": 2.0182455135871385e-05, "loss": 0.0922, "num_input_tokens_seen": 29137408, "step": 22470 }, { "epoch": 1.0981604084723817, "grad_norm": 0.22987814247608185, "learning_rate": 2.0172822035918305e-05, "loss": 0.0789, "num_input_tokens_seen": 29143680, "step": 22475 }, { "epoch": 1.098404710136076, "grad_norm": 0.14757348597049713, "learning_rate": 2.016318968037671e-05, "loss": 0.1021, "num_input_tokens_seen": 29150432, "step": 22480 }, { "epoch": 1.0986490117997703, "grad_norm": 0.19538885354995728, "learning_rate": 2.015355807073206e-05, "loss": 0.1028, "num_input_tokens_seen": 29156768, "step": 22485 }, { "epoch": 1.0988933134634646, "grad_norm": 0.289713054895401, "learning_rate": 2.0143927208469664e-05, "loss": 0.1136, "num_input_tokens_seen": 29163168, "step": 22490 }, { "epoch": 1.099137615127159, "grad_norm": 0.48622217774391174, "learning_rate": 2.0134297095074708e-05, "loss": 0.0751, "num_input_tokens_seen": 29170432, "step": 22495 }, { "epoch": 1.0993819167908534, "grad_norm": 0.1738637387752533, "learning_rate": 2.0124667732032297e-05, "loss": 0.0849, "num_input_tokens_seen": 29177568, "step": 22500 }, { "epoch": 1.0996262184545478, "grad_norm": 0.1450543999671936, "learning_rate": 2.011503912082738e-05, "loss": 0.066, "num_input_tokens_seen": 29184288, "step": 22505 }, { "epoch": 1.099870520118242, "grad_norm": 0.24529531598091125, "learning_rate": 2.0105411262944823e-05, "loss": 0.0882, "num_input_tokens_seen": 29190720, "step": 22510 }, { "epoch": 1.1001148217819363, "grad_norm": 0.2794599235057831, "learning_rate": 2.0095784159869366e-05, "loss": 0.1087, "num_input_tokens_seen": 29196768, "step": 22515 }, { "epoch": 1.1003591234456307, "grad_norm": 0.3035268187522888, "learning_rate": 2.0086157813085608e-05, "loss": 0.0798, "num_input_tokens_seen": 29203072, "step": 22520 }, { "epoch": 1.100603425109325, "grad_norm": 0.3694099485874176, "learning_rate": 2.0076532224078068e-05, "loss": 0.0824, "num_input_tokens_seen": 29209344, "step": 22525 }, { "epoch": 1.1008477267730192, "grad_norm": 0.23580029606819153, "learning_rate": 2.0066907394331142e-05, "loss": 0.0841, "num_input_tokens_seen": 29216064, "step": 22530 }, { "epoch": 1.1010920284367136, "grad_norm": 0.25621235370635986, "learning_rate": 2.0057283325329077e-05, "loss": 0.077, "num_input_tokens_seen": 29222528, "step": 22535 }, { "epoch": 1.101336330100408, "grad_norm": 0.8654872179031372, "learning_rate": 2.0047660018556047e-05, "loss": 0.1052, "num_input_tokens_seen": 29228864, "step": 22540 }, { "epoch": 1.1015806317641024, "grad_norm": 0.14271295070648193, "learning_rate": 2.0038037475496075e-05, "loss": 0.0825, "num_input_tokens_seen": 29235168, "step": 22545 }, { "epoch": 1.1018249334277965, "grad_norm": 0.25570932030677795, "learning_rate": 2.0028415697633073e-05, "loss": 0.0934, "num_input_tokens_seen": 29241408, "step": 22550 }, { "epoch": 1.102069235091491, "grad_norm": 0.7512453198432922, "learning_rate": 2.0018794686450858e-05, "loss": 0.0691, "num_input_tokens_seen": 29248384, "step": 22555 }, { "epoch": 1.1023135367551853, "grad_norm": 0.13944104313850403, "learning_rate": 2.0009174443433088e-05, "loss": 0.0663, "num_input_tokens_seen": 29255104, "step": 22560 }, { "epoch": 1.1025578384188797, "grad_norm": 0.2523828446865082, "learning_rate": 1.999955497006334e-05, "loss": 0.0676, "num_input_tokens_seen": 29261568, "step": 22565 }, { "epoch": 1.102802140082574, "grad_norm": 0.39328742027282715, "learning_rate": 1.9989936267825067e-05, "loss": 0.0795, "num_input_tokens_seen": 29268000, "step": 22570 }, { "epoch": 1.1030464417462682, "grad_norm": 0.15957985818386078, "learning_rate": 1.9980318338201572e-05, "loss": 0.0876, "num_input_tokens_seen": 29274016, "step": 22575 }, { "epoch": 1.1032907434099626, "grad_norm": 0.34936419129371643, "learning_rate": 1.997070118267607e-05, "loss": 0.0728, "num_input_tokens_seen": 29280704, "step": 22580 }, { "epoch": 1.103535045073657, "grad_norm": 0.8164060115814209, "learning_rate": 1.9961084802731654e-05, "loss": 0.0861, "num_input_tokens_seen": 29287168, "step": 22585 }, { "epoch": 1.1037793467373513, "grad_norm": 0.5503456592559814, "learning_rate": 1.9951469199851273e-05, "loss": 0.0749, "num_input_tokens_seen": 29293280, "step": 22590 }, { "epoch": 1.1040236484010455, "grad_norm": 0.1922094076871872, "learning_rate": 1.99418543755178e-05, "loss": 0.0962, "num_input_tokens_seen": 29299520, "step": 22595 }, { "epoch": 1.1042679500647399, "grad_norm": 0.25853556394577026, "learning_rate": 1.9932240331213936e-05, "loss": 0.0893, "num_input_tokens_seen": 29306368, "step": 22600 }, { "epoch": 1.1042679500647399, "eval_loss": 0.08834383636713028, "eval_runtime": 374.6411, "eval_samples_per_second": 97.12, "eval_steps_per_second": 24.282, "num_input_tokens_seen": 29306368, "step": 22600 }, { "epoch": 1.1045122517284343, "grad_norm": 0.3506718575954437, "learning_rate": 1.9922627068422297e-05, "loss": 0.0729, "num_input_tokens_seen": 29313408, "step": 22605 }, { "epoch": 1.1047565533921286, "grad_norm": 0.31156712770462036, "learning_rate": 1.991301458862538e-05, "loss": 0.0821, "num_input_tokens_seen": 29320224, "step": 22610 }, { "epoch": 1.105000855055823, "grad_norm": 0.32011300325393677, "learning_rate": 1.9903402893305536e-05, "loss": 0.0749, "num_input_tokens_seen": 29326848, "step": 22615 }, { "epoch": 1.1052451567195172, "grad_norm": 0.3000398278236389, "learning_rate": 1.9893791983945016e-05, "loss": 0.0799, "num_input_tokens_seen": 29333632, "step": 22620 }, { "epoch": 1.1054894583832116, "grad_norm": 0.23832644522190094, "learning_rate": 1.988418186202594e-05, "loss": 0.0777, "num_input_tokens_seen": 29340000, "step": 22625 }, { "epoch": 1.105733760046906, "grad_norm": 0.2177031934261322, "learning_rate": 1.98745725290303e-05, "loss": 0.085, "num_input_tokens_seen": 29346368, "step": 22630 }, { "epoch": 1.1059780617106003, "grad_norm": 0.2044457644224167, "learning_rate": 1.986496398644e-05, "loss": 0.0747, "num_input_tokens_seen": 29352736, "step": 22635 }, { "epoch": 1.1062223633742945, "grad_norm": 0.19427236914634705, "learning_rate": 1.9855356235736777e-05, "loss": 0.084, "num_input_tokens_seen": 29358912, "step": 22640 }, { "epoch": 1.1064666650379889, "grad_norm": 0.1775866448879242, "learning_rate": 1.9845749278402277e-05, "loss": 0.0778, "num_input_tokens_seen": 29365248, "step": 22645 }, { "epoch": 1.1067109667016832, "grad_norm": 0.3691560626029968, "learning_rate": 1.9836143115918006e-05, "loss": 0.077, "num_input_tokens_seen": 29371936, "step": 22650 }, { "epoch": 1.1069552683653776, "grad_norm": 0.7054842710494995, "learning_rate": 1.9826537749765367e-05, "loss": 0.0821, "num_input_tokens_seen": 29378432, "step": 22655 }, { "epoch": 1.107199570029072, "grad_norm": 0.3536294996738434, "learning_rate": 1.9816933181425625e-05, "loss": 0.1115, "num_input_tokens_seen": 29384864, "step": 22660 }, { "epoch": 1.1074438716927661, "grad_norm": 0.24816355109214783, "learning_rate": 1.9807329412379903e-05, "loss": 0.0848, "num_input_tokens_seen": 29391264, "step": 22665 }, { "epoch": 1.1076881733564605, "grad_norm": 0.3525415062904358, "learning_rate": 1.9797726444109247e-05, "loss": 0.0963, "num_input_tokens_seen": 29397568, "step": 22670 }, { "epoch": 1.107932475020155, "grad_norm": 0.14929789304733276, "learning_rate": 1.9788124278094557e-05, "loss": 0.0633, "num_input_tokens_seen": 29404672, "step": 22675 }, { "epoch": 1.1081767766838493, "grad_norm": 0.7736753225326538, "learning_rate": 1.9778522915816594e-05, "loss": 0.0954, "num_input_tokens_seen": 29410816, "step": 22680 }, { "epoch": 1.1084210783475434, "grad_norm": 0.31152579188346863, "learning_rate": 1.9768922358756014e-05, "loss": 0.0921, "num_input_tokens_seen": 29417312, "step": 22685 }, { "epoch": 1.1086653800112378, "grad_norm": 0.1947651505470276, "learning_rate": 1.9759322608393353e-05, "loss": 0.0893, "num_input_tokens_seen": 29424000, "step": 22690 }, { "epoch": 1.1089096816749322, "grad_norm": 0.23306991159915924, "learning_rate": 1.9749723666208992e-05, "loss": 0.0672, "num_input_tokens_seen": 29430080, "step": 22695 }, { "epoch": 1.1091539833386266, "grad_norm": 0.7416858077049255, "learning_rate": 1.9740125533683235e-05, "loss": 0.0962, "num_input_tokens_seen": 29436096, "step": 22700 }, { "epoch": 1.109398285002321, "grad_norm": 0.5513026118278503, "learning_rate": 1.9730528212296208e-05, "loss": 0.069, "num_input_tokens_seen": 29442688, "step": 22705 }, { "epoch": 1.1096425866660151, "grad_norm": 0.25628557801246643, "learning_rate": 1.9720931703527945e-05, "loss": 0.0943, "num_input_tokens_seen": 29449280, "step": 22710 }, { "epoch": 1.1098868883297095, "grad_norm": 0.14882196485996246, "learning_rate": 1.9711336008858373e-05, "loss": 0.0848, "num_input_tokens_seen": 29455488, "step": 22715 }, { "epoch": 1.1101311899934039, "grad_norm": 0.15067176520824432, "learning_rate": 1.9701741129767233e-05, "loss": 0.0942, "num_input_tokens_seen": 29462400, "step": 22720 }, { "epoch": 1.1103754916570983, "grad_norm": 0.13720384240150452, "learning_rate": 1.9692147067734202e-05, "loss": 0.0869, "num_input_tokens_seen": 29468832, "step": 22725 }, { "epoch": 1.1106197933207924, "grad_norm": 0.19740258157253265, "learning_rate": 1.96825538242388e-05, "loss": 0.0973, "num_input_tokens_seen": 29475456, "step": 22730 }, { "epoch": 1.1108640949844868, "grad_norm": 0.24690401554107666, "learning_rate": 1.967296140076041e-05, "loss": 0.1068, "num_input_tokens_seen": 29482336, "step": 22735 }, { "epoch": 1.1111083966481812, "grad_norm": 0.3885224759578705, "learning_rate": 1.966336979877833e-05, "loss": 0.1054, "num_input_tokens_seen": 29488800, "step": 22740 }, { "epoch": 1.1113526983118756, "grad_norm": 0.13267742097377777, "learning_rate": 1.9653779019771678e-05, "loss": 0.0816, "num_input_tokens_seen": 29495264, "step": 22745 }, { "epoch": 1.11159699997557, "grad_norm": 0.27757638692855835, "learning_rate": 1.9644189065219488e-05, "loss": 0.0783, "num_input_tokens_seen": 29501632, "step": 22750 }, { "epoch": 1.111841301639264, "grad_norm": 0.17464923858642578, "learning_rate": 1.9634599936600655e-05, "loss": 0.0881, "num_input_tokens_seen": 29508096, "step": 22755 }, { "epoch": 1.1120856033029585, "grad_norm": 0.5551157593727112, "learning_rate": 1.9625011635393935e-05, "loss": 0.1221, "num_input_tokens_seen": 29514976, "step": 22760 }, { "epoch": 1.1123299049666528, "grad_norm": 0.2743127644062042, "learning_rate": 1.9615424163077963e-05, "loss": 0.0903, "num_input_tokens_seen": 29521312, "step": 22765 }, { "epoch": 1.1125742066303472, "grad_norm": 0.1510256826877594, "learning_rate": 1.9605837521131263e-05, "loss": 0.0781, "num_input_tokens_seen": 29527872, "step": 22770 }, { "epoch": 1.1128185082940414, "grad_norm": 0.2588798403739929, "learning_rate": 1.9596251711032192e-05, "loss": 0.0939, "num_input_tokens_seen": 29534944, "step": 22775 }, { "epoch": 1.1130628099577358, "grad_norm": 0.32021722197532654, "learning_rate": 1.958666673425903e-05, "loss": 0.0841, "num_input_tokens_seen": 29541696, "step": 22780 }, { "epoch": 1.1133071116214301, "grad_norm": 0.13455304503440857, "learning_rate": 1.957708259228987e-05, "loss": 0.0754, "num_input_tokens_seen": 29548128, "step": 22785 }, { "epoch": 1.1135514132851245, "grad_norm": 0.21929635107517242, "learning_rate": 1.956749928660273e-05, "loss": 0.0821, "num_input_tokens_seen": 29554656, "step": 22790 }, { "epoch": 1.113795714948819, "grad_norm": 0.13649335503578186, "learning_rate": 1.955791681867547e-05, "loss": 0.0818, "num_input_tokens_seen": 29561024, "step": 22795 }, { "epoch": 1.114040016612513, "grad_norm": 0.12389136105775833, "learning_rate": 1.9548335189985824e-05, "loss": 0.0878, "num_input_tokens_seen": 29567616, "step": 22800 }, { "epoch": 1.114040016612513, "eval_loss": 0.0883156880736351, "eval_runtime": 374.6272, "eval_samples_per_second": 97.123, "eval_steps_per_second": 24.283, "num_input_tokens_seen": 29567616, "step": 22800 }, { "epoch": 1.1142843182762074, "grad_norm": 0.6257988810539246, "learning_rate": 1.9538754402011396e-05, "loss": 0.1054, "num_input_tokens_seen": 29573824, "step": 22805 }, { "epoch": 1.1145286199399018, "grad_norm": 0.2994338870048523, "learning_rate": 1.952917445622968e-05, "loss": 0.0947, "num_input_tokens_seen": 29580096, "step": 22810 }, { "epoch": 1.1147729216035962, "grad_norm": 0.20486247539520264, "learning_rate": 1.9519595354118005e-05, "loss": 0.088, "num_input_tokens_seen": 29586880, "step": 22815 }, { "epoch": 1.1150172232672904, "grad_norm": 0.3685702681541443, "learning_rate": 1.951001709715361e-05, "loss": 0.0764, "num_input_tokens_seen": 29593728, "step": 22820 }, { "epoch": 1.1152615249309847, "grad_norm": 0.2670380771160126, "learning_rate": 1.9500439686813556e-05, "loss": 0.0987, "num_input_tokens_seen": 29600544, "step": 22825 }, { "epoch": 1.1155058265946791, "grad_norm": 0.30634036660194397, "learning_rate": 1.949086312457482e-05, "loss": 0.0797, "num_input_tokens_seen": 29607136, "step": 22830 }, { "epoch": 1.1157501282583735, "grad_norm": 0.14526981115341187, "learning_rate": 1.9481287411914223e-05, "loss": 0.0928, "num_input_tokens_seen": 29613184, "step": 22835 }, { "epoch": 1.1159944299220679, "grad_norm": 0.3599439263343811, "learning_rate": 1.9471712550308457e-05, "loss": 0.1061, "num_input_tokens_seen": 29619424, "step": 22840 }, { "epoch": 1.116238731585762, "grad_norm": 0.21342936158180237, "learning_rate": 1.946213854123409e-05, "loss": 0.0713, "num_input_tokens_seen": 29626208, "step": 22845 }, { "epoch": 1.1164830332494564, "grad_norm": 0.31112128496170044, "learning_rate": 1.9452565386167554e-05, "loss": 0.0862, "num_input_tokens_seen": 29632640, "step": 22850 }, { "epoch": 1.1167273349131508, "grad_norm": 0.19248339533805847, "learning_rate": 1.9442993086585142e-05, "loss": 0.0588, "num_input_tokens_seen": 29640032, "step": 22855 }, { "epoch": 1.1169716365768452, "grad_norm": 1.1799348592758179, "learning_rate": 1.9433421643963043e-05, "loss": 0.1179, "num_input_tokens_seen": 29646272, "step": 22860 }, { "epoch": 1.1172159382405393, "grad_norm": 0.341552734375, "learning_rate": 1.942385105977727e-05, "loss": 0.0823, "num_input_tokens_seen": 29652576, "step": 22865 }, { "epoch": 1.1174602399042337, "grad_norm": 0.46778836846351624, "learning_rate": 1.9414281335503743e-05, "loss": 0.0791, "num_input_tokens_seen": 29659616, "step": 22870 }, { "epoch": 1.117704541567928, "grad_norm": 0.2973352372646332, "learning_rate": 1.9404712472618232e-05, "loss": 0.0536, "num_input_tokens_seen": 29666272, "step": 22875 }, { "epoch": 1.1179488432316225, "grad_norm": 0.4360572397708893, "learning_rate": 1.939514447259636e-05, "loss": 0.086, "num_input_tokens_seen": 29672544, "step": 22880 }, { "epoch": 1.1181931448953168, "grad_norm": 0.5606981515884399, "learning_rate": 1.938557733691365e-05, "loss": 0.0662, "num_input_tokens_seen": 29679168, "step": 22885 }, { "epoch": 1.118437446559011, "grad_norm": 0.25761786103248596, "learning_rate": 1.9376011067045476e-05, "loss": 0.0825, "num_input_tokens_seen": 29686176, "step": 22890 }, { "epoch": 1.1186817482227054, "grad_norm": 0.2372492551803589, "learning_rate": 1.9366445664467065e-05, "loss": 0.0825, "num_input_tokens_seen": 29693120, "step": 22895 }, { "epoch": 1.1189260498863998, "grad_norm": 0.5303356647491455, "learning_rate": 1.9356881130653533e-05, "loss": 0.0614, "num_input_tokens_seen": 29698912, "step": 22900 }, { "epoch": 1.1191703515500941, "grad_norm": 0.2859772741794586, "learning_rate": 1.9347317467079846e-05, "loss": 0.0599, "num_input_tokens_seen": 29705056, "step": 22905 }, { "epoch": 1.1194146532137883, "grad_norm": 0.344448447227478, "learning_rate": 1.9337754675220836e-05, "loss": 0.0855, "num_input_tokens_seen": 29711936, "step": 22910 }, { "epoch": 1.1196589548774827, "grad_norm": 0.22665616869926453, "learning_rate": 1.9328192756551218e-05, "loss": 0.0812, "num_input_tokens_seen": 29718816, "step": 22915 }, { "epoch": 1.119903256541177, "grad_norm": 0.13613760471343994, "learning_rate": 1.931863171254555e-05, "loss": 0.0732, "num_input_tokens_seen": 29725472, "step": 22920 }, { "epoch": 1.1201475582048714, "grad_norm": 0.24444331228733063, "learning_rate": 1.930907154467826e-05, "loss": 0.072, "num_input_tokens_seen": 29732384, "step": 22925 }, { "epoch": 1.1203918598685658, "grad_norm": 0.3577415943145752, "learning_rate": 1.9299512254423673e-05, "loss": 0.075, "num_input_tokens_seen": 29738880, "step": 22930 }, { "epoch": 1.12063616153226, "grad_norm": 0.2567347288131714, "learning_rate": 1.9289953843255914e-05, "loss": 0.136, "num_input_tokens_seen": 29745088, "step": 22935 }, { "epoch": 1.1208804631959544, "grad_norm": 0.3200339078903198, "learning_rate": 1.9280396312649048e-05, "loss": 0.0998, "num_input_tokens_seen": 29751936, "step": 22940 }, { "epoch": 1.1211247648596487, "grad_norm": 0.5077679753303528, "learning_rate": 1.9270839664076936e-05, "loss": 0.1089, "num_input_tokens_seen": 29758304, "step": 22945 }, { "epoch": 1.1213690665233431, "grad_norm": 0.2616898715496063, "learning_rate": 1.9261283899013345e-05, "loss": 0.0877, "num_input_tokens_seen": 29764800, "step": 22950 }, { "epoch": 1.1216133681870373, "grad_norm": 0.353684663772583, "learning_rate": 1.92517290189319e-05, "loss": 0.1027, "num_input_tokens_seen": 29770848, "step": 22955 }, { "epoch": 1.1218576698507317, "grad_norm": 0.1462494283914566, "learning_rate": 1.924217502530607e-05, "loss": 0.081, "num_input_tokens_seen": 29777568, "step": 22960 }, { "epoch": 1.122101971514426, "grad_norm": 0.23030877113342285, "learning_rate": 1.9232621919609207e-05, "loss": 0.0852, "num_input_tokens_seen": 29784160, "step": 22965 }, { "epoch": 1.1223462731781204, "grad_norm": 0.21512648463249207, "learning_rate": 1.9223069703314534e-05, "loss": 0.0871, "num_input_tokens_seen": 29790400, "step": 22970 }, { "epoch": 1.1225905748418146, "grad_norm": 0.16281716525554657, "learning_rate": 1.92135183778951e-05, "loss": 0.1016, "num_input_tokens_seen": 29796672, "step": 22975 }, { "epoch": 1.122834876505509, "grad_norm": 0.2909418046474457, "learning_rate": 1.9203967944823857e-05, "loss": 0.1025, "num_input_tokens_seen": 29803232, "step": 22980 }, { "epoch": 1.1230791781692033, "grad_norm": 0.25188755989074707, "learning_rate": 1.9194418405573588e-05, "loss": 0.1023, "num_input_tokens_seen": 29810048, "step": 22985 }, { "epoch": 1.1233234798328977, "grad_norm": 0.17410577833652496, "learning_rate": 1.9184869761616954e-05, "loss": 0.1088, "num_input_tokens_seen": 29816640, "step": 22990 }, { "epoch": 1.123567781496592, "grad_norm": 0.1973978877067566, "learning_rate": 1.9175322014426495e-05, "loss": 0.077, "num_input_tokens_seen": 29823200, "step": 22995 }, { "epoch": 1.1238120831602862, "grad_norm": 0.40216076374053955, "learning_rate": 1.9165775165474565e-05, "loss": 0.0908, "num_input_tokens_seen": 29829920, "step": 23000 }, { "epoch": 1.1238120831602862, "eval_loss": 0.08849793672561646, "eval_runtime": 374.6242, "eval_samples_per_second": 97.124, "eval_steps_per_second": 24.283, "num_input_tokens_seen": 29829920, "step": 23000 }, { "epoch": 1.1240563848239806, "grad_norm": 0.2365419566631317, "learning_rate": 1.9156229216233434e-05, "loss": 0.0582, "num_input_tokens_seen": 29836448, "step": 23005 }, { "epoch": 1.124300686487675, "grad_norm": 0.5777180194854736, "learning_rate": 1.9146684168175184e-05, "loss": 0.1151, "num_input_tokens_seen": 29842816, "step": 23010 }, { "epoch": 1.1245449881513694, "grad_norm": 0.764566957950592, "learning_rate": 1.9137140022771796e-05, "loss": 0.0764, "num_input_tokens_seen": 29849344, "step": 23015 }, { "epoch": 1.1247892898150635, "grad_norm": 0.15075072646141052, "learning_rate": 1.9127596781495103e-05, "loss": 0.0823, "num_input_tokens_seen": 29855680, "step": 23020 }, { "epoch": 1.125033591478758, "grad_norm": 0.2742973566055298, "learning_rate": 1.9118054445816767e-05, "loss": 0.0916, "num_input_tokens_seen": 29862304, "step": 23025 }, { "epoch": 1.1252778931424523, "grad_norm": 0.18993188440799713, "learning_rate": 1.9108513017208356e-05, "loss": 0.0658, "num_input_tokens_seen": 29869312, "step": 23030 }, { "epoch": 1.1255221948061467, "grad_norm": 0.3554087281227112, "learning_rate": 1.9098972497141287e-05, "loss": 0.1055, "num_input_tokens_seen": 29875712, "step": 23035 }, { "epoch": 1.1257664964698408, "grad_norm": 0.18980707228183746, "learning_rate": 1.9089432887086806e-05, "loss": 0.0822, "num_input_tokens_seen": 29882176, "step": 23040 }, { "epoch": 1.1260107981335352, "grad_norm": 0.3144944906234741, "learning_rate": 1.9079894188516056e-05, "loss": 0.0945, "num_input_tokens_seen": 29888800, "step": 23045 }, { "epoch": 1.1262550997972296, "grad_norm": 0.1695493459701538, "learning_rate": 1.907035640290002e-05, "loss": 0.0655, "num_input_tokens_seen": 29895264, "step": 23050 }, { "epoch": 1.126499401460924, "grad_norm": 0.2630649507045746, "learning_rate": 1.9060819531709534e-05, "loss": 0.0666, "num_input_tokens_seen": 29901760, "step": 23055 }, { "epoch": 1.1267437031246184, "grad_norm": 0.36466503143310547, "learning_rate": 1.9051283576415325e-05, "loss": 0.0869, "num_input_tokens_seen": 29908896, "step": 23060 }, { "epoch": 1.1269880047883125, "grad_norm": 0.5650889873504639, "learning_rate": 1.904174853848793e-05, "loss": 0.1029, "num_input_tokens_seen": 29915296, "step": 23065 }, { "epoch": 1.127232306452007, "grad_norm": 0.4001355469226837, "learning_rate": 1.903221441939779e-05, "loss": 0.0851, "num_input_tokens_seen": 29921280, "step": 23070 }, { "epoch": 1.1274766081157013, "grad_norm": 0.38080689311027527, "learning_rate": 1.9022681220615194e-05, "loss": 0.0806, "num_input_tokens_seen": 29928416, "step": 23075 }, { "epoch": 1.1277209097793957, "grad_norm": 0.21206870675086975, "learning_rate": 1.9013148943610255e-05, "loss": 0.0941, "num_input_tokens_seen": 29934976, "step": 23080 }, { "epoch": 1.1279652114430898, "grad_norm": 0.20519214868545532, "learning_rate": 1.9003617589852998e-05, "loss": 0.0841, "num_input_tokens_seen": 29941472, "step": 23085 }, { "epoch": 1.1282095131067842, "grad_norm": 0.65261310338974, "learning_rate": 1.899408716081326e-05, "loss": 0.0714, "num_input_tokens_seen": 29948352, "step": 23090 }, { "epoch": 1.1284538147704786, "grad_norm": 0.10253609716892242, "learning_rate": 1.898455765796075e-05, "loss": 0.0917, "num_input_tokens_seen": 29954976, "step": 23095 }, { "epoch": 1.128698116434173, "grad_norm": 0.37464967370033264, "learning_rate": 1.8975029082765053e-05, "loss": 0.1053, "num_input_tokens_seen": 29961504, "step": 23100 }, { "epoch": 1.1289424180978673, "grad_norm": 0.2613820433616638, "learning_rate": 1.8965501436695577e-05, "loss": 0.0792, "num_input_tokens_seen": 29967936, "step": 23105 }, { "epoch": 1.1291867197615615, "grad_norm": 0.23363249003887177, "learning_rate": 1.895597472122161e-05, "loss": 0.0982, "num_input_tokens_seen": 29974240, "step": 23110 }, { "epoch": 1.1294310214252559, "grad_norm": 0.4019390642642975, "learning_rate": 1.894644893781231e-05, "loss": 0.0996, "num_input_tokens_seen": 29980608, "step": 23115 }, { "epoch": 1.1296753230889502, "grad_norm": 0.4386231303215027, "learning_rate": 1.893692408793665e-05, "loss": 0.1062, "num_input_tokens_seen": 29986624, "step": 23120 }, { "epoch": 1.1299196247526446, "grad_norm": 0.4096095561981201, "learning_rate": 1.8927400173063493e-05, "loss": 0.0962, "num_input_tokens_seen": 29993280, "step": 23125 }, { "epoch": 1.1301639264163388, "grad_norm": 0.2344115674495697, "learning_rate": 1.891787719466154e-05, "loss": 0.1063, "num_input_tokens_seen": 29999840, "step": 23130 }, { "epoch": 1.1304082280800332, "grad_norm": 0.18203939497470856, "learning_rate": 1.8908355154199346e-05, "loss": 0.0876, "num_input_tokens_seen": 30006304, "step": 23135 }, { "epoch": 1.1306525297437275, "grad_norm": 0.22077061235904694, "learning_rate": 1.8898834053145357e-05, "loss": 0.0754, "num_input_tokens_seen": 30012928, "step": 23140 }, { "epoch": 1.130896831407422, "grad_norm": 0.17533569037914276, "learning_rate": 1.8889313892967813e-05, "loss": 0.0703, "num_input_tokens_seen": 30019264, "step": 23145 }, { "epoch": 1.1311411330711163, "grad_norm": 0.4265294075012207, "learning_rate": 1.8879794675134863e-05, "loss": 0.0768, "num_input_tokens_seen": 30025504, "step": 23150 }, { "epoch": 1.1313854347348105, "grad_norm": 0.3380172550678253, "learning_rate": 1.8870276401114494e-05, "loss": 0.0941, "num_input_tokens_seen": 30032096, "step": 23155 }, { "epoch": 1.1316297363985048, "grad_norm": 0.3733638525009155, "learning_rate": 1.886075907237453e-05, "loss": 0.0716, "num_input_tokens_seen": 30038400, "step": 23160 }, { "epoch": 1.1318740380621992, "grad_norm": 0.2763510048389435, "learning_rate": 1.8851242690382672e-05, "loss": 0.0875, "num_input_tokens_seen": 30045024, "step": 23165 }, { "epoch": 1.1321183397258936, "grad_norm": 0.6721059679985046, "learning_rate": 1.884172725660645e-05, "loss": 0.0886, "num_input_tokens_seen": 30051552, "step": 23170 }, { "epoch": 1.1323626413895878, "grad_norm": 0.7533072829246521, "learning_rate": 1.8832212772513277e-05, "loss": 0.0642, "num_input_tokens_seen": 30058688, "step": 23175 }, { "epoch": 1.1326069430532821, "grad_norm": 0.49309536814689636, "learning_rate": 1.8822699239570414e-05, "loss": 0.0876, "num_input_tokens_seen": 30065760, "step": 23180 }, { "epoch": 1.1328512447169765, "grad_norm": 0.18216565251350403, "learning_rate": 1.8813186659244943e-05, "loss": 0.0665, "num_input_tokens_seen": 30072160, "step": 23185 }, { "epoch": 1.133095546380671, "grad_norm": 0.15733599662780762, "learning_rate": 1.880367503300385e-05, "loss": 0.0582, "num_input_tokens_seen": 30078720, "step": 23190 }, { "epoch": 1.1333398480443653, "grad_norm": 0.3655782639980316, "learning_rate": 1.8794164362313927e-05, "loss": 0.0979, "num_input_tokens_seen": 30085888, "step": 23195 }, { "epoch": 1.1335841497080594, "grad_norm": 0.15272216498851776, "learning_rate": 1.878465464864185e-05, "loss": 0.056, "num_input_tokens_seen": 30092128, "step": 23200 }, { "epoch": 1.1335841497080594, "eval_loss": 0.08822857588529587, "eval_runtime": 374.6296, "eval_samples_per_second": 97.123, "eval_steps_per_second": 24.283, "num_input_tokens_seen": 30092128, "step": 23200 }, { "epoch": 1.1338284513717538, "grad_norm": 0.36415818333625793, "learning_rate": 1.877514589345414e-05, "loss": 0.0676, "num_input_tokens_seen": 30098848, "step": 23205 }, { "epoch": 1.1340727530354482, "grad_norm": 0.14205437898635864, "learning_rate": 1.876563809821715e-05, "loss": 0.1054, "num_input_tokens_seen": 30105344, "step": 23210 }, { "epoch": 1.1343170546991426, "grad_norm": 0.20458510518074036, "learning_rate": 1.8756131264397106e-05, "loss": 0.0687, "num_input_tokens_seen": 30111680, "step": 23215 }, { "epoch": 1.1345613563628367, "grad_norm": 0.23319552838802338, "learning_rate": 1.87466253934601e-05, "loss": 0.0794, "num_input_tokens_seen": 30118464, "step": 23220 }, { "epoch": 1.134805658026531, "grad_norm": 0.3301481604576111, "learning_rate": 1.8737120486872033e-05, "loss": 0.0728, "num_input_tokens_seen": 30124704, "step": 23225 }, { "epoch": 1.1350499596902255, "grad_norm": 0.3844461143016815, "learning_rate": 1.8727616546098696e-05, "loss": 0.0773, "num_input_tokens_seen": 30130816, "step": 23230 }, { "epoch": 1.1352942613539199, "grad_norm": 0.5772092342376709, "learning_rate": 1.8718113572605716e-05, "loss": 0.0708, "num_input_tokens_seen": 30137408, "step": 23235 }, { "epoch": 1.1355385630176142, "grad_norm": 0.14929433166980743, "learning_rate": 1.8708611567858554e-05, "loss": 0.0784, "num_input_tokens_seen": 30144160, "step": 23240 }, { "epoch": 1.1357828646813084, "grad_norm": 0.136687234044075, "learning_rate": 1.8699110533322565e-05, "loss": 0.1045, "num_input_tokens_seen": 30150464, "step": 23245 }, { "epoch": 1.1360271663450028, "grad_norm": 0.17285126447677612, "learning_rate": 1.8689610470462897e-05, "loss": 0.0711, "num_input_tokens_seen": 30156768, "step": 23250 }, { "epoch": 1.1362714680086972, "grad_norm": 0.32349514961242676, "learning_rate": 1.8680111380744604e-05, "loss": 0.0765, "num_input_tokens_seen": 30163776, "step": 23255 }, { "epoch": 1.1365157696723915, "grad_norm": 0.5343195796012878, "learning_rate": 1.8670613265632564e-05, "loss": 0.1031, "num_input_tokens_seen": 30170240, "step": 23260 }, { "epoch": 1.1367600713360857, "grad_norm": 0.13712768256664276, "learning_rate": 1.866111612659149e-05, "loss": 0.0729, "num_input_tokens_seen": 30176736, "step": 23265 }, { "epoch": 1.13700437299978, "grad_norm": 0.4956461191177368, "learning_rate": 1.8651619965085967e-05, "loss": 0.0934, "num_input_tokens_seen": 30183104, "step": 23270 }, { "epoch": 1.1372486746634745, "grad_norm": 0.21809588372707367, "learning_rate": 1.8642124782580433e-05, "loss": 0.068, "num_input_tokens_seen": 30189344, "step": 23275 }, { "epoch": 1.1374929763271688, "grad_norm": 0.8282745480537415, "learning_rate": 1.8632630580539144e-05, "loss": 0.0929, "num_input_tokens_seen": 30195520, "step": 23280 }, { "epoch": 1.1377372779908632, "grad_norm": 0.6276751160621643, "learning_rate": 1.862313736042625e-05, "loss": 0.0716, "num_input_tokens_seen": 30201632, "step": 23285 }, { "epoch": 1.1379815796545574, "grad_norm": 0.3490039110183716, "learning_rate": 1.8613645123705703e-05, "loss": 0.0808, "num_input_tokens_seen": 30208128, "step": 23290 }, { "epoch": 1.1382258813182518, "grad_norm": 0.34632372856140137, "learning_rate": 1.8604153871841328e-05, "loss": 0.1036, "num_input_tokens_seen": 30214144, "step": 23295 }, { "epoch": 1.1384701829819461, "grad_norm": 0.25091856718063354, "learning_rate": 1.859466360629682e-05, "loss": 0.0874, "num_input_tokens_seen": 30220320, "step": 23300 }, { "epoch": 1.1387144846456405, "grad_norm": 0.31003645062446594, "learning_rate": 1.8585174328535666e-05, "loss": 0.0819, "num_input_tokens_seen": 30227200, "step": 23305 }, { "epoch": 1.1389587863093347, "grad_norm": 0.24081194400787354, "learning_rate": 1.857568604002124e-05, "loss": 0.0819, "num_input_tokens_seen": 30233632, "step": 23310 }, { "epoch": 1.139203087973029, "grad_norm": 0.33199837803840637, "learning_rate": 1.8566198742216774e-05, "loss": 0.1079, "num_input_tokens_seen": 30240160, "step": 23315 }, { "epoch": 1.1394473896367234, "grad_norm": 0.5476243495941162, "learning_rate": 1.85567124365853e-05, "loss": 0.1218, "num_input_tokens_seen": 30246656, "step": 23320 }, { "epoch": 1.1396916913004178, "grad_norm": 0.2651541531085968, "learning_rate": 1.854722712458975e-05, "loss": 0.0632, "num_input_tokens_seen": 30253216, "step": 23325 }, { "epoch": 1.1399359929641122, "grad_norm": 0.3444966673851013, "learning_rate": 1.853774280769286e-05, "loss": 0.0844, "num_input_tokens_seen": 30259936, "step": 23330 }, { "epoch": 1.1401802946278063, "grad_norm": 0.22741536796092987, "learning_rate": 1.852825948735724e-05, "loss": 0.0835, "num_input_tokens_seen": 30266624, "step": 23335 }, { "epoch": 1.1404245962915007, "grad_norm": 0.280184268951416, "learning_rate": 1.851877716504534e-05, "loss": 0.0885, "num_input_tokens_seen": 30272864, "step": 23340 }, { "epoch": 1.140668897955195, "grad_norm": 0.43622469902038574, "learning_rate": 1.8509295842219448e-05, "loss": 0.1142, "num_input_tokens_seen": 30278976, "step": 23345 }, { "epoch": 1.1409131996188895, "grad_norm": 0.26178738474845886, "learning_rate": 1.8499815520341697e-05, "loss": 0.0687, "num_input_tokens_seen": 30285536, "step": 23350 }, { "epoch": 1.1411575012825836, "grad_norm": 0.4872136116027832, "learning_rate": 1.8490336200874094e-05, "loss": 0.0853, "num_input_tokens_seen": 30291776, "step": 23355 }, { "epoch": 1.141401802946278, "grad_norm": 0.39051780104637146, "learning_rate": 1.848085788527844e-05, "loss": 0.0775, "num_input_tokens_seen": 30297728, "step": 23360 }, { "epoch": 1.1416461046099724, "grad_norm": 0.29934096336364746, "learning_rate": 1.847138057501644e-05, "loss": 0.0894, "num_input_tokens_seen": 30304800, "step": 23365 }, { "epoch": 1.1418904062736668, "grad_norm": 0.3332065939903259, "learning_rate": 1.8461904271549582e-05, "loss": 0.1095, "num_input_tokens_seen": 30311136, "step": 23370 }, { "epoch": 1.1421347079373612, "grad_norm": 0.32980668544769287, "learning_rate": 1.845242897633926e-05, "loss": 0.1153, "num_input_tokens_seen": 30317504, "step": 23375 }, { "epoch": 1.1423790096010553, "grad_norm": 0.14979198575019836, "learning_rate": 1.844295469084667e-05, "loss": 0.0977, "num_input_tokens_seen": 30324032, "step": 23380 }, { "epoch": 1.1426233112647497, "grad_norm": 0.11332244426012039, "learning_rate": 1.843348141653286e-05, "loss": 0.0818, "num_input_tokens_seen": 30330048, "step": 23385 }, { "epoch": 1.142867612928444, "grad_norm": 0.11322194337844849, "learning_rate": 1.842400915485874e-05, "loss": 0.0692, "num_input_tokens_seen": 30336832, "step": 23390 }, { "epoch": 1.1431119145921385, "grad_norm": 1.006689429283142, "learning_rate": 1.8414537907285053e-05, "loss": 0.0943, "num_input_tokens_seen": 30343360, "step": 23395 }, { "epoch": 1.1433562162558326, "grad_norm": 0.26909536123275757, "learning_rate": 1.840506767527237e-05, "loss": 0.0931, "num_input_tokens_seen": 30349984, "step": 23400 }, { "epoch": 1.1433562162558326, "eval_loss": 0.08854573965072632, "eval_runtime": 374.4118, "eval_samples_per_second": 97.179, "eval_steps_per_second": 24.297, "num_input_tokens_seen": 30349984, "step": 23400 }, { "epoch": 1.143600517919527, "grad_norm": 0.15420684218406677, "learning_rate": 1.8395598460281137e-05, "loss": 0.0706, "num_input_tokens_seen": 30356416, "step": 23405 }, { "epoch": 1.1438448195832214, "grad_norm": 0.1829744428396225, "learning_rate": 1.838613026377161e-05, "loss": 0.0693, "num_input_tokens_seen": 30362848, "step": 23410 }, { "epoch": 1.1440891212469158, "grad_norm": 0.13779759407043457, "learning_rate": 1.8376663087203917e-05, "loss": 0.0908, "num_input_tokens_seen": 30369536, "step": 23415 }, { "epoch": 1.1443334229106101, "grad_norm": 0.23156119883060455, "learning_rate": 1.8367196932038014e-05, "loss": 0.0899, "num_input_tokens_seen": 30375840, "step": 23420 }, { "epoch": 1.1445777245743043, "grad_norm": 0.4791715443134308, "learning_rate": 1.8357731799733686e-05, "loss": 0.0816, "num_input_tokens_seen": 30382080, "step": 23425 }, { "epoch": 1.1448220262379987, "grad_norm": 0.3221021294593811, "learning_rate": 1.8348267691750586e-05, "loss": 0.0776, "num_input_tokens_seen": 30387872, "step": 23430 }, { "epoch": 1.145066327901693, "grad_norm": 0.33345627784729004, "learning_rate": 1.833880460954821e-05, "loss": 0.0966, "num_input_tokens_seen": 30394272, "step": 23435 }, { "epoch": 1.1453106295653874, "grad_norm": 0.24029839038848877, "learning_rate": 1.8329342554585866e-05, "loss": 0.0643, "num_input_tokens_seen": 30400768, "step": 23440 }, { "epoch": 1.1455549312290816, "grad_norm": 0.08911963552236557, "learning_rate": 1.8319881528322735e-05, "loss": 0.098, "num_input_tokens_seen": 30407392, "step": 23445 }, { "epoch": 1.145799232892776, "grad_norm": 0.3417167663574219, "learning_rate": 1.8310421532217815e-05, "loss": 0.0988, "num_input_tokens_seen": 30413728, "step": 23450 }, { "epoch": 1.1460435345564703, "grad_norm": 0.40488484501838684, "learning_rate": 1.8300962567729958e-05, "loss": 0.1017, "num_input_tokens_seen": 30420096, "step": 23455 }, { "epoch": 1.1462878362201647, "grad_norm": 0.32649052143096924, "learning_rate": 1.8291504636317866e-05, "loss": 0.0758, "num_input_tokens_seen": 30426880, "step": 23460 }, { "epoch": 1.146532137883859, "grad_norm": 0.43212267756462097, "learning_rate": 1.8282047739440055e-05, "loss": 0.0773, "num_input_tokens_seen": 30433152, "step": 23465 }, { "epoch": 1.1467764395475533, "grad_norm": 1.3934040069580078, "learning_rate": 1.8272591878554903e-05, "loss": 0.1082, "num_input_tokens_seen": 30439456, "step": 23470 }, { "epoch": 1.1470207412112476, "grad_norm": 0.3022930920124054, "learning_rate": 1.8263137055120638e-05, "loss": 0.0844, "num_input_tokens_seen": 30445536, "step": 23475 }, { "epoch": 1.147265042874942, "grad_norm": 0.49519309401512146, "learning_rate": 1.8253683270595295e-05, "loss": 0.0784, "num_input_tokens_seen": 30451872, "step": 23480 }, { "epoch": 1.1475093445386364, "grad_norm": 0.12198735028505325, "learning_rate": 1.824423052643677e-05, "loss": 0.0775, "num_input_tokens_seen": 30458400, "step": 23485 }, { "epoch": 1.1477536462023306, "grad_norm": 0.3818872570991516, "learning_rate": 1.82347788241028e-05, "loss": 0.1116, "num_input_tokens_seen": 30464928, "step": 23490 }, { "epoch": 1.147997947866025, "grad_norm": 0.15883734822273254, "learning_rate": 1.8225328165050942e-05, "loss": 0.1047, "num_input_tokens_seen": 30470976, "step": 23495 }, { "epoch": 1.1482422495297193, "grad_norm": 0.6746895909309387, "learning_rate": 1.821587855073863e-05, "loss": 0.0869, "num_input_tokens_seen": 30477728, "step": 23500 }, { "epoch": 1.1484865511934137, "grad_norm": 0.3150218725204468, "learning_rate": 1.8206429982623086e-05, "loss": 0.1112, "num_input_tokens_seen": 30483776, "step": 23505 }, { "epoch": 1.148730852857108, "grad_norm": 0.20418184995651245, "learning_rate": 1.8196982462161416e-05, "loss": 0.0943, "num_input_tokens_seen": 30490144, "step": 23510 }, { "epoch": 1.1489751545208022, "grad_norm": 0.3419434428215027, "learning_rate": 1.818753599081055e-05, "loss": 0.0802, "num_input_tokens_seen": 30496640, "step": 23515 }, { "epoch": 1.1492194561844966, "grad_norm": 0.21928291022777557, "learning_rate": 1.817809057002724e-05, "loss": 0.1043, "num_input_tokens_seen": 30502944, "step": 23520 }, { "epoch": 1.149463757848191, "grad_norm": 0.5664961934089661, "learning_rate": 1.8168646201268096e-05, "loss": 0.0985, "num_input_tokens_seen": 30509088, "step": 23525 }, { "epoch": 1.1497080595118854, "grad_norm": 0.4007129669189453, "learning_rate": 1.8159202885989557e-05, "loss": 0.1086, "num_input_tokens_seen": 30515264, "step": 23530 }, { "epoch": 1.1499523611755795, "grad_norm": 0.4956609010696411, "learning_rate": 1.814976062564789e-05, "loss": 0.1162, "num_input_tokens_seen": 30521440, "step": 23535 }, { "epoch": 1.150196662839274, "grad_norm": 0.3096039593219757, "learning_rate": 1.8140319421699234e-05, "loss": 0.0786, "num_input_tokens_seen": 30528000, "step": 23540 }, { "epoch": 1.1504409645029683, "grad_norm": 0.2374710589647293, "learning_rate": 1.8130879275599515e-05, "loss": 0.0668, "num_input_tokens_seen": 30534272, "step": 23545 }, { "epoch": 1.1506852661666627, "grad_norm": 0.30882829427719116, "learning_rate": 1.8121440188804544e-05, "loss": 0.0927, "num_input_tokens_seen": 30540384, "step": 23550 }, { "epoch": 1.150929567830357, "grad_norm": 0.1928691416978836, "learning_rate": 1.811200216276993e-05, "loss": 0.0903, "num_input_tokens_seen": 30546944, "step": 23555 }, { "epoch": 1.1511738694940512, "grad_norm": 0.28582069277763367, "learning_rate": 1.810256519895115e-05, "loss": 0.1044, "num_input_tokens_seen": 30553888, "step": 23560 }, { "epoch": 1.1514181711577456, "grad_norm": 0.4185904264450073, "learning_rate": 1.8093129298803494e-05, "loss": 0.0784, "num_input_tokens_seen": 30560736, "step": 23565 }, { "epoch": 1.15166247282144, "grad_norm": 0.5150255560874939, "learning_rate": 1.808369446378209e-05, "loss": 0.1037, "num_input_tokens_seen": 30567264, "step": 23570 }, { "epoch": 1.1519067744851341, "grad_norm": 0.18663232028484344, "learning_rate": 1.8074260695341914e-05, "loss": 0.0662, "num_input_tokens_seen": 30573536, "step": 23575 }, { "epoch": 1.1521510761488285, "grad_norm": 0.19756585359573364, "learning_rate": 1.8064827994937782e-05, "loss": 0.0691, "num_input_tokens_seen": 30579776, "step": 23580 }, { "epoch": 1.1523953778125229, "grad_norm": 0.5417436361312866, "learning_rate": 1.8055396364024317e-05, "loss": 0.0916, "num_input_tokens_seen": 30586080, "step": 23585 }, { "epoch": 1.1526396794762173, "grad_norm": 0.42758694291114807, "learning_rate": 1.804596580405601e-05, "loss": 0.094, "num_input_tokens_seen": 30592480, "step": 23590 }, { "epoch": 1.1528839811399116, "grad_norm": 0.5289229154586792, "learning_rate": 1.8036536316487174e-05, "loss": 0.1057, "num_input_tokens_seen": 30598912, "step": 23595 }, { "epoch": 1.1531282828036058, "grad_norm": 0.42125415802001953, "learning_rate": 1.802710790277193e-05, "loss": 0.0704, "num_input_tokens_seen": 30605344, "step": 23600 }, { "epoch": 1.1531282828036058, "eval_loss": 0.08830350637435913, "eval_runtime": 374.5871, "eval_samples_per_second": 97.134, "eval_steps_per_second": 24.285, "num_input_tokens_seen": 30605344, "step": 23600 }, { "epoch": 1.1533725844673002, "grad_norm": 0.4403766691684723, "learning_rate": 1.801768056436429e-05, "loss": 0.0778, "num_input_tokens_seen": 30612416, "step": 23605 }, { "epoch": 1.1536168861309946, "grad_norm": 0.22316811978816986, "learning_rate": 1.8008254302718035e-05, "loss": 0.0711, "num_input_tokens_seen": 30618464, "step": 23610 }, { "epoch": 1.153861187794689, "grad_norm": 0.2275138646364212, "learning_rate": 1.7998829119286837e-05, "loss": 0.1094, "num_input_tokens_seen": 30624704, "step": 23615 }, { "epoch": 1.154105489458383, "grad_norm": 0.35773730278015137, "learning_rate": 1.798940501552418e-05, "loss": 0.0689, "num_input_tokens_seen": 30631008, "step": 23620 }, { "epoch": 1.1543497911220775, "grad_norm": 0.08856621384620667, "learning_rate": 1.797998199288336e-05, "loss": 0.0684, "num_input_tokens_seen": 30637792, "step": 23625 }, { "epoch": 1.1545940927857719, "grad_norm": 0.2033853828907013, "learning_rate": 1.7970560052817543e-05, "loss": 0.0957, "num_input_tokens_seen": 30644096, "step": 23630 }, { "epoch": 1.1548383944494662, "grad_norm": 0.12807920575141907, "learning_rate": 1.7961139196779702e-05, "loss": 0.0652, "num_input_tokens_seen": 30650848, "step": 23635 }, { "epoch": 1.1550826961131606, "grad_norm": 0.8838899731636047, "learning_rate": 1.7951719426222647e-05, "loss": 0.0905, "num_input_tokens_seen": 30657664, "step": 23640 }, { "epoch": 1.1553269977768548, "grad_norm": 0.14426496624946594, "learning_rate": 1.794230074259904e-05, "loss": 0.0835, "num_input_tokens_seen": 30664640, "step": 23645 }, { "epoch": 1.1555712994405491, "grad_norm": 0.2189611941576004, "learning_rate": 1.7932883147361336e-05, "loss": 0.0728, "num_input_tokens_seen": 30671488, "step": 23650 }, { "epoch": 1.1558156011042435, "grad_norm": 0.22930209338665009, "learning_rate": 1.7923466641961865e-05, "loss": 0.0688, "num_input_tokens_seen": 30677952, "step": 23655 }, { "epoch": 1.156059902767938, "grad_norm": 0.15143540501594543, "learning_rate": 1.791405122785278e-05, "loss": 0.0795, "num_input_tokens_seen": 30684384, "step": 23660 }, { "epoch": 1.156304204431632, "grad_norm": 0.4513052999973297, "learning_rate": 1.7904636906486037e-05, "loss": 0.0968, "num_input_tokens_seen": 30690848, "step": 23665 }, { "epoch": 1.1565485060953264, "grad_norm": 0.1533186137676239, "learning_rate": 1.7895223679313448e-05, "loss": 0.067, "num_input_tokens_seen": 30697728, "step": 23670 }, { "epoch": 1.1567928077590208, "grad_norm": 0.09829884022474289, "learning_rate": 1.7885811547786653e-05, "loss": 0.0728, "num_input_tokens_seen": 30704160, "step": 23675 }, { "epoch": 1.1570371094227152, "grad_norm": 0.5782732367515564, "learning_rate": 1.7876400513357115e-05, "loss": 0.0865, "num_input_tokens_seen": 30710752, "step": 23680 }, { "epoch": 1.1572814110864096, "grad_norm": 0.39617782831192017, "learning_rate": 1.7866990577476146e-05, "loss": 0.0764, "num_input_tokens_seen": 30717568, "step": 23685 }, { "epoch": 1.1575257127501037, "grad_norm": 0.23020757734775543, "learning_rate": 1.7857581741594863e-05, "loss": 0.0796, "num_input_tokens_seen": 30723968, "step": 23690 }, { "epoch": 1.1577700144137981, "grad_norm": 0.5986683368682861, "learning_rate": 1.7848174007164237e-05, "loss": 0.0805, "num_input_tokens_seen": 30730592, "step": 23695 }, { "epoch": 1.1580143160774925, "grad_norm": 0.21332195401191711, "learning_rate": 1.7838767375635052e-05, "loss": 0.1076, "num_input_tokens_seen": 30737024, "step": 23700 }, { "epoch": 1.1582586177411869, "grad_norm": 0.4889935851097107, "learning_rate": 1.782936184845793e-05, "loss": 0.131, "num_input_tokens_seen": 30743136, "step": 23705 }, { "epoch": 1.158502919404881, "grad_norm": 0.3300177752971649, "learning_rate": 1.7819957427083334e-05, "loss": 0.083, "num_input_tokens_seen": 30749280, "step": 23710 }, { "epoch": 1.1587472210685754, "grad_norm": 0.3316912353038788, "learning_rate": 1.7810554112961516e-05, "loss": 0.0678, "num_input_tokens_seen": 30755776, "step": 23715 }, { "epoch": 1.1589915227322698, "grad_norm": 0.14282388985157013, "learning_rate": 1.7801151907542607e-05, "loss": 0.1022, "num_input_tokens_seen": 30762368, "step": 23720 }, { "epoch": 1.1592358243959642, "grad_norm": 0.34845706820487976, "learning_rate": 1.7791750812276547e-05, "loss": 0.0922, "num_input_tokens_seen": 30769024, "step": 23725 }, { "epoch": 1.1594801260596586, "grad_norm": 0.5411543846130371, "learning_rate": 1.778235082861309e-05, "loss": 0.0905, "num_input_tokens_seen": 30775456, "step": 23730 }, { "epoch": 1.1597244277233527, "grad_norm": 0.1408606916666031, "learning_rate": 1.777295195800184e-05, "loss": 0.0997, "num_input_tokens_seen": 30781824, "step": 23735 }, { "epoch": 1.159968729387047, "grad_norm": 0.35804811120033264, "learning_rate": 1.7763554201892215e-05, "loss": 0.0837, "num_input_tokens_seen": 30788576, "step": 23740 }, { "epoch": 1.1602130310507415, "grad_norm": 0.46390706300735474, "learning_rate": 1.7754157561733476e-05, "loss": 0.0691, "num_input_tokens_seen": 30795008, "step": 23745 }, { "epoch": 1.1604573327144359, "grad_norm": 0.4010213613510132, "learning_rate": 1.7744762038974702e-05, "loss": 0.097, "num_input_tokens_seen": 30801664, "step": 23750 }, { "epoch": 1.16070163437813, "grad_norm": 0.34583503007888794, "learning_rate": 1.7735367635064788e-05, "loss": 0.0765, "num_input_tokens_seen": 30808256, "step": 23755 }, { "epoch": 1.1609459360418244, "grad_norm": 0.22385616600513458, "learning_rate": 1.7725974351452474e-05, "loss": 0.0961, "num_input_tokens_seen": 30815072, "step": 23760 }, { "epoch": 1.1611902377055188, "grad_norm": 0.143145352602005, "learning_rate": 1.771658218958634e-05, "loss": 0.0822, "num_input_tokens_seen": 30821632, "step": 23765 }, { "epoch": 1.1614345393692131, "grad_norm": 0.20049422979354858, "learning_rate": 1.770719115091475e-05, "loss": 0.0727, "num_input_tokens_seen": 30828032, "step": 23770 }, { "epoch": 1.1616788410329075, "grad_norm": 0.22766683995723724, "learning_rate": 1.7697801236885935e-05, "loss": 0.0797, "num_input_tokens_seen": 30835008, "step": 23775 }, { "epoch": 1.1619231426966017, "grad_norm": 0.18094418942928314, "learning_rate": 1.7688412448947944e-05, "loss": 0.0746, "num_input_tokens_seen": 30841664, "step": 23780 }, { "epoch": 1.162167444360296, "grad_norm": 0.19244803488254547, "learning_rate": 1.767902478854862e-05, "loss": 0.0751, "num_input_tokens_seen": 30847616, "step": 23785 }, { "epoch": 1.1624117460239904, "grad_norm": 0.34571516513824463, "learning_rate": 1.766963825713569e-05, "loss": 0.0967, "num_input_tokens_seen": 30854272, "step": 23790 }, { "epoch": 1.1626560476876848, "grad_norm": 0.4826172888278961, "learning_rate": 1.766025285615665e-05, "loss": 0.0853, "num_input_tokens_seen": 30860960, "step": 23795 }, { "epoch": 1.162900349351379, "grad_norm": 0.4486021399497986, "learning_rate": 1.7650868587058854e-05, "loss": 0.0984, "num_input_tokens_seen": 30867648, "step": 23800 }, { "epoch": 1.162900349351379, "eval_loss": 0.08817337453365326, "eval_runtime": 373.9216, "eval_samples_per_second": 97.306, "eval_steps_per_second": 24.329, "num_input_tokens_seen": 30867648, "step": 23800 }, { "epoch": 1.1631446510150734, "grad_norm": 0.4036164879798889, "learning_rate": 1.7641485451289484e-05, "loss": 0.0837, "num_input_tokens_seen": 30873696, "step": 23805 }, { "epoch": 1.1633889526787677, "grad_norm": 0.33997389674186707, "learning_rate": 1.7632103450295534e-05, "loss": 0.1117, "num_input_tokens_seen": 30880064, "step": 23810 }, { "epoch": 1.1636332543424621, "grad_norm": 0.18286089599132538, "learning_rate": 1.762272258552381e-05, "loss": 0.067, "num_input_tokens_seen": 30886816, "step": 23815 }, { "epoch": 1.1638775560061565, "grad_norm": 0.16473610699176788, "learning_rate": 1.7613342858420988e-05, "loss": 0.0802, "num_input_tokens_seen": 30893760, "step": 23820 }, { "epoch": 1.1641218576698507, "grad_norm": 0.391360342502594, "learning_rate": 1.760396427043351e-05, "loss": 0.0944, "num_input_tokens_seen": 30900384, "step": 23825 }, { "epoch": 1.164366159333545, "grad_norm": 0.12542037665843964, "learning_rate": 1.7594586823007696e-05, "loss": 0.1006, "num_input_tokens_seen": 30907488, "step": 23830 }, { "epoch": 1.1646104609972394, "grad_norm": 0.2511763572692871, "learning_rate": 1.7585210517589646e-05, "loss": 0.0775, "num_input_tokens_seen": 30914112, "step": 23835 }, { "epoch": 1.1648547626609338, "grad_norm": 0.22049933671951294, "learning_rate": 1.7575835355625314e-05, "loss": 0.094, "num_input_tokens_seen": 30920288, "step": 23840 }, { "epoch": 1.165099064324628, "grad_norm": 0.279242604970932, "learning_rate": 1.756646133856048e-05, "loss": 0.0546, "num_input_tokens_seen": 30927168, "step": 23845 }, { "epoch": 1.1653433659883223, "grad_norm": 0.18107885122299194, "learning_rate": 1.7557088467840714e-05, "loss": 0.0758, "num_input_tokens_seen": 30933600, "step": 23850 }, { "epoch": 1.1655876676520167, "grad_norm": 0.4249993562698364, "learning_rate": 1.7547716744911438e-05, "loss": 0.0901, "num_input_tokens_seen": 30940224, "step": 23855 }, { "epoch": 1.165831969315711, "grad_norm": 0.32785138487815857, "learning_rate": 1.7538346171217902e-05, "loss": 0.0643, "num_input_tokens_seen": 30946912, "step": 23860 }, { "epoch": 1.1660762709794055, "grad_norm": 0.12572988867759705, "learning_rate": 1.7528976748205146e-05, "loss": 0.0861, "num_input_tokens_seen": 30953376, "step": 23865 }, { "epoch": 1.1663205726430996, "grad_norm": 0.1552073359489441, "learning_rate": 1.751960847731807e-05, "loss": 0.073, "num_input_tokens_seen": 30959808, "step": 23870 }, { "epoch": 1.166564874306794, "grad_norm": 0.6520854234695435, "learning_rate": 1.7510241360001362e-05, "loss": 0.075, "num_input_tokens_seen": 30966304, "step": 23875 }, { "epoch": 1.1668091759704884, "grad_norm": 0.13023146986961365, "learning_rate": 1.7500875397699562e-05, "loss": 0.0659, "num_input_tokens_seen": 30972576, "step": 23880 }, { "epoch": 1.1670534776341828, "grad_norm": 0.18890999257564545, "learning_rate": 1.7491510591857015e-05, "loss": 0.0928, "num_input_tokens_seen": 30978976, "step": 23885 }, { "epoch": 1.167297779297877, "grad_norm": 0.641879141330719, "learning_rate": 1.7482146943917896e-05, "loss": 0.0826, "num_input_tokens_seen": 30985568, "step": 23890 }, { "epoch": 1.1675420809615713, "grad_norm": 0.2387307733297348, "learning_rate": 1.7472784455326185e-05, "loss": 0.0816, "num_input_tokens_seen": 30992224, "step": 23895 }, { "epoch": 1.1677863826252657, "grad_norm": 0.3043403625488281, "learning_rate": 1.746342312752572e-05, "loss": 0.085, "num_input_tokens_seen": 30999008, "step": 23900 }, { "epoch": 1.16803068428896, "grad_norm": 0.4101060628890991, "learning_rate": 1.74540629619601e-05, "loss": 0.0913, "num_input_tokens_seen": 31005408, "step": 23905 }, { "epoch": 1.1682749859526544, "grad_norm": 0.42727187275886536, "learning_rate": 1.7444703960072815e-05, "loss": 0.1195, "num_input_tokens_seen": 31011520, "step": 23910 }, { "epoch": 1.1685192876163486, "grad_norm": 0.6992448568344116, "learning_rate": 1.7435346123307118e-05, "loss": 0.0885, "num_input_tokens_seen": 31017792, "step": 23915 }, { "epoch": 1.168763589280043, "grad_norm": 0.18530438840389252, "learning_rate": 1.742598945310611e-05, "loss": 0.0714, "num_input_tokens_seen": 31024352, "step": 23920 }, { "epoch": 1.1690078909437374, "grad_norm": 0.4514653980731964, "learning_rate": 1.741663395091272e-05, "loss": 0.0845, "num_input_tokens_seen": 31030816, "step": 23925 }, { "epoch": 1.1692521926074317, "grad_norm": 0.19313080608844757, "learning_rate": 1.7407279618169657e-05, "loss": 0.0611, "num_input_tokens_seen": 31036736, "step": 23930 }, { "epoch": 1.169496494271126, "grad_norm": 0.3903602957725525, "learning_rate": 1.73979264563195e-05, "loss": 0.0975, "num_input_tokens_seen": 31043392, "step": 23935 }, { "epoch": 1.1697407959348203, "grad_norm": 0.10845532268285751, "learning_rate": 1.7388574466804625e-05, "loss": 0.117, "num_input_tokens_seen": 31049888, "step": 23940 }, { "epoch": 1.1699850975985147, "grad_norm": 0.22931450605392456, "learning_rate": 1.7379223651067207e-05, "loss": 0.087, "num_input_tokens_seen": 31056736, "step": 23945 }, { "epoch": 1.170229399262209, "grad_norm": 0.20002825558185577, "learning_rate": 1.736987401054928e-05, "loss": 0.0645, "num_input_tokens_seen": 31063424, "step": 23950 }, { "epoch": 1.1704737009259034, "grad_norm": 0.3110719621181488, "learning_rate": 1.736052554669266e-05, "loss": 0.0703, "num_input_tokens_seen": 31069760, "step": 23955 }, { "epoch": 1.1707180025895976, "grad_norm": 0.18489594757556915, "learning_rate": 1.7351178260939007e-05, "loss": 0.0875, "num_input_tokens_seen": 31076352, "step": 23960 }, { "epoch": 1.170962304253292, "grad_norm": 0.26728910207748413, "learning_rate": 1.7341832154729794e-05, "loss": 0.1136, "num_input_tokens_seen": 31082560, "step": 23965 }, { "epoch": 1.1712066059169863, "grad_norm": 0.1490795761346817, "learning_rate": 1.7332487229506286e-05, "loss": 0.0605, "num_input_tokens_seen": 31089152, "step": 23970 }, { "epoch": 1.1714509075806807, "grad_norm": 0.2678917944431305, "learning_rate": 1.732314348670961e-05, "loss": 0.0765, "num_input_tokens_seen": 31095744, "step": 23975 }, { "epoch": 1.1716952092443749, "grad_norm": 0.17666786909103394, "learning_rate": 1.7313800927780686e-05, "loss": 0.0787, "num_input_tokens_seen": 31102272, "step": 23980 }, { "epoch": 1.1719395109080692, "grad_norm": 0.4881027340888977, "learning_rate": 1.7304459554160245e-05, "loss": 0.1009, "num_input_tokens_seen": 31108832, "step": 23985 }, { "epoch": 1.1721838125717636, "grad_norm": 0.3541282117366791, "learning_rate": 1.7295119367288853e-05, "loss": 0.0948, "num_input_tokens_seen": 31114912, "step": 23990 }, { "epoch": 1.172428114235458, "grad_norm": 0.2865843176841736, "learning_rate": 1.728578036860688e-05, "loss": 0.0699, "num_input_tokens_seen": 31121280, "step": 23995 }, { "epoch": 1.1726724158991524, "grad_norm": 0.4020851254463196, "learning_rate": 1.7276442559554513e-05, "loss": 0.0808, "num_input_tokens_seen": 31127744, "step": 24000 }, { "epoch": 1.1726724158991524, "eval_loss": 0.08834915608167648, "eval_runtime": 375.3159, "eval_samples_per_second": 96.945, "eval_steps_per_second": 24.238, "num_input_tokens_seen": 31127744, "step": 24000 }, { "epoch": 1.1729167175628465, "grad_norm": 0.3111734986305237, "learning_rate": 1.726710594157177e-05, "loss": 0.0988, "num_input_tokens_seen": 31134272, "step": 24005 }, { "epoch": 1.173161019226541, "grad_norm": 0.461178719997406, "learning_rate": 1.725777051609846e-05, "loss": 0.1037, "num_input_tokens_seen": 31140640, "step": 24010 }, { "epoch": 1.1734053208902353, "grad_norm": 0.2598462700843811, "learning_rate": 1.7248436284574228e-05, "loss": 0.1054, "num_input_tokens_seen": 31146368, "step": 24015 }, { "epoch": 1.1736496225539297, "grad_norm": 0.22265848517417908, "learning_rate": 1.723910324843855e-05, "loss": 0.1187, "num_input_tokens_seen": 31152480, "step": 24020 }, { "epoch": 1.1738939242176238, "grad_norm": 0.32498204708099365, "learning_rate": 1.722977140913067e-05, "loss": 0.0847, "num_input_tokens_seen": 31158784, "step": 24025 }, { "epoch": 1.1741382258813182, "grad_norm": 0.2475956380367279, "learning_rate": 1.7220440768089688e-05, "loss": 0.0949, "num_input_tokens_seen": 31165248, "step": 24030 }, { "epoch": 1.1743825275450126, "grad_norm": 0.2346009910106659, "learning_rate": 1.7211111326754505e-05, "loss": 0.0837, "num_input_tokens_seen": 31171360, "step": 24035 }, { "epoch": 1.174626829208707, "grad_norm": 0.22160038352012634, "learning_rate": 1.720178308656383e-05, "loss": 0.0743, "num_input_tokens_seen": 31177760, "step": 24040 }, { "epoch": 1.1748711308724014, "grad_norm": 0.2860366106033325, "learning_rate": 1.719245604895621e-05, "loss": 0.0934, "num_input_tokens_seen": 31184160, "step": 24045 }, { "epoch": 1.1751154325360955, "grad_norm": 0.33044782280921936, "learning_rate": 1.7183130215369972e-05, "loss": 0.0985, "num_input_tokens_seen": 31190944, "step": 24050 }, { "epoch": 1.17535973419979, "grad_norm": 0.4281122088432312, "learning_rate": 1.7173805587243292e-05, "loss": 0.0724, "num_input_tokens_seen": 31197440, "step": 24055 }, { "epoch": 1.1756040358634843, "grad_norm": 0.17051313817501068, "learning_rate": 1.7164482166014147e-05, "loss": 0.0963, "num_input_tokens_seen": 31204352, "step": 24060 }, { "epoch": 1.1758483375271787, "grad_norm": 0.5567505955696106, "learning_rate": 1.7155159953120313e-05, "loss": 0.0917, "num_input_tokens_seen": 31210752, "step": 24065 }, { "epoch": 1.1760926391908728, "grad_norm": 0.41865411400794983, "learning_rate": 1.714583894999941e-05, "loss": 0.0749, "num_input_tokens_seen": 31217792, "step": 24070 }, { "epoch": 1.1763369408545672, "grad_norm": 0.2801002860069275, "learning_rate": 1.7136519158088826e-05, "loss": 0.0786, "num_input_tokens_seen": 31224320, "step": 24075 }, { "epoch": 1.1765812425182616, "grad_norm": 0.33937424421310425, "learning_rate": 1.712720057882581e-05, "loss": 0.0661, "num_input_tokens_seen": 31230592, "step": 24080 }, { "epoch": 1.176825544181956, "grad_norm": 0.18224751949310303, "learning_rate": 1.7117883213647413e-05, "loss": 0.0714, "num_input_tokens_seen": 31236416, "step": 24085 }, { "epoch": 1.1770698458456503, "grad_norm": 0.231103777885437, "learning_rate": 1.710856706399046e-05, "loss": 0.0941, "num_input_tokens_seen": 31242656, "step": 24090 }, { "epoch": 1.1773141475093445, "grad_norm": 0.2981773614883423, "learning_rate": 1.7099252131291648e-05, "loss": 0.0806, "num_input_tokens_seen": 31249280, "step": 24095 }, { "epoch": 1.1775584491730389, "grad_norm": 1.1109724044799805, "learning_rate": 1.708993841698744e-05, "loss": 0.1071, "num_input_tokens_seen": 31255968, "step": 24100 }, { "epoch": 1.1778027508367332, "grad_norm": 0.20251323282718658, "learning_rate": 1.7080625922514132e-05, "loss": 0.0841, "num_input_tokens_seen": 31262208, "step": 24105 }, { "epoch": 1.1780470525004274, "grad_norm": 0.3388395607471466, "learning_rate": 1.7071314649307836e-05, "loss": 0.1241, "num_input_tokens_seen": 31268160, "step": 24110 }, { "epoch": 1.1782913541641218, "grad_norm": 0.3236645460128784, "learning_rate": 1.7062004598804448e-05, "loss": 0.078, "num_input_tokens_seen": 31274368, "step": 24115 }, { "epoch": 1.1785356558278162, "grad_norm": 0.20767541229724884, "learning_rate": 1.7052695772439702e-05, "loss": 0.0846, "num_input_tokens_seen": 31280416, "step": 24120 }, { "epoch": 1.1787799574915105, "grad_norm": 0.373769611120224, "learning_rate": 1.7043388171649154e-05, "loss": 0.1183, "num_input_tokens_seen": 31286656, "step": 24125 }, { "epoch": 1.179024259155205, "grad_norm": 0.1740257889032364, "learning_rate": 1.7034081797868127e-05, "loss": 0.0673, "num_input_tokens_seen": 31292736, "step": 24130 }, { "epoch": 1.179268560818899, "grad_norm": 0.27120915055274963, "learning_rate": 1.70247766525318e-05, "loss": 0.0909, "num_input_tokens_seen": 31299008, "step": 24135 }, { "epoch": 1.1795128624825935, "grad_norm": 0.15123648941516876, "learning_rate": 1.701547273707514e-05, "loss": 0.0749, "num_input_tokens_seen": 31305568, "step": 24140 }, { "epoch": 1.1797571641462878, "grad_norm": 0.13428793847560883, "learning_rate": 1.7006170052932916e-05, "loss": 0.072, "num_input_tokens_seen": 31311936, "step": 24145 }, { "epoch": 1.1800014658099822, "grad_norm": 0.4790928363800049, "learning_rate": 1.6996868601539735e-05, "loss": 0.0994, "num_input_tokens_seen": 31318208, "step": 24150 }, { "epoch": 1.1802457674736764, "grad_norm": 0.1967855840921402, "learning_rate": 1.6987568384329977e-05, "loss": 0.1016, "num_input_tokens_seen": 31324512, "step": 24155 }, { "epoch": 1.1804900691373708, "grad_norm": 0.1648922711610794, "learning_rate": 1.6978269402737866e-05, "loss": 0.0923, "num_input_tokens_seen": 31331584, "step": 24160 }, { "epoch": 1.1807343708010651, "grad_norm": 0.35566067695617676, "learning_rate": 1.696897165819743e-05, "loss": 0.0935, "num_input_tokens_seen": 31338016, "step": 24165 }, { "epoch": 1.1809786724647595, "grad_norm": 0.2018577754497528, "learning_rate": 1.6959675152142487e-05, "loss": 0.0993, "num_input_tokens_seen": 31344640, "step": 24170 }, { "epoch": 1.181222974128454, "grad_norm": 0.19179530441761017, "learning_rate": 1.6950379886006667e-05, "loss": 0.0728, "num_input_tokens_seen": 31351040, "step": 24175 }, { "epoch": 1.181467275792148, "grad_norm": 0.1593402475118637, "learning_rate": 1.6941085861223438e-05, "loss": 0.0767, "num_input_tokens_seen": 31357664, "step": 24180 }, { "epoch": 1.1817115774558424, "grad_norm": 0.377448171377182, "learning_rate": 1.6931793079226034e-05, "loss": 0.0814, "num_input_tokens_seen": 31364064, "step": 24185 }, { "epoch": 1.1819558791195368, "grad_norm": 0.31069350242614746, "learning_rate": 1.692250154144754e-05, "loss": 0.0956, "num_input_tokens_seen": 31370592, "step": 24190 }, { "epoch": 1.1822001807832312, "grad_norm": 0.288137823343277, "learning_rate": 1.6913211249320807e-05, "loss": 0.0782, "num_input_tokens_seen": 31377120, "step": 24195 }, { "epoch": 1.1824444824469253, "grad_norm": 0.15383578836917877, "learning_rate": 1.6903922204278522e-05, "loss": 0.1008, "num_input_tokens_seen": 31383392, "step": 24200 }, { "epoch": 1.1824444824469253, "eval_loss": 0.08821100741624832, "eval_runtime": 374.8904, "eval_samples_per_second": 97.055, "eval_steps_per_second": 24.266, "num_input_tokens_seen": 31383392, "step": 24200 }, { "epoch": 1.1826887841106197, "grad_norm": 0.35958659648895264, "learning_rate": 1.6894634407753186e-05, "loss": 0.0707, "num_input_tokens_seen": 31390304, "step": 24205 }, { "epoch": 1.182933085774314, "grad_norm": 0.34996652603149414, "learning_rate": 1.6885347861177077e-05, "loss": 0.0978, "num_input_tokens_seen": 31396768, "step": 24210 }, { "epoch": 1.1831773874380085, "grad_norm": 0.11087770760059357, "learning_rate": 1.6876062565982298e-05, "loss": 0.0579, "num_input_tokens_seen": 31403232, "step": 24215 }, { "epoch": 1.1834216891017029, "grad_norm": 0.28688228130340576, "learning_rate": 1.6866778523600774e-05, "loss": 0.1004, "num_input_tokens_seen": 31410368, "step": 24220 }, { "epoch": 1.183665990765397, "grad_norm": 0.1725674718618393, "learning_rate": 1.6857495735464195e-05, "loss": 0.087, "num_input_tokens_seen": 31416768, "step": 24225 }, { "epoch": 1.1839102924290914, "grad_norm": 0.1540045589208603, "learning_rate": 1.6848214203004115e-05, "loss": 0.0669, "num_input_tokens_seen": 31422976, "step": 24230 }, { "epoch": 1.1841545940927858, "grad_norm": 0.4237011969089508, "learning_rate": 1.6838933927651835e-05, "loss": 0.0708, "num_input_tokens_seen": 31429600, "step": 24235 }, { "epoch": 1.1843988957564802, "grad_norm": 0.2078070193529129, "learning_rate": 1.6829654910838506e-05, "loss": 0.0805, "num_input_tokens_seen": 31436256, "step": 24240 }, { "epoch": 1.1846431974201743, "grad_norm": 0.6156185865402222, "learning_rate": 1.6820377153995065e-05, "loss": 0.1046, "num_input_tokens_seen": 31442304, "step": 24245 }, { "epoch": 1.1848874990838687, "grad_norm": 0.5147428512573242, "learning_rate": 1.681110065855226e-05, "loss": 0.0751, "num_input_tokens_seen": 31448832, "step": 24250 }, { "epoch": 1.185131800747563, "grad_norm": 0.2847868800163269, "learning_rate": 1.6801825425940642e-05, "loss": 0.084, "num_input_tokens_seen": 31455616, "step": 24255 }, { "epoch": 1.1853761024112575, "grad_norm": 0.13897864520549774, "learning_rate": 1.679255145759056e-05, "loss": 0.0909, "num_input_tokens_seen": 31462016, "step": 24260 }, { "epoch": 1.1856204040749518, "grad_norm": 0.5806822776794434, "learning_rate": 1.6783278754932187e-05, "loss": 0.0726, "num_input_tokens_seen": 31468288, "step": 24265 }, { "epoch": 1.185864705738646, "grad_norm": 0.307610422372818, "learning_rate": 1.6774007319395496e-05, "loss": 0.0667, "num_input_tokens_seen": 31474720, "step": 24270 }, { "epoch": 1.1861090074023404, "grad_norm": 0.5027682185173035, "learning_rate": 1.6764737152410243e-05, "loss": 0.1141, "num_input_tokens_seen": 31481376, "step": 24275 }, { "epoch": 1.1863533090660348, "grad_norm": 0.3484010398387909, "learning_rate": 1.6755468255406016e-05, "loss": 0.0591, "num_input_tokens_seen": 31487584, "step": 24280 }, { "epoch": 1.1865976107297291, "grad_norm": 0.38856229186058044, "learning_rate": 1.674620062981219e-05, "loss": 0.0771, "num_input_tokens_seen": 31493760, "step": 24285 }, { "epoch": 1.1868419123934233, "grad_norm": 0.19012802839279175, "learning_rate": 1.6736934277057947e-05, "loss": 0.079, "num_input_tokens_seen": 31499936, "step": 24290 }, { "epoch": 1.1870862140571177, "grad_norm": 0.17644250392913818, "learning_rate": 1.6727669198572286e-05, "loss": 0.0733, "num_input_tokens_seen": 31505984, "step": 24295 }, { "epoch": 1.187330515720812, "grad_norm": 0.5297979712486267, "learning_rate": 1.6718405395783984e-05, "loss": 0.0995, "num_input_tokens_seen": 31512640, "step": 24300 }, { "epoch": 1.1875748173845064, "grad_norm": 0.29540935158729553, "learning_rate": 1.6709142870121643e-05, "loss": 0.0673, "num_input_tokens_seen": 31518816, "step": 24305 }, { "epoch": 1.1878191190482008, "grad_norm": 0.2464703768491745, "learning_rate": 1.669988162301367e-05, "loss": 0.1039, "num_input_tokens_seen": 31525280, "step": 24310 }, { "epoch": 1.188063420711895, "grad_norm": 0.5204055309295654, "learning_rate": 1.6690621655888243e-05, "loss": 0.0992, "num_input_tokens_seen": 31531584, "step": 24315 }, { "epoch": 1.1883077223755893, "grad_norm": 0.38496842980384827, "learning_rate": 1.6681362970173386e-05, "loss": 0.0942, "num_input_tokens_seen": 31538048, "step": 24320 }, { "epoch": 1.1885520240392837, "grad_norm": 0.149429589509964, "learning_rate": 1.6672105567296904e-05, "loss": 0.0827, "num_input_tokens_seen": 31544800, "step": 24325 }, { "epoch": 1.188796325702978, "grad_norm": 0.1958753913640976, "learning_rate": 1.666284944868639e-05, "loss": 0.0738, "num_input_tokens_seen": 31551680, "step": 24330 }, { "epoch": 1.1890406273666723, "grad_norm": 0.5802128314971924, "learning_rate": 1.665359461576927e-05, "loss": 0.0858, "num_input_tokens_seen": 31558240, "step": 24335 }, { "epoch": 1.1892849290303666, "grad_norm": 0.19487819075584412, "learning_rate": 1.6644341069972736e-05, "loss": 0.0974, "num_input_tokens_seen": 31564608, "step": 24340 }, { "epoch": 1.189529230694061, "grad_norm": 0.18071608245372772, "learning_rate": 1.6635088812723813e-05, "loss": 0.0847, "num_input_tokens_seen": 31570880, "step": 24345 }, { "epoch": 1.1897735323577554, "grad_norm": 0.19881026446819305, "learning_rate": 1.6625837845449328e-05, "loss": 0.0643, "num_input_tokens_seen": 31577664, "step": 24350 }, { "epoch": 1.1900178340214498, "grad_norm": 0.21438352763652802, "learning_rate": 1.6616588169575874e-05, "loss": 0.0754, "num_input_tokens_seen": 31584128, "step": 24355 }, { "epoch": 1.190262135685144, "grad_norm": 0.10385286808013916, "learning_rate": 1.6607339786529878e-05, "loss": 0.0695, "num_input_tokens_seen": 31590592, "step": 24360 }, { "epoch": 1.1905064373488383, "grad_norm": 0.42750975489616394, "learning_rate": 1.659809269773756e-05, "loss": 0.0946, "num_input_tokens_seen": 31596864, "step": 24365 }, { "epoch": 1.1907507390125327, "grad_norm": 0.15890900790691376, "learning_rate": 1.658884690462493e-05, "loss": 0.0991, "num_input_tokens_seen": 31603040, "step": 24370 }, { "epoch": 1.190995040676227, "grad_norm": 0.4267442226409912, "learning_rate": 1.6579602408617813e-05, "loss": 0.0704, "num_input_tokens_seen": 31609248, "step": 24375 }, { "epoch": 1.1912393423399212, "grad_norm": 0.31285929679870605, "learning_rate": 1.657035921114181e-05, "loss": 0.0844, "num_input_tokens_seen": 31615648, "step": 24380 }, { "epoch": 1.1914836440036156, "grad_norm": 0.29172876477241516, "learning_rate": 1.656111731362236e-05, "loss": 0.0855, "num_input_tokens_seen": 31621600, "step": 24385 }, { "epoch": 1.19172794566731, "grad_norm": 0.2808176279067993, "learning_rate": 1.6551876717484666e-05, "loss": 0.0933, "num_input_tokens_seen": 31628000, "step": 24390 }, { "epoch": 1.1919722473310044, "grad_norm": 0.17154306173324585, "learning_rate": 1.6542637424153752e-05, "loss": 0.0805, "num_input_tokens_seen": 31634560, "step": 24395 }, { "epoch": 1.1922165489946988, "grad_norm": 0.27979540824890137, "learning_rate": 1.6533399435054418e-05, "loss": 0.0729, "num_input_tokens_seen": 31641056, "step": 24400 }, { "epoch": 1.1922165489946988, "eval_loss": 0.08877936005592346, "eval_runtime": 374.2087, "eval_samples_per_second": 97.232, "eval_steps_per_second": 24.31, "num_input_tokens_seen": 31641056, "step": 24400 }, { "epoch": 1.192460850658393, "grad_norm": 0.24176718294620514, "learning_rate": 1.6524162751611304e-05, "loss": 0.0955, "num_input_tokens_seen": 31648064, "step": 24405 }, { "epoch": 1.1927051523220873, "grad_norm": 0.40295934677124023, "learning_rate": 1.6514927375248796e-05, "loss": 0.0655, "num_input_tokens_seen": 31654528, "step": 24410 }, { "epoch": 1.1929494539857817, "grad_norm": 0.23432651162147522, "learning_rate": 1.6505693307391127e-05, "loss": 0.0559, "num_input_tokens_seen": 31661216, "step": 24415 }, { "epoch": 1.193193755649476, "grad_norm": 0.2776679992675781, "learning_rate": 1.6496460549462288e-05, "loss": 0.0966, "num_input_tokens_seen": 31667584, "step": 24420 }, { "epoch": 1.1934380573131702, "grad_norm": 0.22139978408813477, "learning_rate": 1.6487229102886097e-05, "loss": 0.0777, "num_input_tokens_seen": 31673728, "step": 24425 }, { "epoch": 1.1936823589768646, "grad_norm": 0.2742421329021454, "learning_rate": 1.6477998969086155e-05, "loss": 0.1081, "num_input_tokens_seen": 31679872, "step": 24430 }, { "epoch": 1.193926660640559, "grad_norm": 0.19011162221431732, "learning_rate": 1.646877014948587e-05, "loss": 0.0688, "num_input_tokens_seen": 31686176, "step": 24435 }, { "epoch": 1.1941709623042533, "grad_norm": 0.2532915472984314, "learning_rate": 1.6459542645508433e-05, "loss": 0.0845, "num_input_tokens_seen": 31692544, "step": 24440 }, { "epoch": 1.1944152639679477, "grad_norm": 0.4480985701084137, "learning_rate": 1.6450316458576852e-05, "loss": 0.0767, "num_input_tokens_seen": 31699136, "step": 24445 }, { "epoch": 1.1946595656316419, "grad_norm": 0.24386285245418549, "learning_rate": 1.6441091590113912e-05, "loss": 0.0842, "num_input_tokens_seen": 31705472, "step": 24450 }, { "epoch": 1.1949038672953363, "grad_norm": 0.2860795855522156, "learning_rate": 1.6431868041542213e-05, "loss": 0.0915, "num_input_tokens_seen": 31711712, "step": 24455 }, { "epoch": 1.1951481689590306, "grad_norm": 0.35905712842941284, "learning_rate": 1.6422645814284123e-05, "loss": 0.0918, "num_input_tokens_seen": 31718368, "step": 24460 }, { "epoch": 1.195392470622725, "grad_norm": 0.25171351432800293, "learning_rate": 1.6413424909761846e-05, "loss": 0.0759, "num_input_tokens_seen": 31724576, "step": 24465 }, { "epoch": 1.1956367722864192, "grad_norm": 0.4168550968170166, "learning_rate": 1.640420532939736e-05, "loss": 0.0928, "num_input_tokens_seen": 31731776, "step": 24470 }, { "epoch": 1.1958810739501136, "grad_norm": 0.778192937374115, "learning_rate": 1.639498707461242e-05, "loss": 0.1188, "num_input_tokens_seen": 31738400, "step": 24475 }, { "epoch": 1.196125375613808, "grad_norm": 0.37748032808303833, "learning_rate": 1.6385770146828614e-05, "loss": 0.0863, "num_input_tokens_seen": 31744384, "step": 24480 }, { "epoch": 1.1963696772775023, "grad_norm": 1.069503664970398, "learning_rate": 1.637655454746731e-05, "loss": 0.0951, "num_input_tokens_seen": 31750816, "step": 24485 }, { "epoch": 1.1966139789411967, "grad_norm": 0.3891218900680542, "learning_rate": 1.6367340277949658e-05, "loss": 0.0946, "num_input_tokens_seen": 31757344, "step": 24490 }, { "epoch": 1.1968582806048909, "grad_norm": 0.2604268789291382, "learning_rate": 1.635812733969663e-05, "loss": 0.0845, "num_input_tokens_seen": 31764096, "step": 24495 }, { "epoch": 1.1971025822685852, "grad_norm": 0.3548349440097809, "learning_rate": 1.634891573412896e-05, "loss": 0.0744, "num_input_tokens_seen": 31770528, "step": 24500 }, { "epoch": 1.1973468839322796, "grad_norm": 0.2827579975128174, "learning_rate": 1.6339705462667196e-05, "loss": 0.115, "num_input_tokens_seen": 31776576, "step": 24505 }, { "epoch": 1.197591185595974, "grad_norm": 0.2724158465862274, "learning_rate": 1.633049652673169e-05, "loss": 0.0836, "num_input_tokens_seen": 31782848, "step": 24510 }, { "epoch": 1.1978354872596682, "grad_norm": 0.7787217497825623, "learning_rate": 1.632128892774256e-05, "loss": 0.0746, "num_input_tokens_seen": 31789696, "step": 24515 }, { "epoch": 1.1980797889233625, "grad_norm": 0.5216692686080933, "learning_rate": 1.6312082667119737e-05, "loss": 0.0939, "num_input_tokens_seen": 31796512, "step": 24520 }, { "epoch": 1.198324090587057, "grad_norm": 0.3693056106567383, "learning_rate": 1.630287774628296e-05, "loss": 0.0983, "num_input_tokens_seen": 31802272, "step": 24525 }, { "epoch": 1.1985683922507513, "grad_norm": 0.1574568897485733, "learning_rate": 1.6293674166651718e-05, "loss": 0.0923, "num_input_tokens_seen": 31809312, "step": 24530 }, { "epoch": 1.1988126939144457, "grad_norm": 0.1345331370830536, "learning_rate": 1.6284471929645338e-05, "loss": 0.0806, "num_input_tokens_seen": 31815424, "step": 24535 }, { "epoch": 1.1990569955781398, "grad_norm": 0.541074812412262, "learning_rate": 1.627527103668291e-05, "loss": 0.0839, "num_input_tokens_seen": 31822048, "step": 24540 }, { "epoch": 1.1993012972418342, "grad_norm": 0.15069858729839325, "learning_rate": 1.6266071489183327e-05, "loss": 0.0751, "num_input_tokens_seen": 31828544, "step": 24545 }, { "epoch": 1.1995455989055286, "grad_norm": 0.7321081161499023, "learning_rate": 1.6256873288565283e-05, "loss": 0.1103, "num_input_tokens_seen": 31835424, "step": 24550 }, { "epoch": 1.199789900569223, "grad_norm": 0.23036976158618927, "learning_rate": 1.6247676436247245e-05, "loss": 0.0899, "num_input_tokens_seen": 31841856, "step": 24555 }, { "epoch": 1.2000342022329171, "grad_norm": 0.5028965473175049, "learning_rate": 1.6238480933647486e-05, "loss": 0.092, "num_input_tokens_seen": 31848032, "step": 24560 }, { "epoch": 1.2002785038966115, "grad_norm": 0.1690777987241745, "learning_rate": 1.6229286782184083e-05, "loss": 0.0837, "num_input_tokens_seen": 31854464, "step": 24565 }, { "epoch": 1.2005228055603059, "grad_norm": 0.6874639391899109, "learning_rate": 1.622009398327487e-05, "loss": 0.1028, "num_input_tokens_seen": 31860992, "step": 24570 }, { "epoch": 1.2007671072240003, "grad_norm": 0.31878629326820374, "learning_rate": 1.6210902538337502e-05, "loss": 0.078, "num_input_tokens_seen": 31867904, "step": 24575 }, { "epoch": 1.2010114088876946, "grad_norm": 0.2408560961484909, "learning_rate": 1.6201712448789413e-05, "loss": 0.088, "num_input_tokens_seen": 31874048, "step": 24580 }, { "epoch": 1.2012557105513888, "grad_norm": 0.342919260263443, "learning_rate": 1.6192523716047827e-05, "loss": 0.0751, "num_input_tokens_seen": 31881952, "step": 24585 }, { "epoch": 1.2015000122150832, "grad_norm": 0.19076092541217804, "learning_rate": 1.6183336341529776e-05, "loss": 0.0716, "num_input_tokens_seen": 31888256, "step": 24590 }, { "epoch": 1.2017443138787776, "grad_norm": 0.21221424639225006, "learning_rate": 1.6174150326652047e-05, "loss": 0.0701, "num_input_tokens_seen": 31894624, "step": 24595 }, { "epoch": 1.201988615542472, "grad_norm": 0.4570421278476715, "learning_rate": 1.6164965672831256e-05, "loss": 0.1071, "num_input_tokens_seen": 31900960, "step": 24600 }, { "epoch": 1.201988615542472, "eval_loss": 0.08879601210355759, "eval_runtime": 375.3274, "eval_samples_per_second": 96.942, "eval_steps_per_second": 24.238, "num_input_tokens_seen": 31900960, "step": 24600 }, { "epoch": 1.202232917206166, "grad_norm": 0.34616026282310486, "learning_rate": 1.6155782381483784e-05, "loss": 0.0817, "num_input_tokens_seen": 31907584, "step": 24605 }, { "epoch": 1.2024772188698605, "grad_norm": 0.18617019057273865, "learning_rate": 1.6146600454025813e-05, "loss": 0.0804, "num_input_tokens_seen": 31914208, "step": 24610 }, { "epoch": 1.2027215205335549, "grad_norm": 0.4333128035068512, "learning_rate": 1.6137419891873317e-05, "loss": 0.1331, "num_input_tokens_seen": 31920128, "step": 24615 }, { "epoch": 1.2029658221972492, "grad_norm": 0.16158415377140045, "learning_rate": 1.6128240696442038e-05, "loss": 0.0868, "num_input_tokens_seen": 31926688, "step": 24620 }, { "epoch": 1.2032101238609436, "grad_norm": 0.18652395904064178, "learning_rate": 1.611906286914753e-05, "loss": 0.0654, "num_input_tokens_seen": 31933152, "step": 24625 }, { "epoch": 1.2034544255246378, "grad_norm": 1.1319457292556763, "learning_rate": 1.6109886411405144e-05, "loss": 0.0666, "num_input_tokens_seen": 31940576, "step": 24630 }, { "epoch": 1.2036987271883322, "grad_norm": 0.36403921246528625, "learning_rate": 1.6100711324629985e-05, "loss": 0.1157, "num_input_tokens_seen": 31947360, "step": 24635 }, { "epoch": 1.2039430288520265, "grad_norm": 0.3333069682121277, "learning_rate": 1.609153761023698e-05, "loss": 0.0772, "num_input_tokens_seen": 31953760, "step": 24640 }, { "epoch": 1.2041873305157207, "grad_norm": 0.6274518370628357, "learning_rate": 1.608236526964083e-05, "loss": 0.0959, "num_input_tokens_seen": 31960064, "step": 24645 }, { "epoch": 1.204431632179415, "grad_norm": 0.33702051639556885, "learning_rate": 1.607319430425601e-05, "loss": 0.0671, "num_input_tokens_seen": 31967168, "step": 24650 }, { "epoch": 1.2046759338431094, "grad_norm": 0.409261554479599, "learning_rate": 1.606402471549682e-05, "loss": 0.0855, "num_input_tokens_seen": 31974144, "step": 24655 }, { "epoch": 1.2049202355068038, "grad_norm": 0.35141655802726746, "learning_rate": 1.6054856504777312e-05, "loss": 0.0629, "num_input_tokens_seen": 31980672, "step": 24660 }, { "epoch": 1.2051645371704982, "grad_norm": 0.2839643657207489, "learning_rate": 1.6045689673511334e-05, "loss": 0.0971, "num_input_tokens_seen": 31987040, "step": 24665 }, { "epoch": 1.2054088388341924, "grad_norm": 0.1840907484292984, "learning_rate": 1.6036524223112548e-05, "loss": 0.0872, "num_input_tokens_seen": 31993760, "step": 24670 }, { "epoch": 1.2056531404978867, "grad_norm": 0.9274163246154785, "learning_rate": 1.602736015499436e-05, "loss": 0.0937, "num_input_tokens_seen": 32000288, "step": 24675 }, { "epoch": 1.2058974421615811, "grad_norm": 0.3029390573501587, "learning_rate": 1.601819747057e-05, "loss": 0.0846, "num_input_tokens_seen": 32006592, "step": 24680 }, { "epoch": 1.2061417438252755, "grad_norm": 0.26740431785583496, "learning_rate": 1.6009036171252465e-05, "loss": 0.0995, "num_input_tokens_seen": 32012352, "step": 24685 }, { "epoch": 1.2063860454889697, "grad_norm": 0.3790183663368225, "learning_rate": 1.599987625845453e-05, "loss": 0.0778, "num_input_tokens_seen": 32018880, "step": 24690 }, { "epoch": 1.206630347152664, "grad_norm": 0.2754775285720825, "learning_rate": 1.599071773358879e-05, "loss": 0.0866, "num_input_tokens_seen": 32025184, "step": 24695 }, { "epoch": 1.2068746488163584, "grad_norm": 0.29090195894241333, "learning_rate": 1.598156059806758e-05, "loss": 0.123, "num_input_tokens_seen": 32031648, "step": 24700 }, { "epoch": 1.2071189504800528, "grad_norm": 0.41381242871284485, "learning_rate": 1.5972404853303062e-05, "loss": 0.0709, "num_input_tokens_seen": 32038432, "step": 24705 }, { "epoch": 1.2073632521437472, "grad_norm": 0.4795931577682495, "learning_rate": 1.5963250500707172e-05, "loss": 0.0665, "num_input_tokens_seen": 32044384, "step": 24710 }, { "epoch": 1.2076075538074413, "grad_norm": 0.2261962741613388, "learning_rate": 1.5954097541691612e-05, "loss": 0.0969, "num_input_tokens_seen": 32050688, "step": 24715 }, { "epoch": 1.2078518554711357, "grad_norm": 0.20286764204502106, "learning_rate": 1.5944945977667884e-05, "loss": 0.0998, "num_input_tokens_seen": 32057216, "step": 24720 }, { "epoch": 1.20809615713483, "grad_norm": 0.30432775616645813, "learning_rate": 1.593579581004729e-05, "loss": 0.1183, "num_input_tokens_seen": 32063680, "step": 24725 }, { "epoch": 1.2083404587985245, "grad_norm": 0.16225813329219818, "learning_rate": 1.592664704024088e-05, "loss": 0.0729, "num_input_tokens_seen": 32070080, "step": 24730 }, { "epoch": 1.2085847604622186, "grad_norm": 0.45805418491363525, "learning_rate": 1.591749966965953e-05, "loss": 0.0833, "num_input_tokens_seen": 32076384, "step": 24735 }, { "epoch": 1.208829062125913, "grad_norm": 0.34135526418685913, "learning_rate": 1.5908353699713856e-05, "loss": 0.0822, "num_input_tokens_seen": 32082880, "step": 24740 }, { "epoch": 1.2090733637896074, "grad_norm": 0.20437295734882355, "learning_rate": 1.5899209131814298e-05, "loss": 0.0822, "num_input_tokens_seen": 32088928, "step": 24745 }, { "epoch": 1.2093176654533018, "grad_norm": 0.8305702209472656, "learning_rate": 1.5890065967371067e-05, "loss": 0.1084, "num_input_tokens_seen": 32095200, "step": 24750 }, { "epoch": 1.2095619671169962, "grad_norm": 0.10377305001020432, "learning_rate": 1.5880924207794144e-05, "loss": 0.0721, "num_input_tokens_seen": 32101856, "step": 24755 }, { "epoch": 1.2098062687806903, "grad_norm": 0.8245893120765686, "learning_rate": 1.5871783854493298e-05, "loss": 0.0915, "num_input_tokens_seen": 32108096, "step": 24760 }, { "epoch": 1.2100505704443847, "grad_norm": 0.8027825951576233, "learning_rate": 1.5862644908878106e-05, "loss": 0.0945, "num_input_tokens_seen": 32114304, "step": 24765 }, { "epoch": 1.210294872108079, "grad_norm": 1.0613762140274048, "learning_rate": 1.5853507372357885e-05, "loss": 0.0879, "num_input_tokens_seen": 32120576, "step": 24770 }, { "epoch": 1.2105391737717734, "grad_norm": 0.10198996216058731, "learning_rate": 1.5844371246341776e-05, "loss": 0.0708, "num_input_tokens_seen": 32126688, "step": 24775 }, { "epoch": 1.2107834754354676, "grad_norm": 0.34596744179725647, "learning_rate": 1.5835236532238674e-05, "loss": 0.0881, "num_input_tokens_seen": 32132864, "step": 24780 }, { "epoch": 1.211027777099162, "grad_norm": 0.4405292868614197, "learning_rate": 1.582610323145727e-05, "loss": 0.0947, "num_input_tokens_seen": 32139136, "step": 24785 }, { "epoch": 1.2112720787628564, "grad_norm": 0.18132233619689941, "learning_rate": 1.5816971345406035e-05, "loss": 0.0822, "num_input_tokens_seen": 32145440, "step": 24790 }, { "epoch": 1.2115163804265507, "grad_norm": 0.2082556039094925, "learning_rate": 1.5807840875493225e-05, "loss": 0.0743, "num_input_tokens_seen": 32151936, "step": 24795 }, { "epoch": 1.2117606820902451, "grad_norm": 0.5762012600898743, "learning_rate": 1.5798711823126854e-05, "loss": 0.0779, "num_input_tokens_seen": 32158304, "step": 24800 }, { "epoch": 1.2117606820902451, "eval_loss": 0.08829185366630554, "eval_runtime": 374.2953, "eval_samples_per_second": 97.209, "eval_steps_per_second": 24.304, "num_input_tokens_seen": 32158304, "step": 24800 }, { "epoch": 1.2120049837539393, "grad_norm": 0.4280950427055359, "learning_rate": 1.578958418971477e-05, "loss": 0.0987, "num_input_tokens_seen": 32164832, "step": 24805 }, { "epoch": 1.2122492854176337, "grad_norm": 0.7062125205993652, "learning_rate": 1.578045797666453e-05, "loss": 0.0804, "num_input_tokens_seen": 32171360, "step": 24810 }, { "epoch": 1.212493587081328, "grad_norm": 0.16137292981147766, "learning_rate": 1.5771333185383548e-05, "loss": 0.0808, "num_input_tokens_seen": 32178016, "step": 24815 }, { "epoch": 1.2127378887450224, "grad_norm": 0.20225152373313904, "learning_rate": 1.576220981727895e-05, "loss": 0.0852, "num_input_tokens_seen": 32184320, "step": 24820 }, { "epoch": 1.2129821904087166, "grad_norm": 0.25654634833335876, "learning_rate": 1.575308787375769e-05, "loss": 0.0775, "num_input_tokens_seen": 32190816, "step": 24825 }, { "epoch": 1.213226492072411, "grad_norm": 0.9083828926086426, "learning_rate": 1.5743967356226492e-05, "loss": 0.0876, "num_input_tokens_seen": 32197440, "step": 24830 }, { "epoch": 1.2134707937361053, "grad_norm": 0.3765968978404999, "learning_rate": 1.5734848266091835e-05, "loss": 0.0985, "num_input_tokens_seen": 32204096, "step": 24835 }, { "epoch": 1.2137150953997997, "grad_norm": 0.30871275067329407, "learning_rate": 1.572573060476001e-05, "loss": 0.0856, "num_input_tokens_seen": 32210560, "step": 24840 }, { "epoch": 1.213959397063494, "grad_norm": 0.1266026794910431, "learning_rate": 1.5716614373637085e-05, "loss": 0.082, "num_input_tokens_seen": 32217600, "step": 24845 }, { "epoch": 1.2142036987271883, "grad_norm": 0.12617245316505432, "learning_rate": 1.570749957412887e-05, "loss": 0.0537, "num_input_tokens_seen": 32224160, "step": 24850 }, { "epoch": 1.2144480003908826, "grad_norm": 0.362613707780838, "learning_rate": 1.5698386207641013e-05, "loss": 0.0815, "num_input_tokens_seen": 32230560, "step": 24855 }, { "epoch": 1.214692302054577, "grad_norm": 0.16170601546764374, "learning_rate": 1.5689274275578884e-05, "loss": 0.0829, "num_input_tokens_seen": 32236736, "step": 24860 }, { "epoch": 1.2149366037182714, "grad_norm": 0.35928866267204285, "learning_rate": 1.5680163779347667e-05, "loss": 0.0791, "num_input_tokens_seen": 32243200, "step": 24865 }, { "epoch": 1.2151809053819655, "grad_norm": 0.20218518376350403, "learning_rate": 1.5671054720352327e-05, "loss": 0.0816, "num_input_tokens_seen": 32249728, "step": 24870 }, { "epoch": 1.21542520704566, "grad_norm": 0.2694813311100006, "learning_rate": 1.566194709999757e-05, "loss": 0.0989, "num_input_tokens_seen": 32256416, "step": 24875 }, { "epoch": 1.2156695087093543, "grad_norm": 0.2409064769744873, "learning_rate": 1.5652840919687933e-05, "loss": 0.0713, "num_input_tokens_seen": 32262560, "step": 24880 }, { "epoch": 1.2159138103730487, "grad_norm": 0.5510802865028381, "learning_rate": 1.5643736180827676e-05, "loss": 0.0822, "num_input_tokens_seen": 32268928, "step": 24885 }, { "epoch": 1.216158112036743, "grad_norm": 0.24627253413200378, "learning_rate": 1.5634632884820878e-05, "loss": 0.0671, "num_input_tokens_seen": 32275200, "step": 24890 }, { "epoch": 1.2164024137004372, "grad_norm": 0.1453610211610794, "learning_rate": 1.5625531033071395e-05, "loss": 0.0804, "num_input_tokens_seen": 32282080, "step": 24895 }, { "epoch": 1.2166467153641316, "grad_norm": 0.17365503311157227, "learning_rate": 1.5616430626982828e-05, "loss": 0.0801, "num_input_tokens_seen": 32289184, "step": 24900 }, { "epoch": 1.216891017027826, "grad_norm": 0.2583310008049011, "learning_rate": 1.5607331667958575e-05, "loss": 0.0772, "num_input_tokens_seen": 32295776, "step": 24905 }, { "epoch": 1.2171353186915204, "grad_norm": 0.48217928409576416, "learning_rate": 1.5598234157401824e-05, "loss": 0.0748, "num_input_tokens_seen": 32302240, "step": 24910 }, { "epoch": 1.2173796203552145, "grad_norm": 0.5845999121665955, "learning_rate": 1.5589138096715503e-05, "loss": 0.1215, "num_input_tokens_seen": 32308384, "step": 24915 }, { "epoch": 1.217623922018909, "grad_norm": 0.10935187339782715, "learning_rate": 1.5580043487302365e-05, "loss": 0.083, "num_input_tokens_seen": 32314912, "step": 24920 }, { "epoch": 1.2178682236826033, "grad_norm": 0.1677830070257187, "learning_rate": 1.5570950330564888e-05, "loss": 0.0733, "num_input_tokens_seen": 32322112, "step": 24925 }, { "epoch": 1.2181125253462977, "grad_norm": 0.2535870671272278, "learning_rate": 1.5561858627905367e-05, "loss": 0.077, "num_input_tokens_seen": 32328704, "step": 24930 }, { "epoch": 1.218356827009992, "grad_norm": 0.3831513226032257, "learning_rate": 1.5552768380725857e-05, "loss": 0.0677, "num_input_tokens_seen": 32335168, "step": 24935 }, { "epoch": 1.2186011286736862, "grad_norm": 0.24609941244125366, "learning_rate": 1.5543679590428183e-05, "loss": 0.0763, "num_input_tokens_seen": 32341856, "step": 24940 }, { "epoch": 1.2188454303373806, "grad_norm": 0.940830647945404, "learning_rate": 1.5534592258413943e-05, "loss": 0.094, "num_input_tokens_seen": 32348384, "step": 24945 }, { "epoch": 1.219089732001075, "grad_norm": 0.26549750566482544, "learning_rate": 1.5525506386084538e-05, "loss": 0.0686, "num_input_tokens_seen": 32355136, "step": 24950 }, { "epoch": 1.2193340336647693, "grad_norm": 0.14345628023147583, "learning_rate": 1.55164219748411e-05, "loss": 0.091, "num_input_tokens_seen": 32361568, "step": 24955 }, { "epoch": 1.2195783353284635, "grad_norm": 0.6293279528617859, "learning_rate": 1.550733902608459e-05, "loss": 0.0923, "num_input_tokens_seen": 32368128, "step": 24960 }, { "epoch": 1.2198226369921579, "grad_norm": 0.3443799316883087, "learning_rate": 1.549825754121568e-05, "loss": 0.1, "num_input_tokens_seen": 32374336, "step": 24965 }, { "epoch": 1.2200669386558523, "grad_norm": 0.4696604609489441, "learning_rate": 1.5489177521634864e-05, "loss": 0.0738, "num_input_tokens_seen": 32381024, "step": 24970 }, { "epoch": 1.2203112403195466, "grad_norm": 0.20426639914512634, "learning_rate": 1.5480098968742402e-05, "loss": 0.074, "num_input_tokens_seen": 32387552, "step": 24975 }, { "epoch": 1.220555541983241, "grad_norm": 0.31521904468536377, "learning_rate": 1.5471021883938304e-05, "loss": 0.1031, "num_input_tokens_seen": 32394048, "step": 24980 }, { "epoch": 1.2207998436469352, "grad_norm": 0.15247222781181335, "learning_rate": 1.546194626862238e-05, "loss": 0.0763, "num_input_tokens_seen": 32400352, "step": 24985 }, { "epoch": 1.2210441453106295, "grad_norm": 0.2107960283756256, "learning_rate": 1.5452872124194216e-05, "loss": 0.0877, "num_input_tokens_seen": 32406944, "step": 24990 }, { "epoch": 1.221288446974324, "grad_norm": 0.24275875091552734, "learning_rate": 1.5443799452053136e-05, "loss": 0.0976, "num_input_tokens_seen": 32412672, "step": 24995 }, { "epoch": 1.2215327486380183, "grad_norm": 0.24012181162834167, "learning_rate": 1.543472825359828e-05, "loss": 0.0784, "num_input_tokens_seen": 32419552, "step": 25000 }, { "epoch": 1.2215327486380183, "eval_loss": 0.08778662979602814, "eval_runtime": 374.3662, "eval_samples_per_second": 97.191, "eval_steps_per_second": 24.3, "num_input_tokens_seen": 32419552, "step": 25000 }, { "epoch": 1.2217770503017125, "grad_norm": 0.27773892879486084, "learning_rate": 1.5425658530228522e-05, "loss": 0.0974, "num_input_tokens_seen": 32425728, "step": 25005 }, { "epoch": 1.2220213519654068, "grad_norm": 0.22383107244968414, "learning_rate": 1.5416590283342546e-05, "loss": 0.0872, "num_input_tokens_seen": 32432096, "step": 25010 }, { "epoch": 1.2222656536291012, "grad_norm": 0.5690848231315613, "learning_rate": 1.5407523514338783e-05, "loss": 0.1119, "num_input_tokens_seen": 32438528, "step": 25015 }, { "epoch": 1.2225099552927956, "grad_norm": 0.23976829648017883, "learning_rate": 1.539845822461543e-05, "loss": 0.0952, "num_input_tokens_seen": 32445632, "step": 25020 }, { "epoch": 1.22275425695649, "grad_norm": 0.6787528991699219, "learning_rate": 1.538939441557048e-05, "loss": 0.058, "num_input_tokens_seen": 32451936, "step": 25025 }, { "epoch": 1.2229985586201841, "grad_norm": 0.12737970054149628, "learning_rate": 1.5380332088601696e-05, "loss": 0.0754, "num_input_tokens_seen": 32458176, "step": 25030 }, { "epoch": 1.2232428602838785, "grad_norm": 0.17520840466022491, "learning_rate": 1.537127124510658e-05, "loss": 0.0731, "num_input_tokens_seen": 32464640, "step": 25035 }, { "epoch": 1.223487161947573, "grad_norm": 0.26177743077278137, "learning_rate": 1.5362211886482457e-05, "loss": 0.0972, "num_input_tokens_seen": 32471904, "step": 25040 }, { "epoch": 1.2237314636112673, "grad_norm": 0.186601459980011, "learning_rate": 1.5353154014126363e-05, "loss": 0.0814, "num_input_tokens_seen": 32478240, "step": 25045 }, { "epoch": 1.2239757652749614, "grad_norm": 0.5488385558128357, "learning_rate": 1.534409762943515e-05, "loss": 0.0899, "num_input_tokens_seen": 32484768, "step": 25050 }, { "epoch": 1.2242200669386558, "grad_norm": 0.16970010101795197, "learning_rate": 1.5335042733805438e-05, "loss": 0.0726, "num_input_tokens_seen": 32491264, "step": 25055 }, { "epoch": 1.2244643686023502, "grad_norm": 0.37622544169425964, "learning_rate": 1.532598932863358e-05, "loss": 0.1011, "num_input_tokens_seen": 32497344, "step": 25060 }, { "epoch": 1.2247086702660446, "grad_norm": 0.2392369955778122, "learning_rate": 1.531693741531574e-05, "loss": 0.0944, "num_input_tokens_seen": 32503680, "step": 25065 }, { "epoch": 1.224952971929739, "grad_norm": 0.23403972387313843, "learning_rate": 1.5307886995247844e-05, "loss": 0.1117, "num_input_tokens_seen": 32510016, "step": 25070 }, { "epoch": 1.2251972735934331, "grad_norm": 0.26370009779930115, "learning_rate": 1.529883806982557e-05, "loss": 0.0682, "num_input_tokens_seen": 32516320, "step": 25075 }, { "epoch": 1.2254415752571275, "grad_norm": 0.34888434410095215, "learning_rate": 1.5289790640444376e-05, "loss": 0.0926, "num_input_tokens_seen": 32522400, "step": 25080 }, { "epoch": 1.2256858769208219, "grad_norm": 0.3487650454044342, "learning_rate": 1.5280744708499494e-05, "loss": 0.092, "num_input_tokens_seen": 32528992, "step": 25085 }, { "epoch": 1.2259301785845163, "grad_norm": 0.2576135993003845, "learning_rate": 1.527170027538591e-05, "loss": 0.0748, "num_input_tokens_seen": 32535296, "step": 25090 }, { "epoch": 1.2261744802482104, "grad_norm": 0.41690078377723694, "learning_rate": 1.5262657342498407e-05, "loss": 0.0859, "num_input_tokens_seen": 32541568, "step": 25095 }, { "epoch": 1.2264187819119048, "grad_norm": 0.49250322580337524, "learning_rate": 1.52536159112315e-05, "loss": 0.0962, "num_input_tokens_seen": 32548064, "step": 25100 }, { "epoch": 1.2266630835755992, "grad_norm": 0.28775978088378906, "learning_rate": 1.5244575982979497e-05, "loss": 0.0648, "num_input_tokens_seen": 32554592, "step": 25105 }, { "epoch": 1.2269073852392935, "grad_norm": 0.1341841220855713, "learning_rate": 1.5235537559136487e-05, "loss": 0.0777, "num_input_tokens_seen": 32561440, "step": 25110 }, { "epoch": 1.227151686902988, "grad_norm": 0.22968634963035583, "learning_rate": 1.5226500641096286e-05, "loss": 0.0806, "num_input_tokens_seen": 32568064, "step": 25115 }, { "epoch": 1.227395988566682, "grad_norm": 0.11488199979066849, "learning_rate": 1.5217465230252509e-05, "loss": 0.0736, "num_input_tokens_seen": 32574592, "step": 25120 }, { "epoch": 1.2276402902303765, "grad_norm": 0.31505370140075684, "learning_rate": 1.5208431327998523e-05, "loss": 0.1016, "num_input_tokens_seen": 32580608, "step": 25125 }, { "epoch": 1.2278845918940708, "grad_norm": 0.5413758158683777, "learning_rate": 1.5199398935727477e-05, "loss": 0.0839, "num_input_tokens_seen": 32587200, "step": 25130 }, { "epoch": 1.2281288935577652, "grad_norm": 0.34145811200141907, "learning_rate": 1.5190368054832282e-05, "loss": 0.1067, "num_input_tokens_seen": 32593568, "step": 25135 }, { "epoch": 1.2283731952214594, "grad_norm": 0.11338799446821213, "learning_rate": 1.5181338686705601e-05, "loss": 0.0866, "num_input_tokens_seen": 32600128, "step": 25140 }, { "epoch": 1.2286174968851538, "grad_norm": 0.20394062995910645, "learning_rate": 1.5172310832739889e-05, "loss": 0.0709, "num_input_tokens_seen": 32606336, "step": 25145 }, { "epoch": 1.2288617985488481, "grad_norm": 0.43838945031166077, "learning_rate": 1.5163284494327346e-05, "loss": 0.0697, "num_input_tokens_seen": 32613088, "step": 25150 }, { "epoch": 1.2291061002125425, "grad_norm": 0.21217504143714905, "learning_rate": 1.5154259672859952e-05, "loss": 0.0703, "num_input_tokens_seen": 32619520, "step": 25155 }, { "epoch": 1.229350401876237, "grad_norm": 0.1670224517583847, "learning_rate": 1.5145236369729452e-05, "loss": 0.0878, "num_input_tokens_seen": 32626016, "step": 25160 }, { "epoch": 1.229594703539931, "grad_norm": 0.3610907793045044, "learning_rate": 1.5136214586327335e-05, "loss": 0.0886, "num_input_tokens_seen": 32632352, "step": 25165 }, { "epoch": 1.2298390052036254, "grad_norm": 0.1732012778520584, "learning_rate": 1.5127194324044885e-05, "loss": 0.1032, "num_input_tokens_seen": 32639552, "step": 25170 }, { "epoch": 1.2300833068673198, "grad_norm": 0.46530330181121826, "learning_rate": 1.5118175584273148e-05, "loss": 0.0564, "num_input_tokens_seen": 32646464, "step": 25175 }, { "epoch": 1.230327608531014, "grad_norm": 0.16895656287670135, "learning_rate": 1.5109158368402909e-05, "loss": 0.0824, "num_input_tokens_seen": 32653024, "step": 25180 }, { "epoch": 1.2305719101947084, "grad_norm": 0.16295284032821655, "learning_rate": 1.5100142677824753e-05, "loss": 0.0767, "num_input_tokens_seen": 32659072, "step": 25185 }, { "epoch": 1.2308162118584027, "grad_norm": 0.19524016976356506, "learning_rate": 1.509112851392901e-05, "loss": 0.1183, "num_input_tokens_seen": 32665152, "step": 25190 }, { "epoch": 1.231060513522097, "grad_norm": 0.4227093756198883, "learning_rate": 1.5082115878105763e-05, "loss": 0.0693, "num_input_tokens_seen": 32671872, "step": 25195 }, { "epoch": 1.2313048151857915, "grad_norm": 0.09414276480674744, "learning_rate": 1.5073104771744892e-05, "loss": 0.0652, "num_input_tokens_seen": 32677888, "step": 25200 }, { "epoch": 1.2313048151857915, "eval_loss": 0.08787746727466583, "eval_runtime": 374.5412, "eval_samples_per_second": 97.146, "eval_steps_per_second": 24.288, "num_input_tokens_seen": 32677888, "step": 25200 }, { "epoch": 1.2315491168494859, "grad_norm": 0.30946075916290283, "learning_rate": 1.5064095196236006e-05, "loss": 0.0963, "num_input_tokens_seen": 32684576, "step": 25205 }, { "epoch": 1.23179341851318, "grad_norm": 0.2507924437522888, "learning_rate": 1.50550871529685e-05, "loss": 0.0848, "num_input_tokens_seen": 32690912, "step": 25210 }, { "epoch": 1.2320377201768744, "grad_norm": 0.2036089301109314, "learning_rate": 1.5046080643331546e-05, "loss": 0.0648, "num_input_tokens_seen": 32697376, "step": 25215 }, { "epoch": 1.2322820218405688, "grad_norm": 0.6746824979782104, "learning_rate": 1.5037075668714028e-05, "loss": 0.073, "num_input_tokens_seen": 32703584, "step": 25220 }, { "epoch": 1.232526323504263, "grad_norm": 0.246103897690773, "learning_rate": 1.5028072230504656e-05, "loss": 0.0648, "num_input_tokens_seen": 32709760, "step": 25225 }, { "epoch": 1.2327706251679573, "grad_norm": 0.15970845520496368, "learning_rate": 1.5019070330091861e-05, "loss": 0.053, "num_input_tokens_seen": 32715968, "step": 25230 }, { "epoch": 1.2330149268316517, "grad_norm": 0.4110338091850281, "learning_rate": 1.5010069968863843e-05, "loss": 0.1001, "num_input_tokens_seen": 32722368, "step": 25235 }, { "epoch": 1.233259228495346, "grad_norm": 0.3287578225135803, "learning_rate": 1.5001071148208584e-05, "loss": 0.1051, "num_input_tokens_seen": 32729088, "step": 25240 }, { "epoch": 1.2335035301590405, "grad_norm": 0.19995784759521484, "learning_rate": 1.49920738695138e-05, "loss": 0.0955, "num_input_tokens_seen": 32735328, "step": 25245 }, { "epoch": 1.2337478318227346, "grad_norm": 0.14675530791282654, "learning_rate": 1.4983078134166995e-05, "loss": 0.0973, "num_input_tokens_seen": 32742272, "step": 25250 }, { "epoch": 1.233992133486429, "grad_norm": 0.3928643465042114, "learning_rate": 1.4974083943555428e-05, "loss": 0.0833, "num_input_tokens_seen": 32748736, "step": 25255 }, { "epoch": 1.2342364351501234, "grad_norm": 0.5018299221992493, "learning_rate": 1.496509129906611e-05, "loss": 0.1056, "num_input_tokens_seen": 32755360, "step": 25260 }, { "epoch": 1.2344807368138178, "grad_norm": 0.33408603072166443, "learning_rate": 1.4956100202085809e-05, "loss": 0.0956, "num_input_tokens_seen": 32761824, "step": 25265 }, { "epoch": 1.234725038477512, "grad_norm": 0.26936104893684387, "learning_rate": 1.4947110654001093e-05, "loss": 0.0962, "num_input_tokens_seen": 32768384, "step": 25270 }, { "epoch": 1.2349693401412063, "grad_norm": 0.18658927083015442, "learning_rate": 1.4938122656198234e-05, "loss": 0.0843, "num_input_tokens_seen": 32774464, "step": 25275 }, { "epoch": 1.2352136418049007, "grad_norm": 0.43671831488609314, "learning_rate": 1.4929136210063316e-05, "loss": 0.0867, "num_input_tokens_seen": 32780704, "step": 25280 }, { "epoch": 1.235457943468595, "grad_norm": 0.30844560265541077, "learning_rate": 1.4920151316982146e-05, "loss": 0.0849, "num_input_tokens_seen": 32787392, "step": 25285 }, { "epoch": 1.2357022451322894, "grad_norm": 0.15080179274082184, "learning_rate": 1.4911167978340312e-05, "loss": 0.094, "num_input_tokens_seen": 32793248, "step": 25290 }, { "epoch": 1.2359465467959836, "grad_norm": 0.24739132821559906, "learning_rate": 1.4902186195523166e-05, "loss": 0.0813, "num_input_tokens_seen": 32799808, "step": 25295 }, { "epoch": 1.236190848459678, "grad_norm": 0.19951605796813965, "learning_rate": 1.4893205969915805e-05, "loss": 0.0915, "num_input_tokens_seen": 32806464, "step": 25300 }, { "epoch": 1.2364351501233724, "grad_norm": 0.2558390200138092, "learning_rate": 1.4884227302903086e-05, "loss": 0.0585, "num_input_tokens_seen": 32812928, "step": 25305 }, { "epoch": 1.2366794517870667, "grad_norm": 0.18780052661895752, "learning_rate": 1.4875250195869653e-05, "loss": 0.0759, "num_input_tokens_seen": 32819648, "step": 25310 }, { "epoch": 1.2369237534507609, "grad_norm": 0.4633871614933014, "learning_rate": 1.4866274650199862e-05, "loss": 0.087, "num_input_tokens_seen": 32825984, "step": 25315 }, { "epoch": 1.2371680551144553, "grad_norm": 0.6512073278427124, "learning_rate": 1.485730066727788e-05, "loss": 0.0756, "num_input_tokens_seen": 32832192, "step": 25320 }, { "epoch": 1.2374123567781496, "grad_norm": 0.504584789276123, "learning_rate": 1.4848328248487586e-05, "loss": 0.0937, "num_input_tokens_seen": 32838784, "step": 25325 }, { "epoch": 1.237656658441844, "grad_norm": 0.41685736179351807, "learning_rate": 1.4839357395212656e-05, "loss": 0.0938, "num_input_tokens_seen": 32845152, "step": 25330 }, { "epoch": 1.2379009601055384, "grad_norm": 0.2620007395744324, "learning_rate": 1.4830388108836502e-05, "loss": 0.0939, "num_input_tokens_seen": 32851712, "step": 25335 }, { "epoch": 1.2381452617692326, "grad_norm": 0.24017691612243652, "learning_rate": 1.4821420390742299e-05, "loss": 0.1208, "num_input_tokens_seen": 32858080, "step": 25340 }, { "epoch": 1.238389563432927, "grad_norm": 0.20529073476791382, "learning_rate": 1.4812454242312979e-05, "loss": 0.0868, "num_input_tokens_seen": 32864672, "step": 25345 }, { "epoch": 1.2386338650966213, "grad_norm": 0.2766796946525574, "learning_rate": 1.4803489664931253e-05, "loss": 0.0677, "num_input_tokens_seen": 32871296, "step": 25350 }, { "epoch": 1.2388781667603157, "grad_norm": 0.2821716368198395, "learning_rate": 1.4794526659979544e-05, "loss": 0.1043, "num_input_tokens_seen": 32877792, "step": 25355 }, { "epoch": 1.2391224684240099, "grad_norm": 0.1916375458240509, "learning_rate": 1.4785565228840086e-05, "loss": 0.1017, "num_input_tokens_seen": 32884640, "step": 25360 }, { "epoch": 1.2393667700877042, "grad_norm": 0.5513014197349548, "learning_rate": 1.4776605372894819e-05, "loss": 0.0939, "num_input_tokens_seen": 32891648, "step": 25365 }, { "epoch": 1.2396110717513986, "grad_norm": 0.3446405231952667, "learning_rate": 1.4767647093525488e-05, "loss": 0.0813, "num_input_tokens_seen": 32898432, "step": 25370 }, { "epoch": 1.239855373415093, "grad_norm": 0.19505390524864197, "learning_rate": 1.4758690392113566e-05, "loss": 0.0744, "num_input_tokens_seen": 32904512, "step": 25375 }, { "epoch": 1.2400996750787874, "grad_norm": 0.5752661824226379, "learning_rate": 1.4749735270040276e-05, "loss": 0.0902, "num_input_tokens_seen": 32910656, "step": 25380 }, { "epoch": 1.2403439767424815, "grad_norm": 0.1536714881658554, "learning_rate": 1.4740781728686623e-05, "loss": 0.1008, "num_input_tokens_seen": 32917216, "step": 25385 }, { "epoch": 1.240588278406176, "grad_norm": 0.16975566744804382, "learning_rate": 1.4731829769433358e-05, "loss": 0.0879, "num_input_tokens_seen": 32923808, "step": 25390 }, { "epoch": 1.2408325800698703, "grad_norm": 0.1313553899526596, "learning_rate": 1.4722879393660976e-05, "loss": 0.0928, "num_input_tokens_seen": 32930240, "step": 25395 }, { "epoch": 1.2410768817335647, "grad_norm": 0.4570459723472595, "learning_rate": 1.4713930602749748e-05, "loss": 0.0816, "num_input_tokens_seen": 32936608, "step": 25400 }, { "epoch": 1.2410768817335647, "eval_loss": 0.08771780133247375, "eval_runtime": 374.4075, "eval_samples_per_second": 97.18, "eval_steps_per_second": 24.297, "num_input_tokens_seen": 32936608, "step": 25400 }, { "epoch": 1.2413211833972588, "grad_norm": 0.3358851671218872, "learning_rate": 1.470498339807968e-05, "loss": 0.0793, "num_input_tokens_seen": 32942912, "step": 25405 }, { "epoch": 1.2415654850609532, "grad_norm": 0.19330085813999176, "learning_rate": 1.4696037781030542e-05, "loss": 0.0766, "num_input_tokens_seen": 32948800, "step": 25410 }, { "epoch": 1.2418097867246476, "grad_norm": 0.1745702475309372, "learning_rate": 1.4687093752981876e-05, "loss": 0.0834, "num_input_tokens_seen": 32955456, "step": 25415 }, { "epoch": 1.242054088388342, "grad_norm": 0.202764093875885, "learning_rate": 1.4678151315312943e-05, "loss": 0.0785, "num_input_tokens_seen": 32961664, "step": 25420 }, { "epoch": 1.2422983900520363, "grad_norm": 0.2516981363296509, "learning_rate": 1.4669210469402789e-05, "loss": 0.1192, "num_input_tokens_seen": 32967872, "step": 25425 }, { "epoch": 1.2425426917157305, "grad_norm": 0.578857421875, "learning_rate": 1.4660271216630218e-05, "loss": 0.0956, "num_input_tokens_seen": 32973696, "step": 25430 }, { "epoch": 1.2427869933794249, "grad_norm": 0.475387841463089, "learning_rate": 1.4651333558373748e-05, "loss": 0.0928, "num_input_tokens_seen": 32979904, "step": 25435 }, { "epoch": 1.2430312950431193, "grad_norm": 0.40214288234710693, "learning_rate": 1.4642397496011707e-05, "loss": 0.0984, "num_input_tokens_seen": 32986016, "step": 25440 }, { "epoch": 1.2432755967068136, "grad_norm": 0.3920091688632965, "learning_rate": 1.4633463030922129e-05, "loss": 0.0721, "num_input_tokens_seen": 32992160, "step": 25445 }, { "epoch": 1.2435198983705078, "grad_norm": 0.37634316086769104, "learning_rate": 1.462453016448282e-05, "loss": 0.0793, "num_input_tokens_seen": 32998272, "step": 25450 }, { "epoch": 1.2437642000342022, "grad_norm": 0.1677108108997345, "learning_rate": 1.4615598898071354e-05, "loss": 0.0669, "num_input_tokens_seen": 33005344, "step": 25455 }, { "epoch": 1.2440085016978966, "grad_norm": 0.5540145635604858, "learning_rate": 1.4606669233065026e-05, "loss": 0.1042, "num_input_tokens_seen": 33011680, "step": 25460 }, { "epoch": 1.244252803361591, "grad_norm": 0.5020498633384705, "learning_rate": 1.4597741170840914e-05, "loss": 0.0815, "num_input_tokens_seen": 33018208, "step": 25465 }, { "epoch": 1.2444971050252853, "grad_norm": 0.412094384431839, "learning_rate": 1.4588814712775853e-05, "loss": 0.1108, "num_input_tokens_seen": 33024768, "step": 25470 }, { "epoch": 1.2447414066889795, "grad_norm": 0.5973905920982361, "learning_rate": 1.4579889860246382e-05, "loss": 0.0706, "num_input_tokens_seen": 33031392, "step": 25475 }, { "epoch": 1.2449857083526739, "grad_norm": 0.18736743927001953, "learning_rate": 1.457096661462885e-05, "loss": 0.0742, "num_input_tokens_seen": 33038304, "step": 25480 }, { "epoch": 1.2452300100163682, "grad_norm": 0.13687770068645477, "learning_rate": 1.4562044977299322e-05, "loss": 0.104, "num_input_tokens_seen": 33044736, "step": 25485 }, { "epoch": 1.2454743116800626, "grad_norm": 0.10177542269229889, "learning_rate": 1.4553124949633623e-05, "loss": 0.0803, "num_input_tokens_seen": 33051168, "step": 25490 }, { "epoch": 1.2457186133437568, "grad_norm": 0.21900054812431335, "learning_rate": 1.4544206533007354e-05, "loss": 0.0912, "num_input_tokens_seen": 33057792, "step": 25495 }, { "epoch": 1.2459629150074512, "grad_norm": 0.3290378153324127, "learning_rate": 1.4535289728795821e-05, "loss": 0.0826, "num_input_tokens_seen": 33064576, "step": 25500 }, { "epoch": 1.2462072166711455, "grad_norm": 0.42224952578544617, "learning_rate": 1.4526374538374132e-05, "loss": 0.0915, "num_input_tokens_seen": 33071360, "step": 25505 }, { "epoch": 1.24645151833484, "grad_norm": 0.6455332040786743, "learning_rate": 1.4517460963117097e-05, "loss": 0.0944, "num_input_tokens_seen": 33078080, "step": 25510 }, { "epoch": 1.2466958199985343, "grad_norm": 0.1688244491815567, "learning_rate": 1.4508549004399314e-05, "loss": 0.0907, "num_input_tokens_seen": 33084448, "step": 25515 }, { "epoch": 1.2469401216622285, "grad_norm": 0.6466909646987915, "learning_rate": 1.449963866359513e-05, "loss": 0.085, "num_input_tokens_seen": 33090624, "step": 25520 }, { "epoch": 1.2471844233259228, "grad_norm": 0.7785986065864563, "learning_rate": 1.4490729942078607e-05, "loss": 0.0921, "num_input_tokens_seen": 33097440, "step": 25525 }, { "epoch": 1.2474287249896172, "grad_norm": 0.2612534463405609, "learning_rate": 1.4481822841223608e-05, "loss": 0.0974, "num_input_tokens_seen": 33104096, "step": 25530 }, { "epoch": 1.2476730266533116, "grad_norm": 0.0941544622182846, "learning_rate": 1.4472917362403704e-05, "loss": 0.0702, "num_input_tokens_seen": 33110848, "step": 25535 }, { "epoch": 1.2479173283170057, "grad_norm": 0.23314709961414337, "learning_rate": 1.4464013506992224e-05, "loss": 0.0935, "num_input_tokens_seen": 33117408, "step": 25540 }, { "epoch": 1.2481616299807001, "grad_norm": 0.5788547396659851, "learning_rate": 1.4455111276362277e-05, "loss": 0.0676, "num_input_tokens_seen": 33123904, "step": 25545 }, { "epoch": 1.2484059316443945, "grad_norm": 0.21748648583889008, "learning_rate": 1.4446210671886676e-05, "loss": 0.0744, "num_input_tokens_seen": 33130208, "step": 25550 }, { "epoch": 1.2486502333080889, "grad_norm": 0.5821284651756287, "learning_rate": 1.4437311694938015e-05, "loss": 0.0888, "num_input_tokens_seen": 33136672, "step": 25555 }, { "epoch": 1.2488945349717833, "grad_norm": 0.2257837951183319, "learning_rate": 1.442841434688864e-05, "loss": 0.0768, "num_input_tokens_seen": 33142944, "step": 25560 }, { "epoch": 1.2491388366354774, "grad_norm": 0.4357706308364868, "learning_rate": 1.4419518629110615e-05, "loss": 0.0608, "num_input_tokens_seen": 33149728, "step": 25565 }, { "epoch": 1.2493831382991718, "grad_norm": 0.6729416847229004, "learning_rate": 1.4410624542975778e-05, "loss": 0.0934, "num_input_tokens_seen": 33156576, "step": 25570 }, { "epoch": 1.2496274399628662, "grad_norm": 0.5704357624053955, "learning_rate": 1.4401732089855724e-05, "loss": 0.1003, "num_input_tokens_seen": 33163328, "step": 25575 }, { "epoch": 1.2498717416265606, "grad_norm": 0.3269321322441101, "learning_rate": 1.4392841271121754e-05, "loss": 0.0815, "num_input_tokens_seen": 33169632, "step": 25580 }, { "epoch": 1.2501160432902547, "grad_norm": 0.25619298219680786, "learning_rate": 1.438395208814497e-05, "loss": 0.0816, "num_input_tokens_seen": 33176224, "step": 25585 }, { "epoch": 1.250360344953949, "grad_norm": 0.19595597684383392, "learning_rate": 1.4375064542296174e-05, "loss": 0.0912, "num_input_tokens_seen": 33182304, "step": 25590 }, { "epoch": 1.2506046466176435, "grad_norm": 0.44644615054130554, "learning_rate": 1.4366178634945946e-05, "loss": 0.0846, "num_input_tokens_seen": 33188896, "step": 25595 }, { "epoch": 1.2508489482813379, "grad_norm": 0.20568051934242249, "learning_rate": 1.4357294367464616e-05, "loss": 0.0813, "num_input_tokens_seen": 33195264, "step": 25600 }, { "epoch": 1.2508489482813379, "eval_loss": 0.08815651386976242, "eval_runtime": 374.678, "eval_samples_per_second": 97.11, "eval_steps_per_second": 24.28, "num_input_tokens_seen": 33195264, "step": 25600 }, { "epoch": 1.2510932499450322, "grad_norm": 0.22921472787857056, "learning_rate": 1.434841174122224e-05, "loss": 0.0808, "num_input_tokens_seen": 33201920, "step": 25605 }, { "epoch": 1.2513375516087264, "grad_norm": 0.3965938687324524, "learning_rate": 1.4339530757588615e-05, "loss": 0.1242, "num_input_tokens_seen": 33208192, "step": 25610 }, { "epoch": 1.2515818532724208, "grad_norm": 0.4431011378765106, "learning_rate": 1.433065141793333e-05, "loss": 0.0801, "num_input_tokens_seen": 33214624, "step": 25615 }, { "epoch": 1.2518261549361152, "grad_norm": 0.29147103428840637, "learning_rate": 1.4321773723625665e-05, "loss": 0.0614, "num_input_tokens_seen": 33220736, "step": 25620 }, { "epoch": 1.2520704565998093, "grad_norm": 0.22731029987335205, "learning_rate": 1.4312897676034693e-05, "loss": 0.1069, "num_input_tokens_seen": 33227584, "step": 25625 }, { "epoch": 1.2523147582635037, "grad_norm": 0.25455987453460693, "learning_rate": 1.4304023276529188e-05, "loss": 0.0705, "num_input_tokens_seen": 33234400, "step": 25630 }, { "epoch": 1.252559059927198, "grad_norm": 0.2377905696630478, "learning_rate": 1.4295150526477712e-05, "loss": 0.0803, "num_input_tokens_seen": 33241152, "step": 25635 }, { "epoch": 1.2528033615908925, "grad_norm": 0.2594551742076874, "learning_rate": 1.4286279427248562e-05, "loss": 0.0735, "num_input_tokens_seen": 33247520, "step": 25640 }, { "epoch": 1.2530476632545868, "grad_norm": 0.3330792784690857, "learning_rate": 1.4277409980209747e-05, "loss": 0.0851, "num_input_tokens_seen": 33254304, "step": 25645 }, { "epoch": 1.2532919649182812, "grad_norm": 0.23299556970596313, "learning_rate": 1.4268542186729061e-05, "loss": 0.08, "num_input_tokens_seen": 33260928, "step": 25650 }, { "epoch": 1.2535362665819754, "grad_norm": 0.30506178736686707, "learning_rate": 1.4259676048174043e-05, "loss": 0.0957, "num_input_tokens_seen": 33267264, "step": 25655 }, { "epoch": 1.2537805682456697, "grad_norm": 0.4116295576095581, "learning_rate": 1.4250811565911937e-05, "loss": 0.0721, "num_input_tokens_seen": 33273536, "step": 25660 }, { "epoch": 1.2540248699093641, "grad_norm": 0.3233422636985779, "learning_rate": 1.4241948741309782e-05, "loss": 0.0714, "num_input_tokens_seen": 33280256, "step": 25665 }, { "epoch": 1.2542691715730583, "grad_norm": 0.14007942378520966, "learning_rate": 1.4233087575734317e-05, "loss": 0.0926, "num_input_tokens_seen": 33286944, "step": 25670 }, { "epoch": 1.2545134732367527, "grad_norm": 0.25849929451942444, "learning_rate": 1.422422807055206e-05, "loss": 0.0873, "num_input_tokens_seen": 33293312, "step": 25675 }, { "epoch": 1.254757774900447, "grad_norm": 0.14728915691375732, "learning_rate": 1.4215370227129243e-05, "loss": 0.0661, "num_input_tokens_seen": 33299968, "step": 25680 }, { "epoch": 1.2550020765641414, "grad_norm": 0.46702298521995544, "learning_rate": 1.4206514046831876e-05, "loss": 0.0812, "num_input_tokens_seen": 33306272, "step": 25685 }, { "epoch": 1.2552463782278358, "grad_norm": 0.1449936032295227, "learning_rate": 1.419765953102567e-05, "loss": 0.0662, "num_input_tokens_seen": 33313120, "step": 25690 }, { "epoch": 1.2554906798915302, "grad_norm": 0.43514391779899597, "learning_rate": 1.4188806681076125e-05, "loss": 0.0968, "num_input_tokens_seen": 33319040, "step": 25695 }, { "epoch": 1.2557349815552243, "grad_norm": 0.4687068462371826, "learning_rate": 1.4179955498348443e-05, "loss": 0.0981, "num_input_tokens_seen": 33325376, "step": 25700 }, { "epoch": 1.2559792832189187, "grad_norm": 0.24194487929344177, "learning_rate": 1.4171105984207605e-05, "loss": 0.0863, "num_input_tokens_seen": 33332352, "step": 25705 }, { "epoch": 1.256223584882613, "grad_norm": 0.8864560127258301, "learning_rate": 1.4162258140018304e-05, "loss": 0.0833, "num_input_tokens_seen": 33339104, "step": 25710 }, { "epoch": 1.2564678865463073, "grad_norm": 0.1658695787191391, "learning_rate": 1.4153411967144986e-05, "loss": 0.0753, "num_input_tokens_seen": 33346176, "step": 25715 }, { "epoch": 1.2567121882100016, "grad_norm": 0.19978176057338715, "learning_rate": 1.4144567466951864e-05, "loss": 0.0837, "num_input_tokens_seen": 33352448, "step": 25720 }, { "epoch": 1.256956489873696, "grad_norm": 0.31498318910598755, "learning_rate": 1.4135724640802844e-05, "loss": 0.1004, "num_input_tokens_seen": 33359040, "step": 25725 }, { "epoch": 1.2572007915373904, "grad_norm": 0.20481610298156738, "learning_rate": 1.4126883490061615e-05, "loss": 0.0765, "num_input_tokens_seen": 33365408, "step": 25730 }, { "epoch": 1.2574450932010848, "grad_norm": 0.1375054568052292, "learning_rate": 1.4118044016091603e-05, "loss": 0.0673, "num_input_tokens_seen": 33372000, "step": 25735 }, { "epoch": 1.2576893948647792, "grad_norm": 0.6540963649749756, "learning_rate": 1.410920622025594e-05, "loss": 0.1001, "num_input_tokens_seen": 33378560, "step": 25740 }, { "epoch": 1.2579336965284733, "grad_norm": 0.4624376595020294, "learning_rate": 1.4100370103917554e-05, "loss": 0.1056, "num_input_tokens_seen": 33384992, "step": 25745 }, { "epoch": 1.2581779981921677, "grad_norm": 0.2664892077445984, "learning_rate": 1.409153566843907e-05, "loss": 0.0677, "num_input_tokens_seen": 33391424, "step": 25750 }, { "epoch": 1.258422299855862, "grad_norm": 0.7797110080718994, "learning_rate": 1.408270291518286e-05, "loss": 0.0779, "num_input_tokens_seen": 33397888, "step": 25755 }, { "epoch": 1.2586666015195562, "grad_norm": 0.4921726882457733, "learning_rate": 1.407387184551107e-05, "loss": 0.0787, "num_input_tokens_seen": 33404128, "step": 25760 }, { "epoch": 1.2589109031832506, "grad_norm": 0.2884329557418823, "learning_rate": 1.4065042460785532e-05, "loss": 0.1104, "num_input_tokens_seen": 33410464, "step": 25765 }, { "epoch": 1.259155204846945, "grad_norm": 0.4107776880264282, "learning_rate": 1.405621476236787e-05, "loss": 0.0919, "num_input_tokens_seen": 33416864, "step": 25770 }, { "epoch": 1.2593995065106394, "grad_norm": 0.3326261043548584, "learning_rate": 1.4047388751619423e-05, "loss": 0.0984, "num_input_tokens_seen": 33423328, "step": 25775 }, { "epoch": 1.2596438081743337, "grad_norm": 0.3473319709300995, "learning_rate": 1.4038564429901264e-05, "loss": 0.0913, "num_input_tokens_seen": 33429600, "step": 25780 }, { "epoch": 1.2598881098380281, "grad_norm": 0.17289400100708008, "learning_rate": 1.4029741798574227e-05, "loss": 0.0852, "num_input_tokens_seen": 33435552, "step": 25785 }, { "epoch": 1.2601324115017223, "grad_norm": 0.561569333076477, "learning_rate": 1.402092085899886e-05, "loss": 0.0801, "num_input_tokens_seen": 33441696, "step": 25790 }, { "epoch": 1.2603767131654167, "grad_norm": 0.12675848603248596, "learning_rate": 1.4012101612535464e-05, "loss": 0.0702, "num_input_tokens_seen": 33448256, "step": 25795 }, { "epoch": 1.260621014829111, "grad_norm": 0.49626144766807556, "learning_rate": 1.4003284060544092e-05, "loss": 0.0737, "num_input_tokens_seen": 33454720, "step": 25800 }, { "epoch": 1.260621014829111, "eval_loss": 0.08798237890005112, "eval_runtime": 374.5674, "eval_samples_per_second": 97.139, "eval_steps_per_second": 24.287, "num_input_tokens_seen": 33454720, "step": 25800 }, { "epoch": 1.2608653164928052, "grad_norm": 0.36479753255844116, "learning_rate": 1.3994468204384504e-05, "loss": 0.0809, "num_input_tokens_seen": 33460992, "step": 25805 }, { "epoch": 1.2611096181564996, "grad_norm": 0.1943364143371582, "learning_rate": 1.398565404541622e-05, "loss": 0.0779, "num_input_tokens_seen": 33467040, "step": 25810 }, { "epoch": 1.261353919820194, "grad_norm": 0.591613233089447, "learning_rate": 1.3976841584998513e-05, "loss": 0.0847, "num_input_tokens_seen": 33473248, "step": 25815 }, { "epoch": 1.2615982214838883, "grad_norm": 0.6302608847618103, "learning_rate": 1.3968030824490352e-05, "loss": 0.0859, "num_input_tokens_seen": 33479776, "step": 25820 }, { "epoch": 1.2618425231475827, "grad_norm": 0.24894660711288452, "learning_rate": 1.3959221765250469e-05, "loss": 0.0915, "num_input_tokens_seen": 33486144, "step": 25825 }, { "epoch": 1.262086824811277, "grad_norm": 0.0804329589009285, "learning_rate": 1.3950414408637343e-05, "loss": 0.0679, "num_input_tokens_seen": 33493600, "step": 25830 }, { "epoch": 1.2623311264749713, "grad_norm": 0.34555932879447937, "learning_rate": 1.3941608756009166e-05, "loss": 0.0799, "num_input_tokens_seen": 33499872, "step": 25835 }, { "epoch": 1.2625754281386656, "grad_norm": 0.2811524271965027, "learning_rate": 1.3932804808723898e-05, "loss": 0.0894, "num_input_tokens_seen": 33506848, "step": 25840 }, { "epoch": 1.26281972980236, "grad_norm": 0.28786152601242065, "learning_rate": 1.3924002568139194e-05, "loss": 0.0762, "num_input_tokens_seen": 33513440, "step": 25845 }, { "epoch": 1.2630640314660542, "grad_norm": 0.3816375136375427, "learning_rate": 1.3915202035612485e-05, "loss": 0.0612, "num_input_tokens_seen": 33519872, "step": 25850 }, { "epoch": 1.2633083331297486, "grad_norm": 0.45724886655807495, "learning_rate": 1.3906403212500935e-05, "loss": 0.0832, "num_input_tokens_seen": 33526496, "step": 25855 }, { "epoch": 1.263552634793443, "grad_norm": 0.3883766829967499, "learning_rate": 1.3897606100161409e-05, "loss": 0.0899, "num_input_tokens_seen": 33532928, "step": 25860 }, { "epoch": 1.2637969364571373, "grad_norm": 0.23161542415618896, "learning_rate": 1.388881069995055e-05, "loss": 0.085, "num_input_tokens_seen": 33539968, "step": 25865 }, { "epoch": 1.2640412381208317, "grad_norm": 0.4027371406555176, "learning_rate": 1.3880017013224708e-05, "loss": 0.0965, "num_input_tokens_seen": 33546048, "step": 25870 }, { "epoch": 1.264285539784526, "grad_norm": 0.2871894836425781, "learning_rate": 1.3871225041339984e-05, "loss": 0.0691, "num_input_tokens_seen": 33552800, "step": 25875 }, { "epoch": 1.2645298414482202, "grad_norm": 0.22617654502391815, "learning_rate": 1.386243478565222e-05, "loss": 0.079, "num_input_tokens_seen": 33559712, "step": 25880 }, { "epoch": 1.2647741431119146, "grad_norm": 0.3379892110824585, "learning_rate": 1.3853646247516966e-05, "loss": 0.0816, "num_input_tokens_seen": 33565888, "step": 25885 }, { "epoch": 1.265018444775609, "grad_norm": 0.5600388646125793, "learning_rate": 1.3844859428289545e-05, "loss": 0.1102, "num_input_tokens_seen": 33572256, "step": 25890 }, { "epoch": 1.2652627464393031, "grad_norm": 0.19101855158805847, "learning_rate": 1.3836074329324984e-05, "loss": 0.0726, "num_input_tokens_seen": 33578656, "step": 25895 }, { "epoch": 1.2655070481029975, "grad_norm": 0.4005063772201538, "learning_rate": 1.3827290951978044e-05, "loss": 0.0806, "num_input_tokens_seen": 33584640, "step": 25900 }, { "epoch": 1.265751349766692, "grad_norm": 0.19471849501132965, "learning_rate": 1.381850929760326e-05, "loss": 0.0908, "num_input_tokens_seen": 33591264, "step": 25905 }, { "epoch": 1.2659956514303863, "grad_norm": 0.6382273435592651, "learning_rate": 1.3809729367554842e-05, "loss": 0.0895, "num_input_tokens_seen": 33597376, "step": 25910 }, { "epoch": 1.2662399530940807, "grad_norm": 0.23199835419654846, "learning_rate": 1.3800951163186784e-05, "loss": 0.0777, "num_input_tokens_seen": 33604000, "step": 25915 }, { "epoch": 1.2664842547577748, "grad_norm": 0.34030041098594666, "learning_rate": 1.3792174685852801e-05, "loss": 0.0774, "num_input_tokens_seen": 33610912, "step": 25920 }, { "epoch": 1.2667285564214692, "grad_norm": 0.251545250415802, "learning_rate": 1.378339993690632e-05, "loss": 0.1085, "num_input_tokens_seen": 33617792, "step": 25925 }, { "epoch": 1.2669728580851636, "grad_norm": 0.16458560526371002, "learning_rate": 1.3774626917700523e-05, "loss": 0.0769, "num_input_tokens_seen": 33624448, "step": 25930 }, { "epoch": 1.267217159748858, "grad_norm": 0.5294756889343262, "learning_rate": 1.3765855629588334e-05, "loss": 0.1197, "num_input_tokens_seen": 33630976, "step": 25935 }, { "epoch": 1.2674614614125521, "grad_norm": 0.5499122142791748, "learning_rate": 1.3757086073922374e-05, "loss": 0.0811, "num_input_tokens_seen": 33637280, "step": 25940 }, { "epoch": 1.2677057630762465, "grad_norm": 0.5399562120437622, "learning_rate": 1.3748318252055038e-05, "loss": 0.0751, "num_input_tokens_seen": 33644160, "step": 25945 }, { "epoch": 1.2679500647399409, "grad_norm": 0.16505348682403564, "learning_rate": 1.3739552165338416e-05, "loss": 0.0767, "num_input_tokens_seen": 33650528, "step": 25950 }, { "epoch": 1.2681943664036353, "grad_norm": 0.7172524333000183, "learning_rate": 1.3730787815124354e-05, "loss": 0.1079, "num_input_tokens_seen": 33657024, "step": 25955 }, { "epoch": 1.2684386680673296, "grad_norm": 0.5012084245681763, "learning_rate": 1.3722025202764443e-05, "loss": 0.0949, "num_input_tokens_seen": 33663296, "step": 25960 }, { "epoch": 1.2686829697310238, "grad_norm": 0.17458145320415497, "learning_rate": 1.371326432960997e-05, "loss": 0.0718, "num_input_tokens_seen": 33669568, "step": 25965 }, { "epoch": 1.2689272713947182, "grad_norm": 0.39747270941734314, "learning_rate": 1.3704505197011969e-05, "loss": 0.0862, "num_input_tokens_seen": 33676000, "step": 25970 }, { "epoch": 1.2691715730584125, "grad_norm": 0.4952263832092285, "learning_rate": 1.3695747806321224e-05, "loss": 0.0937, "num_input_tokens_seen": 33682432, "step": 25975 }, { "epoch": 1.269415874722107, "grad_norm": 0.23522767424583435, "learning_rate": 1.3686992158888212e-05, "loss": 0.0792, "num_input_tokens_seen": 33688928, "step": 25980 }, { "epoch": 1.269660176385801, "grad_norm": 0.41667431592941284, "learning_rate": 1.367823825606319e-05, "loss": 0.0912, "num_input_tokens_seen": 33695456, "step": 25985 }, { "epoch": 1.2699044780494955, "grad_norm": 0.25449463725090027, "learning_rate": 1.36694860991961e-05, "loss": 0.0963, "num_input_tokens_seen": 33702208, "step": 25990 }, { "epoch": 1.2701487797131898, "grad_norm": 0.3325096070766449, "learning_rate": 1.3660735689636636e-05, "loss": 0.1032, "num_input_tokens_seen": 33708480, "step": 25995 }, { "epoch": 1.2703930813768842, "grad_norm": 0.1903206706047058, "learning_rate": 1.365198702873424e-05, "loss": 0.1029, "num_input_tokens_seen": 33714496, "step": 26000 }, { "epoch": 1.2703930813768842, "eval_loss": 0.08805037289857864, "eval_runtime": 374.1151, "eval_samples_per_second": 97.256, "eval_steps_per_second": 24.316, "num_input_tokens_seen": 33714496, "step": 26000 }, { "epoch": 1.2706373830405786, "grad_norm": 0.3323430120944977, "learning_rate": 1.364324011783804e-05, "loss": 0.0946, "num_input_tokens_seen": 33720992, "step": 26005 }, { "epoch": 1.2708816847042728, "grad_norm": 0.37906166911125183, "learning_rate": 1.3634494958296934e-05, "loss": 0.103, "num_input_tokens_seen": 33727680, "step": 26010 }, { "epoch": 1.2711259863679671, "grad_norm": 0.10977818071842194, "learning_rate": 1.3625751551459542e-05, "loss": 0.0858, "num_input_tokens_seen": 33733984, "step": 26015 }, { "epoch": 1.2713702880316615, "grad_norm": 0.14399772882461548, "learning_rate": 1.3617009898674188e-05, "loss": 0.0682, "num_input_tokens_seen": 33740384, "step": 26020 }, { "epoch": 1.271614589695356, "grad_norm": 0.2783221900463104, "learning_rate": 1.3608270001288967e-05, "loss": 0.094, "num_input_tokens_seen": 33746848, "step": 26025 }, { "epoch": 1.27185889135905, "grad_norm": 0.15986888110637665, "learning_rate": 1.359953186065166e-05, "loss": 0.0936, "num_input_tokens_seen": 33753024, "step": 26030 }, { "epoch": 1.2721031930227444, "grad_norm": 0.4091680347919464, "learning_rate": 1.3590795478109814e-05, "loss": 0.0462, "num_input_tokens_seen": 33759360, "step": 26035 }, { "epoch": 1.2723474946864388, "grad_norm": 0.50697922706604, "learning_rate": 1.3582060855010675e-05, "loss": 0.0789, "num_input_tokens_seen": 33765632, "step": 26040 }, { "epoch": 1.2725917963501332, "grad_norm": 0.520423412322998, "learning_rate": 1.3573327992701245e-05, "loss": 0.0896, "num_input_tokens_seen": 33771872, "step": 26045 }, { "epoch": 1.2728360980138276, "grad_norm": 0.13809099793434143, "learning_rate": 1.356459689252823e-05, "loss": 0.0617, "num_input_tokens_seen": 33778752, "step": 26050 }, { "epoch": 1.2730803996775217, "grad_norm": 0.18456757068634033, "learning_rate": 1.3555867555838087e-05, "loss": 0.0912, "num_input_tokens_seen": 33785216, "step": 26055 }, { "epoch": 1.2733247013412161, "grad_norm": 0.4355297088623047, "learning_rate": 1.3547139983976975e-05, "loss": 0.078, "num_input_tokens_seen": 33791584, "step": 26060 }, { "epoch": 1.2735690030049105, "grad_norm": 0.26508399844169617, "learning_rate": 1.3538414178290815e-05, "loss": 0.0961, "num_input_tokens_seen": 33797920, "step": 26065 }, { "epoch": 1.2738133046686049, "grad_norm": 0.6552367210388184, "learning_rate": 1.3529690140125209e-05, "loss": 0.0842, "num_input_tokens_seen": 33804448, "step": 26070 }, { "epoch": 1.274057606332299, "grad_norm": 0.2653310298919678, "learning_rate": 1.352096787082553e-05, "loss": 0.0858, "num_input_tokens_seen": 33810976, "step": 26075 }, { "epoch": 1.2743019079959934, "grad_norm": 0.29685869812965393, "learning_rate": 1.3512247371736871e-05, "loss": 0.0675, "num_input_tokens_seen": 33817056, "step": 26080 }, { "epoch": 1.2745462096596878, "grad_norm": 0.2955096364021301, "learning_rate": 1.3503528644204022e-05, "loss": 0.0769, "num_input_tokens_seen": 33823296, "step": 26085 }, { "epoch": 1.2747905113233822, "grad_norm": 0.19446952641010284, "learning_rate": 1.349481168957153e-05, "loss": 0.0684, "num_input_tokens_seen": 33830048, "step": 26090 }, { "epoch": 1.2750348129870765, "grad_norm": 0.5021581053733826, "learning_rate": 1.3486096509183665e-05, "loss": 0.0914, "num_input_tokens_seen": 33836640, "step": 26095 }, { "epoch": 1.2752791146507707, "grad_norm": 0.2801263630390167, "learning_rate": 1.3477383104384406e-05, "loss": 0.0721, "num_input_tokens_seen": 33843328, "step": 26100 }, { "epoch": 1.275523416314465, "grad_norm": 0.14152806997299194, "learning_rate": 1.3468671476517481e-05, "loss": 0.0742, "num_input_tokens_seen": 33850368, "step": 26105 }, { "epoch": 1.2757677179781595, "grad_norm": 0.33130648732185364, "learning_rate": 1.3459961626926326e-05, "loss": 0.0842, "num_input_tokens_seen": 33856480, "step": 26110 }, { "epoch": 1.2760120196418536, "grad_norm": 0.3347123861312866, "learning_rate": 1.3451253556954101e-05, "loss": 0.0609, "num_input_tokens_seen": 33862720, "step": 26115 }, { "epoch": 1.276256321305548, "grad_norm": 0.14569300413131714, "learning_rate": 1.3442547267943717e-05, "loss": 0.0747, "num_input_tokens_seen": 33869056, "step": 26120 }, { "epoch": 1.2765006229692424, "grad_norm": 0.4094216227531433, "learning_rate": 1.3433842761237774e-05, "loss": 0.1097, "num_input_tokens_seen": 33875488, "step": 26125 }, { "epoch": 1.2767449246329368, "grad_norm": 0.3046726584434509, "learning_rate": 1.3425140038178639e-05, "loss": 0.0929, "num_input_tokens_seen": 33881920, "step": 26130 }, { "epoch": 1.2769892262966311, "grad_norm": 0.33737415075302124, "learning_rate": 1.3416439100108358e-05, "loss": 0.0864, "num_input_tokens_seen": 33888704, "step": 26135 }, { "epoch": 1.2772335279603255, "grad_norm": 0.6499781608581543, "learning_rate": 1.3407739948368734e-05, "loss": 0.0787, "num_input_tokens_seen": 33895328, "step": 26140 }, { "epoch": 1.2774778296240197, "grad_norm": 0.21450021862983704, "learning_rate": 1.3399042584301298e-05, "loss": 0.0794, "num_input_tokens_seen": 33901920, "step": 26145 }, { "epoch": 1.277722131287714, "grad_norm": 0.3022378981113434, "learning_rate": 1.3390347009247272e-05, "loss": 0.0783, "num_input_tokens_seen": 33908608, "step": 26150 }, { "epoch": 1.2779664329514084, "grad_norm": 0.36477166414260864, "learning_rate": 1.3381653224547635e-05, "loss": 0.1157, "num_input_tokens_seen": 33915072, "step": 26155 }, { "epoch": 1.2782107346151026, "grad_norm": 0.2078276425600052, "learning_rate": 1.3372961231543086e-05, "loss": 0.0873, "num_input_tokens_seen": 33921536, "step": 26160 }, { "epoch": 1.278455036278797, "grad_norm": 0.2416142076253891, "learning_rate": 1.3364271031574016e-05, "loss": 0.0814, "num_input_tokens_seen": 33927808, "step": 26165 }, { "epoch": 1.2786993379424914, "grad_norm": 0.23344002664089203, "learning_rate": 1.335558262598059e-05, "loss": 0.0817, "num_input_tokens_seen": 33933856, "step": 26170 }, { "epoch": 1.2789436396061857, "grad_norm": 0.17064055800437927, "learning_rate": 1.3346896016102645e-05, "loss": 0.088, "num_input_tokens_seen": 33940096, "step": 26175 }, { "epoch": 1.2791879412698801, "grad_norm": 0.2374865561723709, "learning_rate": 1.3338211203279788e-05, "loss": 0.1268, "num_input_tokens_seen": 33946528, "step": 26180 }, { "epoch": 1.2794322429335745, "grad_norm": 0.6095322370529175, "learning_rate": 1.3329528188851303e-05, "loss": 0.1021, "num_input_tokens_seen": 33952800, "step": 26185 }, { "epoch": 1.2796765445972687, "grad_norm": 0.6113459467887878, "learning_rate": 1.3320846974156242e-05, "loss": 0.0973, "num_input_tokens_seen": 33959136, "step": 26190 }, { "epoch": 1.279920846260963, "grad_norm": 0.289580762386322, "learning_rate": 1.3312167560533337e-05, "loss": 0.095, "num_input_tokens_seen": 33965888, "step": 26195 }, { "epoch": 1.2801651479246574, "grad_norm": 0.19625933468341827, "learning_rate": 1.3303489949321082e-05, "loss": 0.0838, "num_input_tokens_seen": 33972576, "step": 26200 }, { "epoch": 1.2801651479246574, "eval_loss": 0.08782710880041122, "eval_runtime": 374.968, "eval_samples_per_second": 97.035, "eval_steps_per_second": 24.261, "num_input_tokens_seen": 33972576, "step": 26200 }, { "epoch": 1.2804094495883516, "grad_norm": 0.1734575778245926, "learning_rate": 1.3294814141857653e-05, "loss": 0.0716, "num_input_tokens_seen": 33978976, "step": 26205 }, { "epoch": 1.280653751252046, "grad_norm": 0.17351743578910828, "learning_rate": 1.3286140139480992e-05, "loss": 0.0929, "num_input_tokens_seen": 33985248, "step": 26210 }, { "epoch": 1.2808980529157403, "grad_norm": 0.1827230304479599, "learning_rate": 1.3277467943528719e-05, "loss": 0.0704, "num_input_tokens_seen": 33991264, "step": 26215 }, { "epoch": 1.2811423545794347, "grad_norm": 0.28530752658843994, "learning_rate": 1.3268797555338203e-05, "loss": 0.0735, "num_input_tokens_seen": 33997824, "step": 26220 }, { "epoch": 1.281386656243129, "grad_norm": 0.2559194564819336, "learning_rate": 1.3260128976246533e-05, "loss": 0.0909, "num_input_tokens_seen": 34004672, "step": 26225 }, { "epoch": 1.2816309579068235, "grad_norm": 0.3647770881652832, "learning_rate": 1.32514622075905e-05, "loss": 0.0752, "num_input_tokens_seen": 34011392, "step": 26230 }, { "epoch": 1.2818752595705176, "grad_norm": 0.7050350904464722, "learning_rate": 1.3242797250706638e-05, "loss": 0.0951, "num_input_tokens_seen": 34018016, "step": 26235 }, { "epoch": 1.282119561234212, "grad_norm": 0.23863773047924042, "learning_rate": 1.3234134106931195e-05, "loss": 0.0793, "num_input_tokens_seen": 34024192, "step": 26240 }, { "epoch": 1.2823638628979064, "grad_norm": 0.5302475094795227, "learning_rate": 1.322547277760013e-05, "loss": 0.0797, "num_input_tokens_seen": 34030496, "step": 26245 }, { "epoch": 1.2826081645616005, "grad_norm": 0.5258589386940002, "learning_rate": 1.3216813264049132e-05, "loss": 0.0896, "num_input_tokens_seen": 34037632, "step": 26250 }, { "epoch": 1.282852466225295, "grad_norm": 0.5029003024101257, "learning_rate": 1.32081555676136e-05, "loss": 0.09, "num_input_tokens_seen": 34044256, "step": 26255 }, { "epoch": 1.2830967678889893, "grad_norm": 0.23623435199260712, "learning_rate": 1.3199499689628674e-05, "loss": 0.0997, "num_input_tokens_seen": 34050752, "step": 26260 }, { "epoch": 1.2833410695526837, "grad_norm": 0.2881461977958679, "learning_rate": 1.3190845631429192e-05, "loss": 0.0721, "num_input_tokens_seen": 34057440, "step": 26265 }, { "epoch": 1.283585371216378, "grad_norm": 0.34504616260528564, "learning_rate": 1.3182193394349704e-05, "loss": 0.0865, "num_input_tokens_seen": 34063744, "step": 26270 }, { "epoch": 1.2838296728800724, "grad_norm": 0.5567809343338013, "learning_rate": 1.3173542979724507e-05, "loss": 0.0839, "num_input_tokens_seen": 34069920, "step": 26275 }, { "epoch": 1.2840739745437666, "grad_norm": 0.7105206847190857, "learning_rate": 1.3164894388887617e-05, "loss": 0.064, "num_input_tokens_seen": 34076224, "step": 26280 }, { "epoch": 1.284318276207461, "grad_norm": 0.4900006353855133, "learning_rate": 1.3156247623172727e-05, "loss": 0.0725, "num_input_tokens_seen": 34082496, "step": 26285 }, { "epoch": 1.2845625778711554, "grad_norm": 0.6073176264762878, "learning_rate": 1.3147602683913302e-05, "loss": 0.1165, "num_input_tokens_seen": 34088992, "step": 26290 }, { "epoch": 1.2848068795348495, "grad_norm": 0.2271377444267273, "learning_rate": 1.3138959572442481e-05, "loss": 0.0582, "num_input_tokens_seen": 34095296, "step": 26295 }, { "epoch": 1.285051181198544, "grad_norm": 0.19207611680030823, "learning_rate": 1.3130318290093146e-05, "loss": 0.0793, "num_input_tokens_seen": 34101632, "step": 26300 }, { "epoch": 1.2852954828622383, "grad_norm": 0.3511781394481659, "learning_rate": 1.3121678838197909e-05, "loss": 0.0792, "num_input_tokens_seen": 34108448, "step": 26305 }, { "epoch": 1.2855397845259326, "grad_norm": 0.1941857933998108, "learning_rate": 1.3113041218089056e-05, "loss": 0.0829, "num_input_tokens_seen": 34114816, "step": 26310 }, { "epoch": 1.285784086189627, "grad_norm": 0.37763094902038574, "learning_rate": 1.3104405431098626e-05, "loss": 0.0963, "num_input_tokens_seen": 34121408, "step": 26315 }, { "epoch": 1.2860283878533214, "grad_norm": 0.34475141763687134, "learning_rate": 1.3095771478558377e-05, "loss": 0.0651, "num_input_tokens_seen": 34127776, "step": 26320 }, { "epoch": 1.2862726895170156, "grad_norm": 0.5406477451324463, "learning_rate": 1.3087139361799766e-05, "loss": 0.0833, "num_input_tokens_seen": 34134304, "step": 26325 }, { "epoch": 1.28651699118071, "grad_norm": 0.176521435379982, "learning_rate": 1.3078509082153964e-05, "loss": 0.0852, "num_input_tokens_seen": 34140768, "step": 26330 }, { "epoch": 1.2867612928444043, "grad_norm": 0.5197817087173462, "learning_rate": 1.3069880640951885e-05, "loss": 0.1082, "num_input_tokens_seen": 34147200, "step": 26335 }, { "epoch": 1.2870055945080985, "grad_norm": 0.32230716943740845, "learning_rate": 1.3061254039524123e-05, "loss": 0.0711, "num_input_tokens_seen": 34153664, "step": 26340 }, { "epoch": 1.2872498961717929, "grad_norm": 0.19246019423007965, "learning_rate": 1.3052629279201028e-05, "loss": 0.1045, "num_input_tokens_seen": 34159904, "step": 26345 }, { "epoch": 1.2874941978354872, "grad_norm": 0.15550369024276733, "learning_rate": 1.3044006361312633e-05, "loss": 0.0802, "num_input_tokens_seen": 34166400, "step": 26350 }, { "epoch": 1.2877384994991816, "grad_norm": 0.26401880383491516, "learning_rate": 1.30353852871887e-05, "loss": 0.0869, "num_input_tokens_seen": 34173152, "step": 26355 }, { "epoch": 1.287982801162876, "grad_norm": 0.3131406307220459, "learning_rate": 1.302676605815873e-05, "loss": 0.0672, "num_input_tokens_seen": 34179840, "step": 26360 }, { "epoch": 1.2882271028265704, "grad_norm": 0.6992785334587097, "learning_rate": 1.3018148675551884e-05, "loss": 0.1047, "num_input_tokens_seen": 34185824, "step": 26365 }, { "epoch": 1.2884714044902645, "grad_norm": 0.4680384695529938, "learning_rate": 1.3009533140697094e-05, "loss": 0.0887, "num_input_tokens_seen": 34192672, "step": 26370 }, { "epoch": 1.288715706153959, "grad_norm": 0.27713268995285034, "learning_rate": 1.3000919454922966e-05, "loss": 0.0825, "num_input_tokens_seen": 34198816, "step": 26375 }, { "epoch": 1.2889600078176533, "grad_norm": 0.31095075607299805, "learning_rate": 1.299230761955785e-05, "loss": 0.1004, "num_input_tokens_seen": 34205888, "step": 26380 }, { "epoch": 1.2892043094813475, "grad_norm": 0.28471866250038147, "learning_rate": 1.2983697635929807e-05, "loss": 0.1292, "num_input_tokens_seen": 34212000, "step": 26385 }, { "epoch": 1.2894486111450418, "grad_norm": 0.7578660845756531, "learning_rate": 1.2975089505366584e-05, "loss": 0.0882, "num_input_tokens_seen": 34218464, "step": 26390 }, { "epoch": 1.2896929128087362, "grad_norm": 0.22530502080917358, "learning_rate": 1.2966483229195683e-05, "loss": 0.0758, "num_input_tokens_seen": 34224704, "step": 26395 }, { "epoch": 1.2899372144724306, "grad_norm": 0.6209445595741272, "learning_rate": 1.2957878808744283e-05, "loss": 0.0902, "num_input_tokens_seen": 34231488, "step": 26400 }, { "epoch": 1.2899372144724306, "eval_loss": 0.0876758024096489, "eval_runtime": 374.639, "eval_samples_per_second": 97.12, "eval_steps_per_second": 24.282, "num_input_tokens_seen": 34231488, "step": 26400 }, { "epoch": 1.290181516136125, "grad_norm": 0.5664975047111511, "learning_rate": 1.294927624533931e-05, "loss": 0.0725, "num_input_tokens_seen": 34238016, "step": 26405 }, { "epoch": 1.2904258177998194, "grad_norm": 0.8209329843521118, "learning_rate": 1.2940675540307378e-05, "loss": 0.1001, "num_input_tokens_seen": 34244416, "step": 26410 }, { "epoch": 1.2906701194635135, "grad_norm": 0.33199140429496765, "learning_rate": 1.2932076694974814e-05, "loss": 0.077, "num_input_tokens_seen": 34250560, "step": 26415 }, { "epoch": 1.290914421127208, "grad_norm": 0.6230090260505676, "learning_rate": 1.2923479710667682e-05, "loss": 0.0747, "num_input_tokens_seen": 34257504, "step": 26420 }, { "epoch": 1.2911587227909023, "grad_norm": 0.1653863936662674, "learning_rate": 1.2914884588711751e-05, "loss": 0.0741, "num_input_tokens_seen": 34263776, "step": 26425 }, { "epoch": 1.2914030244545964, "grad_norm": 0.17041951417922974, "learning_rate": 1.2906291330432475e-05, "loss": 0.0777, "num_input_tokens_seen": 34269792, "step": 26430 }, { "epoch": 1.2916473261182908, "grad_norm": 0.5254075527191162, "learning_rate": 1.2897699937155055e-05, "loss": 0.0879, "num_input_tokens_seen": 34276160, "step": 26435 }, { "epoch": 1.2918916277819852, "grad_norm": 0.2644568681716919, "learning_rate": 1.2889110410204403e-05, "loss": 0.0902, "num_input_tokens_seen": 34282464, "step": 26440 }, { "epoch": 1.2921359294456796, "grad_norm": 0.44941195845603943, "learning_rate": 1.2880522750905111e-05, "loss": 0.0948, "num_input_tokens_seen": 34288512, "step": 26445 }, { "epoch": 1.292380231109374, "grad_norm": 0.1538281887769699, "learning_rate": 1.2871936960581523e-05, "loss": 0.0907, "num_input_tokens_seen": 34294432, "step": 26450 }, { "epoch": 1.292624532773068, "grad_norm": 0.28144407272338867, "learning_rate": 1.2863353040557658e-05, "loss": 0.109, "num_input_tokens_seen": 34300928, "step": 26455 }, { "epoch": 1.2928688344367625, "grad_norm": 0.6906524300575256, "learning_rate": 1.2854770992157273e-05, "loss": 0.0986, "num_input_tokens_seen": 34307360, "step": 26460 }, { "epoch": 1.2931131361004569, "grad_norm": 0.6413604021072388, "learning_rate": 1.2846190816703835e-05, "loss": 0.0976, "num_input_tokens_seen": 34313792, "step": 26465 }, { "epoch": 1.2933574377641512, "grad_norm": 0.5160540342330933, "learning_rate": 1.2837612515520498e-05, "loss": 0.0609, "num_input_tokens_seen": 34321664, "step": 26470 }, { "epoch": 1.2936017394278454, "grad_norm": 0.2398315817117691, "learning_rate": 1.2829036089930163e-05, "loss": 0.0846, "num_input_tokens_seen": 34328672, "step": 26475 }, { "epoch": 1.2938460410915398, "grad_norm": 0.7585335969924927, "learning_rate": 1.2820461541255412e-05, "loss": 0.0927, "num_input_tokens_seen": 34335232, "step": 26480 }, { "epoch": 1.2940903427552342, "grad_norm": 0.2580733001232147, "learning_rate": 1.2811888870818543e-05, "loss": 0.0907, "num_input_tokens_seen": 34342400, "step": 26485 }, { "epoch": 1.2943346444189285, "grad_norm": 0.2092244029045105, "learning_rate": 1.2803318079941581e-05, "loss": 0.0688, "num_input_tokens_seen": 34348736, "step": 26490 }, { "epoch": 1.294578946082623, "grad_norm": 0.13776367902755737, "learning_rate": 1.2794749169946235e-05, "loss": 0.0755, "num_input_tokens_seen": 34354976, "step": 26495 }, { "epoch": 1.294823247746317, "grad_norm": 0.4148256480693817, "learning_rate": 1.2786182142153952e-05, "loss": 0.0612, "num_input_tokens_seen": 34361472, "step": 26500 }, { "epoch": 1.2950675494100115, "grad_norm": 0.19977901875972748, "learning_rate": 1.2777616997885878e-05, "loss": 0.0797, "num_input_tokens_seen": 34367776, "step": 26505 }, { "epoch": 1.2953118510737058, "grad_norm": 0.3144170343875885, "learning_rate": 1.2769053738462847e-05, "loss": 0.083, "num_input_tokens_seen": 34375360, "step": 26510 }, { "epoch": 1.2955561527374002, "grad_norm": 0.39092540740966797, "learning_rate": 1.2760492365205434e-05, "loss": 0.0755, "num_input_tokens_seen": 34381696, "step": 26515 }, { "epoch": 1.2958004544010944, "grad_norm": 0.39636939764022827, "learning_rate": 1.2751932879433919e-05, "loss": 0.0832, "num_input_tokens_seen": 34387968, "step": 26520 }, { "epoch": 1.2960447560647887, "grad_norm": 0.1615982949733734, "learning_rate": 1.2743375282468267e-05, "loss": 0.069, "num_input_tokens_seen": 34394464, "step": 26525 }, { "epoch": 1.2962890577284831, "grad_norm": 0.2877745032310486, "learning_rate": 1.2734819575628182e-05, "loss": 0.0952, "num_input_tokens_seen": 34400608, "step": 26530 }, { "epoch": 1.2965333593921775, "grad_norm": 0.2939121425151825, "learning_rate": 1.2726265760233039e-05, "loss": 0.1051, "num_input_tokens_seen": 34406848, "step": 26535 }, { "epoch": 1.2967776610558719, "grad_norm": 0.25061506032943726, "learning_rate": 1.271771383760197e-05, "loss": 0.087, "num_input_tokens_seen": 34413664, "step": 26540 }, { "epoch": 1.297021962719566, "grad_norm": 0.18633903563022614, "learning_rate": 1.2709163809053764e-05, "loss": 0.0591, "num_input_tokens_seen": 34420448, "step": 26545 }, { "epoch": 1.2972662643832604, "grad_norm": 0.26861801743507385, "learning_rate": 1.2700615675906963e-05, "loss": 0.1029, "num_input_tokens_seen": 34427168, "step": 26550 }, { "epoch": 1.2975105660469548, "grad_norm": 0.25846198201179504, "learning_rate": 1.269206943947978e-05, "loss": 0.0864, "num_input_tokens_seen": 34433824, "step": 26555 }, { "epoch": 1.2977548677106492, "grad_norm": 0.5602665543556213, "learning_rate": 1.2683525101090177e-05, "loss": 0.0799, "num_input_tokens_seen": 34440480, "step": 26560 }, { "epoch": 1.2979991693743433, "grad_norm": 0.6796472668647766, "learning_rate": 1.2674982662055765e-05, "loss": 0.0862, "num_input_tokens_seen": 34447712, "step": 26565 }, { "epoch": 1.2982434710380377, "grad_norm": 0.33853471279144287, "learning_rate": 1.2666442123693922e-05, "loss": 0.0621, "num_input_tokens_seen": 34454560, "step": 26570 }, { "epoch": 1.298487772701732, "grad_norm": 0.127733513712883, "learning_rate": 1.265790348732169e-05, "loss": 0.0727, "num_input_tokens_seen": 34461216, "step": 26575 }, { "epoch": 1.2987320743654265, "grad_norm": 0.5934942960739136, "learning_rate": 1.264936675425584e-05, "loss": 0.0842, "num_input_tokens_seen": 34467456, "step": 26580 }, { "epoch": 1.2989763760291209, "grad_norm": 0.1974954903125763, "learning_rate": 1.2640831925812852e-05, "loss": 0.0872, "num_input_tokens_seen": 34473504, "step": 26585 }, { "epoch": 1.299220677692815, "grad_norm": 0.21640804409980774, "learning_rate": 1.263229900330889e-05, "loss": 0.0866, "num_input_tokens_seen": 34479552, "step": 26590 }, { "epoch": 1.2994649793565094, "grad_norm": 0.6245140433311462, "learning_rate": 1.2623767988059843e-05, "loss": 0.103, "num_input_tokens_seen": 34485568, "step": 26595 }, { "epoch": 1.2997092810202038, "grad_norm": 0.18170925974845886, "learning_rate": 1.2615238881381309e-05, "loss": 0.0971, "num_input_tokens_seen": 34491904, "step": 26600 }, { "epoch": 1.2997092810202038, "eval_loss": 0.08965002000331879, "eval_runtime": 373.7852, "eval_samples_per_second": 97.342, "eval_steps_per_second": 24.338, "num_input_tokens_seen": 34491904, "step": 26600 }, { "epoch": 1.2999535826838982, "grad_norm": 0.17207972705364227, "learning_rate": 1.2606711684588568e-05, "loss": 0.0861, "num_input_tokens_seen": 34498112, "step": 26605 }, { "epoch": 1.3001978843475923, "grad_norm": 0.282608300447464, "learning_rate": 1.2598186398996636e-05, "loss": 0.0853, "num_input_tokens_seen": 34504256, "step": 26610 }, { "epoch": 1.3004421860112867, "grad_norm": 0.4231691062450409, "learning_rate": 1.2589663025920207e-05, "loss": 0.0698, "num_input_tokens_seen": 34510496, "step": 26615 }, { "epoch": 1.300686487674981, "grad_norm": 0.7363823056221008, "learning_rate": 1.2581141566673705e-05, "loss": 0.0673, "num_input_tokens_seen": 34516672, "step": 26620 }, { "epoch": 1.3009307893386755, "grad_norm": 0.5940817594528198, "learning_rate": 1.257262202257124e-05, "loss": 0.1191, "num_input_tokens_seen": 34522976, "step": 26625 }, { "epoch": 1.3011750910023698, "grad_norm": 0.4043331742286682, "learning_rate": 1.2564104394926618e-05, "loss": 0.067, "num_input_tokens_seen": 34529632, "step": 26630 }, { "epoch": 1.301419392666064, "grad_norm": 0.4983549118041992, "learning_rate": 1.2555588685053383e-05, "loss": 0.0778, "num_input_tokens_seen": 34536416, "step": 26635 }, { "epoch": 1.3016636943297584, "grad_norm": 0.4711242914199829, "learning_rate": 1.2547074894264762e-05, "loss": 0.0676, "num_input_tokens_seen": 34543136, "step": 26640 }, { "epoch": 1.3019079959934527, "grad_norm": 0.6156367063522339, "learning_rate": 1.2538563023873679e-05, "loss": 0.0759, "num_input_tokens_seen": 34549504, "step": 26645 }, { "epoch": 1.302152297657147, "grad_norm": 0.3564734160900116, "learning_rate": 1.2530053075192789e-05, "loss": 0.0842, "num_input_tokens_seen": 34555840, "step": 26650 }, { "epoch": 1.3023965993208413, "grad_norm": 0.218926340341568, "learning_rate": 1.252154504953441e-05, "loss": 0.0464, "num_input_tokens_seen": 34563328, "step": 26655 }, { "epoch": 1.3026409009845357, "grad_norm": 0.18506155908107758, "learning_rate": 1.25130389482106e-05, "loss": 0.0947, "num_input_tokens_seen": 34569600, "step": 26660 }, { "epoch": 1.30288520264823, "grad_norm": 0.29049918055534363, "learning_rate": 1.2504534772533116e-05, "loss": 0.0821, "num_input_tokens_seen": 34575712, "step": 26665 }, { "epoch": 1.3031295043119244, "grad_norm": 0.22011728584766388, "learning_rate": 1.2496032523813387e-05, "loss": 0.0843, "num_input_tokens_seen": 34582272, "step": 26670 }, { "epoch": 1.3033738059756188, "grad_norm": 0.7027329206466675, "learning_rate": 1.2487532203362576e-05, "loss": 0.1052, "num_input_tokens_seen": 34588576, "step": 26675 }, { "epoch": 1.303618107639313, "grad_norm": 0.20969580113887787, "learning_rate": 1.247903381249155e-05, "loss": 0.0565, "num_input_tokens_seen": 34595264, "step": 26680 }, { "epoch": 1.3038624093030073, "grad_norm": 0.3673431873321533, "learning_rate": 1.2470537352510853e-05, "loss": 0.0839, "num_input_tokens_seen": 34602144, "step": 26685 }, { "epoch": 1.3041067109667017, "grad_norm": 0.3885919451713562, "learning_rate": 1.2462042824730758e-05, "loss": 0.0939, "num_input_tokens_seen": 34608576, "step": 26690 }, { "epoch": 1.3043510126303959, "grad_norm": 0.2120114415884018, "learning_rate": 1.245355023046122e-05, "loss": 0.1009, "num_input_tokens_seen": 34614880, "step": 26695 }, { "epoch": 1.3045953142940903, "grad_norm": 0.1555614173412323, "learning_rate": 1.2445059571011896e-05, "loss": 0.0731, "num_input_tokens_seen": 34621536, "step": 26700 }, { "epoch": 1.3048396159577846, "grad_norm": 0.2585587501525879, "learning_rate": 1.2436570847692173e-05, "loss": 0.1094, "num_input_tokens_seen": 34627776, "step": 26705 }, { "epoch": 1.305083917621479, "grad_norm": 0.25144726037979126, "learning_rate": 1.2428084061811096e-05, "loss": 0.0996, "num_input_tokens_seen": 34634272, "step": 26710 }, { "epoch": 1.3053282192851734, "grad_norm": 0.42396095395088196, "learning_rate": 1.2419599214677447e-05, "loss": 0.1003, "num_input_tokens_seen": 34640960, "step": 26715 }, { "epoch": 1.3055725209488678, "grad_norm": 0.17031626403331757, "learning_rate": 1.2411116307599702e-05, "loss": 0.0832, "num_input_tokens_seen": 34647296, "step": 26720 }, { "epoch": 1.305816822612562, "grad_norm": 0.529815673828125, "learning_rate": 1.2402635341886016e-05, "loss": 0.0903, "num_input_tokens_seen": 34653920, "step": 26725 }, { "epoch": 1.3060611242762563, "grad_norm": 0.29202139377593994, "learning_rate": 1.2394156318844278e-05, "loss": 0.0957, "num_input_tokens_seen": 34660224, "step": 26730 }, { "epoch": 1.3063054259399507, "grad_norm": 0.15605528652668, "learning_rate": 1.2385679239782039e-05, "loss": 0.0808, "num_input_tokens_seen": 34666784, "step": 26735 }, { "epoch": 1.3065497276036449, "grad_norm": 0.16218285262584686, "learning_rate": 1.2377204106006585e-05, "loss": 0.0739, "num_input_tokens_seen": 34673760, "step": 26740 }, { "epoch": 1.3067940292673392, "grad_norm": 0.45948028564453125, "learning_rate": 1.2368730918824891e-05, "loss": 0.083, "num_input_tokens_seen": 34679968, "step": 26745 }, { "epoch": 1.3070383309310336, "grad_norm": 0.4057193100452423, "learning_rate": 1.236025967954362e-05, "loss": 0.0978, "num_input_tokens_seen": 34686560, "step": 26750 }, { "epoch": 1.307282632594728, "grad_norm": 0.34674859046936035, "learning_rate": 1.2351790389469153e-05, "loss": 0.0801, "num_input_tokens_seen": 34692864, "step": 26755 }, { "epoch": 1.3075269342584224, "grad_norm": 0.22054636478424072, "learning_rate": 1.234332304990755e-05, "loss": 0.0952, "num_input_tokens_seen": 34699040, "step": 26760 }, { "epoch": 1.3077712359221167, "grad_norm": 0.14409281313419342, "learning_rate": 1.2334857662164593e-05, "loss": 0.0688, "num_input_tokens_seen": 34705248, "step": 26765 }, { "epoch": 1.308015537585811, "grad_norm": 0.22928592562675476, "learning_rate": 1.2326394227545743e-05, "loss": 0.0913, "num_input_tokens_seen": 34711904, "step": 26770 }, { "epoch": 1.3082598392495053, "grad_norm": 0.15352295339107513, "learning_rate": 1.2317932747356162e-05, "loss": 0.1093, "num_input_tokens_seen": 34718272, "step": 26775 }, { "epoch": 1.3085041409131997, "grad_norm": 0.5049769282341003, "learning_rate": 1.2309473222900726e-05, "loss": 0.0794, "num_input_tokens_seen": 34724768, "step": 26780 }, { "epoch": 1.3087484425768938, "grad_norm": 0.13318361341953278, "learning_rate": 1.2301015655484006e-05, "loss": 0.0755, "num_input_tokens_seen": 34731232, "step": 26785 }, { "epoch": 1.3089927442405882, "grad_norm": 0.1784399449825287, "learning_rate": 1.2292560046410245e-05, "loss": 0.0588, "num_input_tokens_seen": 34737984, "step": 26790 }, { "epoch": 1.3092370459042826, "grad_norm": 0.4313248097896576, "learning_rate": 1.228410639698343e-05, "loss": 0.1034, "num_input_tokens_seen": 34744480, "step": 26795 }, { "epoch": 1.309481347567977, "grad_norm": 0.3015482723712921, "learning_rate": 1.2275654708507195e-05, "loss": 0.0691, "num_input_tokens_seen": 34751008, "step": 26800 }, { "epoch": 1.309481347567977, "eval_loss": 0.08787915110588074, "eval_runtime": 373.8152, "eval_samples_per_second": 97.334, "eval_steps_per_second": 24.336, "num_input_tokens_seen": 34751008, "step": 26800 }, { "epoch": 1.3097256492316713, "grad_norm": 0.38599538803100586, "learning_rate": 1.2267204982284908e-05, "loss": 0.076, "num_input_tokens_seen": 34757440, "step": 26805 }, { "epoch": 1.3099699508953657, "grad_norm": 0.4203595519065857, "learning_rate": 1.2258757219619635e-05, "loss": 0.0899, "num_input_tokens_seen": 34763840, "step": 26810 }, { "epoch": 1.3102142525590599, "grad_norm": 0.5691134929656982, "learning_rate": 1.2250311421814104e-05, "loss": 0.0779, "num_input_tokens_seen": 34770176, "step": 26815 }, { "epoch": 1.3104585542227543, "grad_norm": 0.6400696039199829, "learning_rate": 1.2241867590170772e-05, "loss": 0.0737, "num_input_tokens_seen": 34776576, "step": 26820 }, { "epoch": 1.3107028558864486, "grad_norm": 0.5142503380775452, "learning_rate": 1.2233425725991799e-05, "loss": 0.0852, "num_input_tokens_seen": 34783136, "step": 26825 }, { "epoch": 1.3109471575501428, "grad_norm": 0.23171840608119965, "learning_rate": 1.2224985830579003e-05, "loss": 0.079, "num_input_tokens_seen": 34789344, "step": 26830 }, { "epoch": 1.3111914592138372, "grad_norm": 0.31906360387802124, "learning_rate": 1.2216547905233944e-05, "loss": 0.0738, "num_input_tokens_seen": 34795584, "step": 26835 }, { "epoch": 1.3114357608775316, "grad_norm": 0.22312726080417633, "learning_rate": 1.2208111951257842e-05, "loss": 0.0651, "num_input_tokens_seen": 34801600, "step": 26840 }, { "epoch": 1.311680062541226, "grad_norm": 0.20204105973243713, "learning_rate": 1.2199677969951622e-05, "loss": 0.09, "num_input_tokens_seen": 34807456, "step": 26845 }, { "epoch": 1.3119243642049203, "grad_norm": 0.5351085066795349, "learning_rate": 1.2191245962615927e-05, "loss": 0.0796, "num_input_tokens_seen": 34814240, "step": 26850 }, { "epoch": 1.3121686658686147, "grad_norm": 0.23096923530101776, "learning_rate": 1.218281593055106e-05, "loss": 0.1055, "num_input_tokens_seen": 34820416, "step": 26855 }, { "epoch": 1.3124129675323088, "grad_norm": 0.4382914900779724, "learning_rate": 1.217438787505705e-05, "loss": 0.0899, "num_input_tokens_seen": 34827936, "step": 26860 }, { "epoch": 1.3126572691960032, "grad_norm": 0.3350611627101898, "learning_rate": 1.2165961797433615e-05, "loss": 0.0862, "num_input_tokens_seen": 34833856, "step": 26865 }, { "epoch": 1.3129015708596976, "grad_norm": 0.210317462682724, "learning_rate": 1.215753769898014e-05, "loss": 0.0836, "num_input_tokens_seen": 34840672, "step": 26870 }, { "epoch": 1.3131458725233918, "grad_norm": 0.3105853796005249, "learning_rate": 1.2149115580995755e-05, "loss": 0.0782, "num_input_tokens_seen": 34847872, "step": 26875 }, { "epoch": 1.3133901741870861, "grad_norm": 0.38333356380462646, "learning_rate": 1.2140695444779227e-05, "loss": 0.0733, "num_input_tokens_seen": 34854176, "step": 26880 }, { "epoch": 1.3136344758507805, "grad_norm": 0.19293589890003204, "learning_rate": 1.2132277291629066e-05, "loss": 0.0842, "num_input_tokens_seen": 34860000, "step": 26885 }, { "epoch": 1.313878777514475, "grad_norm": 0.47311392426490784, "learning_rate": 1.2123861122843458e-05, "loss": 0.0699, "num_input_tokens_seen": 34866560, "step": 26890 }, { "epoch": 1.3141230791781693, "grad_norm": 0.17211337387561798, "learning_rate": 1.2115446939720271e-05, "loss": 0.0865, "num_input_tokens_seen": 34872672, "step": 26895 }, { "epoch": 1.3143673808418637, "grad_norm": 0.27500414848327637, "learning_rate": 1.210703474355708e-05, "loss": 0.0933, "num_input_tokens_seen": 34878848, "step": 26900 }, { "epoch": 1.3146116825055578, "grad_norm": 0.6024054884910583, "learning_rate": 1.2098624535651164e-05, "loss": 0.0758, "num_input_tokens_seen": 34885664, "step": 26905 }, { "epoch": 1.3148559841692522, "grad_norm": 0.32844045758247375, "learning_rate": 1.2090216317299477e-05, "loss": 0.1198, "num_input_tokens_seen": 34892032, "step": 26910 }, { "epoch": 1.3151002858329466, "grad_norm": 0.20235095918178558, "learning_rate": 1.2081810089798668e-05, "loss": 0.0836, "num_input_tokens_seen": 34898272, "step": 26915 }, { "epoch": 1.3153445874966407, "grad_norm": 0.24638813734054565, "learning_rate": 1.2073405854445072e-05, "loss": 0.0889, "num_input_tokens_seen": 34904448, "step": 26920 }, { "epoch": 1.3155888891603351, "grad_norm": 0.6335692405700684, "learning_rate": 1.206500361253474e-05, "loss": 0.099, "num_input_tokens_seen": 34911104, "step": 26925 }, { "epoch": 1.3158331908240295, "grad_norm": 0.17380839586257935, "learning_rate": 1.2056603365363409e-05, "loss": 0.0747, "num_input_tokens_seen": 34917568, "step": 26930 }, { "epoch": 1.3160774924877239, "grad_norm": 0.5121434330940247, "learning_rate": 1.2048205114226487e-05, "loss": 0.0915, "num_input_tokens_seen": 34924000, "step": 26935 }, { "epoch": 1.3163217941514183, "grad_norm": 0.27621960639953613, "learning_rate": 1.2039808860419102e-05, "loss": 0.053, "num_input_tokens_seen": 34930816, "step": 26940 }, { "epoch": 1.3165660958151126, "grad_norm": 0.5371702909469604, "learning_rate": 1.2031414605236066e-05, "loss": 0.0733, "num_input_tokens_seen": 34937312, "step": 26945 }, { "epoch": 1.3168103974788068, "grad_norm": 0.4065188765525818, "learning_rate": 1.2023022349971862e-05, "loss": 0.0914, "num_input_tokens_seen": 34943200, "step": 26950 }, { "epoch": 1.3170546991425012, "grad_norm": 0.34896835684776306, "learning_rate": 1.20146320959207e-05, "loss": 0.0987, "num_input_tokens_seen": 34949408, "step": 26955 }, { "epoch": 1.3172990008061956, "grad_norm": 0.24393068253993988, "learning_rate": 1.2006243844376445e-05, "loss": 0.09, "num_input_tokens_seen": 34955616, "step": 26960 }, { "epoch": 1.3175433024698897, "grad_norm": 0.3281315565109253, "learning_rate": 1.1997857596632678e-05, "loss": 0.0887, "num_input_tokens_seen": 34961888, "step": 26965 }, { "epoch": 1.317787604133584, "grad_norm": 0.3368445634841919, "learning_rate": 1.1989473353982672e-05, "loss": 0.0895, "num_input_tokens_seen": 34968352, "step": 26970 }, { "epoch": 1.3180319057972785, "grad_norm": 0.9644770622253418, "learning_rate": 1.198109111771937e-05, "loss": 0.1199, "num_input_tokens_seen": 34974624, "step": 26975 }, { "epoch": 1.3182762074609728, "grad_norm": 0.29594412446022034, "learning_rate": 1.197271088913543e-05, "loss": 0.102, "num_input_tokens_seen": 34980576, "step": 26980 }, { "epoch": 1.3185205091246672, "grad_norm": 0.4222959876060486, "learning_rate": 1.1964332669523182e-05, "loss": 0.0577, "num_input_tokens_seen": 34987040, "step": 26985 }, { "epoch": 1.3187648107883614, "grad_norm": 0.1560286432504654, "learning_rate": 1.1955956460174645e-05, "loss": 0.1137, "num_input_tokens_seen": 34993856, "step": 26990 }, { "epoch": 1.3190091124520558, "grad_norm": 0.2973630428314209, "learning_rate": 1.1947582262381552e-05, "loss": 0.074, "num_input_tokens_seen": 35000000, "step": 26995 }, { "epoch": 1.3192534141157501, "grad_norm": 0.28668197989463806, "learning_rate": 1.1939210077435293e-05, "loss": 0.0955, "num_input_tokens_seen": 35006432, "step": 27000 }, { "epoch": 1.3192534141157501, "eval_loss": 0.08788041770458221, "eval_runtime": 374.5317, "eval_samples_per_second": 97.148, "eval_steps_per_second": 24.289, "num_input_tokens_seen": 35006432, "step": 27000 }, { "epoch": 1.3194977157794445, "grad_norm": 0.21074047684669495, "learning_rate": 1.193083990662697e-05, "loss": 0.071, "num_input_tokens_seen": 35012608, "step": 27005 }, { "epoch": 1.3197420174431387, "grad_norm": 0.16829338669776917, "learning_rate": 1.192247175124738e-05, "loss": 0.086, "num_input_tokens_seen": 35019136, "step": 27010 }, { "epoch": 1.319986319106833, "grad_norm": 0.6158556938171387, "learning_rate": 1.191410561258698e-05, "loss": 0.0894, "num_input_tokens_seen": 35025600, "step": 27015 }, { "epoch": 1.3202306207705274, "grad_norm": 0.1458953469991684, "learning_rate": 1.1905741491935944e-05, "loss": 0.0564, "num_input_tokens_seen": 35031936, "step": 27020 }, { "epoch": 1.3204749224342218, "grad_norm": 0.19348791241645813, "learning_rate": 1.1897379390584129e-05, "loss": 0.0822, "num_input_tokens_seen": 35038592, "step": 27025 }, { "epoch": 1.3207192240979162, "grad_norm": 0.45477795600891113, "learning_rate": 1.1889019309821062e-05, "loss": 0.0605, "num_input_tokens_seen": 35045216, "step": 27030 }, { "epoch": 1.3209635257616104, "grad_norm": 0.35538145899772644, "learning_rate": 1.188066125093599e-05, "loss": 0.0917, "num_input_tokens_seen": 35051744, "step": 27035 }, { "epoch": 1.3212078274253047, "grad_norm": 0.15278038382530212, "learning_rate": 1.1872305215217811e-05, "loss": 0.0745, "num_input_tokens_seen": 35058304, "step": 27040 }, { "epoch": 1.3214521290889991, "grad_norm": 0.246846541762352, "learning_rate": 1.186395120395514e-05, "loss": 0.087, "num_input_tokens_seen": 35065056, "step": 27045 }, { "epoch": 1.3216964307526935, "grad_norm": 0.10363039374351501, "learning_rate": 1.1855599218436283e-05, "loss": 0.0642, "num_input_tokens_seen": 35072288, "step": 27050 }, { "epoch": 1.3219407324163877, "grad_norm": 0.4505213797092438, "learning_rate": 1.1847249259949209e-05, "loss": 0.0741, "num_input_tokens_seen": 35078752, "step": 27055 }, { "epoch": 1.322185034080082, "grad_norm": 0.14330236613750458, "learning_rate": 1.1838901329781574e-05, "loss": 0.0663, "num_input_tokens_seen": 35084992, "step": 27060 }, { "epoch": 1.3224293357437764, "grad_norm": 0.6975647807121277, "learning_rate": 1.1830555429220758e-05, "loss": 0.1363, "num_input_tokens_seen": 35091584, "step": 27065 }, { "epoch": 1.3226736374074708, "grad_norm": 0.3814919590950012, "learning_rate": 1.1822211559553784e-05, "loss": 0.0828, "num_input_tokens_seen": 35097504, "step": 27070 }, { "epoch": 1.3229179390711652, "grad_norm": 0.2261962890625, "learning_rate": 1.18138697220674e-05, "loss": 0.1106, "num_input_tokens_seen": 35104032, "step": 27075 }, { "epoch": 1.3231622407348593, "grad_norm": 0.27691468596458435, "learning_rate": 1.1805529918048e-05, "loss": 0.0715, "num_input_tokens_seen": 35110560, "step": 27080 }, { "epoch": 1.3234065423985537, "grad_norm": 0.5232120752334595, "learning_rate": 1.1797192148781702e-05, "loss": 0.1152, "num_input_tokens_seen": 35116576, "step": 27085 }, { "epoch": 1.323650844062248, "grad_norm": 0.2660790979862213, "learning_rate": 1.1788856415554297e-05, "loss": 0.0738, "num_input_tokens_seen": 35122880, "step": 27090 }, { "epoch": 1.3238951457259425, "grad_norm": 0.36949095129966736, "learning_rate": 1.1780522719651249e-05, "loss": 0.07, "num_input_tokens_seen": 35130016, "step": 27095 }, { "epoch": 1.3241394473896366, "grad_norm": 0.4084712564945221, "learning_rate": 1.1772191062357721e-05, "loss": 0.1127, "num_input_tokens_seen": 35136256, "step": 27100 }, { "epoch": 1.324383749053331, "grad_norm": 0.2765308916568756, "learning_rate": 1.1763861444958573e-05, "loss": 0.095, "num_input_tokens_seen": 35142656, "step": 27105 }, { "epoch": 1.3246280507170254, "grad_norm": 0.4375302791595459, "learning_rate": 1.1755533868738317e-05, "loss": 0.0751, "num_input_tokens_seen": 35148864, "step": 27110 }, { "epoch": 1.3248723523807198, "grad_norm": 0.2973688542842865, "learning_rate": 1.1747208334981185e-05, "loss": 0.0897, "num_input_tokens_seen": 35155360, "step": 27115 }, { "epoch": 1.3251166540444141, "grad_norm": 0.7601795196533203, "learning_rate": 1.1738884844971067e-05, "loss": 0.106, "num_input_tokens_seen": 35161376, "step": 27120 }, { "epoch": 1.3253609557081083, "grad_norm": 0.3455275297164917, "learning_rate": 1.1730563399991563e-05, "loss": 0.122, "num_input_tokens_seen": 35167840, "step": 27125 }, { "epoch": 1.3256052573718027, "grad_norm": 0.588085412979126, "learning_rate": 1.1722244001325938e-05, "loss": 0.0603, "num_input_tokens_seen": 35174944, "step": 27130 }, { "epoch": 1.325849559035497, "grad_norm": 0.4385225176811218, "learning_rate": 1.1713926650257137e-05, "loss": 0.0959, "num_input_tokens_seen": 35180928, "step": 27135 }, { "epoch": 1.3260938606991914, "grad_norm": 0.36062660813331604, "learning_rate": 1.170561134806781e-05, "loss": 0.0934, "num_input_tokens_seen": 35187424, "step": 27140 }, { "epoch": 1.3263381623628856, "grad_norm": 0.5500331521034241, "learning_rate": 1.1697298096040287e-05, "loss": 0.0833, "num_input_tokens_seen": 35193792, "step": 27145 }, { "epoch": 1.32658246402658, "grad_norm": 0.15200036764144897, "learning_rate": 1.1688986895456567e-05, "loss": 0.0776, "num_input_tokens_seen": 35199936, "step": 27150 }, { "epoch": 1.3268267656902744, "grad_norm": 0.3005058467388153, "learning_rate": 1.1680677747598349e-05, "loss": 0.0704, "num_input_tokens_seen": 35206304, "step": 27155 }, { "epoch": 1.3270710673539687, "grad_norm": 0.25406742095947266, "learning_rate": 1.1672370653746995e-05, "loss": 0.0753, "num_input_tokens_seen": 35212544, "step": 27160 }, { "epoch": 1.3273153690176631, "grad_norm": 0.21330970525741577, "learning_rate": 1.166406561518357e-05, "loss": 0.0717, "num_input_tokens_seen": 35219616, "step": 27165 }, { "epoch": 1.3275596706813573, "grad_norm": 0.6759474277496338, "learning_rate": 1.1655762633188826e-05, "loss": 0.0909, "num_input_tokens_seen": 35225984, "step": 27170 }, { "epoch": 1.3278039723450517, "grad_norm": 0.3461734354496002, "learning_rate": 1.1647461709043172e-05, "loss": 0.0841, "num_input_tokens_seen": 35232672, "step": 27175 }, { "epoch": 1.328048274008746, "grad_norm": 0.3548443019390106, "learning_rate": 1.1639162844026722e-05, "loss": 0.0762, "num_input_tokens_seen": 35239072, "step": 27180 }, { "epoch": 1.3282925756724404, "grad_norm": 0.2416105568408966, "learning_rate": 1.163086603941927e-05, "loss": 0.0727, "num_input_tokens_seen": 35245376, "step": 27185 }, { "epoch": 1.3285368773361346, "grad_norm": 0.1261776238679886, "learning_rate": 1.1622571296500273e-05, "loss": 0.0941, "num_input_tokens_seen": 35251968, "step": 27190 }, { "epoch": 1.328781178999829, "grad_norm": 0.3993528485298157, "learning_rate": 1.1614278616548904e-05, "loss": 0.0783, "num_input_tokens_seen": 35258624, "step": 27195 }, { "epoch": 1.3290254806635233, "grad_norm": 0.14635702967643738, "learning_rate": 1.1605988000843986e-05, "loss": 0.0723, "num_input_tokens_seen": 35264896, "step": 27200 }, { "epoch": 1.3290254806635233, "eval_loss": 0.08854660391807556, "eval_runtime": 374.4719, "eval_samples_per_second": 97.164, "eval_steps_per_second": 24.293, "num_input_tokens_seen": 35264896, "step": 27200 }, { "epoch": 1.3292697823272177, "grad_norm": 0.6875989437103271, "learning_rate": 1.1597699450664028e-05, "loss": 0.0842, "num_input_tokens_seen": 35271040, "step": 27205 }, { "epoch": 1.329514083990912, "grad_norm": 0.25056183338165283, "learning_rate": 1.1589412967287252e-05, "loss": 0.0697, "num_input_tokens_seen": 35277472, "step": 27210 }, { "epoch": 1.3297583856546062, "grad_norm": 0.584972083568573, "learning_rate": 1.1581128551991514e-05, "loss": 0.1242, "num_input_tokens_seen": 35284384, "step": 27215 }, { "epoch": 1.3300026873183006, "grad_norm": 0.44610828161239624, "learning_rate": 1.1572846206054383e-05, "loss": 0.1134, "num_input_tokens_seen": 35290944, "step": 27220 }, { "epoch": 1.330246988981995, "grad_norm": 0.8149862885475159, "learning_rate": 1.1564565930753113e-05, "loss": 0.0883, "num_input_tokens_seen": 35297312, "step": 27225 }, { "epoch": 1.3304912906456892, "grad_norm": 0.40273991227149963, "learning_rate": 1.1556287727364606e-05, "loss": 0.118, "num_input_tokens_seen": 35303520, "step": 27230 }, { "epoch": 1.3307355923093835, "grad_norm": 0.3572959899902344, "learning_rate": 1.1548011597165489e-05, "loss": 0.102, "num_input_tokens_seen": 35309472, "step": 27235 }, { "epoch": 1.330979893973078, "grad_norm": 0.2831721305847168, "learning_rate": 1.1539737541432019e-05, "loss": 0.0728, "num_input_tokens_seen": 35315584, "step": 27240 }, { "epoch": 1.3312241956367723, "grad_norm": 0.30352282524108887, "learning_rate": 1.1531465561440174e-05, "loss": 0.0731, "num_input_tokens_seen": 35321472, "step": 27245 }, { "epoch": 1.3314684973004667, "grad_norm": 0.31647881865501404, "learning_rate": 1.1523195658465605e-05, "loss": 0.0958, "num_input_tokens_seen": 35328000, "step": 27250 }, { "epoch": 1.331712798964161, "grad_norm": 0.5331254601478577, "learning_rate": 1.1514927833783618e-05, "loss": 0.1001, "num_input_tokens_seen": 35334368, "step": 27255 }, { "epoch": 1.3319571006278552, "grad_norm": 0.23662962019443512, "learning_rate": 1.150666208866922e-05, "loss": 0.0845, "num_input_tokens_seen": 35341024, "step": 27260 }, { "epoch": 1.3322014022915496, "grad_norm": 0.29525014758110046, "learning_rate": 1.1498398424397106e-05, "loss": 0.0754, "num_input_tokens_seen": 35347296, "step": 27265 }, { "epoch": 1.332445703955244, "grad_norm": 0.2045425921678543, "learning_rate": 1.1490136842241628e-05, "loss": 0.0782, "num_input_tokens_seen": 35353632, "step": 27270 }, { "epoch": 1.3326900056189381, "grad_norm": 0.6299206018447876, "learning_rate": 1.1481877343476813e-05, "loss": 0.0678, "num_input_tokens_seen": 35360192, "step": 27275 }, { "epoch": 1.3329343072826325, "grad_norm": 0.19797028601169586, "learning_rate": 1.14736199293764e-05, "loss": 0.0639, "num_input_tokens_seen": 35366816, "step": 27280 }, { "epoch": 1.333178608946327, "grad_norm": 0.24506254494190216, "learning_rate": 1.1465364601213771e-05, "loss": 0.0813, "num_input_tokens_seen": 35373280, "step": 27285 }, { "epoch": 1.3334229106100213, "grad_norm": 0.23586654663085938, "learning_rate": 1.1457111360262012e-05, "loss": 0.0906, "num_input_tokens_seen": 35379648, "step": 27290 }, { "epoch": 1.3336672122737157, "grad_norm": 0.3510473668575287, "learning_rate": 1.1448860207793869e-05, "loss": 0.11, "num_input_tokens_seen": 35385952, "step": 27295 }, { "epoch": 1.33391151393741, "grad_norm": 0.16034314036369324, "learning_rate": 1.144061114508177e-05, "loss": 0.0843, "num_input_tokens_seen": 35392576, "step": 27300 }, { "epoch": 1.3341558156011042, "grad_norm": 0.23572205007076263, "learning_rate": 1.1432364173397842e-05, "loss": 0.0669, "num_input_tokens_seen": 35399584, "step": 27305 }, { "epoch": 1.3344001172647986, "grad_norm": 0.19369269907474518, "learning_rate": 1.1424119294013852e-05, "loss": 0.0907, "num_input_tokens_seen": 35406432, "step": 27310 }, { "epoch": 1.334644418928493, "grad_norm": 0.17710471153259277, "learning_rate": 1.1415876508201279e-05, "loss": 0.1123, "num_input_tokens_seen": 35412768, "step": 27315 }, { "epoch": 1.334888720592187, "grad_norm": 0.5967814922332764, "learning_rate": 1.140763581723125e-05, "loss": 0.1033, "num_input_tokens_seen": 35419104, "step": 27320 }, { "epoch": 1.3351330222558815, "grad_norm": 0.4443228542804718, "learning_rate": 1.1399397222374588e-05, "loss": 0.0712, "num_input_tokens_seen": 35425824, "step": 27325 }, { "epoch": 1.3353773239195759, "grad_norm": 0.5330938696861267, "learning_rate": 1.1391160724901804e-05, "loss": 0.0862, "num_input_tokens_seen": 35432256, "step": 27330 }, { "epoch": 1.3356216255832702, "grad_norm": 0.33844906091690063, "learning_rate": 1.138292632608304e-05, "loss": 0.0661, "num_input_tokens_seen": 35439040, "step": 27335 }, { "epoch": 1.3358659272469646, "grad_norm": 0.23182417452335358, "learning_rate": 1.1374694027188174e-05, "loss": 0.0927, "num_input_tokens_seen": 35445280, "step": 27340 }, { "epoch": 1.336110228910659, "grad_norm": 0.8074058890342712, "learning_rate": 1.1366463829486711e-05, "loss": 0.1201, "num_input_tokens_seen": 35451872, "step": 27345 }, { "epoch": 1.3363545305743532, "grad_norm": 0.5312296748161316, "learning_rate": 1.1358235734247849e-05, "loss": 0.0995, "num_input_tokens_seen": 35458240, "step": 27350 }, { "epoch": 1.3365988322380475, "grad_norm": 0.2659396529197693, "learning_rate": 1.1350009742740478e-05, "loss": 0.096, "num_input_tokens_seen": 35464640, "step": 27355 }, { "epoch": 1.336843133901742, "grad_norm": 0.23808054625988007, "learning_rate": 1.134178585623313e-05, "loss": 0.0881, "num_input_tokens_seen": 35471456, "step": 27360 }, { "epoch": 1.337087435565436, "grad_norm": 0.3810248076915741, "learning_rate": 1.1333564075994047e-05, "loss": 0.055, "num_input_tokens_seen": 35477792, "step": 27365 }, { "epoch": 1.3373317372291305, "grad_norm": 0.10511555522680283, "learning_rate": 1.1325344403291133e-05, "loss": 0.091, "num_input_tokens_seen": 35484000, "step": 27370 }, { "epoch": 1.3375760388928248, "grad_norm": 0.6356857419013977, "learning_rate": 1.1317126839391951e-05, "loss": 0.0699, "num_input_tokens_seen": 35490752, "step": 27375 }, { "epoch": 1.3378203405565192, "grad_norm": 0.1832631677389145, "learning_rate": 1.1308911385563766e-05, "loss": 0.0768, "num_input_tokens_seen": 35497248, "step": 27380 }, { "epoch": 1.3380646422202136, "grad_norm": 0.3534155786037445, "learning_rate": 1.1300698043073494e-05, "loss": 0.0681, "num_input_tokens_seen": 35504000, "step": 27385 }, { "epoch": 1.338308943883908, "grad_norm": 0.23129203915596008, "learning_rate": 1.1292486813187736e-05, "loss": 0.0855, "num_input_tokens_seen": 35510688, "step": 27390 }, { "epoch": 1.3385532455476021, "grad_norm": 0.2685849964618683, "learning_rate": 1.1284277697172782e-05, "loss": 0.0817, "num_input_tokens_seen": 35517184, "step": 27395 }, { "epoch": 1.3387975472112965, "grad_norm": 0.520972728729248, "learning_rate": 1.127607069629456e-05, "loss": 0.0792, "num_input_tokens_seen": 35523424, "step": 27400 }, { "epoch": 1.3387975472112965, "eval_loss": 0.08910129964351654, "eval_runtime": 374.0374, "eval_samples_per_second": 97.276, "eval_steps_per_second": 24.321, "num_input_tokens_seen": 35523424, "step": 27400 }, { "epoch": 1.339041848874991, "grad_norm": 0.17017464339733124, "learning_rate": 1.1267865811818701e-05, "loss": 0.0864, "num_input_tokens_seen": 35529824, "step": 27405 }, { "epoch": 1.339286150538685, "grad_norm": 0.2574337124824524, "learning_rate": 1.1259663045010513e-05, "loss": 0.0821, "num_input_tokens_seen": 35536224, "step": 27410 }, { "epoch": 1.3395304522023794, "grad_norm": 0.24069121479988098, "learning_rate": 1.1251462397134957e-05, "loss": 0.0931, "num_input_tokens_seen": 35542368, "step": 27415 }, { "epoch": 1.3397747538660738, "grad_norm": 0.5087684392929077, "learning_rate": 1.1243263869456664e-05, "loss": 0.0775, "num_input_tokens_seen": 35548672, "step": 27420 }, { "epoch": 1.3400190555297682, "grad_norm": 0.27102094888687134, "learning_rate": 1.1235067463239967e-05, "loss": 0.0591, "num_input_tokens_seen": 35554976, "step": 27425 }, { "epoch": 1.3402633571934626, "grad_norm": 0.33073291182518005, "learning_rate": 1.122687317974884e-05, "loss": 0.0526, "num_input_tokens_seen": 35561088, "step": 27430 }, { "epoch": 1.340507658857157, "grad_norm": 0.38105738162994385, "learning_rate": 1.1218681020246963e-05, "loss": 0.0651, "num_input_tokens_seen": 35567680, "step": 27435 }, { "epoch": 1.340751960520851, "grad_norm": 0.34911519289016724, "learning_rate": 1.1210490985997652e-05, "loss": 0.0804, "num_input_tokens_seen": 35574016, "step": 27440 }, { "epoch": 1.3409962621845455, "grad_norm": 0.4833929240703583, "learning_rate": 1.1202303078263917e-05, "loss": 0.0849, "num_input_tokens_seen": 35580608, "step": 27445 }, { "epoch": 1.3412405638482399, "grad_norm": 0.09251074492931366, "learning_rate": 1.1194117298308451e-05, "loss": 0.0528, "num_input_tokens_seen": 35587552, "step": 27450 }, { "epoch": 1.341484865511934, "grad_norm": 0.9327024221420288, "learning_rate": 1.1185933647393585e-05, "loss": 0.1253, "num_input_tokens_seen": 35593888, "step": 27455 }, { "epoch": 1.3417291671756284, "grad_norm": 0.21100248396396637, "learning_rate": 1.1177752126781354e-05, "loss": 0.0863, "num_input_tokens_seen": 35600096, "step": 27460 }, { "epoch": 1.3419734688393228, "grad_norm": 0.16278444230556488, "learning_rate": 1.1169572737733441e-05, "loss": 0.0949, "num_input_tokens_seen": 35606080, "step": 27465 }, { "epoch": 1.3422177705030172, "grad_norm": 0.3447291851043701, "learning_rate": 1.1161395481511216e-05, "loss": 0.0728, "num_input_tokens_seen": 35612960, "step": 27470 }, { "epoch": 1.3424620721667115, "grad_norm": 0.2095881849527359, "learning_rate": 1.1153220359375722e-05, "loss": 0.0882, "num_input_tokens_seen": 35619136, "step": 27475 }, { "epoch": 1.342706373830406, "grad_norm": 0.40788814425468445, "learning_rate": 1.114504737258765e-05, "loss": 0.0691, "num_input_tokens_seen": 35625728, "step": 27480 }, { "epoch": 1.3429506754941, "grad_norm": 0.2588546574115753, "learning_rate": 1.1136876522407393e-05, "loss": 0.104, "num_input_tokens_seen": 35632160, "step": 27485 }, { "epoch": 1.3431949771577945, "grad_norm": 0.4327765107154846, "learning_rate": 1.1128707810094985e-05, "loss": 0.0783, "num_input_tokens_seen": 35638944, "step": 27490 }, { "epoch": 1.3434392788214888, "grad_norm": 0.17193856835365295, "learning_rate": 1.1120541236910157e-05, "loss": 0.0747, "num_input_tokens_seen": 35645408, "step": 27495 }, { "epoch": 1.343683580485183, "grad_norm": 0.3934682607650757, "learning_rate": 1.111237680411229e-05, "loss": 0.0966, "num_input_tokens_seen": 35651552, "step": 27500 }, { "epoch": 1.3439278821488774, "grad_norm": 0.13039366900920868, "learning_rate": 1.1104214512960433e-05, "loss": 0.102, "num_input_tokens_seen": 35658976, "step": 27505 }, { "epoch": 1.3441721838125718, "grad_norm": 0.2321646511554718, "learning_rate": 1.1096054364713327e-05, "loss": 0.0917, "num_input_tokens_seen": 35665280, "step": 27510 }, { "epoch": 1.3444164854762661, "grad_norm": 0.08879397809505463, "learning_rate": 1.1087896360629371e-05, "loss": 0.0626, "num_input_tokens_seen": 35671904, "step": 27515 }, { "epoch": 1.3446607871399605, "grad_norm": 0.6903400421142578, "learning_rate": 1.107974050196662e-05, "loss": 0.0907, "num_input_tokens_seen": 35677888, "step": 27520 }, { "epoch": 1.344905088803655, "grad_norm": 0.3092653155326843, "learning_rate": 1.1071586789982816e-05, "loss": 0.0737, "num_input_tokens_seen": 35684256, "step": 27525 }, { "epoch": 1.345149390467349, "grad_norm": 0.23166336119174957, "learning_rate": 1.1063435225935373e-05, "loss": 0.0936, "num_input_tokens_seen": 35690528, "step": 27530 }, { "epoch": 1.3453936921310434, "grad_norm": 0.6909124851226807, "learning_rate": 1.1055285811081348e-05, "loss": 0.0693, "num_input_tokens_seen": 35697056, "step": 27535 }, { "epoch": 1.3456379937947378, "grad_norm": 0.27175334095954895, "learning_rate": 1.1047138546677499e-05, "loss": 0.0762, "num_input_tokens_seen": 35703552, "step": 27540 }, { "epoch": 1.345882295458432, "grad_norm": 0.1827247589826584, "learning_rate": 1.1038993433980219e-05, "loss": 0.0882, "num_input_tokens_seen": 35709760, "step": 27545 }, { "epoch": 1.3461265971221263, "grad_norm": 0.6189061999320984, "learning_rate": 1.1030850474245597e-05, "loss": 0.0742, "num_input_tokens_seen": 35715872, "step": 27550 }, { "epoch": 1.3463708987858207, "grad_norm": 0.24793069064617157, "learning_rate": 1.102270966872939e-05, "loss": 0.0961, "num_input_tokens_seen": 35722336, "step": 27555 }, { "epoch": 1.346615200449515, "grad_norm": 0.30483001470565796, "learning_rate": 1.1014571018687e-05, "loss": 0.101, "num_input_tokens_seen": 35728704, "step": 27560 }, { "epoch": 1.3468595021132095, "grad_norm": 0.23155170679092407, "learning_rate": 1.1006434525373502e-05, "loss": 0.0787, "num_input_tokens_seen": 35735008, "step": 27565 }, { "epoch": 1.3471038037769036, "grad_norm": 0.5824602246284485, "learning_rate": 1.0998300190043664e-05, "loss": 0.0719, "num_input_tokens_seen": 35741664, "step": 27570 }, { "epoch": 1.347348105440598, "grad_norm": 0.407875120639801, "learning_rate": 1.0990168013951882e-05, "loss": 0.1032, "num_input_tokens_seen": 35748000, "step": 27575 }, { "epoch": 1.3475924071042924, "grad_norm": 0.6408826112747192, "learning_rate": 1.0982037998352263e-05, "loss": 0.0763, "num_input_tokens_seen": 35754336, "step": 27580 }, { "epoch": 1.3478367087679868, "grad_norm": 0.4399871528148651, "learning_rate": 1.0973910144498534e-05, "loss": 0.0908, "num_input_tokens_seen": 35760992, "step": 27585 }, { "epoch": 1.348081010431681, "grad_norm": 0.6214854717254639, "learning_rate": 1.0965784453644123e-05, "loss": 0.0856, "num_input_tokens_seen": 35767872, "step": 27590 }, { "epoch": 1.3483253120953753, "grad_norm": 0.4520018696784973, "learning_rate": 1.0957660927042127e-05, "loss": 0.1094, "num_input_tokens_seen": 35774208, "step": 27595 }, { "epoch": 1.3485696137590697, "grad_norm": 0.17335958778858185, "learning_rate": 1.094953956594527e-05, "loss": 0.0965, "num_input_tokens_seen": 35781024, "step": 27600 }, { "epoch": 1.3485696137590697, "eval_loss": 0.08766358345746994, "eval_runtime": 375.3291, "eval_samples_per_second": 96.942, "eval_steps_per_second": 24.237, "num_input_tokens_seen": 35781024, "step": 27600 }, { "epoch": 1.348813915422764, "grad_norm": 0.2580772042274475, "learning_rate": 1.0941420371605981e-05, "loss": 0.0925, "num_input_tokens_seen": 35787616, "step": 27605 }, { "epoch": 1.3490582170864585, "grad_norm": 0.21806703507900238, "learning_rate": 1.0933303345276354e-05, "loss": 0.0739, "num_input_tokens_seen": 35793920, "step": 27610 }, { "epoch": 1.3493025187501526, "grad_norm": 0.18506518006324768, "learning_rate": 1.0925188488208112e-05, "loss": 0.067, "num_input_tokens_seen": 35799936, "step": 27615 }, { "epoch": 1.349546820413847, "grad_norm": 0.25149959325790405, "learning_rate": 1.0917075801652694e-05, "loss": 0.0877, "num_input_tokens_seen": 35806560, "step": 27620 }, { "epoch": 1.3497911220775414, "grad_norm": 0.22816751897335052, "learning_rate": 1.0908965286861151e-05, "loss": 0.0906, "num_input_tokens_seen": 35813216, "step": 27625 }, { "epoch": 1.3500354237412358, "grad_norm": 0.5120549201965332, "learning_rate": 1.090085694508425e-05, "loss": 0.1068, "num_input_tokens_seen": 35819936, "step": 27630 }, { "epoch": 1.35027972540493, "grad_norm": 0.4117245376110077, "learning_rate": 1.089275077757238e-05, "loss": 0.1064, "num_input_tokens_seen": 35826208, "step": 27635 }, { "epoch": 1.3505240270686243, "grad_norm": 0.14236371219158173, "learning_rate": 1.0884646785575633e-05, "loss": 0.0798, "num_input_tokens_seen": 35833344, "step": 27640 }, { "epoch": 1.3507683287323187, "grad_norm": 0.35144299268722534, "learning_rate": 1.0876544970343728e-05, "loss": 0.0758, "num_input_tokens_seen": 35839776, "step": 27645 }, { "epoch": 1.351012630396013, "grad_norm": 0.20916326344013214, "learning_rate": 1.0868445333126082e-05, "loss": 0.0782, "num_input_tokens_seen": 35846176, "step": 27650 }, { "epoch": 1.3512569320597074, "grad_norm": 0.2017519772052765, "learning_rate": 1.0860347875171745e-05, "loss": 0.0658, "num_input_tokens_seen": 35852576, "step": 27655 }, { "epoch": 1.3515012337234016, "grad_norm": 0.3907008171081543, "learning_rate": 1.0852252597729465e-05, "loss": 0.0834, "num_input_tokens_seen": 35859072, "step": 27660 }, { "epoch": 1.351745535387096, "grad_norm": 0.4270867705345154, "learning_rate": 1.0844159502047615e-05, "loss": 0.1073, "num_input_tokens_seen": 35865184, "step": 27665 }, { "epoch": 1.3519898370507903, "grad_norm": 0.43507930636405945, "learning_rate": 1.0836068589374265e-05, "loss": 0.0979, "num_input_tokens_seen": 35871744, "step": 27670 }, { "epoch": 1.3522341387144847, "grad_norm": 0.6292684078216553, "learning_rate": 1.0827979860957144e-05, "loss": 0.0694, "num_input_tokens_seen": 35878208, "step": 27675 }, { "epoch": 1.3524784403781789, "grad_norm": 0.6141310334205627, "learning_rate": 1.0819893318043615e-05, "loss": 0.1026, "num_input_tokens_seen": 35884512, "step": 27680 }, { "epoch": 1.3527227420418733, "grad_norm": 0.3463062047958374, "learning_rate": 1.0811808961880734e-05, "loss": 0.0795, "num_input_tokens_seen": 35890944, "step": 27685 }, { "epoch": 1.3529670437055676, "grad_norm": 0.2885027825832367, "learning_rate": 1.080372679371522e-05, "loss": 0.0793, "num_input_tokens_seen": 35897472, "step": 27690 }, { "epoch": 1.353211345369262, "grad_norm": 0.17170584201812744, "learning_rate": 1.0795646814793428e-05, "loss": 0.0889, "num_input_tokens_seen": 35904320, "step": 27695 }, { "epoch": 1.3534556470329564, "grad_norm": 0.2629190981388092, "learning_rate": 1.078756902636141e-05, "loss": 0.0967, "num_input_tokens_seen": 35910656, "step": 27700 }, { "epoch": 1.3536999486966506, "grad_norm": 0.808660626411438, "learning_rate": 1.077949342966485e-05, "loss": 0.0706, "num_input_tokens_seen": 35916896, "step": 27705 }, { "epoch": 1.353944250360345, "grad_norm": 0.1922212541103363, "learning_rate": 1.0771420025949103e-05, "loss": 0.0877, "num_input_tokens_seen": 35923424, "step": 27710 }, { "epoch": 1.3541885520240393, "grad_norm": 0.3409559726715088, "learning_rate": 1.0763348816459204e-05, "loss": 0.1013, "num_input_tokens_seen": 35930368, "step": 27715 }, { "epoch": 1.3544328536877337, "grad_norm": 0.24094438552856445, "learning_rate": 1.0755279802439816e-05, "loss": 0.0732, "num_input_tokens_seen": 35936576, "step": 27720 }, { "epoch": 1.3546771553514279, "grad_norm": 0.2189141809940338, "learning_rate": 1.0747212985135293e-05, "loss": 0.0697, "num_input_tokens_seen": 35943072, "step": 27725 }, { "epoch": 1.3549214570151222, "grad_norm": 0.22221939265727997, "learning_rate": 1.073914836578965e-05, "loss": 0.1166, "num_input_tokens_seen": 35948992, "step": 27730 }, { "epoch": 1.3551657586788166, "grad_norm": 0.5002884268760681, "learning_rate": 1.0731085945646529e-05, "loss": 0.0647, "num_input_tokens_seen": 35955296, "step": 27735 }, { "epoch": 1.355410060342511, "grad_norm": 0.542829692363739, "learning_rate": 1.0723025725949285e-05, "loss": 0.0937, "num_input_tokens_seen": 35961536, "step": 27740 }, { "epoch": 1.3556543620062054, "grad_norm": 0.20491409301757812, "learning_rate": 1.0714967707940875e-05, "loss": 0.1072, "num_input_tokens_seen": 35967680, "step": 27745 }, { "epoch": 1.3558986636698995, "grad_norm": 0.13390137255191803, "learning_rate": 1.0706911892863963e-05, "loss": 0.0743, "num_input_tokens_seen": 35974528, "step": 27750 }, { "epoch": 1.356142965333594, "grad_norm": 0.18735338747501373, "learning_rate": 1.0698858281960866e-05, "loss": 0.0627, "num_input_tokens_seen": 35980960, "step": 27755 }, { "epoch": 1.3563872669972883, "grad_norm": 0.3806086778640747, "learning_rate": 1.069080687647353e-05, "loss": 0.1186, "num_input_tokens_seen": 35987136, "step": 27760 }, { "epoch": 1.3566315686609824, "grad_norm": 0.2951149642467499, "learning_rate": 1.0682757677643596e-05, "loss": 0.0931, "num_input_tokens_seen": 35993664, "step": 27765 }, { "epoch": 1.3568758703246768, "grad_norm": 0.24909541010856628, "learning_rate": 1.0674710686712359e-05, "loss": 0.0865, "num_input_tokens_seen": 36000384, "step": 27770 }, { "epoch": 1.3571201719883712, "grad_norm": 0.46057626605033875, "learning_rate": 1.0666665904920756e-05, "loss": 0.0777, "num_input_tokens_seen": 36006944, "step": 27775 }, { "epoch": 1.3573644736520656, "grad_norm": 0.28065380454063416, "learning_rate": 1.0658623333509385e-05, "loss": 0.0777, "num_input_tokens_seen": 36013600, "step": 27780 }, { "epoch": 1.35760877531576, "grad_norm": 0.2508014738559723, "learning_rate": 1.0650582973718532e-05, "loss": 0.0666, "num_input_tokens_seen": 36020544, "step": 27785 }, { "epoch": 1.3578530769794543, "grad_norm": 0.14338430762290955, "learning_rate": 1.0642544826788098e-05, "loss": 0.086, "num_input_tokens_seen": 36027264, "step": 27790 }, { "epoch": 1.3580973786431485, "grad_norm": 0.09113239496946335, "learning_rate": 1.063450889395769e-05, "loss": 0.0755, "num_input_tokens_seen": 36033696, "step": 27795 }, { "epoch": 1.3583416803068429, "grad_norm": 0.15070119500160217, "learning_rate": 1.062647517646653e-05, "loss": 0.0966, "num_input_tokens_seen": 36040224, "step": 27800 }, { "epoch": 1.3583416803068429, "eval_loss": 0.08757250010967255, "eval_runtime": 375.1753, "eval_samples_per_second": 96.981, "eval_steps_per_second": 24.247, "num_input_tokens_seen": 36040224, "step": 27800 }, { "epoch": 1.3585859819705373, "grad_norm": 0.21119070053100586, "learning_rate": 1.0618443675553527e-05, "loss": 0.0832, "num_input_tokens_seen": 36046688, "step": 27805 }, { "epoch": 1.3588302836342314, "grad_norm": 0.3704436123371124, "learning_rate": 1.0610414392457247e-05, "loss": 0.09, "num_input_tokens_seen": 36052928, "step": 27810 }, { "epoch": 1.3590745852979258, "grad_norm": 0.2326943725347519, "learning_rate": 1.0602387328415888e-05, "loss": 0.1033, "num_input_tokens_seen": 36059552, "step": 27815 }, { "epoch": 1.3593188869616202, "grad_norm": 0.4223805367946625, "learning_rate": 1.0594362484667347e-05, "loss": 0.0854, "num_input_tokens_seen": 36066368, "step": 27820 }, { "epoch": 1.3595631886253146, "grad_norm": 0.11788204312324524, "learning_rate": 1.0586339862449132e-05, "loss": 0.0688, "num_input_tokens_seen": 36073088, "step": 27825 }, { "epoch": 1.359807490289009, "grad_norm": 0.8583989143371582, "learning_rate": 1.0578319462998445e-05, "loss": 0.0702, "num_input_tokens_seen": 36079520, "step": 27830 }, { "epoch": 1.3600517919527033, "grad_norm": 0.22187335789203644, "learning_rate": 1.057030128755214e-05, "loss": 0.0629, "num_input_tokens_seen": 36086464, "step": 27835 }, { "epoch": 1.3602960936163975, "grad_norm": 0.3141491711139679, "learning_rate": 1.0562285337346703e-05, "loss": 0.1095, "num_input_tokens_seen": 36092768, "step": 27840 }, { "epoch": 1.3605403952800919, "grad_norm": 0.14251071214675903, "learning_rate": 1.0554271613618308e-05, "loss": 0.0933, "num_input_tokens_seen": 36098912, "step": 27845 }, { "epoch": 1.3607846969437862, "grad_norm": 0.5315182209014893, "learning_rate": 1.054626011760276e-05, "loss": 0.0904, "num_input_tokens_seen": 36105696, "step": 27850 }, { "epoch": 1.3610289986074804, "grad_norm": 0.15151885151863098, "learning_rate": 1.0538250850535549e-05, "loss": 0.0805, "num_input_tokens_seen": 36112064, "step": 27855 }, { "epoch": 1.3612733002711748, "grad_norm": 0.19914481043815613, "learning_rate": 1.0530243813651794e-05, "loss": 0.1052, "num_input_tokens_seen": 36119072, "step": 27860 }, { "epoch": 1.3615176019348691, "grad_norm": 0.14661958813667297, "learning_rate": 1.0522239008186271e-05, "loss": 0.0932, "num_input_tokens_seen": 36125440, "step": 27865 }, { "epoch": 1.3617619035985635, "grad_norm": 0.1602923423051834, "learning_rate": 1.0514236435373434e-05, "loss": 0.0933, "num_input_tokens_seen": 36131840, "step": 27870 }, { "epoch": 1.362006205262258, "grad_norm": 0.6493881940841675, "learning_rate": 1.0506236096447386e-05, "loss": 0.0903, "num_input_tokens_seen": 36138144, "step": 27875 }, { "epoch": 1.3622505069259523, "grad_norm": 0.22633492946624756, "learning_rate": 1.049823799264186e-05, "loss": 0.0765, "num_input_tokens_seen": 36144448, "step": 27880 }, { "epoch": 1.3624948085896464, "grad_norm": 0.31909817457199097, "learning_rate": 1.049024212519028e-05, "loss": 0.0927, "num_input_tokens_seen": 36150688, "step": 27885 }, { "epoch": 1.3627391102533408, "grad_norm": 0.16739659011363983, "learning_rate": 1.0482248495325713e-05, "loss": 0.0893, "num_input_tokens_seen": 36156928, "step": 27890 }, { "epoch": 1.3629834119170352, "grad_norm": 0.38120731711387634, "learning_rate": 1.047425710428086e-05, "loss": 0.0916, "num_input_tokens_seen": 36163136, "step": 27895 }, { "epoch": 1.3632277135807294, "grad_norm": 0.4077245891094208, "learning_rate": 1.0466267953288114e-05, "loss": 0.0821, "num_input_tokens_seen": 36169376, "step": 27900 }, { "epoch": 1.3634720152444237, "grad_norm": 0.24616758525371552, "learning_rate": 1.0458281043579482e-05, "loss": 0.0956, "num_input_tokens_seen": 36175744, "step": 27905 }, { "epoch": 1.3637163169081181, "grad_norm": 0.5223914980888367, "learning_rate": 1.0450296376386657e-05, "loss": 0.0561, "num_input_tokens_seen": 36182560, "step": 27910 }, { "epoch": 1.3639606185718125, "grad_norm": 0.9085201025009155, "learning_rate": 1.044231395294098e-05, "loss": 0.0897, "num_input_tokens_seen": 36188992, "step": 27915 }, { "epoch": 1.3642049202355069, "grad_norm": 0.5368872284889221, "learning_rate": 1.0434333774473435e-05, "loss": 0.0903, "num_input_tokens_seen": 36195136, "step": 27920 }, { "epoch": 1.3644492218992013, "grad_norm": 0.2469288855791092, "learning_rate": 1.0426355842214657e-05, "loss": 0.0874, "num_input_tokens_seen": 36201664, "step": 27925 }, { "epoch": 1.3646935235628954, "grad_norm": 0.18142357468605042, "learning_rate": 1.0418380157394963e-05, "loss": 0.0761, "num_input_tokens_seen": 36208128, "step": 27930 }, { "epoch": 1.3649378252265898, "grad_norm": 0.5577632784843445, "learning_rate": 1.0410406721244281e-05, "loss": 0.0997, "num_input_tokens_seen": 36214304, "step": 27935 }, { "epoch": 1.3651821268902842, "grad_norm": 0.5418192744255066, "learning_rate": 1.0402435534992238e-05, "loss": 0.0829, "num_input_tokens_seen": 36221056, "step": 27940 }, { "epoch": 1.3654264285539783, "grad_norm": 0.21170476078987122, "learning_rate": 1.0394466599868071e-05, "loss": 0.0734, "num_input_tokens_seen": 36227584, "step": 27945 }, { "epoch": 1.3656707302176727, "grad_norm": 0.34640204906463623, "learning_rate": 1.0386499917100697e-05, "loss": 0.0879, "num_input_tokens_seen": 36233760, "step": 27950 }, { "epoch": 1.365915031881367, "grad_norm": 0.13823047280311584, "learning_rate": 1.0378535487918692e-05, "loss": 0.0664, "num_input_tokens_seen": 36239872, "step": 27955 }, { "epoch": 1.3661593335450615, "grad_norm": 0.18318194150924683, "learning_rate": 1.037057331355025e-05, "loss": 0.0653, "num_input_tokens_seen": 36246144, "step": 27960 }, { "epoch": 1.3664036352087559, "grad_norm": 0.154646098613739, "learning_rate": 1.0362613395223247e-05, "loss": 0.0832, "num_input_tokens_seen": 36252928, "step": 27965 }, { "epoch": 1.3666479368724502, "grad_norm": 0.3088966906070709, "learning_rate": 1.0354655734165212e-05, "loss": 0.0954, "num_input_tokens_seen": 36259264, "step": 27970 }, { "epoch": 1.3668922385361444, "grad_norm": 0.2847239077091217, "learning_rate": 1.03467003316033e-05, "loss": 0.0876, "num_input_tokens_seen": 36265856, "step": 27975 }, { "epoch": 1.3671365401998388, "grad_norm": 0.3420848250389099, "learning_rate": 1.033874718876435e-05, "loss": 0.1325, "num_input_tokens_seen": 36272224, "step": 27980 }, { "epoch": 1.3673808418635331, "grad_norm": 0.20747371017932892, "learning_rate": 1.0330796306874818e-05, "loss": 0.0994, "num_input_tokens_seen": 36278336, "step": 27985 }, { "epoch": 1.3676251435272273, "grad_norm": 0.1889817863702774, "learning_rate": 1.032284768716085e-05, "loss": 0.088, "num_input_tokens_seen": 36284704, "step": 27990 }, { "epoch": 1.3678694451909217, "grad_norm": 0.3639185130596161, "learning_rate": 1.0314901330848206e-05, "loss": 0.0884, "num_input_tokens_seen": 36290784, "step": 27995 }, { "epoch": 1.368113746854616, "grad_norm": 0.15512771904468536, "learning_rate": 1.030695723916233e-05, "loss": 0.0762, "num_input_tokens_seen": 36297952, "step": 28000 }, { "epoch": 1.368113746854616, "eval_loss": 0.08781658113002777, "eval_runtime": 374.1788, "eval_samples_per_second": 97.24, "eval_steps_per_second": 24.312, "num_input_tokens_seen": 36297952, "step": 28000 }, { "epoch": 1.3683580485183104, "grad_norm": 0.3083416819572449, "learning_rate": 1.0299015413328289e-05, "loss": 0.0805, "num_input_tokens_seen": 36304512, "step": 28005 }, { "epoch": 1.3686023501820048, "grad_norm": 0.18659032881259918, "learning_rate": 1.0291075854570809e-05, "loss": 0.0764, "num_input_tokens_seen": 36310432, "step": 28010 }, { "epoch": 1.3688466518456992, "grad_norm": 0.14516736567020416, "learning_rate": 1.0283138564114275e-05, "loss": 0.0772, "num_input_tokens_seen": 36317376, "step": 28015 }, { "epoch": 1.3690909535093934, "grad_norm": 0.34995895624160767, "learning_rate": 1.027520354318273e-05, "loss": 0.0752, "num_input_tokens_seen": 36323968, "step": 28020 }, { "epoch": 1.3693352551730877, "grad_norm": 0.2830808758735657, "learning_rate": 1.0267270792999828e-05, "loss": 0.0877, "num_input_tokens_seen": 36330240, "step": 28025 }, { "epoch": 1.3695795568367821, "grad_norm": 0.14951445162296295, "learning_rate": 1.0259340314788919e-05, "loss": 0.0808, "num_input_tokens_seen": 36336704, "step": 28030 }, { "epoch": 1.3698238585004763, "grad_norm": 0.16714811325073242, "learning_rate": 1.0251412109772979e-05, "loss": 0.0866, "num_input_tokens_seen": 36343296, "step": 28035 }, { "epoch": 1.3700681601641707, "grad_norm": 0.2968352735042572, "learning_rate": 1.0243486179174627e-05, "loss": 0.0914, "num_input_tokens_seen": 36349504, "step": 28040 }, { "epoch": 1.370312461827865, "grad_norm": 0.3376460671424866, "learning_rate": 1.0235562524216158e-05, "loss": 0.0614, "num_input_tokens_seen": 36356128, "step": 28045 }, { "epoch": 1.3705567634915594, "grad_norm": 0.2926466464996338, "learning_rate": 1.022764114611948e-05, "loss": 0.1002, "num_input_tokens_seen": 36362336, "step": 28050 }, { "epoch": 1.3708010651552538, "grad_norm": 0.2857157289981842, "learning_rate": 1.0219722046106178e-05, "loss": 0.0709, "num_input_tokens_seen": 36368512, "step": 28055 }, { "epoch": 1.3710453668189482, "grad_norm": 0.2929544448852539, "learning_rate": 1.0211805225397486e-05, "loss": 0.0988, "num_input_tokens_seen": 36374656, "step": 28060 }, { "epoch": 1.3712896684826423, "grad_norm": 0.5290884971618652, "learning_rate": 1.020389068521426e-05, "loss": 0.0874, "num_input_tokens_seen": 36380928, "step": 28065 }, { "epoch": 1.3715339701463367, "grad_norm": 0.7582263350486755, "learning_rate": 1.0195978426777039e-05, "loss": 0.0802, "num_input_tokens_seen": 36388000, "step": 28070 }, { "epoch": 1.371778271810031, "grad_norm": 0.27262550592422485, "learning_rate": 1.0188068451305982e-05, "loss": 0.091, "num_input_tokens_seen": 36394368, "step": 28075 }, { "epoch": 1.3720225734737252, "grad_norm": 0.1638423204421997, "learning_rate": 1.0180160760020902e-05, "loss": 0.0953, "num_input_tokens_seen": 36400704, "step": 28080 }, { "epoch": 1.3722668751374196, "grad_norm": 0.4100121855735779, "learning_rate": 1.0172255354141278e-05, "loss": 0.0757, "num_input_tokens_seen": 36407104, "step": 28085 }, { "epoch": 1.372511176801114, "grad_norm": 0.1746324598789215, "learning_rate": 1.0164352234886205e-05, "loss": 0.0759, "num_input_tokens_seen": 36413600, "step": 28090 }, { "epoch": 1.3727554784648084, "grad_norm": 0.1889795958995819, "learning_rate": 1.0156451403474454e-05, "loss": 0.1257, "num_input_tokens_seen": 36419968, "step": 28095 }, { "epoch": 1.3729997801285028, "grad_norm": 0.17938224971294403, "learning_rate": 1.0148552861124443e-05, "loss": 0.0678, "num_input_tokens_seen": 36426624, "step": 28100 }, { "epoch": 1.373244081792197, "grad_norm": 0.4327538311481476, "learning_rate": 1.0140656609054205e-05, "loss": 0.0883, "num_input_tokens_seen": 36433184, "step": 28105 }, { "epoch": 1.3734883834558913, "grad_norm": 0.5041929483413696, "learning_rate": 1.0132762648481455e-05, "loss": 0.1006, "num_input_tokens_seen": 36439744, "step": 28110 }, { "epoch": 1.3737326851195857, "grad_norm": 0.15127679705619812, "learning_rate": 1.0124870980623543e-05, "loss": 0.107, "num_input_tokens_seen": 36446240, "step": 28115 }, { "epoch": 1.37397698678328, "grad_norm": 0.4853961169719696, "learning_rate": 1.0116981606697453e-05, "loss": 0.0987, "num_input_tokens_seen": 36452512, "step": 28120 }, { "epoch": 1.3742212884469742, "grad_norm": 0.6553086638450623, "learning_rate": 1.0109094527919838e-05, "loss": 0.0764, "num_input_tokens_seen": 36459360, "step": 28125 }, { "epoch": 1.3744655901106686, "grad_norm": 0.8704549670219421, "learning_rate": 1.010120974550697e-05, "loss": 0.1034, "num_input_tokens_seen": 36465536, "step": 28130 }, { "epoch": 1.374709891774363, "grad_norm": 0.30570921301841736, "learning_rate": 1.0093327260674795e-05, "loss": 0.0752, "num_input_tokens_seen": 36472192, "step": 28135 }, { "epoch": 1.3749541934380574, "grad_norm": 0.3060132563114166, "learning_rate": 1.0085447074638878e-05, "loss": 0.0849, "num_input_tokens_seen": 36478816, "step": 28140 }, { "epoch": 1.3751984951017517, "grad_norm": 0.6728317141532898, "learning_rate": 1.0077569188614461e-05, "loss": 0.0686, "num_input_tokens_seen": 36485376, "step": 28145 }, { "epoch": 1.375442796765446, "grad_norm": 0.1964479684829712, "learning_rate": 1.0069693603816393e-05, "loss": 0.0751, "num_input_tokens_seen": 36491648, "step": 28150 }, { "epoch": 1.3756870984291403, "grad_norm": 0.22534781694412231, "learning_rate": 1.0061820321459204e-05, "loss": 0.0702, "num_input_tokens_seen": 36498688, "step": 28155 }, { "epoch": 1.3759314000928347, "grad_norm": 0.38938769698143005, "learning_rate": 1.0053949342757038e-05, "loss": 0.0833, "num_input_tokens_seen": 36505312, "step": 28160 }, { "epoch": 1.376175701756529, "grad_norm": 0.5586791038513184, "learning_rate": 1.0046080668923717e-05, "loss": 0.0788, "num_input_tokens_seen": 36511360, "step": 28165 }, { "epoch": 1.3764200034202232, "grad_norm": 0.3249897360801697, "learning_rate": 1.003821430117267e-05, "loss": 0.0866, "num_input_tokens_seen": 36518752, "step": 28170 }, { "epoch": 1.3766643050839176, "grad_norm": 0.26931604743003845, "learning_rate": 1.0030350240716999e-05, "loss": 0.0855, "num_input_tokens_seen": 36524992, "step": 28175 }, { "epoch": 1.376908606747612, "grad_norm": 0.6036450266838074, "learning_rate": 1.0022488488769449e-05, "loss": 0.0864, "num_input_tokens_seen": 36531648, "step": 28180 }, { "epoch": 1.3771529084113063, "grad_norm": 0.5431899428367615, "learning_rate": 1.0014629046542387e-05, "loss": 0.1055, "num_input_tokens_seen": 36537856, "step": 28185 }, { "epoch": 1.3773972100750007, "grad_norm": 0.22730152308940887, "learning_rate": 1.0006771915247842e-05, "loss": 0.0888, "num_input_tokens_seen": 36544096, "step": 28190 }, { "epoch": 1.3776415117386949, "grad_norm": 0.1669911891222, "learning_rate": 9.998917096097495e-06, "loss": 0.0759, "num_input_tokens_seen": 36550688, "step": 28195 }, { "epoch": 1.3778858134023892, "grad_norm": 0.2438899427652359, "learning_rate": 9.991064590302638e-06, "loss": 0.1319, "num_input_tokens_seen": 36557056, "step": 28200 }, { "epoch": 1.3778858134023892, "eval_loss": 0.08769864588975906, "eval_runtime": 375.0943, "eval_samples_per_second": 97.002, "eval_steps_per_second": 24.253, "num_input_tokens_seen": 36557056, "step": 28200 }, { "epoch": 1.3781301150660836, "grad_norm": 0.2544384002685547, "learning_rate": 9.983214399074241e-06, "loss": 0.1023, "num_input_tokens_seen": 36563616, "step": 28205 }, { "epoch": 1.378374416729778, "grad_norm": 0.21841484308242798, "learning_rate": 9.975366523622893e-06, "loss": 0.0684, "num_input_tokens_seen": 36570048, "step": 28210 }, { "epoch": 1.3786187183934722, "grad_norm": 0.19707071781158447, "learning_rate": 9.967520965158841e-06, "loss": 0.0706, "num_input_tokens_seen": 36576128, "step": 28215 }, { "epoch": 1.3788630200571665, "grad_norm": 0.31227511167526245, "learning_rate": 9.95967772489197e-06, "loss": 0.078, "num_input_tokens_seen": 36582720, "step": 28220 }, { "epoch": 1.379107321720861, "grad_norm": 0.6024026870727539, "learning_rate": 9.951836804031794e-06, "loss": 0.0868, "num_input_tokens_seen": 36588832, "step": 28225 }, { "epoch": 1.3793516233845553, "grad_norm": 0.28952810168266296, "learning_rate": 9.943998203787489e-06, "loss": 0.0908, "num_input_tokens_seen": 36595424, "step": 28230 }, { "epoch": 1.3795959250482497, "grad_norm": 0.4097736179828644, "learning_rate": 9.936161925367874e-06, "loss": 0.0697, "num_input_tokens_seen": 36602144, "step": 28235 }, { "epoch": 1.3798402267119438, "grad_norm": 0.8095641732215881, "learning_rate": 9.928327969981386e-06, "loss": 0.0666, "num_input_tokens_seen": 36608544, "step": 28240 }, { "epoch": 1.3800845283756382, "grad_norm": 0.12165567278862, "learning_rate": 9.920496338836135e-06, "loss": 0.0781, "num_input_tokens_seen": 36614944, "step": 28245 }, { "epoch": 1.3803288300393326, "grad_norm": 0.9666043519973755, "learning_rate": 9.912667033139844e-06, "loss": 0.0679, "num_input_tokens_seen": 36621440, "step": 28250 }, { "epoch": 1.380573131703027, "grad_norm": 0.17052249610424042, "learning_rate": 9.904840054099893e-06, "loss": 0.116, "num_input_tokens_seen": 36627744, "step": 28255 }, { "epoch": 1.3808174333667211, "grad_norm": 0.29783138632774353, "learning_rate": 9.897015402923312e-06, "loss": 0.0816, "num_input_tokens_seen": 36634240, "step": 28260 }, { "epoch": 1.3810617350304155, "grad_norm": 0.4551701545715332, "learning_rate": 9.889193080816744e-06, "loss": 0.0746, "num_input_tokens_seen": 36640992, "step": 28265 }, { "epoch": 1.38130603669411, "grad_norm": 0.5312690138816833, "learning_rate": 9.881373088986498e-06, "loss": 0.1075, "num_input_tokens_seen": 36647296, "step": 28270 }, { "epoch": 1.3815503383578043, "grad_norm": 0.26685792207717896, "learning_rate": 9.873555428638523e-06, "loss": 0.0923, "num_input_tokens_seen": 36653568, "step": 28275 }, { "epoch": 1.3817946400214987, "grad_norm": 0.5913446545600891, "learning_rate": 9.865740100978383e-06, "loss": 0.0711, "num_input_tokens_seen": 36660352, "step": 28280 }, { "epoch": 1.3820389416851928, "grad_norm": 0.18518652021884918, "learning_rate": 9.857927107211315e-06, "loss": 0.0669, "num_input_tokens_seen": 36666912, "step": 28285 }, { "epoch": 1.3822832433488872, "grad_norm": 0.5560652613639832, "learning_rate": 9.850116448542177e-06, "loss": 0.0956, "num_input_tokens_seen": 36673216, "step": 28290 }, { "epoch": 1.3825275450125816, "grad_norm": 0.22196203470230103, "learning_rate": 9.842308126175457e-06, "loss": 0.0715, "num_input_tokens_seen": 36679904, "step": 28295 }, { "epoch": 1.3827718466762757, "grad_norm": 0.1668350249528885, "learning_rate": 9.834502141315315e-06, "loss": 0.0722, "num_input_tokens_seen": 36686112, "step": 28300 }, { "epoch": 1.38301614833997, "grad_norm": 0.6065716743469238, "learning_rate": 9.82669849516552e-06, "loss": 0.0841, "num_input_tokens_seen": 36693056, "step": 28305 }, { "epoch": 1.3832604500036645, "grad_norm": 0.847972571849823, "learning_rate": 9.818897188929493e-06, "loss": 0.0854, "num_input_tokens_seen": 36699456, "step": 28310 }, { "epoch": 1.3835047516673589, "grad_norm": 0.6853190064430237, "learning_rate": 9.811098223810309e-06, "loss": 0.1216, "num_input_tokens_seen": 36705920, "step": 28315 }, { "epoch": 1.3837490533310532, "grad_norm": 1.0792124271392822, "learning_rate": 9.803301601010641e-06, "loss": 0.104, "num_input_tokens_seen": 36712960, "step": 28320 }, { "epoch": 1.3839933549947476, "grad_norm": 0.258689820766449, "learning_rate": 9.795507321732853e-06, "loss": 0.0658, "num_input_tokens_seen": 36720064, "step": 28325 }, { "epoch": 1.3842376566584418, "grad_norm": 0.2603517770767212, "learning_rate": 9.787715387178898e-06, "loss": 0.0763, "num_input_tokens_seen": 36726464, "step": 28330 }, { "epoch": 1.3844819583221362, "grad_norm": 0.10741082578897476, "learning_rate": 9.779925798550399e-06, "loss": 0.099, "num_input_tokens_seen": 36732640, "step": 28335 }, { "epoch": 1.3847262599858305, "grad_norm": 0.11060348153114319, "learning_rate": 9.772138557048619e-06, "loss": 0.0698, "num_input_tokens_seen": 36739200, "step": 28340 }, { "epoch": 1.3849705616495247, "grad_norm": 0.14529812335968018, "learning_rate": 9.764353663874426e-06, "loss": 0.079, "num_input_tokens_seen": 36746144, "step": 28345 }, { "epoch": 1.385214863313219, "grad_norm": 0.3578432500362396, "learning_rate": 9.756571120228375e-06, "loss": 0.0837, "num_input_tokens_seen": 36752672, "step": 28350 }, { "epoch": 1.3854591649769135, "grad_norm": 0.17851541936397552, "learning_rate": 9.748790927310605e-06, "loss": 0.0784, "num_input_tokens_seen": 36758560, "step": 28355 }, { "epoch": 1.3857034666406078, "grad_norm": 0.3894256055355072, "learning_rate": 9.741013086320946e-06, "loss": 0.0883, "num_input_tokens_seen": 36765344, "step": 28360 }, { "epoch": 1.3859477683043022, "grad_norm": 0.14415322244167328, "learning_rate": 9.733237598458821e-06, "loss": 0.0849, "num_input_tokens_seen": 36771456, "step": 28365 }, { "epoch": 1.3861920699679966, "grad_norm": 0.4232376217842102, "learning_rate": 9.725464464923308e-06, "loss": 0.0728, "num_input_tokens_seen": 36777824, "step": 28370 }, { "epoch": 1.3864363716316908, "grad_norm": 0.44967523217201233, "learning_rate": 9.717693686913123e-06, "loss": 0.0997, "num_input_tokens_seen": 36784448, "step": 28375 }, { "epoch": 1.3866806732953851, "grad_norm": 0.1486043781042099, "learning_rate": 9.709925265626632e-06, "loss": 0.0868, "num_input_tokens_seen": 36790720, "step": 28380 }, { "epoch": 1.3869249749590795, "grad_norm": 0.3273695409297943, "learning_rate": 9.702159202261801e-06, "loss": 0.0884, "num_input_tokens_seen": 36797344, "step": 28385 }, { "epoch": 1.3871692766227737, "grad_norm": 0.33013418316841125, "learning_rate": 9.694395498016268e-06, "loss": 0.1036, "num_input_tokens_seen": 36803424, "step": 28390 }, { "epoch": 1.387413578286468, "grad_norm": 0.5261563658714294, "learning_rate": 9.686634154087298e-06, "loss": 0.1121, "num_input_tokens_seen": 36809632, "step": 28395 }, { "epoch": 1.3876578799501624, "grad_norm": 0.2574959993362427, "learning_rate": 9.678875171671776e-06, "loss": 0.0644, "num_input_tokens_seen": 36815904, "step": 28400 }, { "epoch": 1.3876578799501624, "eval_loss": 0.08759359270334244, "eval_runtime": 374.4211, "eval_samples_per_second": 97.177, "eval_steps_per_second": 24.296, "num_input_tokens_seen": 36815904, "step": 28400 }, { "epoch": 1.3879021816138568, "grad_norm": 0.688305139541626, "learning_rate": 9.671118551966246e-06, "loss": 0.088, "num_input_tokens_seen": 36822240, "step": 28405 }, { "epoch": 1.3881464832775512, "grad_norm": 0.23120853304862976, "learning_rate": 9.66336429616686e-06, "loss": 0.0794, "num_input_tokens_seen": 36828480, "step": 28410 }, { "epoch": 1.3883907849412456, "grad_norm": 0.24399171769618988, "learning_rate": 9.655612405469436e-06, "loss": 0.1093, "num_input_tokens_seen": 36834944, "step": 28415 }, { "epoch": 1.3886350866049397, "grad_norm": 0.5806496143341064, "learning_rate": 9.647862881069413e-06, "loss": 0.0762, "num_input_tokens_seen": 36841248, "step": 28420 }, { "epoch": 1.388879388268634, "grad_norm": 0.21452921628952026, "learning_rate": 9.640115724161855e-06, "loss": 0.1076, "num_input_tokens_seen": 36847552, "step": 28425 }, { "epoch": 1.3891236899323285, "grad_norm": 0.46260756254196167, "learning_rate": 9.632370935941483e-06, "loss": 0.0865, "num_input_tokens_seen": 36854208, "step": 28430 }, { "epoch": 1.3893679915960226, "grad_norm": 0.1903448849916458, "learning_rate": 9.624628517602634e-06, "loss": 0.0767, "num_input_tokens_seen": 36860320, "step": 28435 }, { "epoch": 1.389612293259717, "grad_norm": 0.2610265612602234, "learning_rate": 9.61688847033928e-06, "loss": 0.085, "num_input_tokens_seen": 36867136, "step": 28440 }, { "epoch": 1.3898565949234114, "grad_norm": 0.3734663426876068, "learning_rate": 9.609150795345051e-06, "loss": 0.1025, "num_input_tokens_seen": 36873632, "step": 28445 }, { "epoch": 1.3901008965871058, "grad_norm": 0.12170211970806122, "learning_rate": 9.601415493813171e-06, "loss": 0.0519, "num_input_tokens_seen": 36880352, "step": 28450 }, { "epoch": 1.3903451982508002, "grad_norm": 0.16903680562973022, "learning_rate": 9.593682566936533e-06, "loss": 0.0829, "num_input_tokens_seen": 36886784, "step": 28455 }, { "epoch": 1.3905894999144945, "grad_norm": 0.4674128592014313, "learning_rate": 9.58595201590766e-06, "loss": 0.0606, "num_input_tokens_seen": 36893312, "step": 28460 }, { "epoch": 1.3908338015781887, "grad_norm": 0.1471281200647354, "learning_rate": 9.578223841918681e-06, "loss": 0.059, "num_input_tokens_seen": 36899840, "step": 28465 }, { "epoch": 1.391078103241883, "grad_norm": 0.3162803649902344, "learning_rate": 9.570498046161389e-06, "loss": 0.0908, "num_input_tokens_seen": 36906144, "step": 28470 }, { "epoch": 1.3913224049055775, "grad_norm": 0.3105061650276184, "learning_rate": 9.562774629827206e-06, "loss": 0.096, "num_input_tokens_seen": 36912448, "step": 28475 }, { "epoch": 1.3915667065692716, "grad_norm": 0.12068533152341843, "learning_rate": 9.555053594107163e-06, "loss": 0.0546, "num_input_tokens_seen": 36919168, "step": 28480 }, { "epoch": 1.391811008232966, "grad_norm": 0.16469824314117432, "learning_rate": 9.547334940191957e-06, "loss": 0.104, "num_input_tokens_seen": 36925728, "step": 28485 }, { "epoch": 1.3920553098966604, "grad_norm": 0.3255593478679657, "learning_rate": 9.539618669271886e-06, "loss": 0.1053, "num_input_tokens_seen": 36932128, "step": 28490 }, { "epoch": 1.3922996115603548, "grad_norm": 0.7071267366409302, "learning_rate": 9.531904782536904e-06, "loss": 0.134, "num_input_tokens_seen": 36938432, "step": 28495 }, { "epoch": 1.3925439132240491, "grad_norm": 0.14870405197143555, "learning_rate": 9.524193281176597e-06, "loss": 0.0704, "num_input_tokens_seen": 36945088, "step": 28500 }, { "epoch": 1.3927882148877435, "grad_norm": 0.3014524579048157, "learning_rate": 9.516484166380165e-06, "loss": 0.089, "num_input_tokens_seen": 36951584, "step": 28505 }, { "epoch": 1.3930325165514377, "grad_norm": 0.14756935834884644, "learning_rate": 9.508777439336447e-06, "loss": 0.0643, "num_input_tokens_seen": 36957952, "step": 28510 }, { "epoch": 1.393276818215132, "grad_norm": 0.2883058190345764, "learning_rate": 9.50107310123393e-06, "loss": 0.1008, "num_input_tokens_seen": 36964576, "step": 28515 }, { "epoch": 1.3935211198788264, "grad_norm": 0.2727881371974945, "learning_rate": 9.493371153260702e-06, "loss": 0.0866, "num_input_tokens_seen": 36971104, "step": 28520 }, { "epoch": 1.3937654215425206, "grad_norm": 0.8436620235443115, "learning_rate": 9.485671596604523e-06, "loss": 0.0769, "num_input_tokens_seen": 36977536, "step": 28525 }, { "epoch": 1.394009723206215, "grad_norm": 0.2621501088142395, "learning_rate": 9.477974432452738e-06, "loss": 0.0901, "num_input_tokens_seen": 36983872, "step": 28530 }, { "epoch": 1.3942540248699093, "grad_norm": 0.38412219285964966, "learning_rate": 9.470279661992356e-06, "loss": 0.0821, "num_input_tokens_seen": 36990592, "step": 28535 }, { "epoch": 1.3944983265336037, "grad_norm": 0.23150508105754852, "learning_rate": 9.462587286410021e-06, "loss": 0.0942, "num_input_tokens_seen": 36996864, "step": 28540 }, { "epoch": 1.394742628197298, "grad_norm": 0.4116593599319458, "learning_rate": 9.454897306891972e-06, "loss": 0.0855, "num_input_tokens_seen": 37003488, "step": 28545 }, { "epoch": 1.3949869298609925, "grad_norm": 0.2542552053928375, "learning_rate": 9.44720972462411e-06, "loss": 0.1325, "num_input_tokens_seen": 37009760, "step": 28550 }, { "epoch": 1.3952312315246866, "grad_norm": 0.2226571887731552, "learning_rate": 9.439524540791964e-06, "loss": 0.0912, "num_input_tokens_seen": 37016128, "step": 28555 }, { "epoch": 1.395475533188381, "grad_norm": 0.25020745396614075, "learning_rate": 9.431841756580673e-06, "loss": 0.0816, "num_input_tokens_seen": 37022944, "step": 28560 }, { "epoch": 1.3957198348520754, "grad_norm": 0.1899772584438324, "learning_rate": 9.42416137317503e-06, "loss": 0.0715, "num_input_tokens_seen": 37030368, "step": 28565 }, { "epoch": 1.3959641365157696, "grad_norm": 0.18006359040737152, "learning_rate": 9.416483391759437e-06, "loss": 0.0834, "num_input_tokens_seen": 37036704, "step": 28570 }, { "epoch": 1.396208438179464, "grad_norm": 0.44971057772636414, "learning_rate": 9.408807813517945e-06, "loss": 0.1003, "num_input_tokens_seen": 37042912, "step": 28575 }, { "epoch": 1.3964527398431583, "grad_norm": 0.43963953852653503, "learning_rate": 9.401134639634221e-06, "loss": 0.0884, "num_input_tokens_seen": 37050048, "step": 28580 }, { "epoch": 1.3966970415068527, "grad_norm": 0.39025816321372986, "learning_rate": 9.393463871291555e-06, "loss": 0.0792, "num_input_tokens_seen": 37056480, "step": 28585 }, { "epoch": 1.396941343170547, "grad_norm": 0.7000645399093628, "learning_rate": 9.385795509672881e-06, "loss": 0.0693, "num_input_tokens_seen": 37062912, "step": 28590 }, { "epoch": 1.3971856448342415, "grad_norm": 0.3211704194545746, "learning_rate": 9.378129555960771e-06, "loss": 0.0888, "num_input_tokens_seen": 37069504, "step": 28595 }, { "epoch": 1.3974299464979356, "grad_norm": 0.32413583993911743, "learning_rate": 9.370466011337392e-06, "loss": 0.0808, "num_input_tokens_seen": 37076064, "step": 28600 }, { "epoch": 1.3974299464979356, "eval_loss": 0.08760717511177063, "eval_runtime": 375.1552, "eval_samples_per_second": 96.987, "eval_steps_per_second": 24.249, "num_input_tokens_seen": 37076064, "step": 28600 }, { "epoch": 1.39767424816163, "grad_norm": 0.3919394612312317, "learning_rate": 9.362804876984573e-06, "loss": 0.0766, "num_input_tokens_seen": 37082496, "step": 28605 }, { "epoch": 1.3979185498253244, "grad_norm": 0.40475884079933167, "learning_rate": 9.355146154083747e-06, "loss": 0.0643, "num_input_tokens_seen": 37089184, "step": 28610 }, { "epoch": 1.3981628514890185, "grad_norm": 0.07208460569381714, "learning_rate": 9.347489843815987e-06, "loss": 0.0895, "num_input_tokens_seen": 37095424, "step": 28615 }, { "epoch": 1.398407153152713, "grad_norm": 0.1469489336013794, "learning_rate": 9.339835947362002e-06, "loss": 0.0491, "num_input_tokens_seen": 37101792, "step": 28620 }, { "epoch": 1.3986514548164073, "grad_norm": 0.7140313982963562, "learning_rate": 9.332184465902105e-06, "loss": 0.0896, "num_input_tokens_seen": 37108096, "step": 28625 }, { "epoch": 1.3988957564801017, "grad_norm": 0.49573782086372375, "learning_rate": 9.324535400616266e-06, "loss": 0.0998, "num_input_tokens_seen": 37114272, "step": 28630 }, { "epoch": 1.399140058143796, "grad_norm": 0.12292468547821045, "learning_rate": 9.31688875268405e-06, "loss": 0.0885, "num_input_tokens_seen": 37120512, "step": 28635 }, { "epoch": 1.3993843598074902, "grad_norm": 0.44561487436294556, "learning_rate": 9.309244523284674e-06, "loss": 0.0861, "num_input_tokens_seen": 37126976, "step": 28640 }, { "epoch": 1.3996286614711846, "grad_norm": 0.23782439529895782, "learning_rate": 9.301602713596982e-06, "loss": 0.1189, "num_input_tokens_seen": 37133408, "step": 28645 }, { "epoch": 1.399872963134879, "grad_norm": 0.22679917514324188, "learning_rate": 9.293963324799432e-06, "loss": 0.0862, "num_input_tokens_seen": 37139712, "step": 28650 }, { "epoch": 1.4001172647985733, "grad_norm": 0.22121241688728333, "learning_rate": 9.286326358070104e-06, "loss": 0.078, "num_input_tokens_seen": 37146592, "step": 28655 }, { "epoch": 1.4003615664622675, "grad_norm": 0.18405303359031677, "learning_rate": 9.278691814586729e-06, "loss": 0.0953, "num_input_tokens_seen": 37153568, "step": 28660 }, { "epoch": 1.4006058681259619, "grad_norm": 0.2694210708141327, "learning_rate": 9.271059695526635e-06, "loss": 0.0807, "num_input_tokens_seen": 37160032, "step": 28665 }, { "epoch": 1.4008501697896563, "grad_norm": 0.42619994282722473, "learning_rate": 9.263430002066805e-06, "loss": 0.0881, "num_input_tokens_seen": 37166560, "step": 28670 }, { "epoch": 1.4010944714533506, "grad_norm": 0.4779790937900543, "learning_rate": 9.25580273538382e-06, "loss": 0.0678, "num_input_tokens_seen": 37172992, "step": 28675 }, { "epoch": 1.401338773117045, "grad_norm": 0.21243484318256378, "learning_rate": 9.248177896653907e-06, "loss": 0.0703, "num_input_tokens_seen": 37179264, "step": 28680 }, { "epoch": 1.4015830747807392, "grad_norm": 0.209864541888237, "learning_rate": 9.240555487052918e-06, "loss": 0.086, "num_input_tokens_seen": 37185856, "step": 28685 }, { "epoch": 1.4018273764444336, "grad_norm": 0.18864931166172028, "learning_rate": 9.232935507756313e-06, "loss": 0.072, "num_input_tokens_seen": 37192320, "step": 28690 }, { "epoch": 1.402071678108128, "grad_norm": 0.4640066921710968, "learning_rate": 9.225317959939193e-06, "loss": 0.0934, "num_input_tokens_seen": 37198944, "step": 28695 }, { "epoch": 1.4023159797718223, "grad_norm": 0.5255133509635925, "learning_rate": 9.217702844776287e-06, "loss": 0.0897, "num_input_tokens_seen": 37205344, "step": 28700 }, { "epoch": 1.4025602814355165, "grad_norm": 0.4391140043735504, "learning_rate": 9.210090163441929e-06, "loss": 0.0755, "num_input_tokens_seen": 37211488, "step": 28705 }, { "epoch": 1.4028045830992109, "grad_norm": 0.24217796325683594, "learning_rate": 9.202479917110105e-06, "loss": 0.0777, "num_input_tokens_seen": 37217536, "step": 28710 }, { "epoch": 1.4030488847629052, "grad_norm": 0.7194148302078247, "learning_rate": 9.194872106954392e-06, "loss": 0.1111, "num_input_tokens_seen": 37223840, "step": 28715 }, { "epoch": 1.4032931864265996, "grad_norm": 0.3660550117492676, "learning_rate": 9.187266734148029e-06, "loss": 0.0926, "num_input_tokens_seen": 37229888, "step": 28720 }, { "epoch": 1.403537488090294, "grad_norm": 0.4811146855354309, "learning_rate": 9.179663799863849e-06, "loss": 0.0663, "num_input_tokens_seen": 37236896, "step": 28725 }, { "epoch": 1.4037817897539882, "grad_norm": 0.16070759296417236, "learning_rate": 9.172063305274317e-06, "loss": 0.0709, "num_input_tokens_seen": 37243488, "step": 28730 }, { "epoch": 1.4040260914176825, "grad_norm": 0.7354966998100281, "learning_rate": 9.164465251551527e-06, "loss": 0.0648, "num_input_tokens_seen": 37249952, "step": 28735 }, { "epoch": 1.404270393081377, "grad_norm": 0.19927355647087097, "learning_rate": 9.156869639867205e-06, "loss": 0.0831, "num_input_tokens_seen": 37256736, "step": 28740 }, { "epoch": 1.4045146947450713, "grad_norm": 0.22602644562721252, "learning_rate": 9.149276471392677e-06, "loss": 0.0837, "num_input_tokens_seen": 37263392, "step": 28745 }, { "epoch": 1.4047589964087654, "grad_norm": 0.13004471361637115, "learning_rate": 9.141685747298914e-06, "loss": 0.0828, "num_input_tokens_seen": 37269504, "step": 28750 }, { "epoch": 1.4050032980724598, "grad_norm": 0.501865565776825, "learning_rate": 9.13409746875649e-06, "loss": 0.0895, "num_input_tokens_seen": 37276288, "step": 28755 }, { "epoch": 1.4052475997361542, "grad_norm": 0.2792428433895111, "learning_rate": 9.12651163693562e-06, "loss": 0.0893, "num_input_tokens_seen": 37282816, "step": 28760 }, { "epoch": 1.4054919013998486, "grad_norm": 0.21167875826358795, "learning_rate": 9.11892825300614e-06, "loss": 0.09, "num_input_tokens_seen": 37289088, "step": 28765 }, { "epoch": 1.405736203063543, "grad_norm": 0.1149793192744255, "learning_rate": 9.111347318137491e-06, "loss": 0.0747, "num_input_tokens_seen": 37295200, "step": 28770 }, { "epoch": 1.4059805047272371, "grad_norm": 0.3468555510044098, "learning_rate": 9.103768833498755e-06, "loss": 0.1204, "num_input_tokens_seen": 37302016, "step": 28775 }, { "epoch": 1.4062248063909315, "grad_norm": 0.10800937563180923, "learning_rate": 9.096192800258639e-06, "loss": 0.0644, "num_input_tokens_seen": 37308224, "step": 28780 }, { "epoch": 1.4064691080546259, "grad_norm": 0.5409157872200012, "learning_rate": 9.088619219585443e-06, "loss": 0.0879, "num_input_tokens_seen": 37314528, "step": 28785 }, { "epoch": 1.4067134097183203, "grad_norm": 0.4819181561470032, "learning_rate": 9.081048092647127e-06, "loss": 0.0881, "num_input_tokens_seen": 37320704, "step": 28790 }, { "epoch": 1.4069577113820144, "grad_norm": 0.38821449875831604, "learning_rate": 9.073479420611245e-06, "loss": 0.0887, "num_input_tokens_seen": 37326912, "step": 28795 }, { "epoch": 1.4072020130457088, "grad_norm": 0.1413239687681198, "learning_rate": 9.065913204644974e-06, "loss": 0.0954, "num_input_tokens_seen": 37333536, "step": 28800 }, { "epoch": 1.4072020130457088, "eval_loss": 0.08771196007728577, "eval_runtime": 375.2147, "eval_samples_per_second": 96.971, "eval_steps_per_second": 24.245, "num_input_tokens_seen": 37333536, "step": 28800 }, { "epoch": 1.4074463147094032, "grad_norm": 0.19682586193084717, "learning_rate": 9.058349445915135e-06, "loss": 0.0847, "num_input_tokens_seen": 37340160, "step": 28805 }, { "epoch": 1.4076906163730976, "grad_norm": 0.45942699909210205, "learning_rate": 9.050788145588138e-06, "loss": 0.0928, "num_input_tokens_seen": 37346720, "step": 28810 }, { "epoch": 1.407934918036792, "grad_norm": 0.394336074590683, "learning_rate": 9.043229304830039e-06, "loss": 0.078, "num_input_tokens_seen": 37353152, "step": 28815 }, { "epoch": 1.408179219700486, "grad_norm": 0.27757272124290466, "learning_rate": 9.035672924806515e-06, "loss": 0.1034, "num_input_tokens_seen": 37359232, "step": 28820 }, { "epoch": 1.4084235213641805, "grad_norm": 0.23703540861606598, "learning_rate": 9.028119006682839e-06, "loss": 0.0831, "num_input_tokens_seen": 37365984, "step": 28825 }, { "epoch": 1.4086678230278749, "grad_norm": 0.3580373525619507, "learning_rate": 9.020567551623935e-06, "loss": 0.0805, "num_input_tokens_seen": 37372384, "step": 28830 }, { "epoch": 1.408912124691569, "grad_norm": 0.15448424220085144, "learning_rate": 9.013018560794318e-06, "loss": 0.0782, "num_input_tokens_seen": 37378784, "step": 28835 }, { "epoch": 1.4091564263552634, "grad_norm": 0.69931960105896, "learning_rate": 9.005472035358139e-06, "loss": 0.0763, "num_input_tokens_seen": 37385600, "step": 28840 }, { "epoch": 1.4094007280189578, "grad_norm": 0.27435654401779175, "learning_rate": 8.997927976479185e-06, "loss": 0.0868, "num_input_tokens_seen": 37392032, "step": 28845 }, { "epoch": 1.4096450296826522, "grad_norm": 0.3460649847984314, "learning_rate": 8.99038638532082e-06, "loss": 0.0952, "num_input_tokens_seen": 37398240, "step": 28850 }, { "epoch": 1.4098893313463465, "grad_norm": 0.17371627688407898, "learning_rate": 8.982847263046065e-06, "loss": 0.0866, "num_input_tokens_seen": 37404864, "step": 28855 }, { "epoch": 1.410133633010041, "grad_norm": 0.3548118770122528, "learning_rate": 8.975310610817555e-06, "loss": 0.0648, "num_input_tokens_seen": 37411776, "step": 28860 }, { "epoch": 1.410377934673735, "grad_norm": 0.6782968640327454, "learning_rate": 8.967776429797528e-06, "loss": 0.0841, "num_input_tokens_seen": 37418048, "step": 28865 }, { "epoch": 1.4106222363374294, "grad_norm": 0.8414351940155029, "learning_rate": 8.960244721147842e-06, "loss": 0.0842, "num_input_tokens_seen": 37424384, "step": 28870 }, { "epoch": 1.4108665380011238, "grad_norm": 0.39212697744369507, "learning_rate": 8.952715486029995e-06, "loss": 0.0932, "num_input_tokens_seen": 37431104, "step": 28875 }, { "epoch": 1.411110839664818, "grad_norm": 0.21327126026153564, "learning_rate": 8.945188725605075e-06, "loss": 0.1011, "num_input_tokens_seen": 37437088, "step": 28880 }, { "epoch": 1.4113551413285124, "grad_norm": 0.21263110637664795, "learning_rate": 8.937664441033817e-06, "loss": 0.088, "num_input_tokens_seen": 37443488, "step": 28885 }, { "epoch": 1.4115994429922067, "grad_norm": 0.7135438323020935, "learning_rate": 8.930142633476549e-06, "loss": 0.0923, "num_input_tokens_seen": 37450176, "step": 28890 }, { "epoch": 1.4118437446559011, "grad_norm": 0.4787473976612091, "learning_rate": 8.92262330409323e-06, "loss": 0.081, "num_input_tokens_seen": 37456864, "step": 28895 }, { "epoch": 1.4120880463195955, "grad_norm": 0.47004783153533936, "learning_rate": 8.915106454043448e-06, "loss": 0.0724, "num_input_tokens_seen": 37463232, "step": 28900 }, { "epoch": 1.4123323479832899, "grad_norm": 0.4984084963798523, "learning_rate": 8.90759208448638e-06, "loss": 0.0821, "num_input_tokens_seen": 37470016, "step": 28905 }, { "epoch": 1.412576649646984, "grad_norm": 0.3260161578655243, "learning_rate": 8.900080196580848e-06, "loss": 0.0888, "num_input_tokens_seen": 37476352, "step": 28910 }, { "epoch": 1.4128209513106784, "grad_norm": 0.3080207109451294, "learning_rate": 8.892570791485267e-06, "loss": 0.0807, "num_input_tokens_seen": 37483136, "step": 28915 }, { "epoch": 1.4130652529743728, "grad_norm": 0.16170130670070648, "learning_rate": 8.885063870357688e-06, "loss": 0.0629, "num_input_tokens_seen": 37489440, "step": 28920 }, { "epoch": 1.413309554638067, "grad_norm": 0.24435029923915863, "learning_rate": 8.87755943435578e-06, "loss": 0.0668, "num_input_tokens_seen": 37496160, "step": 28925 }, { "epoch": 1.4135538563017613, "grad_norm": 0.12287460267543793, "learning_rate": 8.87005748463681e-06, "loss": 0.0926, "num_input_tokens_seen": 37502304, "step": 28930 }, { "epoch": 1.4137981579654557, "grad_norm": 0.4786018133163452, "learning_rate": 8.862558022357681e-06, "loss": 0.1011, "num_input_tokens_seen": 37509248, "step": 28935 }, { "epoch": 1.41404245962915, "grad_norm": 0.6548410654067993, "learning_rate": 8.855061048674903e-06, "loss": 0.0616, "num_input_tokens_seen": 37515392, "step": 28940 }, { "epoch": 1.4142867612928445, "grad_norm": 0.8803908824920654, "learning_rate": 8.847566564744595e-06, "loss": 0.088, "num_input_tokens_seen": 37521824, "step": 28945 }, { "epoch": 1.4145310629565389, "grad_norm": 0.3364100754261017, "learning_rate": 8.840074571722512e-06, "loss": 0.0745, "num_input_tokens_seen": 37528896, "step": 28950 }, { "epoch": 1.414775364620233, "grad_norm": 0.4003562331199646, "learning_rate": 8.832585070764002e-06, "loss": 0.078, "num_input_tokens_seen": 37534944, "step": 28955 }, { "epoch": 1.4150196662839274, "grad_norm": 0.599446177482605, "learning_rate": 8.825098063024045e-06, "loss": 0.0981, "num_input_tokens_seen": 37541280, "step": 28960 }, { "epoch": 1.4152639679476218, "grad_norm": 0.36968132853507996, "learning_rate": 8.817613549657244e-06, "loss": 0.1103, "num_input_tokens_seen": 37548000, "step": 28965 }, { "epoch": 1.415508269611316, "grad_norm": 0.3201185166835785, "learning_rate": 8.810131531817783e-06, "loss": 0.1102, "num_input_tokens_seen": 37553984, "step": 28970 }, { "epoch": 1.4157525712750103, "grad_norm": 0.34517601132392883, "learning_rate": 8.802652010659496e-06, "loss": 0.0794, "num_input_tokens_seen": 37560288, "step": 28975 }, { "epoch": 1.4159968729387047, "grad_norm": 0.2707633674144745, "learning_rate": 8.795174987335827e-06, "loss": 0.0808, "num_input_tokens_seen": 37567360, "step": 28980 }, { "epoch": 1.416241174602399, "grad_norm": 0.19642850756645203, "learning_rate": 8.787700462999807e-06, "loss": 0.0814, "num_input_tokens_seen": 37573696, "step": 28985 }, { "epoch": 1.4164854762660934, "grad_norm": 1.5616986751556396, "learning_rate": 8.780228438804122e-06, "loss": 0.1183, "num_input_tokens_seen": 37580256, "step": 28990 }, { "epoch": 1.4167297779297878, "grad_norm": 0.3185374438762665, "learning_rate": 8.772758915901032e-06, "loss": 0.0966, "num_input_tokens_seen": 37586784, "step": 28995 }, { "epoch": 1.416974079593482, "grad_norm": 0.1543106585741043, "learning_rate": 8.765291895442443e-06, "loss": 0.0887, "num_input_tokens_seen": 37593216, "step": 29000 }, { "epoch": 1.416974079593482, "eval_loss": 0.0874786302447319, "eval_runtime": 375.2487, "eval_samples_per_second": 96.962, "eval_steps_per_second": 24.243, "num_input_tokens_seen": 37593216, "step": 29000 }, { "epoch": 1.4172183812571764, "grad_norm": 0.5885906219482422, "learning_rate": 8.75782737857987e-06, "loss": 0.0917, "num_input_tokens_seen": 37599232, "step": 29005 }, { "epoch": 1.4174626829208707, "grad_norm": 0.4795127809047699, "learning_rate": 8.750365366464425e-06, "loss": 0.1087, "num_input_tokens_seen": 37605504, "step": 29010 }, { "epoch": 1.417706984584565, "grad_norm": 0.5495590567588806, "learning_rate": 8.742905860246838e-06, "loss": 0.0964, "num_input_tokens_seen": 37612192, "step": 29015 }, { "epoch": 1.4179512862482593, "grad_norm": 0.24097952246665955, "learning_rate": 8.735448861077478e-06, "loss": 0.0838, "num_input_tokens_seen": 37618688, "step": 29020 }, { "epoch": 1.4181955879119537, "grad_norm": 0.5465211868286133, "learning_rate": 8.727994370106288e-06, "loss": 0.0755, "num_input_tokens_seen": 37625312, "step": 29025 }, { "epoch": 1.418439889575648, "grad_norm": 0.1802450269460678, "learning_rate": 8.720542388482861e-06, "loss": 0.0778, "num_input_tokens_seen": 37631328, "step": 29030 }, { "epoch": 1.4186841912393424, "grad_norm": 0.2800860106945038, "learning_rate": 8.71309291735637e-06, "loss": 0.0856, "num_input_tokens_seen": 37637536, "step": 29035 }, { "epoch": 1.4189284929030368, "grad_norm": 0.621248185634613, "learning_rate": 8.705645957875621e-06, "loss": 0.0808, "num_input_tokens_seen": 37643584, "step": 29040 }, { "epoch": 1.419172794566731, "grad_norm": 0.19086982309818268, "learning_rate": 8.698201511189048e-06, "loss": 0.0825, "num_input_tokens_seen": 37649920, "step": 29045 }, { "epoch": 1.4194170962304253, "grad_norm": 0.26714903116226196, "learning_rate": 8.690759578444649e-06, "loss": 0.1062, "num_input_tokens_seen": 37656672, "step": 29050 }, { "epoch": 1.4196613978941197, "grad_norm": 0.32484927773475647, "learning_rate": 8.68332016079008e-06, "loss": 0.0783, "num_input_tokens_seen": 37663328, "step": 29055 }, { "epoch": 1.4199056995578139, "grad_norm": 0.23645953834056854, "learning_rate": 8.6758832593726e-06, "loss": 0.0896, "num_input_tokens_seen": 37669824, "step": 29060 }, { "epoch": 1.4201500012215083, "grad_norm": 0.5630539059638977, "learning_rate": 8.668448875339053e-06, "loss": 0.0923, "num_input_tokens_seen": 37675872, "step": 29065 }, { "epoch": 1.4203943028852026, "grad_norm": 0.42825043201446533, "learning_rate": 8.661017009835933e-06, "loss": 0.0967, "num_input_tokens_seen": 37682464, "step": 29070 }, { "epoch": 1.420638604548897, "grad_norm": 0.3175358772277832, "learning_rate": 8.653587664009311e-06, "loss": 0.0953, "num_input_tokens_seen": 37688800, "step": 29075 }, { "epoch": 1.4208829062125914, "grad_norm": 0.1714431643486023, "learning_rate": 8.646160839004902e-06, "loss": 0.087, "num_input_tokens_seen": 37695936, "step": 29080 }, { "epoch": 1.4211272078762858, "grad_norm": 0.19137680530548096, "learning_rate": 8.638736535967998e-06, "loss": 0.1056, "num_input_tokens_seen": 37702560, "step": 29085 }, { "epoch": 1.42137150953998, "grad_norm": 0.13500499725341797, "learning_rate": 8.631314756043535e-06, "loss": 0.0848, "num_input_tokens_seen": 37708960, "step": 29090 }, { "epoch": 1.4216158112036743, "grad_norm": 0.4341821074485779, "learning_rate": 8.62389550037603e-06, "loss": 0.1049, "num_input_tokens_seen": 37715712, "step": 29095 }, { "epoch": 1.4218601128673687, "grad_norm": 0.3604060113430023, "learning_rate": 8.616478770109646e-06, "loss": 0.0864, "num_input_tokens_seen": 37722208, "step": 29100 }, { "epoch": 1.4221044145310628, "grad_norm": 0.1504267156124115, "learning_rate": 8.609064566388111e-06, "loss": 0.0954, "num_input_tokens_seen": 37728640, "step": 29105 }, { "epoch": 1.4223487161947572, "grad_norm": 0.1845947504043579, "learning_rate": 8.601652890354815e-06, "loss": 0.1096, "num_input_tokens_seen": 37735008, "step": 29110 }, { "epoch": 1.4225930178584516, "grad_norm": 0.2222803384065628, "learning_rate": 8.594243743152705e-06, "loss": 0.0787, "num_input_tokens_seen": 37741216, "step": 29115 }, { "epoch": 1.422837319522146, "grad_norm": 0.23601973056793213, "learning_rate": 8.58683712592438e-06, "loss": 0.1066, "num_input_tokens_seen": 37747648, "step": 29120 }, { "epoch": 1.4230816211858404, "grad_norm": 0.4675298035144806, "learning_rate": 8.579433039812037e-06, "loss": 0.0862, "num_input_tokens_seen": 37754240, "step": 29125 }, { "epoch": 1.4233259228495347, "grad_norm": 0.19323666393756866, "learning_rate": 8.572031485957466e-06, "loss": 0.0845, "num_input_tokens_seen": 37760640, "step": 29130 }, { "epoch": 1.423570224513229, "grad_norm": 0.24954169988632202, "learning_rate": 8.564632465502084e-06, "loss": 0.0663, "num_input_tokens_seen": 37767200, "step": 29135 }, { "epoch": 1.4238145261769233, "grad_norm": 0.4150553345680237, "learning_rate": 8.557235979586928e-06, "loss": 0.1032, "num_input_tokens_seen": 37773376, "step": 29140 }, { "epoch": 1.4240588278406177, "grad_norm": 0.16317276656627655, "learning_rate": 8.549842029352606e-06, "loss": 0.0902, "num_input_tokens_seen": 37780320, "step": 29145 }, { "epoch": 1.4243031295043118, "grad_norm": 0.1909312605857849, "learning_rate": 8.542450615939376e-06, "loss": 0.0838, "num_input_tokens_seen": 37787200, "step": 29150 }, { "epoch": 1.4245474311680062, "grad_norm": 0.27013134956359863, "learning_rate": 8.535061740487082e-06, "loss": 0.1016, "num_input_tokens_seen": 37793856, "step": 29155 }, { "epoch": 1.4247917328317006, "grad_norm": 0.16217267513275146, "learning_rate": 8.527675404135168e-06, "loss": 0.0582, "num_input_tokens_seen": 37799840, "step": 29160 }, { "epoch": 1.425036034495395, "grad_norm": 0.28722867369651794, "learning_rate": 8.520291608022724e-06, "loss": 0.1052, "num_input_tokens_seen": 37806304, "step": 29165 }, { "epoch": 1.4252803361590893, "grad_norm": 0.6346067786216736, "learning_rate": 8.512910353288398e-06, "loss": 0.0773, "num_input_tokens_seen": 37812576, "step": 29170 }, { "epoch": 1.4255246378227835, "grad_norm": 0.1486268788576126, "learning_rate": 8.505531641070486e-06, "loss": 0.0695, "num_input_tokens_seen": 37819296, "step": 29175 }, { "epoch": 1.4257689394864779, "grad_norm": 0.5205997824668884, "learning_rate": 8.498155472506885e-06, "loss": 0.0699, "num_input_tokens_seen": 37825504, "step": 29180 }, { "epoch": 1.4260132411501723, "grad_norm": 0.6944811344146729, "learning_rate": 8.49078184873508e-06, "loss": 0.1123, "num_input_tokens_seen": 37831808, "step": 29185 }, { "epoch": 1.4262575428138666, "grad_norm": 0.36413291096687317, "learning_rate": 8.483410770892188e-06, "loss": 0.0793, "num_input_tokens_seen": 37838144, "step": 29190 }, { "epoch": 1.4265018444775608, "grad_norm": 0.15020018815994263, "learning_rate": 8.476042240114909e-06, "loss": 0.1027, "num_input_tokens_seen": 37844384, "step": 29195 }, { "epoch": 1.4267461461412552, "grad_norm": 0.37513989210128784, "learning_rate": 8.468676257539568e-06, "loss": 0.0824, "num_input_tokens_seen": 37850816, "step": 29200 }, { "epoch": 1.4267461461412552, "eval_loss": 0.08744002878665924, "eval_runtime": 374.7588, "eval_samples_per_second": 97.089, "eval_steps_per_second": 24.274, "num_input_tokens_seen": 37850816, "step": 29200 }, { "epoch": 1.4269904478049495, "grad_norm": 0.5629739165306091, "learning_rate": 8.4613128243021e-06, "loss": 0.0807, "num_input_tokens_seen": 37857184, "step": 29205 }, { "epoch": 1.427234749468644, "grad_norm": 0.22900648415088654, "learning_rate": 8.453951941538028e-06, "loss": 0.0774, "num_input_tokens_seen": 37863520, "step": 29210 }, { "epoch": 1.4274790511323383, "grad_norm": 0.2539283037185669, "learning_rate": 8.446593610382495e-06, "loss": 0.0901, "num_input_tokens_seen": 37869536, "step": 29215 }, { "epoch": 1.4277233527960325, "grad_norm": 0.738115668296814, "learning_rate": 8.439237831970259e-06, "loss": 0.0776, "num_input_tokens_seen": 37875936, "step": 29220 }, { "epoch": 1.4279676544597268, "grad_norm": 0.6165496110916138, "learning_rate": 8.431884607435667e-06, "loss": 0.1239, "num_input_tokens_seen": 37882880, "step": 29225 }, { "epoch": 1.4282119561234212, "grad_norm": 1.0192793607711792, "learning_rate": 8.424533937912665e-06, "loss": 0.1172, "num_input_tokens_seen": 37889440, "step": 29230 }, { "epoch": 1.4284562577871156, "grad_norm": 0.29764774441719055, "learning_rate": 8.41718582453484e-06, "loss": 0.0853, "num_input_tokens_seen": 37895936, "step": 29235 }, { "epoch": 1.4287005594508098, "grad_norm": 0.3212222754955292, "learning_rate": 8.409840268435346e-06, "loss": 0.0698, "num_input_tokens_seen": 37902752, "step": 29240 }, { "epoch": 1.4289448611145041, "grad_norm": 0.3594590425491333, "learning_rate": 8.402497270746976e-06, "loss": 0.0838, "num_input_tokens_seen": 37909376, "step": 29245 }, { "epoch": 1.4291891627781985, "grad_norm": 0.20870453119277954, "learning_rate": 8.395156832602095e-06, "loss": 0.0655, "num_input_tokens_seen": 37915904, "step": 29250 }, { "epoch": 1.429433464441893, "grad_norm": 0.2423829883337021, "learning_rate": 8.387818955132707e-06, "loss": 0.0727, "num_input_tokens_seen": 37922784, "step": 29255 }, { "epoch": 1.4296777661055873, "grad_norm": 0.20706352591514587, "learning_rate": 8.38048363947039e-06, "loss": 0.098, "num_input_tokens_seen": 37928960, "step": 29260 }, { "epoch": 1.4299220677692814, "grad_norm": 0.24708276987075806, "learning_rate": 8.373150886746351e-06, "loss": 0.0841, "num_input_tokens_seen": 37935776, "step": 29265 }, { "epoch": 1.4301663694329758, "grad_norm": 0.7747130990028381, "learning_rate": 8.365820698091397e-06, "loss": 0.1188, "num_input_tokens_seen": 37941920, "step": 29270 }, { "epoch": 1.4304106710966702, "grad_norm": 0.2918277680873871, "learning_rate": 8.358493074635922e-06, "loss": 0.0893, "num_input_tokens_seen": 37948736, "step": 29275 }, { "epoch": 1.4306549727603646, "grad_norm": 0.5993503928184509, "learning_rate": 8.351168017509948e-06, "loss": 0.0999, "num_input_tokens_seen": 37954816, "step": 29280 }, { "epoch": 1.4308992744240587, "grad_norm": 0.19595535099506378, "learning_rate": 8.343845527843094e-06, "loss": 0.0553, "num_input_tokens_seen": 37961536, "step": 29285 }, { "epoch": 1.431143576087753, "grad_norm": 0.2311709225177765, "learning_rate": 8.336525606764566e-06, "loss": 0.0733, "num_input_tokens_seen": 37967968, "step": 29290 }, { "epoch": 1.4313878777514475, "grad_norm": 0.13873827457427979, "learning_rate": 8.329208255403204e-06, "loss": 0.0593, "num_input_tokens_seen": 37974624, "step": 29295 }, { "epoch": 1.4316321794151419, "grad_norm": 0.21984155476093292, "learning_rate": 8.321893474887426e-06, "loss": 0.0908, "num_input_tokens_seen": 37980576, "step": 29300 }, { "epoch": 1.4318764810788362, "grad_norm": 0.2748331129550934, "learning_rate": 8.31458126634526e-06, "loss": 0.0993, "num_input_tokens_seen": 37987136, "step": 29305 }, { "epoch": 1.4321207827425304, "grad_norm": 0.5320574045181274, "learning_rate": 8.30727163090435e-06, "loss": 0.1123, "num_input_tokens_seen": 37993344, "step": 29310 }, { "epoch": 1.4323650844062248, "grad_norm": 0.16790920495986938, "learning_rate": 8.29996456969192e-06, "loss": 0.0868, "num_input_tokens_seen": 38000000, "step": 29315 }, { "epoch": 1.4326093860699192, "grad_norm": 0.295639306306839, "learning_rate": 8.292660083834818e-06, "loss": 0.0689, "num_input_tokens_seen": 38006464, "step": 29320 }, { "epoch": 1.4328536877336135, "grad_norm": 0.48255234956741333, "learning_rate": 8.2853581744595e-06, "loss": 0.0763, "num_input_tokens_seen": 38012768, "step": 29325 }, { "epoch": 1.4330979893973077, "grad_norm": 0.5943976640701294, "learning_rate": 8.278058842691991e-06, "loss": 0.0984, "num_input_tokens_seen": 38019136, "step": 29330 }, { "epoch": 1.433342291061002, "grad_norm": 0.14983142912387848, "learning_rate": 8.27076208965796e-06, "loss": 0.0973, "num_input_tokens_seen": 38025696, "step": 29335 }, { "epoch": 1.4335865927246965, "grad_norm": 0.5334203839302063, "learning_rate": 8.263467916482637e-06, "loss": 0.0982, "num_input_tokens_seen": 38032192, "step": 29340 }, { "epoch": 1.4338308943883908, "grad_norm": 0.19171860814094543, "learning_rate": 8.256176324290885e-06, "loss": 0.0958, "num_input_tokens_seen": 38038816, "step": 29345 }, { "epoch": 1.4340751960520852, "grad_norm": 0.4053055942058563, "learning_rate": 8.248887314207168e-06, "loss": 0.1014, "num_input_tokens_seen": 38045216, "step": 29350 }, { "epoch": 1.4343194977157794, "grad_norm": 0.2263452559709549, "learning_rate": 8.24160088735553e-06, "loss": 0.0706, "num_input_tokens_seen": 38051584, "step": 29355 }, { "epoch": 1.4345637993794738, "grad_norm": 0.19717846810817719, "learning_rate": 8.234317044859629e-06, "loss": 0.0761, "num_input_tokens_seen": 38058080, "step": 29360 }, { "epoch": 1.4348081010431681, "grad_norm": 0.3151214122772217, "learning_rate": 8.227035787842744e-06, "loss": 0.0739, "num_input_tokens_seen": 38066112, "step": 29365 }, { "epoch": 1.4350524027068623, "grad_norm": 0.24697963893413544, "learning_rate": 8.219757117427721e-06, "loss": 0.0737, "num_input_tokens_seen": 38072416, "step": 29370 }, { "epoch": 1.4352967043705567, "grad_norm": 0.1195424273610115, "learning_rate": 8.212481034737014e-06, "loss": 0.0781, "num_input_tokens_seen": 38079104, "step": 29375 }, { "epoch": 1.435541006034251, "grad_norm": 0.28808221220970154, "learning_rate": 8.205207540892707e-06, "loss": 0.0809, "num_input_tokens_seen": 38085568, "step": 29380 }, { "epoch": 1.4357853076979454, "grad_norm": 0.4125683605670929, "learning_rate": 8.197936637016442e-06, "loss": 0.0873, "num_input_tokens_seen": 38091904, "step": 29385 }, { "epoch": 1.4360296093616398, "grad_norm": 0.37691864371299744, "learning_rate": 8.190668324229508e-06, "loss": 0.0733, "num_input_tokens_seen": 38098624, "step": 29390 }, { "epoch": 1.4362739110253342, "grad_norm": 0.4238189160823822, "learning_rate": 8.183402603652749e-06, "loss": 0.102, "num_input_tokens_seen": 38104896, "step": 29395 }, { "epoch": 1.4365182126890284, "grad_norm": 0.26961344480514526, "learning_rate": 8.176139476406635e-06, "loss": 0.0799, "num_input_tokens_seen": 38111232, "step": 29400 }, { "epoch": 1.4365182126890284, "eval_loss": 0.08784128725528717, "eval_runtime": 374.8623, "eval_samples_per_second": 97.062, "eval_steps_per_second": 24.268, "num_input_tokens_seen": 38111232, "step": 29400 }, { "epoch": 1.4367625143527227, "grad_norm": 0.1619531810283661, "learning_rate": 8.16887894361125e-06, "loss": 0.0832, "num_input_tokens_seen": 38117632, "step": 29405 }, { "epoch": 1.437006816016417, "grad_norm": 0.2396540641784668, "learning_rate": 8.161621006386233e-06, "loss": 0.0991, "num_input_tokens_seen": 38124544, "step": 29410 }, { "epoch": 1.4372511176801113, "grad_norm": 0.5831301212310791, "learning_rate": 8.154365665850869e-06, "loss": 0.1181, "num_input_tokens_seen": 38130400, "step": 29415 }, { "epoch": 1.4374954193438056, "grad_norm": 0.6085248589515686, "learning_rate": 8.147112923124005e-06, "loss": 0.1171, "num_input_tokens_seen": 38136704, "step": 29420 }, { "epoch": 1.4377397210075, "grad_norm": 0.18058176338672638, "learning_rate": 8.13986277932412e-06, "loss": 0.0428, "num_input_tokens_seen": 38143488, "step": 29425 }, { "epoch": 1.4379840226711944, "grad_norm": 0.8301128149032593, "learning_rate": 8.132615235569277e-06, "loss": 0.0871, "num_input_tokens_seen": 38149856, "step": 29430 }, { "epoch": 1.4382283243348888, "grad_norm": 0.137837216258049, "learning_rate": 8.125370292977124e-06, "loss": 0.1031, "num_input_tokens_seen": 38156000, "step": 29435 }, { "epoch": 1.4384726259985832, "grad_norm": 0.1754477471113205, "learning_rate": 8.118127952664944e-06, "loss": 0.0762, "num_input_tokens_seen": 38162656, "step": 29440 }, { "epoch": 1.4387169276622773, "grad_norm": 0.25029581785202026, "learning_rate": 8.110888215749574e-06, "loss": 0.0719, "num_input_tokens_seen": 38170016, "step": 29445 }, { "epoch": 1.4389612293259717, "grad_norm": 0.27874454855918884, "learning_rate": 8.10365108334749e-06, "loss": 0.1105, "num_input_tokens_seen": 38176448, "step": 29450 }, { "epoch": 1.439205530989666, "grad_norm": 0.23861373960971832, "learning_rate": 8.096416556574743e-06, "loss": 0.0656, "num_input_tokens_seen": 38182560, "step": 29455 }, { "epoch": 1.4394498326533602, "grad_norm": 1.3330122232437134, "learning_rate": 8.08918463654698e-06, "loss": 0.1049, "num_input_tokens_seen": 38189344, "step": 29460 }, { "epoch": 1.4396941343170546, "grad_norm": 0.5485677123069763, "learning_rate": 8.081955324379458e-06, "loss": 0.0955, "num_input_tokens_seen": 38196032, "step": 29465 }, { "epoch": 1.439938435980749, "grad_norm": 0.24098266661167145, "learning_rate": 8.074728621187039e-06, "loss": 0.0684, "num_input_tokens_seen": 38202752, "step": 29470 }, { "epoch": 1.4401827376444434, "grad_norm": 0.2965514063835144, "learning_rate": 8.067504528084158e-06, "loss": 0.062, "num_input_tokens_seen": 38209376, "step": 29475 }, { "epoch": 1.4404270393081378, "grad_norm": 0.19904467463493347, "learning_rate": 8.060283046184861e-06, "loss": 0.067, "num_input_tokens_seen": 38215584, "step": 29480 }, { "epoch": 1.4406713409718321, "grad_norm": 0.4271193742752075, "learning_rate": 8.053064176602806e-06, "loss": 0.0792, "num_input_tokens_seen": 38222048, "step": 29485 }, { "epoch": 1.4409156426355263, "grad_norm": 0.2211909145116806, "learning_rate": 8.045847920451216e-06, "loss": 0.1186, "num_input_tokens_seen": 38228800, "step": 29490 }, { "epoch": 1.4411599442992207, "grad_norm": 0.3912670314311981, "learning_rate": 8.038634278842944e-06, "loss": 0.0579, "num_input_tokens_seen": 38235840, "step": 29495 }, { "epoch": 1.441404245962915, "grad_norm": 0.3887957036495209, "learning_rate": 8.031423252890408e-06, "loss": 0.0971, "num_input_tokens_seen": 38242400, "step": 29500 }, { "epoch": 1.4416485476266092, "grad_norm": 0.28852424025535583, "learning_rate": 8.024214843705646e-06, "loss": 0.0875, "num_input_tokens_seen": 38248352, "step": 29505 }, { "epoch": 1.4418928492903036, "grad_norm": 0.2687278389930725, "learning_rate": 8.017009052400295e-06, "loss": 0.0819, "num_input_tokens_seen": 38254496, "step": 29510 }, { "epoch": 1.442137150953998, "grad_norm": 0.13530980050563812, "learning_rate": 8.00980588008557e-06, "loss": 0.0818, "num_input_tokens_seen": 38260768, "step": 29515 }, { "epoch": 1.4423814526176924, "grad_norm": 0.1982559859752655, "learning_rate": 8.002605327872282e-06, "loss": 0.0908, "num_input_tokens_seen": 38267456, "step": 29520 }, { "epoch": 1.4426257542813867, "grad_norm": 0.7114068865776062, "learning_rate": 7.995407396870862e-06, "loss": 0.1072, "num_input_tokens_seen": 38273856, "step": 29525 }, { "epoch": 1.442870055945081, "grad_norm": 0.407964289188385, "learning_rate": 7.988212088191307e-06, "loss": 0.0905, "num_input_tokens_seen": 38280224, "step": 29530 }, { "epoch": 1.4431143576087753, "grad_norm": 0.3907826244831085, "learning_rate": 7.98101940294324e-06, "loss": 0.1009, "num_input_tokens_seen": 38286432, "step": 29535 }, { "epoch": 1.4433586592724696, "grad_norm": 0.33621323108673096, "learning_rate": 7.973829342235847e-06, "loss": 0.0983, "num_input_tokens_seen": 38293248, "step": 29540 }, { "epoch": 1.443602960936164, "grad_norm": 0.18864062428474426, "learning_rate": 7.966641907177936e-06, "loss": 0.0824, "num_input_tokens_seen": 38299520, "step": 29545 }, { "epoch": 1.4438472625998582, "grad_norm": 0.26521816849708557, "learning_rate": 7.959457098877901e-06, "loss": 0.0863, "num_input_tokens_seen": 38305728, "step": 29550 }, { "epoch": 1.4440915642635526, "grad_norm": 0.2873610258102417, "learning_rate": 7.952274918443719e-06, "loss": 0.0835, "num_input_tokens_seen": 38312544, "step": 29555 }, { "epoch": 1.444335865927247, "grad_norm": 0.23755748569965363, "learning_rate": 7.945095366982983e-06, "loss": 0.0982, "num_input_tokens_seen": 38318432, "step": 29560 }, { "epoch": 1.4445801675909413, "grad_norm": 0.22216053307056427, "learning_rate": 7.937918445602871e-06, "loss": 0.0735, "num_input_tokens_seen": 38324896, "step": 29565 }, { "epoch": 1.4448244692546357, "grad_norm": 0.6962943077087402, "learning_rate": 7.930744155410145e-06, "loss": 0.1166, "num_input_tokens_seen": 38331360, "step": 29570 }, { "epoch": 1.44506877091833, "grad_norm": 0.30546772480010986, "learning_rate": 7.923572497511181e-06, "loss": 0.0929, "num_input_tokens_seen": 38337216, "step": 29575 }, { "epoch": 1.4453130725820242, "grad_norm": 0.4251241385936737, "learning_rate": 7.916403473011927e-06, "loss": 0.0684, "num_input_tokens_seen": 38343680, "step": 29580 }, { "epoch": 1.4455573742457186, "grad_norm": 0.22577425837516785, "learning_rate": 7.909237083017953e-06, "loss": 0.0794, "num_input_tokens_seen": 38350176, "step": 29585 }, { "epoch": 1.445801675909413, "grad_norm": 0.12584441900253296, "learning_rate": 7.902073328634389e-06, "loss": 0.0774, "num_input_tokens_seen": 38356512, "step": 29590 }, { "epoch": 1.4460459775731072, "grad_norm": 0.48045614361763, "learning_rate": 7.894912210965987e-06, "loss": 0.0974, "num_input_tokens_seen": 38362944, "step": 29595 }, { "epoch": 1.4462902792368015, "grad_norm": 0.429423063993454, "learning_rate": 7.887753731117075e-06, "loss": 0.0682, "num_input_tokens_seen": 38370144, "step": 29600 }, { "epoch": 1.4462902792368015, "eval_loss": 0.08733224123716354, "eval_runtime": 374.8735, "eval_samples_per_second": 97.059, "eval_steps_per_second": 24.267, "num_input_tokens_seen": 38370144, "step": 29600 }, { "epoch": 1.446534580900496, "grad_norm": 0.20317281782627106, "learning_rate": 7.880597890191587e-06, "loss": 0.085, "num_input_tokens_seen": 38376416, "step": 29605 }, { "epoch": 1.4467788825641903, "grad_norm": 0.221662700176239, "learning_rate": 7.873444689293036e-06, "loss": 0.0818, "num_input_tokens_seen": 38382976, "step": 29610 }, { "epoch": 1.4470231842278847, "grad_norm": 0.1912578046321869, "learning_rate": 7.866294129524548e-06, "loss": 0.0878, "num_input_tokens_seen": 38389472, "step": 29615 }, { "epoch": 1.447267485891579, "grad_norm": 0.2578486502170563, "learning_rate": 7.859146211988811e-06, "loss": 0.0796, "num_input_tokens_seen": 38396640, "step": 29620 }, { "epoch": 1.4475117875552732, "grad_norm": 0.2408113181591034, "learning_rate": 7.852000937788134e-06, "loss": 0.0824, "num_input_tokens_seen": 38403360, "step": 29625 }, { "epoch": 1.4477560892189676, "grad_norm": 0.7072662711143494, "learning_rate": 7.844858308024416e-06, "loss": 0.1486, "num_input_tokens_seen": 38409504, "step": 29630 }, { "epoch": 1.448000390882662, "grad_norm": 0.2175658494234085, "learning_rate": 7.837718323799122e-06, "loss": 0.0745, "num_input_tokens_seen": 38415872, "step": 29635 }, { "epoch": 1.4482446925463561, "grad_norm": 0.2678352892398834, "learning_rate": 7.83058098621334e-06, "loss": 0.0816, "num_input_tokens_seen": 38422304, "step": 29640 }, { "epoch": 1.4484889942100505, "grad_norm": 0.4784083366394043, "learning_rate": 7.823446296367739e-06, "loss": 0.1168, "num_input_tokens_seen": 38428672, "step": 29645 }, { "epoch": 1.4487332958737449, "grad_norm": 0.24009636044502258, "learning_rate": 7.81631425536257e-06, "loss": 0.1128, "num_input_tokens_seen": 38434784, "step": 29650 }, { "epoch": 1.4489775975374393, "grad_norm": 0.5620831251144409, "learning_rate": 7.809184864297689e-06, "loss": 0.0884, "num_input_tokens_seen": 38441152, "step": 29655 }, { "epoch": 1.4492218992011336, "grad_norm": 0.6744812726974487, "learning_rate": 7.802058124272532e-06, "loss": 0.0849, "num_input_tokens_seen": 38447872, "step": 29660 }, { "epoch": 1.449466200864828, "grad_norm": 0.2551613450050354, "learning_rate": 7.79493403638614e-06, "loss": 0.0826, "num_input_tokens_seen": 38454208, "step": 29665 }, { "epoch": 1.4497105025285222, "grad_norm": 0.16575206816196442, "learning_rate": 7.787812601737132e-06, "loss": 0.0774, "num_input_tokens_seen": 38460640, "step": 29670 }, { "epoch": 1.4499548041922166, "grad_norm": 0.4471707344055176, "learning_rate": 7.780693821423715e-06, "loss": 0.0714, "num_input_tokens_seen": 38467136, "step": 29675 }, { "epoch": 1.450199105855911, "grad_norm": 0.7189061641693115, "learning_rate": 7.773577696543705e-06, "loss": 0.087, "num_input_tokens_seen": 38473088, "step": 29680 }, { "epoch": 1.450443407519605, "grad_norm": 0.7965344190597534, "learning_rate": 7.7664642281945e-06, "loss": 0.0719, "num_input_tokens_seen": 38479488, "step": 29685 }, { "epoch": 1.4506877091832995, "grad_norm": 0.3421914875507355, "learning_rate": 7.759353417473072e-06, "loss": 0.0847, "num_input_tokens_seen": 38485408, "step": 29690 }, { "epoch": 1.4509320108469939, "grad_norm": 0.22995193302631378, "learning_rate": 7.752245265476016e-06, "loss": 0.0653, "num_input_tokens_seen": 38491872, "step": 29695 }, { "epoch": 1.4511763125106882, "grad_norm": 0.21258752048015594, "learning_rate": 7.745139773299481e-06, "loss": 0.0654, "num_input_tokens_seen": 38498240, "step": 29700 }, { "epoch": 1.4514206141743826, "grad_norm": 0.22638584673404694, "learning_rate": 7.738036942039232e-06, "loss": 0.0906, "num_input_tokens_seen": 38504608, "step": 29705 }, { "epoch": 1.4516649158380768, "grad_norm": 0.18125931918621063, "learning_rate": 7.73093677279062e-06, "loss": 0.0686, "num_input_tokens_seen": 38511424, "step": 29710 }, { "epoch": 1.4519092175017712, "grad_norm": 0.4528430104255676, "learning_rate": 7.72383926664857e-06, "loss": 0.0734, "num_input_tokens_seen": 38517984, "step": 29715 }, { "epoch": 1.4521535191654655, "grad_norm": 0.16103628277778625, "learning_rate": 7.716744424707606e-06, "loss": 0.1069, "num_input_tokens_seen": 38524736, "step": 29720 }, { "epoch": 1.45239782082916, "grad_norm": 0.15574295818805695, "learning_rate": 7.709652248061858e-06, "loss": 0.065, "num_input_tokens_seen": 38531104, "step": 29725 }, { "epoch": 1.452642122492854, "grad_norm": 0.14838996529579163, "learning_rate": 7.702562737805017e-06, "loss": 0.0937, "num_input_tokens_seen": 38538176, "step": 29730 }, { "epoch": 1.4528864241565485, "grad_norm": 0.33645960688591003, "learning_rate": 7.695475895030365e-06, "loss": 0.0808, "num_input_tokens_seen": 38544800, "step": 29735 }, { "epoch": 1.4531307258202428, "grad_norm": 0.6239610910415649, "learning_rate": 7.6883917208308e-06, "loss": 0.1008, "num_input_tokens_seen": 38551104, "step": 29740 }, { "epoch": 1.4533750274839372, "grad_norm": 0.20197954773902893, "learning_rate": 7.681310216298778e-06, "loss": 0.0772, "num_input_tokens_seen": 38557664, "step": 29745 }, { "epoch": 1.4536193291476316, "grad_norm": 0.1836474984884262, "learning_rate": 7.674231382526367e-06, "loss": 0.0797, "num_input_tokens_seen": 38564352, "step": 29750 }, { "epoch": 1.4538636308113257, "grad_norm": 0.2484695464372635, "learning_rate": 7.667155220605198e-06, "loss": 0.0845, "num_input_tokens_seen": 38570688, "step": 29755 }, { "epoch": 1.4541079324750201, "grad_norm": 0.5566030740737915, "learning_rate": 7.660081731626515e-06, "loss": 0.099, "num_input_tokens_seen": 38577152, "step": 29760 }, { "epoch": 1.4543522341387145, "grad_norm": 0.8015033602714539, "learning_rate": 7.653010916681141e-06, "loss": 0.0977, "num_input_tokens_seen": 38583520, "step": 29765 }, { "epoch": 1.4545965358024089, "grad_norm": 0.12522096931934357, "learning_rate": 7.645942776859472e-06, "loss": 0.0999, "num_input_tokens_seen": 38589664, "step": 29770 }, { "epoch": 1.454840837466103, "grad_norm": 0.32818788290023804, "learning_rate": 7.63887731325152e-06, "loss": 0.0908, "num_input_tokens_seen": 38596064, "step": 29775 }, { "epoch": 1.4550851391297974, "grad_norm": 0.258260041475296, "learning_rate": 7.63181452694685e-06, "loss": 0.0756, "num_input_tokens_seen": 38602208, "step": 29780 }, { "epoch": 1.4553294407934918, "grad_norm": 0.14202332496643066, "learning_rate": 7.624754419034644e-06, "loss": 0.0652, "num_input_tokens_seen": 38609024, "step": 29785 }, { "epoch": 1.4555737424571862, "grad_norm": 0.28396087884902954, "learning_rate": 7.6176969906036645e-06, "loss": 0.0746, "num_input_tokens_seen": 38615552, "step": 29790 }, { "epoch": 1.4558180441208806, "grad_norm": 0.15444990992546082, "learning_rate": 7.610642242742242e-06, "loss": 0.0788, "num_input_tokens_seen": 38622016, "step": 29795 }, { "epoch": 1.4560623457845747, "grad_norm": 0.16908244788646698, "learning_rate": 7.603590176538322e-06, "loss": 0.0787, "num_input_tokens_seen": 38629280, "step": 29800 }, { "epoch": 1.4560623457845747, "eval_loss": 0.08773210644721985, "eval_runtime": 374.6312, "eval_samples_per_second": 97.122, "eval_steps_per_second": 24.283, "num_input_tokens_seen": 38629280, "step": 29800 }, { "epoch": 1.456306647448269, "grad_norm": 0.12393070757389069, "learning_rate": 7.596540793079404e-06, "loss": 0.0673, "num_input_tokens_seen": 38635424, "step": 29805 }, { "epoch": 1.4565509491119635, "grad_norm": 0.41153445839881897, "learning_rate": 7.5894940934526125e-06, "loss": 0.0931, "num_input_tokens_seen": 38642112, "step": 29810 }, { "epoch": 1.4567952507756579, "grad_norm": 0.22847488522529602, "learning_rate": 7.582450078744621e-06, "loss": 0.0826, "num_input_tokens_seen": 38648736, "step": 29815 }, { "epoch": 1.457039552439352, "grad_norm": 0.1222163513302803, "learning_rate": 7.575408750041707e-06, "loss": 0.0817, "num_input_tokens_seen": 38655104, "step": 29820 }, { "epoch": 1.4572838541030464, "grad_norm": 0.17656582593917847, "learning_rate": 7.568370108429732e-06, "loss": 0.0791, "num_input_tokens_seen": 38661600, "step": 29825 }, { "epoch": 1.4575281557667408, "grad_norm": 0.46285587549209595, "learning_rate": 7.561334154994154e-06, "loss": 0.0925, "num_input_tokens_seen": 38668288, "step": 29830 }, { "epoch": 1.4577724574304352, "grad_norm": 0.5288655161857605, "learning_rate": 7.55430089081999e-06, "loss": 0.0941, "num_input_tokens_seen": 38674560, "step": 29835 }, { "epoch": 1.4580167590941295, "grad_norm": 0.31908199191093445, "learning_rate": 7.547270316991864e-06, "loss": 0.0923, "num_input_tokens_seen": 38680864, "step": 29840 }, { "epoch": 1.4582610607578237, "grad_norm": 0.3292886018753052, "learning_rate": 7.5402424345939884e-06, "loss": 0.0891, "num_input_tokens_seen": 38687232, "step": 29845 }, { "epoch": 1.458505362421518, "grad_norm": 0.1455976814031601, "learning_rate": 7.533217244710133e-06, "loss": 0.0642, "num_input_tokens_seen": 38693728, "step": 29850 }, { "epoch": 1.4587496640852124, "grad_norm": 0.3188941180706024, "learning_rate": 7.52619474842369e-06, "loss": 0.0852, "num_input_tokens_seen": 38700256, "step": 29855 }, { "epoch": 1.4589939657489068, "grad_norm": 0.9702967405319214, "learning_rate": 7.519174946817597e-06, "loss": 0.1071, "num_input_tokens_seen": 38707296, "step": 29860 }, { "epoch": 1.459238267412601, "grad_norm": 0.24881677329540253, "learning_rate": 7.512157840974407e-06, "loss": 0.0692, "num_input_tokens_seen": 38713696, "step": 29865 }, { "epoch": 1.4594825690762954, "grad_norm": 0.17589956521987915, "learning_rate": 7.5051434319762496e-06, "loss": 0.0639, "num_input_tokens_seen": 38720352, "step": 29870 }, { "epoch": 1.4597268707399897, "grad_norm": 0.1556529849767685, "learning_rate": 7.498131720904822e-06, "loss": 0.0879, "num_input_tokens_seen": 38727136, "step": 29875 }, { "epoch": 1.4599711724036841, "grad_norm": 0.9511533975601196, "learning_rate": 7.491122708841433e-06, "loss": 0.0909, "num_input_tokens_seen": 38733536, "step": 29880 }, { "epoch": 1.4602154740673785, "grad_norm": 0.2103213667869568, "learning_rate": 7.4841163968669524e-06, "loss": 0.0514, "num_input_tokens_seen": 38740064, "step": 29885 }, { "epoch": 1.4604597757310727, "grad_norm": 0.32726672291755676, "learning_rate": 7.4771127860618355e-06, "loss": 0.0795, "num_input_tokens_seen": 38746304, "step": 29890 }, { "epoch": 1.460704077394767, "grad_norm": 0.4976106286048889, "learning_rate": 7.470111877506139e-06, "loss": 0.0972, "num_input_tokens_seen": 38752992, "step": 29895 }, { "epoch": 1.4609483790584614, "grad_norm": 0.11175316572189331, "learning_rate": 7.463113672279479e-06, "loss": 0.0753, "num_input_tokens_seen": 38759520, "step": 29900 }, { "epoch": 1.4611926807221558, "grad_norm": 0.7994087338447571, "learning_rate": 7.456118171461071e-06, "loss": 0.0841, "num_input_tokens_seen": 38766336, "step": 29905 }, { "epoch": 1.46143698238585, "grad_norm": 1.0297894477844238, "learning_rate": 7.449125376129721e-06, "loss": 0.0796, "num_input_tokens_seen": 38772800, "step": 29910 }, { "epoch": 1.4616812840495443, "grad_norm": 0.24484136700630188, "learning_rate": 7.442135287363788e-06, "loss": 0.0841, "num_input_tokens_seen": 38779648, "step": 29915 }, { "epoch": 1.4619255857132387, "grad_norm": 0.28369462490081787, "learning_rate": 7.435147906241247e-06, "loss": 0.0882, "num_input_tokens_seen": 38786112, "step": 29920 }, { "epoch": 1.462169887376933, "grad_norm": 0.3218742311000824, "learning_rate": 7.428163233839624e-06, "loss": 0.074, "num_input_tokens_seen": 38792608, "step": 29925 }, { "epoch": 1.4624141890406275, "grad_norm": 0.2926478087902069, "learning_rate": 7.4211812712360525e-06, "loss": 0.1098, "num_input_tokens_seen": 38799488, "step": 29930 }, { "epoch": 1.4626584907043216, "grad_norm": 0.16864997148513794, "learning_rate": 7.4142020195072464e-06, "loss": 0.0789, "num_input_tokens_seen": 38805792, "step": 29935 }, { "epoch": 1.462902792368016, "grad_norm": 0.1620044708251953, "learning_rate": 7.407225479729479e-06, "loss": 0.089, "num_input_tokens_seen": 38811744, "step": 29940 }, { "epoch": 1.4631470940317104, "grad_norm": 0.26055219769477844, "learning_rate": 7.400251652978632e-06, "loss": 0.0991, "num_input_tokens_seen": 38818208, "step": 29945 }, { "epoch": 1.4633913956954046, "grad_norm": 0.4539051949977875, "learning_rate": 7.393280540330147e-06, "loss": 0.0765, "num_input_tokens_seen": 38824800, "step": 29950 }, { "epoch": 1.463635697359099, "grad_norm": 0.5142937302589417, "learning_rate": 7.386312142859069e-06, "loss": 0.1099, "num_input_tokens_seen": 38831392, "step": 29955 }, { "epoch": 1.4638799990227933, "grad_norm": 0.7518265843391418, "learning_rate": 7.379346461640008e-06, "loss": 0.0921, "num_input_tokens_seen": 38837696, "step": 29960 }, { "epoch": 1.4641243006864877, "grad_norm": 0.25069716572761536, "learning_rate": 7.372383497747149e-06, "loss": 0.0841, "num_input_tokens_seen": 38844256, "step": 29965 }, { "epoch": 1.464368602350182, "grad_norm": 0.5472198724746704, "learning_rate": 7.3654232522542775e-06, "loss": 0.0901, "num_input_tokens_seen": 38850496, "step": 29970 }, { "epoch": 1.4646129040138764, "grad_norm": 0.13698415458202362, "learning_rate": 7.358465726234756e-06, "loss": 0.088, "num_input_tokens_seen": 38856352, "step": 29975 }, { "epoch": 1.4648572056775706, "grad_norm": 0.5474832653999329, "learning_rate": 7.351510920761512e-06, "loss": 0.092, "num_input_tokens_seen": 38862496, "step": 29980 }, { "epoch": 1.465101507341265, "grad_norm": 0.14398160576820374, "learning_rate": 7.344558836907067e-06, "loss": 0.1166, "num_input_tokens_seen": 38868512, "step": 29985 }, { "epoch": 1.4653458090049594, "grad_norm": 0.2466653436422348, "learning_rate": 7.3376094757435285e-06, "loss": 0.0862, "num_input_tokens_seen": 38874912, "step": 29990 }, { "epoch": 1.4655901106686535, "grad_norm": 0.30459511280059814, "learning_rate": 7.330662838342561e-06, "loss": 0.1021, "num_input_tokens_seen": 38881344, "step": 29995 }, { "epoch": 1.465834412332348, "grad_norm": 0.48063090443611145, "learning_rate": 7.323718925775438e-06, "loss": 0.0837, "num_input_tokens_seen": 38887744, "step": 30000 }, { "epoch": 1.465834412332348, "eval_loss": 0.08751439303159714, "eval_runtime": 375.5016, "eval_samples_per_second": 96.897, "eval_steps_per_second": 24.226, "num_input_tokens_seen": 38887744, "step": 30000 }, { "epoch": 1.4660787139960423, "grad_norm": 0.09679824858903885, "learning_rate": 7.316777739112985e-06, "loss": 0.0658, "num_input_tokens_seen": 38893952, "step": 30005 }, { "epoch": 1.4663230156597367, "grad_norm": 0.3481435179710388, "learning_rate": 7.309839279425626e-06, "loss": 0.1065, "num_input_tokens_seen": 38900064, "step": 30010 }, { "epoch": 1.466567317323431, "grad_norm": 0.22837556898593903, "learning_rate": 7.302903547783366e-06, "loss": 0.0807, "num_input_tokens_seen": 38906528, "step": 30015 }, { "epoch": 1.4668116189871254, "grad_norm": 0.18270820379257202, "learning_rate": 7.2959705452557644e-06, "loss": 0.0958, "num_input_tokens_seen": 38913248, "step": 30020 }, { "epoch": 1.4670559206508196, "grad_norm": 0.2462085336446762, "learning_rate": 7.289040272911996e-06, "loss": 0.0775, "num_input_tokens_seen": 38919904, "step": 30025 }, { "epoch": 1.467300222314514, "grad_norm": 0.19594137370586395, "learning_rate": 7.282112731820789e-06, "loss": 0.1079, "num_input_tokens_seen": 38926240, "step": 30030 }, { "epoch": 1.4675445239782083, "grad_norm": 0.19859646260738373, "learning_rate": 7.275187923050447e-06, "loss": 0.1046, "num_input_tokens_seen": 38932992, "step": 30035 }, { "epoch": 1.4677888256419025, "grad_norm": 0.4034597873687744, "learning_rate": 7.268265847668879e-06, "loss": 0.0719, "num_input_tokens_seen": 38939328, "step": 30040 }, { "epoch": 1.4680331273055969, "grad_norm": 0.1191459596157074, "learning_rate": 7.261346506743538e-06, "loss": 0.0926, "num_input_tokens_seen": 38945664, "step": 30045 }, { "epoch": 1.4682774289692913, "grad_norm": 0.32518619298934937, "learning_rate": 7.254429901341486e-06, "loss": 0.0873, "num_input_tokens_seen": 38951840, "step": 30050 }, { "epoch": 1.4685217306329856, "grad_norm": 0.18480804562568665, "learning_rate": 7.247516032529356e-06, "loss": 0.1077, "num_input_tokens_seen": 38958400, "step": 30055 }, { "epoch": 1.46876603229668, "grad_norm": 0.5345982909202576, "learning_rate": 7.240604901373338e-06, "loss": 0.091, "num_input_tokens_seen": 38965120, "step": 30060 }, { "epoch": 1.4690103339603744, "grad_norm": 0.2264583557844162, "learning_rate": 7.233696508939223e-06, "loss": 0.0955, "num_input_tokens_seen": 38971488, "step": 30065 }, { "epoch": 1.4692546356240686, "grad_norm": 0.18402840197086334, "learning_rate": 7.226790856292376e-06, "loss": 0.0972, "num_input_tokens_seen": 38977536, "step": 30070 }, { "epoch": 1.469498937287763, "grad_norm": 0.5145267844200134, "learning_rate": 7.219887944497727e-06, "loss": 0.0662, "num_input_tokens_seen": 38984096, "step": 30075 }, { "epoch": 1.4697432389514573, "grad_norm": 0.16226036846637726, "learning_rate": 7.2129877746198e-06, "loss": 0.0819, "num_input_tokens_seen": 38990496, "step": 30080 }, { "epoch": 1.4699875406151515, "grad_norm": 0.3746405243873596, "learning_rate": 7.20609034772268e-06, "loss": 0.1109, "num_input_tokens_seen": 38997376, "step": 30085 }, { "epoch": 1.4702318422788458, "grad_norm": 0.7841811776161194, "learning_rate": 7.19919566487004e-06, "loss": 0.0993, "num_input_tokens_seen": 39003968, "step": 30090 }, { "epoch": 1.4704761439425402, "grad_norm": 0.6150380373001099, "learning_rate": 7.192303727125132e-06, "loss": 0.1099, "num_input_tokens_seen": 39010112, "step": 30095 }, { "epoch": 1.4707204456062346, "grad_norm": 0.39792388677597046, "learning_rate": 7.185414535550777e-06, "loss": 0.0853, "num_input_tokens_seen": 39016576, "step": 30100 }, { "epoch": 1.470964747269929, "grad_norm": 0.3167487680912018, "learning_rate": 7.178528091209363e-06, "loss": 0.0619, "num_input_tokens_seen": 39022880, "step": 30105 }, { "epoch": 1.4712090489336234, "grad_norm": 0.743280827999115, "learning_rate": 7.171644395162888e-06, "loss": 0.0961, "num_input_tokens_seen": 39029312, "step": 30110 }, { "epoch": 1.4714533505973175, "grad_norm": 0.22848829627037048, "learning_rate": 7.164763448472881e-06, "loss": 0.0568, "num_input_tokens_seen": 39035488, "step": 30115 }, { "epoch": 1.471697652261012, "grad_norm": 0.24322855472564697, "learning_rate": 7.157885252200491e-06, "loss": 0.0792, "num_input_tokens_seen": 39042304, "step": 30120 }, { "epoch": 1.4719419539247063, "grad_norm": 1.0947080850601196, "learning_rate": 7.151009807406403e-06, "loss": 0.0983, "num_input_tokens_seen": 39048288, "step": 30125 }, { "epoch": 1.4721862555884004, "grad_norm": 0.13896556198596954, "learning_rate": 7.144137115150909e-06, "loss": 0.0761, "num_input_tokens_seen": 39054912, "step": 30130 }, { "epoch": 1.4724305572520948, "grad_norm": 0.17275303602218628, "learning_rate": 7.1372671764938725e-06, "loss": 0.0693, "num_input_tokens_seen": 39061472, "step": 30135 }, { "epoch": 1.4726748589157892, "grad_norm": 0.6046654582023621, "learning_rate": 7.130399992494705e-06, "loss": 0.1049, "num_input_tokens_seen": 39067968, "step": 30140 }, { "epoch": 1.4729191605794836, "grad_norm": 0.23635762929916382, "learning_rate": 7.123535564212419e-06, "loss": 0.0932, "num_input_tokens_seen": 39074400, "step": 30145 }, { "epoch": 1.473163462243178, "grad_norm": 0.17390236258506775, "learning_rate": 7.116673892705611e-06, "loss": 0.1073, "num_input_tokens_seen": 39080320, "step": 30150 }, { "epoch": 1.4734077639068723, "grad_norm": 0.2299221009016037, "learning_rate": 7.109814979032415e-06, "loss": 0.0792, "num_input_tokens_seen": 39087456, "step": 30155 }, { "epoch": 1.4736520655705665, "grad_norm": 0.10968828946352005, "learning_rate": 7.102958824250577e-06, "loss": 0.0845, "num_input_tokens_seen": 39093952, "step": 30160 }, { "epoch": 1.4738963672342609, "grad_norm": 0.7073354125022888, "learning_rate": 7.096105429417393e-06, "loss": 0.0798, "num_input_tokens_seen": 39100416, "step": 30165 }, { "epoch": 1.4741406688979553, "grad_norm": 0.2352656126022339, "learning_rate": 7.0892547955897506e-06, "loss": 0.092, "num_input_tokens_seen": 39107072, "step": 30170 }, { "epoch": 1.4743849705616494, "grad_norm": 0.1513957381248474, "learning_rate": 7.0824069238241e-06, "loss": 0.0589, "num_input_tokens_seen": 39113984, "step": 30175 }, { "epoch": 1.4746292722253438, "grad_norm": 0.8054689168930054, "learning_rate": 7.075561815176462e-06, "loss": 0.0964, "num_input_tokens_seen": 39120480, "step": 30180 }, { "epoch": 1.4748735738890382, "grad_norm": 0.35326722264289856, "learning_rate": 7.068719470702445e-06, "loss": 0.0938, "num_input_tokens_seen": 39126848, "step": 30185 }, { "epoch": 1.4751178755527325, "grad_norm": 0.3169105648994446, "learning_rate": 7.061879891457229e-06, "loss": 0.0594, "num_input_tokens_seen": 39133248, "step": 30190 }, { "epoch": 1.475362177216427, "grad_norm": 0.4719623625278473, "learning_rate": 7.0550430784955515e-06, "loss": 0.0692, "num_input_tokens_seen": 39140224, "step": 30195 }, { "epoch": 1.4756064788801213, "grad_norm": 0.438642293214798, "learning_rate": 7.048209032871752e-06, "loss": 0.1068, "num_input_tokens_seen": 39146016, "step": 30200 }, { "epoch": 1.4756064788801213, "eval_loss": 0.08738256245851517, "eval_runtime": 374.2985, "eval_samples_per_second": 97.209, "eval_steps_per_second": 24.304, "num_input_tokens_seen": 39146016, "step": 30200 }, { "epoch": 1.4758507805438155, "grad_norm": 0.5267781615257263, "learning_rate": 7.0413777556397055e-06, "loss": 0.0914, "num_input_tokens_seen": 39152544, "step": 30205 }, { "epoch": 1.4760950822075098, "grad_norm": 0.5921964645385742, "learning_rate": 7.0345492478528925e-06, "loss": 0.0927, "num_input_tokens_seen": 39158976, "step": 30210 }, { "epoch": 1.4763393838712042, "grad_norm": 0.24076546728610992, "learning_rate": 7.02772351056436e-06, "loss": 0.0856, "num_input_tokens_seen": 39165440, "step": 30215 }, { "epoch": 1.4765836855348984, "grad_norm": 0.307132750749588, "learning_rate": 7.020900544826709e-06, "loss": 0.0871, "num_input_tokens_seen": 39171872, "step": 30220 }, { "epoch": 1.4768279871985928, "grad_norm": 0.37872013449668884, "learning_rate": 7.014080351692134e-06, "loss": 0.0835, "num_input_tokens_seen": 39178240, "step": 30225 }, { "epoch": 1.4770722888622871, "grad_norm": 0.7077171802520752, "learning_rate": 7.0072629322124024e-06, "loss": 0.0861, "num_input_tokens_seen": 39184768, "step": 30230 }, { "epoch": 1.4773165905259815, "grad_norm": 0.21659600734710693, "learning_rate": 7.000448287438827e-06, "loss": 0.0729, "num_input_tokens_seen": 39191136, "step": 30235 }, { "epoch": 1.477560892189676, "grad_norm": 0.7560874819755554, "learning_rate": 6.993636418422331e-06, "loss": 0.1185, "num_input_tokens_seen": 39197312, "step": 30240 }, { "epoch": 1.47780519385337, "grad_norm": 0.5004746317863464, "learning_rate": 6.986827326213383e-06, "loss": 0.0898, "num_input_tokens_seen": 39203712, "step": 30245 }, { "epoch": 1.4780494955170644, "grad_norm": 0.23707807064056396, "learning_rate": 6.9800210118620205e-06, "loss": 0.0803, "num_input_tokens_seen": 39210656, "step": 30250 }, { "epoch": 1.4782937971807588, "grad_norm": 0.5200582146644592, "learning_rate": 6.973217476417876e-06, "loss": 0.0967, "num_input_tokens_seen": 39217312, "step": 30255 }, { "epoch": 1.4785380988444532, "grad_norm": 0.39512065052986145, "learning_rate": 6.96641672093013e-06, "loss": 0.075, "num_input_tokens_seen": 39223904, "step": 30260 }, { "epoch": 1.4787824005081474, "grad_norm": 0.2962246835231781, "learning_rate": 6.95961874644755e-06, "loss": 0.1047, "num_input_tokens_seen": 39230368, "step": 30265 }, { "epoch": 1.4790267021718417, "grad_norm": 0.11525000631809235, "learning_rate": 6.952823554018476e-06, "loss": 0.0741, "num_input_tokens_seen": 39236832, "step": 30270 }, { "epoch": 1.4792710038355361, "grad_norm": 0.5290797352790833, "learning_rate": 6.946031144690798e-06, "loss": 0.0958, "num_input_tokens_seen": 39243008, "step": 30275 }, { "epoch": 1.4795153054992305, "grad_norm": 0.276341050863266, "learning_rate": 6.939241519512005e-06, "loss": 0.0678, "num_input_tokens_seen": 39249408, "step": 30280 }, { "epoch": 1.4797596071629249, "grad_norm": 0.21850711107254028, "learning_rate": 6.932454679529129e-06, "loss": 0.0889, "num_input_tokens_seen": 39255712, "step": 30285 }, { "epoch": 1.480003908826619, "grad_norm": 0.3014419674873352, "learning_rate": 6.925670625788791e-06, "loss": 0.0732, "num_input_tokens_seen": 39262656, "step": 30290 }, { "epoch": 1.4802482104903134, "grad_norm": 0.2700783908367157, "learning_rate": 6.918889359337186e-06, "loss": 0.0776, "num_input_tokens_seen": 39269216, "step": 30295 }, { "epoch": 1.4804925121540078, "grad_norm": 0.28998446464538574, "learning_rate": 6.912110881220058e-06, "loss": 0.1002, "num_input_tokens_seen": 39275424, "step": 30300 }, { "epoch": 1.4807368138177022, "grad_norm": 0.247783824801445, "learning_rate": 6.905335192482735e-06, "loss": 0.0879, "num_input_tokens_seen": 39281856, "step": 30305 }, { "epoch": 1.4809811154813963, "grad_norm": 0.27166569232940674, "learning_rate": 6.8985622941701275e-06, "loss": 0.0643, "num_input_tokens_seen": 39288288, "step": 30310 }, { "epoch": 1.4812254171450907, "grad_norm": 0.15385623276233673, "learning_rate": 6.89179218732669e-06, "loss": 0.0845, "num_input_tokens_seen": 39295424, "step": 30315 }, { "epoch": 1.481469718808785, "grad_norm": 0.4914392828941345, "learning_rate": 6.8850248729964595e-06, "loss": 0.089, "num_input_tokens_seen": 39301984, "step": 30320 }, { "epoch": 1.4817140204724795, "grad_norm": 0.25658416748046875, "learning_rate": 6.8782603522230314e-06, "loss": 0.0884, "num_input_tokens_seen": 39308576, "step": 30325 }, { "epoch": 1.4819583221361738, "grad_norm": 0.8652148246765137, "learning_rate": 6.871498626049591e-06, "loss": 0.1228, "num_input_tokens_seen": 39314880, "step": 30330 }, { "epoch": 1.482202623799868, "grad_norm": 0.12971898913383484, "learning_rate": 6.8647396955188875e-06, "loss": 0.0849, "num_input_tokens_seen": 39321600, "step": 30335 }, { "epoch": 1.4824469254635624, "grad_norm": 0.2491333782672882, "learning_rate": 6.857983561673218e-06, "loss": 0.0984, "num_input_tokens_seen": 39328416, "step": 30340 }, { "epoch": 1.4826912271272568, "grad_norm": 0.4045732617378235, "learning_rate": 6.851230225554467e-06, "loss": 0.0894, "num_input_tokens_seen": 39334816, "step": 30345 }, { "epoch": 1.4829355287909511, "grad_norm": 0.34886518120765686, "learning_rate": 6.8444796882040946e-06, "loss": 0.0797, "num_input_tokens_seen": 39341696, "step": 30350 }, { "epoch": 1.4831798304546453, "grad_norm": 0.22263678908348083, "learning_rate": 6.837731950663106e-06, "loss": 0.0831, "num_input_tokens_seen": 39348384, "step": 30355 }, { "epoch": 1.4834241321183397, "grad_norm": 0.2744486331939697, "learning_rate": 6.830987013972098e-06, "loss": 0.0977, "num_input_tokens_seen": 39354624, "step": 30360 }, { "epoch": 1.483668433782034, "grad_norm": 0.2669101655483246, "learning_rate": 6.82424487917121e-06, "loss": 0.084, "num_input_tokens_seen": 39360928, "step": 30365 }, { "epoch": 1.4839127354457284, "grad_norm": 0.14095570147037506, "learning_rate": 6.8175055473001735e-06, "loss": 0.0848, "num_input_tokens_seen": 39367296, "step": 30370 }, { "epoch": 1.4841570371094228, "grad_norm": 0.5018042922019958, "learning_rate": 6.8107690193982855e-06, "loss": 0.0962, "num_input_tokens_seen": 39373888, "step": 30375 }, { "epoch": 1.484401338773117, "grad_norm": 0.15516850352287292, "learning_rate": 6.804035296504385e-06, "loss": 0.0987, "num_input_tokens_seen": 39380512, "step": 30380 }, { "epoch": 1.4846456404368114, "grad_norm": 0.296419233083725, "learning_rate": 6.797304379656916e-06, "loss": 0.0999, "num_input_tokens_seen": 39386912, "step": 30385 }, { "epoch": 1.4848899421005057, "grad_norm": 0.1696072518825531, "learning_rate": 6.790576269893861e-06, "loss": 0.0845, "num_input_tokens_seen": 39393184, "step": 30390 }, { "epoch": 1.4851342437642001, "grad_norm": 0.23981472849845886, "learning_rate": 6.783850968252772e-06, "loss": 0.0763, "num_input_tokens_seen": 39399712, "step": 30395 }, { "epoch": 1.4853785454278943, "grad_norm": 0.17648957669734955, "learning_rate": 6.777128475770789e-06, "loss": 0.0716, "num_input_tokens_seen": 39406240, "step": 30400 }, { "epoch": 1.4853785454278943, "eval_loss": 0.0876522809267044, "eval_runtime": 375.4135, "eval_samples_per_second": 96.92, "eval_steps_per_second": 24.232, "num_input_tokens_seen": 39406240, "step": 30400 }, { "epoch": 1.4856228470915886, "grad_norm": 0.5551695227622986, "learning_rate": 6.77040879348459e-06, "loss": 0.0974, "num_input_tokens_seen": 39412256, "step": 30405 }, { "epoch": 1.485867148755283, "grad_norm": 0.9706107974052429, "learning_rate": 6.763691922430443e-06, "loss": 0.0867, "num_input_tokens_seen": 39418592, "step": 30410 }, { "epoch": 1.4861114504189774, "grad_norm": 0.36557233333587646, "learning_rate": 6.756977863644178e-06, "loss": 0.0821, "num_input_tokens_seen": 39425056, "step": 30415 }, { "epoch": 1.4863557520826718, "grad_norm": 0.7963230013847351, "learning_rate": 6.7502666181611804e-06, "loss": 0.1136, "num_input_tokens_seen": 39431232, "step": 30420 }, { "epoch": 1.486600053746366, "grad_norm": 0.35863012075424194, "learning_rate": 6.743558187016405e-06, "loss": 0.0731, "num_input_tokens_seen": 39437568, "step": 30425 }, { "epoch": 1.4868443554100603, "grad_norm": 0.9628138542175293, "learning_rate": 6.7368525712443925e-06, "loss": 0.1391, "num_input_tokens_seen": 39443648, "step": 30430 }, { "epoch": 1.4870886570737547, "grad_norm": 0.15073919296264648, "learning_rate": 6.7301497718792155e-06, "loss": 0.0784, "num_input_tokens_seen": 39450496, "step": 30435 }, { "epoch": 1.487332958737449, "grad_norm": 0.5554937124252319, "learning_rate": 6.723449789954544e-06, "loss": 0.084, "num_input_tokens_seen": 39457408, "step": 30440 }, { "epoch": 1.4875772604011432, "grad_norm": 0.18641002476215363, "learning_rate": 6.716752626503586e-06, "loss": 0.0917, "num_input_tokens_seen": 39463712, "step": 30445 }, { "epoch": 1.4878215620648376, "grad_norm": 0.23812563717365265, "learning_rate": 6.710058282559131e-06, "loss": 0.0759, "num_input_tokens_seen": 39470368, "step": 30450 }, { "epoch": 1.488065863728532, "grad_norm": 0.5171712636947632, "learning_rate": 6.703366759153545e-06, "loss": 0.0746, "num_input_tokens_seen": 39477184, "step": 30455 }, { "epoch": 1.4883101653922264, "grad_norm": 0.2203608900308609, "learning_rate": 6.6966780573187335e-06, "loss": 0.0802, "num_input_tokens_seen": 39483424, "step": 30460 }, { "epoch": 1.4885544670559208, "grad_norm": 0.411552757024765, "learning_rate": 6.689992178086174e-06, "loss": 0.0695, "num_input_tokens_seen": 39490368, "step": 30465 }, { "epoch": 1.488798768719615, "grad_norm": 0.1041506975889206, "learning_rate": 6.683309122486925e-06, "loss": 0.0815, "num_input_tokens_seen": 39497056, "step": 30470 }, { "epoch": 1.4890430703833093, "grad_norm": 0.22023631632328033, "learning_rate": 6.676628891551584e-06, "loss": 0.0933, "num_input_tokens_seen": 39503072, "step": 30475 }, { "epoch": 1.4892873720470037, "grad_norm": 0.1640394628047943, "learning_rate": 6.6699514863103385e-06, "loss": 0.1194, "num_input_tokens_seen": 39509856, "step": 30480 }, { "epoch": 1.4895316737106978, "grad_norm": 0.6322935223579407, "learning_rate": 6.663276907792921e-06, "loss": 0.0814, "num_input_tokens_seen": 39516544, "step": 30485 }, { "epoch": 1.4897759753743922, "grad_norm": 0.2020803987979889, "learning_rate": 6.656605157028634e-06, "loss": 0.1378, "num_input_tokens_seen": 39523200, "step": 30490 }, { "epoch": 1.4900202770380866, "grad_norm": 0.408975213766098, "learning_rate": 6.649936235046358e-06, "loss": 0.0683, "num_input_tokens_seen": 39529728, "step": 30495 }, { "epoch": 1.490264578701781, "grad_norm": 0.17215897142887115, "learning_rate": 6.643270142874508e-06, "loss": 0.0832, "num_input_tokens_seen": 39536256, "step": 30500 }, { "epoch": 1.4905088803654754, "grad_norm": 0.2633885145187378, "learning_rate": 6.636606881541094e-06, "loss": 0.0838, "num_input_tokens_seen": 39542368, "step": 30505 }, { "epoch": 1.4907531820291697, "grad_norm": 0.14687195420265198, "learning_rate": 6.629946452073662e-06, "loss": 0.057, "num_input_tokens_seen": 39548960, "step": 30510 }, { "epoch": 1.490997483692864, "grad_norm": 0.3550335764884949, "learning_rate": 6.6232888554993375e-06, "loss": 0.0885, "num_input_tokens_seen": 39555328, "step": 30515 }, { "epoch": 1.4912417853565583, "grad_norm": 0.5795713067054749, "learning_rate": 6.616634092844817e-06, "loss": 0.1125, "num_input_tokens_seen": 39561760, "step": 30520 }, { "epoch": 1.4914860870202526, "grad_norm": 1.0386384725570679, "learning_rate": 6.609982165136331e-06, "loss": 0.107, "num_input_tokens_seen": 39568352, "step": 30525 }, { "epoch": 1.4917303886839468, "grad_norm": 0.17810751497745514, "learning_rate": 6.603333073399706e-06, "loss": 0.0785, "num_input_tokens_seen": 39574496, "step": 30530 }, { "epoch": 1.4919746903476412, "grad_norm": 0.3356577455997467, "learning_rate": 6.596686818660308e-06, "loss": 0.0707, "num_input_tokens_seen": 39581152, "step": 30535 }, { "epoch": 1.4922189920113356, "grad_norm": 0.2826632857322693, "learning_rate": 6.590043401943066e-06, "loss": 0.0692, "num_input_tokens_seen": 39587552, "step": 30540 }, { "epoch": 1.49246329367503, "grad_norm": 0.18942126631736755, "learning_rate": 6.583402824272494e-06, "loss": 0.0875, "num_input_tokens_seen": 39593984, "step": 30545 }, { "epoch": 1.4927075953387243, "grad_norm": 0.3327763080596924, "learning_rate": 6.576765086672634e-06, "loss": 0.076, "num_input_tokens_seen": 39600160, "step": 30550 }, { "epoch": 1.4929518970024187, "grad_norm": 0.46476465463638306, "learning_rate": 6.57013019016712e-06, "loss": 0.087, "num_input_tokens_seen": 39606528, "step": 30555 }, { "epoch": 1.4931961986661129, "grad_norm": 0.6069561839103699, "learning_rate": 6.563498135779142e-06, "loss": 0.0885, "num_input_tokens_seen": 39613088, "step": 30560 }, { "epoch": 1.4934405003298072, "grad_norm": 0.2057134360074997, "learning_rate": 6.556868924531431e-06, "loss": 0.0672, "num_input_tokens_seen": 39620128, "step": 30565 }, { "epoch": 1.4936848019935016, "grad_norm": 0.16771356761455536, "learning_rate": 6.550242557446304e-06, "loss": 0.0829, "num_input_tokens_seen": 39626624, "step": 30570 }, { "epoch": 1.4939291036571958, "grad_norm": 0.41731998324394226, "learning_rate": 6.543619035545634e-06, "loss": 0.0617, "num_input_tokens_seen": 39632704, "step": 30575 }, { "epoch": 1.4941734053208902, "grad_norm": 0.14198264479637146, "learning_rate": 6.53699835985084e-06, "loss": 0.0535, "num_input_tokens_seen": 39638720, "step": 30580 }, { "epoch": 1.4944177069845845, "grad_norm": 0.15165401995182037, "learning_rate": 6.530380531382927e-06, "loss": 0.0873, "num_input_tokens_seen": 39645440, "step": 30585 }, { "epoch": 1.494662008648279, "grad_norm": 0.21325565874576569, "learning_rate": 6.523765551162433e-06, "loss": 0.0906, "num_input_tokens_seen": 39652128, "step": 30590 }, { "epoch": 1.4949063103119733, "grad_norm": 0.5396822094917297, "learning_rate": 6.517153420209476e-06, "loss": 0.0582, "num_input_tokens_seen": 39658336, "step": 30595 }, { "epoch": 1.4951506119756677, "grad_norm": 0.23070946335792542, "learning_rate": 6.510544139543739e-06, "loss": 0.0791, "num_input_tokens_seen": 39664736, "step": 30600 }, { "epoch": 1.4951506119756677, "eval_loss": 0.08738286048173904, "eval_runtime": 374.7898, "eval_samples_per_second": 97.081, "eval_steps_per_second": 24.272, "num_input_tokens_seen": 39664736, "step": 30600 }, { "epoch": 1.4953949136393618, "grad_norm": 0.14589446783065796, "learning_rate": 6.503937710184452e-06, "loss": 0.0682, "num_input_tokens_seen": 39671520, "step": 30605 }, { "epoch": 1.4956392153030562, "grad_norm": 0.22570091485977173, "learning_rate": 6.4973341331503954e-06, "loss": 0.0692, "num_input_tokens_seen": 39678016, "step": 30610 }, { "epoch": 1.4958835169667506, "grad_norm": 0.2604573965072632, "learning_rate": 6.490733409459942e-06, "loss": 0.0852, "num_input_tokens_seen": 39684512, "step": 30615 }, { "epoch": 1.4961278186304448, "grad_norm": 0.23501315712928772, "learning_rate": 6.484135540130995e-06, "loss": 0.094, "num_input_tokens_seen": 39690912, "step": 30620 }, { "epoch": 1.4963721202941391, "grad_norm": 0.23409618437290192, "learning_rate": 6.4775405261810364e-06, "loss": 0.0827, "num_input_tokens_seen": 39697568, "step": 30625 }, { "epoch": 1.4966164219578335, "grad_norm": 0.286824494600296, "learning_rate": 6.470948368627092e-06, "loss": 0.111, "num_input_tokens_seen": 39704032, "step": 30630 }, { "epoch": 1.4968607236215279, "grad_norm": 0.3772953450679779, "learning_rate": 6.464359068485756e-06, "loss": 0.0666, "num_input_tokens_seen": 39710336, "step": 30635 }, { "epoch": 1.4971050252852223, "grad_norm": 0.164765864610672, "learning_rate": 6.457772626773195e-06, "loss": 0.0579, "num_input_tokens_seen": 39716736, "step": 30640 }, { "epoch": 1.4973493269489166, "grad_norm": 0.48996394872665405, "learning_rate": 6.451189044505104e-06, "loss": 0.0824, "num_input_tokens_seen": 39723648, "step": 30645 }, { "epoch": 1.4975936286126108, "grad_norm": 0.613978922367096, "learning_rate": 6.44460832269676e-06, "loss": 0.0745, "num_input_tokens_seen": 39730240, "step": 30650 }, { "epoch": 1.4978379302763052, "grad_norm": 0.5065235495567322, "learning_rate": 6.438030462363001e-06, "loss": 0.0917, "num_input_tokens_seen": 39736480, "step": 30655 }, { "epoch": 1.4980822319399996, "grad_norm": 0.2469862699508667, "learning_rate": 6.431455464518205e-06, "loss": 0.0732, "num_input_tokens_seen": 39742848, "step": 30660 }, { "epoch": 1.4983265336036937, "grad_norm": 0.2529795169830322, "learning_rate": 6.424883330176326e-06, "loss": 0.0848, "num_input_tokens_seen": 39749280, "step": 30665 }, { "epoch": 1.498570835267388, "grad_norm": 0.1959793120622635, "learning_rate": 6.418314060350864e-06, "loss": 0.07, "num_input_tokens_seen": 39755808, "step": 30670 }, { "epoch": 1.4988151369310825, "grad_norm": 0.5867663025856018, "learning_rate": 6.4117476560548895e-06, "loss": 0.1115, "num_input_tokens_seen": 39761920, "step": 30675 }, { "epoch": 1.4990594385947769, "grad_norm": 0.40911737084388733, "learning_rate": 6.405184118301016e-06, "loss": 0.0756, "num_input_tokens_seen": 39768512, "step": 30680 }, { "epoch": 1.4993037402584712, "grad_norm": 0.24110695719718933, "learning_rate": 6.398623448101434e-06, "loss": 0.099, "num_input_tokens_seen": 39774464, "step": 30685 }, { "epoch": 1.4995480419221656, "grad_norm": 0.4749588966369629, "learning_rate": 6.392065646467871e-06, "loss": 0.0888, "num_input_tokens_seen": 39780704, "step": 30690 }, { "epoch": 1.4997923435858598, "grad_norm": 0.3184003531932831, "learning_rate": 6.385510714411632e-06, "loss": 0.117, "num_input_tokens_seen": 39787328, "step": 30695 }, { "epoch": 1.5000366452495542, "grad_norm": 0.1163531020283699, "learning_rate": 6.378958652943559e-06, "loss": 0.0949, "num_input_tokens_seen": 39793792, "step": 30700 }, { "epoch": 1.5002809469132483, "grad_norm": 0.25800517201423645, "learning_rate": 6.3724094630740776e-06, "loss": 0.097, "num_input_tokens_seen": 39799968, "step": 30705 }, { "epoch": 1.5005252485769427, "grad_norm": 1.0218937397003174, "learning_rate": 6.365863145813136e-06, "loss": 0.127, "num_input_tokens_seen": 39806496, "step": 30710 }, { "epoch": 1.500769550240637, "grad_norm": 0.3198479115962982, "learning_rate": 6.359319702170269e-06, "loss": 0.0914, "num_input_tokens_seen": 39812704, "step": 30715 }, { "epoch": 1.5010138519043315, "grad_norm": 0.5112608075141907, "learning_rate": 6.352779133154566e-06, "loss": 0.0728, "num_input_tokens_seen": 39819136, "step": 30720 }, { "epoch": 1.5012581535680258, "grad_norm": 1.1532942056655884, "learning_rate": 6.346241439774648e-06, "loss": 0.0895, "num_input_tokens_seen": 39826272, "step": 30725 }, { "epoch": 1.5015024552317202, "grad_norm": 0.3727131187915802, "learning_rate": 6.339706623038716e-06, "loss": 0.0895, "num_input_tokens_seen": 39832672, "step": 30730 }, { "epoch": 1.5017467568954146, "grad_norm": 0.3690742254257202, "learning_rate": 6.333174683954532e-06, "loss": 0.0891, "num_input_tokens_seen": 39839456, "step": 30735 }, { "epoch": 1.5019910585591087, "grad_norm": 0.109475277364254, "learning_rate": 6.326645623529387e-06, "loss": 0.0667, "num_input_tokens_seen": 39845600, "step": 30740 }, { "epoch": 1.5022353602228031, "grad_norm": 0.4247491955757141, "learning_rate": 6.320119442770156e-06, "loss": 0.0817, "num_input_tokens_seen": 39851808, "step": 30745 }, { "epoch": 1.5024796618864973, "grad_norm": 0.532197117805481, "learning_rate": 6.313596142683254e-06, "loss": 0.0766, "num_input_tokens_seen": 39857888, "step": 30750 }, { "epoch": 1.5027239635501917, "grad_norm": 0.4970822036266327, "learning_rate": 6.307075724274647e-06, "loss": 0.1059, "num_input_tokens_seen": 39864288, "step": 30755 }, { "epoch": 1.502968265213886, "grad_norm": 0.38175177574157715, "learning_rate": 6.300558188549882e-06, "loss": 0.0785, "num_input_tokens_seen": 39870336, "step": 30760 }, { "epoch": 1.5032125668775804, "grad_norm": 0.20936930179595947, "learning_rate": 6.29404353651403e-06, "loss": 0.096, "num_input_tokens_seen": 39876672, "step": 30765 }, { "epoch": 1.5034568685412748, "grad_norm": 0.2895045578479767, "learning_rate": 6.287531769171737e-06, "loss": 0.0714, "num_input_tokens_seen": 39883168, "step": 30770 }, { "epoch": 1.5037011702049692, "grad_norm": 0.2095676213502884, "learning_rate": 6.2810228875272045e-06, "loss": 0.0791, "num_input_tokens_seen": 39889952, "step": 30775 }, { "epoch": 1.5039454718686636, "grad_norm": 0.1989532709121704, "learning_rate": 6.274516892584179e-06, "loss": 0.0919, "num_input_tokens_seen": 39896416, "step": 30780 }, { "epoch": 1.5041897735323577, "grad_norm": 0.34436193108558655, "learning_rate": 6.268013785345969e-06, "loss": 0.0642, "num_input_tokens_seen": 39902624, "step": 30785 }, { "epoch": 1.504434075196052, "grad_norm": 0.1980816125869751, "learning_rate": 6.26151356681543e-06, "loss": 0.062, "num_input_tokens_seen": 39909408, "step": 30790 }, { "epoch": 1.5046783768597463, "grad_norm": 0.17229750752449036, "learning_rate": 6.255016237994981e-06, "loss": 0.0819, "num_input_tokens_seen": 39915584, "step": 30795 }, { "epoch": 1.5049226785234406, "grad_norm": 0.5756641030311584, "learning_rate": 6.248521799886603e-06, "loss": 0.077, "num_input_tokens_seen": 39922240, "step": 30800 }, { "epoch": 1.5049226785234406, "eval_loss": 0.08754429966211319, "eval_runtime": 374.7094, "eval_samples_per_second": 97.102, "eval_steps_per_second": 24.277, "num_input_tokens_seen": 39922240, "step": 30800 }, { "epoch": 1.505166980187135, "grad_norm": 0.2568020820617676, "learning_rate": 6.242030253491798e-06, "loss": 0.0794, "num_input_tokens_seen": 39928864, "step": 30805 }, { "epoch": 1.5054112818508294, "grad_norm": 0.2577119767665863, "learning_rate": 6.235541599811656e-06, "loss": 0.0801, "num_input_tokens_seen": 39935584, "step": 30810 }, { "epoch": 1.5056555835145238, "grad_norm": 0.9542317390441895, "learning_rate": 6.229055839846814e-06, "loss": 0.1168, "num_input_tokens_seen": 39942304, "step": 30815 }, { "epoch": 1.5058998851782182, "grad_norm": 0.23298589885234833, "learning_rate": 6.222572974597455e-06, "loss": 0.0732, "num_input_tokens_seen": 39949152, "step": 30820 }, { "epoch": 1.5061441868419125, "grad_norm": 0.23491904139518738, "learning_rate": 6.216093005063306e-06, "loss": 0.0922, "num_input_tokens_seen": 39955616, "step": 30825 }, { "epoch": 1.5063884885056067, "grad_norm": 0.4183814227581024, "learning_rate": 6.209615932243678e-06, "loss": 0.0866, "num_input_tokens_seen": 39962144, "step": 30830 }, { "epoch": 1.506632790169301, "grad_norm": 0.2112593650817871, "learning_rate": 6.203141757137399e-06, "loss": 0.0694, "num_input_tokens_seen": 39969024, "step": 30835 }, { "epoch": 1.5068770918329952, "grad_norm": 0.26769232749938965, "learning_rate": 6.196670480742886e-06, "loss": 0.1087, "num_input_tokens_seen": 39975648, "step": 30840 }, { "epoch": 1.5071213934966896, "grad_norm": 0.11673641204833984, "learning_rate": 6.190202104058074e-06, "loss": 0.0698, "num_input_tokens_seen": 39981888, "step": 30845 }, { "epoch": 1.507365695160384, "grad_norm": 0.19085903465747833, "learning_rate": 6.183736628080475e-06, "loss": 0.089, "num_input_tokens_seen": 39988320, "step": 30850 }, { "epoch": 1.5076099968240784, "grad_norm": 0.1746114045381546, "learning_rate": 6.177274053807155e-06, "loss": 0.0914, "num_input_tokens_seen": 39994624, "step": 30855 }, { "epoch": 1.5078542984877727, "grad_norm": 0.30951520800590515, "learning_rate": 6.170814382234713e-06, "loss": 0.0962, "num_input_tokens_seen": 40000928, "step": 30860 }, { "epoch": 1.5080986001514671, "grad_norm": 0.3951529860496521, "learning_rate": 6.16435761435932e-06, "loss": 0.1014, "num_input_tokens_seen": 40006944, "step": 30865 }, { "epoch": 1.5083429018151615, "grad_norm": 0.4215184450149536, "learning_rate": 6.157903751176681e-06, "loss": 0.0673, "num_input_tokens_seen": 40013184, "step": 30870 }, { "epoch": 1.5085872034788557, "grad_norm": 0.4108315110206604, "learning_rate": 6.151452793682066e-06, "loss": 0.1081, "num_input_tokens_seen": 40019808, "step": 30875 }, { "epoch": 1.50883150514255, "grad_norm": 0.27102598547935486, "learning_rate": 6.145004742870305e-06, "loss": 0.074, "num_input_tokens_seen": 40026560, "step": 30880 }, { "epoch": 1.5090758068062442, "grad_norm": 0.5521323084831238, "learning_rate": 6.138559599735752e-06, "loss": 0.0803, "num_input_tokens_seen": 40033024, "step": 30885 }, { "epoch": 1.5093201084699386, "grad_norm": 0.5203030705451965, "learning_rate": 6.132117365272344e-06, "loss": 0.0705, "num_input_tokens_seen": 40039520, "step": 30890 }, { "epoch": 1.509564410133633, "grad_norm": 0.1724083125591278, "learning_rate": 6.125678040473545e-06, "loss": 0.0821, "num_input_tokens_seen": 40045568, "step": 30895 }, { "epoch": 1.5098087117973273, "grad_norm": 0.1891975998878479, "learning_rate": 6.1192416263323755e-06, "loss": 0.0863, "num_input_tokens_seen": 40051648, "step": 30900 }, { "epoch": 1.5100530134610217, "grad_norm": 0.1824701428413391, "learning_rate": 6.112808123841424e-06, "loss": 0.0649, "num_input_tokens_seen": 40058080, "step": 30905 }, { "epoch": 1.510297315124716, "grad_norm": 0.14342385530471802, "learning_rate": 6.106377533992805e-06, "loss": 0.1043, "num_input_tokens_seen": 40064096, "step": 30910 }, { "epoch": 1.5105416167884105, "grad_norm": 0.07518810033798218, "learning_rate": 6.099949857778204e-06, "loss": 0.057, "num_input_tokens_seen": 40070592, "step": 30915 }, { "epoch": 1.5107859184521046, "grad_norm": 0.31913650035858154, "learning_rate": 6.093525096188852e-06, "loss": 0.1032, "num_input_tokens_seen": 40076672, "step": 30920 }, { "epoch": 1.511030220115799, "grad_norm": 0.2718987464904785, "learning_rate": 6.087103250215518e-06, "loss": 0.0923, "num_input_tokens_seen": 40083424, "step": 30925 }, { "epoch": 1.5112745217794932, "grad_norm": 0.21492834389209747, "learning_rate": 6.080684320848537e-06, "loss": 0.0599, "num_input_tokens_seen": 40089728, "step": 30930 }, { "epoch": 1.5115188234431876, "grad_norm": 0.5093543529510498, "learning_rate": 6.074268309077794e-06, "loss": 0.1032, "num_input_tokens_seen": 40096768, "step": 30935 }, { "epoch": 1.511763125106882, "grad_norm": 0.43726810812950134, "learning_rate": 6.067855215892709e-06, "loss": 0.0693, "num_input_tokens_seen": 40103648, "step": 30940 }, { "epoch": 1.5120074267705763, "grad_norm": 0.6484348773956299, "learning_rate": 6.061445042282271e-06, "loss": 0.0869, "num_input_tokens_seen": 40110144, "step": 30945 }, { "epoch": 1.5122517284342707, "grad_norm": 0.29440197348594666, "learning_rate": 6.055037789234999e-06, "loss": 0.0777, "num_input_tokens_seen": 40116160, "step": 30950 }, { "epoch": 1.512496030097965, "grad_norm": 0.21705207228660583, "learning_rate": 6.048633457738975e-06, "loss": 0.0702, "num_input_tokens_seen": 40122784, "step": 30955 }, { "epoch": 1.5127403317616595, "grad_norm": 0.239182710647583, "learning_rate": 6.042232048781837e-06, "loss": 0.0755, "num_input_tokens_seen": 40129504, "step": 30960 }, { "epoch": 1.5129846334253536, "grad_norm": 0.4413195252418518, "learning_rate": 6.035833563350757e-06, "loss": 0.0837, "num_input_tokens_seen": 40136352, "step": 30965 }, { "epoch": 1.513228935089048, "grad_norm": 0.4247397780418396, "learning_rate": 6.0294380024324525e-06, "loss": 0.087, "num_input_tokens_seen": 40142848, "step": 30970 }, { "epoch": 1.5134732367527421, "grad_norm": 0.4372544288635254, "learning_rate": 6.023045367013213e-06, "loss": 0.1039, "num_input_tokens_seen": 40149376, "step": 30975 }, { "epoch": 1.5137175384164365, "grad_norm": 0.4564843475818634, "learning_rate": 6.016655658078851e-06, "loss": 0.1068, "num_input_tokens_seen": 40155872, "step": 30980 }, { "epoch": 1.513961840080131, "grad_norm": 0.15054737031459808, "learning_rate": 6.010268876614753e-06, "loss": 0.0659, "num_input_tokens_seen": 40162144, "step": 30985 }, { "epoch": 1.5142061417438253, "grad_norm": 0.8738399147987366, "learning_rate": 6.0038850236058266e-06, "loss": 0.0843, "num_input_tokens_seen": 40168800, "step": 30990 }, { "epoch": 1.5144504434075197, "grad_norm": 0.4097699522972107, "learning_rate": 5.997504100036549e-06, "loss": 0.0845, "num_input_tokens_seen": 40175264, "step": 30995 }, { "epoch": 1.514694745071214, "grad_norm": 0.3146612048149109, "learning_rate": 5.991126106890949e-06, "loss": 0.0779, "num_input_tokens_seen": 40181504, "step": 31000 }, { "epoch": 1.514694745071214, "eval_loss": 0.08757063746452332, "eval_runtime": 375.1522, "eval_samples_per_second": 96.987, "eval_steps_per_second": 24.249, "num_input_tokens_seen": 40181504, "step": 31000 }, { "epoch": 1.5149390467349084, "grad_norm": 0.1385117620229721, "learning_rate": 5.984751045152576e-06, "loss": 0.0854, "num_input_tokens_seen": 40187968, "step": 31005 }, { "epoch": 1.5151833483986026, "grad_norm": 0.2034965455532074, "learning_rate": 5.978378915804553e-06, "loss": 0.0843, "num_input_tokens_seen": 40195136, "step": 31010 }, { "epoch": 1.515427650062297, "grad_norm": 0.1953563094139099, "learning_rate": 5.972009719829547e-06, "loss": 0.1019, "num_input_tokens_seen": 40201312, "step": 31015 }, { "epoch": 1.5156719517259911, "grad_norm": 0.7006341218948364, "learning_rate": 5.965643458209755e-06, "loss": 0.1016, "num_input_tokens_seen": 40207520, "step": 31020 }, { "epoch": 1.5159162533896855, "grad_norm": 0.38165903091430664, "learning_rate": 5.95928013192695e-06, "loss": 0.1, "num_input_tokens_seen": 40214016, "step": 31025 }, { "epoch": 1.5161605550533799, "grad_norm": 0.21761316061019897, "learning_rate": 5.952919741962423e-06, "loss": 0.0742, "num_input_tokens_seen": 40220352, "step": 31030 }, { "epoch": 1.5164048567170743, "grad_norm": 0.4820622205734253, "learning_rate": 5.946562289297042e-06, "loss": 0.0798, "num_input_tokens_seen": 40226976, "step": 31035 }, { "epoch": 1.5166491583807686, "grad_norm": 0.20113810896873474, "learning_rate": 5.9402077749111855e-06, "loss": 0.0749, "num_input_tokens_seen": 40233952, "step": 31040 }, { "epoch": 1.516893460044463, "grad_norm": 0.30253711342811584, "learning_rate": 5.933856199784821e-06, "loss": 0.0644, "num_input_tokens_seen": 40240736, "step": 31045 }, { "epoch": 1.5171377617081574, "grad_norm": 0.5611027479171753, "learning_rate": 5.927507564897419e-06, "loss": 0.0979, "num_input_tokens_seen": 40247232, "step": 31050 }, { "epoch": 1.5173820633718516, "grad_norm": 0.5497592687606812, "learning_rate": 5.9211618712280395e-06, "loss": 0.0918, "num_input_tokens_seen": 40253920, "step": 31055 }, { "epoch": 1.517626365035546, "grad_norm": 0.28830990195274353, "learning_rate": 5.914819119755255e-06, "loss": 0.0896, "num_input_tokens_seen": 40260064, "step": 31060 }, { "epoch": 1.51787066669924, "grad_norm": 0.18695171177387238, "learning_rate": 5.908479311457205e-06, "loss": 0.0767, "num_input_tokens_seen": 40266464, "step": 31065 }, { "epoch": 1.5181149683629345, "grad_norm": 0.5446522235870361, "learning_rate": 5.902142447311559e-06, "loss": 0.11, "num_input_tokens_seen": 40272928, "step": 31070 }, { "epoch": 1.5183592700266288, "grad_norm": 0.4591379761695862, "learning_rate": 5.895808528295546e-06, "loss": 0.082, "num_input_tokens_seen": 40279136, "step": 31075 }, { "epoch": 1.5186035716903232, "grad_norm": 0.47075894474983215, "learning_rate": 5.889477555385941e-06, "loss": 0.1317, "num_input_tokens_seen": 40285760, "step": 31080 }, { "epoch": 1.5188478733540176, "grad_norm": 0.5118090510368347, "learning_rate": 5.883149529559051e-06, "loss": 0.1119, "num_input_tokens_seen": 40291904, "step": 31085 }, { "epoch": 1.519092175017712, "grad_norm": 0.1831393986940384, "learning_rate": 5.876824451790738e-06, "loss": 0.0955, "num_input_tokens_seen": 40298272, "step": 31090 }, { "epoch": 1.5193364766814061, "grad_norm": 0.17588776350021362, "learning_rate": 5.87050232305642e-06, "loss": 0.0728, "num_input_tokens_seen": 40304576, "step": 31095 }, { "epoch": 1.5195807783451005, "grad_norm": 0.635358989238739, "learning_rate": 5.864183144331034e-06, "loss": 0.1049, "num_input_tokens_seen": 40310944, "step": 31100 }, { "epoch": 1.519825080008795, "grad_norm": 0.36612361669540405, "learning_rate": 5.857866916589089e-06, "loss": 0.0847, "num_input_tokens_seen": 40317152, "step": 31105 }, { "epoch": 1.520069381672489, "grad_norm": 0.26893919706344604, "learning_rate": 5.8515536408046216e-06, "loss": 0.0775, "num_input_tokens_seen": 40323296, "step": 31110 }, { "epoch": 1.5203136833361834, "grad_norm": 0.24079202115535736, "learning_rate": 5.845243317951208e-06, "loss": 0.098, "num_input_tokens_seen": 40329792, "step": 31115 }, { "epoch": 1.5205579849998778, "grad_norm": 0.3588363826274872, "learning_rate": 5.838935949001997e-06, "loss": 0.0901, "num_input_tokens_seen": 40336288, "step": 31120 }, { "epoch": 1.5208022866635722, "grad_norm": 0.6879532933235168, "learning_rate": 5.8326315349296476e-06, "loss": 0.0748, "num_input_tokens_seen": 40342880, "step": 31125 }, { "epoch": 1.5210465883272666, "grad_norm": 0.26787784695625305, "learning_rate": 5.826330076706396e-06, "loss": 0.0584, "num_input_tokens_seen": 40349152, "step": 31130 }, { "epoch": 1.521290889990961, "grad_norm": 0.2561705410480499, "learning_rate": 5.820031575303988e-06, "loss": 0.0781, "num_input_tokens_seen": 40355424, "step": 31135 }, { "epoch": 1.5215351916546551, "grad_norm": 0.19165296852588654, "learning_rate": 5.813736031693745e-06, "loss": 0.078, "num_input_tokens_seen": 40362304, "step": 31140 }, { "epoch": 1.5217794933183495, "grad_norm": 0.32403919100761414, "learning_rate": 5.807443446846522e-06, "loss": 0.0779, "num_input_tokens_seen": 40369088, "step": 31145 }, { "epoch": 1.5220237949820439, "grad_norm": 0.15686197578907013, "learning_rate": 5.801153821732699e-06, "loss": 0.0752, "num_input_tokens_seen": 40375808, "step": 31150 }, { "epoch": 1.522268096645738, "grad_norm": 0.34823575615882874, "learning_rate": 5.794867157322229e-06, "loss": 0.0994, "num_input_tokens_seen": 40382400, "step": 31155 }, { "epoch": 1.5225123983094324, "grad_norm": 0.25285184383392334, "learning_rate": 5.788583454584593e-06, "loss": 0.0861, "num_input_tokens_seen": 40388928, "step": 31160 }, { "epoch": 1.5227566999731268, "grad_norm": 0.2870138883590698, "learning_rate": 5.7823027144888075e-06, "loss": 0.0843, "num_input_tokens_seen": 40395808, "step": 31165 }, { "epoch": 1.5230010016368212, "grad_norm": 0.30889326333999634, "learning_rate": 5.776024938003455e-06, "loss": 0.1, "num_input_tokens_seen": 40402016, "step": 31170 }, { "epoch": 1.5232453033005156, "grad_norm": 0.5747591257095337, "learning_rate": 5.7697501260966345e-06, "loss": 0.0749, "num_input_tokens_seen": 40408224, "step": 31175 }, { "epoch": 1.52348960496421, "grad_norm": 0.47047722339630127, "learning_rate": 5.7634782797360145e-06, "loss": 0.0848, "num_input_tokens_seen": 40414368, "step": 31180 }, { "epoch": 1.523733906627904, "grad_norm": 0.2487754374742508, "learning_rate": 5.757209399888777e-06, "loss": 0.0784, "num_input_tokens_seen": 40421152, "step": 31185 }, { "epoch": 1.5239782082915985, "grad_norm": 0.3364574611186981, "learning_rate": 5.750943487521679e-06, "loss": 0.0801, "num_input_tokens_seen": 40427072, "step": 31190 }, { "epoch": 1.5242225099552928, "grad_norm": 0.19594526290893555, "learning_rate": 5.744680543600986e-06, "loss": 0.0858, "num_input_tokens_seen": 40433376, "step": 31195 }, { "epoch": 1.524466811618987, "grad_norm": 0.19828790426254272, "learning_rate": 5.738420569092537e-06, "loss": 0.0865, "num_input_tokens_seen": 40439712, "step": 31200 }, { "epoch": 1.524466811618987, "eval_loss": 0.0876358300447464, "eval_runtime": 374.9818, "eval_samples_per_second": 97.031, "eval_steps_per_second": 24.26, "num_input_tokens_seen": 40439712, "step": 31200 }, { "epoch": 1.5247111132826814, "grad_norm": 0.7383701801300049, "learning_rate": 5.732163564961684e-06, "loss": 0.0847, "num_input_tokens_seen": 40445856, "step": 31205 }, { "epoch": 1.5249554149463758, "grad_norm": 0.29023462533950806, "learning_rate": 5.725909532173354e-06, "loss": 0.0945, "num_input_tokens_seen": 40452576, "step": 31210 }, { "epoch": 1.5251997166100701, "grad_norm": 0.22776257991790771, "learning_rate": 5.719658471691977e-06, "loss": 0.074, "num_input_tokens_seen": 40459296, "step": 31215 }, { "epoch": 1.5254440182737645, "grad_norm": 0.3375069797039032, "learning_rate": 5.71341038448156e-06, "loss": 0.0972, "num_input_tokens_seen": 40465536, "step": 31220 }, { "epoch": 1.525688319937459, "grad_norm": 0.19943244755268097, "learning_rate": 5.707165271505635e-06, "loss": 0.0712, "num_input_tokens_seen": 40472096, "step": 31225 }, { "epoch": 1.525932621601153, "grad_norm": 0.5209032297134399, "learning_rate": 5.700923133727271e-06, "loss": 0.0807, "num_input_tokens_seen": 40478528, "step": 31230 }, { "epoch": 1.5261769232648474, "grad_norm": 0.28549569845199585, "learning_rate": 5.694683972109083e-06, "loss": 0.0953, "num_input_tokens_seen": 40484512, "step": 31235 }, { "epoch": 1.5264212249285416, "grad_norm": 0.5351450443267822, "learning_rate": 5.688447787613241e-06, "loss": 0.0677, "num_input_tokens_seen": 40491424, "step": 31240 }, { "epoch": 1.526665526592236, "grad_norm": 0.2415771782398224, "learning_rate": 5.6822145812014285e-06, "loss": 0.0657, "num_input_tokens_seen": 40497696, "step": 31245 }, { "epoch": 1.5269098282559304, "grad_norm": 0.23593732714653015, "learning_rate": 5.675984353834896e-06, "loss": 0.0925, "num_input_tokens_seen": 40503872, "step": 31250 }, { "epoch": 1.5271541299196247, "grad_norm": 0.1565151810646057, "learning_rate": 5.66975710647441e-06, "loss": 0.0843, "num_input_tokens_seen": 40510176, "step": 31255 }, { "epoch": 1.5273984315833191, "grad_norm": 0.3671410381793976, "learning_rate": 5.663532840080304e-06, "loss": 0.0677, "num_input_tokens_seen": 40516832, "step": 31260 }, { "epoch": 1.5276427332470135, "grad_norm": 0.17596395313739777, "learning_rate": 5.6573115556124325e-06, "loss": 0.0897, "num_input_tokens_seen": 40523104, "step": 31265 }, { "epoch": 1.5278870349107079, "grad_norm": 0.6271832585334778, "learning_rate": 5.651093254030185e-06, "loss": 0.0983, "num_input_tokens_seen": 40529056, "step": 31270 }, { "epoch": 1.528131336574402, "grad_norm": 0.2584560513496399, "learning_rate": 5.644877936292514e-06, "loss": 0.1028, "num_input_tokens_seen": 40535136, "step": 31275 }, { "epoch": 1.5283756382380964, "grad_norm": 0.18662334978580475, "learning_rate": 5.638665603357901e-06, "loss": 0.1112, "num_input_tokens_seen": 40542016, "step": 31280 }, { "epoch": 1.5286199399017906, "grad_norm": 0.1787693053483963, "learning_rate": 5.632456256184357e-06, "loss": 0.1149, "num_input_tokens_seen": 40548448, "step": 31285 }, { "epoch": 1.528864241565485, "grad_norm": 0.2573125958442688, "learning_rate": 5.626249895729452e-06, "loss": 0.0695, "num_input_tokens_seen": 40555008, "step": 31290 }, { "epoch": 1.5291085432291793, "grad_norm": 0.20329990983009338, "learning_rate": 5.620046522950273e-06, "loss": 0.0834, "num_input_tokens_seen": 40561472, "step": 31295 }, { "epoch": 1.5293528448928737, "grad_norm": 0.17139685153961182, "learning_rate": 5.613846138803464e-06, "loss": 0.0753, "num_input_tokens_seen": 40567936, "step": 31300 }, { "epoch": 1.529597146556568, "grad_norm": 0.402304083108902, "learning_rate": 5.607648744245206e-06, "loss": 0.1001, "num_input_tokens_seen": 40574912, "step": 31305 }, { "epoch": 1.5298414482202625, "grad_norm": 0.21221564710140228, "learning_rate": 5.601454340231207e-06, "loss": 0.087, "num_input_tokens_seen": 40581216, "step": 31310 }, { "epoch": 1.5300857498839568, "grad_norm": 0.44729626178741455, "learning_rate": 5.595262927716724e-06, "loss": 0.0947, "num_input_tokens_seen": 40588032, "step": 31315 }, { "epoch": 1.530330051547651, "grad_norm": 0.22644561529159546, "learning_rate": 5.589074507656561e-06, "loss": 0.1054, "num_input_tokens_seen": 40594432, "step": 31320 }, { "epoch": 1.5305743532113454, "grad_norm": 0.28947997093200684, "learning_rate": 5.582889081005044e-06, "loss": 0.0812, "num_input_tokens_seen": 40600832, "step": 31325 }, { "epoch": 1.5308186548750395, "grad_norm": 0.13385720551013947, "learning_rate": 5.5767066487160316e-06, "loss": 0.1022, "num_input_tokens_seen": 40607488, "step": 31330 }, { "epoch": 1.531062956538734, "grad_norm": 0.5336827635765076, "learning_rate": 5.570527211742949e-06, "loss": 0.096, "num_input_tokens_seen": 40614432, "step": 31335 }, { "epoch": 1.5313072582024283, "grad_norm": 0.40397578477859497, "learning_rate": 5.564350771038731e-06, "loss": 0.0942, "num_input_tokens_seen": 40621504, "step": 31340 }, { "epoch": 1.5315515598661227, "grad_norm": 0.23860220611095428, "learning_rate": 5.558177327555875e-06, "loss": 0.1106, "num_input_tokens_seen": 40627872, "step": 31345 }, { "epoch": 1.531795861529817, "grad_norm": 0.6036118865013123, "learning_rate": 5.552006882246388e-06, "loss": 0.1018, "num_input_tokens_seen": 40634048, "step": 31350 }, { "epoch": 1.5320401631935114, "grad_norm": 0.18986831605434418, "learning_rate": 5.545839436061839e-06, "loss": 0.071, "num_input_tokens_seen": 40640640, "step": 31355 }, { "epoch": 1.5322844648572058, "grad_norm": 0.3132821321487427, "learning_rate": 5.539674989953331e-06, "loss": 0.0925, "num_input_tokens_seen": 40646944, "step": 31360 }, { "epoch": 1.5325287665209, "grad_norm": 0.32419461011886597, "learning_rate": 5.533513544871488e-06, "loss": 0.0854, "num_input_tokens_seen": 40652992, "step": 31365 }, { "epoch": 1.5327730681845944, "grad_norm": 0.17134486138820648, "learning_rate": 5.527355101766493e-06, "loss": 0.088, "num_input_tokens_seen": 40659584, "step": 31370 }, { "epoch": 1.5330173698482885, "grad_norm": 0.47321370244026184, "learning_rate": 5.521199661588044e-06, "loss": 0.0672, "num_input_tokens_seen": 40666272, "step": 31375 }, { "epoch": 1.533261671511983, "grad_norm": 0.19869697093963623, "learning_rate": 5.5150472252853944e-06, "loss": 0.0914, "num_input_tokens_seen": 40673088, "step": 31380 }, { "epoch": 1.5335059731756773, "grad_norm": 0.16303879022598267, "learning_rate": 5.50889779380733e-06, "loss": 0.0745, "num_input_tokens_seen": 40680864, "step": 31385 }, { "epoch": 1.5337502748393717, "grad_norm": 0.3401637077331543, "learning_rate": 5.5027513681021605e-06, "loss": 0.0657, "num_input_tokens_seen": 40687072, "step": 31390 }, { "epoch": 1.533994576503066, "grad_norm": 0.4616234004497528, "learning_rate": 5.4966079491177545e-06, "loss": 0.1171, "num_input_tokens_seen": 40693632, "step": 31395 }, { "epoch": 1.5342388781667604, "grad_norm": 0.12407256662845612, "learning_rate": 5.490467537801491e-06, "loss": 0.0571, "num_input_tokens_seen": 40700736, "step": 31400 }, { "epoch": 1.5342388781667604, "eval_loss": 0.08740352094173431, "eval_runtime": 374.4116, "eval_samples_per_second": 97.179, "eval_steps_per_second": 24.297, "num_input_tokens_seen": 40700736, "step": 31400 }, { "epoch": 1.5344831798304548, "grad_norm": 0.31025272607803345, "learning_rate": 5.484330135100313e-06, "loss": 0.0738, "num_input_tokens_seen": 40707424, "step": 31405 }, { "epoch": 1.534727481494149, "grad_norm": 0.1606035679578781, "learning_rate": 5.4781957419606785e-06, "loss": 0.1128, "num_input_tokens_seen": 40714400, "step": 31410 }, { "epoch": 1.5349717831578433, "grad_norm": 0.29035186767578125, "learning_rate": 5.472064359328577e-06, "loss": 0.0556, "num_input_tokens_seen": 40720928, "step": 31415 }, { "epoch": 1.5352160848215375, "grad_norm": 0.2800413966178894, "learning_rate": 5.4659359881495565e-06, "loss": 0.0675, "num_input_tokens_seen": 40727680, "step": 31420 }, { "epoch": 1.5354603864852319, "grad_norm": 0.2566192150115967, "learning_rate": 5.4598106293686916e-06, "loss": 0.084, "num_input_tokens_seen": 40733888, "step": 31425 }, { "epoch": 1.5357046881489262, "grad_norm": 0.3568838834762573, "learning_rate": 5.45368828393058e-06, "loss": 0.1119, "num_input_tokens_seen": 40740928, "step": 31430 }, { "epoch": 1.5359489898126206, "grad_norm": 0.27284231781959534, "learning_rate": 5.44756895277937e-06, "loss": 0.0991, "num_input_tokens_seen": 40747584, "step": 31435 }, { "epoch": 1.536193291476315, "grad_norm": 0.1431482583284378, "learning_rate": 5.441452636858746e-06, "loss": 0.062, "num_input_tokens_seen": 40753888, "step": 31440 }, { "epoch": 1.5364375931400094, "grad_norm": 0.5751237273216248, "learning_rate": 5.435339337111905e-06, "loss": 0.0954, "num_input_tokens_seen": 40760192, "step": 31445 }, { "epoch": 1.5366818948037038, "grad_norm": 0.6661998629570007, "learning_rate": 5.42922905448161e-06, "loss": 0.0952, "num_input_tokens_seen": 40766880, "step": 31450 }, { "epoch": 1.536926196467398, "grad_norm": 0.8479970097541809, "learning_rate": 5.423121789910129e-06, "loss": 0.0864, "num_input_tokens_seen": 40773056, "step": 31455 }, { "epoch": 1.5371704981310923, "grad_norm": 0.6938992738723755, "learning_rate": 5.417017544339287e-06, "loss": 0.0938, "num_input_tokens_seen": 40779136, "step": 31460 }, { "epoch": 1.5374147997947865, "grad_norm": 0.33937129378318787, "learning_rate": 5.410916318710443e-06, "loss": 0.0765, "num_input_tokens_seen": 40785792, "step": 31465 }, { "epoch": 1.5376591014584808, "grad_norm": 0.18724088370800018, "learning_rate": 5.404818113964466e-06, "loss": 0.0908, "num_input_tokens_seen": 40792032, "step": 31470 }, { "epoch": 1.5379034031221752, "grad_norm": 0.1779729127883911, "learning_rate": 5.398722931041792e-06, "loss": 0.0789, "num_input_tokens_seen": 40798688, "step": 31475 }, { "epoch": 1.5381477047858696, "grad_norm": 0.5386911034584045, "learning_rate": 5.392630770882367e-06, "loss": 0.0801, "num_input_tokens_seen": 40805504, "step": 31480 }, { "epoch": 1.538392006449564, "grad_norm": 0.43434938788414, "learning_rate": 5.3865416344256705e-06, "loss": 0.0766, "num_input_tokens_seen": 40813312, "step": 31485 }, { "epoch": 1.5386363081132584, "grad_norm": 0.21386265754699707, "learning_rate": 5.380455522610742e-06, "loss": 0.1115, "num_input_tokens_seen": 40819840, "step": 31490 }, { "epoch": 1.5388806097769527, "grad_norm": 0.20707781612873077, "learning_rate": 5.374372436376116e-06, "loss": 0.074, "num_input_tokens_seen": 40826432, "step": 31495 }, { "epoch": 1.539124911440647, "grad_norm": 0.5672417283058167, "learning_rate": 5.368292376659895e-06, "loss": 0.1087, "num_input_tokens_seen": 40832992, "step": 31500 }, { "epoch": 1.5393692131043413, "grad_norm": 0.8017306923866272, "learning_rate": 5.362215344399701e-06, "loss": 0.0538, "num_input_tokens_seen": 40839232, "step": 31505 }, { "epoch": 1.5396135147680354, "grad_norm": 0.12761551141738892, "learning_rate": 5.356141340532678e-06, "loss": 0.0667, "num_input_tokens_seen": 40845696, "step": 31510 }, { "epoch": 1.5398578164317298, "grad_norm": 0.32111042737960815, "learning_rate": 5.350070365995522e-06, "loss": 0.0789, "num_input_tokens_seen": 40852096, "step": 31515 }, { "epoch": 1.5401021180954242, "grad_norm": 0.20495614409446716, "learning_rate": 5.344002421724459e-06, "loss": 0.0815, "num_input_tokens_seen": 40858592, "step": 31520 }, { "epoch": 1.5403464197591186, "grad_norm": 0.5393933057785034, "learning_rate": 5.337937508655228e-06, "loss": 0.0809, "num_input_tokens_seen": 40865696, "step": 31525 }, { "epoch": 1.540590721422813, "grad_norm": 0.3962903916835785, "learning_rate": 5.331875627723126e-06, "loss": 0.0846, "num_input_tokens_seen": 40872288, "step": 31530 }, { "epoch": 1.5408350230865073, "grad_norm": 0.3790434002876282, "learning_rate": 5.325816779862963e-06, "loss": 0.0715, "num_input_tokens_seen": 40878560, "step": 31535 }, { "epoch": 1.5410793247502017, "grad_norm": 0.45432621240615845, "learning_rate": 5.319760966009102e-06, "loss": 0.1124, "num_input_tokens_seen": 40884800, "step": 31540 }, { "epoch": 1.5413236264138959, "grad_norm": 0.26235535740852356, "learning_rate": 5.3137081870954096e-06, "loss": 0.0745, "num_input_tokens_seen": 40891552, "step": 31545 }, { "epoch": 1.5415679280775902, "grad_norm": 0.2888689935207367, "learning_rate": 5.307658444055313e-06, "loss": 0.0686, "num_input_tokens_seen": 40897952, "step": 31550 }, { "epoch": 1.5418122297412844, "grad_norm": 0.41594767570495605, "learning_rate": 5.301611737821749e-06, "loss": 0.106, "num_input_tokens_seen": 40905088, "step": 31555 }, { "epoch": 1.5420565314049788, "grad_norm": 0.31338682770729065, "learning_rate": 5.295568069327206e-06, "loss": 0.0886, "num_input_tokens_seen": 40911584, "step": 31560 }, { "epoch": 1.5423008330686732, "grad_norm": 0.33225125074386597, "learning_rate": 5.289527439503683e-06, "loss": 0.0832, "num_input_tokens_seen": 40917920, "step": 31565 }, { "epoch": 1.5425451347323675, "grad_norm": 0.45728233456611633, "learning_rate": 5.28348984928273e-06, "loss": 0.0829, "num_input_tokens_seen": 40924288, "step": 31570 }, { "epoch": 1.542789436396062, "grad_norm": 0.5764654278755188, "learning_rate": 5.27745529959541e-06, "loss": 0.0977, "num_input_tokens_seen": 40930464, "step": 31575 }, { "epoch": 1.5430337380597563, "grad_norm": 0.12928389012813568, "learning_rate": 5.271423791372335e-06, "loss": 0.063, "num_input_tokens_seen": 40936960, "step": 31580 }, { "epoch": 1.5432780397234507, "grad_norm": 0.29145872592926025, "learning_rate": 5.26539532554364e-06, "loss": 0.0771, "num_input_tokens_seen": 40943392, "step": 31585 }, { "epoch": 1.5435223413871448, "grad_norm": 0.5477609634399414, "learning_rate": 5.25936990303898e-06, "loss": 0.0817, "num_input_tokens_seen": 40950176, "step": 31590 }, { "epoch": 1.5437666430508392, "grad_norm": 0.2039160430431366, "learning_rate": 5.253347524787555e-06, "loss": 0.0719, "num_input_tokens_seen": 40956736, "step": 31595 }, { "epoch": 1.5440109447145334, "grad_norm": 0.4618054926395416, "learning_rate": 5.2473281917181035e-06, "loss": 0.1024, "num_input_tokens_seen": 40963072, "step": 31600 }, { "epoch": 1.5440109447145334, "eval_loss": 0.0874054804444313, "eval_runtime": 374.473, "eval_samples_per_second": 97.163, "eval_steps_per_second": 24.293, "num_input_tokens_seen": 40963072, "step": 31600 }, { "epoch": 1.5442552463782278, "grad_norm": 0.2293204367160797, "learning_rate": 5.241311904758864e-06, "loss": 0.1011, "num_input_tokens_seen": 40969952, "step": 31605 }, { "epoch": 1.5444995480419221, "grad_norm": 0.4172016382217407, "learning_rate": 5.23529866483764e-06, "loss": 0.0865, "num_input_tokens_seen": 40976704, "step": 31610 }, { "epoch": 1.5447438497056165, "grad_norm": 0.385409414768219, "learning_rate": 5.229288472881732e-06, "loss": 0.0921, "num_input_tokens_seen": 40982528, "step": 31615 }, { "epoch": 1.544988151369311, "grad_norm": 0.38214850425720215, "learning_rate": 5.2232813298180025e-06, "loss": 0.0763, "num_input_tokens_seen": 40989568, "step": 31620 }, { "epoch": 1.5452324530330053, "grad_norm": 0.28485026955604553, "learning_rate": 5.217277236572824e-06, "loss": 0.0628, "num_input_tokens_seen": 40996192, "step": 31625 }, { "epoch": 1.5454767546966997, "grad_norm": 0.27933046221733093, "learning_rate": 5.211276194072093e-06, "loss": 0.0983, "num_input_tokens_seen": 41002784, "step": 31630 }, { "epoch": 1.5457210563603938, "grad_norm": 0.25447145104408264, "learning_rate": 5.205278203241254e-06, "loss": 0.099, "num_input_tokens_seen": 41009440, "step": 31635 }, { "epoch": 1.5459653580240882, "grad_norm": 0.3614826798439026, "learning_rate": 5.199283265005278e-06, "loss": 0.0951, "num_input_tokens_seen": 41015456, "step": 31640 }, { "epoch": 1.5462096596877823, "grad_norm": 0.3890031576156616, "learning_rate": 5.193291380288648e-06, "loss": 0.0796, "num_input_tokens_seen": 41021664, "step": 31645 }, { "epoch": 1.5464539613514767, "grad_norm": 0.43432602286338806, "learning_rate": 5.1873025500153995e-06, "loss": 0.1176, "num_input_tokens_seen": 41027776, "step": 31650 }, { "epoch": 1.546698263015171, "grad_norm": 0.44517597556114197, "learning_rate": 5.181316775109071e-06, "loss": 0.0881, "num_input_tokens_seen": 41034880, "step": 31655 }, { "epoch": 1.5469425646788655, "grad_norm": 0.3053857684135437, "learning_rate": 5.1753340564927564e-06, "loss": 0.0703, "num_input_tokens_seen": 41041568, "step": 31660 }, { "epoch": 1.5471868663425599, "grad_norm": 0.144788458943367, "learning_rate": 5.169354395089068e-06, "loss": 0.0832, "num_input_tokens_seen": 41047904, "step": 31665 }, { "epoch": 1.5474311680062542, "grad_norm": 0.17867203056812286, "learning_rate": 5.1633777918201346e-06, "loss": 0.0899, "num_input_tokens_seen": 41054240, "step": 31670 }, { "epoch": 1.5476754696699484, "grad_norm": 0.13587045669555664, "learning_rate": 5.157404247607625e-06, "loss": 0.0808, "num_input_tokens_seen": 41060640, "step": 31675 }, { "epoch": 1.5479197713336428, "grad_norm": 0.6090927720069885, "learning_rate": 5.1514337633727454e-06, "loss": 0.0644, "num_input_tokens_seen": 41067040, "step": 31680 }, { "epoch": 1.5481640729973372, "grad_norm": 0.37861064076423645, "learning_rate": 5.145466340036206e-06, "loss": 0.0597, "num_input_tokens_seen": 41073472, "step": 31685 }, { "epoch": 1.5484083746610313, "grad_norm": 0.28371718525886536, "learning_rate": 5.139501978518274e-06, "loss": 0.1246, "num_input_tokens_seen": 41080288, "step": 31690 }, { "epoch": 1.5486526763247257, "grad_norm": 0.16004697978496552, "learning_rate": 5.133540679738716e-06, "loss": 0.1136, "num_input_tokens_seen": 41086528, "step": 31695 }, { "epoch": 1.54889697798842, "grad_norm": 0.4552938938140869, "learning_rate": 5.127582444616838e-06, "loss": 0.0996, "num_input_tokens_seen": 41092768, "step": 31700 }, { "epoch": 1.5491412796521145, "grad_norm": 0.26625484228134155, "learning_rate": 5.121627274071486e-06, "loss": 0.1077, "num_input_tokens_seen": 41099360, "step": 31705 }, { "epoch": 1.5493855813158088, "grad_norm": 0.387442409992218, "learning_rate": 5.115675169021009e-06, "loss": 0.093, "num_input_tokens_seen": 41106592, "step": 31710 }, { "epoch": 1.5496298829795032, "grad_norm": 0.16238808631896973, "learning_rate": 5.1097261303832994e-06, "loss": 0.094, "num_input_tokens_seen": 41113376, "step": 31715 }, { "epoch": 1.5498741846431974, "grad_norm": 0.38307565450668335, "learning_rate": 5.103780159075788e-06, "loss": 0.1031, "num_input_tokens_seen": 41119776, "step": 31720 }, { "epoch": 1.5501184863068918, "grad_norm": 0.4462539851665497, "learning_rate": 5.0978372560154e-06, "loss": 0.0824, "num_input_tokens_seen": 41126208, "step": 31725 }, { "epoch": 1.5503627879705861, "grad_norm": 0.29576796293258667, "learning_rate": 5.091897422118619e-06, "loss": 0.1095, "num_input_tokens_seen": 41132512, "step": 31730 }, { "epoch": 1.5506070896342803, "grad_norm": 0.42843782901763916, "learning_rate": 5.0859606583014305e-06, "loss": 0.0878, "num_input_tokens_seen": 41139136, "step": 31735 }, { "epoch": 1.5508513912979747, "grad_norm": 0.3413260579109192, "learning_rate": 5.080026965479365e-06, "loss": 0.0838, "num_input_tokens_seen": 41146016, "step": 31740 }, { "epoch": 1.551095692961669, "grad_norm": 0.41452187299728394, "learning_rate": 5.074096344567475e-06, "loss": 0.0754, "num_input_tokens_seen": 41152224, "step": 31745 }, { "epoch": 1.5513399946253634, "grad_norm": 1.003495454788208, "learning_rate": 5.0681687964803294e-06, "loss": 0.1046, "num_input_tokens_seen": 41158976, "step": 31750 }, { "epoch": 1.5515842962890578, "grad_norm": 0.5487085580825806, "learning_rate": 5.06224432213204e-06, "loss": 0.1053, "num_input_tokens_seen": 41165376, "step": 31755 }, { "epoch": 1.5518285979527522, "grad_norm": 0.370324969291687, "learning_rate": 5.056322922436224e-06, "loss": 0.0928, "num_input_tokens_seen": 41172320, "step": 31760 }, { "epoch": 1.5520728996164463, "grad_norm": 0.5715389251708984, "learning_rate": 5.0504045983060465e-06, "loss": 0.0778, "num_input_tokens_seen": 41178752, "step": 31765 }, { "epoch": 1.5523172012801407, "grad_norm": 0.23072360455989838, "learning_rate": 5.044489350654183e-06, "loss": 0.0916, "num_input_tokens_seen": 41185312, "step": 31770 }, { "epoch": 1.5525615029438349, "grad_norm": 0.2050994485616684, "learning_rate": 5.038577180392831e-06, "loss": 0.0992, "num_input_tokens_seen": 41191840, "step": 31775 }, { "epoch": 1.5528058046075293, "grad_norm": 0.2641485035419464, "learning_rate": 5.032668088433729e-06, "loss": 0.091, "num_input_tokens_seen": 41198208, "step": 31780 }, { "epoch": 1.5530501062712236, "grad_norm": 0.2559092342853546, "learning_rate": 5.02676207568814e-06, "loss": 0.0954, "num_input_tokens_seen": 41204832, "step": 31785 }, { "epoch": 1.553294407934918, "grad_norm": 0.4190521538257599, "learning_rate": 5.02085914306683e-06, "loss": 0.0773, "num_input_tokens_seen": 41211584, "step": 31790 }, { "epoch": 1.5535387095986124, "grad_norm": 0.3531191647052765, "learning_rate": 5.014959291480123e-06, "loss": 0.1245, "num_input_tokens_seen": 41218080, "step": 31795 }, { "epoch": 1.5537830112623068, "grad_norm": 0.49511194229125977, "learning_rate": 5.009062521837835e-06, "loss": 0.08, "num_input_tokens_seen": 41224800, "step": 31800 }, { "epoch": 1.5537830112623068, "eval_loss": 0.08776792883872986, "eval_runtime": 375.0522, "eval_samples_per_second": 97.013, "eval_steps_per_second": 24.255, "num_input_tokens_seen": 41224800, "step": 31800 }, { "epoch": 1.5540273129260012, "grad_norm": 0.39514753222465515, "learning_rate": 5.003168835049324e-06, "loss": 0.0956, "num_input_tokens_seen": 41231680, "step": 31805 }, { "epoch": 1.5542716145896953, "grad_norm": 0.15450716018676758, "learning_rate": 4.997278232023483e-06, "loss": 0.0748, "num_input_tokens_seen": 41238784, "step": 31810 }, { "epoch": 1.5545159162533897, "grad_norm": 0.2154591679573059, "learning_rate": 4.9913907136687036e-06, "loss": 0.0982, "num_input_tokens_seen": 41245088, "step": 31815 }, { "epoch": 1.5547602179170839, "grad_norm": 0.13843046128749847, "learning_rate": 4.985506280892918e-06, "loss": 0.0829, "num_input_tokens_seen": 41251520, "step": 31820 }, { "epoch": 1.5550045195807782, "grad_norm": 0.455288827419281, "learning_rate": 4.979624934603589e-06, "loss": 0.0921, "num_input_tokens_seen": 41258016, "step": 31825 }, { "epoch": 1.5552488212444726, "grad_norm": 0.6055790185928345, "learning_rate": 4.97374667570768e-06, "loss": 0.1032, "num_input_tokens_seen": 41264768, "step": 31830 }, { "epoch": 1.555493122908167, "grad_norm": 0.42971381545066833, "learning_rate": 4.967871505111704e-06, "loss": 0.0872, "num_input_tokens_seen": 41271264, "step": 31835 }, { "epoch": 1.5557374245718614, "grad_norm": 0.2656867504119873, "learning_rate": 4.961999423721686e-06, "loss": 0.0616, "num_input_tokens_seen": 41277568, "step": 31840 }, { "epoch": 1.5559817262355558, "grad_norm": 0.20575036108493805, "learning_rate": 4.956130432443159e-06, "loss": 0.0994, "num_input_tokens_seen": 41283680, "step": 31845 }, { "epoch": 1.5562260278992501, "grad_norm": 0.451509028673172, "learning_rate": 4.950264532181215e-06, "loss": 0.0675, "num_input_tokens_seen": 41290720, "step": 31850 }, { "epoch": 1.5564703295629443, "grad_norm": 0.12399997562170029, "learning_rate": 4.944401723840433e-06, "loss": 0.0486, "num_input_tokens_seen": 41297664, "step": 31855 }, { "epoch": 1.5567146312266387, "grad_norm": 0.4852435886859894, "learning_rate": 4.938542008324942e-06, "loss": 0.0866, "num_input_tokens_seen": 41304032, "step": 31860 }, { "epoch": 1.5569589328903328, "grad_norm": 0.27338385581970215, "learning_rate": 4.9326853865383855e-06, "loss": 0.095, "num_input_tokens_seen": 41310368, "step": 31865 }, { "epoch": 1.5572032345540272, "grad_norm": 0.8686227798461914, "learning_rate": 4.926831859383918e-06, "loss": 0.0754, "num_input_tokens_seen": 41316736, "step": 31870 }, { "epoch": 1.5574475362177216, "grad_norm": 0.1837947517633438, "learning_rate": 4.92098142776424e-06, "loss": 0.0917, "num_input_tokens_seen": 41323552, "step": 31875 }, { "epoch": 1.557691837881416, "grad_norm": 0.5663872957229614, "learning_rate": 4.91513409258155e-06, "loss": 0.096, "num_input_tokens_seen": 41330560, "step": 31880 }, { "epoch": 1.5579361395451103, "grad_norm": 0.42342525720596313, "learning_rate": 4.909289854737581e-06, "loss": 0.1039, "num_input_tokens_seen": 41336832, "step": 31885 }, { "epoch": 1.5581804412088047, "grad_norm": 0.6806967854499817, "learning_rate": 4.903448715133602e-06, "loss": 0.0919, "num_input_tokens_seen": 41343360, "step": 31890 }, { "epoch": 1.558424742872499, "grad_norm": 0.19902709126472473, "learning_rate": 4.897610674670372e-06, "loss": 0.0965, "num_input_tokens_seen": 41349856, "step": 31895 }, { "epoch": 1.5586690445361933, "grad_norm": 0.7485705614089966, "learning_rate": 4.8917757342482e-06, "loss": 0.0728, "num_input_tokens_seen": 41356736, "step": 31900 }, { "epoch": 1.5589133461998876, "grad_norm": 0.34691646695137024, "learning_rate": 4.885943894766909e-06, "loss": 0.058, "num_input_tokens_seen": 41363488, "step": 31905 }, { "epoch": 1.5591576478635818, "grad_norm": 0.7323386073112488, "learning_rate": 4.880115157125842e-06, "loss": 0.1031, "num_input_tokens_seen": 41370112, "step": 31910 }, { "epoch": 1.5594019495272762, "grad_norm": 0.3107202649116516, "learning_rate": 4.874289522223857e-06, "loss": 0.0863, "num_input_tokens_seen": 41376736, "step": 31915 }, { "epoch": 1.5596462511909706, "grad_norm": 0.6746437549591064, "learning_rate": 4.868466990959339e-06, "loss": 0.0901, "num_input_tokens_seen": 41383040, "step": 31920 }, { "epoch": 1.559890552854665, "grad_norm": 0.7535064220428467, "learning_rate": 4.8626475642301964e-06, "loss": 0.0993, "num_input_tokens_seen": 41389376, "step": 31925 }, { "epoch": 1.5601348545183593, "grad_norm": 0.2635968029499054, "learning_rate": 4.856831242933871e-06, "loss": 0.081, "num_input_tokens_seen": 41396000, "step": 31930 }, { "epoch": 1.5603791561820537, "grad_norm": 0.3606931269168854, "learning_rate": 4.851018027967294e-06, "loss": 0.0997, "num_input_tokens_seen": 41402432, "step": 31935 }, { "epoch": 1.560623457845748, "grad_norm": 0.15686258673667908, "learning_rate": 4.845207920226946e-06, "loss": 0.0962, "num_input_tokens_seen": 41408928, "step": 31940 }, { "epoch": 1.5608677595094422, "grad_norm": 0.4629254937171936, "learning_rate": 4.839400920608825e-06, "loss": 0.1092, "num_input_tokens_seen": 41415296, "step": 31945 }, { "epoch": 1.5611120611731366, "grad_norm": 0.42520055174827576, "learning_rate": 4.83359703000843e-06, "loss": 0.0926, "num_input_tokens_seen": 41421920, "step": 31950 }, { "epoch": 1.5613563628368308, "grad_norm": 0.3765465021133423, "learning_rate": 4.827796249320804e-06, "loss": 0.0776, "num_input_tokens_seen": 41428160, "step": 31955 }, { "epoch": 1.5616006645005251, "grad_norm": 0.5532428622245789, "learning_rate": 4.82199857944049e-06, "loss": 0.0938, "num_input_tokens_seen": 41434880, "step": 31960 }, { "epoch": 1.5618449661642195, "grad_norm": 0.21655993163585663, "learning_rate": 4.8162040212615695e-06, "loss": 0.0881, "num_input_tokens_seen": 41441632, "step": 31965 }, { "epoch": 1.562089267827914, "grad_norm": 0.6105093955993652, "learning_rate": 4.810412575677639e-06, "loss": 0.1028, "num_input_tokens_seen": 41448256, "step": 31970 }, { "epoch": 1.5623335694916083, "grad_norm": 0.1618848294019699, "learning_rate": 4.804624243581801e-06, "loss": 0.0893, "num_input_tokens_seen": 41454528, "step": 31975 }, { "epoch": 1.5625778711553027, "grad_norm": 0.2817300856113434, "learning_rate": 4.798839025866703e-06, "loss": 0.1049, "num_input_tokens_seen": 41460960, "step": 31980 }, { "epoch": 1.562822172818997, "grad_norm": 0.11781840026378632, "learning_rate": 4.793056923424491e-06, "loss": 0.0954, "num_input_tokens_seen": 41467072, "step": 31985 }, { "epoch": 1.5630664744826912, "grad_norm": 0.5259532928466797, "learning_rate": 4.78727793714683e-06, "loss": 0.088, "num_input_tokens_seen": 41473440, "step": 31990 }, { "epoch": 1.5633107761463856, "grad_norm": 0.17769059538841248, "learning_rate": 4.7815020679249285e-06, "loss": 0.0771, "num_input_tokens_seen": 41479520, "step": 31995 }, { "epoch": 1.5635550778100797, "grad_norm": 0.39637380838394165, "learning_rate": 4.775729316649483e-06, "loss": 0.0797, "num_input_tokens_seen": 41485536, "step": 32000 }, { "epoch": 1.5635550778100797, "eval_loss": 0.08769793808460236, "eval_runtime": 374.8027, "eval_samples_per_second": 97.078, "eval_steps_per_second": 24.271, "num_input_tokens_seen": 41485536, "step": 32000 }, { "epoch": 1.5637993794737741, "grad_norm": 0.41085219383239746, "learning_rate": 4.769959684210728e-06, "loss": 0.1144, "num_input_tokens_seen": 41492256, "step": 32005 }, { "epoch": 1.5640436811374685, "grad_norm": 0.29214775562286377, "learning_rate": 4.764193171498426e-06, "loss": 0.1028, "num_input_tokens_seen": 41498624, "step": 32010 }, { "epoch": 1.5642879828011629, "grad_norm": 0.32318902015686035, "learning_rate": 4.75842977940183e-06, "loss": 0.0904, "num_input_tokens_seen": 41505888, "step": 32015 }, { "epoch": 1.5645322844648573, "grad_norm": 0.19320034980773926, "learning_rate": 4.752669508809729e-06, "loss": 0.0847, "num_input_tokens_seen": 41512128, "step": 32020 }, { "epoch": 1.5647765861285516, "grad_norm": 0.7875279188156128, "learning_rate": 4.746912360610445e-06, "loss": 0.0991, "num_input_tokens_seen": 41518112, "step": 32025 }, { "epoch": 1.565020887792246, "grad_norm": 0.21574413776397705, "learning_rate": 4.741158335691781e-06, "loss": 0.0964, "num_input_tokens_seen": 41524800, "step": 32030 }, { "epoch": 1.5652651894559402, "grad_norm": 0.6286054253578186, "learning_rate": 4.7354074349410994e-06, "loss": 0.0914, "num_input_tokens_seen": 41531392, "step": 32035 }, { "epoch": 1.5655094911196346, "grad_norm": 0.3843156099319458, "learning_rate": 4.729659659245245e-06, "loss": 0.0935, "num_input_tokens_seen": 41537600, "step": 32040 }, { "epoch": 1.5657537927833287, "grad_norm": 0.4708574712276459, "learning_rate": 4.723915009490601e-06, "loss": 0.0912, "num_input_tokens_seen": 41543968, "step": 32045 }, { "epoch": 1.565998094447023, "grad_norm": 0.4151811897754669, "learning_rate": 4.718173486563077e-06, "loss": 0.0723, "num_input_tokens_seen": 41550240, "step": 32050 }, { "epoch": 1.5662423961107175, "grad_norm": 0.2316027134656906, "learning_rate": 4.71243509134808e-06, "loss": 0.0844, "num_input_tokens_seen": 41556448, "step": 32055 }, { "epoch": 1.5664866977744119, "grad_norm": 0.28418540954589844, "learning_rate": 4.706699824730532e-06, "loss": 0.0853, "num_input_tokens_seen": 41562624, "step": 32060 }, { "epoch": 1.5667309994381062, "grad_norm": 0.6542387008666992, "learning_rate": 4.700967687594901e-06, "loss": 0.0875, "num_input_tokens_seen": 41568512, "step": 32065 }, { "epoch": 1.5669753011018006, "grad_norm": 0.5573698878288269, "learning_rate": 4.69523868082514e-06, "loss": 0.0872, "num_input_tokens_seen": 41574784, "step": 32070 }, { "epoch": 1.567219602765495, "grad_norm": 0.36734098196029663, "learning_rate": 4.689512805304747e-06, "loss": 0.0861, "num_input_tokens_seen": 41580896, "step": 32075 }, { "epoch": 1.5674639044291891, "grad_norm": 0.3802969753742218, "learning_rate": 4.683790061916707e-06, "loss": 0.1159, "num_input_tokens_seen": 41587232, "step": 32080 }, { "epoch": 1.5677082060928835, "grad_norm": 0.2518230378627777, "learning_rate": 4.678070451543551e-06, "loss": 0.0921, "num_input_tokens_seen": 41593824, "step": 32085 }, { "epoch": 1.5679525077565777, "grad_norm": 0.21253734827041626, "learning_rate": 4.6723539750673204e-06, "loss": 0.07, "num_input_tokens_seen": 41600128, "step": 32090 }, { "epoch": 1.568196809420272, "grad_norm": 0.14515115320682526, "learning_rate": 4.666640633369551e-06, "loss": 0.0686, "num_input_tokens_seen": 41607136, "step": 32095 }, { "epoch": 1.5684411110839664, "grad_norm": 0.3129929304122925, "learning_rate": 4.660930427331323e-06, "loss": 0.0744, "num_input_tokens_seen": 41613408, "step": 32100 }, { "epoch": 1.5686854127476608, "grad_norm": 0.1340184360742569, "learning_rate": 4.6552233578332244e-06, "loss": 0.0881, "num_input_tokens_seen": 41620064, "step": 32105 }, { "epoch": 1.5689297144113552, "grad_norm": 0.45457035303115845, "learning_rate": 4.649519425755347e-06, "loss": 0.0807, "num_input_tokens_seen": 41626976, "step": 32110 }, { "epoch": 1.5691740160750496, "grad_norm": 0.5220156311988831, "learning_rate": 4.64381863197732e-06, "loss": 0.0805, "num_input_tokens_seen": 41633024, "step": 32115 }, { "epoch": 1.569418317738744, "grad_norm": 0.5247572064399719, "learning_rate": 4.638120977378269e-06, "loss": 0.0863, "num_input_tokens_seen": 41639584, "step": 32120 }, { "epoch": 1.5696626194024381, "grad_norm": 0.26215454936027527, "learning_rate": 4.632426462836848e-06, "loss": 0.0715, "num_input_tokens_seen": 41646880, "step": 32125 }, { "epoch": 1.5699069210661325, "grad_norm": 0.13390496373176575, "learning_rate": 4.626735089231224e-06, "loss": 0.0747, "num_input_tokens_seen": 41653088, "step": 32130 }, { "epoch": 1.5701512227298267, "grad_norm": 0.9249727129936218, "learning_rate": 4.621046857439068e-06, "loss": 0.1152, "num_input_tokens_seen": 41659520, "step": 32135 }, { "epoch": 1.570395524393521, "grad_norm": 0.7801864743232727, "learning_rate": 4.615361768337587e-06, "loss": 0.068, "num_input_tokens_seen": 41666144, "step": 32140 }, { "epoch": 1.5706398260572154, "grad_norm": 0.5397456288337708, "learning_rate": 4.6096798228034946e-06, "loss": 0.079, "num_input_tokens_seen": 41671904, "step": 32145 }, { "epoch": 1.5708841277209098, "grad_norm": 0.22220353782176971, "learning_rate": 4.604001021713008e-06, "loss": 0.11, "num_input_tokens_seen": 41678560, "step": 32150 }, { "epoch": 1.5711284293846042, "grad_norm": 0.1113181784749031, "learning_rate": 4.598325365941883e-06, "loss": 0.07, "num_input_tokens_seen": 41685280, "step": 32155 }, { "epoch": 1.5713727310482986, "grad_norm": 0.49672579765319824, "learning_rate": 4.5926528563653645e-06, "loss": 0.0657, "num_input_tokens_seen": 41691616, "step": 32160 }, { "epoch": 1.571617032711993, "grad_norm": 0.21610930562019348, "learning_rate": 4.5869834938582295e-06, "loss": 0.0654, "num_input_tokens_seen": 41697984, "step": 32165 }, { "epoch": 1.571861334375687, "grad_norm": 0.2931945025920868, "learning_rate": 4.581317279294772e-06, "loss": 0.0742, "num_input_tokens_seen": 41704352, "step": 32170 }, { "epoch": 1.5721056360393815, "grad_norm": 0.17168697714805603, "learning_rate": 4.57565421354878e-06, "loss": 0.0836, "num_input_tokens_seen": 41711008, "step": 32175 }, { "epoch": 1.5723499377030756, "grad_norm": 0.4965618848800659, "learning_rate": 4.569994297493579e-06, "loss": 0.0926, "num_input_tokens_seen": 41717760, "step": 32180 }, { "epoch": 1.57259423936677, "grad_norm": 0.4557882249355316, "learning_rate": 4.564337532002002e-06, "loss": 0.0951, "num_input_tokens_seen": 41723904, "step": 32185 }, { "epoch": 1.5728385410304644, "grad_norm": 0.3518292009830475, "learning_rate": 4.55868391794638e-06, "loss": 0.0859, "num_input_tokens_seen": 41730592, "step": 32190 }, { "epoch": 1.5730828426941588, "grad_norm": 0.17978627979755402, "learning_rate": 4.553033456198588e-06, "loss": 0.0704, "num_input_tokens_seen": 41737184, "step": 32195 }, { "epoch": 1.5733271443578531, "grad_norm": 0.18427197635173798, "learning_rate": 4.54738614762999e-06, "loss": 0.0883, "num_input_tokens_seen": 41743456, "step": 32200 }, { "epoch": 1.5733271443578531, "eval_loss": 0.0875600278377533, "eval_runtime": 374.6246, "eval_samples_per_second": 97.124, "eval_steps_per_second": 24.283, "num_input_tokens_seen": 41743456, "step": 32200 }, { "epoch": 1.5735714460215475, "grad_norm": 0.3912360668182373, "learning_rate": 4.541741993111465e-06, "loss": 0.0993, "num_input_tokens_seen": 41750208, "step": 32205 }, { "epoch": 1.5738157476852417, "grad_norm": 0.38680127263069153, "learning_rate": 4.536100993513423e-06, "loss": 0.0911, "num_input_tokens_seen": 41757312, "step": 32210 }, { "epoch": 1.574060049348936, "grad_norm": 0.4205072224140167, "learning_rate": 4.530463149705768e-06, "loss": 0.105, "num_input_tokens_seen": 41763936, "step": 32215 }, { "epoch": 1.5743043510126304, "grad_norm": 0.5481727719306946, "learning_rate": 4.524828462557934e-06, "loss": 0.0621, "num_input_tokens_seen": 41770592, "step": 32220 }, { "epoch": 1.5745486526763246, "grad_norm": 0.6510210633277893, "learning_rate": 4.5191969329388625e-06, "loss": 0.0994, "num_input_tokens_seen": 41776832, "step": 32225 }, { "epoch": 1.574792954340019, "grad_norm": 0.6449228525161743, "learning_rate": 4.5135685617169965e-06, "loss": 0.0991, "num_input_tokens_seen": 41782816, "step": 32230 }, { "epoch": 1.5750372560037134, "grad_norm": 0.31638604402542114, "learning_rate": 4.507943349760313e-06, "loss": 0.102, "num_input_tokens_seen": 41789248, "step": 32235 }, { "epoch": 1.5752815576674077, "grad_norm": 0.1887034922838211, "learning_rate": 4.502321297936277e-06, "loss": 0.0742, "num_input_tokens_seen": 41795936, "step": 32240 }, { "epoch": 1.5755258593311021, "grad_norm": 0.6541621088981628, "learning_rate": 4.496702407111888e-06, "loss": 0.1193, "num_input_tokens_seen": 41802560, "step": 32245 }, { "epoch": 1.5757701609947965, "grad_norm": 0.17178435623645782, "learning_rate": 4.491086678153653e-06, "loss": 0.0618, "num_input_tokens_seen": 41809248, "step": 32250 }, { "epoch": 1.5760144626584907, "grad_norm": 0.139743372797966, "learning_rate": 4.485474111927579e-06, "loss": 0.0919, "num_input_tokens_seen": 41815936, "step": 32255 }, { "epoch": 1.576258764322185, "grad_norm": 0.4254852533340454, "learning_rate": 4.479864709299197e-06, "loss": 0.0974, "num_input_tokens_seen": 41822624, "step": 32260 }, { "epoch": 1.5765030659858794, "grad_norm": 0.339493066072464, "learning_rate": 4.474258471133555e-06, "loss": 0.0633, "num_input_tokens_seen": 41829152, "step": 32265 }, { "epoch": 1.5767473676495736, "grad_norm": 0.23734869062900543, "learning_rate": 4.4686553982952014e-06, "loss": 0.087, "num_input_tokens_seen": 41835424, "step": 32270 }, { "epoch": 1.576991669313268, "grad_norm": 0.29928845167160034, "learning_rate": 4.463055491648191e-06, "loss": 0.0769, "num_input_tokens_seen": 41842272, "step": 32275 }, { "epoch": 1.5772359709769623, "grad_norm": 0.38740506768226624, "learning_rate": 4.457458752056112e-06, "loss": 0.0956, "num_input_tokens_seen": 41848768, "step": 32280 }, { "epoch": 1.5774802726406567, "grad_norm": 0.34869956970214844, "learning_rate": 4.451865180382042e-06, "loss": 0.1019, "num_input_tokens_seen": 41854912, "step": 32285 }, { "epoch": 1.577724574304351, "grad_norm": 0.1205408126115799, "learning_rate": 4.4462747774885936e-06, "loss": 0.0757, "num_input_tokens_seen": 41861344, "step": 32290 }, { "epoch": 1.5779688759680455, "grad_norm": 0.1957276463508606, "learning_rate": 4.440687544237859e-06, "loss": 0.102, "num_input_tokens_seen": 41867520, "step": 32295 }, { "epoch": 1.5782131776317396, "grad_norm": 0.48770228028297424, "learning_rate": 4.435103481491471e-06, "loss": 0.0736, "num_input_tokens_seen": 41874656, "step": 32300 }, { "epoch": 1.578457479295434, "grad_norm": 0.35954540967941284, "learning_rate": 4.429522590110569e-06, "loss": 0.0682, "num_input_tokens_seen": 41881248, "step": 32305 }, { "epoch": 1.5787017809591284, "grad_norm": 0.6158509254455566, "learning_rate": 4.423944870955779e-06, "loss": 0.0704, "num_input_tokens_seen": 41888608, "step": 32310 }, { "epoch": 1.5789460826228225, "grad_norm": 0.2465001940727234, "learning_rate": 4.418370324887272e-06, "loss": 0.09, "num_input_tokens_seen": 41895232, "step": 32315 }, { "epoch": 1.579190384286517, "grad_norm": 0.22864137589931488, "learning_rate": 4.412798952764699e-06, "loss": 0.0747, "num_input_tokens_seen": 41902368, "step": 32320 }, { "epoch": 1.5794346859502113, "grad_norm": 0.1958877146244049, "learning_rate": 4.407230755447245e-06, "loss": 0.0629, "num_input_tokens_seen": 41909152, "step": 32325 }, { "epoch": 1.5796789876139057, "grad_norm": 0.13727013766765594, "learning_rate": 4.401665733793598e-06, "loss": 0.0656, "num_input_tokens_seen": 41915904, "step": 32330 }, { "epoch": 1.5799232892776, "grad_norm": 0.4181266129016876, "learning_rate": 4.3961038886619425e-06, "loss": 0.0813, "num_input_tokens_seen": 41922240, "step": 32335 }, { "epoch": 1.5801675909412944, "grad_norm": 0.39304253458976746, "learning_rate": 4.39054522091e-06, "loss": 0.0799, "num_input_tokens_seen": 41928512, "step": 32340 }, { "epoch": 1.5804118926049886, "grad_norm": 0.1549265831708908, "learning_rate": 4.384989731394979e-06, "loss": 0.063, "num_input_tokens_seen": 41935104, "step": 32345 }, { "epoch": 1.580656194268683, "grad_norm": 0.4940533936023712, "learning_rate": 4.379437420973598e-06, "loss": 0.1264, "num_input_tokens_seen": 41941856, "step": 32350 }, { "epoch": 1.5809004959323771, "grad_norm": 0.18234507739543915, "learning_rate": 4.373888290502107e-06, "loss": 0.0919, "num_input_tokens_seen": 41948672, "step": 32355 }, { "epoch": 1.5811447975960715, "grad_norm": 0.17842388153076172, "learning_rate": 4.36834234083624e-06, "loss": 0.0629, "num_input_tokens_seen": 41955232, "step": 32360 }, { "epoch": 1.581389099259766, "grad_norm": 0.2709994316101074, "learning_rate": 4.362799572831258e-06, "loss": 0.0939, "num_input_tokens_seen": 41961472, "step": 32365 }, { "epoch": 1.5816334009234603, "grad_norm": 0.31600823998451233, "learning_rate": 4.35725998734193e-06, "loss": 0.0995, "num_input_tokens_seen": 41967424, "step": 32370 }, { "epoch": 1.5818777025871547, "grad_norm": 0.3904348313808441, "learning_rate": 4.3517235852225195e-06, "loss": 0.0789, "num_input_tokens_seen": 41974048, "step": 32375 }, { "epoch": 1.582122004250849, "grad_norm": 0.25420451164245605, "learning_rate": 4.346190367326822e-06, "loss": 0.0699, "num_input_tokens_seen": 41980096, "step": 32380 }, { "epoch": 1.5823663059145434, "grad_norm": 0.4842139780521393, "learning_rate": 4.340660334508115e-06, "loss": 0.0736, "num_input_tokens_seen": 41986624, "step": 32385 }, { "epoch": 1.5826106075782376, "grad_norm": 0.3302280604839325, "learning_rate": 4.335133487619206e-06, "loss": 0.0861, "num_input_tokens_seen": 41992832, "step": 32390 }, { "epoch": 1.582854909241932, "grad_norm": 0.42354241013526917, "learning_rate": 4.329609827512409e-06, "loss": 0.0903, "num_input_tokens_seen": 41999328, "step": 32395 }, { "epoch": 1.583099210905626, "grad_norm": 0.3688134551048279, "learning_rate": 4.324089355039531e-06, "loss": 0.0821, "num_input_tokens_seen": 42005696, "step": 32400 }, { "epoch": 1.583099210905626, "eval_loss": 0.08767848461866379, "eval_runtime": 374.4883, "eval_samples_per_second": 97.159, "eval_steps_per_second": 24.292, "num_input_tokens_seen": 42005696, "step": 32400 }, { "epoch": 1.5833435125693205, "grad_norm": 0.19299986958503723, "learning_rate": 4.3185720710519075e-06, "loss": 0.0964, "num_input_tokens_seen": 42011808, "step": 32405 }, { "epoch": 1.5835878142330149, "grad_norm": 0.7861732840538025, "learning_rate": 4.3130579764003724e-06, "loss": 0.0956, "num_input_tokens_seen": 42018240, "step": 32410 }, { "epoch": 1.5838321158967092, "grad_norm": 0.26901036500930786, "learning_rate": 4.307547071935267e-06, "loss": 0.1051, "num_input_tokens_seen": 42024800, "step": 32415 }, { "epoch": 1.5840764175604036, "grad_norm": 0.20096257328987122, "learning_rate": 4.302039358506435e-06, "loss": 0.0775, "num_input_tokens_seen": 42031232, "step": 32420 }, { "epoch": 1.584320719224098, "grad_norm": 0.21343562006950378, "learning_rate": 4.296534836963245e-06, "loss": 0.0756, "num_input_tokens_seen": 42037568, "step": 32425 }, { "epoch": 1.5845650208877924, "grad_norm": 0.17796704173088074, "learning_rate": 4.291033508154555e-06, "loss": 0.0793, "num_input_tokens_seen": 42043872, "step": 32430 }, { "epoch": 1.5848093225514865, "grad_norm": 0.3757198452949524, "learning_rate": 4.285535372928748e-06, "loss": 0.0868, "num_input_tokens_seen": 42050368, "step": 32435 }, { "epoch": 1.585053624215181, "grad_norm": 0.3815075755119324, "learning_rate": 4.280040432133695e-06, "loss": 0.1162, "num_input_tokens_seen": 42056896, "step": 32440 }, { "epoch": 1.585297925878875, "grad_norm": 0.2817600667476654, "learning_rate": 4.274548686616789e-06, "loss": 0.1027, "num_input_tokens_seen": 42063392, "step": 32445 }, { "epoch": 1.5855422275425695, "grad_norm": 0.29684916138648987, "learning_rate": 4.2690601372249364e-06, "loss": 0.0863, "num_input_tokens_seen": 42069824, "step": 32450 }, { "epoch": 1.5857865292062638, "grad_norm": 0.26791250705718994, "learning_rate": 4.263574784804525e-06, "loss": 0.0988, "num_input_tokens_seen": 42076064, "step": 32455 }, { "epoch": 1.5860308308699582, "grad_norm": 0.17813700437545776, "learning_rate": 4.258092630201479e-06, "loss": 0.0741, "num_input_tokens_seen": 42082784, "step": 32460 }, { "epoch": 1.5862751325336526, "grad_norm": 0.6333736181259155, "learning_rate": 4.252613674261202e-06, "loss": 0.0911, "num_input_tokens_seen": 42089248, "step": 32465 }, { "epoch": 1.586519434197347, "grad_norm": 0.7612040638923645, "learning_rate": 4.2471379178286224e-06, "loss": 0.0744, "num_input_tokens_seen": 42095904, "step": 32470 }, { "epoch": 1.5867637358610414, "grad_norm": 1.011290192604065, "learning_rate": 4.241665361748181e-06, "loss": 0.105, "num_input_tokens_seen": 42102688, "step": 32475 }, { "epoch": 1.5870080375247355, "grad_norm": 0.15054486691951752, "learning_rate": 4.2361960068637994e-06, "loss": 0.0978, "num_input_tokens_seen": 42108640, "step": 32480 }, { "epoch": 1.58725233918843, "grad_norm": 0.24622033536434174, "learning_rate": 4.230729854018933e-06, "loss": 0.0759, "num_input_tokens_seen": 42114720, "step": 32485 }, { "epoch": 1.587496640852124, "grad_norm": 0.36961907148361206, "learning_rate": 4.225266904056521e-06, "loss": 0.0992, "num_input_tokens_seen": 42121632, "step": 32490 }, { "epoch": 1.5877409425158184, "grad_norm": 0.33388710021972656, "learning_rate": 4.21980715781903e-06, "loss": 0.0792, "num_input_tokens_seen": 42128320, "step": 32495 }, { "epoch": 1.5879852441795128, "grad_norm": 0.7706953287124634, "learning_rate": 4.214350616148416e-06, "loss": 0.0816, "num_input_tokens_seen": 42134720, "step": 32500 }, { "epoch": 1.5882295458432072, "grad_norm": 0.2039644718170166, "learning_rate": 4.20889727988614e-06, "loss": 0.0827, "num_input_tokens_seen": 42141536, "step": 32505 }, { "epoch": 1.5884738475069016, "grad_norm": 0.19417613744735718, "learning_rate": 4.20344714987318e-06, "loss": 0.0962, "num_input_tokens_seen": 42147584, "step": 32510 }, { "epoch": 1.588718149170596, "grad_norm": 0.36364123225212097, "learning_rate": 4.198000226950022e-06, "loss": 0.0755, "num_input_tokens_seen": 42154080, "step": 32515 }, { "epoch": 1.5889624508342903, "grad_norm": 0.5283003449440002, "learning_rate": 4.192556511956635e-06, "loss": 0.0727, "num_input_tokens_seen": 42161056, "step": 32520 }, { "epoch": 1.5892067524979845, "grad_norm": 0.31019434332847595, "learning_rate": 4.18711600573252e-06, "loss": 0.0864, "num_input_tokens_seen": 42167200, "step": 32525 }, { "epoch": 1.5894510541616789, "grad_norm": 0.14507482945919037, "learning_rate": 4.181678709116671e-06, "loss": 0.0767, "num_input_tokens_seen": 42174048, "step": 32530 }, { "epoch": 1.589695355825373, "grad_norm": 0.9505629539489746, "learning_rate": 4.1762446229475785e-06, "loss": 0.0928, "num_input_tokens_seen": 42180320, "step": 32535 }, { "epoch": 1.5899396574890674, "grad_norm": 0.12952998280525208, "learning_rate": 4.17081374806326e-06, "loss": 0.0686, "num_input_tokens_seen": 42187168, "step": 32540 }, { "epoch": 1.5901839591527618, "grad_norm": 0.32079407572746277, "learning_rate": 4.165386085301212e-06, "loss": 0.0828, "num_input_tokens_seen": 42193952, "step": 32545 }, { "epoch": 1.5904282608164562, "grad_norm": 0.19224008917808533, "learning_rate": 4.1599616354984525e-06, "loss": 0.0576, "num_input_tokens_seen": 42200256, "step": 32550 }, { "epoch": 1.5906725624801505, "grad_norm": 0.29357579350471497, "learning_rate": 4.154540399491508e-06, "loss": 0.0853, "num_input_tokens_seen": 42206528, "step": 32555 }, { "epoch": 1.590916864143845, "grad_norm": 0.21294179558753967, "learning_rate": 4.149122378116394e-06, "loss": 0.0899, "num_input_tokens_seen": 42213280, "step": 32560 }, { "epoch": 1.5911611658075393, "grad_norm": 0.4292333126068115, "learning_rate": 4.14370757220863e-06, "loss": 0.0819, "num_input_tokens_seen": 42220128, "step": 32565 }, { "epoch": 1.5914054674712335, "grad_norm": 0.5422388911247253, "learning_rate": 4.138295982603263e-06, "loss": 0.077, "num_input_tokens_seen": 42227104, "step": 32570 }, { "epoch": 1.5916497691349278, "grad_norm": 0.31405675411224365, "learning_rate": 4.132887610134814e-06, "loss": 0.097, "num_input_tokens_seen": 42233984, "step": 32575 }, { "epoch": 1.591894070798622, "grad_norm": 0.3279975950717926, "learning_rate": 4.127482455637335e-06, "loss": 0.0738, "num_input_tokens_seen": 42240768, "step": 32580 }, { "epoch": 1.5921383724623164, "grad_norm": 0.2090013474225998, "learning_rate": 4.1220805199443545e-06, "loss": 0.0747, "num_input_tokens_seen": 42247712, "step": 32585 }, { "epoch": 1.5923826741260108, "grad_norm": 0.41279342770576477, "learning_rate": 4.116681803888925e-06, "loss": 0.0864, "num_input_tokens_seen": 42254688, "step": 32590 }, { "epoch": 1.5926269757897051, "grad_norm": 0.3204883933067322, "learning_rate": 4.111286308303605e-06, "loss": 0.1135, "num_input_tokens_seen": 42261120, "step": 32595 }, { "epoch": 1.5928712774533995, "grad_norm": 0.8083642721176147, "learning_rate": 4.105894034020433e-06, "loss": 0.1171, "num_input_tokens_seen": 42267520, "step": 32600 }, { "epoch": 1.5928712774533995, "eval_loss": 0.08743782341480255, "eval_runtime": 374.6941, "eval_samples_per_second": 97.106, "eval_steps_per_second": 24.278, "num_input_tokens_seen": 42267520, "step": 32600 }, { "epoch": 1.593115579117094, "grad_norm": 0.298700749874115, "learning_rate": 4.100504981870975e-06, "loss": 0.0724, "num_input_tokens_seen": 42273952, "step": 32605 }, { "epoch": 1.5933598807807883, "grad_norm": 0.5247393250465393, "learning_rate": 4.0951191526862915e-06, "loss": 0.0985, "num_input_tokens_seen": 42280352, "step": 32610 }, { "epoch": 1.5936041824444824, "grad_norm": 0.49941128492355347, "learning_rate": 4.089736547296938e-06, "loss": 0.1031, "num_input_tokens_seen": 42286848, "step": 32615 }, { "epoch": 1.5938484841081768, "grad_norm": 0.5372405648231506, "learning_rate": 4.08435716653299e-06, "loss": 0.1057, "num_input_tokens_seen": 42293472, "step": 32620 }, { "epoch": 1.594092785771871, "grad_norm": 0.30557963252067566, "learning_rate": 4.0789810112240005e-06, "loss": 0.0903, "num_input_tokens_seen": 42299552, "step": 32625 }, { "epoch": 1.5943370874355653, "grad_norm": 0.48020315170288086, "learning_rate": 4.073608082199057e-06, "loss": 0.0763, "num_input_tokens_seen": 42306432, "step": 32630 }, { "epoch": 1.5945813890992597, "grad_norm": 0.2678477168083191, "learning_rate": 4.068238380286718e-06, "loss": 0.052, "num_input_tokens_seen": 42313440, "step": 32635 }, { "epoch": 1.594825690762954, "grad_norm": 0.27444520592689514, "learning_rate": 4.062871906315072e-06, "loss": 0.0593, "num_input_tokens_seen": 42319904, "step": 32640 }, { "epoch": 1.5950699924266485, "grad_norm": 0.27575361728668213, "learning_rate": 4.057508661111686e-06, "loss": 0.084, "num_input_tokens_seen": 42326272, "step": 32645 }, { "epoch": 1.5953142940903429, "grad_norm": 0.34315305948257446, "learning_rate": 4.052148645503648e-06, "loss": 0.1021, "num_input_tokens_seen": 42332384, "step": 32650 }, { "epoch": 1.5955585957540372, "grad_norm": 0.4793260097503662, "learning_rate": 4.046791860317531e-06, "loss": 0.0797, "num_input_tokens_seen": 42338784, "step": 32655 }, { "epoch": 1.5958028974177314, "grad_norm": 0.38374024629592896, "learning_rate": 4.041438306379431e-06, "loss": 0.071, "num_input_tokens_seen": 42345408, "step": 32660 }, { "epoch": 1.5960471990814258, "grad_norm": 0.5232711434364319, "learning_rate": 4.036087984514916e-06, "loss": 0.1093, "num_input_tokens_seen": 42351936, "step": 32665 }, { "epoch": 1.59629150074512, "grad_norm": 0.4877433180809021, "learning_rate": 4.030740895549084e-06, "loss": 0.0622, "num_input_tokens_seen": 42358400, "step": 32670 }, { "epoch": 1.5965358024088143, "grad_norm": 0.2559860348701477, "learning_rate": 4.025397040306531e-06, "loss": 0.0821, "num_input_tokens_seen": 42364224, "step": 32675 }, { "epoch": 1.5967801040725087, "grad_norm": 0.20852388441562653, "learning_rate": 4.0200564196113285e-06, "loss": 0.0857, "num_input_tokens_seen": 42370816, "step": 32680 }, { "epoch": 1.597024405736203, "grad_norm": 0.597740113735199, "learning_rate": 4.014719034287079e-06, "loss": 0.0961, "num_input_tokens_seen": 42377440, "step": 32685 }, { "epoch": 1.5972687073998975, "grad_norm": 0.20579831302165985, "learning_rate": 4.0093848851568775e-06, "loss": 0.1015, "num_input_tokens_seen": 42384128, "step": 32690 }, { "epoch": 1.5975130090635918, "grad_norm": 0.20037499070167542, "learning_rate": 4.004053973043304e-06, "loss": 0.0825, "num_input_tokens_seen": 42390496, "step": 32695 }, { "epoch": 1.5977573107272862, "grad_norm": 0.21013407409191132, "learning_rate": 3.998726298768465e-06, "loss": 0.091, "num_input_tokens_seen": 42397248, "step": 32700 }, { "epoch": 1.5980016123909804, "grad_norm": 0.5380498766899109, "learning_rate": 3.99340186315395e-06, "loss": 0.0833, "num_input_tokens_seen": 42403520, "step": 32705 }, { "epoch": 1.5982459140546748, "grad_norm": 0.3377017378807068, "learning_rate": 3.988080667020849e-06, "loss": 0.0753, "num_input_tokens_seen": 42410208, "step": 32710 }, { "epoch": 1.598490215718369, "grad_norm": 0.19612255692481995, "learning_rate": 3.982762711189766e-06, "loss": 0.0801, "num_input_tokens_seen": 42416416, "step": 32715 }, { "epoch": 1.5987345173820633, "grad_norm": 0.3044872581958771, "learning_rate": 3.977447996480785e-06, "loss": 0.1071, "num_input_tokens_seen": 42422528, "step": 32720 }, { "epoch": 1.5989788190457577, "grad_norm": 0.22878605127334595, "learning_rate": 3.97213652371351e-06, "loss": 0.0982, "num_input_tokens_seen": 42428768, "step": 32725 }, { "epoch": 1.599223120709452, "grad_norm": 0.3024005889892578, "learning_rate": 3.966828293707042e-06, "loss": 0.0954, "num_input_tokens_seen": 42435552, "step": 32730 }, { "epoch": 1.5994674223731464, "grad_norm": 0.24452677369117737, "learning_rate": 3.961523307279963e-06, "loss": 0.0931, "num_input_tokens_seen": 42442208, "step": 32735 }, { "epoch": 1.5997117240368408, "grad_norm": 0.43100956082344055, "learning_rate": 3.956221565250382e-06, "loss": 0.0655, "num_input_tokens_seen": 42448960, "step": 32740 }, { "epoch": 1.599956025700535, "grad_norm": 0.26858723163604736, "learning_rate": 3.950923068435883e-06, "loss": 0.1071, "num_input_tokens_seen": 42455936, "step": 32745 }, { "epoch": 1.6002003273642293, "grad_norm": 0.18901416659355164, "learning_rate": 3.945627817653566e-06, "loss": 0.0578, "num_input_tokens_seen": 42462528, "step": 32750 }, { "epoch": 1.6004446290279237, "grad_norm": 0.19500677287578583, "learning_rate": 3.9403358137200335e-06, "loss": 0.1097, "num_input_tokens_seen": 42469344, "step": 32755 }, { "epoch": 1.6006889306916179, "grad_norm": 0.3818342685699463, "learning_rate": 3.9350470574513605e-06, "loss": 0.0924, "num_input_tokens_seen": 42475776, "step": 32760 }, { "epoch": 1.6009332323553123, "grad_norm": 0.1749701052904129, "learning_rate": 3.9297615496631525e-06, "loss": 0.0813, "num_input_tokens_seen": 42482112, "step": 32765 }, { "epoch": 1.6011775340190066, "grad_norm": 0.5210583806037903, "learning_rate": 3.924479291170505e-06, "loss": 0.0807, "num_input_tokens_seen": 42488864, "step": 32770 }, { "epoch": 1.601421835682701, "grad_norm": 0.3736514151096344, "learning_rate": 3.919200282788002e-06, "loss": 0.0807, "num_input_tokens_seen": 42495232, "step": 32775 }, { "epoch": 1.6016661373463954, "grad_norm": 0.31208834052085876, "learning_rate": 3.913924525329726e-06, "loss": 0.0583, "num_input_tokens_seen": 42501568, "step": 32780 }, { "epoch": 1.6019104390100898, "grad_norm": 0.1974368691444397, "learning_rate": 3.908652019609279e-06, "loss": 0.0722, "num_input_tokens_seen": 42508032, "step": 32785 }, { "epoch": 1.602154740673784, "grad_norm": 0.8214340806007385, "learning_rate": 3.9033827664397364e-06, "loss": 0.0909, "num_input_tokens_seen": 42515360, "step": 32790 }, { "epoch": 1.6023990423374783, "grad_norm": 0.4506360590457916, "learning_rate": 3.898116766633694e-06, "loss": 0.0936, "num_input_tokens_seen": 42522048, "step": 32795 }, { "epoch": 1.6026433440011727, "grad_norm": 0.21989105641841888, "learning_rate": 3.8928540210032225e-06, "loss": 0.0645, "num_input_tokens_seen": 42528896, "step": 32800 }, { "epoch": 1.6026433440011727, "eval_loss": 0.08742859214544296, "eval_runtime": 375.089, "eval_samples_per_second": 97.004, "eval_steps_per_second": 24.253, "num_input_tokens_seen": 42528896, "step": 32800 }, { "epoch": 1.6028876456648669, "grad_norm": 0.4817020297050476, "learning_rate": 3.887594530359909e-06, "loss": 0.1055, "num_input_tokens_seen": 42535008, "step": 32805 }, { "epoch": 1.6031319473285612, "grad_norm": 0.2652604877948761, "learning_rate": 3.88233829551484e-06, "loss": 0.0839, "num_input_tokens_seen": 42541472, "step": 32810 }, { "epoch": 1.6033762489922556, "grad_norm": 0.46149882674217224, "learning_rate": 3.877085317278581e-06, "loss": 0.069, "num_input_tokens_seen": 42548096, "step": 32815 }, { "epoch": 1.60362055065595, "grad_norm": 0.4275277256965637, "learning_rate": 3.87183559646122e-06, "loss": 0.0965, "num_input_tokens_seen": 42554688, "step": 32820 }, { "epoch": 1.6038648523196444, "grad_norm": 0.18861782550811768, "learning_rate": 3.866589133872317e-06, "loss": 0.0638, "num_input_tokens_seen": 42561120, "step": 32825 }, { "epoch": 1.6041091539833388, "grad_norm": 0.34546419978141785, "learning_rate": 3.861345930320948e-06, "loss": 0.0859, "num_input_tokens_seen": 42567744, "step": 32830 }, { "epoch": 1.604353455647033, "grad_norm": 1.1054291725158691, "learning_rate": 3.856105986615688e-06, "loss": 0.0898, "num_input_tokens_seen": 42574272, "step": 32835 }, { "epoch": 1.6045977573107273, "grad_norm": 0.21754665672779083, "learning_rate": 3.850869303564589e-06, "loss": 0.0871, "num_input_tokens_seen": 42581216, "step": 32840 }, { "epoch": 1.6048420589744217, "grad_norm": 0.3283078372478485, "learning_rate": 3.845635881975226e-06, "loss": 0.0988, "num_input_tokens_seen": 42587200, "step": 32845 }, { "epoch": 1.6050863606381158, "grad_norm": 0.1460913121700287, "learning_rate": 3.840405722654647e-06, "loss": 0.0722, "num_input_tokens_seen": 42593536, "step": 32850 }, { "epoch": 1.6053306623018102, "grad_norm": 0.5172139406204224, "learning_rate": 3.835178826409419e-06, "loss": 0.1167, "num_input_tokens_seen": 42599904, "step": 32855 }, { "epoch": 1.6055749639655046, "grad_norm": 0.7026620507240295, "learning_rate": 3.8299551940455895e-06, "loss": 0.1138, "num_input_tokens_seen": 42606592, "step": 32860 }, { "epoch": 1.605819265629199, "grad_norm": 0.19055014848709106, "learning_rate": 3.824734826368703e-06, "loss": 0.0771, "num_input_tokens_seen": 42612704, "step": 32865 }, { "epoch": 1.6060635672928933, "grad_norm": 0.3761756122112274, "learning_rate": 3.819517724183813e-06, "loss": 0.0847, "num_input_tokens_seen": 42618816, "step": 32870 }, { "epoch": 1.6063078689565877, "grad_norm": 0.14987747371196747, "learning_rate": 3.8143038882954648e-06, "loss": 0.0992, "num_input_tokens_seen": 42625248, "step": 32875 }, { "epoch": 1.6065521706202819, "grad_norm": 0.6009425520896912, "learning_rate": 3.8090933195076867e-06, "loss": 0.0736, "num_input_tokens_seen": 42631488, "step": 32880 }, { "epoch": 1.6067964722839763, "grad_norm": 0.5885695219039917, "learning_rate": 3.8038860186240198e-06, "loss": 0.077, "num_input_tokens_seen": 42637568, "step": 32885 }, { "epoch": 1.6070407739476704, "grad_norm": 0.303611159324646, "learning_rate": 3.7986819864475026e-06, "loss": 0.0708, "num_input_tokens_seen": 42643936, "step": 32890 }, { "epoch": 1.6072850756113648, "grad_norm": 0.480244517326355, "learning_rate": 3.793481223780651e-06, "loss": 0.0874, "num_input_tokens_seen": 42650432, "step": 32895 }, { "epoch": 1.6075293772750592, "grad_norm": 0.4685906171798706, "learning_rate": 3.788283731425496e-06, "loss": 0.1072, "num_input_tokens_seen": 42657088, "step": 32900 }, { "epoch": 1.6077736789387536, "grad_norm": 0.3223550319671631, "learning_rate": 3.7830895101835488e-06, "loss": 0.0732, "num_input_tokens_seen": 42663840, "step": 32905 }, { "epoch": 1.608017980602448, "grad_norm": 0.23906190693378448, "learning_rate": 3.7778985608558274e-06, "loss": 0.0936, "num_input_tokens_seen": 42670240, "step": 32910 }, { "epoch": 1.6082622822661423, "grad_norm": 0.3711854815483093, "learning_rate": 3.7727108842428443e-06, "loss": 0.0901, "num_input_tokens_seen": 42676832, "step": 32915 }, { "epoch": 1.6085065839298367, "grad_norm": 0.2664296627044678, "learning_rate": 3.7675264811446065e-06, "loss": 0.0936, "num_input_tokens_seen": 42683232, "step": 32920 }, { "epoch": 1.6087508855935309, "grad_norm": 0.36244893074035645, "learning_rate": 3.7623453523605994e-06, "loss": 0.0707, "num_input_tokens_seen": 42689920, "step": 32925 }, { "epoch": 1.6089951872572252, "grad_norm": 0.18025001883506775, "learning_rate": 3.757167498689834e-06, "loss": 0.0645, "num_input_tokens_seen": 42696416, "step": 32930 }, { "epoch": 1.6092394889209194, "grad_norm": 0.3086729049682617, "learning_rate": 3.7519929209307914e-06, "loss": 0.0831, "num_input_tokens_seen": 42702656, "step": 32935 }, { "epoch": 1.6094837905846138, "grad_norm": 0.2939235270023346, "learning_rate": 3.746821619881463e-06, "loss": 0.1196, "num_input_tokens_seen": 42708992, "step": 32940 }, { "epoch": 1.6097280922483082, "grad_norm": 0.16898669302463531, "learning_rate": 3.74165359633932e-06, "loss": 0.0911, "num_input_tokens_seen": 42715168, "step": 32945 }, { "epoch": 1.6099723939120025, "grad_norm": 0.32297882437705994, "learning_rate": 3.736488851101341e-06, "loss": 0.071, "num_input_tokens_seen": 42721952, "step": 32950 }, { "epoch": 1.610216695575697, "grad_norm": 0.5592007040977478, "learning_rate": 3.7313273849640035e-06, "loss": 0.0805, "num_input_tokens_seen": 42728416, "step": 32955 }, { "epoch": 1.6104609972393913, "grad_norm": 0.6958295702934265, "learning_rate": 3.7261691987232533e-06, "loss": 0.0904, "num_input_tokens_seen": 42734848, "step": 32960 }, { "epoch": 1.6107052989030857, "grad_norm": 0.4315039813518524, "learning_rate": 3.7210142931745575e-06, "loss": 0.0902, "num_input_tokens_seen": 42741408, "step": 32965 }, { "epoch": 1.6109496005667798, "grad_norm": 0.6441124081611633, "learning_rate": 3.7158626691128712e-06, "loss": 0.0755, "num_input_tokens_seen": 42747616, "step": 32970 }, { "epoch": 1.6111939022304742, "grad_norm": 0.5374704003334045, "learning_rate": 3.710714327332629e-06, "loss": 0.1063, "num_input_tokens_seen": 42754144, "step": 32975 }, { "epoch": 1.6114382038941684, "grad_norm": 0.15506431460380554, "learning_rate": 3.7055692686277815e-06, "loss": 0.0766, "num_input_tokens_seen": 42760448, "step": 32980 }, { "epoch": 1.6116825055578627, "grad_norm": 0.5314242839813232, "learning_rate": 3.70042749379175e-06, "loss": 0.0787, "num_input_tokens_seen": 42766944, "step": 32985 }, { "epoch": 1.6119268072215571, "grad_norm": 0.436225563287735, "learning_rate": 3.6952890036174693e-06, "loss": 0.0956, "num_input_tokens_seen": 42773376, "step": 32990 }, { "epoch": 1.6121711088852515, "grad_norm": 0.3637750744819641, "learning_rate": 3.690153798897353e-06, "loss": 0.068, "num_input_tokens_seen": 42780032, "step": 32995 }, { "epoch": 1.6124154105489459, "grad_norm": 0.44142526388168335, "learning_rate": 3.6850218804233225e-06, "loss": 0.1057, "num_input_tokens_seen": 42786240, "step": 33000 }, { "epoch": 1.6124154105489459, "eval_loss": 0.08736912161111832, "eval_runtime": 374.9569, "eval_samples_per_second": 97.038, "eval_steps_per_second": 24.261, "num_input_tokens_seen": 42786240, "step": 33000 }, { "epoch": 1.6126597122126403, "grad_norm": 0.5655313730239868, "learning_rate": 3.679893248986779e-06, "loss": 0.083, "num_input_tokens_seen": 42793216, "step": 33005 }, { "epoch": 1.6129040138763346, "grad_norm": 0.18820834159851074, "learning_rate": 3.6747679053786147e-06, "loss": 0.0627, "num_input_tokens_seen": 42799200, "step": 33010 }, { "epoch": 1.6131483155400288, "grad_norm": 0.406303733587265, "learning_rate": 3.669645850389228e-06, "loss": 0.1025, "num_input_tokens_seen": 42805632, "step": 33015 }, { "epoch": 1.6133926172037232, "grad_norm": 0.473610520362854, "learning_rate": 3.664527084808514e-06, "loss": 0.0743, "num_input_tokens_seen": 42812160, "step": 33020 }, { "epoch": 1.6136369188674173, "grad_norm": 0.28853270411491394, "learning_rate": 3.6594116094258337e-06, "loss": 0.0789, "num_input_tokens_seen": 42818848, "step": 33025 }, { "epoch": 1.6138812205311117, "grad_norm": 0.1548653095960617, "learning_rate": 3.6542994250300665e-06, "loss": 0.0906, "num_input_tokens_seen": 42825024, "step": 33030 }, { "epoch": 1.614125522194806, "grad_norm": 0.4682413339614868, "learning_rate": 3.6491905324095825e-06, "loss": 0.1155, "num_input_tokens_seen": 42831712, "step": 33035 }, { "epoch": 1.6143698238585005, "grad_norm": 0.2621241807937622, "learning_rate": 3.644084932352221e-06, "loss": 0.0832, "num_input_tokens_seen": 42838176, "step": 33040 }, { "epoch": 1.6146141255221949, "grad_norm": 0.16546419262886047, "learning_rate": 3.6389826256453457e-06, "loss": 0.0776, "num_input_tokens_seen": 42845056, "step": 33045 }, { "epoch": 1.6148584271858892, "grad_norm": 0.2828223705291748, "learning_rate": 3.633883613075781e-06, "loss": 0.0929, "num_input_tokens_seen": 42851456, "step": 33050 }, { "epoch": 1.6151027288495836, "grad_norm": 0.6013824343681335, "learning_rate": 3.6287878954298693e-06, "loss": 0.0884, "num_input_tokens_seen": 42857632, "step": 33055 }, { "epoch": 1.6153470305132778, "grad_norm": 0.4858764410018921, "learning_rate": 3.6236954734934354e-06, "loss": 0.1118, "num_input_tokens_seen": 42864320, "step": 33060 }, { "epoch": 1.6155913321769722, "grad_norm": 0.26460394263267517, "learning_rate": 3.618606348051784e-06, "loss": 0.0975, "num_input_tokens_seen": 42870720, "step": 33065 }, { "epoch": 1.6158356338406663, "grad_norm": 0.3198973834514618, "learning_rate": 3.6135205198897376e-06, "loss": 0.0973, "num_input_tokens_seen": 42876960, "step": 33070 }, { "epoch": 1.6160799355043607, "grad_norm": 0.31941813230514526, "learning_rate": 3.6084379897915854e-06, "loss": 0.0868, "num_input_tokens_seen": 42882976, "step": 33075 }, { "epoch": 1.616324237168055, "grad_norm": 0.3634151816368103, "learning_rate": 3.6033587585411115e-06, "loss": 0.0936, "num_input_tokens_seen": 42889664, "step": 33080 }, { "epoch": 1.6165685388317494, "grad_norm": 0.27199238538742065, "learning_rate": 3.5982828269216117e-06, "loss": 0.0783, "num_input_tokens_seen": 42896192, "step": 33085 }, { "epoch": 1.6168128404954438, "grad_norm": 0.16237448155879974, "learning_rate": 3.593210195715843e-06, "loss": 0.0844, "num_input_tokens_seen": 42902336, "step": 33090 }, { "epoch": 1.6170571421591382, "grad_norm": 0.4198843240737915, "learning_rate": 3.5881408657060773e-06, "loss": 0.0603, "num_input_tokens_seen": 42909088, "step": 33095 }, { "epoch": 1.6173014438228326, "grad_norm": 0.23337581753730774, "learning_rate": 3.583074837674075e-06, "loss": 0.0817, "num_input_tokens_seen": 42916032, "step": 33100 }, { "epoch": 1.6175457454865267, "grad_norm": 0.21930360794067383, "learning_rate": 3.578012112401069e-06, "loss": 0.086, "num_input_tokens_seen": 42922272, "step": 33105 }, { "epoch": 1.6177900471502211, "grad_norm": 0.21255066990852356, "learning_rate": 3.5729526906677996e-06, "loss": 0.0873, "num_input_tokens_seen": 42929056, "step": 33110 }, { "epoch": 1.6180343488139153, "grad_norm": 0.7347705960273743, "learning_rate": 3.5678965732545007e-06, "loss": 0.0757, "num_input_tokens_seen": 42935264, "step": 33115 }, { "epoch": 1.6182786504776097, "grad_norm": 0.22749283909797668, "learning_rate": 3.562843760940876e-06, "loss": 0.0662, "num_input_tokens_seen": 42941920, "step": 33120 }, { "epoch": 1.618522952141304, "grad_norm": 0.31062382459640503, "learning_rate": 3.5577942545061473e-06, "loss": 0.0682, "num_input_tokens_seen": 42948352, "step": 33125 }, { "epoch": 1.6187672538049984, "grad_norm": 0.2587828040122986, "learning_rate": 3.5527480547289967e-06, "loss": 0.0675, "num_input_tokens_seen": 42954336, "step": 33130 }, { "epoch": 1.6190115554686928, "grad_norm": 0.7981301546096802, "learning_rate": 3.547705162387624e-06, "loss": 0.0702, "num_input_tokens_seen": 42960800, "step": 33135 }, { "epoch": 1.6192558571323872, "grad_norm": 0.29479295015335083, "learning_rate": 3.542665578259699e-06, "loss": 0.0901, "num_input_tokens_seen": 42967392, "step": 33140 }, { "epoch": 1.6195001587960816, "grad_norm": 0.181453675031662, "learning_rate": 3.5376293031223945e-06, "loss": 0.0987, "num_input_tokens_seen": 42973536, "step": 33145 }, { "epoch": 1.6197444604597757, "grad_norm": 0.31267014145851135, "learning_rate": 3.5325963377523614e-06, "loss": 0.076, "num_input_tokens_seen": 42979808, "step": 33150 }, { "epoch": 1.61998876212347, "grad_norm": 0.21724793314933777, "learning_rate": 3.5275666829257536e-06, "loss": 0.0783, "num_input_tokens_seen": 42986368, "step": 33155 }, { "epoch": 1.6202330637871643, "grad_norm": 0.19929476082324982, "learning_rate": 3.5225403394181955e-06, "loss": 0.0845, "num_input_tokens_seen": 42993024, "step": 33160 }, { "epoch": 1.6204773654508586, "grad_norm": 0.32291361689567566, "learning_rate": 3.517517308004828e-06, "loss": 0.084, "num_input_tokens_seen": 42999264, "step": 33165 }, { "epoch": 1.620721667114553, "grad_norm": 0.2379598766565323, "learning_rate": 3.512497589460251e-06, "loss": 0.0724, "num_input_tokens_seen": 43005344, "step": 33170 }, { "epoch": 1.6209659687782474, "grad_norm": 0.5449444055557251, "learning_rate": 3.5074811845585727e-06, "loss": 0.0702, "num_input_tokens_seen": 43011680, "step": 33175 }, { "epoch": 1.6212102704419418, "grad_norm": 0.5487431287765503, "learning_rate": 3.5024680940733937e-06, "loss": 0.0804, "num_input_tokens_seen": 43017920, "step": 33180 }, { "epoch": 1.6214545721056361, "grad_norm": 0.17458106577396393, "learning_rate": 3.4974583187777852e-06, "loss": 0.0624, "num_input_tokens_seen": 43024512, "step": 33185 }, { "epoch": 1.6216988737693305, "grad_norm": 0.2138625830411911, "learning_rate": 3.4924518594443204e-06, "loss": 0.0785, "num_input_tokens_seen": 43030784, "step": 33190 }, { "epoch": 1.6219431754330247, "grad_norm": 0.2823992073535919, "learning_rate": 3.4874487168450682e-06, "loss": 0.1229, "num_input_tokens_seen": 43037184, "step": 33195 }, { "epoch": 1.622187477096719, "grad_norm": 0.18827471137046814, "learning_rate": 3.482448891751558e-06, "loss": 0.0915, "num_input_tokens_seen": 43043616, "step": 33200 }, { "epoch": 1.622187477096719, "eval_loss": 0.08763042092323303, "eval_runtime": 374.9792, "eval_samples_per_second": 97.032, "eval_steps_per_second": 24.26, "num_input_tokens_seen": 43043616, "step": 33200 }, { "epoch": 1.6224317787604132, "grad_norm": 0.47229358553886414, "learning_rate": 3.477452384934843e-06, "loss": 0.0739, "num_input_tokens_seen": 43049824, "step": 33205 }, { "epoch": 1.6226760804241076, "grad_norm": 0.2793095111846924, "learning_rate": 3.472459197165434e-06, "loss": 0.0776, "num_input_tokens_seen": 43056096, "step": 33210 }, { "epoch": 1.622920382087802, "grad_norm": 0.4510778784751892, "learning_rate": 3.4674693292133518e-06, "loss": 0.083, "num_input_tokens_seen": 43062496, "step": 33215 }, { "epoch": 1.6231646837514964, "grad_norm": 0.18455630540847778, "learning_rate": 3.4624827818480977e-06, "loss": 0.0812, "num_input_tokens_seen": 43069216, "step": 33220 }, { "epoch": 1.6234089854151907, "grad_norm": 0.3055039048194885, "learning_rate": 3.4574995558386474e-06, "loss": 0.1076, "num_input_tokens_seen": 43075776, "step": 33225 }, { "epoch": 1.6236532870788851, "grad_norm": 0.2972475588321686, "learning_rate": 3.452519651953487e-06, "loss": 0.0648, "num_input_tokens_seen": 43082496, "step": 33230 }, { "epoch": 1.6238975887425795, "grad_norm": 0.5028567314147949, "learning_rate": 3.447543070960585e-06, "loss": 0.0803, "num_input_tokens_seen": 43088608, "step": 33235 }, { "epoch": 1.6241418904062737, "grad_norm": 0.29908907413482666, "learning_rate": 3.4425698136273778e-06, "loss": 0.0755, "num_input_tokens_seen": 43095040, "step": 33240 }, { "epoch": 1.624386192069968, "grad_norm": 0.2690106928348541, "learning_rate": 3.437599880720821e-06, "loss": 0.0981, "num_input_tokens_seen": 43101216, "step": 33245 }, { "epoch": 1.6246304937336622, "grad_norm": 0.24979111552238464, "learning_rate": 3.4326332730073267e-06, "loss": 0.0621, "num_input_tokens_seen": 43107840, "step": 33250 }, { "epoch": 1.6248747953973566, "grad_norm": 0.19608396291732788, "learning_rate": 3.427669991252813e-06, "loss": 0.0912, "num_input_tokens_seen": 43114176, "step": 33255 }, { "epoch": 1.625119097061051, "grad_norm": 0.2244119644165039, "learning_rate": 3.42271003622269e-06, "loss": 0.065, "num_input_tokens_seen": 43120512, "step": 33260 }, { "epoch": 1.6253633987247453, "grad_norm": 0.32223597168922424, "learning_rate": 3.4177534086818286e-06, "loss": 0.116, "num_input_tokens_seen": 43126688, "step": 33265 }, { "epoch": 1.6256077003884397, "grad_norm": 0.1834426075220108, "learning_rate": 3.412800109394612e-06, "loss": 0.0777, "num_input_tokens_seen": 43132928, "step": 33270 }, { "epoch": 1.625852002052134, "grad_norm": 0.17126037180423737, "learning_rate": 3.4078501391249044e-06, "loss": 0.077, "num_input_tokens_seen": 43138880, "step": 33275 }, { "epoch": 1.6260963037158283, "grad_norm": 0.3623603284358978, "learning_rate": 3.4029034986360453e-06, "loss": 0.0736, "num_input_tokens_seen": 43145568, "step": 33280 }, { "epoch": 1.6263406053795226, "grad_norm": 0.1580297350883484, "learning_rate": 3.397960188690877e-06, "loss": 0.0982, "num_input_tokens_seen": 43152160, "step": 33285 }, { "epoch": 1.626584907043217, "grad_norm": 0.16023609042167664, "learning_rate": 3.393020210051717e-06, "loss": 0.0821, "num_input_tokens_seen": 43158880, "step": 33290 }, { "epoch": 1.6268292087069112, "grad_norm": 0.3587471842765808, "learning_rate": 3.3880835634803655e-06, "loss": 0.0859, "num_input_tokens_seen": 43165440, "step": 33295 }, { "epoch": 1.6270735103706055, "grad_norm": 0.1285192221403122, "learning_rate": 3.383150249738126e-06, "loss": 0.0622, "num_input_tokens_seen": 43171872, "step": 33300 }, { "epoch": 1.6273178120343, "grad_norm": 0.16412688791751862, "learning_rate": 3.3782202695857663e-06, "loss": 0.1171, "num_input_tokens_seen": 43178304, "step": 33305 }, { "epoch": 1.6275621136979943, "grad_norm": 0.8367612361907959, "learning_rate": 3.373293623783558e-06, "loss": 0.0782, "num_input_tokens_seen": 43184320, "step": 33310 }, { "epoch": 1.6278064153616887, "grad_norm": 0.3034772574901581, "learning_rate": 3.368370313091257e-06, "loss": 0.0823, "num_input_tokens_seen": 43190656, "step": 33315 }, { "epoch": 1.628050717025383, "grad_norm": 0.30351656675338745, "learning_rate": 3.363450338268087e-06, "loss": 0.0621, "num_input_tokens_seen": 43197440, "step": 33320 }, { "epoch": 1.6282950186890772, "grad_norm": 0.13874323666095734, "learning_rate": 3.358533700072783e-06, "loss": 0.0863, "num_input_tokens_seen": 43203712, "step": 33325 }, { "epoch": 1.6285393203527716, "grad_norm": 0.7539260983467102, "learning_rate": 3.3536203992635377e-06, "loss": 0.0624, "num_input_tokens_seen": 43210656, "step": 33330 }, { "epoch": 1.628783622016466, "grad_norm": 0.30174851417541504, "learning_rate": 3.348710436598057e-06, "loss": 0.0675, "num_input_tokens_seen": 43217088, "step": 33335 }, { "epoch": 1.6290279236801601, "grad_norm": 0.6944115161895752, "learning_rate": 3.3438038128335155e-06, "loss": 0.0856, "num_input_tokens_seen": 43223712, "step": 33340 }, { "epoch": 1.6292722253438545, "grad_norm": 0.21527181565761566, "learning_rate": 3.338900528726571e-06, "loss": 0.1023, "num_input_tokens_seen": 43230432, "step": 33345 }, { "epoch": 1.629516527007549, "grad_norm": 0.13600550591945648, "learning_rate": 3.3340005850333812e-06, "loss": 0.0863, "num_input_tokens_seen": 43236992, "step": 33350 }, { "epoch": 1.6297608286712433, "grad_norm": 0.1587822288274765, "learning_rate": 3.329103982509568e-06, "loss": 0.0754, "num_input_tokens_seen": 43243264, "step": 33355 }, { "epoch": 1.6300051303349377, "grad_norm": 1.007207989692688, "learning_rate": 3.324210721910259e-06, "loss": 0.1124, "num_input_tokens_seen": 43249536, "step": 33360 }, { "epoch": 1.630249431998632, "grad_norm": 0.15025146305561066, "learning_rate": 3.319320803990053e-06, "loss": 0.1032, "num_input_tokens_seen": 43256288, "step": 33365 }, { "epoch": 1.6304937336623262, "grad_norm": 0.8255740404129028, "learning_rate": 3.3144342295030274e-06, "loss": 0.1005, "num_input_tokens_seen": 43263104, "step": 33370 }, { "epoch": 1.6307380353260206, "grad_norm": 0.46242809295654297, "learning_rate": 3.309550999202765e-06, "loss": 0.0717, "num_input_tokens_seen": 43269408, "step": 33375 }, { "epoch": 1.630982336989715, "grad_norm": 0.2660594880580902, "learning_rate": 3.3046711138423197e-06, "loss": 0.1045, "num_input_tokens_seen": 43275936, "step": 33380 }, { "epoch": 1.6312266386534091, "grad_norm": 0.38275688886642456, "learning_rate": 3.2997945741742255e-06, "loss": 0.0895, "num_input_tokens_seen": 43282208, "step": 33385 }, { "epoch": 1.6314709403171035, "grad_norm": 0.2479323297739029, "learning_rate": 3.2949213809505082e-06, "loss": 0.0979, "num_input_tokens_seen": 43288288, "step": 33390 }, { "epoch": 1.6317152419807979, "grad_norm": 0.1238979697227478, "learning_rate": 3.2900515349226834e-06, "loss": 0.0702, "num_input_tokens_seen": 43294400, "step": 33395 }, { "epoch": 1.6319595436444922, "grad_norm": 0.2108013778924942, "learning_rate": 3.285185036841731e-06, "loss": 0.0905, "num_input_tokens_seen": 43300896, "step": 33400 }, { "epoch": 1.6319595436444922, "eval_loss": 0.08757232874631882, "eval_runtime": 375.2065, "eval_samples_per_second": 96.973, "eval_steps_per_second": 24.245, "num_input_tokens_seen": 43300896, "step": 33400 }, { "epoch": 1.6322038453081866, "grad_norm": 0.20266246795654297, "learning_rate": 3.2803218874581377e-06, "loss": 0.0836, "num_input_tokens_seen": 43307520, "step": 33405 }, { "epoch": 1.632448146971881, "grad_norm": 0.4255695641040802, "learning_rate": 3.2754620875218494e-06, "loss": 0.1042, "num_input_tokens_seen": 43313600, "step": 33410 }, { "epoch": 1.6326924486355752, "grad_norm": 0.2340075820684433, "learning_rate": 3.2706056377823146e-06, "loss": 0.0829, "num_input_tokens_seen": 43320160, "step": 33415 }, { "epoch": 1.6329367502992695, "grad_norm": 0.18464581668376923, "learning_rate": 3.2657525389884647e-06, "loss": 0.0754, "num_input_tokens_seen": 43326560, "step": 33420 }, { "epoch": 1.6331810519629637, "grad_norm": 0.09780599921941757, "learning_rate": 3.260902791888698e-06, "loss": 0.0448, "num_input_tokens_seen": 43332672, "step": 33425 }, { "epoch": 1.633425353626658, "grad_norm": 0.30284643173217773, "learning_rate": 3.2560563972309166e-06, "loss": 0.0813, "num_input_tokens_seen": 43338880, "step": 33430 }, { "epoch": 1.6336696552903525, "grad_norm": 0.14085792005062103, "learning_rate": 3.251213355762489e-06, "loss": 0.0774, "num_input_tokens_seen": 43345440, "step": 33435 }, { "epoch": 1.6339139569540468, "grad_norm": 0.2232472002506256, "learning_rate": 3.2463736682302707e-06, "loss": 0.0684, "num_input_tokens_seen": 43351936, "step": 33440 }, { "epoch": 1.6341582586177412, "grad_norm": 0.39699265360832214, "learning_rate": 3.2415373353806124e-06, "loss": 0.1056, "num_input_tokens_seen": 43358176, "step": 33445 }, { "epoch": 1.6344025602814356, "grad_norm": 0.1799163669347763, "learning_rate": 3.236704357959322e-06, "loss": 0.1286, "num_input_tokens_seen": 43364544, "step": 33450 }, { "epoch": 1.63464686194513, "grad_norm": 0.3247864842414856, "learning_rate": 3.2318747367117154e-06, "loss": 0.1068, "num_input_tokens_seen": 43370912, "step": 33455 }, { "epoch": 1.6348911636088241, "grad_norm": 0.15143516659736633, "learning_rate": 3.227048472382585e-06, "loss": 0.0722, "num_input_tokens_seen": 43377024, "step": 33460 }, { "epoch": 1.6351354652725185, "grad_norm": 0.643530547618866, "learning_rate": 3.2222255657161915e-06, "loss": 0.0984, "num_input_tokens_seen": 43383968, "step": 33465 }, { "epoch": 1.6353797669362127, "grad_norm": 0.5698816776275635, "learning_rate": 3.2174060174562924e-06, "loss": 0.0937, "num_input_tokens_seen": 43390528, "step": 33470 }, { "epoch": 1.635624068599907, "grad_norm": 0.22099487483501434, "learning_rate": 3.2125898283461298e-06, "loss": 0.091, "num_input_tokens_seen": 43397184, "step": 33475 }, { "epoch": 1.6358683702636014, "grad_norm": 0.35849830508232117, "learning_rate": 3.207776999128406e-06, "loss": 0.0953, "num_input_tokens_seen": 43403456, "step": 33480 }, { "epoch": 1.6361126719272958, "grad_norm": 0.18420176208019257, "learning_rate": 3.202967530545331e-06, "loss": 0.0778, "num_input_tokens_seen": 43409760, "step": 33485 }, { "epoch": 1.6363569735909902, "grad_norm": 0.4656493365764618, "learning_rate": 3.1981614233385778e-06, "loss": 0.0774, "num_input_tokens_seen": 43416800, "step": 33490 }, { "epoch": 1.6366012752546846, "grad_norm": 0.30720531940460205, "learning_rate": 3.1933586782493115e-06, "loss": 0.0671, "num_input_tokens_seen": 43423648, "step": 33495 }, { "epoch": 1.636845576918379, "grad_norm": 0.48887962102890015, "learning_rate": 3.188559296018184e-06, "loss": 0.0854, "num_input_tokens_seen": 43430240, "step": 33500 }, { "epoch": 1.637089878582073, "grad_norm": 0.34021544456481934, "learning_rate": 3.1837632773853098e-06, "loss": 0.0773, "num_input_tokens_seen": 43436832, "step": 33505 }, { "epoch": 1.6373341802457675, "grad_norm": 0.26918989419937134, "learning_rate": 3.178970623090294e-06, "loss": 0.0798, "num_input_tokens_seen": 43442912, "step": 33510 }, { "epoch": 1.6375784819094616, "grad_norm": 0.6066461801528931, "learning_rate": 3.174181333872234e-06, "loss": 0.0835, "num_input_tokens_seen": 43449408, "step": 33515 }, { "epoch": 1.637822783573156, "grad_norm": 0.19387823343276978, "learning_rate": 3.169395410469686e-06, "loss": 0.0676, "num_input_tokens_seen": 43456096, "step": 33520 }, { "epoch": 1.6380670852368504, "grad_norm": 0.2408176213502884, "learning_rate": 3.164612853620713e-06, "loss": 0.0913, "num_input_tokens_seen": 43462528, "step": 33525 }, { "epoch": 1.6383113869005448, "grad_norm": 0.12679514288902283, "learning_rate": 3.1598336640628333e-06, "loss": 0.0886, "num_input_tokens_seen": 43469280, "step": 33530 }, { "epoch": 1.6385556885642392, "grad_norm": 0.21043813228607178, "learning_rate": 3.155057842533063e-06, "loss": 0.0696, "num_input_tokens_seen": 43475616, "step": 33535 }, { "epoch": 1.6387999902279335, "grad_norm": 0.5667757391929626, "learning_rate": 3.1502853897678984e-06, "loss": 0.0828, "num_input_tokens_seen": 43482048, "step": 33540 }, { "epoch": 1.639044291891628, "grad_norm": 0.24202655255794525, "learning_rate": 3.1455163065033017e-06, "loss": 0.0876, "num_input_tokens_seen": 43488416, "step": 33545 }, { "epoch": 1.639288593555322, "grad_norm": 0.35039839148521423, "learning_rate": 3.140750593474734e-06, "loss": 0.1047, "num_input_tokens_seen": 43495136, "step": 33550 }, { "epoch": 1.6395328952190165, "grad_norm": 0.24239954352378845, "learning_rate": 3.1359882514171294e-06, "loss": 0.0876, "num_input_tokens_seen": 43501472, "step": 33555 }, { "epoch": 1.6397771968827106, "grad_norm": 0.461875855922699, "learning_rate": 3.1312292810648903e-06, "loss": 0.0881, "num_input_tokens_seen": 43508032, "step": 33560 }, { "epoch": 1.640021498546405, "grad_norm": 0.6431345343589783, "learning_rate": 3.1264736831519204e-06, "loss": 0.0603, "num_input_tokens_seen": 43514304, "step": 33565 }, { "epoch": 1.6402658002100994, "grad_norm": 0.15000571310520172, "learning_rate": 3.1217214584115863e-06, "loss": 0.0893, "num_input_tokens_seen": 43520960, "step": 33570 }, { "epoch": 1.6405101018737938, "grad_norm": 0.5798916816711426, "learning_rate": 3.116972607576746e-06, "loss": 0.0852, "num_input_tokens_seen": 43527040, "step": 33575 }, { "epoch": 1.6407544035374881, "grad_norm": 0.25317031145095825, "learning_rate": 3.1122271313797303e-06, "loss": 0.0676, "num_input_tokens_seen": 43533536, "step": 33580 }, { "epoch": 1.6409987052011825, "grad_norm": 0.16549186408519745, "learning_rate": 3.107485030552343e-06, "loss": 0.0845, "num_input_tokens_seen": 43540288, "step": 33585 }, { "epoch": 1.641243006864877, "grad_norm": 0.6415577530860901, "learning_rate": 3.1027463058258848e-06, "loss": 0.0883, "num_input_tokens_seen": 43546720, "step": 33590 }, { "epoch": 1.641487308528571, "grad_norm": 0.36643490195274353, "learning_rate": 3.0980109579311273e-06, "loss": 0.0894, "num_input_tokens_seen": 43552832, "step": 33595 }, { "epoch": 1.6417316101922654, "grad_norm": 0.2102011889219284, "learning_rate": 3.093278987598314e-06, "loss": 0.0678, "num_input_tokens_seen": 43559424, "step": 33600 }, { "epoch": 1.6417316101922654, "eval_loss": 0.08745572715997696, "eval_runtime": 374.7364, "eval_samples_per_second": 97.095, "eval_steps_per_second": 24.276, "num_input_tokens_seen": 43559424, "step": 33600 }, { "epoch": 1.6419759118559596, "grad_norm": 0.293713241815567, "learning_rate": 3.0885503955571826e-06, "loss": 0.0703, "num_input_tokens_seen": 43565664, "step": 33605 }, { "epoch": 1.642220213519654, "grad_norm": 0.3773508667945862, "learning_rate": 3.0838251825369313e-06, "loss": 0.0801, "num_input_tokens_seen": 43572000, "step": 33610 }, { "epoch": 1.6424645151833484, "grad_norm": 0.12176062166690826, "learning_rate": 3.0791033492662517e-06, "loss": 0.0725, "num_input_tokens_seen": 43578304, "step": 33615 }, { "epoch": 1.6427088168470427, "grad_norm": 0.3289298415184021, "learning_rate": 3.0743848964733203e-06, "loss": 0.0693, "num_input_tokens_seen": 43584672, "step": 33620 }, { "epoch": 1.642953118510737, "grad_norm": 0.16341644525527954, "learning_rate": 3.0696698248857625e-06, "loss": 0.0908, "num_input_tokens_seen": 43591072, "step": 33625 }, { "epoch": 1.6431974201744315, "grad_norm": 0.28773823380470276, "learning_rate": 3.0649581352307192e-06, "loss": 0.1149, "num_input_tokens_seen": 43597056, "step": 33630 }, { "epoch": 1.6434417218381259, "grad_norm": 0.9149653315544128, "learning_rate": 3.060249828234776e-06, "loss": 0.0866, "num_input_tokens_seen": 43603584, "step": 33635 }, { "epoch": 1.64368602350182, "grad_norm": 0.3495566248893738, "learning_rate": 3.055544904624025e-06, "loss": 0.1069, "num_input_tokens_seen": 43609984, "step": 33640 }, { "epoch": 1.6439303251655144, "grad_norm": 0.15473677217960358, "learning_rate": 3.050843365124026e-06, "loss": 0.0773, "num_input_tokens_seen": 43616544, "step": 33645 }, { "epoch": 1.6441746268292086, "grad_norm": 0.7845650911331177, "learning_rate": 3.0461452104598083e-06, "loss": 0.0974, "num_input_tokens_seen": 43622912, "step": 33650 }, { "epoch": 1.644418928492903, "grad_norm": 0.47779083251953125, "learning_rate": 3.0414504413558836e-06, "loss": 0.0888, "num_input_tokens_seen": 43629984, "step": 33655 }, { "epoch": 1.6446632301565973, "grad_norm": 0.48302146792411804, "learning_rate": 3.0367590585362564e-06, "loss": 0.0892, "num_input_tokens_seen": 43636480, "step": 33660 }, { "epoch": 1.6449075318202917, "grad_norm": 0.2282746136188507, "learning_rate": 3.0320710627243813e-06, "loss": 0.0754, "num_input_tokens_seen": 43643008, "step": 33665 }, { "epoch": 1.645151833483986, "grad_norm": 0.4066648483276367, "learning_rate": 3.027386454643222e-06, "loss": 0.0897, "num_input_tokens_seen": 43648928, "step": 33670 }, { "epoch": 1.6453961351476805, "grad_norm": 0.15482747554779053, "learning_rate": 3.0227052350151914e-06, "loss": 0.0882, "num_input_tokens_seen": 43655328, "step": 33675 }, { "epoch": 1.6456404368113748, "grad_norm": 0.47855839133262634, "learning_rate": 3.0180274045621957e-06, "loss": 0.0782, "num_input_tokens_seen": 43661664, "step": 33680 }, { "epoch": 1.645884738475069, "grad_norm": 0.37983423471450806, "learning_rate": 3.013352964005625e-06, "loss": 0.084, "num_input_tokens_seen": 43667712, "step": 33685 }, { "epoch": 1.6461290401387634, "grad_norm": 0.25942471623420715, "learning_rate": 3.0086819140663218e-06, "loss": 0.0911, "num_input_tokens_seen": 43674112, "step": 33690 }, { "epoch": 1.6463733418024575, "grad_norm": 0.45124390721321106, "learning_rate": 3.0040142554646265e-06, "loss": 0.0598, "num_input_tokens_seen": 43680256, "step": 33695 }, { "epoch": 1.646617643466152, "grad_norm": 0.19565404951572418, "learning_rate": 2.999349988920361e-06, "loss": 0.0782, "num_input_tokens_seen": 43686432, "step": 33700 }, { "epoch": 1.6468619451298463, "grad_norm": 0.14768418669700623, "learning_rate": 2.994689115152796e-06, "loss": 0.0777, "num_input_tokens_seen": 43692800, "step": 33705 }, { "epoch": 1.6471062467935407, "grad_norm": 0.22769154608249664, "learning_rate": 2.9900316348807105e-06, "loss": 0.067, "num_input_tokens_seen": 43699392, "step": 33710 }, { "epoch": 1.647350548457235, "grad_norm": 0.14416059851646423, "learning_rate": 2.985377548822338e-06, "loss": 0.1011, "num_input_tokens_seen": 43705792, "step": 33715 }, { "epoch": 1.6475948501209294, "grad_norm": 0.2917579412460327, "learning_rate": 2.980726857695404e-06, "loss": 0.0566, "num_input_tokens_seen": 43712256, "step": 33720 }, { "epoch": 1.6478391517846238, "grad_norm": 0.2344561368227005, "learning_rate": 2.9760795622171017e-06, "loss": 0.0899, "num_input_tokens_seen": 43718912, "step": 33725 }, { "epoch": 1.648083453448318, "grad_norm": 0.24195100367069244, "learning_rate": 2.971435663104094e-06, "loss": 0.0726, "num_input_tokens_seen": 43725248, "step": 33730 }, { "epoch": 1.6483277551120123, "grad_norm": 0.43295150995254517, "learning_rate": 2.9667951610725385e-06, "loss": 0.0629, "num_input_tokens_seen": 43731936, "step": 33735 }, { "epoch": 1.6485720567757065, "grad_norm": 0.3331984579563141, "learning_rate": 2.9621580568380575e-06, "loss": 0.0923, "num_input_tokens_seen": 43738208, "step": 33740 }, { "epoch": 1.6488163584394009, "grad_norm": 0.41754183173179626, "learning_rate": 2.9575243511157453e-06, "loss": 0.0907, "num_input_tokens_seen": 43744608, "step": 33745 }, { "epoch": 1.6490606601030953, "grad_norm": 0.45206573605537415, "learning_rate": 2.952894044620186e-06, "loss": 0.0905, "num_input_tokens_seen": 43751072, "step": 33750 }, { "epoch": 1.6493049617667896, "grad_norm": 0.4820760190486908, "learning_rate": 2.948267138065419e-06, "loss": 0.0857, "num_input_tokens_seen": 43757312, "step": 33755 }, { "epoch": 1.649549263430484, "grad_norm": 0.44265204668045044, "learning_rate": 2.943643632164983e-06, "loss": 0.111, "num_input_tokens_seen": 43763616, "step": 33760 }, { "epoch": 1.6497935650941784, "grad_norm": 0.42197173833847046, "learning_rate": 2.939023527631879e-06, "loss": 0.1114, "num_input_tokens_seen": 43770240, "step": 33765 }, { "epoch": 1.6500378667578728, "grad_norm": 0.2548753619194031, "learning_rate": 2.934406825178576e-06, "loss": 0.0764, "num_input_tokens_seen": 43776672, "step": 33770 }, { "epoch": 1.650282168421567, "grad_norm": 0.20060069859027863, "learning_rate": 2.9297935255170357e-06, "loss": 0.1161, "num_input_tokens_seen": 43782848, "step": 33775 }, { "epoch": 1.6505264700852613, "grad_norm": 0.36338770389556885, "learning_rate": 2.925183629358691e-06, "loss": 0.0682, "num_input_tokens_seen": 43789376, "step": 33780 }, { "epoch": 1.6507707717489555, "grad_norm": 0.40961819887161255, "learning_rate": 2.9205771374144346e-06, "loss": 0.0934, "num_input_tokens_seen": 43795872, "step": 33785 }, { "epoch": 1.6510150734126499, "grad_norm": 0.5421215295791626, "learning_rate": 2.915974050394657e-06, "loss": 0.0775, "num_input_tokens_seen": 43802688, "step": 33790 }, { "epoch": 1.6512593750763442, "grad_norm": 0.5701448917388916, "learning_rate": 2.9113743690092067e-06, "loss": 0.0916, "num_input_tokens_seen": 43809024, "step": 33795 }, { "epoch": 1.6515036767400386, "grad_norm": 0.4574601650238037, "learning_rate": 2.906778093967402e-06, "loss": 0.0985, "num_input_tokens_seen": 43815424, "step": 33800 }, { "epoch": 1.6515036767400386, "eval_loss": 0.08732643723487854, "eval_runtime": 374.5097, "eval_samples_per_second": 97.154, "eval_steps_per_second": 24.29, "num_input_tokens_seen": 43815424, "step": 33800 }, { "epoch": 1.651747978403733, "grad_norm": 0.4287489950656891, "learning_rate": 2.9021852259780656e-06, "loss": 0.0761, "num_input_tokens_seen": 43822720, "step": 33805 }, { "epoch": 1.6519922800674274, "grad_norm": 0.8068355321884155, "learning_rate": 2.8975957657494583e-06, "loss": 0.0938, "num_input_tokens_seen": 43829088, "step": 33810 }, { "epoch": 1.6522365817311215, "grad_norm": 0.38653364777565, "learning_rate": 2.8930097139893417e-06, "loss": 0.0962, "num_input_tokens_seen": 43835392, "step": 33815 }, { "epoch": 1.652480883394816, "grad_norm": 0.4144250750541687, "learning_rate": 2.888427071404945e-06, "loss": 0.0853, "num_input_tokens_seen": 43841600, "step": 33820 }, { "epoch": 1.6527251850585103, "grad_norm": 0.3238370418548584, "learning_rate": 2.8838478387029606e-06, "loss": 0.0804, "num_input_tokens_seen": 43848096, "step": 33825 }, { "epoch": 1.6529694867222045, "grad_norm": 0.243028461933136, "learning_rate": 2.8792720165895737e-06, "loss": 0.1073, "num_input_tokens_seen": 43854752, "step": 33830 }, { "epoch": 1.6532137883858988, "grad_norm": 0.21241331100463867, "learning_rate": 2.874699605770423e-06, "loss": 0.0812, "num_input_tokens_seen": 43861408, "step": 33835 }, { "epoch": 1.6534580900495932, "grad_norm": 0.11562520265579224, "learning_rate": 2.8701306069506383e-06, "loss": 0.0801, "num_input_tokens_seen": 43867936, "step": 33840 }, { "epoch": 1.6537023917132876, "grad_norm": 0.7003346085548401, "learning_rate": 2.8655650208348178e-06, "loss": 0.0901, "num_input_tokens_seen": 43874592, "step": 33845 }, { "epoch": 1.653946693376982, "grad_norm": 0.20749999582767487, "learning_rate": 2.8610028481270257e-06, "loss": 0.0846, "num_input_tokens_seen": 43881024, "step": 33850 }, { "epoch": 1.6541909950406763, "grad_norm": 0.17787757515907288, "learning_rate": 2.856444089530813e-06, "loss": 0.0894, "num_input_tokens_seen": 43887328, "step": 33855 }, { "epoch": 1.6544352967043705, "grad_norm": 0.705316960811615, "learning_rate": 2.8518887457491955e-06, "loss": 0.0908, "num_input_tokens_seen": 43893856, "step": 33860 }, { "epoch": 1.6546795983680649, "grad_norm": 0.35889315605163574, "learning_rate": 2.8473368174846666e-06, "loss": 0.0686, "num_input_tokens_seen": 43900352, "step": 33865 }, { "epoch": 1.6549239000317593, "grad_norm": 0.27334678173065186, "learning_rate": 2.842788305439184e-06, "loss": 0.0973, "num_input_tokens_seen": 43906656, "step": 33870 }, { "epoch": 1.6551682016954534, "grad_norm": 0.1488208770751953, "learning_rate": 2.8382432103141925e-06, "loss": 0.0666, "num_input_tokens_seen": 43912960, "step": 33875 }, { "epoch": 1.6554125033591478, "grad_norm": 0.4529539942741394, "learning_rate": 2.833701532810598e-06, "loss": 0.0866, "num_input_tokens_seen": 43919360, "step": 33880 }, { "epoch": 1.6556568050228422, "grad_norm": 0.24764278531074524, "learning_rate": 2.8291632736287877e-06, "loss": 0.0865, "num_input_tokens_seen": 43925696, "step": 33885 }, { "epoch": 1.6559011066865366, "grad_norm": 0.27732017636299133, "learning_rate": 2.824628433468615e-06, "loss": 0.0871, "num_input_tokens_seen": 43932128, "step": 33890 }, { "epoch": 1.656145408350231, "grad_norm": 0.9545009732246399, "learning_rate": 2.8200970130294073e-06, "loss": 0.1029, "num_input_tokens_seen": 43938368, "step": 33895 }, { "epoch": 1.6563897100139253, "grad_norm": 0.2500476539134979, "learning_rate": 2.8155690130099775e-06, "loss": 0.0749, "num_input_tokens_seen": 43944768, "step": 33900 }, { "epoch": 1.6566340116776195, "grad_norm": 0.21361063420772552, "learning_rate": 2.8110444341085895e-06, "loss": 0.0799, "num_input_tokens_seen": 43951296, "step": 33905 }, { "epoch": 1.6568783133413139, "grad_norm": 0.32322967052459717, "learning_rate": 2.806523277022996e-06, "loss": 0.0885, "num_input_tokens_seen": 43957952, "step": 33910 }, { "epoch": 1.6571226150050082, "grad_norm": 0.5026939511299133, "learning_rate": 2.802005542450409e-06, "loss": 0.0775, "num_input_tokens_seen": 43964320, "step": 33915 }, { "epoch": 1.6573669166687024, "grad_norm": 0.19974584877490997, "learning_rate": 2.797491231087526e-06, "loss": 0.083, "num_input_tokens_seen": 43970976, "step": 33920 }, { "epoch": 1.6576112183323968, "grad_norm": 0.5715603828430176, "learning_rate": 2.7929803436305137e-06, "loss": 0.0779, "num_input_tokens_seen": 43977472, "step": 33925 }, { "epoch": 1.6578555199960912, "grad_norm": 0.17410622537136078, "learning_rate": 2.788472880774998e-06, "loss": 0.0804, "num_input_tokens_seen": 43983808, "step": 33930 }, { "epoch": 1.6580998216597855, "grad_norm": 0.1768808811903, "learning_rate": 2.7839688432160977e-06, "loss": 0.1039, "num_input_tokens_seen": 43989920, "step": 33935 }, { "epoch": 1.65834412332348, "grad_norm": 0.1468597948551178, "learning_rate": 2.779468231648383e-06, "loss": 0.065, "num_input_tokens_seen": 43996192, "step": 33940 }, { "epoch": 1.6585884249871743, "grad_norm": 0.22285045683383942, "learning_rate": 2.774971046765906e-06, "loss": 0.0928, "num_input_tokens_seen": 44002656, "step": 33945 }, { "epoch": 1.6588327266508684, "grad_norm": 0.18903540074825287, "learning_rate": 2.770477289262194e-06, "loss": 0.0789, "num_input_tokens_seen": 44008960, "step": 33950 }, { "epoch": 1.6590770283145628, "grad_norm": 0.23833099007606506, "learning_rate": 2.765986959830233e-06, "loss": 0.0682, "num_input_tokens_seen": 44015616, "step": 33955 }, { "epoch": 1.659321329978257, "grad_norm": 0.16048091650009155, "learning_rate": 2.761500059162492e-06, "loss": 0.072, "num_input_tokens_seen": 44022816, "step": 33960 }, { "epoch": 1.6595656316419514, "grad_norm": 0.17569135129451752, "learning_rate": 2.757016587950914e-06, "loss": 0.0831, "num_input_tokens_seen": 44029344, "step": 33965 }, { "epoch": 1.6598099333056457, "grad_norm": 0.6266214847564697, "learning_rate": 2.752536546886897e-06, "loss": 0.0933, "num_input_tokens_seen": 44035776, "step": 33970 }, { "epoch": 1.6600542349693401, "grad_norm": 0.3761709928512573, "learning_rate": 2.7480599366613234e-06, "loss": 0.0869, "num_input_tokens_seen": 44041984, "step": 33975 }, { "epoch": 1.6602985366330345, "grad_norm": 0.1044292226433754, "learning_rate": 2.7435867579645473e-06, "loss": 0.0605, "num_input_tokens_seen": 44048576, "step": 33980 }, { "epoch": 1.6605428382967289, "grad_norm": 0.12020748853683472, "learning_rate": 2.739117011486378e-06, "loss": 0.0533, "num_input_tokens_seen": 44054976, "step": 33985 }, { "epoch": 1.6607871399604233, "grad_norm": 0.22981776297092438, "learning_rate": 2.7346506979161216e-06, "loss": 0.0836, "num_input_tokens_seen": 44061280, "step": 33990 }, { "epoch": 1.6610314416241174, "grad_norm": 0.40388742089271545, "learning_rate": 2.7301878179425227e-06, "loss": 0.088, "num_input_tokens_seen": 44067936, "step": 33995 }, { "epoch": 1.6612757432878118, "grad_norm": 0.6130599975585938, "learning_rate": 2.7257283722538244e-06, "loss": 0.0736, "num_input_tokens_seen": 44074432, "step": 34000 }, { "epoch": 1.6612757432878118, "eval_loss": 0.08721271902322769, "eval_runtime": 374.9639, "eval_samples_per_second": 97.036, "eval_steps_per_second": 24.261, "num_input_tokens_seen": 44074432, "step": 34000 }, { "epoch": 1.661520044951506, "grad_norm": 0.2206110805273056, "learning_rate": 2.7212723615377326e-06, "loss": 0.1, "num_input_tokens_seen": 44081120, "step": 34005 }, { "epoch": 1.6617643466152003, "grad_norm": 0.14902523159980774, "learning_rate": 2.7168197864814145e-06, "loss": 0.076, "num_input_tokens_seen": 44087424, "step": 34010 }, { "epoch": 1.6620086482788947, "grad_norm": 0.3835611343383789, "learning_rate": 2.712370647771509e-06, "loss": 0.0893, "num_input_tokens_seen": 44093856, "step": 34015 }, { "epoch": 1.662252949942589, "grad_norm": 0.2026328593492508, "learning_rate": 2.707924946094137e-06, "loss": 0.1024, "num_input_tokens_seen": 44100384, "step": 34020 }, { "epoch": 1.6624972516062835, "grad_norm": 0.3717241585254669, "learning_rate": 2.7034826821348723e-06, "loss": 0.0811, "num_input_tokens_seen": 44106752, "step": 34025 }, { "epoch": 1.6627415532699779, "grad_norm": 0.24878771603107452, "learning_rate": 2.6990438565787786e-06, "loss": 0.0762, "num_input_tokens_seen": 44113632, "step": 34030 }, { "epoch": 1.6629858549336722, "grad_norm": 1.1730865240097046, "learning_rate": 2.6946084701103714e-06, "loss": 0.0807, "num_input_tokens_seen": 44120096, "step": 34035 }, { "epoch": 1.6632301565973664, "grad_norm": 0.19578193128108978, "learning_rate": 2.6901765234136428e-06, "loss": 0.0682, "num_input_tokens_seen": 44126656, "step": 34040 }, { "epoch": 1.6634744582610608, "grad_norm": 0.33296042680740356, "learning_rate": 2.685748017172063e-06, "loss": 0.0969, "num_input_tokens_seen": 44132896, "step": 34045 }, { "epoch": 1.663718759924755, "grad_norm": 0.19720013439655304, "learning_rate": 2.681322952068549e-06, "loss": 0.0748, "num_input_tokens_seen": 44139008, "step": 34050 }, { "epoch": 1.6639630615884493, "grad_norm": 0.2104743868112564, "learning_rate": 2.6769013287855137e-06, "loss": 0.0785, "num_input_tokens_seen": 44145728, "step": 34055 }, { "epoch": 1.6642073632521437, "grad_norm": 0.18192711472511292, "learning_rate": 2.6724831480048286e-06, "loss": 0.0718, "num_input_tokens_seen": 44152896, "step": 34060 }, { "epoch": 1.664451664915838, "grad_norm": 0.2797521948814392, "learning_rate": 2.66806841040782e-06, "loss": 0.0894, "num_input_tokens_seen": 44159520, "step": 34065 }, { "epoch": 1.6646959665795324, "grad_norm": 0.22350095212459564, "learning_rate": 2.6636571166753083e-06, "loss": 0.0694, "num_input_tokens_seen": 44166048, "step": 34070 }, { "epoch": 1.6649402682432268, "grad_norm": 0.19060476124286652, "learning_rate": 2.6592492674875598e-06, "loss": 0.0664, "num_input_tokens_seen": 44172288, "step": 34075 }, { "epoch": 1.6651845699069212, "grad_norm": 0.2274918407201767, "learning_rate": 2.6548448635243305e-06, "loss": 0.0734, "num_input_tokens_seen": 44178464, "step": 34080 }, { "epoch": 1.6654288715706154, "grad_norm": 0.5086325407028198, "learning_rate": 2.650443905464828e-06, "loss": 0.0934, "num_input_tokens_seen": 44184928, "step": 34085 }, { "epoch": 1.6656731732343097, "grad_norm": 0.2996973693370819, "learning_rate": 2.646046393987739e-06, "loss": 0.0969, "num_input_tokens_seen": 44191456, "step": 34090 }, { "epoch": 1.665917474898004, "grad_norm": 0.17243404686450958, "learning_rate": 2.64165232977121e-06, "loss": 0.0563, "num_input_tokens_seen": 44198080, "step": 34095 }, { "epoch": 1.6661617765616983, "grad_norm": 0.2931663990020752, "learning_rate": 2.6372617134928695e-06, "loss": 0.0978, "num_input_tokens_seen": 44204672, "step": 34100 }, { "epoch": 1.6664060782253927, "grad_norm": 0.37470653653144836, "learning_rate": 2.6328745458297943e-06, "loss": 0.0786, "num_input_tokens_seen": 44211040, "step": 34105 }, { "epoch": 1.666650379889087, "grad_norm": 0.474650502204895, "learning_rate": 2.6284908274585546e-06, "loss": 0.0868, "num_input_tokens_seen": 44217728, "step": 34110 }, { "epoch": 1.6668946815527814, "grad_norm": 0.12622949481010437, "learning_rate": 2.6241105590551595e-06, "loss": 0.0808, "num_input_tokens_seen": 44224000, "step": 34115 }, { "epoch": 1.6671389832164758, "grad_norm": 0.3057810068130493, "learning_rate": 2.6197337412951105e-06, "loss": 0.0591, "num_input_tokens_seen": 44230752, "step": 34120 }, { "epoch": 1.6673832848801702, "grad_norm": 0.19904476404190063, "learning_rate": 2.6153603748533705e-06, "loss": 0.075, "num_input_tokens_seen": 44237312, "step": 34125 }, { "epoch": 1.6676275865438643, "grad_norm": 0.334049254655838, "learning_rate": 2.6109904604043585e-06, "loss": 0.0786, "num_input_tokens_seen": 44243712, "step": 34130 }, { "epoch": 1.6678718882075587, "grad_norm": 0.2937774062156677, "learning_rate": 2.6066239986219765e-06, "loss": 0.0893, "num_input_tokens_seen": 44249856, "step": 34135 }, { "epoch": 1.6681161898712529, "grad_norm": 0.1807953417301178, "learning_rate": 2.602260990179592e-06, "loss": 0.0944, "num_input_tokens_seen": 44256160, "step": 34140 }, { "epoch": 1.6683604915349473, "grad_norm": 0.24107928574085236, "learning_rate": 2.5979014357500248e-06, "loss": 0.1025, "num_input_tokens_seen": 44262400, "step": 34145 }, { "epoch": 1.6686047931986416, "grad_norm": 0.19886542856693268, "learning_rate": 2.5935453360055844e-06, "loss": 0.0795, "num_input_tokens_seen": 44268800, "step": 34150 }, { "epoch": 1.668849094862336, "grad_norm": 0.2662227153778076, "learning_rate": 2.5891926916180283e-06, "loss": 0.0811, "num_input_tokens_seen": 44274784, "step": 34155 }, { "epoch": 1.6690933965260304, "grad_norm": 0.2766326665878296, "learning_rate": 2.5848435032585883e-06, "loss": 0.1018, "num_input_tokens_seen": 44281152, "step": 34160 }, { "epoch": 1.6693376981897248, "grad_norm": 0.42498689889907837, "learning_rate": 2.58049777159797e-06, "loss": 0.0762, "num_input_tokens_seen": 44287904, "step": 34165 }, { "epoch": 1.6695819998534192, "grad_norm": 0.08910827338695526, "learning_rate": 2.576155497306332e-06, "loss": 0.0586, "num_input_tokens_seen": 44294496, "step": 34170 }, { "epoch": 1.6698263015171133, "grad_norm": 0.3382999300956726, "learning_rate": 2.57181668105331e-06, "loss": 0.0693, "num_input_tokens_seen": 44301120, "step": 34175 }, { "epoch": 1.6700706031808077, "grad_norm": 0.19943715631961823, "learning_rate": 2.567481323508014e-06, "loss": 0.0908, "num_input_tokens_seen": 44308096, "step": 34180 }, { "epoch": 1.6703149048445018, "grad_norm": 1.1123971939086914, "learning_rate": 2.5631494253389954e-06, "loss": 0.1181, "num_input_tokens_seen": 44314592, "step": 34185 }, { "epoch": 1.6705592065081962, "grad_norm": 0.14150527119636536, "learning_rate": 2.5588209872142997e-06, "loss": 0.088, "num_input_tokens_seen": 44321152, "step": 34190 }, { "epoch": 1.6708035081718906, "grad_norm": 0.5196865200996399, "learning_rate": 2.5544960098014186e-06, "loss": 0.0999, "num_input_tokens_seen": 44327584, "step": 34195 }, { "epoch": 1.671047809835585, "grad_norm": 0.34611275792121887, "learning_rate": 2.550174493767318e-06, "loss": 0.0871, "num_input_tokens_seen": 44334304, "step": 34200 }, { "epoch": 1.671047809835585, "eval_loss": 0.08731072396039963, "eval_runtime": 374.6543, "eval_samples_per_second": 97.116, "eval_steps_per_second": 24.281, "num_input_tokens_seen": 44334304, "step": 34200 }, { "epoch": 1.6712921114992794, "grad_norm": 0.3391371965408325, "learning_rate": 2.545856439778438e-06, "loss": 0.0836, "num_input_tokens_seen": 44340928, "step": 34205 }, { "epoch": 1.6715364131629737, "grad_norm": 0.1932019293308258, "learning_rate": 2.541541848500667e-06, "loss": 0.071, "num_input_tokens_seen": 44347616, "step": 34210 }, { "epoch": 1.6717807148266681, "grad_norm": 0.4275667071342468, "learning_rate": 2.5372307205993733e-06, "loss": 0.0724, "num_input_tokens_seen": 44354240, "step": 34215 }, { "epoch": 1.6720250164903623, "grad_norm": 0.11396872252225876, "learning_rate": 2.5329230567393917e-06, "loss": 0.0817, "num_input_tokens_seen": 44361024, "step": 34220 }, { "epoch": 1.6722693181540567, "grad_norm": 0.25817498564720154, "learning_rate": 2.5286188575850164e-06, "loss": 0.0771, "num_input_tokens_seen": 44367232, "step": 34225 }, { "epoch": 1.6725136198177508, "grad_norm": 0.9601452946662903, "learning_rate": 2.5243181237999984e-06, "loss": 0.0981, "num_input_tokens_seen": 44374176, "step": 34230 }, { "epoch": 1.6727579214814452, "grad_norm": 0.3887992799282074, "learning_rate": 2.520020856047578e-06, "loss": 0.0683, "num_input_tokens_seen": 44380416, "step": 34235 }, { "epoch": 1.6730022231451396, "grad_norm": 0.5092947483062744, "learning_rate": 2.515727054990438e-06, "loss": 0.0806, "num_input_tokens_seen": 44386720, "step": 34240 }, { "epoch": 1.673246524808834, "grad_norm": 0.215668722987175, "learning_rate": 2.511436721290747e-06, "loss": 0.0934, "num_input_tokens_seen": 44393152, "step": 34245 }, { "epoch": 1.6734908264725283, "grad_norm": 0.4442688226699829, "learning_rate": 2.5071498556101164e-06, "loss": 0.0687, "num_input_tokens_seen": 44399808, "step": 34250 }, { "epoch": 1.6737351281362227, "grad_norm": 0.20522089302539825, "learning_rate": 2.5028664586096485e-06, "loss": 0.0753, "num_input_tokens_seen": 44406336, "step": 34255 }, { "epoch": 1.673979429799917, "grad_norm": 0.15845583379268646, "learning_rate": 2.498586530949881e-06, "loss": 0.0702, "num_input_tokens_seen": 44413312, "step": 34260 }, { "epoch": 1.6742237314636113, "grad_norm": 0.1996746063232422, "learning_rate": 2.4943100732908427e-06, "loss": 0.0723, "num_input_tokens_seen": 44419488, "step": 34265 }, { "epoch": 1.6744680331273056, "grad_norm": 0.13303451240062714, "learning_rate": 2.4900370862920188e-06, "loss": 0.0703, "num_input_tokens_seen": 44426048, "step": 34270 }, { "epoch": 1.6747123347909998, "grad_norm": 0.21099910140037537, "learning_rate": 2.4857675706123518e-06, "loss": 0.0758, "num_input_tokens_seen": 44432576, "step": 34275 }, { "epoch": 1.6749566364546942, "grad_norm": 0.6042290329933167, "learning_rate": 2.4815015269102543e-06, "loss": 0.076, "num_input_tokens_seen": 44439296, "step": 34280 }, { "epoch": 1.6752009381183885, "grad_norm": 0.4295180141925812, "learning_rate": 2.477238955843611e-06, "loss": 0.0976, "num_input_tokens_seen": 44445568, "step": 34285 }, { "epoch": 1.675445239782083, "grad_norm": 0.21909229457378387, "learning_rate": 2.4729798580697573e-06, "loss": 0.0679, "num_input_tokens_seen": 44452288, "step": 34290 }, { "epoch": 1.6756895414457773, "grad_norm": 0.2741876542568207, "learning_rate": 2.4687242342455034e-06, "loss": 0.0779, "num_input_tokens_seen": 44458848, "step": 34295 }, { "epoch": 1.6759338431094717, "grad_norm": 0.6021143794059753, "learning_rate": 2.4644720850271196e-06, "loss": 0.1013, "num_input_tokens_seen": 44465472, "step": 34300 }, { "epoch": 1.676178144773166, "grad_norm": 0.33441218733787537, "learning_rate": 2.4602234110703364e-06, "loss": 0.0925, "num_input_tokens_seen": 44471840, "step": 34305 }, { "epoch": 1.6764224464368602, "grad_norm": 0.44712013006210327, "learning_rate": 2.4559782130303576e-06, "loss": 0.0869, "num_input_tokens_seen": 44478112, "step": 34310 }, { "epoch": 1.6766667481005546, "grad_norm": 0.148274227976799, "learning_rate": 2.451736491561843e-06, "loss": 0.0774, "num_input_tokens_seen": 44484832, "step": 34315 }, { "epoch": 1.6769110497642488, "grad_norm": 0.38778287172317505, "learning_rate": 2.4474982473189163e-06, "loss": 0.0764, "num_input_tokens_seen": 44491744, "step": 34320 }, { "epoch": 1.6771553514279431, "grad_norm": 0.26754868030548096, "learning_rate": 2.4432634809551796e-06, "loss": 0.0787, "num_input_tokens_seen": 44498336, "step": 34325 }, { "epoch": 1.6773996530916375, "grad_norm": 0.14176690578460693, "learning_rate": 2.439032193123675e-06, "loss": 0.0944, "num_input_tokens_seen": 44504672, "step": 34330 }, { "epoch": 1.677643954755332, "grad_norm": 0.5182506442070007, "learning_rate": 2.4348043844769297e-06, "loss": 0.0992, "num_input_tokens_seen": 44511488, "step": 34335 }, { "epoch": 1.6778882564190263, "grad_norm": 0.20257596671581268, "learning_rate": 2.4305800556669146e-06, "loss": 0.0754, "num_input_tokens_seen": 44517760, "step": 34340 }, { "epoch": 1.6781325580827207, "grad_norm": 0.5352253913879395, "learning_rate": 2.426359207345083e-06, "loss": 0.0738, "num_input_tokens_seen": 44524512, "step": 34345 }, { "epoch": 1.678376859746415, "grad_norm": 0.5183080434799194, "learning_rate": 2.4221418401623396e-06, "loss": 0.0782, "num_input_tokens_seen": 44530560, "step": 34350 }, { "epoch": 1.6786211614101092, "grad_norm": 0.5590787529945374, "learning_rate": 2.4179279547690557e-06, "loss": 0.0885, "num_input_tokens_seen": 44536864, "step": 34355 }, { "epoch": 1.6788654630738036, "grad_norm": 0.2913488447666168, "learning_rate": 2.413717551815062e-06, "loss": 0.0891, "num_input_tokens_seen": 44543136, "step": 34360 }, { "epoch": 1.6791097647374977, "grad_norm": 0.2091982066631317, "learning_rate": 2.409510631949666e-06, "loss": 0.0622, "num_input_tokens_seen": 44549984, "step": 34365 }, { "epoch": 1.6793540664011921, "grad_norm": 0.14118872582912445, "learning_rate": 2.405307195821618e-06, "loss": 0.088, "num_input_tokens_seen": 44556768, "step": 34370 }, { "epoch": 1.6795983680648865, "grad_norm": 0.15143586695194244, "learning_rate": 2.4011072440791372e-06, "loss": 0.086, "num_input_tokens_seen": 44562848, "step": 34375 }, { "epoch": 1.6798426697285809, "grad_norm": 0.3177318572998047, "learning_rate": 2.3969107773699233e-06, "loss": 0.0956, "num_input_tokens_seen": 44569504, "step": 34380 }, { "epoch": 1.6800869713922753, "grad_norm": 0.34382179379463196, "learning_rate": 2.3927177963411096e-06, "loss": 0.1025, "num_input_tokens_seen": 44575552, "step": 34385 }, { "epoch": 1.6803312730559696, "grad_norm": 0.668940007686615, "learning_rate": 2.3885283016393144e-06, "loss": 0.0897, "num_input_tokens_seen": 44581856, "step": 34390 }, { "epoch": 1.6805755747196638, "grad_norm": 0.2641056776046753, "learning_rate": 2.3843422939106076e-06, "loss": 0.0811, "num_input_tokens_seen": 44588128, "step": 34395 }, { "epoch": 1.6808198763833582, "grad_norm": 0.29878154397010803, "learning_rate": 2.380159773800525e-06, "loss": 0.0751, "num_input_tokens_seen": 44594368, "step": 34400 }, { "epoch": 1.6808198763833582, "eval_loss": 0.08723931759595871, "eval_runtime": 375.1015, "eval_samples_per_second": 97.0, "eval_steps_per_second": 24.252, "num_input_tokens_seen": 44594368, "step": 34400 }, { "epoch": 1.6810641780470525, "grad_norm": 0.8855957388877869, "learning_rate": 2.3759807419540675e-06, "loss": 0.0991, "num_input_tokens_seen": 44600704, "step": 34405 }, { "epoch": 1.6813084797107467, "grad_norm": 0.19971992075443268, "learning_rate": 2.3718051990156835e-06, "loss": 0.086, "num_input_tokens_seen": 44607392, "step": 34410 }, { "epoch": 1.681552781374441, "grad_norm": 0.4275510907173157, "learning_rate": 2.367633145629311e-06, "loss": 0.0964, "num_input_tokens_seen": 44613472, "step": 34415 }, { "epoch": 1.6817970830381355, "grad_norm": 0.41002216935157776, "learning_rate": 2.363464582438316e-06, "loss": 0.1006, "num_input_tokens_seen": 44620096, "step": 34420 }, { "epoch": 1.6820413847018298, "grad_norm": 0.8700831532478333, "learning_rate": 2.3592995100855526e-06, "loss": 0.1115, "num_input_tokens_seen": 44626656, "step": 34425 }, { "epoch": 1.6822856863655242, "grad_norm": 0.17638356983661652, "learning_rate": 2.3551379292133273e-06, "loss": 0.0877, "num_input_tokens_seen": 44633152, "step": 34430 }, { "epoch": 1.6825299880292186, "grad_norm": 0.24857203662395477, "learning_rate": 2.3509798404634047e-06, "loss": 0.0902, "num_input_tokens_seen": 44639584, "step": 34435 }, { "epoch": 1.6827742896929128, "grad_norm": 0.24344654381275177, "learning_rate": 2.346825244477019e-06, "loss": 0.0928, "num_input_tokens_seen": 44645952, "step": 34440 }, { "epoch": 1.6830185913566071, "grad_norm": 1.3951479196548462, "learning_rate": 2.3426741418948545e-06, "loss": 0.1067, "num_input_tokens_seen": 44652288, "step": 34445 }, { "epoch": 1.6832628930203015, "grad_norm": 0.24570730328559875, "learning_rate": 2.3385265333570715e-06, "loss": 0.1231, "num_input_tokens_seen": 44658528, "step": 34450 }, { "epoch": 1.6835071946839957, "grad_norm": 0.24871647357940674, "learning_rate": 2.334382419503278e-06, "loss": 0.065, "num_input_tokens_seen": 44665440, "step": 34455 }, { "epoch": 1.68375149634769, "grad_norm": 0.6918848156929016, "learning_rate": 2.3302418009725465e-06, "loss": 0.0816, "num_input_tokens_seen": 44672480, "step": 34460 }, { "epoch": 1.6839957980113844, "grad_norm": 0.20002593100070953, "learning_rate": 2.326104678403415e-06, "loss": 0.0665, "num_input_tokens_seen": 44679008, "step": 34465 }, { "epoch": 1.6842400996750788, "grad_norm": 0.21452826261520386, "learning_rate": 2.321971052433883e-06, "loss": 0.082, "num_input_tokens_seen": 44685440, "step": 34470 }, { "epoch": 1.6844844013387732, "grad_norm": 0.5269829630851746, "learning_rate": 2.3178409237014004e-06, "loss": 0.09, "num_input_tokens_seen": 44691904, "step": 34475 }, { "epoch": 1.6847287030024676, "grad_norm": 0.4449310600757599, "learning_rate": 2.313714292842889e-06, "loss": 0.092, "num_input_tokens_seen": 44698080, "step": 34480 }, { "epoch": 1.6849730046661617, "grad_norm": 0.603649377822876, "learning_rate": 2.309591160494734e-06, "loss": 0.0813, "num_input_tokens_seen": 44704384, "step": 34485 }, { "epoch": 1.6852173063298561, "grad_norm": 0.1894998848438263, "learning_rate": 2.305471527292763e-06, "loss": 0.0992, "num_input_tokens_seen": 44710688, "step": 34490 }, { "epoch": 1.6854616079935503, "grad_norm": 0.17763568460941315, "learning_rate": 2.3013553938722817e-06, "loss": 0.0872, "num_input_tokens_seen": 44716992, "step": 34495 }, { "epoch": 1.6857059096572446, "grad_norm": 1.0420705080032349, "learning_rate": 2.297242760868043e-06, "loss": 0.0893, "num_input_tokens_seen": 44723584, "step": 34500 }, { "epoch": 1.685950211320939, "grad_norm": 0.38435402512550354, "learning_rate": 2.2931336289142735e-06, "loss": 0.105, "num_input_tokens_seen": 44730176, "step": 34505 }, { "epoch": 1.6861945129846334, "grad_norm": 1.027924656867981, "learning_rate": 2.289027998644655e-06, "loss": 0.0964, "num_input_tokens_seen": 44736704, "step": 34510 }, { "epoch": 1.6864388146483278, "grad_norm": 0.24523182213306427, "learning_rate": 2.2849258706923228e-06, "loss": 0.0843, "num_input_tokens_seen": 44743136, "step": 34515 }, { "epoch": 1.6866831163120222, "grad_norm": 0.2798115313053131, "learning_rate": 2.2808272456898705e-06, "loss": 0.0891, "num_input_tokens_seen": 44749568, "step": 34520 }, { "epoch": 1.6869274179757165, "grad_norm": 0.3200191557407379, "learning_rate": 2.2767321242693707e-06, "loss": 0.075, "num_input_tokens_seen": 44756320, "step": 34525 }, { "epoch": 1.6871717196394107, "grad_norm": 0.21027541160583496, "learning_rate": 2.272640507062329e-06, "loss": 0.0719, "num_input_tokens_seen": 44762208, "step": 34530 }, { "epoch": 1.687416021303105, "grad_norm": 0.2610880732536316, "learning_rate": 2.2685523946997382e-06, "loss": 0.0972, "num_input_tokens_seen": 44768192, "step": 34535 }, { "epoch": 1.6876603229667992, "grad_norm": 0.2587672472000122, "learning_rate": 2.2644677878120245e-06, "loss": 0.0746, "num_input_tokens_seen": 44774240, "step": 34540 }, { "epoch": 1.6879046246304936, "grad_norm": 0.4263085722923279, "learning_rate": 2.2603866870290897e-06, "loss": 0.0513, "num_input_tokens_seen": 44780672, "step": 34545 }, { "epoch": 1.688148926294188, "grad_norm": 0.19288672506809235, "learning_rate": 2.256309092980294e-06, "loss": 0.0687, "num_input_tokens_seen": 44787104, "step": 34550 }, { "epoch": 1.6883932279578824, "grad_norm": 0.4720199704170227, "learning_rate": 2.252235006294448e-06, "loss": 0.0801, "num_input_tokens_seen": 44793568, "step": 34555 }, { "epoch": 1.6886375296215768, "grad_norm": 0.23902635276317596, "learning_rate": 2.2481644275998333e-06, "loss": 0.0951, "num_input_tokens_seen": 44800224, "step": 34560 }, { "epoch": 1.6888818312852711, "grad_norm": 0.15208564698696136, "learning_rate": 2.2440973575241832e-06, "loss": 0.071, "num_input_tokens_seen": 44806688, "step": 34565 }, { "epoch": 1.6891261329489655, "grad_norm": 0.23057104647159576, "learning_rate": 2.240033796694685e-06, "loss": 0.0969, "num_input_tokens_seen": 44813888, "step": 34570 }, { "epoch": 1.6893704346126597, "grad_norm": 0.4930696189403534, "learning_rate": 2.235973745737999e-06, "loss": 0.1062, "num_input_tokens_seen": 44820832, "step": 34575 }, { "epoch": 1.689614736276354, "grad_norm": 0.2512300908565521, "learning_rate": 2.2319172052802263e-06, "loss": 0.1282, "num_input_tokens_seen": 44826848, "step": 34580 }, { "epoch": 1.6898590379400482, "grad_norm": 0.2956909239292145, "learning_rate": 2.2278641759469477e-06, "loss": 0.0971, "num_input_tokens_seen": 44833152, "step": 34585 }, { "epoch": 1.6901033396037426, "grad_norm": 0.2517082095146179, "learning_rate": 2.2238146583631825e-06, "loss": 0.0692, "num_input_tokens_seen": 44839808, "step": 34590 }, { "epoch": 1.690347641267437, "grad_norm": 0.7378585338592529, "learning_rate": 2.2197686531534256e-06, "loss": 0.0899, "num_input_tokens_seen": 44846048, "step": 34595 }, { "epoch": 1.6905919429311314, "grad_norm": 0.2297006994485855, "learning_rate": 2.2157261609416087e-06, "loss": 0.0937, "num_input_tokens_seen": 44852576, "step": 34600 }, { "epoch": 1.6905919429311314, "eval_loss": 0.08735229074954987, "eval_runtime": 374.382, "eval_samples_per_second": 97.187, "eval_steps_per_second": 24.299, "num_input_tokens_seen": 44852576, "step": 34600 }, { "epoch": 1.6908362445948257, "grad_norm": 0.23846836388111115, "learning_rate": 2.211687182351149e-06, "loss": 0.0966, "num_input_tokens_seen": 44858912, "step": 34605 }, { "epoch": 1.6910805462585201, "grad_norm": 0.4750725030899048, "learning_rate": 2.2076517180048993e-06, "loss": 0.092, "num_input_tokens_seen": 44865248, "step": 34610 }, { "epoch": 1.6913248479222145, "grad_norm": 0.23008006811141968, "learning_rate": 2.2036197685251834e-06, "loss": 0.074, "num_input_tokens_seen": 44871616, "step": 34615 }, { "epoch": 1.6915691495859086, "grad_norm": 0.21487882733345032, "learning_rate": 2.199591334533771e-06, "loss": 0.0765, "num_input_tokens_seen": 44878176, "step": 34620 }, { "epoch": 1.691813451249603, "grad_norm": 0.341692715883255, "learning_rate": 2.1955664166519036e-06, "loss": 0.0835, "num_input_tokens_seen": 44884320, "step": 34625 }, { "epoch": 1.6920577529132972, "grad_norm": 0.22633971273899078, "learning_rate": 2.1915450155002793e-06, "loss": 0.0811, "num_input_tokens_seen": 44890464, "step": 34630 }, { "epoch": 1.6923020545769916, "grad_norm": 0.3549036979675293, "learning_rate": 2.187527131699038e-06, "loss": 0.1093, "num_input_tokens_seen": 44897216, "step": 34635 }, { "epoch": 1.692546356240686, "grad_norm": 0.4634723365306854, "learning_rate": 2.18351276586779e-06, "loss": 0.0861, "num_input_tokens_seen": 44903744, "step": 34640 }, { "epoch": 1.6927906579043803, "grad_norm": 0.17772157490253448, "learning_rate": 2.1795019186256092e-06, "loss": 0.0952, "num_input_tokens_seen": 44910272, "step": 34645 }, { "epoch": 1.6930349595680747, "grad_norm": 0.26848268508911133, "learning_rate": 2.1754945905910094e-06, "loss": 0.0869, "num_input_tokens_seen": 44916224, "step": 34650 }, { "epoch": 1.693279261231769, "grad_norm": 0.631507158279419, "learning_rate": 2.171490782381977e-06, "loss": 0.0677, "num_input_tokens_seen": 44922752, "step": 34655 }, { "epoch": 1.6935235628954635, "grad_norm": 0.1753087192773819, "learning_rate": 2.1674904946159425e-06, "loss": 0.0854, "num_input_tokens_seen": 44929568, "step": 34660 }, { "epoch": 1.6937678645591576, "grad_norm": 0.7535102367401123, "learning_rate": 2.16349372790981e-06, "loss": 0.092, "num_input_tokens_seen": 44935776, "step": 34665 }, { "epoch": 1.694012166222852, "grad_norm": 0.24757328629493713, "learning_rate": 2.159500482879928e-06, "loss": 0.0825, "num_input_tokens_seen": 44941632, "step": 34670 }, { "epoch": 1.6942564678865462, "grad_norm": 0.1911851316690445, "learning_rate": 2.155510760142096e-06, "loss": 0.0835, "num_input_tokens_seen": 44947904, "step": 34675 }, { "epoch": 1.6945007695502405, "grad_norm": 0.21171121299266815, "learning_rate": 2.151524560311588e-06, "loss": 0.084, "num_input_tokens_seen": 44953984, "step": 34680 }, { "epoch": 1.694745071213935, "grad_norm": 0.2036118060350418, "learning_rate": 2.147541884003129e-06, "loss": 0.0652, "num_input_tokens_seen": 44960640, "step": 34685 }, { "epoch": 1.6949893728776293, "grad_norm": 0.6274338960647583, "learning_rate": 2.1435627318308895e-06, "loss": 0.0739, "num_input_tokens_seen": 44967296, "step": 34690 }, { "epoch": 1.6952336745413237, "grad_norm": 0.16450999677181244, "learning_rate": 2.139587104408511e-06, "loss": 0.0757, "num_input_tokens_seen": 44973728, "step": 34695 }, { "epoch": 1.695477976205018, "grad_norm": 0.1914381980895996, "learning_rate": 2.1356150023490783e-06, "loss": 0.0716, "num_input_tokens_seen": 44980160, "step": 34700 }, { "epoch": 1.6957222778687124, "grad_norm": 0.30316486954689026, "learning_rate": 2.1316464262651464e-06, "loss": 0.0782, "num_input_tokens_seen": 44986560, "step": 34705 }, { "epoch": 1.6959665795324066, "grad_norm": 0.19430798292160034, "learning_rate": 2.1276813767687224e-06, "loss": 0.0823, "num_input_tokens_seen": 44993152, "step": 34710 }, { "epoch": 1.696210881196101, "grad_norm": 0.34753748774528503, "learning_rate": 2.123719854471254e-06, "loss": 0.082, "num_input_tokens_seen": 44999328, "step": 34715 }, { "epoch": 1.6964551828597951, "grad_norm": 0.1863141655921936, "learning_rate": 2.119761859983668e-06, "loss": 0.0707, "num_input_tokens_seen": 45005952, "step": 34720 }, { "epoch": 1.6966994845234895, "grad_norm": 0.9216874837875366, "learning_rate": 2.1158073939163386e-06, "loss": 0.0863, "num_input_tokens_seen": 45012480, "step": 34725 }, { "epoch": 1.696943786187184, "grad_norm": 0.1963362991809845, "learning_rate": 2.111856456879088e-06, "loss": 0.0716, "num_input_tokens_seen": 45019104, "step": 34730 }, { "epoch": 1.6971880878508783, "grad_norm": 0.7363724112510681, "learning_rate": 2.1079090494811993e-06, "loss": 0.0763, "num_input_tokens_seen": 45025952, "step": 34735 }, { "epoch": 1.6974323895145726, "grad_norm": 0.36170443892478943, "learning_rate": 2.103965172331418e-06, "loss": 0.083, "num_input_tokens_seen": 45032448, "step": 34740 }, { "epoch": 1.697676691178267, "grad_norm": 0.41328543424606323, "learning_rate": 2.100024826037933e-06, "loss": 0.1001, "num_input_tokens_seen": 45038720, "step": 34745 }, { "epoch": 1.6979209928419614, "grad_norm": 0.21124300360679626, "learning_rate": 2.0960880112084027e-06, "loss": 0.0798, "num_input_tokens_seen": 45044736, "step": 34750 }, { "epoch": 1.6981652945056556, "grad_norm": 0.19953741133213043, "learning_rate": 2.092154728449927e-06, "loss": 0.093, "num_input_tokens_seen": 45051008, "step": 34755 }, { "epoch": 1.69840959616935, "grad_norm": 0.33366814255714417, "learning_rate": 2.0882249783690687e-06, "loss": 0.0953, "num_input_tokens_seen": 45057728, "step": 34760 }, { "epoch": 1.698653897833044, "grad_norm": 0.31733500957489014, "learning_rate": 2.084298761571851e-06, "loss": 0.0859, "num_input_tokens_seen": 45064160, "step": 34765 }, { "epoch": 1.6988981994967385, "grad_norm": 0.27143868803977966, "learning_rate": 2.080376078663737e-06, "loss": 0.0813, "num_input_tokens_seen": 45070592, "step": 34770 }, { "epoch": 1.6991425011604329, "grad_norm": 0.24214176833629608, "learning_rate": 2.0764569302496593e-06, "loss": 0.0841, "num_input_tokens_seen": 45076864, "step": 34775 }, { "epoch": 1.6993868028241272, "grad_norm": 0.13682390749454498, "learning_rate": 2.0725413169339957e-06, "loss": 0.0649, "num_input_tokens_seen": 45083232, "step": 34780 }, { "epoch": 1.6996311044878216, "grad_norm": 0.6480834484100342, "learning_rate": 2.068629239320588e-06, "loss": 0.0949, "num_input_tokens_seen": 45089760, "step": 34785 }, { "epoch": 1.699875406151516, "grad_norm": 0.23295700550079346, "learning_rate": 2.064720698012726e-06, "loss": 0.1015, "num_input_tokens_seen": 45095744, "step": 34790 }, { "epoch": 1.7001197078152104, "grad_norm": 0.3375244438648224, "learning_rate": 2.0608156936131522e-06, "loss": 0.0746, "num_input_tokens_seen": 45102240, "step": 34795 }, { "epoch": 1.7003640094789045, "grad_norm": 0.2843177318572998, "learning_rate": 2.056914226724074e-06, "loss": 0.0571, "num_input_tokens_seen": 45109056, "step": 34800 }, { "epoch": 1.7003640094789045, "eval_loss": 0.08752372115850449, "eval_runtime": 375.0145, "eval_samples_per_second": 97.023, "eval_steps_per_second": 24.258, "num_input_tokens_seen": 45109056, "step": 34800 }, { "epoch": 1.700608311142599, "grad_norm": 0.40096670389175415, "learning_rate": 2.0530162979471385e-06, "loss": 0.0693, "num_input_tokens_seen": 45115680, "step": 34805 }, { "epoch": 1.700852612806293, "grad_norm": 0.5172724723815918, "learning_rate": 2.0491219078834667e-06, "loss": 0.0994, "num_input_tokens_seen": 45122016, "step": 34810 }, { "epoch": 1.7010969144699875, "grad_norm": 0.3338644206523895, "learning_rate": 2.045231057133612e-06, "loss": 0.0776, "num_input_tokens_seen": 45128512, "step": 34815 }, { "epoch": 1.7013412161336818, "grad_norm": 0.5046899914741516, "learning_rate": 2.0413437462975944e-06, "loss": 0.1039, "num_input_tokens_seen": 45134720, "step": 34820 }, { "epoch": 1.7015855177973762, "grad_norm": 0.22918471693992615, "learning_rate": 2.0374599759748843e-06, "loss": 0.0793, "num_input_tokens_seen": 45141120, "step": 34825 }, { "epoch": 1.7018298194610706, "grad_norm": 0.09887829422950745, "learning_rate": 2.033579746764419e-06, "loss": 0.07, "num_input_tokens_seen": 45147424, "step": 34830 }, { "epoch": 1.702074121124765, "grad_norm": 0.7153754234313965, "learning_rate": 2.029703059264565e-06, "loss": 0.0919, "num_input_tokens_seen": 45154432, "step": 34835 }, { "epoch": 1.7023184227884594, "grad_norm": 0.1651044487953186, "learning_rate": 2.02582991407316e-06, "loss": 0.0761, "num_input_tokens_seen": 45160800, "step": 34840 }, { "epoch": 1.7025627244521535, "grad_norm": 0.5890371203422546, "learning_rate": 2.0219603117874992e-06, "loss": 0.0864, "num_input_tokens_seen": 45167744, "step": 34845 }, { "epoch": 1.7028070261158479, "grad_norm": 0.3623405992984772, "learning_rate": 2.0180942530043156e-06, "loss": 0.0866, "num_input_tokens_seen": 45174656, "step": 34850 }, { "epoch": 1.703051327779542, "grad_norm": 0.5901879072189331, "learning_rate": 2.0142317383198107e-06, "loss": 0.0857, "num_input_tokens_seen": 45181184, "step": 34855 }, { "epoch": 1.7032956294432364, "grad_norm": 0.2451443374156952, "learning_rate": 2.0103727683296243e-06, "loss": 0.0831, "num_input_tokens_seen": 45187936, "step": 34860 }, { "epoch": 1.7035399311069308, "grad_norm": 0.13182838261127472, "learning_rate": 2.0065173436288636e-06, "loss": 0.0723, "num_input_tokens_seen": 45194560, "step": 34865 }, { "epoch": 1.7037842327706252, "grad_norm": 0.17784257233142853, "learning_rate": 2.002665464812087e-06, "loss": 0.0667, "num_input_tokens_seen": 45200800, "step": 34870 }, { "epoch": 1.7040285344343196, "grad_norm": 0.17358069121837616, "learning_rate": 1.998817132473291e-06, "loss": 0.0868, "num_input_tokens_seen": 45207296, "step": 34875 }, { "epoch": 1.704272836098014, "grad_norm": 0.9680712223052979, "learning_rate": 1.9949723472059507e-06, "loss": 0.1028, "num_input_tokens_seen": 45213632, "step": 34880 }, { "epoch": 1.7045171377617083, "grad_norm": 0.5821139812469482, "learning_rate": 1.9911311096029726e-06, "loss": 0.0706, "num_input_tokens_seen": 45220096, "step": 34885 }, { "epoch": 1.7047614394254025, "grad_norm": 0.20753687620162964, "learning_rate": 1.9872934202567224e-06, "loss": 0.0776, "num_input_tokens_seen": 45226560, "step": 34890 }, { "epoch": 1.7050057410890969, "grad_norm": 0.5488436818122864, "learning_rate": 1.9834592797590257e-06, "loss": 0.0852, "num_input_tokens_seen": 45232736, "step": 34895 }, { "epoch": 1.705250042752791, "grad_norm": 0.39325809478759766, "learning_rate": 1.979628688701149e-06, "loss": 0.089, "num_input_tokens_seen": 45239424, "step": 34900 }, { "epoch": 1.7054943444164854, "grad_norm": 0.19592007994651794, "learning_rate": 1.9758016476738193e-06, "loss": 0.0915, "num_input_tokens_seen": 45245504, "step": 34905 }, { "epoch": 1.7057386460801798, "grad_norm": 0.14560174942016602, "learning_rate": 1.971978157267221e-06, "loss": 0.0776, "num_input_tokens_seen": 45252256, "step": 34910 }, { "epoch": 1.7059829477438742, "grad_norm": 0.29751887917518616, "learning_rate": 1.968158218070973e-06, "loss": 0.0889, "num_input_tokens_seen": 45258208, "step": 34915 }, { "epoch": 1.7062272494075685, "grad_norm": 0.16635039448738098, "learning_rate": 1.9643418306741682e-06, "loss": 0.0955, "num_input_tokens_seen": 45264768, "step": 34920 }, { "epoch": 1.706471551071263, "grad_norm": 0.11783694475889206, "learning_rate": 1.9605289956653337e-06, "loss": 0.0621, "num_input_tokens_seen": 45271520, "step": 34925 }, { "epoch": 1.706715852734957, "grad_norm": 0.23237814009189606, "learning_rate": 1.9567197136324626e-06, "loss": 0.106, "num_input_tokens_seen": 45277664, "step": 34930 }, { "epoch": 1.7069601543986515, "grad_norm": 0.22422565519809723, "learning_rate": 1.9529139851629935e-06, "loss": 0.0913, "num_input_tokens_seen": 45284192, "step": 34935 }, { "epoch": 1.7072044560623458, "grad_norm": 0.27490684390068054, "learning_rate": 1.949111810843812e-06, "loss": 0.0948, "num_input_tokens_seen": 45290528, "step": 34940 }, { "epoch": 1.70744875772604, "grad_norm": 0.2911582589149475, "learning_rate": 1.9453131912612694e-06, "loss": 0.1037, "num_input_tokens_seen": 45297024, "step": 34945 }, { "epoch": 1.7076930593897344, "grad_norm": 0.2451341301202774, "learning_rate": 1.941518127001149e-06, "loss": 0.0894, "num_input_tokens_seen": 45303328, "step": 34950 }, { "epoch": 1.7079373610534287, "grad_norm": 0.1750946342945099, "learning_rate": 1.9377266186487107e-06, "loss": 0.0944, "num_input_tokens_seen": 45309696, "step": 34955 }, { "epoch": 1.7081816627171231, "grad_norm": 0.22889983654022217, "learning_rate": 1.9339386667886483e-06, "loss": 0.1002, "num_input_tokens_seen": 45316224, "step": 34960 }, { "epoch": 1.7084259643808175, "grad_norm": 0.4017528295516968, "learning_rate": 1.9301542720051024e-06, "loss": 0.0684, "num_input_tokens_seen": 45322368, "step": 34965 }, { "epoch": 1.7086702660445119, "grad_norm": 0.36599329113960266, "learning_rate": 1.926373434881684e-06, "loss": 0.0878, "num_input_tokens_seen": 45328704, "step": 34970 }, { "epoch": 1.708914567708206, "grad_norm": 0.2960280179977417, "learning_rate": 1.9225961560014468e-06, "loss": 0.0856, "num_input_tokens_seen": 45335328, "step": 34975 }, { "epoch": 1.7091588693719004, "grad_norm": 0.16906805336475372, "learning_rate": 1.918822435946885e-06, "loss": 0.0821, "num_input_tokens_seen": 45341696, "step": 34980 }, { "epoch": 1.7094031710355948, "grad_norm": 0.33571794629096985, "learning_rate": 1.915052275299961e-06, "loss": 0.0718, "num_input_tokens_seen": 45348224, "step": 34985 }, { "epoch": 1.709647472699289, "grad_norm": 0.20752590894699097, "learning_rate": 1.9112856746420854e-06, "loss": 0.0924, "num_input_tokens_seen": 45354592, "step": 34990 }, { "epoch": 1.7098917743629833, "grad_norm": 0.18515031039714813, "learning_rate": 1.907522634554104e-06, "loss": 0.0985, "num_input_tokens_seen": 45361152, "step": 34995 }, { "epoch": 1.7101360760266777, "grad_norm": 0.3270217180252075, "learning_rate": 1.9037631556163337e-06, "loss": 0.0699, "num_input_tokens_seen": 45367936, "step": 35000 }, { "epoch": 1.7101360760266777, "eval_loss": 0.08734756708145142, "eval_runtime": 374.7547, "eval_samples_per_second": 97.09, "eval_steps_per_second": 24.275, "num_input_tokens_seen": 45367936, "step": 35000 }, { "epoch": 1.710380377690372, "grad_norm": 0.5715668797492981, "learning_rate": 1.9000072384085272e-06, "loss": 0.1088, "num_input_tokens_seen": 45374048, "step": 35005 }, { "epoch": 1.7106246793540665, "grad_norm": 0.1673736721277237, "learning_rate": 1.8962548835098987e-06, "loss": 0.0642, "num_input_tokens_seen": 45380480, "step": 35010 }, { "epoch": 1.7108689810177609, "grad_norm": 0.7644982933998108, "learning_rate": 1.8925060914991077e-06, "loss": 0.0996, "num_input_tokens_seen": 45386912, "step": 35015 }, { "epoch": 1.711113282681455, "grad_norm": 1.0376569032669067, "learning_rate": 1.888760862954264e-06, "loss": 0.0759, "num_input_tokens_seen": 45393344, "step": 35020 }, { "epoch": 1.7113575843451494, "grad_norm": 0.48963019251823425, "learning_rate": 1.8850191984529309e-06, "loss": 0.1158, "num_input_tokens_seen": 45399936, "step": 35025 }, { "epoch": 1.7116018860088436, "grad_norm": 0.37986892461776733, "learning_rate": 1.8812810985721186e-06, "loss": 0.0495, "num_input_tokens_seen": 45406368, "step": 35030 }, { "epoch": 1.711846187672538, "grad_norm": 0.14612695574760437, "learning_rate": 1.8775465638882856e-06, "loss": 0.0867, "num_input_tokens_seen": 45412608, "step": 35035 }, { "epoch": 1.7120904893362323, "grad_norm": 0.3152582347393036, "learning_rate": 1.8738155949773517e-06, "loss": 0.0912, "num_input_tokens_seen": 45419712, "step": 35040 }, { "epoch": 1.7123347909999267, "grad_norm": 0.37762925028800964, "learning_rate": 1.8700881924146707e-06, "loss": 0.0843, "num_input_tokens_seen": 45425952, "step": 35045 }, { "epoch": 1.712579092663621, "grad_norm": 0.5829451680183411, "learning_rate": 1.8663643567750577e-06, "loss": 0.0884, "num_input_tokens_seen": 45432416, "step": 35050 }, { "epoch": 1.7128233943273155, "grad_norm": 0.6323891282081604, "learning_rate": 1.8626440886327813e-06, "loss": 0.0833, "num_input_tokens_seen": 45439520, "step": 35055 }, { "epoch": 1.7130676959910098, "grad_norm": 0.2651180326938629, "learning_rate": 1.8589273885615432e-06, "loss": 0.1037, "num_input_tokens_seen": 45446112, "step": 35060 }, { "epoch": 1.713311997654704, "grad_norm": 0.2141294777393341, "learning_rate": 1.8552142571345133e-06, "loss": 0.1013, "num_input_tokens_seen": 45452736, "step": 35065 }, { "epoch": 1.7135562993183984, "grad_norm": 0.258066326379776, "learning_rate": 1.8515046949243025e-06, "loss": 0.0838, "num_input_tokens_seen": 45459712, "step": 35070 }, { "epoch": 1.7138006009820925, "grad_norm": 0.6357113122940063, "learning_rate": 1.8477987025029674e-06, "loss": 0.0847, "num_input_tokens_seen": 45466496, "step": 35075 }, { "epoch": 1.714044902645787, "grad_norm": 0.33255141973495483, "learning_rate": 1.8440962804420232e-06, "loss": 0.0819, "num_input_tokens_seen": 45472768, "step": 35080 }, { "epoch": 1.7142892043094813, "grad_norm": 1.0177454948425293, "learning_rate": 1.8403974293124265e-06, "loss": 0.0979, "num_input_tokens_seen": 45478912, "step": 35085 }, { "epoch": 1.7145335059731757, "grad_norm": 0.5236492156982422, "learning_rate": 1.8367021496845854e-06, "loss": 0.0763, "num_input_tokens_seen": 45485568, "step": 35090 }, { "epoch": 1.71477780763687, "grad_norm": 0.34966930747032166, "learning_rate": 1.8330104421283662e-06, "loss": 0.0903, "num_input_tokens_seen": 45491968, "step": 35095 }, { "epoch": 1.7150221093005644, "grad_norm": 0.38863492012023926, "learning_rate": 1.8293223072130717e-06, "loss": 0.0607, "num_input_tokens_seen": 45498240, "step": 35100 }, { "epoch": 1.7152664109642588, "grad_norm": 0.376001238822937, "learning_rate": 1.8256377455074525e-06, "loss": 0.0938, "num_input_tokens_seen": 45504768, "step": 35105 }, { "epoch": 1.715510712627953, "grad_norm": 0.4317205846309662, "learning_rate": 1.8219567575797263e-06, "loss": 0.0993, "num_input_tokens_seen": 45511008, "step": 35110 }, { "epoch": 1.7157550142916473, "grad_norm": 0.16503365337848663, "learning_rate": 1.8182793439975365e-06, "loss": 0.0936, "num_input_tokens_seen": 45517696, "step": 35115 }, { "epoch": 1.7159993159553415, "grad_norm": 0.5422804355621338, "learning_rate": 1.8146055053279958e-06, "loss": 0.064, "num_input_tokens_seen": 45524224, "step": 35120 }, { "epoch": 1.7162436176190359, "grad_norm": 0.15937013924121857, "learning_rate": 1.8109352421376486e-06, "loss": 0.0887, "num_input_tokens_seen": 45530944, "step": 35125 }, { "epoch": 1.7164879192827303, "grad_norm": 0.18522858619689941, "learning_rate": 1.8072685549924972e-06, "loss": 0.0822, "num_input_tokens_seen": 45537600, "step": 35130 }, { "epoch": 1.7167322209464246, "grad_norm": 0.34801748394966125, "learning_rate": 1.8036054444579982e-06, "loss": 0.091, "num_input_tokens_seen": 45544064, "step": 35135 }, { "epoch": 1.716976522610119, "grad_norm": 0.2631693482398987, "learning_rate": 1.7999459110990407e-06, "loss": 0.1058, "num_input_tokens_seen": 45550624, "step": 35140 }, { "epoch": 1.7172208242738134, "grad_norm": 0.6637195348739624, "learning_rate": 1.7962899554799712e-06, "loss": 0.0786, "num_input_tokens_seen": 45557280, "step": 35145 }, { "epoch": 1.7174651259375078, "grad_norm": 0.46885576844215393, "learning_rate": 1.7926375781645937e-06, "loss": 0.0742, "num_input_tokens_seen": 45563136, "step": 35150 }, { "epoch": 1.717709427601202, "grad_norm": 0.3129327893257141, "learning_rate": 1.7889887797161359e-06, "loss": 0.1041, "num_input_tokens_seen": 45569440, "step": 35155 }, { "epoch": 1.7179537292648963, "grad_norm": 0.16416195034980774, "learning_rate": 1.7853435606973028e-06, "loss": 0.0847, "num_input_tokens_seen": 45575552, "step": 35160 }, { "epoch": 1.7181980309285905, "grad_norm": 0.39110538363456726, "learning_rate": 1.781701921670223e-06, "loss": 0.0875, "num_input_tokens_seen": 45581760, "step": 35165 }, { "epoch": 1.7184423325922848, "grad_norm": 0.2434716671705246, "learning_rate": 1.7780638631964886e-06, "loss": 0.1113, "num_input_tokens_seen": 45588128, "step": 35170 }, { "epoch": 1.7186866342559792, "grad_norm": 0.37004104256629944, "learning_rate": 1.7744293858371314e-06, "loss": 0.0914, "num_input_tokens_seen": 45594752, "step": 35175 }, { "epoch": 1.7189309359196736, "grad_norm": 0.17253364622592926, "learning_rate": 1.770798490152631e-06, "loss": 0.0881, "num_input_tokens_seen": 45601152, "step": 35180 }, { "epoch": 1.719175237583368, "grad_norm": 0.3470449149608612, "learning_rate": 1.767171176702917e-06, "loss": 0.1013, "num_input_tokens_seen": 45607616, "step": 35185 }, { "epoch": 1.7194195392470624, "grad_norm": 0.20396697521209717, "learning_rate": 1.7635474460473755e-06, "loss": 0.0743, "num_input_tokens_seen": 45614240, "step": 35190 }, { "epoch": 1.7196638409107567, "grad_norm": 0.19950813055038452, "learning_rate": 1.7599272987448206e-06, "loss": 0.09, "num_input_tokens_seen": 45620576, "step": 35195 }, { "epoch": 1.719908142574451, "grad_norm": 0.18114496767520905, "learning_rate": 1.7563107353535362e-06, "loss": 0.0747, "num_input_tokens_seen": 45627104, "step": 35200 }, { "epoch": 1.719908142574451, "eval_loss": 0.08720598369836807, "eval_runtime": 374.7378, "eval_samples_per_second": 97.095, "eval_steps_per_second": 24.276, "num_input_tokens_seen": 45627104, "step": 35200 }, { "epoch": 1.7201524442381453, "grad_norm": 0.2184046357870102, "learning_rate": 1.7526977564312263e-06, "loss": 0.0763, "num_input_tokens_seen": 45633408, "step": 35205 }, { "epoch": 1.7203967459018394, "grad_norm": 0.21301524341106415, "learning_rate": 1.7490883625350701e-06, "loss": 0.0729, "num_input_tokens_seen": 45639552, "step": 35210 }, { "epoch": 1.7206410475655338, "grad_norm": 0.7889171242713928, "learning_rate": 1.7454825542216807e-06, "loss": 0.0831, "num_input_tokens_seen": 45646432, "step": 35215 }, { "epoch": 1.7208853492292282, "grad_norm": 0.7091575264930725, "learning_rate": 1.7418803320471105e-06, "loss": 0.087, "num_input_tokens_seen": 45652928, "step": 35220 }, { "epoch": 1.7211296508929226, "grad_norm": 0.34562739729881287, "learning_rate": 1.7382816965668737e-06, "loss": 0.0752, "num_input_tokens_seen": 45659072, "step": 35225 }, { "epoch": 1.721373952556617, "grad_norm": 0.6815593838691711, "learning_rate": 1.7346866483359285e-06, "loss": 0.0877, "num_input_tokens_seen": 45665408, "step": 35230 }, { "epoch": 1.7216182542203113, "grad_norm": 0.570710301399231, "learning_rate": 1.7310951879086657e-06, "loss": 0.0652, "num_input_tokens_seen": 45671584, "step": 35235 }, { "epoch": 1.7218625558840057, "grad_norm": 0.1710273176431656, "learning_rate": 1.7275073158389471e-06, "loss": 0.0772, "num_input_tokens_seen": 45677920, "step": 35240 }, { "epoch": 1.7221068575476999, "grad_norm": 0.14154188334941864, "learning_rate": 1.723923032680061e-06, "loss": 0.0791, "num_input_tokens_seen": 45684512, "step": 35245 }, { "epoch": 1.7223511592113943, "grad_norm": 0.2341194599866867, "learning_rate": 1.7203423389847428e-06, "loss": 0.0678, "num_input_tokens_seen": 45690816, "step": 35250 }, { "epoch": 1.7225954608750884, "grad_norm": 1.4585670232772827, "learning_rate": 1.7167652353051928e-06, "loss": 0.1178, "num_input_tokens_seen": 45697504, "step": 35255 }, { "epoch": 1.7228397625387828, "grad_norm": 0.45262473821640015, "learning_rate": 1.7131917221930333e-06, "loss": 0.1002, "num_input_tokens_seen": 45703744, "step": 35260 }, { "epoch": 1.7230840642024772, "grad_norm": 0.9961969256401062, "learning_rate": 1.7096218001993513e-06, "loss": 0.0989, "num_input_tokens_seen": 45711008, "step": 35265 }, { "epoch": 1.7233283658661716, "grad_norm": 0.3985184133052826, "learning_rate": 1.706055469874676e-06, "loss": 0.0795, "num_input_tokens_seen": 45717760, "step": 35270 }, { "epoch": 1.723572667529866, "grad_norm": 0.21416954696178436, "learning_rate": 1.702492731768976e-06, "loss": 0.0998, "num_input_tokens_seen": 45724032, "step": 35275 }, { "epoch": 1.7238169691935603, "grad_norm": 0.16411375999450684, "learning_rate": 1.6989335864316724e-06, "loss": 0.0668, "num_input_tokens_seen": 45730144, "step": 35280 }, { "epoch": 1.7240612708572547, "grad_norm": 0.4184115529060364, "learning_rate": 1.6953780344116265e-06, "loss": 0.0982, "num_input_tokens_seen": 45736128, "step": 35285 }, { "epoch": 1.7243055725209488, "grad_norm": 0.17573417723178864, "learning_rate": 1.6918260762571497e-06, "loss": 0.1129, "num_input_tokens_seen": 45742368, "step": 35290 }, { "epoch": 1.7245498741846432, "grad_norm": 0.43464988470077515, "learning_rate": 1.6882777125160093e-06, "loss": 0.1068, "num_input_tokens_seen": 45748896, "step": 35295 }, { "epoch": 1.7247941758483374, "grad_norm": 0.4845811128616333, "learning_rate": 1.6847329437353899e-06, "loss": 0.0899, "num_input_tokens_seen": 45755360, "step": 35300 }, { "epoch": 1.7250384775120318, "grad_norm": 0.8473479747772217, "learning_rate": 1.6811917704619511e-06, "loss": 0.1002, "num_input_tokens_seen": 45762112, "step": 35305 }, { "epoch": 1.7252827791757261, "grad_norm": 0.23423393070697784, "learning_rate": 1.67765419324179e-06, "loss": 0.0704, "num_input_tokens_seen": 45768480, "step": 35310 }, { "epoch": 1.7255270808394205, "grad_norm": 0.407196968793869, "learning_rate": 1.6741202126204364e-06, "loss": 0.0967, "num_input_tokens_seen": 45775264, "step": 35315 }, { "epoch": 1.725771382503115, "grad_norm": 0.7689816355705261, "learning_rate": 1.6705898291428767e-06, "loss": 0.0878, "num_input_tokens_seen": 45781760, "step": 35320 }, { "epoch": 1.7260156841668093, "grad_norm": 0.1846734583377838, "learning_rate": 1.6670630433535395e-06, "loss": 0.0631, "num_input_tokens_seen": 45788256, "step": 35325 }, { "epoch": 1.7262599858305037, "grad_norm": 0.448760062456131, "learning_rate": 1.6635398557962979e-06, "loss": 0.0872, "num_input_tokens_seen": 45794816, "step": 35330 }, { "epoch": 1.7265042874941978, "grad_norm": 0.6896107792854309, "learning_rate": 1.660020267014481e-06, "loss": 0.0889, "num_input_tokens_seen": 45801088, "step": 35335 }, { "epoch": 1.7267485891578922, "grad_norm": 0.3426644206047058, "learning_rate": 1.6565042775508438e-06, "loss": 0.0824, "num_input_tokens_seen": 45807648, "step": 35340 }, { "epoch": 1.7269928908215864, "grad_norm": 0.15705382823944092, "learning_rate": 1.6529918879475997e-06, "loss": 0.091, "num_input_tokens_seen": 45813760, "step": 35345 }, { "epoch": 1.7272371924852807, "grad_norm": 0.701723039150238, "learning_rate": 1.6494830987464043e-06, "loss": 0.0727, "num_input_tokens_seen": 45820608, "step": 35350 }, { "epoch": 1.7274814941489751, "grad_norm": 0.24229595065116882, "learning_rate": 1.6459779104883555e-06, "loss": 0.0952, "num_input_tokens_seen": 45826816, "step": 35355 }, { "epoch": 1.7277257958126695, "grad_norm": 0.35447433590888977, "learning_rate": 1.6424763237140013e-06, "loss": 0.0852, "num_input_tokens_seen": 45833216, "step": 35360 }, { "epoch": 1.7279700974763639, "grad_norm": 0.12086720764636993, "learning_rate": 1.6389783389633207e-06, "loss": 0.0885, "num_input_tokens_seen": 45839712, "step": 35365 }, { "epoch": 1.7282143991400583, "grad_norm": 0.577789306640625, "learning_rate": 1.6354839567757546e-06, "loss": 0.0813, "num_input_tokens_seen": 45846208, "step": 35370 }, { "epoch": 1.7284587008037526, "grad_norm": 0.5451107621192932, "learning_rate": 1.6319931776901831e-06, "loss": 0.1402, "num_input_tokens_seen": 45852512, "step": 35375 }, { "epoch": 1.7287030024674468, "grad_norm": 0.3646405041217804, "learning_rate": 1.6285060022449229e-06, "loss": 0.0807, "num_input_tokens_seen": 45859104, "step": 35380 }, { "epoch": 1.7289473041311412, "grad_norm": 0.2421029508113861, "learning_rate": 1.6250224309777434e-06, "loss": 0.0829, "num_input_tokens_seen": 45865568, "step": 35385 }, { "epoch": 1.7291916057948353, "grad_norm": 0.5823048949241638, "learning_rate": 1.6215424644258515e-06, "loss": 0.1161, "num_input_tokens_seen": 45872224, "step": 35390 }, { "epoch": 1.7294359074585297, "grad_norm": 0.21131667494773865, "learning_rate": 1.6180661031259036e-06, "loss": 0.1144, "num_input_tokens_seen": 45878752, "step": 35395 }, { "epoch": 1.729680209122224, "grad_norm": 0.23111553490161896, "learning_rate": 1.614593347613999e-06, "loss": 0.0753, "num_input_tokens_seen": 45885312, "step": 35400 }, { "epoch": 1.729680209122224, "eval_loss": 0.087428979575634, "eval_runtime": 374.0406, "eval_samples_per_second": 97.276, "eval_steps_per_second": 24.321, "num_input_tokens_seen": 45885312, "step": 35400 }, { "epoch": 1.7299245107859185, "grad_norm": 0.29946210980415344, "learning_rate": 1.6111241984256758e-06, "loss": 0.0656, "num_input_tokens_seen": 45891872, "step": 35405 }, { "epoch": 1.7301688124496128, "grad_norm": 0.36453405022621155, "learning_rate": 1.6076586560959257e-06, "loss": 0.0895, "num_input_tokens_seen": 45898240, "step": 35410 }, { "epoch": 1.7304131141133072, "grad_norm": 0.37099170684814453, "learning_rate": 1.604196721159182e-06, "loss": 0.1059, "num_input_tokens_seen": 45904544, "step": 35415 }, { "epoch": 1.7306574157770016, "grad_norm": 0.23307155072689056, "learning_rate": 1.6007383941493092e-06, "loss": 0.0925, "num_input_tokens_seen": 45910848, "step": 35420 }, { "epoch": 1.7309017174406958, "grad_norm": 0.1730329990386963, "learning_rate": 1.5972836755996285e-06, "loss": 0.0744, "num_input_tokens_seen": 45917600, "step": 35425 }, { "epoch": 1.7311460191043901, "grad_norm": 0.40140411257743835, "learning_rate": 1.5938325660429076e-06, "loss": 0.0832, "num_input_tokens_seen": 45924160, "step": 35430 }, { "epoch": 1.7313903207680843, "grad_norm": 0.23441709578037262, "learning_rate": 1.5903850660113378e-06, "loss": 0.0711, "num_input_tokens_seen": 45930880, "step": 35435 }, { "epoch": 1.7316346224317787, "grad_norm": 0.6592224836349487, "learning_rate": 1.5869411760365826e-06, "loss": 0.0764, "num_input_tokens_seen": 45937280, "step": 35440 }, { "epoch": 1.731878924095473, "grad_norm": 0.6762955188751221, "learning_rate": 1.58350089664972e-06, "loss": 0.0897, "num_input_tokens_seen": 45943872, "step": 35445 }, { "epoch": 1.7321232257591674, "grad_norm": 0.2518135607242584, "learning_rate": 1.5800642283812865e-06, "loss": 0.089, "num_input_tokens_seen": 45950112, "step": 35450 }, { "epoch": 1.7323675274228618, "grad_norm": 0.2806938886642456, "learning_rate": 1.5766311717612698e-06, "loss": 0.0573, "num_input_tokens_seen": 45956896, "step": 35455 }, { "epoch": 1.7326118290865562, "grad_norm": 0.3799154460430145, "learning_rate": 1.5732017273190818e-06, "loss": 0.0755, "num_input_tokens_seen": 45963296, "step": 35460 }, { "epoch": 1.7328561307502504, "grad_norm": 0.7378211617469788, "learning_rate": 1.5697758955835806e-06, "loss": 0.1116, "num_input_tokens_seen": 45970016, "step": 35465 }, { "epoch": 1.7331004324139447, "grad_norm": 0.24055133759975433, "learning_rate": 1.566353677083085e-06, "loss": 0.0984, "num_input_tokens_seen": 45976416, "step": 35470 }, { "epoch": 1.7333447340776391, "grad_norm": 0.4012989103794098, "learning_rate": 1.562935072345334e-06, "loss": 0.1003, "num_input_tokens_seen": 45982848, "step": 35475 }, { "epoch": 1.7335890357413333, "grad_norm": 0.1813778430223465, "learning_rate": 1.5595200818975281e-06, "loss": 0.0779, "num_input_tokens_seen": 45989536, "step": 35480 }, { "epoch": 1.7338333374050277, "grad_norm": 0.46499893069267273, "learning_rate": 1.5561087062662905e-06, "loss": 0.0944, "num_input_tokens_seen": 45995968, "step": 35485 }, { "epoch": 1.734077639068722, "grad_norm": 1.1667503118515015, "learning_rate": 1.5527009459777087e-06, "loss": 0.085, "num_input_tokens_seen": 46002208, "step": 35490 }, { "epoch": 1.7343219407324164, "grad_norm": 0.31188252568244934, "learning_rate": 1.5492968015572984e-06, "loss": 0.0859, "num_input_tokens_seen": 46008896, "step": 35495 }, { "epoch": 1.7345662423961108, "grad_norm": 0.26012325286865234, "learning_rate": 1.5458962735300203e-06, "loss": 0.1305, "num_input_tokens_seen": 46015392, "step": 35500 }, { "epoch": 1.7348105440598052, "grad_norm": 0.1821180284023285, "learning_rate": 1.54249936242028e-06, "loss": 0.0623, "num_input_tokens_seen": 46021856, "step": 35505 }, { "epoch": 1.7350548457234993, "grad_norm": 0.6758583188056946, "learning_rate": 1.5391060687519222e-06, "loss": 0.0757, "num_input_tokens_seen": 46028416, "step": 35510 }, { "epoch": 1.7352991473871937, "grad_norm": 0.248247429728508, "learning_rate": 1.5357163930482367e-06, "loss": 0.0669, "num_input_tokens_seen": 46035072, "step": 35515 }, { "epoch": 1.735543449050888, "grad_norm": 0.5130344033241272, "learning_rate": 1.532330335831955e-06, "loss": 0.0998, "num_input_tokens_seen": 46041344, "step": 35520 }, { "epoch": 1.7357877507145822, "grad_norm": 0.20127451419830322, "learning_rate": 1.5289478976252491e-06, "loss": 0.0987, "num_input_tokens_seen": 46047552, "step": 35525 }, { "epoch": 1.7360320523782766, "grad_norm": 0.4312961995601654, "learning_rate": 1.5255690789497345e-06, "loss": 0.0912, "num_input_tokens_seen": 46054432, "step": 35530 }, { "epoch": 1.736276354041971, "grad_norm": 0.15291306376457214, "learning_rate": 1.5221938803264641e-06, "loss": 0.0905, "num_input_tokens_seen": 46061056, "step": 35535 }, { "epoch": 1.7365206557056654, "grad_norm": 0.24852615594863892, "learning_rate": 1.518822302275938e-06, "loss": 0.0896, "num_input_tokens_seen": 46067776, "step": 35540 }, { "epoch": 1.7367649573693598, "grad_norm": 0.1602809876203537, "learning_rate": 1.5154543453180958e-06, "loss": 0.0655, "num_input_tokens_seen": 46074144, "step": 35545 }, { "epoch": 1.7370092590330541, "grad_norm": 0.47390085458755493, "learning_rate": 1.5120900099723167e-06, "loss": 0.0823, "num_input_tokens_seen": 46080480, "step": 35550 }, { "epoch": 1.7372535606967483, "grad_norm": 0.1797630786895752, "learning_rate": 1.5087292967574273e-06, "loss": 0.0965, "num_input_tokens_seen": 46086880, "step": 35555 }, { "epoch": 1.7374978623604427, "grad_norm": 0.17243297398090363, "learning_rate": 1.5053722061916908e-06, "loss": 0.075, "num_input_tokens_seen": 46093408, "step": 35560 }, { "epoch": 1.737742164024137, "grad_norm": 0.4283725619316101, "learning_rate": 1.5020187387928124e-06, "loss": 0.0762, "num_input_tokens_seen": 46099776, "step": 35565 }, { "epoch": 1.7379864656878312, "grad_norm": 0.24897104501724243, "learning_rate": 1.4986688950779343e-06, "loss": 0.0959, "num_input_tokens_seen": 46106240, "step": 35570 }, { "epoch": 1.7382307673515256, "grad_norm": 0.2100917249917984, "learning_rate": 1.495322675563654e-06, "loss": 0.0789, "num_input_tokens_seen": 46112736, "step": 35575 }, { "epoch": 1.73847506901522, "grad_norm": 0.3619621694087982, "learning_rate": 1.4919800807659922e-06, "loss": 0.0692, "num_input_tokens_seen": 46119200, "step": 35580 }, { "epoch": 1.7387193706789144, "grad_norm": 0.9067457914352417, "learning_rate": 1.4886411112004255e-06, "loss": 0.0813, "num_input_tokens_seen": 46125824, "step": 35585 }, { "epoch": 1.7389636723426087, "grad_norm": 0.24529901146888733, "learning_rate": 1.4853057673818588e-06, "loss": 0.1123, "num_input_tokens_seen": 46132384, "step": 35590 }, { "epoch": 1.7392079740063031, "grad_norm": 0.743380606174469, "learning_rate": 1.481974049824647e-06, "loss": 0.0967, "num_input_tokens_seen": 46138880, "step": 35595 }, { "epoch": 1.7394522756699973, "grad_norm": 0.12814243137836456, "learning_rate": 1.4786459590425849e-06, "loss": 0.0692, "num_input_tokens_seen": 46145568, "step": 35600 }, { "epoch": 1.7394522756699973, "eval_loss": 0.08715619146823883, "eval_runtime": 374.6681, "eval_samples_per_second": 97.113, "eval_steps_per_second": 24.28, "num_input_tokens_seen": 46145568, "step": 35600 }, { "epoch": 1.7396965773336917, "grad_norm": 1.364309549331665, "learning_rate": 1.4753214955489036e-06, "loss": 0.1074, "num_input_tokens_seen": 46152448, "step": 35605 }, { "epoch": 1.7399408789973858, "grad_norm": 0.2931547164916992, "learning_rate": 1.4720006598562737e-06, "loss": 0.0991, "num_input_tokens_seen": 46158880, "step": 35610 }, { "epoch": 1.7401851806610802, "grad_norm": 0.325460821390152, "learning_rate": 1.4686834524768185e-06, "loss": 0.0768, "num_input_tokens_seen": 46165664, "step": 35615 }, { "epoch": 1.7404294823247746, "grad_norm": 0.464091420173645, "learning_rate": 1.4653698739220844e-06, "loss": 0.1002, "num_input_tokens_seen": 46172384, "step": 35620 }, { "epoch": 1.740673783988469, "grad_norm": 0.7131773829460144, "learning_rate": 1.4620599247030715e-06, "loss": 0.0679, "num_input_tokens_seen": 46179104, "step": 35625 }, { "epoch": 1.7409180856521633, "grad_norm": 0.3838690221309662, "learning_rate": 1.4587536053302125e-06, "loss": 0.0739, "num_input_tokens_seen": 46185728, "step": 35630 }, { "epoch": 1.7411623873158577, "grad_norm": 0.21859945356845856, "learning_rate": 1.4554509163133862e-06, "loss": 0.1015, "num_input_tokens_seen": 46192192, "step": 35635 }, { "epoch": 1.741406688979552, "grad_norm": 0.4772632122039795, "learning_rate": 1.4521518581619098e-06, "loss": 0.0758, "num_input_tokens_seen": 46198656, "step": 35640 }, { "epoch": 1.7416509906432462, "grad_norm": 0.19103483855724335, "learning_rate": 1.4488564313845348e-06, "loss": 0.0914, "num_input_tokens_seen": 46204928, "step": 35645 }, { "epoch": 1.7418952923069406, "grad_norm": 0.29151055216789246, "learning_rate": 1.4455646364894603e-06, "loss": 0.0989, "num_input_tokens_seen": 46211136, "step": 35650 }, { "epoch": 1.7421395939706348, "grad_norm": 0.2004164308309555, "learning_rate": 1.4422764739843247e-06, "loss": 0.0809, "num_input_tokens_seen": 46217728, "step": 35655 }, { "epoch": 1.7423838956343292, "grad_norm": 0.14466975629329681, "learning_rate": 1.4389919443762e-06, "loss": 0.0809, "num_input_tokens_seen": 46224256, "step": 35660 }, { "epoch": 1.7426281972980235, "grad_norm": 0.5441108345985413, "learning_rate": 1.4357110481716063e-06, "loss": 0.068, "num_input_tokens_seen": 46230816, "step": 35665 }, { "epoch": 1.742872498961718, "grad_norm": 0.8874549865722656, "learning_rate": 1.4324337858764941e-06, "loss": 0.0931, "num_input_tokens_seen": 46237248, "step": 35670 }, { "epoch": 1.7431168006254123, "grad_norm": 0.271901935338974, "learning_rate": 1.4291601579962622e-06, "loss": 0.0661, "num_input_tokens_seen": 46244352, "step": 35675 }, { "epoch": 1.7433611022891067, "grad_norm": 0.48221567273139954, "learning_rate": 1.42589016503574e-06, "loss": 0.0816, "num_input_tokens_seen": 46251040, "step": 35680 }, { "epoch": 1.743605403952801, "grad_norm": 0.40014901757240295, "learning_rate": 1.4226238074992099e-06, "loss": 0.0984, "num_input_tokens_seen": 46257920, "step": 35685 }, { "epoch": 1.7438497056164952, "grad_norm": 0.21799778938293457, "learning_rate": 1.4193610858903778e-06, "loss": 0.0694, "num_input_tokens_seen": 46264512, "step": 35690 }, { "epoch": 1.7440940072801896, "grad_norm": 0.40514039993286133, "learning_rate": 1.416102000712402e-06, "loss": 0.0774, "num_input_tokens_seen": 46270976, "step": 35695 }, { "epoch": 1.7443383089438838, "grad_norm": 1.2423968315124512, "learning_rate": 1.4128465524678668e-06, "loss": 0.1252, "num_input_tokens_seen": 46277696, "step": 35700 }, { "epoch": 1.7445826106075781, "grad_norm": 0.12480627745389938, "learning_rate": 1.4095947416588124e-06, "loss": 0.061, "num_input_tokens_seen": 46283968, "step": 35705 }, { "epoch": 1.7448269122712725, "grad_norm": 0.3371552526950836, "learning_rate": 1.4063465687866983e-06, "loss": 0.1065, "num_input_tokens_seen": 46290624, "step": 35710 }, { "epoch": 1.745071213934967, "grad_norm": 0.15254628658294678, "learning_rate": 1.4031020343524438e-06, "loss": 0.0789, "num_input_tokens_seen": 46297472, "step": 35715 }, { "epoch": 1.7453155155986613, "grad_norm": 0.2254478633403778, "learning_rate": 1.3998611388563926e-06, "loss": 0.1042, "num_input_tokens_seen": 46303936, "step": 35720 }, { "epoch": 1.7455598172623557, "grad_norm": 0.4227355122566223, "learning_rate": 1.3966238827983314e-06, "loss": 0.0626, "num_input_tokens_seen": 46310144, "step": 35725 }, { "epoch": 1.74580411892605, "grad_norm": 0.2771748900413513, "learning_rate": 1.393390266677483e-06, "loss": 0.0921, "num_input_tokens_seen": 46317216, "step": 35730 }, { "epoch": 1.7460484205897442, "grad_norm": 0.41456589102745056, "learning_rate": 1.3901602909925204e-06, "loss": 0.0815, "num_input_tokens_seen": 46323712, "step": 35735 }, { "epoch": 1.7462927222534386, "grad_norm": 0.21913424134254456, "learning_rate": 1.3869339562415373e-06, "loss": 0.0666, "num_input_tokens_seen": 46330592, "step": 35740 }, { "epoch": 1.7465370239171327, "grad_norm": 0.8078837394714355, "learning_rate": 1.38371126292208e-06, "loss": 0.0805, "num_input_tokens_seen": 46336832, "step": 35745 }, { "epoch": 1.746781325580827, "grad_norm": 0.23916411399841309, "learning_rate": 1.3804922115311286e-06, "loss": 0.0559, "num_input_tokens_seen": 46343424, "step": 35750 }, { "epoch": 1.7470256272445215, "grad_norm": 0.43657171726226807, "learning_rate": 1.3772768025650945e-06, "loss": 0.078, "num_input_tokens_seen": 46349760, "step": 35755 }, { "epoch": 1.7472699289082159, "grad_norm": 0.1470821648836136, "learning_rate": 1.3740650365198448e-06, "loss": 0.083, "num_input_tokens_seen": 46356704, "step": 35760 }, { "epoch": 1.7475142305719102, "grad_norm": 0.15847159922122955, "learning_rate": 1.3708569138906612e-06, "loss": 0.0911, "num_input_tokens_seen": 46362912, "step": 35765 }, { "epoch": 1.7477585322356046, "grad_norm": 0.1514485776424408, "learning_rate": 1.367652435172287e-06, "loss": 0.0662, "num_input_tokens_seen": 46369376, "step": 35770 }, { "epoch": 1.748002833899299, "grad_norm": 0.30484646558761597, "learning_rate": 1.364451600858893e-06, "loss": 0.0615, "num_input_tokens_seen": 46375872, "step": 35775 }, { "epoch": 1.7482471355629932, "grad_norm": 1.0062661170959473, "learning_rate": 1.3612544114440823e-06, "loss": 0.0818, "num_input_tokens_seen": 46382688, "step": 35780 }, { "epoch": 1.7484914372266875, "grad_norm": 0.17390255630016327, "learning_rate": 1.3580608674209072e-06, "loss": 0.0818, "num_input_tokens_seen": 46389344, "step": 35785 }, { "epoch": 1.7487357388903817, "grad_norm": 0.6328800320625305, "learning_rate": 1.3548709692818434e-06, "loss": 0.0733, "num_input_tokens_seen": 46396480, "step": 35790 }, { "epoch": 1.748980040554076, "grad_norm": 0.31685593724250793, "learning_rate": 1.3516847175188223e-06, "loss": 0.0712, "num_input_tokens_seen": 46403040, "step": 35795 }, { "epoch": 1.7492243422177705, "grad_norm": 0.19415244460105896, "learning_rate": 1.348502112623204e-06, "loss": 0.0823, "num_input_tokens_seen": 46409504, "step": 35800 }, { "epoch": 1.7492243422177705, "eval_loss": 0.08722209185361862, "eval_runtime": 375.5757, "eval_samples_per_second": 96.878, "eval_steps_per_second": 24.221, "num_input_tokens_seen": 46409504, "step": 35800 }, { "epoch": 1.7494686438814648, "grad_norm": 0.5752057433128357, "learning_rate": 1.3453231550857787e-06, "loss": 0.0939, "num_input_tokens_seen": 46416160, "step": 35805 }, { "epoch": 1.7497129455451592, "grad_norm": 0.3742920160293579, "learning_rate": 1.3421478453967878e-06, "loss": 0.0747, "num_input_tokens_seen": 46422656, "step": 35810 }, { "epoch": 1.7499572472088536, "grad_norm": 0.24357794225215912, "learning_rate": 1.3389761840459065e-06, "loss": 0.0819, "num_input_tokens_seen": 46429152, "step": 35815 }, { "epoch": 1.750201548872548, "grad_norm": 0.20453977584838867, "learning_rate": 1.3358081715222376e-06, "loss": 0.0971, "num_input_tokens_seen": 46435136, "step": 35820 }, { "epoch": 1.7504458505362421, "grad_norm": 0.22687019407749176, "learning_rate": 1.3326438083143295e-06, "loss": 0.0964, "num_input_tokens_seen": 46441760, "step": 35825 }, { "epoch": 1.7506901521999365, "grad_norm": 0.6103973388671875, "learning_rate": 1.3294830949101723e-06, "loss": 0.085, "num_input_tokens_seen": 46447936, "step": 35830 }, { "epoch": 1.7509344538636307, "grad_norm": 0.5068308711051941, "learning_rate": 1.3263260317971815e-06, "loss": 0.0692, "num_input_tokens_seen": 46454304, "step": 35835 }, { "epoch": 1.751178755527325, "grad_norm": 0.1955385059118271, "learning_rate": 1.3231726194622208e-06, "loss": 0.0876, "num_input_tokens_seen": 46460832, "step": 35840 }, { "epoch": 1.7514230571910194, "grad_norm": 0.6254904866218567, "learning_rate": 1.3200228583915814e-06, "loss": 0.0908, "num_input_tokens_seen": 46467200, "step": 35845 }, { "epoch": 1.7516673588547138, "grad_norm": 0.20767782628536224, "learning_rate": 1.3168767490709971e-06, "loss": 0.0717, "num_input_tokens_seen": 46473824, "step": 35850 }, { "epoch": 1.7519116605184082, "grad_norm": 0.2103624790906906, "learning_rate": 1.3137342919856437e-06, "loss": 0.0769, "num_input_tokens_seen": 46480352, "step": 35855 }, { "epoch": 1.7521559621821026, "grad_norm": 0.22472845017910004, "learning_rate": 1.310595487620117e-06, "loss": 0.0937, "num_input_tokens_seen": 46486656, "step": 35860 }, { "epoch": 1.752400263845797, "grad_norm": 0.20105691254138947, "learning_rate": 1.3074603364584715e-06, "loss": 0.1466, "num_input_tokens_seen": 46492736, "step": 35865 }, { "epoch": 1.752644565509491, "grad_norm": 0.16148297488689423, "learning_rate": 1.3043288389841758e-06, "loss": 0.0835, "num_input_tokens_seen": 46499168, "step": 35870 }, { "epoch": 1.7528888671731855, "grad_norm": 0.14152102172374725, "learning_rate": 1.3012009956801546e-06, "loss": 0.0822, "num_input_tokens_seen": 46505952, "step": 35875 }, { "epoch": 1.7531331688368796, "grad_norm": 0.17054757475852966, "learning_rate": 1.2980768070287586e-06, "loss": 0.0825, "num_input_tokens_seen": 46512640, "step": 35880 }, { "epoch": 1.753377470500574, "grad_norm": 0.21991191804409027, "learning_rate": 1.2949562735117716e-06, "loss": 0.0693, "num_input_tokens_seen": 46519296, "step": 35885 }, { "epoch": 1.7536217721642684, "grad_norm": 0.45051974058151245, "learning_rate": 1.291839395610428e-06, "loss": 0.1044, "num_input_tokens_seen": 46525664, "step": 35890 }, { "epoch": 1.7538660738279628, "grad_norm": 0.1649608016014099, "learning_rate": 1.2887261738053852e-06, "loss": 0.1002, "num_input_tokens_seen": 46531968, "step": 35895 }, { "epoch": 1.7541103754916572, "grad_norm": 0.4344385862350464, "learning_rate": 1.2856166085767396e-06, "loss": 0.068, "num_input_tokens_seen": 46538816, "step": 35900 }, { "epoch": 1.7543546771553515, "grad_norm": 0.6993955373764038, "learning_rate": 1.2825107004040272e-06, "loss": 0.1043, "num_input_tokens_seen": 46545152, "step": 35905 }, { "epoch": 1.754598978819046, "grad_norm": 0.34899601340293884, "learning_rate": 1.2794084497662146e-06, "loss": 0.0875, "num_input_tokens_seen": 46551712, "step": 35910 }, { "epoch": 1.75484328048274, "grad_norm": 0.12493237853050232, "learning_rate": 1.276309857141711e-06, "loss": 0.0579, "num_input_tokens_seen": 46558048, "step": 35915 }, { "epoch": 1.7550875821464345, "grad_norm": 0.29732105135917664, "learning_rate": 1.273214923008359e-06, "loss": 0.0752, "num_input_tokens_seen": 46564704, "step": 35920 }, { "epoch": 1.7553318838101286, "grad_norm": 0.3805377781391144, "learning_rate": 1.2701236478434352e-06, "loss": 0.083, "num_input_tokens_seen": 46571232, "step": 35925 }, { "epoch": 1.755576185473823, "grad_norm": 0.20524990558624268, "learning_rate": 1.2670360321236502e-06, "loss": 0.0842, "num_input_tokens_seen": 46577728, "step": 35930 }, { "epoch": 1.7558204871375174, "grad_norm": 0.17870037257671356, "learning_rate": 1.2639520763251617e-06, "loss": 0.0806, "num_input_tokens_seen": 46584096, "step": 35935 }, { "epoch": 1.7560647888012118, "grad_norm": 0.5977426767349243, "learning_rate": 1.2608717809235448e-06, "loss": 0.0866, "num_input_tokens_seen": 46590272, "step": 35940 }, { "epoch": 1.7563090904649061, "grad_norm": 0.33187976479530334, "learning_rate": 1.2577951463938282e-06, "loss": 0.1227, "num_input_tokens_seen": 46596320, "step": 35945 }, { "epoch": 1.7565533921286005, "grad_norm": 0.24610434472560883, "learning_rate": 1.2547221732104569e-06, "loss": 0.0758, "num_input_tokens_seen": 46603072, "step": 35950 }, { "epoch": 1.756797693792295, "grad_norm": 0.6130965948104858, "learning_rate": 1.25165286184733e-06, "loss": 0.0913, "num_input_tokens_seen": 46610336, "step": 35955 }, { "epoch": 1.757041995455989, "grad_norm": 0.19878269731998444, "learning_rate": 1.248587212777777e-06, "loss": 0.0881, "num_input_tokens_seen": 46616640, "step": 35960 }, { "epoch": 1.7572862971196834, "grad_norm": 0.26671063899993896, "learning_rate": 1.2455252264745532e-06, "loss": 0.088, "num_input_tokens_seen": 46623232, "step": 35965 }, { "epoch": 1.7575305987833776, "grad_norm": 0.2373581975698471, "learning_rate": 1.2424669034098528e-06, "loss": 0.0766, "num_input_tokens_seen": 46630112, "step": 35970 }, { "epoch": 1.757774900447072, "grad_norm": 0.2638692259788513, "learning_rate": 1.2394122440553185e-06, "loss": 0.078, "num_input_tokens_seen": 46636800, "step": 35975 }, { "epoch": 1.7580192021107663, "grad_norm": 0.312722772359848, "learning_rate": 1.2363612488820037e-06, "loss": 0.0899, "num_input_tokens_seen": 46643648, "step": 35980 }, { "epoch": 1.7582635037744607, "grad_norm": 0.31843623518943787, "learning_rate": 1.2333139183604208e-06, "loss": 0.0849, "num_input_tokens_seen": 46649888, "step": 35985 }, { "epoch": 1.758507805438155, "grad_norm": 0.34486886858940125, "learning_rate": 1.2302702529604998e-06, "loss": 0.0769, "num_input_tokens_seen": 46656192, "step": 35990 }, { "epoch": 1.7587521071018495, "grad_norm": 0.4415193796157837, "learning_rate": 1.227230253151615e-06, "loss": 0.0812, "num_input_tokens_seen": 46662784, "step": 35995 }, { "epoch": 1.7589964087655436, "grad_norm": 0.8721622824668884, "learning_rate": 1.2241939194025748e-06, "loss": 0.0777, "num_input_tokens_seen": 46669472, "step": 36000 }, { "epoch": 1.7589964087655436, "eval_loss": 0.08722388744354248, "eval_runtime": 374.4144, "eval_samples_per_second": 97.178, "eval_steps_per_second": 24.297, "num_input_tokens_seen": 46669472, "step": 36000 }, { "epoch": 1.759240710429238, "grad_norm": 0.18471749126911163, "learning_rate": 1.2211612521816156e-06, "loss": 0.1048, "num_input_tokens_seen": 46675936, "step": 36005 }, { "epoch": 1.7594850120929324, "grad_norm": 0.5835134983062744, "learning_rate": 1.2181322519564137e-06, "loss": 0.0894, "num_input_tokens_seen": 46682272, "step": 36010 }, { "epoch": 1.7597293137566266, "grad_norm": 0.24553616344928741, "learning_rate": 1.2151069191940839e-06, "loss": 0.0895, "num_input_tokens_seen": 46688768, "step": 36015 }, { "epoch": 1.759973615420321, "grad_norm": 0.45700883865356445, "learning_rate": 1.2120852543611644e-06, "loss": 0.0657, "num_input_tokens_seen": 46695584, "step": 36020 }, { "epoch": 1.7602179170840153, "grad_norm": 0.29727256298065186, "learning_rate": 1.2090672579236379e-06, "loss": 0.0863, "num_input_tokens_seen": 46702144, "step": 36025 }, { "epoch": 1.7604622187477097, "grad_norm": 0.1652219146490097, "learning_rate": 1.2060529303469126e-06, "loss": 0.0526, "num_input_tokens_seen": 46708672, "step": 36030 }, { "epoch": 1.760706520411404, "grad_norm": 0.29542288184165955, "learning_rate": 1.2030422720958445e-06, "loss": 0.1039, "num_input_tokens_seen": 46715008, "step": 36035 }, { "epoch": 1.7609508220750985, "grad_norm": 0.2488560825586319, "learning_rate": 1.200035283634704e-06, "loss": 0.1014, "num_input_tokens_seen": 46721312, "step": 36040 }, { "epoch": 1.7611951237387926, "grad_norm": 0.14321798086166382, "learning_rate": 1.1970319654272144e-06, "loss": 0.0602, "num_input_tokens_seen": 46728224, "step": 36045 }, { "epoch": 1.761439425402487, "grad_norm": 0.35261428356170654, "learning_rate": 1.1940323179365192e-06, "loss": 0.0799, "num_input_tokens_seen": 46734400, "step": 36050 }, { "epoch": 1.7616837270661814, "grad_norm": 0.5518016219139099, "learning_rate": 1.1910363416252095e-06, "loss": 0.0639, "num_input_tokens_seen": 46741088, "step": 36055 }, { "epoch": 1.7619280287298755, "grad_norm": 0.3522661626338959, "learning_rate": 1.1880440369552964e-06, "loss": 0.082, "num_input_tokens_seen": 46747488, "step": 36060 }, { "epoch": 1.76217233039357, "grad_norm": 0.4582001566886902, "learning_rate": 1.1850554043882328e-06, "loss": 0.0798, "num_input_tokens_seen": 46753664, "step": 36065 }, { "epoch": 1.7624166320572643, "grad_norm": 0.1942957639694214, "learning_rate": 1.1820704443849028e-06, "loss": 0.0799, "num_input_tokens_seen": 46760544, "step": 36070 }, { "epoch": 1.7626609337209587, "grad_norm": 0.1362931728363037, "learning_rate": 1.1790891574056219e-06, "loss": 0.0726, "num_input_tokens_seen": 46766880, "step": 36075 }, { "epoch": 1.762905235384653, "grad_norm": 0.41675907373428345, "learning_rate": 1.1761115439101523e-06, "loss": 0.0886, "num_input_tokens_seen": 46773024, "step": 36080 }, { "epoch": 1.7631495370483474, "grad_norm": 0.11262615770101547, "learning_rate": 1.1731376043576659e-06, "loss": 0.0704, "num_input_tokens_seen": 46779744, "step": 36085 }, { "epoch": 1.7633938387120416, "grad_norm": 0.3007643222808838, "learning_rate": 1.1701673392067875e-06, "loss": 0.0878, "num_input_tokens_seen": 46786304, "step": 36090 }, { "epoch": 1.763638140375736, "grad_norm": 0.6651868224143982, "learning_rate": 1.1672007489155757e-06, "loss": 0.075, "num_input_tokens_seen": 46792896, "step": 36095 }, { "epoch": 1.7638824420394303, "grad_norm": 0.282736212015152, "learning_rate": 1.164237833941506e-06, "loss": 0.0793, "num_input_tokens_seen": 46799232, "step": 36100 }, { "epoch": 1.7641267437031245, "grad_norm": 0.2624035179615021, "learning_rate": 1.1612785947415022e-06, "loss": 0.0822, "num_input_tokens_seen": 46805856, "step": 36105 }, { "epoch": 1.7643710453668189, "grad_norm": 0.16875137388706207, "learning_rate": 1.1583230317719185e-06, "loss": 0.0679, "num_input_tokens_seen": 46812512, "step": 36110 }, { "epoch": 1.7646153470305133, "grad_norm": 0.4586661159992218, "learning_rate": 1.1553711454885318e-06, "loss": 0.08, "num_input_tokens_seen": 46819968, "step": 36115 }, { "epoch": 1.7648596486942076, "grad_norm": 0.1423063427209854, "learning_rate": 1.152422936346567e-06, "loss": 0.0731, "num_input_tokens_seen": 46826080, "step": 36120 }, { "epoch": 1.765103950357902, "grad_norm": 0.5679332613945007, "learning_rate": 1.1494784048006718e-06, "loss": 0.0818, "num_input_tokens_seen": 46832896, "step": 36125 }, { "epoch": 1.7653482520215964, "grad_norm": 0.4897793233394623, "learning_rate": 1.1465375513049326e-06, "loss": 0.0857, "num_input_tokens_seen": 46839552, "step": 36130 }, { "epoch": 1.7655925536852906, "grad_norm": 0.22618569433689117, "learning_rate": 1.1436003763128616e-06, "loss": 0.0979, "num_input_tokens_seen": 46845568, "step": 36135 }, { "epoch": 1.765836855348985, "grad_norm": 0.08676314353942871, "learning_rate": 1.1406668802774106e-06, "loss": 0.0747, "num_input_tokens_seen": 46852416, "step": 36140 }, { "epoch": 1.766081157012679, "grad_norm": 0.2867613434791565, "learning_rate": 1.137737063650965e-06, "loss": 0.0959, "num_input_tokens_seen": 46859008, "step": 36145 }, { "epoch": 1.7663254586763735, "grad_norm": 0.597449004650116, "learning_rate": 1.1348109268853323e-06, "loss": 0.1071, "num_input_tokens_seen": 46865248, "step": 36150 }, { "epoch": 1.7665697603400679, "grad_norm": 0.5725751519203186, "learning_rate": 1.1318884704317634e-06, "loss": 0.1116, "num_input_tokens_seen": 46871872, "step": 36155 }, { "epoch": 1.7668140620037622, "grad_norm": 0.3825112283229828, "learning_rate": 1.1289696947409417e-06, "loss": 0.0797, "num_input_tokens_seen": 46878208, "step": 36160 }, { "epoch": 1.7670583636674566, "grad_norm": 0.22394882142543793, "learning_rate": 1.126054600262974e-06, "loss": 0.0999, "num_input_tokens_seen": 46884864, "step": 36165 }, { "epoch": 1.767302665331151, "grad_norm": 0.9464812278747559, "learning_rate": 1.1231431874474064e-06, "loss": 0.0849, "num_input_tokens_seen": 46891360, "step": 36170 }, { "epoch": 1.7675469669948454, "grad_norm": 0.6035975217819214, "learning_rate": 1.12023545674321e-06, "loss": 0.0917, "num_input_tokens_seen": 46897344, "step": 36175 }, { "epoch": 1.7677912686585395, "grad_norm": 0.613437294960022, "learning_rate": 1.117331408598804e-06, "loss": 0.1008, "num_input_tokens_seen": 46903488, "step": 36180 }, { "epoch": 1.768035570322234, "grad_norm": 0.24240007996559143, "learning_rate": 1.1144310434620191e-06, "loss": 0.0783, "num_input_tokens_seen": 46909984, "step": 36185 }, { "epoch": 1.768279871985928, "grad_norm": 0.2987649142742157, "learning_rate": 1.1115343617801365e-06, "loss": 0.0886, "num_input_tokens_seen": 46916736, "step": 36190 }, { "epoch": 1.7685241736496224, "grad_norm": 0.2080046534538269, "learning_rate": 1.1086413639998515e-06, "loss": 0.0884, "num_input_tokens_seen": 46923072, "step": 36195 }, { "epoch": 1.7687684753133168, "grad_norm": 0.7373991012573242, "learning_rate": 1.1057520505673103e-06, "loss": 0.095, "num_input_tokens_seen": 46929280, "step": 36200 }, { "epoch": 1.7687684753133168, "eval_loss": 0.08741622418165207, "eval_runtime": 374.6344, "eval_samples_per_second": 97.121, "eval_steps_per_second": 24.282, "num_input_tokens_seen": 46929280, "step": 36200 }, { "epoch": 1.7690127769770112, "grad_norm": 0.22856813669204712, "learning_rate": 1.1028664219280727e-06, "loss": 0.0892, "num_input_tokens_seen": 46935392, "step": 36205 }, { "epoch": 1.7692570786407056, "grad_norm": 0.33721745014190674, "learning_rate": 1.0999844785271468e-06, "loss": 0.0617, "num_input_tokens_seen": 46942176, "step": 36210 }, { "epoch": 1.7695013803044, "grad_norm": 0.4399031102657318, "learning_rate": 1.097106220808955e-06, "loss": 0.0965, "num_input_tokens_seen": 46948960, "step": 36215 }, { "epoch": 1.7697456819680943, "grad_norm": 0.2942976951599121, "learning_rate": 1.0942316492173698e-06, "loss": 0.0616, "num_input_tokens_seen": 46955712, "step": 36220 }, { "epoch": 1.7699899836317885, "grad_norm": 0.31336119771003723, "learning_rate": 1.0913607641956841e-06, "loss": 0.0701, "num_input_tokens_seen": 46962208, "step": 36225 }, { "epoch": 1.7702342852954829, "grad_norm": 0.26949527859687805, "learning_rate": 1.0884935661866213e-06, "loss": 0.0647, "num_input_tokens_seen": 46968672, "step": 36230 }, { "epoch": 1.770478586959177, "grad_norm": 0.20398803055286407, "learning_rate": 1.0856300556323418e-06, "loss": 0.1055, "num_input_tokens_seen": 46975264, "step": 36235 }, { "epoch": 1.7707228886228714, "grad_norm": 0.22734880447387695, "learning_rate": 1.0827702329744365e-06, "loss": 0.0781, "num_input_tokens_seen": 46981728, "step": 36240 }, { "epoch": 1.7709671902865658, "grad_norm": 0.47449791431427, "learning_rate": 1.0799140986539197e-06, "loss": 0.0844, "num_input_tokens_seen": 46988288, "step": 36245 }, { "epoch": 1.7712114919502602, "grad_norm": 0.44043654203414917, "learning_rate": 1.0770616531112526e-06, "loss": 0.0895, "num_input_tokens_seen": 46995296, "step": 36250 }, { "epoch": 1.7714557936139546, "grad_norm": 0.28273266553878784, "learning_rate": 1.0742128967863085e-06, "loss": 0.098, "num_input_tokens_seen": 47001664, "step": 36255 }, { "epoch": 1.771700095277649, "grad_norm": 0.2815449833869934, "learning_rate": 1.071367830118411e-06, "loss": 0.0935, "num_input_tokens_seen": 47007904, "step": 36260 }, { "epoch": 1.7719443969413433, "grad_norm": 0.24684453010559082, "learning_rate": 1.068526453546298e-06, "loss": 0.0886, "num_input_tokens_seen": 47013696, "step": 36265 }, { "epoch": 1.7721886986050375, "grad_norm": 0.5741454362869263, "learning_rate": 1.0656887675081467e-06, "loss": 0.1108, "num_input_tokens_seen": 47020352, "step": 36270 }, { "epoch": 1.7724330002687319, "grad_norm": 0.08087961375713348, "learning_rate": 1.0628547724415628e-06, "loss": 0.0751, "num_input_tokens_seen": 47026592, "step": 36275 }, { "epoch": 1.772677301932426, "grad_norm": 0.20515528321266174, "learning_rate": 1.0600244687835881e-06, "loss": 0.0893, "num_input_tokens_seen": 47032896, "step": 36280 }, { "epoch": 1.7729216035961204, "grad_norm": 0.22675055265426636, "learning_rate": 1.0571978569706876e-06, "loss": 0.0566, "num_input_tokens_seen": 47040192, "step": 36285 }, { "epoch": 1.7731659052598148, "grad_norm": 0.27050039172172546, "learning_rate": 1.0543749374387652e-06, "loss": 0.0714, "num_input_tokens_seen": 47046752, "step": 36290 }, { "epoch": 1.7734102069235091, "grad_norm": 0.11562774330377579, "learning_rate": 1.051555710623142e-06, "loss": 0.0727, "num_input_tokens_seen": 47053472, "step": 36295 }, { "epoch": 1.7736545085872035, "grad_norm": 0.4354059100151062, "learning_rate": 1.0487401769585847e-06, "loss": 0.0744, "num_input_tokens_seen": 47059776, "step": 36300 }, { "epoch": 1.773898810250898, "grad_norm": 0.2228255569934845, "learning_rate": 1.0459283368792845e-06, "loss": 0.0827, "num_input_tokens_seen": 47066048, "step": 36305 }, { "epoch": 1.7741431119145923, "grad_norm": 0.2375064343214035, "learning_rate": 1.043120190818858e-06, "loss": 0.1028, "num_input_tokens_seen": 47072512, "step": 36310 }, { "epoch": 1.7743874135782864, "grad_norm": 0.2776833474636078, "learning_rate": 1.0403157392103596e-06, "loss": 0.0927, "num_input_tokens_seen": 47078816, "step": 36315 }, { "epoch": 1.7746317152419808, "grad_norm": 0.2707836627960205, "learning_rate": 1.0375149824862735e-06, "loss": 0.0657, "num_input_tokens_seen": 47085216, "step": 36320 }, { "epoch": 1.774876016905675, "grad_norm": 0.2408815324306488, "learning_rate": 1.034717921078507e-06, "loss": 0.087, "num_input_tokens_seen": 47091488, "step": 36325 }, { "epoch": 1.7751203185693694, "grad_norm": 0.26874464750289917, "learning_rate": 1.0319245554184009e-06, "loss": 0.094, "num_input_tokens_seen": 47097920, "step": 36330 }, { "epoch": 1.7753646202330637, "grad_norm": 0.16638149321079254, "learning_rate": 1.0291348859367361e-06, "loss": 0.0783, "num_input_tokens_seen": 47104288, "step": 36335 }, { "epoch": 1.7756089218967581, "grad_norm": 0.35376429557800293, "learning_rate": 1.0263489130637016e-06, "loss": 0.0866, "num_input_tokens_seen": 47110976, "step": 36340 }, { "epoch": 1.7758532235604525, "grad_norm": 0.18011754751205444, "learning_rate": 1.0235666372289427e-06, "loss": 0.0832, "num_input_tokens_seen": 47117568, "step": 36345 }, { "epoch": 1.7760975252241469, "grad_norm": 0.44133833050727844, "learning_rate": 1.0207880588615076e-06, "loss": 0.0679, "num_input_tokens_seen": 47123712, "step": 36350 }, { "epoch": 1.7763418268878413, "grad_norm": 0.5363987684249878, "learning_rate": 1.0180131783898984e-06, "loss": 0.1102, "num_input_tokens_seen": 47130208, "step": 36355 }, { "epoch": 1.7765861285515354, "grad_norm": 0.32520681619644165, "learning_rate": 1.0152419962420362e-06, "loss": 0.0812, "num_input_tokens_seen": 47136992, "step": 36360 }, { "epoch": 1.7768304302152298, "grad_norm": 0.7772891521453857, "learning_rate": 1.0124745128452685e-06, "loss": 0.076, "num_input_tokens_seen": 47143712, "step": 36365 }, { "epoch": 1.777074731878924, "grad_norm": 0.18819963932037354, "learning_rate": 1.0097107286263758e-06, "loss": 0.1166, "num_input_tokens_seen": 47150144, "step": 36370 }, { "epoch": 1.7773190335426183, "grad_norm": 0.28942960500717163, "learning_rate": 1.00695064401157e-06, "loss": 0.0959, "num_input_tokens_seen": 47156192, "step": 36375 }, { "epoch": 1.7775633352063127, "grad_norm": 0.2556678056716919, "learning_rate": 1.0041942594264886e-06, "loss": 0.0935, "num_input_tokens_seen": 47162560, "step": 36380 }, { "epoch": 1.777807636870007, "grad_norm": 0.38516396284103394, "learning_rate": 1.001441575296208e-06, "loss": 0.0789, "num_input_tokens_seen": 47168544, "step": 36385 }, { "epoch": 1.7780519385337015, "grad_norm": 0.7810837626457214, "learning_rate": 9.986925920452139e-07, "loss": 0.0906, "num_input_tokens_seen": 47174848, "step": 36390 }, { "epoch": 1.7782962401973958, "grad_norm": 0.5665349364280701, "learning_rate": 9.959473100974475e-07, "loss": 0.1075, "num_input_tokens_seen": 47181536, "step": 36395 }, { "epoch": 1.7785405418610902, "grad_norm": 0.3744535744190216, "learning_rate": 9.932057298762564e-07, "loss": 0.1026, "num_input_tokens_seen": 47188416, "step": 36400 }, { "epoch": 1.7785405418610902, "eval_loss": 0.08749520778656006, "eval_runtime": 374.498, "eval_samples_per_second": 97.157, "eval_steps_per_second": 24.291, "num_input_tokens_seen": 47188416, "step": 36400 }, { "epoch": 1.7787848435247844, "grad_norm": 0.1363677680492401, "learning_rate": 9.90467851804433e-07, "loss": 0.0694, "num_input_tokens_seen": 47194688, "step": 36405 }, { "epoch": 1.7790291451884788, "grad_norm": 0.5941649079322815, "learning_rate": 9.877336763041895e-07, "loss": 0.1021, "num_input_tokens_seen": 47201216, "step": 36410 }, { "epoch": 1.779273446852173, "grad_norm": 0.720389187335968, "learning_rate": 9.850032037971662e-07, "loss": 0.0746, "num_input_tokens_seen": 47207744, "step": 36415 }, { "epoch": 1.7795177485158673, "grad_norm": 0.23392976820468903, "learning_rate": 9.822764347044406e-07, "loss": 0.0785, "num_input_tokens_seen": 47213984, "step": 36420 }, { "epoch": 1.7797620501795617, "grad_norm": 0.2989122271537781, "learning_rate": 9.795533694465175e-07, "loss": 0.0861, "num_input_tokens_seen": 47220320, "step": 36425 }, { "epoch": 1.780006351843256, "grad_norm": 0.4564272463321686, "learning_rate": 9.768340084433197e-07, "loss": 0.0857, "num_input_tokens_seen": 47226592, "step": 36430 }, { "epoch": 1.7802506535069504, "grad_norm": 0.7117528915405273, "learning_rate": 9.741183521142143e-07, "loss": 0.07, "num_input_tokens_seen": 47233056, "step": 36435 }, { "epoch": 1.7804949551706448, "grad_norm": 0.41116079688072205, "learning_rate": 9.714064008779889e-07, "loss": 0.0797, "num_input_tokens_seen": 47239776, "step": 36440 }, { "epoch": 1.7807392568343392, "grad_norm": 0.24881967902183533, "learning_rate": 9.686981551528584e-07, "loss": 0.0714, "num_input_tokens_seen": 47246080, "step": 36445 }, { "epoch": 1.7809835584980334, "grad_norm": 0.4230796992778778, "learning_rate": 9.65993615356467e-07, "loss": 0.0883, "num_input_tokens_seen": 47252512, "step": 36450 }, { "epoch": 1.7812278601617277, "grad_norm": 0.21590961515903473, "learning_rate": 9.632927819058917e-07, "loss": 0.1022, "num_input_tokens_seen": 47259136, "step": 36455 }, { "epoch": 1.781472161825422, "grad_norm": 0.16815058887004852, "learning_rate": 9.605956552176305e-07, "loss": 0.0906, "num_input_tokens_seen": 47265632, "step": 36460 }, { "epoch": 1.7817164634891163, "grad_norm": 0.6332250237464905, "learning_rate": 9.579022357076223e-07, "loss": 0.0864, "num_input_tokens_seen": 47272000, "step": 36465 }, { "epoch": 1.7819607651528107, "grad_norm": 0.23806914687156677, "learning_rate": 9.552125237912158e-07, "loss": 0.1113, "num_input_tokens_seen": 47278272, "step": 36470 }, { "epoch": 1.782205066816505, "grad_norm": 0.10068871825933456, "learning_rate": 9.525265198832096e-07, "loss": 0.0786, "num_input_tokens_seen": 47284928, "step": 36475 }, { "epoch": 1.7824493684801994, "grad_norm": 0.20253975689411163, "learning_rate": 9.498442243978112e-07, "loss": 0.0889, "num_input_tokens_seen": 47291328, "step": 36480 }, { "epoch": 1.7826936701438938, "grad_norm": 0.17373478412628174, "learning_rate": 9.471656377486649e-07, "loss": 0.0786, "num_input_tokens_seen": 47298272, "step": 36485 }, { "epoch": 1.7829379718075882, "grad_norm": 0.34783604741096497, "learning_rate": 9.444907603488456e-07, "loss": 0.1186, "num_input_tokens_seen": 47304736, "step": 36490 }, { "epoch": 1.7831822734712823, "grad_norm": 0.49486520886421204, "learning_rate": 9.418195926108514e-07, "loss": 0.0909, "num_input_tokens_seen": 47310976, "step": 36495 }, { "epoch": 1.7834265751349767, "grad_norm": 0.9890741109848022, "learning_rate": 9.391521349466053e-07, "loss": 0.1033, "num_input_tokens_seen": 47317696, "step": 36500 }, { "epoch": 1.7836708767986709, "grad_norm": 0.3765221834182739, "learning_rate": 9.364883877674758e-07, "loss": 0.0994, "num_input_tokens_seen": 47323552, "step": 36505 }, { "epoch": 1.7839151784623652, "grad_norm": 0.1683756709098816, "learning_rate": 9.33828351484231e-07, "loss": 0.1012, "num_input_tokens_seen": 47330208, "step": 36510 }, { "epoch": 1.7841594801260596, "grad_norm": 0.3004171848297119, "learning_rate": 9.311720265070906e-07, "loss": 0.0911, "num_input_tokens_seen": 47337088, "step": 36515 }, { "epoch": 1.784403781789754, "grad_norm": 0.29894423484802246, "learning_rate": 9.285194132456931e-07, "loss": 0.0877, "num_input_tokens_seen": 47343712, "step": 36520 }, { "epoch": 1.7846480834534484, "grad_norm": 0.29931747913360596, "learning_rate": 9.258705121091032e-07, "loss": 0.0656, "num_input_tokens_seen": 47350240, "step": 36525 }, { "epoch": 1.7848923851171428, "grad_norm": 0.9012888073921204, "learning_rate": 9.232253235058136e-07, "loss": 0.0882, "num_input_tokens_seen": 47357248, "step": 36530 }, { "epoch": 1.785136686780837, "grad_norm": 0.5631705522537231, "learning_rate": 9.205838478437478e-07, "loss": 0.079, "num_input_tokens_seen": 47363712, "step": 36535 }, { "epoch": 1.7853809884445313, "grad_norm": 0.5172924995422363, "learning_rate": 9.179460855302524e-07, "loss": 0.1141, "num_input_tokens_seen": 47370336, "step": 36540 }, { "epoch": 1.7856252901082257, "grad_norm": 0.26401156187057495, "learning_rate": 9.153120369721046e-07, "loss": 0.0883, "num_input_tokens_seen": 47376800, "step": 36545 }, { "epoch": 1.7858695917719198, "grad_norm": 0.21216543018817902, "learning_rate": 9.126817025755103e-07, "loss": 0.0734, "num_input_tokens_seen": 47383360, "step": 36550 }, { "epoch": 1.7861138934356142, "grad_norm": 0.3284030258655548, "learning_rate": 9.100550827460947e-07, "loss": 0.0839, "num_input_tokens_seen": 47389696, "step": 36555 }, { "epoch": 1.7863581950993086, "grad_norm": 0.3512326776981354, "learning_rate": 9.0743217788892e-07, "loss": 0.0781, "num_input_tokens_seen": 47396128, "step": 36560 }, { "epoch": 1.786602496763003, "grad_norm": 0.6017968654632568, "learning_rate": 9.048129884084683e-07, "loss": 0.0885, "num_input_tokens_seen": 47402880, "step": 36565 }, { "epoch": 1.7868467984266974, "grad_norm": 0.4905262291431427, "learning_rate": 9.021975147086553e-07, "loss": 0.084, "num_input_tokens_seen": 47409376, "step": 36570 }, { "epoch": 1.7870911000903917, "grad_norm": 0.34956854581832886, "learning_rate": 8.995857571928141e-07, "loss": 0.0759, "num_input_tokens_seen": 47415520, "step": 36575 }, { "epoch": 1.787335401754086, "grad_norm": 0.41061750054359436, "learning_rate": 8.969777162637139e-07, "loss": 0.0865, "num_input_tokens_seen": 47421984, "step": 36580 }, { "epoch": 1.7875797034177803, "grad_norm": 1.030691146850586, "learning_rate": 8.943733923235525e-07, "loss": 0.1103, "num_input_tokens_seen": 47428480, "step": 36585 }, { "epoch": 1.7878240050814747, "grad_norm": 0.2693135142326355, "learning_rate": 8.917727857739394e-07, "loss": 0.0813, "num_input_tokens_seen": 47434944, "step": 36590 }, { "epoch": 1.7880683067451688, "grad_norm": 0.14511245489120483, "learning_rate": 8.891758970159258e-07, "loss": 0.0537, "num_input_tokens_seen": 47441088, "step": 36595 }, { "epoch": 1.7883126084088632, "grad_norm": 0.24445882439613342, "learning_rate": 8.86582726449986e-07, "loss": 0.101, "num_input_tokens_seen": 47447328, "step": 36600 }, { "epoch": 1.7883126084088632, "eval_loss": 0.08741970360279083, "eval_runtime": 374.6588, "eval_samples_per_second": 97.115, "eval_steps_per_second": 24.281, "num_input_tokens_seen": 47447328, "step": 36600 }, { "epoch": 1.7885569100725576, "grad_norm": 0.3121313750743866, "learning_rate": 8.839932744760165e-07, "loss": 0.0858, "num_input_tokens_seen": 47454016, "step": 36605 }, { "epoch": 1.788801211736252, "grad_norm": 0.22005532681941986, "learning_rate": 8.814075414933482e-07, "loss": 0.0959, "num_input_tokens_seen": 47460576, "step": 36610 }, { "epoch": 1.7890455133999463, "grad_norm": 0.2279372215270996, "learning_rate": 8.788255279007257e-07, "loss": 0.0843, "num_input_tokens_seen": 47467040, "step": 36615 }, { "epoch": 1.7892898150636407, "grad_norm": 0.6190971732139587, "learning_rate": 8.762472340963362e-07, "loss": 0.089, "num_input_tokens_seen": 47473408, "step": 36620 }, { "epoch": 1.7895341167273349, "grad_norm": 0.15147997438907623, "learning_rate": 8.736726604777811e-07, "loss": 0.0733, "num_input_tokens_seen": 47479872, "step": 36625 }, { "epoch": 1.7897784183910292, "grad_norm": 0.7839540839195251, "learning_rate": 8.711018074420901e-07, "loss": 0.0852, "num_input_tokens_seen": 47486368, "step": 36630 }, { "epoch": 1.7900227200547236, "grad_norm": 0.3957987129688263, "learning_rate": 8.685346753857209e-07, "loss": 0.0873, "num_input_tokens_seen": 47492832, "step": 36635 }, { "epoch": 1.7902670217184178, "grad_norm": 0.5179715752601624, "learning_rate": 8.659712647045654e-07, "loss": 0.0896, "num_input_tokens_seen": 47499360, "step": 36640 }, { "epoch": 1.7905113233821122, "grad_norm": 0.5497621297836304, "learning_rate": 8.634115757939209e-07, "loss": 0.0928, "num_input_tokens_seen": 47506016, "step": 36645 }, { "epoch": 1.7907556250458065, "grad_norm": 0.36986058950424194, "learning_rate": 8.608556090485387e-07, "loss": 0.0772, "num_input_tokens_seen": 47512160, "step": 36650 }, { "epoch": 1.790999926709501, "grad_norm": 0.1092635989189148, "learning_rate": 8.583033648625671e-07, "loss": 0.069, "num_input_tokens_seen": 47519072, "step": 36655 }, { "epoch": 1.7912442283731953, "grad_norm": 0.27572759985923767, "learning_rate": 8.557548436295998e-07, "loss": 0.1047, "num_input_tokens_seen": 47525632, "step": 36660 }, { "epoch": 1.7914885300368897, "grad_norm": 0.6719483137130737, "learning_rate": 8.532100457426556e-07, "loss": 0.0846, "num_input_tokens_seen": 47531520, "step": 36665 }, { "epoch": 1.7917328317005838, "grad_norm": 0.2494811862707138, "learning_rate": 8.506689715941679e-07, "loss": 0.0821, "num_input_tokens_seen": 47537760, "step": 36670 }, { "epoch": 1.7919771333642782, "grad_norm": 0.28597211837768555, "learning_rate": 8.481316215760011e-07, "loss": 0.061, "num_input_tokens_seen": 47544448, "step": 36675 }, { "epoch": 1.7922214350279724, "grad_norm": 0.4796362817287445, "learning_rate": 8.455979960794558e-07, "loss": 0.1231, "num_input_tokens_seen": 47550944, "step": 36680 }, { "epoch": 1.7924657366916668, "grad_norm": 0.8453354239463806, "learning_rate": 8.430680954952364e-07, "loss": 0.1049, "num_input_tokens_seen": 47556864, "step": 36685 }, { "epoch": 1.7927100383553611, "grad_norm": 0.22046791017055511, "learning_rate": 8.405419202134974e-07, "loss": 0.0564, "num_input_tokens_seen": 47563616, "step": 36690 }, { "epoch": 1.7929543400190555, "grad_norm": 0.413933128118515, "learning_rate": 8.380194706237993e-07, "loss": 0.0788, "num_input_tokens_seen": 47570304, "step": 36695 }, { "epoch": 1.79319864168275, "grad_norm": 0.4879229962825775, "learning_rate": 8.355007471151366e-07, "loss": 0.083, "num_input_tokens_seen": 47576896, "step": 36700 }, { "epoch": 1.7934429433464443, "grad_norm": 0.3265645205974579, "learning_rate": 8.329857500759292e-07, "loss": 0.077, "num_input_tokens_seen": 47583680, "step": 36705 }, { "epoch": 1.7936872450101387, "grad_norm": 0.22143974900245667, "learning_rate": 8.304744798940194e-07, "loss": 0.0899, "num_input_tokens_seen": 47590368, "step": 36710 }, { "epoch": 1.7939315466738328, "grad_norm": 0.846732497215271, "learning_rate": 8.279669369566756e-07, "loss": 0.0728, "num_input_tokens_seen": 47596512, "step": 36715 }, { "epoch": 1.7941758483375272, "grad_norm": 0.38379454612731934, "learning_rate": 8.254631216505993e-07, "loss": 0.0876, "num_input_tokens_seen": 47602784, "step": 36720 }, { "epoch": 1.7944201500012213, "grad_norm": 0.5739847421646118, "learning_rate": 8.229630343619038e-07, "loss": 0.086, "num_input_tokens_seen": 47608960, "step": 36725 }, { "epoch": 1.7946644516649157, "grad_norm": 0.20050425827503204, "learning_rate": 8.204666754761392e-07, "loss": 0.0791, "num_input_tokens_seen": 47615456, "step": 36730 }, { "epoch": 1.79490875332861, "grad_norm": 0.24025839567184448, "learning_rate": 8.179740453782669e-07, "loss": 0.0889, "num_input_tokens_seen": 47621856, "step": 36735 }, { "epoch": 1.7951530549923045, "grad_norm": 0.14178523421287537, "learning_rate": 8.154851444526907e-07, "loss": 0.0879, "num_input_tokens_seen": 47627904, "step": 36740 }, { "epoch": 1.7953973566559989, "grad_norm": 0.19124835729599, "learning_rate": 8.129999730832283e-07, "loss": 0.0877, "num_input_tokens_seen": 47635008, "step": 36745 }, { "epoch": 1.7956416583196932, "grad_norm": 0.40901145339012146, "learning_rate": 8.105185316531178e-07, "loss": 0.0604, "num_input_tokens_seen": 47641344, "step": 36750 }, { "epoch": 1.7958859599833876, "grad_norm": 0.12399322539567947, "learning_rate": 8.08040820545039e-07, "loss": 0.0848, "num_input_tokens_seen": 47648064, "step": 36755 }, { "epoch": 1.7961302616470818, "grad_norm": 0.2488618791103363, "learning_rate": 8.055668401410782e-07, "loss": 0.098, "num_input_tokens_seen": 47654816, "step": 36760 }, { "epoch": 1.7963745633107762, "grad_norm": 0.1734081357717514, "learning_rate": 8.030965908227578e-07, "loss": 0.0869, "num_input_tokens_seen": 47661440, "step": 36765 }, { "epoch": 1.7966188649744703, "grad_norm": 0.5823834538459778, "learning_rate": 8.006300729710203e-07, "loss": 0.0793, "num_input_tokens_seen": 47667776, "step": 36770 }, { "epoch": 1.7968631666381647, "grad_norm": 0.20649102330207825, "learning_rate": 7.981672869662337e-07, "loss": 0.0719, "num_input_tokens_seen": 47674080, "step": 36775 }, { "epoch": 1.797107468301859, "grad_norm": 0.16873891651630402, "learning_rate": 7.957082331881888e-07, "loss": 0.0786, "num_input_tokens_seen": 47680416, "step": 36780 }, { "epoch": 1.7973517699655535, "grad_norm": 0.17952753603458405, "learning_rate": 7.932529120161069e-07, "loss": 0.0954, "num_input_tokens_seen": 47687328, "step": 36785 }, { "epoch": 1.7975960716292478, "grad_norm": 0.23850370943546295, "learning_rate": 7.908013238286243e-07, "loss": 0.0713, "num_input_tokens_seen": 47693696, "step": 36790 }, { "epoch": 1.7978403732929422, "grad_norm": 0.15681089460849762, "learning_rate": 7.883534690038136e-07, "loss": 0.0761, "num_input_tokens_seen": 47700128, "step": 36795 }, { "epoch": 1.7980846749566366, "grad_norm": 0.33839455246925354, "learning_rate": 7.859093479191559e-07, "loss": 0.1037, "num_input_tokens_seen": 47707040, "step": 36800 }, { "epoch": 1.7980846749566366, "eval_loss": 0.08743904531002045, "eval_runtime": 374.8799, "eval_samples_per_second": 97.058, "eval_steps_per_second": 24.266, "num_input_tokens_seen": 47707040, "step": 36800 }, { "epoch": 1.7983289766203308, "grad_norm": 0.41581660509109497, "learning_rate": 7.834689609515722e-07, "loss": 0.0964, "num_input_tokens_seen": 47713664, "step": 36805 }, { "epoch": 1.7985732782840251, "grad_norm": 0.2758987843990326, "learning_rate": 7.810323084774002e-07, "loss": 0.0635, "num_input_tokens_seen": 47720416, "step": 36810 }, { "epoch": 1.7988175799477193, "grad_norm": 0.2451661378145218, "learning_rate": 7.785993908723976e-07, "loss": 0.0898, "num_input_tokens_seen": 47727040, "step": 36815 }, { "epoch": 1.7990618816114137, "grad_norm": 0.7338926792144775, "learning_rate": 7.761702085117534e-07, "loss": 0.0913, "num_input_tokens_seen": 47733952, "step": 36820 }, { "epoch": 1.799306183275108, "grad_norm": 0.35888850688934326, "learning_rate": 7.737447617700844e-07, "loss": 0.0591, "num_input_tokens_seen": 47740512, "step": 36825 }, { "epoch": 1.7995504849388024, "grad_norm": 0.30465686321258545, "learning_rate": 7.713230510214136e-07, "loss": 0.0959, "num_input_tokens_seen": 47747008, "step": 36830 }, { "epoch": 1.7997947866024968, "grad_norm": 0.29099783301353455, "learning_rate": 7.689050766392092e-07, "loss": 0.0469, "num_input_tokens_seen": 47753856, "step": 36835 }, { "epoch": 1.8000390882661912, "grad_norm": 0.2338351011276245, "learning_rate": 7.664908389963477e-07, "loss": 0.0754, "num_input_tokens_seen": 47760192, "step": 36840 }, { "epoch": 1.8002833899298856, "grad_norm": 0.1512574702501297, "learning_rate": 7.64080338465134e-07, "loss": 0.0725, "num_input_tokens_seen": 47766976, "step": 36845 }, { "epoch": 1.8005276915935797, "grad_norm": 0.3091677725315094, "learning_rate": 7.616735754173043e-07, "loss": 0.0681, "num_input_tokens_seen": 47773824, "step": 36850 }, { "epoch": 1.800771993257274, "grad_norm": 0.5429510474205017, "learning_rate": 7.592705502240005e-07, "loss": 0.0919, "num_input_tokens_seen": 47780448, "step": 36855 }, { "epoch": 1.8010162949209683, "grad_norm": 0.3514307737350464, "learning_rate": 7.568712632558095e-07, "loss": 0.0964, "num_input_tokens_seen": 47786944, "step": 36860 }, { "epoch": 1.8012605965846626, "grad_norm": 0.31189578771591187, "learning_rate": 7.544757148827297e-07, "loss": 0.0646, "num_input_tokens_seen": 47793344, "step": 36865 }, { "epoch": 1.801504898248357, "grad_norm": 0.16981376707553864, "learning_rate": 7.520839054741797e-07, "loss": 0.0793, "num_input_tokens_seen": 47799904, "step": 36870 }, { "epoch": 1.8017491999120514, "grad_norm": 0.24321086704730988, "learning_rate": 7.496958353990113e-07, "loss": 0.0778, "num_input_tokens_seen": 47806752, "step": 36875 }, { "epoch": 1.8019935015757458, "grad_norm": 1.2133188247680664, "learning_rate": 7.473115050254941e-07, "loss": 0.1005, "num_input_tokens_seen": 47813056, "step": 36880 }, { "epoch": 1.8022378032394402, "grad_norm": 0.37272289395332336, "learning_rate": 7.449309147213173e-07, "loss": 0.0991, "num_input_tokens_seen": 47819616, "step": 36885 }, { "epoch": 1.8024821049031345, "grad_norm": 0.2065863162279129, "learning_rate": 7.425540648536067e-07, "loss": 0.0829, "num_input_tokens_seen": 47825888, "step": 36890 }, { "epoch": 1.8027264065668287, "grad_norm": 0.1652543544769287, "learning_rate": 7.40180955788894e-07, "loss": 0.0766, "num_input_tokens_seen": 47832544, "step": 36895 }, { "epoch": 1.802970708230523, "grad_norm": 0.5347408652305603, "learning_rate": 7.378115878931474e-07, "loss": 0.0849, "num_input_tokens_seen": 47839200, "step": 36900 }, { "epoch": 1.8032150098942172, "grad_norm": 0.3828139901161194, "learning_rate": 7.354459615317527e-07, "loss": 0.0896, "num_input_tokens_seen": 47845664, "step": 36905 }, { "epoch": 1.8034593115579116, "grad_norm": 0.30294379591941833, "learning_rate": 7.33084077069518e-07, "loss": 0.1003, "num_input_tokens_seen": 47852000, "step": 36910 }, { "epoch": 1.803703613221606, "grad_norm": 0.1441645473241806, "learning_rate": 7.307259348706768e-07, "loss": 0.0869, "num_input_tokens_seen": 47858272, "step": 36915 }, { "epoch": 1.8039479148853004, "grad_norm": 0.2875230014324188, "learning_rate": 7.283715352988801e-07, "loss": 0.0577, "num_input_tokens_seen": 47864576, "step": 36920 }, { "epoch": 1.8041922165489948, "grad_norm": 0.25139302015304565, "learning_rate": 7.260208787172068e-07, "loss": 0.0964, "num_input_tokens_seen": 47870720, "step": 36925 }, { "epoch": 1.8044365182126891, "grad_norm": 0.26408684253692627, "learning_rate": 7.23673965488167e-07, "loss": 0.0936, "num_input_tokens_seen": 47876800, "step": 36930 }, { "epoch": 1.8046808198763835, "grad_norm": 0.2857590317726135, "learning_rate": 7.213307959736709e-07, "loss": 0.0981, "num_input_tokens_seen": 47882976, "step": 36935 }, { "epoch": 1.8049251215400777, "grad_norm": 0.4024498760700226, "learning_rate": 7.189913705350715e-07, "loss": 0.079, "num_input_tokens_seen": 47889440, "step": 36940 }, { "epoch": 1.805169423203772, "grad_norm": 0.3587559163570404, "learning_rate": 7.166556895331411e-07, "loss": 0.085, "num_input_tokens_seen": 47895584, "step": 36945 }, { "epoch": 1.8054137248674662, "grad_norm": 0.13393817842006683, "learning_rate": 7.143237533280639e-07, "loss": 0.0716, "num_input_tokens_seen": 47902240, "step": 36950 }, { "epoch": 1.8056580265311606, "grad_norm": 0.2773132920265198, "learning_rate": 7.119955622794578e-07, "loss": 0.0968, "num_input_tokens_seen": 47908544, "step": 36955 }, { "epoch": 1.805902328194855, "grad_norm": 0.10798314958810806, "learning_rate": 7.096711167463577e-07, "loss": 0.095, "num_input_tokens_seen": 47914816, "step": 36960 }, { "epoch": 1.8061466298585493, "grad_norm": 0.18852964043617249, "learning_rate": 7.073504170872213e-07, "loss": 0.114, "num_input_tokens_seen": 47921248, "step": 36965 }, { "epoch": 1.8063909315222437, "grad_norm": 0.3044116497039795, "learning_rate": 7.05033463659932e-07, "loss": 0.1098, "num_input_tokens_seen": 47927104, "step": 36970 }, { "epoch": 1.806635233185938, "grad_norm": 0.12692582607269287, "learning_rate": 7.027202568217928e-07, "loss": 0.0706, "num_input_tokens_seen": 47933696, "step": 36975 }, { "epoch": 1.8068795348496325, "grad_norm": 0.2131241261959076, "learning_rate": 7.004107969295293e-07, "loss": 0.0699, "num_input_tokens_seen": 47940224, "step": 36980 }, { "epoch": 1.8071238365133266, "grad_norm": 0.36713457107543945, "learning_rate": 6.9810508433929e-07, "loss": 0.1039, "num_input_tokens_seen": 47946272, "step": 36985 }, { "epoch": 1.807368138177021, "grad_norm": 0.3865865170955658, "learning_rate": 6.958031194066406e-07, "loss": 0.0732, "num_input_tokens_seen": 47952896, "step": 36990 }, { "epoch": 1.8076124398407152, "grad_norm": 0.32811659574508667, "learning_rate": 6.935049024865776e-07, "loss": 0.1037, "num_input_tokens_seen": 47959840, "step": 36995 }, { "epoch": 1.8078567415044096, "grad_norm": 0.1855265349149704, "learning_rate": 6.912104339335118e-07, "loss": 0.0706, "num_input_tokens_seen": 47966176, "step": 37000 }, { "epoch": 1.8078567415044096, "eval_loss": 0.0872654840350151, "eval_runtime": 374.4261, "eval_samples_per_second": 97.175, "eval_steps_per_second": 24.296, "num_input_tokens_seen": 47966176, "step": 37000 }, { "epoch": 1.808101043168104, "grad_norm": 0.18498347699642181, "learning_rate": 6.889197141012799e-07, "loss": 0.0718, "num_input_tokens_seen": 47972704, "step": 37005 }, { "epoch": 1.8083453448317983, "grad_norm": 0.45531463623046875, "learning_rate": 6.866327433431435e-07, "loss": 0.0999, "num_input_tokens_seen": 47979104, "step": 37010 }, { "epoch": 1.8085896464954927, "grad_norm": 0.19972887635231018, "learning_rate": 6.843495220117735e-07, "loss": 0.0749, "num_input_tokens_seen": 47985760, "step": 37015 }, { "epoch": 1.808833948159187, "grad_norm": 0.12133663892745972, "learning_rate": 6.820700504592798e-07, "loss": 0.0585, "num_input_tokens_seen": 47992832, "step": 37020 }, { "epoch": 1.8090782498228815, "grad_norm": 0.16038894653320312, "learning_rate": 6.797943290371839e-07, "loss": 0.0558, "num_input_tokens_seen": 47998720, "step": 37025 }, { "epoch": 1.8093225514865756, "grad_norm": 0.28751933574676514, "learning_rate": 6.775223580964274e-07, "loss": 0.1151, "num_input_tokens_seen": 48004864, "step": 37030 }, { "epoch": 1.80956685315027, "grad_norm": 0.2312197983264923, "learning_rate": 6.7525413798738e-07, "loss": 0.0815, "num_input_tokens_seen": 48011424, "step": 37035 }, { "epoch": 1.8098111548139642, "grad_norm": 0.19738316535949707, "learning_rate": 6.729896690598259e-07, "loss": 0.1166, "num_input_tokens_seen": 48017696, "step": 37040 }, { "epoch": 1.8100554564776585, "grad_norm": 0.3705195486545563, "learning_rate": 6.707289516629772e-07, "loss": 0.1055, "num_input_tokens_seen": 48024512, "step": 37045 }, { "epoch": 1.810299758141353, "grad_norm": 0.4315304458141327, "learning_rate": 6.684719861454692e-07, "loss": 0.0845, "num_input_tokens_seen": 48031392, "step": 37050 }, { "epoch": 1.8105440598050473, "grad_norm": 0.4348621070384979, "learning_rate": 6.662187728553481e-07, "loss": 0.0905, "num_input_tokens_seen": 48037888, "step": 37055 }, { "epoch": 1.8107883614687417, "grad_norm": 0.2999916672706604, "learning_rate": 6.639693121400892e-07, "loss": 0.1052, "num_input_tokens_seen": 48044160, "step": 37060 }, { "epoch": 1.811032663132436, "grad_norm": 0.22956499457359314, "learning_rate": 6.617236043465868e-07, "loss": 0.0619, "num_input_tokens_seen": 48050592, "step": 37065 }, { "epoch": 1.8112769647961302, "grad_norm": 0.2599698007106781, "learning_rate": 6.594816498211587e-07, "loss": 0.0892, "num_input_tokens_seen": 48056800, "step": 37070 }, { "epoch": 1.8115212664598246, "grad_norm": 0.6260800361633301, "learning_rate": 6.572434489095447e-07, "loss": 0.0875, "num_input_tokens_seen": 48063424, "step": 37075 }, { "epoch": 1.811765568123519, "grad_norm": 0.2482348531484604, "learning_rate": 6.550090019568994e-07, "loss": 0.0905, "num_input_tokens_seen": 48069984, "step": 37080 }, { "epoch": 1.8120098697872131, "grad_norm": 0.5674565434455872, "learning_rate": 6.527783093078027e-07, "loss": 0.0873, "num_input_tokens_seen": 48076608, "step": 37085 }, { "epoch": 1.8122541714509075, "grad_norm": 0.38804739713668823, "learning_rate": 6.5055137130626e-07, "loss": 0.0902, "num_input_tokens_seen": 48083360, "step": 37090 }, { "epoch": 1.8124984731146019, "grad_norm": 0.49606436491012573, "learning_rate": 6.483281882956854e-07, "loss": 0.0691, "num_input_tokens_seen": 48089440, "step": 37095 }, { "epoch": 1.8127427747782963, "grad_norm": 0.15964274108409882, "learning_rate": 6.461087606189298e-07, "loss": 0.0867, "num_input_tokens_seen": 48096000, "step": 37100 }, { "epoch": 1.8129870764419906, "grad_norm": 0.3537972867488861, "learning_rate": 6.438930886182554e-07, "loss": 0.0921, "num_input_tokens_seen": 48102752, "step": 37105 }, { "epoch": 1.813231378105685, "grad_norm": 0.12256834656000137, "learning_rate": 6.416811726353417e-07, "loss": 0.0533, "num_input_tokens_seen": 48109152, "step": 37110 }, { "epoch": 1.8134756797693792, "grad_norm": 0.29638993740081787, "learning_rate": 6.394730130112991e-07, "loss": 0.1077, "num_input_tokens_seen": 48115360, "step": 37115 }, { "epoch": 1.8137199814330736, "grad_norm": 0.1554211527109146, "learning_rate": 6.372686100866471e-07, "loss": 0.0535, "num_input_tokens_seen": 48121856, "step": 37120 }, { "epoch": 1.813964283096768, "grad_norm": 0.16603045165538788, "learning_rate": 6.350679642013413e-07, "loss": 0.0875, "num_input_tokens_seen": 48128064, "step": 37125 }, { "epoch": 1.814208584760462, "grad_norm": 0.25074997544288635, "learning_rate": 6.328710756947437e-07, "loss": 0.0766, "num_input_tokens_seen": 48134752, "step": 37130 }, { "epoch": 1.8144528864241565, "grad_norm": 0.15056830644607544, "learning_rate": 6.306779449056416e-07, "loss": 0.0565, "num_input_tokens_seen": 48141472, "step": 37135 }, { "epoch": 1.8146971880878509, "grad_norm": 0.7040787935256958, "learning_rate": 6.284885721722422e-07, "loss": 0.0919, "num_input_tokens_seen": 48147808, "step": 37140 }, { "epoch": 1.8149414897515452, "grad_norm": 0.296099454164505, "learning_rate": 6.26302957832181e-07, "loss": 0.0888, "num_input_tokens_seen": 48154432, "step": 37145 }, { "epoch": 1.8151857914152396, "grad_norm": 0.4877909719944, "learning_rate": 6.241211022224997e-07, "loss": 0.1131, "num_input_tokens_seen": 48160736, "step": 37150 }, { "epoch": 1.815430093078934, "grad_norm": 0.49601560831069946, "learning_rate": 6.219430056796732e-07, "loss": 0.0935, "num_input_tokens_seen": 48167392, "step": 37155 }, { "epoch": 1.8156743947426282, "grad_norm": 0.353099524974823, "learning_rate": 6.19768668539586e-07, "loss": 0.0926, "num_input_tokens_seen": 48174048, "step": 37160 }, { "epoch": 1.8159186964063225, "grad_norm": 0.8266002535820007, "learning_rate": 6.175980911375528e-07, "loss": 0.0901, "num_input_tokens_seen": 48180928, "step": 37165 }, { "epoch": 1.816162998070017, "grad_norm": 0.41435059905052185, "learning_rate": 6.154312738083034e-07, "loss": 0.0704, "num_input_tokens_seen": 48187264, "step": 37170 }, { "epoch": 1.816407299733711, "grad_norm": 0.3881143629550934, "learning_rate": 6.132682168859843e-07, "loss": 0.0959, "num_input_tokens_seen": 48193408, "step": 37175 }, { "epoch": 1.8166516013974054, "grad_norm": 0.5343785881996155, "learning_rate": 6.111089207041704e-07, "loss": 0.1025, "num_input_tokens_seen": 48200128, "step": 37180 }, { "epoch": 1.8168959030610998, "grad_norm": 0.17678304016590118, "learning_rate": 6.089533855958507e-07, "loss": 0.084, "num_input_tokens_seen": 48206816, "step": 37185 }, { "epoch": 1.8171402047247942, "grad_norm": 0.16651096940040588, "learning_rate": 6.068016118934372e-07, "loss": 0.1094, "num_input_tokens_seen": 48213312, "step": 37190 }, { "epoch": 1.8173845063884886, "grad_norm": 0.44351303577423096, "learning_rate": 6.04653599928759e-07, "loss": 0.09, "num_input_tokens_seen": 48220800, "step": 37195 }, { "epoch": 1.817628808052183, "grad_norm": 0.4416247606277466, "learning_rate": 6.025093500330675e-07, "loss": 0.0874, "num_input_tokens_seen": 48227328, "step": 37200 }, { "epoch": 1.817628808052183, "eval_loss": 0.08751247078180313, "eval_runtime": 374.5538, "eval_samples_per_second": 97.142, "eval_steps_per_second": 24.288, "num_input_tokens_seen": 48227328, "step": 37200 }, { "epoch": 1.8178731097158771, "grad_norm": 0.43935149908065796, "learning_rate": 6.003688625370291e-07, "loss": 0.0763, "num_input_tokens_seen": 48233984, "step": 37205 }, { "epoch": 1.8181174113795715, "grad_norm": 0.6457012891769409, "learning_rate": 5.982321377707406e-07, "loss": 0.098, "num_input_tokens_seen": 48240416, "step": 37210 }, { "epoch": 1.8183617130432657, "grad_norm": 0.36632782220840454, "learning_rate": 5.96099176063708e-07, "loss": 0.0885, "num_input_tokens_seen": 48246624, "step": 37215 }, { "epoch": 1.81860601470696, "grad_norm": 0.2961615025997162, "learning_rate": 5.93969977744857e-07, "loss": 0.0592, "num_input_tokens_seen": 48253824, "step": 37220 }, { "epoch": 1.8188503163706544, "grad_norm": 0.21756142377853394, "learning_rate": 5.918445431425445e-07, "loss": 0.0876, "num_input_tokens_seen": 48260256, "step": 37225 }, { "epoch": 1.8190946180343488, "grad_norm": 0.5805926322937012, "learning_rate": 5.897228725845333e-07, "loss": 0.1176, "num_input_tokens_seen": 48266912, "step": 37230 }, { "epoch": 1.8193389196980432, "grad_norm": 0.4533355236053467, "learning_rate": 5.876049663980171e-07, "loss": 0.068, "num_input_tokens_seen": 48273024, "step": 37235 }, { "epoch": 1.8195832213617376, "grad_norm": 0.21581827104091644, "learning_rate": 5.854908249095959e-07, "loss": 0.077, "num_input_tokens_seen": 48279584, "step": 37240 }, { "epoch": 1.819827523025432, "grad_norm": 0.5476219654083252, "learning_rate": 5.833804484453031e-07, "loss": 0.0814, "num_input_tokens_seen": 48285632, "step": 37245 }, { "epoch": 1.820071824689126, "grad_norm": 0.20805808901786804, "learning_rate": 5.81273837330587e-07, "loss": 0.0823, "num_input_tokens_seen": 48292096, "step": 37250 }, { "epoch": 1.8203161263528205, "grad_norm": 0.2293211817741394, "learning_rate": 5.791709918903071e-07, "loss": 0.073, "num_input_tokens_seen": 48298528, "step": 37255 }, { "epoch": 1.8205604280165146, "grad_norm": 0.591823935508728, "learning_rate": 5.770719124487483e-07, "loss": 0.0835, "num_input_tokens_seen": 48304928, "step": 37260 }, { "epoch": 1.820804729680209, "grad_norm": 0.19771425426006317, "learning_rate": 5.749765993296241e-07, "loss": 0.0893, "num_input_tokens_seen": 48311424, "step": 37265 }, { "epoch": 1.8210490313439034, "grad_norm": 0.4564490020275116, "learning_rate": 5.728850528560509e-07, "loss": 0.0866, "num_input_tokens_seen": 48317792, "step": 37270 }, { "epoch": 1.8212933330075978, "grad_norm": 0.5493279695510864, "learning_rate": 5.707972733505707e-07, "loss": 0.1045, "num_input_tokens_seen": 48324160, "step": 37275 }, { "epoch": 1.8215376346712921, "grad_norm": 0.40830197930336, "learning_rate": 5.687132611351509e-07, "loss": 0.0745, "num_input_tokens_seen": 48330624, "step": 37280 }, { "epoch": 1.8217819363349865, "grad_norm": 0.427738219499588, "learning_rate": 5.666330165311651e-07, "loss": 0.0895, "num_input_tokens_seen": 48337376, "step": 37285 }, { "epoch": 1.822026237998681, "grad_norm": 0.3254718482494354, "learning_rate": 5.645565398594204e-07, "loss": 0.0844, "num_input_tokens_seen": 48343424, "step": 37290 }, { "epoch": 1.822270539662375, "grad_norm": 0.2727367579936981, "learning_rate": 5.624838314401304e-07, "loss": 0.1006, "num_input_tokens_seen": 48350112, "step": 37295 }, { "epoch": 1.8225148413260694, "grad_norm": 0.3441227972507477, "learning_rate": 5.604148915929336e-07, "loss": 0.0873, "num_input_tokens_seen": 48356416, "step": 37300 }, { "epoch": 1.8227591429897636, "grad_norm": 0.30438753962516785, "learning_rate": 5.583497206368887e-07, "loss": 0.1135, "num_input_tokens_seen": 48363136, "step": 37305 }, { "epoch": 1.823003444653458, "grad_norm": 0.6640852093696594, "learning_rate": 5.562883188904688e-07, "loss": 0.0863, "num_input_tokens_seen": 48369696, "step": 37310 }, { "epoch": 1.8232477463171524, "grad_norm": 0.17082342505455017, "learning_rate": 5.542306866715724e-07, "loss": 0.0842, "num_input_tokens_seen": 48376320, "step": 37315 }, { "epoch": 1.8234920479808467, "grad_norm": 0.1953909993171692, "learning_rate": 5.52176824297504e-07, "loss": 0.0749, "num_input_tokens_seen": 48382496, "step": 37320 }, { "epoch": 1.8237363496445411, "grad_norm": 0.7236582040786743, "learning_rate": 5.501267320850018e-07, "loss": 0.0853, "num_input_tokens_seen": 48389504, "step": 37325 }, { "epoch": 1.8239806513082355, "grad_norm": 0.4510473310947418, "learning_rate": 5.480804103502157e-07, "loss": 0.0698, "num_input_tokens_seen": 48396000, "step": 37330 }, { "epoch": 1.8242249529719299, "grad_norm": 0.5710299015045166, "learning_rate": 5.460378594087101e-07, "loss": 0.0629, "num_input_tokens_seen": 48402496, "step": 37335 }, { "epoch": 1.824469254635624, "grad_norm": 0.39243170619010925, "learning_rate": 5.439990795754773e-07, "loss": 0.0895, "num_input_tokens_seen": 48409344, "step": 37340 }, { "epoch": 1.8247135562993184, "grad_norm": 0.6278012990951538, "learning_rate": 5.419640711649188e-07, "loss": 0.0977, "num_input_tokens_seen": 48416384, "step": 37345 }, { "epoch": 1.8249578579630126, "grad_norm": 0.2872629463672638, "learning_rate": 5.399328344908583e-07, "loss": 0.1438, "num_input_tokens_seen": 48423008, "step": 37350 }, { "epoch": 1.825202159626707, "grad_norm": 0.2960704565048218, "learning_rate": 5.379053698665399e-07, "loss": 0.0885, "num_input_tokens_seen": 48429056, "step": 37355 }, { "epoch": 1.8254464612904013, "grad_norm": 0.12539833784103394, "learning_rate": 5.358816776046216e-07, "loss": 0.0672, "num_input_tokens_seen": 48435392, "step": 37360 }, { "epoch": 1.8256907629540957, "grad_norm": 0.28488990664482117, "learning_rate": 5.338617580171817e-07, "loss": 0.0828, "num_input_tokens_seen": 48441536, "step": 37365 }, { "epoch": 1.82593506461779, "grad_norm": 0.23747941851615906, "learning_rate": 5.318456114157239e-07, "loss": 0.1067, "num_input_tokens_seen": 48448064, "step": 37370 }, { "epoch": 1.8261793662814845, "grad_norm": 0.3131645917892456, "learning_rate": 5.298332381111576e-07, "loss": 0.0815, "num_input_tokens_seen": 48453920, "step": 37375 }, { "epoch": 1.8264236679451789, "grad_norm": 0.5055148005485535, "learning_rate": 5.27824638413818e-07, "loss": 0.0795, "num_input_tokens_seen": 48460000, "step": 37380 }, { "epoch": 1.826667969608873, "grad_norm": 0.275147020816803, "learning_rate": 5.258198126334546e-07, "loss": 0.11, "num_input_tokens_seen": 48466784, "step": 37385 }, { "epoch": 1.8269122712725674, "grad_norm": 0.46401670575141907, "learning_rate": 5.238187610792367e-07, "loss": 0.1028, "num_input_tokens_seen": 48472800, "step": 37390 }, { "epoch": 1.8271565729362615, "grad_norm": 0.3171978294849396, "learning_rate": 5.218214840597563e-07, "loss": 0.0777, "num_input_tokens_seen": 48479040, "step": 37395 }, { "epoch": 1.827400874599956, "grad_norm": 0.4671914875507355, "learning_rate": 5.198279818830115e-07, "loss": 0.0928, "num_input_tokens_seen": 48485632, "step": 37400 }, { "epoch": 1.827400874599956, "eval_loss": 0.08753321319818497, "eval_runtime": 374.6863, "eval_samples_per_second": 97.108, "eval_steps_per_second": 24.279, "num_input_tokens_seen": 48485632, "step": 37400 }, { "epoch": 1.8276451762636503, "grad_norm": 0.21421319246292114, "learning_rate": 5.178382548564287e-07, "loss": 0.0829, "num_input_tokens_seen": 48492032, "step": 37405 }, { "epoch": 1.8278894779273447, "grad_norm": 0.19391843676567078, "learning_rate": 5.15852303286854e-07, "loss": 0.1031, "num_input_tokens_seen": 48498752, "step": 37410 }, { "epoch": 1.828133779591039, "grad_norm": 0.44544339179992676, "learning_rate": 5.138701274805396e-07, "loss": 0.103, "num_input_tokens_seen": 48505312, "step": 37415 }, { "epoch": 1.8283780812547334, "grad_norm": 0.27344223856925964, "learning_rate": 5.118917277431606e-07, "loss": 0.089, "num_input_tokens_seen": 48511776, "step": 37420 }, { "epoch": 1.8286223829184278, "grad_norm": 0.4321455955505371, "learning_rate": 5.099171043798145e-07, "loss": 0.0765, "num_input_tokens_seen": 48518272, "step": 37425 }, { "epoch": 1.828866684582122, "grad_norm": 0.5921148061752319, "learning_rate": 5.079462576950133e-07, "loss": 0.077, "num_input_tokens_seen": 48524448, "step": 37430 }, { "epoch": 1.8291109862458164, "grad_norm": 0.3634519577026367, "learning_rate": 5.059791879926862e-07, "loss": 0.0727, "num_input_tokens_seen": 48530560, "step": 37435 }, { "epoch": 1.8293552879095105, "grad_norm": 0.40953031182289124, "learning_rate": 5.040158955761793e-07, "loss": 0.0787, "num_input_tokens_seen": 48537376, "step": 37440 }, { "epoch": 1.829599589573205, "grad_norm": 0.5493779182434082, "learning_rate": 5.020563807482559e-07, "loss": 0.0893, "num_input_tokens_seen": 48543904, "step": 37445 }, { "epoch": 1.8298438912368993, "grad_norm": 0.2369018942117691, "learning_rate": 5.001006438110995e-07, "loss": 0.0972, "num_input_tokens_seen": 48550848, "step": 37450 }, { "epoch": 1.8300881929005937, "grad_norm": 0.4016752243041992, "learning_rate": 4.981486850663075e-07, "loss": 0.0893, "num_input_tokens_seen": 48557024, "step": 37455 }, { "epoch": 1.830332494564288, "grad_norm": 0.2495286464691162, "learning_rate": 4.962005048149005e-07, "loss": 0.1143, "num_input_tokens_seen": 48563136, "step": 37460 }, { "epoch": 1.8305767962279824, "grad_norm": 0.1523575484752655, "learning_rate": 4.942561033573073e-07, "loss": 0.0992, "num_input_tokens_seen": 48569824, "step": 37465 }, { "epoch": 1.8308210978916768, "grad_norm": 0.5084608793258667, "learning_rate": 4.923154809933827e-07, "loss": 0.0926, "num_input_tokens_seen": 48575968, "step": 37470 }, { "epoch": 1.831065399555371, "grad_norm": 0.6969185471534729, "learning_rate": 4.903786380223957e-07, "loss": 0.0947, "num_input_tokens_seen": 48582560, "step": 37475 }, { "epoch": 1.8313097012190653, "grad_norm": 0.42779701948165894, "learning_rate": 4.884455747430266e-07, "loss": 0.0695, "num_input_tokens_seen": 48589376, "step": 37480 }, { "epoch": 1.8315540028827595, "grad_norm": 0.7628805637359619, "learning_rate": 4.865162914533816e-07, "loss": 0.0839, "num_input_tokens_seen": 48595712, "step": 37485 }, { "epoch": 1.8317983045464539, "grad_norm": 0.25408485531806946, "learning_rate": 4.845907884509809e-07, "loss": 0.1035, "num_input_tokens_seen": 48602176, "step": 37490 }, { "epoch": 1.8320426062101483, "grad_norm": 0.22462248802185059, "learning_rate": 4.82669066032762e-07, "loss": 0.0784, "num_input_tokens_seen": 48608640, "step": 37495 }, { "epoch": 1.8322869078738426, "grad_norm": 0.6642612814903259, "learning_rate": 4.807511244950768e-07, "loss": 0.069, "num_input_tokens_seen": 48614944, "step": 37500 }, { "epoch": 1.832531209537537, "grad_norm": 0.26472437381744385, "learning_rate": 4.788369641336943e-07, "loss": 0.0585, "num_input_tokens_seen": 48621664, "step": 37505 }, { "epoch": 1.8327755112012314, "grad_norm": 0.4983888566493988, "learning_rate": 4.769265852438032e-07, "loss": 0.0884, "num_input_tokens_seen": 48628192, "step": 37510 }, { "epoch": 1.8330198128649258, "grad_norm": 0.43752816319465637, "learning_rate": 4.750199881200124e-07, "loss": 0.0833, "num_input_tokens_seen": 48634368, "step": 37515 }, { "epoch": 1.83326411452862, "grad_norm": 0.11587031930685043, "learning_rate": 4.7311717305633664e-07, "loss": 0.0615, "num_input_tokens_seen": 48641248, "step": 37520 }, { "epoch": 1.8335084161923143, "grad_norm": 0.31779471039772034, "learning_rate": 4.7121814034621623e-07, "loss": 0.0822, "num_input_tokens_seen": 48647680, "step": 37525 }, { "epoch": 1.8337527178560085, "grad_norm": 0.2168213278055191, "learning_rate": 4.693228902825114e-07, "loss": 0.11, "num_input_tokens_seen": 48653920, "step": 37530 }, { "epoch": 1.8339970195197028, "grad_norm": 0.6639757752418518, "learning_rate": 4.6743142315748277e-07, "loss": 0.0814, "num_input_tokens_seen": 48660256, "step": 37535 }, { "epoch": 1.8342413211833972, "grad_norm": 0.14919699728488922, "learning_rate": 4.655437392628276e-07, "loss": 0.1017, "num_input_tokens_seen": 48666976, "step": 37540 }, { "epoch": 1.8344856228470916, "grad_norm": 0.43341943621635437, "learning_rate": 4.636598388896463e-07, "loss": 0.0804, "num_input_tokens_seen": 48673280, "step": 37545 }, { "epoch": 1.834729924510786, "grad_norm": 0.16985629498958588, "learning_rate": 4.6177972232845925e-07, "loss": 0.0786, "num_input_tokens_seen": 48679904, "step": 37550 }, { "epoch": 1.8349742261744804, "grad_norm": 0.3861426115036011, "learning_rate": 4.5990338986920953e-07, "loss": 0.0928, "num_input_tokens_seen": 48686272, "step": 37555 }, { "epoch": 1.8352185278381747, "grad_norm": 0.26563262939453125, "learning_rate": 4.5803084180124633e-07, "loss": 0.0942, "num_input_tokens_seen": 48692384, "step": 37560 }, { "epoch": 1.835462829501869, "grad_norm": 0.31331098079681396, "learning_rate": 4.561620784133386e-07, "loss": 0.1363, "num_input_tokens_seen": 48699104, "step": 37565 }, { "epoch": 1.8357071311655633, "grad_norm": 0.3224145174026489, "learning_rate": 4.5429709999367796e-07, "loss": 0.0604, "num_input_tokens_seen": 48706016, "step": 37570 }, { "epoch": 1.8359514328292574, "grad_norm": 0.19146737456321716, "learning_rate": 4.5243590682986223e-07, "loss": 0.0565, "num_input_tokens_seen": 48712992, "step": 37575 }, { "epoch": 1.8361957344929518, "grad_norm": 0.21199564635753632, "learning_rate": 4.5057849920891735e-07, "loss": 0.0762, "num_input_tokens_seen": 48719072, "step": 37580 }, { "epoch": 1.8364400361566462, "grad_norm": 0.17121484875679016, "learning_rate": 4.487248774172698e-07, "loss": 0.0817, "num_input_tokens_seen": 48725632, "step": 37585 }, { "epoch": 1.8366843378203406, "grad_norm": 0.23074151575565338, "learning_rate": 4.4687504174077965e-07, "loss": 0.0855, "num_input_tokens_seen": 48732000, "step": 37590 }, { "epoch": 1.836928639484035, "grad_norm": 0.18710127472877502, "learning_rate": 4.450289924647133e-07, "loss": 0.1109, "num_input_tokens_seen": 48738272, "step": 37595 }, { "epoch": 1.8371729411477293, "grad_norm": 0.2498602420091629, "learning_rate": 4.431867298737513e-07, "loss": 0.0784, "num_input_tokens_seen": 48744768, "step": 37600 }, { "epoch": 1.8371729411477293, "eval_loss": 0.08739679306745529, "eval_runtime": 374.745, "eval_samples_per_second": 97.093, "eval_steps_per_second": 24.275, "num_input_tokens_seen": 48744768, "step": 37600 }, { "epoch": 1.8374172428114237, "grad_norm": 0.40825551748275757, "learning_rate": 4.41348254251997e-07, "loss": 0.0832, "num_input_tokens_seen": 48750912, "step": 37605 }, { "epoch": 1.8376615444751179, "grad_norm": 0.19753625988960266, "learning_rate": 4.395135658829652e-07, "loss": 0.0675, "num_input_tokens_seen": 48757472, "step": 37610 }, { "epoch": 1.8379058461388122, "grad_norm": 0.48757204413414, "learning_rate": 4.376826650495852e-07, "loss": 0.0933, "num_input_tokens_seen": 48764000, "step": 37615 }, { "epoch": 1.8381501478025064, "grad_norm": 0.3402653634548187, "learning_rate": 4.358555520342117e-07, "loss": 0.0994, "num_input_tokens_seen": 48770784, "step": 37620 }, { "epoch": 1.8383944494662008, "grad_norm": 0.28085777163505554, "learning_rate": 4.3403222711860257e-07, "loss": 0.0968, "num_input_tokens_seen": 48776960, "step": 37625 }, { "epoch": 1.8386387511298952, "grad_norm": 0.16665975749492645, "learning_rate": 4.3221269058394133e-07, "loss": 0.0599, "num_input_tokens_seen": 48783776, "step": 37630 }, { "epoch": 1.8388830527935895, "grad_norm": 0.1829465627670288, "learning_rate": 4.303969427108173e-07, "loss": 0.0778, "num_input_tokens_seen": 48790112, "step": 37635 }, { "epoch": 1.839127354457284, "grad_norm": 0.15970510244369507, "learning_rate": 4.2858498377924825e-07, "loss": 0.0795, "num_input_tokens_seen": 48796512, "step": 37640 }, { "epoch": 1.8393716561209783, "grad_norm": 0.8614884614944458, "learning_rate": 4.267768140686579e-07, "loss": 0.1073, "num_input_tokens_seen": 48802688, "step": 37645 }, { "epoch": 1.8396159577846725, "grad_norm": 0.43886899948120117, "learning_rate": 4.2497243385788975e-07, "loss": 0.1243, "num_input_tokens_seen": 48809600, "step": 37650 }, { "epoch": 1.8398602594483668, "grad_norm": 0.6888815760612488, "learning_rate": 4.231718434251991e-07, "loss": 0.0874, "num_input_tokens_seen": 48815904, "step": 37655 }, { "epoch": 1.8401045611120612, "grad_norm": 0.4142707884311676, "learning_rate": 4.213750430482666e-07, "loss": 0.1045, "num_input_tokens_seen": 48822272, "step": 37660 }, { "epoch": 1.8403488627757554, "grad_norm": 0.20336633920669556, "learning_rate": 4.1958203300417054e-07, "loss": 0.0906, "num_input_tokens_seen": 48828832, "step": 37665 }, { "epoch": 1.8405931644394498, "grad_norm": 0.22266513109207153, "learning_rate": 4.177928135694259e-07, "loss": 0.1043, "num_input_tokens_seen": 48835200, "step": 37670 }, { "epoch": 1.8408374661031441, "grad_norm": 0.5381715893745422, "learning_rate": 4.1600738501994807e-07, "loss": 0.0814, "num_input_tokens_seen": 48841376, "step": 37675 }, { "epoch": 1.8410817677668385, "grad_norm": 0.2691958546638489, "learning_rate": 4.1422574763107237e-07, "loss": 0.0819, "num_input_tokens_seen": 48847584, "step": 37680 }, { "epoch": 1.841326069430533, "grad_norm": 0.26164913177490234, "learning_rate": 4.124479016775512e-07, "loss": 0.072, "num_input_tokens_seen": 48854048, "step": 37685 }, { "epoch": 1.8415703710942273, "grad_norm": 0.494181752204895, "learning_rate": 4.106738474335514e-07, "loss": 0.1003, "num_input_tokens_seen": 48860672, "step": 37690 }, { "epoch": 1.8418146727579214, "grad_norm": 0.1530587524175644, "learning_rate": 4.089035851726486e-07, "loss": 0.0746, "num_input_tokens_seen": 48867232, "step": 37695 }, { "epoch": 1.8420589744216158, "grad_norm": 0.49303528666496277, "learning_rate": 4.0713711516784937e-07, "loss": 0.0974, "num_input_tokens_seen": 48873536, "step": 37700 }, { "epoch": 1.8423032760853102, "grad_norm": 0.43121814727783203, "learning_rate": 4.05374437691558e-07, "loss": 0.0872, "num_input_tokens_seen": 48880320, "step": 37705 }, { "epoch": 1.8425475777490044, "grad_norm": 0.4035179018974304, "learning_rate": 4.036155530156044e-07, "loss": 0.0563, "num_input_tokens_seen": 48886816, "step": 37710 }, { "epoch": 1.8427918794126987, "grad_norm": 0.17017510533332825, "learning_rate": 4.018604614112298e-07, "loss": 0.0821, "num_input_tokens_seen": 48892960, "step": 37715 }, { "epoch": 1.843036181076393, "grad_norm": 0.8927245140075684, "learning_rate": 4.0010916314908996e-07, "loss": 0.1079, "num_input_tokens_seen": 48899712, "step": 37720 }, { "epoch": 1.8432804827400875, "grad_norm": 0.501578152179718, "learning_rate": 3.983616584992578e-07, "loss": 0.0739, "num_input_tokens_seen": 48906048, "step": 37725 }, { "epoch": 1.8435247844037819, "grad_norm": 0.4696231484413147, "learning_rate": 3.9661794773122595e-07, "loss": 0.0985, "num_input_tokens_seen": 48912160, "step": 37730 }, { "epoch": 1.8437690860674762, "grad_norm": 0.2759729027748108, "learning_rate": 3.9487803111388777e-07, "loss": 0.0945, "num_input_tokens_seen": 48918656, "step": 37735 }, { "epoch": 1.8440133877311704, "grad_norm": 0.20583632588386536, "learning_rate": 3.9314190891556747e-07, "loss": 0.0729, "num_input_tokens_seen": 48925120, "step": 37740 }, { "epoch": 1.8442576893948648, "grad_norm": 0.32870373129844666, "learning_rate": 3.914095814039925e-07, "loss": 0.0921, "num_input_tokens_seen": 48931392, "step": 37745 }, { "epoch": 1.844501991058559, "grad_norm": 0.42509037256240845, "learning_rate": 3.896810488463104e-07, "loss": 0.1148, "num_input_tokens_seen": 48937504, "step": 37750 }, { "epoch": 1.8447462927222533, "grad_norm": 0.18604539334774017, "learning_rate": 3.8795631150908565e-07, "loss": 0.0957, "num_input_tokens_seen": 48944160, "step": 37755 }, { "epoch": 1.8449905943859477, "grad_norm": 0.5872364640235901, "learning_rate": 3.862353696582888e-07, "loss": 0.074, "num_input_tokens_seen": 48950464, "step": 37760 }, { "epoch": 1.845234896049642, "grad_norm": 0.2630770206451416, "learning_rate": 3.8451822355931313e-07, "loss": 0.0683, "num_input_tokens_seen": 48956832, "step": 37765 }, { "epoch": 1.8454791977133365, "grad_norm": 0.29662880301475525, "learning_rate": 3.82804873476969e-07, "loss": 0.0791, "num_input_tokens_seen": 48963392, "step": 37770 }, { "epoch": 1.8457234993770308, "grad_norm": 0.4244776666164398, "learning_rate": 3.810953196754702e-07, "loss": 0.0919, "num_input_tokens_seen": 48969856, "step": 37775 }, { "epoch": 1.8459678010407252, "grad_norm": 0.3685661256313324, "learning_rate": 3.793895624184529e-07, "loss": 0.1184, "num_input_tokens_seen": 48975936, "step": 37780 }, { "epoch": 1.8462121027044194, "grad_norm": 0.18548673391342163, "learning_rate": 3.776876019689679e-07, "loss": 0.0947, "num_input_tokens_seen": 48982496, "step": 37785 }, { "epoch": 1.8464564043681138, "grad_norm": 0.2605893015861511, "learning_rate": 3.7598943858947743e-07, "loss": 0.0767, "num_input_tokens_seen": 48988960, "step": 37790 }, { "epoch": 1.846700706031808, "grad_norm": 0.3064812421798706, "learning_rate": 3.742950725418637e-07, "loss": 0.0718, "num_input_tokens_seen": 48995552, "step": 37795 }, { "epoch": 1.8469450076955023, "grad_norm": 0.2857804298400879, "learning_rate": 3.726045040874093e-07, "loss": 0.1127, "num_input_tokens_seen": 49002400, "step": 37800 }, { "epoch": 1.8469450076955023, "eval_loss": 0.087283194065094, "eval_runtime": 374.5275, "eval_samples_per_second": 97.149, "eval_steps_per_second": 24.289, "num_input_tokens_seen": 49002400, "step": 37800 }, { "epoch": 1.8471893093591967, "grad_norm": 0.24972103536128998, "learning_rate": 3.709177334868308e-07, "loss": 0.0978, "num_input_tokens_seen": 49009024, "step": 37805 }, { "epoch": 1.847433611022891, "grad_norm": 0.20392709970474243, "learning_rate": 3.692347610002478e-07, "loss": 0.0848, "num_input_tokens_seen": 49015648, "step": 37810 }, { "epoch": 1.8476779126865854, "grad_norm": 0.35871249437332153, "learning_rate": 3.675555868871916e-07, "loss": 0.0832, "num_input_tokens_seen": 49022496, "step": 37815 }, { "epoch": 1.8479222143502798, "grad_norm": 0.15289589762687683, "learning_rate": 3.658802114066162e-07, "loss": 0.0843, "num_input_tokens_seen": 49028704, "step": 37820 }, { "epoch": 1.8481665160139742, "grad_norm": 0.627048134803772, "learning_rate": 3.6420863481688437e-07, "loss": 0.1127, "num_input_tokens_seen": 49034912, "step": 37825 }, { "epoch": 1.8484108176776683, "grad_norm": 0.3934374451637268, "learning_rate": 3.625408573757705e-07, "loss": 0.0895, "num_input_tokens_seen": 49041024, "step": 37830 }, { "epoch": 1.8486551193413627, "grad_norm": 0.5265322327613831, "learning_rate": 3.608768793404743e-07, "loss": 0.077, "num_input_tokens_seen": 49047168, "step": 37835 }, { "epoch": 1.8488994210050569, "grad_norm": 0.1695350855588913, "learning_rate": 3.592167009675934e-07, "loss": 0.0681, "num_input_tokens_seen": 49053568, "step": 37840 }, { "epoch": 1.8491437226687513, "grad_norm": 0.3793428838253021, "learning_rate": 3.575603225131563e-07, "loss": 0.0763, "num_input_tokens_seen": 49059968, "step": 37845 }, { "epoch": 1.8493880243324456, "grad_norm": 0.21613198518753052, "learning_rate": 3.55907744232592e-07, "loss": 0.0832, "num_input_tokens_seen": 49066208, "step": 37850 }, { "epoch": 1.84963232599614, "grad_norm": 0.2642154395580292, "learning_rate": 3.5425896638075217e-07, "loss": 0.0804, "num_input_tokens_seen": 49072512, "step": 37855 }, { "epoch": 1.8498766276598344, "grad_norm": 0.4856320321559906, "learning_rate": 3.5261398921189736e-07, "loss": 0.0717, "num_input_tokens_seen": 49078848, "step": 37860 }, { "epoch": 1.8501209293235288, "grad_norm": 0.18002073466777802, "learning_rate": 3.509728129797024e-07, "loss": 0.1067, "num_input_tokens_seen": 49085408, "step": 37865 }, { "epoch": 1.8503652309872232, "grad_norm": 0.1316443681716919, "learning_rate": 3.4933543793725656e-07, "loss": 0.0776, "num_input_tokens_seen": 49091808, "step": 37870 }, { "epoch": 1.8506095326509173, "grad_norm": 0.6544226408004761, "learning_rate": 3.4770186433707163e-07, "loss": 0.1101, "num_input_tokens_seen": 49098176, "step": 37875 }, { "epoch": 1.8508538343146117, "grad_norm": 0.43000656366348267, "learning_rate": 3.4607209243105453e-07, "loss": 0.094, "num_input_tokens_seen": 49104704, "step": 37880 }, { "epoch": 1.8510981359783059, "grad_norm": 0.26769524812698364, "learning_rate": 3.444461224705431e-07, "loss": 0.067, "num_input_tokens_seen": 49110752, "step": 37885 }, { "epoch": 1.8513424376420002, "grad_norm": 0.19205109775066376, "learning_rate": 3.4282395470628116e-07, "loss": 0.0704, "num_input_tokens_seen": 49117792, "step": 37890 }, { "epoch": 1.8515867393056946, "grad_norm": 0.25545743107795715, "learning_rate": 3.4120558938842417e-07, "loss": 0.0848, "num_input_tokens_seen": 49124256, "step": 37895 }, { "epoch": 1.851831040969389, "grad_norm": 0.16703736782073975, "learning_rate": 3.395910267665503e-07, "loss": 0.085, "num_input_tokens_seen": 49130496, "step": 37900 }, { "epoch": 1.8520753426330834, "grad_norm": 0.15623319149017334, "learning_rate": 3.3798026708964094e-07, "loss": 0.0631, "num_input_tokens_seen": 49136768, "step": 37905 }, { "epoch": 1.8523196442967778, "grad_norm": 0.3553796112537384, "learning_rate": 3.3637331060609456e-07, "loss": 0.0857, "num_input_tokens_seen": 49143136, "step": 37910 }, { "epoch": 1.8525639459604721, "grad_norm": 0.69588702917099, "learning_rate": 3.3477015756372966e-07, "loss": 0.0821, "num_input_tokens_seen": 49149664, "step": 37915 }, { "epoch": 1.8528082476241663, "grad_norm": 0.15842224657535553, "learning_rate": 3.3317080820976785e-07, "loss": 0.0857, "num_input_tokens_seen": 49156128, "step": 37920 }, { "epoch": 1.8530525492878607, "grad_norm": 0.26547083258628845, "learning_rate": 3.315752627908508e-07, "loss": 0.0799, "num_input_tokens_seen": 49162784, "step": 37925 }, { "epoch": 1.8532968509515548, "grad_norm": 0.173845112323761, "learning_rate": 3.299835215530317e-07, "loss": 0.0983, "num_input_tokens_seen": 49169408, "step": 37930 }, { "epoch": 1.8535411526152492, "grad_norm": 0.9662163257598877, "learning_rate": 3.2839558474177245e-07, "loss": 0.0901, "num_input_tokens_seen": 49175872, "step": 37935 }, { "epoch": 1.8537854542789436, "grad_norm": 0.3690991997718811, "learning_rate": 3.2681145260196056e-07, "loss": 0.0694, "num_input_tokens_seen": 49182432, "step": 37940 }, { "epoch": 1.854029755942638, "grad_norm": 0.5098894238471985, "learning_rate": 3.252311253778839e-07, "loss": 0.1138, "num_input_tokens_seen": 49188448, "step": 37945 }, { "epoch": 1.8542740576063323, "grad_norm": 0.14766797423362732, "learning_rate": 3.2365460331325034e-07, "loss": 0.0712, "num_input_tokens_seen": 49194944, "step": 37950 }, { "epoch": 1.8545183592700267, "grad_norm": 0.16730180382728577, "learning_rate": 3.2208188665117934e-07, "loss": 0.1098, "num_input_tokens_seen": 49201280, "step": 37955 }, { "epoch": 1.854762660933721, "grad_norm": 0.17796507477760315, "learning_rate": 3.205129756342018e-07, "loss": 0.1004, "num_input_tokens_seen": 49207712, "step": 37960 }, { "epoch": 1.8550069625974153, "grad_norm": 0.4402139186859131, "learning_rate": 3.189478705042659e-07, "loss": 0.0906, "num_input_tokens_seen": 49214048, "step": 37965 }, { "epoch": 1.8552512642611096, "grad_norm": 0.37125977873802185, "learning_rate": 3.173865715027341e-07, "loss": 0.0925, "num_input_tokens_seen": 49220928, "step": 37970 }, { "epoch": 1.8554955659248038, "grad_norm": 0.204680398106575, "learning_rate": 3.158290788703694e-07, "loss": 0.0601, "num_input_tokens_seen": 49227200, "step": 37975 }, { "epoch": 1.8557398675884982, "grad_norm": 0.8570793867111206, "learning_rate": 3.1427539284736297e-07, "loss": 0.088, "num_input_tokens_seen": 49233856, "step": 37980 }, { "epoch": 1.8559841692521926, "grad_norm": 0.39762189984321594, "learning_rate": 3.127255136733093e-07, "loss": 0.1202, "num_input_tokens_seen": 49240224, "step": 37985 }, { "epoch": 1.856228470915887, "grad_norm": 0.45177146792411804, "learning_rate": 3.1117944158722544e-07, "loss": 0.0716, "num_input_tokens_seen": 49246592, "step": 37990 }, { "epoch": 1.8564727725795813, "grad_norm": 0.39885413646698, "learning_rate": 3.0963717682752635e-07, "loss": 0.1168, "num_input_tokens_seen": 49253088, "step": 37995 }, { "epoch": 1.8567170742432757, "grad_norm": 0.4597418010234833, "learning_rate": 3.080987196320578e-07, "loss": 0.0796, "num_input_tokens_seen": 49259584, "step": 38000 }, { "epoch": 1.8567170742432757, "eval_loss": 0.08725295215845108, "eval_runtime": 374.0853, "eval_samples_per_second": 97.264, "eval_steps_per_second": 24.318, "num_input_tokens_seen": 49259584, "step": 38000 }, { "epoch": 1.85696137590697, "grad_norm": 0.110191211104393, "learning_rate": 3.065640702380607e-07, "loss": 0.0792, "num_input_tokens_seen": 49266720, "step": 38005 }, { "epoch": 1.8572056775706642, "grad_norm": 0.20464736223220825, "learning_rate": 3.050332288822011e-07, "loss": 0.1079, "num_input_tokens_seen": 49273248, "step": 38010 }, { "epoch": 1.8574499792343586, "grad_norm": 0.22543810307979584, "learning_rate": 3.035061958005542e-07, "loss": 0.1111, "num_input_tokens_seen": 49279680, "step": 38015 }, { "epoch": 1.8576942808980528, "grad_norm": 0.20175792276859283, "learning_rate": 3.019829712286093e-07, "loss": 0.0964, "num_input_tokens_seen": 49286368, "step": 38020 }, { "epoch": 1.8579385825617472, "grad_norm": 0.4642181694507599, "learning_rate": 3.004635554012647e-07, "loss": 0.0846, "num_input_tokens_seen": 49292416, "step": 38025 }, { "epoch": 1.8581828842254415, "grad_norm": 0.22006191313266754, "learning_rate": 2.9894794855283017e-07, "loss": 0.1107, "num_input_tokens_seen": 49298848, "step": 38030 }, { "epoch": 1.858427185889136, "grad_norm": 0.4620140492916107, "learning_rate": 2.9743615091703816e-07, "loss": 0.1179, "num_input_tokens_seen": 49304864, "step": 38035 }, { "epoch": 1.8586714875528303, "grad_norm": 0.21600553393363953, "learning_rate": 2.959281627270216e-07, "loss": 0.0774, "num_input_tokens_seen": 49311296, "step": 38040 }, { "epoch": 1.8589157892165247, "grad_norm": 0.6539443731307983, "learning_rate": 2.944239842153362e-07, "loss": 0.1109, "num_input_tokens_seen": 49317792, "step": 38045 }, { "epoch": 1.859160090880219, "grad_norm": 0.31980112195014954, "learning_rate": 2.929236156139381e-07, "loss": 0.0681, "num_input_tokens_seen": 49324320, "step": 38050 }, { "epoch": 1.8594043925439132, "grad_norm": 0.2572954297065735, "learning_rate": 2.9142705715420883e-07, "loss": 0.0928, "num_input_tokens_seen": 49331072, "step": 38055 }, { "epoch": 1.8596486942076076, "grad_norm": 0.9561247229576111, "learning_rate": 2.8993430906693595e-07, "loss": 0.1022, "num_input_tokens_seen": 49337760, "step": 38060 }, { "epoch": 1.8598929958713017, "grad_norm": 0.1429521143436432, "learning_rate": 2.88445371582316e-07, "loss": 0.0943, "num_input_tokens_seen": 49343904, "step": 38065 }, { "epoch": 1.8601372975349961, "grad_norm": 0.26165878772735596, "learning_rate": 2.8696024492996796e-07, "loss": 0.0768, "num_input_tokens_seen": 49350240, "step": 38070 }, { "epoch": 1.8603815991986905, "grad_norm": 0.25866183638572693, "learning_rate": 2.854789293389115e-07, "loss": 0.1127, "num_input_tokens_seen": 49356480, "step": 38075 }, { "epoch": 1.8606259008623849, "grad_norm": 0.1092461496591568, "learning_rate": 2.8400142503758606e-07, "loss": 0.0726, "num_input_tokens_seen": 49363104, "step": 38080 }, { "epoch": 1.8608702025260793, "grad_norm": 0.3489000201225281, "learning_rate": 2.8252773225384276e-07, "loss": 0.0659, "num_input_tokens_seen": 49369504, "step": 38085 }, { "epoch": 1.8611145041897736, "grad_norm": 0.31359997391700745, "learning_rate": 2.8105785121494143e-07, "loss": 0.0741, "num_input_tokens_seen": 49376288, "step": 38090 }, { "epoch": 1.861358805853468, "grad_norm": 0.26884639263153076, "learning_rate": 2.795917821475563e-07, "loss": 0.086, "num_input_tokens_seen": 49382464, "step": 38095 }, { "epoch": 1.8616031075171622, "grad_norm": 0.5350406765937805, "learning_rate": 2.78129525277776e-07, "loss": 0.082, "num_input_tokens_seen": 49388864, "step": 38100 }, { "epoch": 1.8618474091808566, "grad_norm": 0.13123439252376556, "learning_rate": 2.766710808310952e-07, "loss": 0.0873, "num_input_tokens_seen": 49395168, "step": 38105 }, { "epoch": 1.8620917108445507, "grad_norm": 0.20644627511501312, "learning_rate": 2.7521644903242827e-07, "loss": 0.1052, "num_input_tokens_seen": 49401536, "step": 38110 }, { "epoch": 1.862336012508245, "grad_norm": 0.8227205872535706, "learning_rate": 2.7376563010609593e-07, "loss": 0.0869, "num_input_tokens_seen": 49407744, "step": 38115 }, { "epoch": 1.8625803141719395, "grad_norm": 0.44689860939979553, "learning_rate": 2.72318624275833e-07, "loss": 0.0708, "num_input_tokens_seen": 49414912, "step": 38120 }, { "epoch": 1.8628246158356339, "grad_norm": 0.22318343818187714, "learning_rate": 2.7087543176478324e-07, "loss": 0.1057, "num_input_tokens_seen": 49421280, "step": 38125 }, { "epoch": 1.8630689174993282, "grad_norm": 0.18483804166316986, "learning_rate": 2.694360527955103e-07, "loss": 0.0853, "num_input_tokens_seen": 49427232, "step": 38130 }, { "epoch": 1.8633132191630226, "grad_norm": 0.2946658134460449, "learning_rate": 2.680004875899811e-07, "loss": 0.121, "num_input_tokens_seen": 49433792, "step": 38135 }, { "epoch": 1.863557520826717, "grad_norm": 0.550565779209137, "learning_rate": 2.665687363695768e-07, "loss": 0.0865, "num_input_tokens_seen": 49440576, "step": 38140 }, { "epoch": 1.8638018224904112, "grad_norm": 0.1227777972817421, "learning_rate": 2.6514079935509584e-07, "loss": 0.0692, "num_input_tokens_seen": 49447232, "step": 38145 }, { "epoch": 1.8640461241541055, "grad_norm": 0.1619793325662613, "learning_rate": 2.6371667676673983e-07, "loss": 0.093, "num_input_tokens_seen": 49453568, "step": 38150 }, { "epoch": 1.8642904258177997, "grad_norm": 0.29740121960639954, "learning_rate": 2.6229636882412755e-07, "loss": 0.0686, "num_input_tokens_seen": 49460224, "step": 38155 }, { "epoch": 1.864534727481494, "grad_norm": 0.5016767382621765, "learning_rate": 2.6087987574628935e-07, "loss": 0.0905, "num_input_tokens_seen": 49466688, "step": 38160 }, { "epoch": 1.8647790291451884, "grad_norm": 0.26031896471977234, "learning_rate": 2.5946719775166437e-07, "loss": 0.0705, "num_input_tokens_seen": 49472928, "step": 38165 }, { "epoch": 1.8650233308088828, "grad_norm": 0.5427550673484802, "learning_rate": 2.5805833505810616e-07, "loss": 0.0968, "num_input_tokens_seen": 49479360, "step": 38170 }, { "epoch": 1.8652676324725772, "grad_norm": 0.2880612015724182, "learning_rate": 2.566532878828798e-07, "loss": 0.0762, "num_input_tokens_seen": 49485856, "step": 38175 }, { "epoch": 1.8655119341362716, "grad_norm": 0.7696182131767273, "learning_rate": 2.552520564426619e-07, "loss": 0.0887, "num_input_tokens_seen": 49492064, "step": 38180 }, { "epoch": 1.8657562357999657, "grad_norm": 0.6930066347122192, "learning_rate": 2.5385464095353803e-07, "loss": 0.1313, "num_input_tokens_seen": 49498848, "step": 38185 }, { "epoch": 1.8660005374636601, "grad_norm": 0.4457934498786926, "learning_rate": 2.5246104163100804e-07, "loss": 0.0819, "num_input_tokens_seen": 49505376, "step": 38190 }, { "epoch": 1.8662448391273545, "grad_norm": 0.17925438284873962, "learning_rate": 2.510712586899833e-07, "loss": 0.0986, "num_input_tokens_seen": 49511744, "step": 38195 }, { "epoch": 1.8664891407910487, "grad_norm": 0.19281573593616486, "learning_rate": 2.4968529234478124e-07, "loss": 0.0896, "num_input_tokens_seen": 49518144, "step": 38200 }, { "epoch": 1.8664891407910487, "eval_loss": 0.08736275136470795, "eval_runtime": 374.7173, "eval_samples_per_second": 97.1, "eval_steps_per_second": 24.277, "num_input_tokens_seen": 49518144, "step": 38200 }, { "epoch": 1.866733442454743, "grad_norm": 0.3861379027366638, "learning_rate": 2.483031428091448e-07, "loss": 0.0725, "num_input_tokens_seen": 49524608, "step": 38205 }, { "epoch": 1.8669777441184374, "grad_norm": 1.7479578256607056, "learning_rate": 2.469248102962091e-07, "loss": 0.1332, "num_input_tokens_seen": 49531200, "step": 38210 }, { "epoch": 1.8672220457821318, "grad_norm": 0.15954415500164032, "learning_rate": 2.4555029501853455e-07, "loss": 0.0712, "num_input_tokens_seen": 49538144, "step": 38215 }, { "epoch": 1.8674663474458262, "grad_norm": 0.18855704367160797, "learning_rate": 2.441795971880906e-07, "loss": 0.0853, "num_input_tokens_seen": 49544544, "step": 38220 }, { "epoch": 1.8677106491095206, "grad_norm": 0.15776938199996948, "learning_rate": 2.4281271701625255e-07, "loss": 0.0973, "num_input_tokens_seen": 49550848, "step": 38225 }, { "epoch": 1.8679549507732147, "grad_norm": 0.1814531832933426, "learning_rate": 2.4144965471381007e-07, "loss": 0.0978, "num_input_tokens_seen": 49557344, "step": 38230 }, { "epoch": 1.868199252436909, "grad_norm": 0.6928481459617615, "learning_rate": 2.400904104909674e-07, "loss": 0.0777, "num_input_tokens_seen": 49563968, "step": 38235 }, { "epoch": 1.8684435541006035, "grad_norm": 0.4963442385196686, "learning_rate": 2.3873498455733725e-07, "loss": 0.1314, "num_input_tokens_seen": 49570336, "step": 38240 }, { "epoch": 1.8686878557642976, "grad_norm": 0.2864917516708374, "learning_rate": 2.3738337712194137e-07, "loss": 0.0839, "num_input_tokens_seen": 49577024, "step": 38245 }, { "epoch": 1.868932157427992, "grad_norm": 0.2395782172679901, "learning_rate": 2.3603558839321305e-07, "loss": 0.1123, "num_input_tokens_seen": 49583360, "step": 38250 }, { "epoch": 1.8691764590916864, "grad_norm": 0.15774290263652802, "learning_rate": 2.3469161857900267e-07, "loss": 0.0831, "num_input_tokens_seen": 49590240, "step": 38255 }, { "epoch": 1.8694207607553808, "grad_norm": 0.16003020107746124, "learning_rate": 2.3335146788656393e-07, "loss": 0.0799, "num_input_tokens_seen": 49596928, "step": 38260 }, { "epoch": 1.8696650624190752, "grad_norm": 0.3696548342704773, "learning_rate": 2.3201513652256757e-07, "loss": 0.0961, "num_input_tokens_seen": 49603360, "step": 38265 }, { "epoch": 1.8699093640827695, "grad_norm": 0.19318903982639313, "learning_rate": 2.3068262469308766e-07, "loss": 0.0922, "num_input_tokens_seen": 49609536, "step": 38270 }, { "epoch": 1.8701536657464637, "grad_norm": 0.3263084888458252, "learning_rate": 2.2935393260362093e-07, "loss": 0.0904, "num_input_tokens_seen": 49615744, "step": 38275 }, { "epoch": 1.870397967410158, "grad_norm": 0.3469202220439911, "learning_rate": 2.2802906045906458e-07, "loss": 0.0789, "num_input_tokens_seen": 49621952, "step": 38280 }, { "epoch": 1.8706422690738524, "grad_norm": 0.5088338851928711, "learning_rate": 2.2670800846373018e-07, "loss": 0.0621, "num_input_tokens_seen": 49628096, "step": 38285 }, { "epoch": 1.8708865707375466, "grad_norm": 0.29406023025512695, "learning_rate": 2.2539077682134367e-07, "loss": 0.0892, "num_input_tokens_seen": 49634784, "step": 38290 }, { "epoch": 1.871130872401241, "grad_norm": 0.18488338589668274, "learning_rate": 2.2407736573503423e-07, "loss": 0.064, "num_input_tokens_seen": 49640832, "step": 38295 }, { "epoch": 1.8713751740649354, "grad_norm": 0.17760995030403137, "learning_rate": 2.2276777540735093e-07, "loss": 0.066, "num_input_tokens_seen": 49647456, "step": 38300 }, { "epoch": 1.8716194757286297, "grad_norm": 0.12745141983032227, "learning_rate": 2.2146200604024613e-07, "loss": 0.0703, "num_input_tokens_seen": 49654144, "step": 38305 }, { "epoch": 1.8718637773923241, "grad_norm": 0.32655200362205505, "learning_rate": 2.2016005783508375e-07, "loss": 0.0964, "num_input_tokens_seen": 49660608, "step": 38310 }, { "epoch": 1.8721080790560185, "grad_norm": 0.452287495136261, "learning_rate": 2.1886193099264763e-07, "loss": 0.112, "num_input_tokens_seen": 49667264, "step": 38315 }, { "epoch": 1.8723523807197127, "grad_norm": 0.24325735867023468, "learning_rate": 2.175676257131165e-07, "loss": 0.0828, "num_input_tokens_seen": 49673376, "step": 38320 }, { "epoch": 1.872596682383407, "grad_norm": 0.27438536286354065, "learning_rate": 2.162771421960974e-07, "loss": 0.0844, "num_input_tokens_seen": 49679488, "step": 38325 }, { "epoch": 1.8728409840471012, "grad_norm": 0.5348635315895081, "learning_rate": 2.1499048064059224e-07, "loss": 0.0942, "num_input_tokens_seen": 49685632, "step": 38330 }, { "epoch": 1.8730852857107956, "grad_norm": 0.45319297909736633, "learning_rate": 2.1370764124502285e-07, "loss": 0.0976, "num_input_tokens_seen": 49691648, "step": 38335 }, { "epoch": 1.87332958737449, "grad_norm": 0.2805590033531189, "learning_rate": 2.1242862420721988e-07, "loss": 0.0752, "num_input_tokens_seen": 49698016, "step": 38340 }, { "epoch": 1.8735738890381843, "grad_norm": 0.2925538718700409, "learning_rate": 2.1115342972442276e-07, "loss": 0.0691, "num_input_tokens_seen": 49704384, "step": 38345 }, { "epoch": 1.8738181907018787, "grad_norm": 0.672943115234375, "learning_rate": 2.0988205799328252e-07, "loss": 0.0919, "num_input_tokens_seen": 49710880, "step": 38350 }, { "epoch": 1.874062492365573, "grad_norm": 0.3382466733455658, "learning_rate": 2.0861450920986182e-07, "loss": 0.0813, "num_input_tokens_seen": 49717216, "step": 38355 }, { "epoch": 1.8743067940292675, "grad_norm": 0.3245173394680023, "learning_rate": 2.07350783569632e-07, "loss": 0.1004, "num_input_tokens_seen": 49723712, "step": 38360 }, { "epoch": 1.8745510956929616, "grad_norm": 0.49518463015556335, "learning_rate": 2.060908812674761e-07, "loss": 0.0993, "num_input_tokens_seen": 49730272, "step": 38365 }, { "epoch": 1.874795397356656, "grad_norm": 0.33525118231773376, "learning_rate": 2.0483480249768317e-07, "loss": 0.0933, "num_input_tokens_seen": 49736640, "step": 38370 }, { "epoch": 1.8750396990203502, "grad_norm": 0.4028398394584656, "learning_rate": 2.035825474539621e-07, "loss": 0.0805, "num_input_tokens_seen": 49743008, "step": 38375 }, { "epoch": 1.8752840006840445, "grad_norm": 0.18496623635292053, "learning_rate": 2.0233411632942235e-07, "loss": 0.1168, "num_input_tokens_seen": 49749408, "step": 38380 }, { "epoch": 1.875528302347739, "grad_norm": 0.10442599654197693, "learning_rate": 2.0108950931658764e-07, "loss": 0.0813, "num_input_tokens_seen": 49756288, "step": 38385 }, { "epoch": 1.8757726040114333, "grad_norm": 0.4813983142375946, "learning_rate": 1.998487266073934e-07, "loss": 0.0767, "num_input_tokens_seen": 49763104, "step": 38390 }, { "epoch": 1.8760169056751277, "grad_norm": 0.38298001885414124, "learning_rate": 1.986117683931865e-07, "loss": 0.1035, "num_input_tokens_seen": 49769376, "step": 38395 }, { "epoch": 1.876261207338822, "grad_norm": 0.20643936097621918, "learning_rate": 1.9737863486471442e-07, "loss": 0.086, "num_input_tokens_seen": 49775776, "step": 38400 }, { "epoch": 1.876261207338822, "eval_loss": 0.08725407719612122, "eval_runtime": 374.8986, "eval_samples_per_second": 97.053, "eval_steps_per_second": 24.265, "num_input_tokens_seen": 49775776, "step": 38400 }, { "epoch": 1.8765055090025164, "grad_norm": 0.19688813388347626, "learning_rate": 1.9614932621215e-07, "loss": 0.0841, "num_input_tokens_seen": 49782016, "step": 38405 }, { "epoch": 1.8767498106662106, "grad_norm": 0.5763117074966431, "learning_rate": 1.9492384262506102e-07, "loss": 0.0645, "num_input_tokens_seen": 49788448, "step": 38410 }, { "epoch": 1.876994112329905, "grad_norm": 0.3969201445579529, "learning_rate": 1.9370218429243524e-07, "loss": 0.0928, "num_input_tokens_seen": 49795072, "step": 38415 }, { "epoch": 1.8772384139935991, "grad_norm": 0.16108007729053497, "learning_rate": 1.9248435140267197e-07, "loss": 0.122, "num_input_tokens_seen": 49801376, "step": 38420 }, { "epoch": 1.8774827156572935, "grad_norm": 0.566796600818634, "learning_rate": 1.9127034414356814e-07, "loss": 0.1036, "num_input_tokens_seen": 49807648, "step": 38425 }, { "epoch": 1.877727017320988, "grad_norm": 0.592051088809967, "learning_rate": 1.9006016270234627e-07, "loss": 0.0661, "num_input_tokens_seen": 49814528, "step": 38430 }, { "epoch": 1.8779713189846823, "grad_norm": 0.4282987117767334, "learning_rate": 1.888538072656293e-07, "loss": 0.0996, "num_input_tokens_seen": 49821248, "step": 38435 }, { "epoch": 1.8782156206483767, "grad_norm": 0.3009348511695862, "learning_rate": 1.8765127801944893e-07, "loss": 0.0584, "num_input_tokens_seen": 49827456, "step": 38440 }, { "epoch": 1.878459922312071, "grad_norm": 0.6917129158973694, "learning_rate": 1.8645257514925406e-07, "loss": 0.0789, "num_input_tokens_seen": 49833952, "step": 38445 }, { "epoch": 1.8787042239757654, "grad_norm": 0.4729410707950592, "learning_rate": 1.8525769883989685e-07, "loss": 0.0647, "num_input_tokens_seen": 49840640, "step": 38450 }, { "epoch": 1.8789485256394596, "grad_norm": 0.21892932057380676, "learning_rate": 1.8406664927564654e-07, "loss": 0.0632, "num_input_tokens_seen": 49847008, "step": 38455 }, { "epoch": 1.879192827303154, "grad_norm": 0.21196630597114563, "learning_rate": 1.8287942664017566e-07, "loss": 0.0995, "num_input_tokens_seen": 49853376, "step": 38460 }, { "epoch": 1.8794371289668481, "grad_norm": 0.5781097412109375, "learning_rate": 1.8169603111656552e-07, "loss": 0.0876, "num_input_tokens_seen": 49859936, "step": 38465 }, { "epoch": 1.8796814306305425, "grad_norm": 0.18161121010780334, "learning_rate": 1.805164628873146e-07, "loss": 0.0588, "num_input_tokens_seen": 49866656, "step": 38470 }, { "epoch": 1.8799257322942369, "grad_norm": 0.1842963546514511, "learning_rate": 1.793407221343274e-07, "loss": 0.0856, "num_input_tokens_seen": 49872928, "step": 38475 }, { "epoch": 1.8801700339579313, "grad_norm": 0.49335435032844543, "learning_rate": 1.781688090389172e-07, "loss": 0.0754, "num_input_tokens_seen": 49879456, "step": 38480 }, { "epoch": 1.8804143356216256, "grad_norm": 0.22508682310581207, "learning_rate": 1.770007237818061e-07, "loss": 0.0714, "num_input_tokens_seen": 49886496, "step": 38485 }, { "epoch": 1.88065863728532, "grad_norm": 0.5087329745292664, "learning_rate": 1.7583646654313059e-07, "loss": 0.0832, "num_input_tokens_seen": 49893312, "step": 38490 }, { "epoch": 1.8809029389490144, "grad_norm": 0.23430821299552917, "learning_rate": 1.7467603750242757e-07, "loss": 0.0768, "num_input_tokens_seen": 49899520, "step": 38495 }, { "epoch": 1.8811472406127085, "grad_norm": 0.3905538022518158, "learning_rate": 1.7351943683865944e-07, "loss": 0.0892, "num_input_tokens_seen": 49906048, "step": 38500 }, { "epoch": 1.881391542276403, "grad_norm": 0.8039484620094299, "learning_rate": 1.723666647301808e-07, "loss": 0.0814, "num_input_tokens_seen": 49912288, "step": 38505 }, { "epoch": 1.881635843940097, "grad_norm": 0.48176097869873047, "learning_rate": 1.712177213547661e-07, "loss": 0.0932, "num_input_tokens_seen": 49919008, "step": 38510 }, { "epoch": 1.8818801456037915, "grad_norm": 0.20430830121040344, "learning_rate": 1.7007260688959581e-07, "loss": 0.1123, "num_input_tokens_seen": 49925248, "step": 38515 }, { "epoch": 1.8821244472674858, "grad_norm": 0.17572438716888428, "learning_rate": 1.68931321511262e-07, "loss": 0.0483, "num_input_tokens_seen": 49931296, "step": 38520 }, { "epoch": 1.8823687489311802, "grad_norm": 0.31850242614746094, "learning_rate": 1.6779386539576835e-07, "loss": 0.0713, "num_input_tokens_seen": 49937568, "step": 38525 }, { "epoch": 1.8826130505948746, "grad_norm": 0.3142800033092499, "learning_rate": 1.666602387185162e-07, "loss": 0.1113, "num_input_tokens_seen": 49944544, "step": 38530 }, { "epoch": 1.882857352258569, "grad_norm": 0.24729382991790771, "learning_rate": 1.655304416543352e-07, "loss": 0.0785, "num_input_tokens_seen": 49951392, "step": 38535 }, { "epoch": 1.8831016539222634, "grad_norm": 0.19595283269882202, "learning_rate": 1.6440447437744698e-07, "loss": 0.1033, "num_input_tokens_seen": 49957824, "step": 38540 }, { "epoch": 1.8833459555859575, "grad_norm": 0.5737341046333313, "learning_rate": 1.6328233706149332e-07, "loss": 0.0812, "num_input_tokens_seen": 49964640, "step": 38545 }, { "epoch": 1.883590257249652, "grad_norm": 0.2845514714717865, "learning_rate": 1.6216402987951906e-07, "loss": 0.0982, "num_input_tokens_seen": 49971136, "step": 38550 }, { "epoch": 1.883834558913346, "grad_norm": 0.6240972280502319, "learning_rate": 1.6104955300398627e-07, "loss": 0.1017, "num_input_tokens_seen": 49977408, "step": 38555 }, { "epoch": 1.8840788605770404, "grad_norm": 0.45111238956451416, "learning_rate": 1.5993890660675748e-07, "loss": 0.0802, "num_input_tokens_seen": 49983680, "step": 38560 }, { "epoch": 1.8843231622407348, "grad_norm": 0.5737491846084595, "learning_rate": 1.5883209085910678e-07, "loss": 0.0721, "num_input_tokens_seen": 49990144, "step": 38565 }, { "epoch": 1.8845674639044292, "grad_norm": 0.502389132976532, "learning_rate": 1.5772910593172264e-07, "loss": 0.1079, "num_input_tokens_seen": 49996640, "step": 38570 }, { "epoch": 1.8848117655681236, "grad_norm": 0.6836555600166321, "learning_rate": 1.5662995199469954e-07, "loss": 0.078, "num_input_tokens_seen": 50003296, "step": 38575 }, { "epoch": 1.885056067231818, "grad_norm": 0.26949572563171387, "learning_rate": 1.5553462921753802e-07, "loss": 0.0947, "num_input_tokens_seen": 50009920, "step": 38580 }, { "epoch": 1.8853003688955123, "grad_norm": 0.5036179423332214, "learning_rate": 1.544431377691502e-07, "loss": 0.091, "num_input_tokens_seen": 50016672, "step": 38585 }, { "epoch": 1.8855446705592065, "grad_norm": 0.19869141280651093, "learning_rate": 1.5335547781785975e-07, "loss": 0.1013, "num_input_tokens_seen": 50023072, "step": 38590 }, { "epoch": 1.8857889722229009, "grad_norm": 0.7753545045852661, "learning_rate": 1.5227164953139917e-07, "loss": 0.079, "num_input_tokens_seen": 50029600, "step": 38595 }, { "epoch": 1.886033273886595, "grad_norm": 0.31529802083969116, "learning_rate": 1.511916530769042e-07, "loss": 0.0885, "num_input_tokens_seen": 50036384, "step": 38600 }, { "epoch": 1.886033273886595, "eval_loss": 0.08729726821184158, "eval_runtime": 374.6944, "eval_samples_per_second": 97.106, "eval_steps_per_second": 24.278, "num_input_tokens_seen": 50036384, "step": 38600 }, { "epoch": 1.8862775755502894, "grad_norm": 0.1845247596502304, "learning_rate": 1.5011548862092773e-07, "loss": 0.0612, "num_input_tokens_seen": 50042752, "step": 38605 }, { "epoch": 1.8865218772139838, "grad_norm": 0.16845721006393433, "learning_rate": 1.490431563294231e-07, "loss": 0.0634, "num_input_tokens_seen": 50048928, "step": 38610 }, { "epoch": 1.8867661788776782, "grad_norm": 0.4426576495170593, "learning_rate": 1.4797465636776365e-07, "loss": 0.0542, "num_input_tokens_seen": 50055424, "step": 38615 }, { "epoch": 1.8870104805413725, "grad_norm": 0.5386804938316345, "learning_rate": 1.4690998890072027e-07, "loss": 0.0767, "num_input_tokens_seen": 50061568, "step": 38620 }, { "epoch": 1.887254782205067, "grad_norm": 0.1933443695306778, "learning_rate": 1.4584915409248112e-07, "loss": 0.0847, "num_input_tokens_seen": 50067712, "step": 38625 }, { "epoch": 1.8874990838687613, "grad_norm": 0.3617464005947113, "learning_rate": 1.4479215210663754e-07, "loss": 0.1015, "num_input_tokens_seen": 50073888, "step": 38630 }, { "epoch": 1.8877433855324555, "grad_norm": 0.4418443441390991, "learning_rate": 1.4373898310619528e-07, "loss": 0.0909, "num_input_tokens_seen": 50080256, "step": 38635 }, { "epoch": 1.8879876871961498, "grad_norm": 0.2891891896724701, "learning_rate": 1.4268964725356604e-07, "loss": 0.097, "num_input_tokens_seen": 50086752, "step": 38640 }, { "epoch": 1.888231988859844, "grad_norm": 0.26975005865097046, "learning_rate": 1.4164414471056764e-07, "loss": 0.105, "num_input_tokens_seen": 50093632, "step": 38645 }, { "epoch": 1.8884762905235384, "grad_norm": 0.1364465206861496, "learning_rate": 1.4060247563843497e-07, "loss": 0.0596, "num_input_tokens_seen": 50100736, "step": 38650 }, { "epoch": 1.8887205921872328, "grad_norm": 0.4893462061882019, "learning_rate": 1.3956464019780068e-07, "loss": 0.0978, "num_input_tokens_seen": 50107232, "step": 38655 }, { "epoch": 1.8889648938509271, "grad_norm": 0.22609227895736694, "learning_rate": 1.385306385487145e-07, "loss": 0.0869, "num_input_tokens_seen": 50113728, "step": 38660 }, { "epoch": 1.8892091955146215, "grad_norm": 0.5164439678192139, "learning_rate": 1.3750047085063222e-07, "loss": 0.0771, "num_input_tokens_seen": 50120192, "step": 38665 }, { "epoch": 1.889453497178316, "grad_norm": 0.29109904170036316, "learning_rate": 1.3647413726242119e-07, "loss": 0.1086, "num_input_tokens_seen": 50126208, "step": 38670 }, { "epoch": 1.8896977988420103, "grad_norm": 0.419502317905426, "learning_rate": 1.3545163794235205e-07, "loss": 0.0881, "num_input_tokens_seen": 50132896, "step": 38675 }, { "epoch": 1.8899421005057044, "grad_norm": 0.13878735899925232, "learning_rate": 1.3443297304810698e-07, "loss": 0.0976, "num_input_tokens_seen": 50139328, "step": 38680 }, { "epoch": 1.8901864021693988, "grad_norm": 0.6059997081756592, "learning_rate": 1.3341814273677977e-07, "loss": 0.1024, "num_input_tokens_seen": 50145216, "step": 38685 }, { "epoch": 1.890430703833093, "grad_norm": 0.40515634417533875, "learning_rate": 1.324071471648647e-07, "loss": 0.0732, "num_input_tokens_seen": 50152992, "step": 38690 }, { "epoch": 1.8906750054967874, "grad_norm": 0.5786269307136536, "learning_rate": 1.3139998648827312e-07, "loss": 0.0898, "num_input_tokens_seen": 50159680, "step": 38695 }, { "epoch": 1.8909193071604817, "grad_norm": 0.7271573543548584, "learning_rate": 1.3039666086232526e-07, "loss": 0.0899, "num_input_tokens_seen": 50166368, "step": 38700 }, { "epoch": 1.8911636088241761, "grad_norm": 0.23597507178783417, "learning_rate": 1.2939717044174183e-07, "loss": 0.0893, "num_input_tokens_seen": 50173024, "step": 38705 }, { "epoch": 1.8914079104878705, "grad_norm": 0.6925216913223267, "learning_rate": 1.284015153806578e-07, "loss": 0.1173, "num_input_tokens_seen": 50179456, "step": 38710 }, { "epoch": 1.8916522121515649, "grad_norm": 0.6584329605102539, "learning_rate": 1.274096958326171e-07, "loss": 0.0808, "num_input_tokens_seen": 50185920, "step": 38715 }, { "epoch": 1.891896513815259, "grad_norm": 0.4988548457622528, "learning_rate": 1.2642171195056952e-07, "loss": 0.0656, "num_input_tokens_seen": 50192448, "step": 38720 }, { "epoch": 1.8921408154789534, "grad_norm": 0.6676225662231445, "learning_rate": 1.2543756388687377e-07, "loss": 0.0886, "num_input_tokens_seen": 50199200, "step": 38725 }, { "epoch": 1.8923851171426478, "grad_norm": 0.27083635330200195, "learning_rate": 1.2445725179330014e-07, "loss": 0.0815, "num_input_tokens_seen": 50205760, "step": 38730 }, { "epoch": 1.892629418806342, "grad_norm": 0.12319359928369522, "learning_rate": 1.2348077582102212e-07, "loss": 0.0767, "num_input_tokens_seen": 50212352, "step": 38735 }, { "epoch": 1.8928737204700363, "grad_norm": 0.500356137752533, "learning_rate": 1.2250813612062762e-07, "loss": 0.0921, "num_input_tokens_seen": 50219232, "step": 38740 }, { "epoch": 1.8931180221337307, "grad_norm": 0.21896325051784515, "learning_rate": 1.215393328421105e-07, "loss": 0.0723, "num_input_tokens_seen": 50225440, "step": 38745 }, { "epoch": 1.893362323797425, "grad_norm": 0.5743411183357239, "learning_rate": 1.2057436613486796e-07, "loss": 0.1022, "num_input_tokens_seen": 50232064, "step": 38750 }, { "epoch": 1.8936066254611195, "grad_norm": 0.3633442223072052, "learning_rate": 1.1961323614771424e-07, "loss": 0.0987, "num_input_tokens_seen": 50238720, "step": 38755 }, { "epoch": 1.8938509271248138, "grad_norm": 0.36538004875183105, "learning_rate": 1.1865594302886418e-07, "loss": 0.0943, "num_input_tokens_seen": 50245440, "step": 38760 }, { "epoch": 1.894095228788508, "grad_norm": 0.144136443734169, "learning_rate": 1.1770248692594687e-07, "loss": 0.0828, "num_input_tokens_seen": 50251936, "step": 38765 }, { "epoch": 1.8943395304522024, "grad_norm": 0.1854596585035324, "learning_rate": 1.167528679859975e-07, "loss": 0.0863, "num_input_tokens_seen": 50258912, "step": 38770 }, { "epoch": 1.8945838321158968, "grad_norm": 0.9904909133911133, "learning_rate": 1.1580708635545446e-07, "loss": 0.1074, "num_input_tokens_seen": 50265600, "step": 38775 }, { "epoch": 1.894828133779591, "grad_norm": 0.19237951934337616, "learning_rate": 1.1486514218017885e-07, "loss": 0.0724, "num_input_tokens_seen": 50272128, "step": 38780 }, { "epoch": 1.8950724354432853, "grad_norm": 0.6344258785247803, "learning_rate": 1.1392703560542117e-07, "loss": 0.0752, "num_input_tokens_seen": 50278720, "step": 38785 }, { "epoch": 1.8953167371069797, "grad_norm": 0.2795957624912262, "learning_rate": 1.129927667758518e-07, "loss": 0.067, "num_input_tokens_seen": 50285504, "step": 38790 }, { "epoch": 1.895561038770674, "grad_norm": 0.12449993938207626, "learning_rate": 1.1206233583554992e-07, "loss": 0.0721, "num_input_tokens_seen": 50292128, "step": 38795 }, { "epoch": 1.8958053404343684, "grad_norm": 0.39493030309677124, "learning_rate": 1.1113574292799523e-07, "loss": 0.0758, "num_input_tokens_seen": 50298720, "step": 38800 }, { "epoch": 1.8958053404343684, "eval_loss": 0.08727389574050903, "eval_runtime": 374.6486, "eval_samples_per_second": 97.118, "eval_steps_per_second": 24.281, "num_input_tokens_seen": 50298720, "step": 38800 }, { "epoch": 1.8960496420980628, "grad_norm": 0.19404038786888123, "learning_rate": 1.1021298819608449e-07, "loss": 0.0778, "num_input_tokens_seen": 50305440, "step": 38805 }, { "epoch": 1.896293943761757, "grad_norm": 0.3104190230369568, "learning_rate": 1.0929407178211226e-07, "loss": 0.112, "num_input_tokens_seen": 50311936, "step": 38810 }, { "epoch": 1.8965382454254514, "grad_norm": 0.5038530230522156, "learning_rate": 1.0837899382779293e-07, "loss": 0.089, "num_input_tokens_seen": 50318432, "step": 38815 }, { "epoch": 1.8967825470891457, "grad_norm": 0.13040494918823242, "learning_rate": 1.0746775447423862e-07, "loss": 0.0921, "num_input_tokens_seen": 50324928, "step": 38820 }, { "epoch": 1.89702684875284, "grad_norm": 1.0464606285095215, "learning_rate": 1.0656035386197583e-07, "loss": 0.0981, "num_input_tokens_seen": 50331424, "step": 38825 }, { "epoch": 1.8972711504165343, "grad_norm": 0.17801709473133087, "learning_rate": 1.0565679213093982e-07, "loss": 0.099, "num_input_tokens_seen": 50337696, "step": 38830 }, { "epoch": 1.8975154520802286, "grad_norm": 0.30748680233955383, "learning_rate": 1.0475706942046638e-07, "loss": 0.0941, "num_input_tokens_seen": 50344608, "step": 38835 }, { "epoch": 1.897759753743923, "grad_norm": 0.2863915264606476, "learning_rate": 1.0386118586930282e-07, "loss": 0.1091, "num_input_tokens_seen": 50351904, "step": 38840 }, { "epoch": 1.8980040554076174, "grad_norm": 0.552042543888092, "learning_rate": 1.0296914161561367e-07, "loss": 0.0829, "num_input_tokens_seen": 50357920, "step": 38845 }, { "epoch": 1.8982483570713118, "grad_norm": 0.3323274552822113, "learning_rate": 1.0208093679695552e-07, "loss": 0.081, "num_input_tokens_seen": 50363936, "step": 38850 }, { "epoch": 1.898492658735006, "grad_norm": 0.472764790058136, "learning_rate": 1.0119657155030493e-07, "loss": 0.0978, "num_input_tokens_seen": 50370784, "step": 38855 }, { "epoch": 1.8987369603987003, "grad_norm": 0.35680776834487915, "learning_rate": 1.003160460120417e-07, "loss": 0.1084, "num_input_tokens_seen": 50376960, "step": 38860 }, { "epoch": 1.8989812620623945, "grad_norm": 0.5055449604988098, "learning_rate": 9.943936031795165e-08, "loss": 0.0932, "num_input_tokens_seen": 50383584, "step": 38865 }, { "epoch": 1.8992255637260889, "grad_norm": 0.44818297028541565, "learning_rate": 9.856651460323219e-08, "loss": 0.0929, "num_input_tokens_seen": 50390496, "step": 38870 }, { "epoch": 1.8994698653897832, "grad_norm": 0.2600421607494354, "learning_rate": 9.769750900248953e-08, "loss": 0.0892, "num_input_tokens_seen": 50396704, "step": 38875 }, { "epoch": 1.8997141670534776, "grad_norm": 0.7177179455757141, "learning_rate": 9.683234364973038e-08, "loss": 0.1021, "num_input_tokens_seen": 50403264, "step": 38880 }, { "epoch": 1.899958468717172, "grad_norm": 0.17805080115795135, "learning_rate": 9.597101867837854e-08, "loss": 0.0969, "num_input_tokens_seen": 50410208, "step": 38885 }, { "epoch": 1.9002027703808664, "grad_norm": 0.39705362915992737, "learning_rate": 9.511353422125835e-08, "loss": 0.0785, "num_input_tokens_seen": 50416736, "step": 38890 }, { "epoch": 1.9004470720445608, "grad_norm": 0.5205065608024597, "learning_rate": 9.42598904106029e-08, "loss": 0.0682, "num_input_tokens_seen": 50423360, "step": 38895 }, { "epoch": 1.900691373708255, "grad_norm": 0.17862287163734436, "learning_rate": 9.341008737806245e-08, "loss": 0.0971, "num_input_tokens_seen": 50430208, "step": 38900 }, { "epoch": 1.9009356753719493, "grad_norm": 0.16371367871761322, "learning_rate": 9.256412525467661e-08, "loss": 0.0806, "num_input_tokens_seen": 50436928, "step": 38905 }, { "epoch": 1.9011799770356435, "grad_norm": 0.2753145098686218, "learning_rate": 9.172200417091326e-08, "loss": 0.0659, "num_input_tokens_seen": 50443104, "step": 38910 }, { "epoch": 1.9014242786993378, "grad_norm": 0.5678650140762329, "learning_rate": 9.088372425663239e-08, "loss": 0.0735, "num_input_tokens_seen": 50449824, "step": 38915 }, { "epoch": 1.9016685803630322, "grad_norm": 0.14044731855392456, "learning_rate": 9.004928564110837e-08, "loss": 0.0713, "num_input_tokens_seen": 50456640, "step": 38920 }, { "epoch": 1.9019128820267266, "grad_norm": 1.058321475982666, "learning_rate": 8.92186884530244e-08, "loss": 0.078, "num_input_tokens_seen": 50463488, "step": 38925 }, { "epoch": 1.902157183690421, "grad_norm": 0.18444344401359558, "learning_rate": 8.83919328204641e-08, "loss": 0.073, "num_input_tokens_seen": 50469664, "step": 38930 }, { "epoch": 1.9024014853541154, "grad_norm": 0.20547376573085785, "learning_rate": 8.756901887093105e-08, "loss": 0.0927, "num_input_tokens_seen": 50476288, "step": 38935 }, { "epoch": 1.9026457870178097, "grad_norm": 0.46756160259246826, "learning_rate": 8.674994673132098e-08, "loss": 0.1045, "num_input_tokens_seen": 50482688, "step": 38940 }, { "epoch": 1.9028900886815039, "grad_norm": 0.4292507767677307, "learning_rate": 8.593471652794949e-08, "loss": 0.0983, "num_input_tokens_seen": 50488960, "step": 38945 }, { "epoch": 1.9031343903451983, "grad_norm": 0.25100865960121155, "learning_rate": 8.512332838653548e-08, "loss": 0.0626, "num_input_tokens_seen": 50495488, "step": 38950 }, { "epoch": 1.9033786920088924, "grad_norm": 0.46470004320144653, "learning_rate": 8.431578243220106e-08, "loss": 0.1015, "num_input_tokens_seen": 50502336, "step": 38955 }, { "epoch": 1.9036229936725868, "grad_norm": 0.5948761701583862, "learning_rate": 8.351207878948552e-08, "loss": 0.0832, "num_input_tokens_seen": 50508576, "step": 38960 }, { "epoch": 1.9038672953362812, "grad_norm": 0.46241095662117004, "learning_rate": 8.271221758232583e-08, "loss": 0.0893, "num_input_tokens_seen": 50515232, "step": 38965 }, { "epoch": 1.9041115969999756, "grad_norm": 0.4111063480377197, "learning_rate": 8.191619893407332e-08, "loss": 0.0829, "num_input_tokens_seen": 50521376, "step": 38970 }, { "epoch": 1.90435589866367, "grad_norm": 0.2513750493526459, "learning_rate": 8.112402296748534e-08, "loss": 0.0847, "num_input_tokens_seen": 50527904, "step": 38975 }, { "epoch": 1.9046002003273643, "grad_norm": 0.4493109881877899, "learning_rate": 8.033568980471973e-08, "loss": 0.1033, "num_input_tokens_seen": 50534592, "step": 38980 }, { "epoch": 1.9048445019910587, "grad_norm": 0.4815971255302429, "learning_rate": 7.955119956735146e-08, "loss": 0.0979, "num_input_tokens_seen": 50541088, "step": 38985 }, { "epoch": 1.9050888036547529, "grad_norm": 0.18452893197536469, "learning_rate": 7.877055237636155e-08, "loss": 0.0738, "num_input_tokens_seen": 50547488, "step": 38990 }, { "epoch": 1.9053331053184472, "grad_norm": 0.4299772083759308, "learning_rate": 7.79937483521287e-08, "loss": 0.1014, "num_input_tokens_seen": 50554112, "step": 38995 }, { "epoch": 1.9055774069821414, "grad_norm": 0.39722639322280884, "learning_rate": 7.722078761444873e-08, "loss": 0.0839, "num_input_tokens_seen": 50560704, "step": 39000 }, { "epoch": 1.9055774069821414, "eval_loss": 0.08725883811712265, "eval_runtime": 374.6101, "eval_samples_per_second": 97.128, "eval_steps_per_second": 24.284, "num_input_tokens_seen": 50560704, "step": 39000 }, { "epoch": 1.9058217086458358, "grad_norm": 0.5719852447509766, "learning_rate": 7.645167028252631e-08, "loss": 0.0747, "num_input_tokens_seen": 50567296, "step": 39005 }, { "epoch": 1.9060660103095302, "grad_norm": 0.2161695808172226, "learning_rate": 7.568639647496379e-08, "loss": 0.0951, "num_input_tokens_seen": 50573888, "step": 39010 }, { "epoch": 1.9063103119732245, "grad_norm": 0.2613203227519989, "learning_rate": 7.492496630977508e-08, "loss": 0.0654, "num_input_tokens_seen": 50580192, "step": 39015 }, { "epoch": 1.906554613636919, "grad_norm": 0.4364278316497803, "learning_rate": 7.416737990438571e-08, "loss": 0.1236, "num_input_tokens_seen": 50586432, "step": 39020 }, { "epoch": 1.9067989153006133, "grad_norm": 0.15413345396518707, "learning_rate": 7.341363737562445e-08, "loss": 0.0623, "num_input_tokens_seen": 50593120, "step": 39025 }, { "epoch": 1.9070432169643077, "grad_norm": 0.49855607748031616, "learning_rate": 7.266373883972887e-08, "loss": 0.0878, "num_input_tokens_seen": 50599264, "step": 39030 }, { "epoch": 1.9072875186280018, "grad_norm": 0.14569254219532013, "learning_rate": 7.191768441233981e-08, "loss": 0.0757, "num_input_tokens_seen": 50605952, "step": 39035 }, { "epoch": 1.9075318202916962, "grad_norm": 0.1251389980316162, "learning_rate": 7.11754742085069e-08, "loss": 0.0591, "num_input_tokens_seen": 50612576, "step": 39040 }, { "epoch": 1.9077761219553904, "grad_norm": 0.25493571162223816, "learning_rate": 7.043710834269413e-08, "loss": 0.0781, "num_input_tokens_seen": 50618880, "step": 39045 }, { "epoch": 1.9080204236190847, "grad_norm": 0.5676730871200562, "learning_rate": 6.970258692876319e-08, "loss": 0.0793, "num_input_tokens_seen": 50625280, "step": 39050 }, { "epoch": 1.9082647252827791, "grad_norm": 0.19341644644737244, "learning_rate": 6.897191007998738e-08, "loss": 0.094, "num_input_tokens_seen": 50631456, "step": 39055 }, { "epoch": 1.9085090269464735, "grad_norm": 0.20802398025989532, "learning_rate": 6.824507790904599e-08, "loss": 0.0763, "num_input_tokens_seen": 50638048, "step": 39060 }, { "epoch": 1.9087533286101679, "grad_norm": 0.24703560769557953, "learning_rate": 6.752209052802439e-08, "loss": 0.1188, "num_input_tokens_seen": 50644800, "step": 39065 }, { "epoch": 1.9089976302738623, "grad_norm": 0.5927944183349609, "learning_rate": 6.680294804841946e-08, "loss": 0.0847, "num_input_tokens_seen": 50651008, "step": 39070 }, { "epoch": 1.9092419319375566, "grad_norm": 0.29148030281066895, "learning_rate": 6.608765058112865e-08, "loss": 0.0983, "num_input_tokens_seen": 50657312, "step": 39075 }, { "epoch": 1.9094862336012508, "grad_norm": 0.25652942061424255, "learning_rate": 6.537619823646368e-08, "loss": 0.0953, "num_input_tokens_seen": 50663872, "step": 39080 }, { "epoch": 1.9097305352649452, "grad_norm": 0.45436587929725647, "learning_rate": 6.466859112413404e-08, "loss": 0.1033, "num_input_tokens_seen": 50670752, "step": 39085 }, { "epoch": 1.9099748369286393, "grad_norm": 0.3538224697113037, "learning_rate": 6.39648293532663e-08, "loss": 0.0921, "num_input_tokens_seen": 50677184, "step": 39090 }, { "epoch": 1.9102191385923337, "grad_norm": 0.6333401203155518, "learning_rate": 6.32649130323848e-08, "loss": 0.0824, "num_input_tokens_seen": 50684128, "step": 39095 }, { "epoch": 1.910463440256028, "grad_norm": 0.3179842233657837, "learning_rate": 6.256884226943094e-08, "loss": 0.0947, "num_input_tokens_seen": 50690752, "step": 39100 }, { "epoch": 1.9107077419197225, "grad_norm": 0.34091711044311523, "learning_rate": 6.187661717174386e-08, "loss": 0.0867, "num_input_tokens_seen": 50697504, "step": 39105 }, { "epoch": 1.9109520435834169, "grad_norm": 0.3300513029098511, "learning_rate": 6.118823784607708e-08, "loss": 0.0641, "num_input_tokens_seen": 50704480, "step": 39110 }, { "epoch": 1.9111963452471112, "grad_norm": 0.38243091106414795, "learning_rate": 6.050370439858178e-08, "loss": 0.0718, "num_input_tokens_seen": 50710944, "step": 39115 }, { "epoch": 1.9114406469108056, "grad_norm": 0.3657492697238922, "learning_rate": 5.98230169348235e-08, "loss": 0.0833, "num_input_tokens_seen": 50716896, "step": 39120 }, { "epoch": 1.9116849485744998, "grad_norm": 0.6929110884666443, "learning_rate": 5.914617555977664e-08, "loss": 0.102, "num_input_tokens_seen": 50723424, "step": 39125 }, { "epoch": 1.9119292502381942, "grad_norm": 0.8054488897323608, "learning_rate": 5.8473180377816017e-08, "loss": 0.0899, "num_input_tokens_seen": 50729760, "step": 39130 }, { "epoch": 1.9121735519018883, "grad_norm": 0.19020412862300873, "learning_rate": 5.780403149272251e-08, "loss": 0.0698, "num_input_tokens_seen": 50736416, "step": 39135 }, { "epoch": 1.9124178535655827, "grad_norm": 0.2687043249607086, "learning_rate": 5.7138729007694126e-08, "loss": 0.0766, "num_input_tokens_seen": 50743168, "step": 39140 }, { "epoch": 1.912662155229277, "grad_norm": 0.5238874554634094, "learning_rate": 5.64772730253238e-08, "loss": 0.0971, "num_input_tokens_seen": 50749344, "step": 39145 }, { "epoch": 1.9129064568929715, "grad_norm": 0.3545820713043213, "learning_rate": 5.5819663647618814e-08, "loss": 0.0804, "num_input_tokens_seen": 50755744, "step": 39150 }, { "epoch": 1.9131507585566658, "grad_norm": 0.1792769879102707, "learning_rate": 5.5165900975989723e-08, "loss": 0.0721, "num_input_tokens_seen": 50761792, "step": 39155 }, { "epoch": 1.9133950602203602, "grad_norm": 0.3657301068305969, "learning_rate": 5.451598511125311e-08, "loss": 0.0736, "num_input_tokens_seen": 50768192, "step": 39160 }, { "epoch": 1.9136393618840546, "grad_norm": 0.536632776260376, "learning_rate": 5.3869916153637124e-08, "loss": 0.0819, "num_input_tokens_seen": 50774496, "step": 39165 }, { "epoch": 1.9138836635477487, "grad_norm": 0.3351440727710724, "learning_rate": 5.322769420277318e-08, "loss": 0.0779, "num_input_tokens_seen": 50781120, "step": 39170 }, { "epoch": 1.9141279652114431, "grad_norm": 0.4370250403881073, "learning_rate": 5.258931935769873e-08, "loss": 0.0898, "num_input_tokens_seen": 50787872, "step": 39175 }, { "epoch": 1.9143722668751373, "grad_norm": 0.23276248574256897, "learning_rate": 5.19547917168628e-08, "loss": 0.0671, "num_input_tokens_seen": 50794816, "step": 39180 }, { "epoch": 1.9146165685388317, "grad_norm": 0.3368016481399536, "learning_rate": 5.13241113781121e-08, "loss": 0.1062, "num_input_tokens_seen": 50801184, "step": 39185 }, { "epoch": 1.914860870202526, "grad_norm": 0.24073457717895508, "learning_rate": 5.0697278438707755e-08, "loss": 0.0725, "num_input_tokens_seen": 50807744, "step": 39190 }, { "epoch": 1.9151051718662204, "grad_norm": 0.21872900426387787, "learning_rate": 5.0074292995316854e-08, "loss": 0.1143, "num_input_tokens_seen": 50813984, "step": 39195 }, { "epoch": 1.9153494735299148, "grad_norm": 0.13787643611431122, "learning_rate": 4.945515514400978e-08, "loss": 0.0699, "num_input_tokens_seen": 50820416, "step": 39200 }, { "epoch": 1.9153494735299148, "eval_loss": 0.0871947631239891, "eval_runtime": 374.6806, "eval_samples_per_second": 97.109, "eval_steps_per_second": 24.279, "num_input_tokens_seen": 50820416, "step": 39200 }, { "epoch": 1.9155937751936092, "grad_norm": 0.17128336429595947, "learning_rate": 4.883986498026571e-08, "loss": 0.0942, "num_input_tokens_seen": 50826816, "step": 39205 }, { "epoch": 1.9158380768573036, "grad_norm": 0.44175130128860474, "learning_rate": 4.822842259896987e-08, "loss": 0.0937, "num_input_tokens_seen": 50833376, "step": 39210 }, { "epoch": 1.9160823785209977, "grad_norm": 0.3847131133079529, "learning_rate": 4.762082809441626e-08, "loss": 0.0919, "num_input_tokens_seen": 50839936, "step": 39215 }, { "epoch": 1.916326680184692, "grad_norm": 0.5565012693405151, "learning_rate": 4.7017081560302156e-08, "loss": 0.1067, "num_input_tokens_seen": 50846208, "step": 39220 }, { "epoch": 1.9165709818483863, "grad_norm": 0.17433275282382965, "learning_rate": 4.6417183089730866e-08, "loss": 0.0859, "num_input_tokens_seen": 50852864, "step": 39225 }, { "epoch": 1.9168152835120806, "grad_norm": 0.43637049198150635, "learning_rate": 4.5821132775217265e-08, "loss": 0.0911, "num_input_tokens_seen": 50859232, "step": 39230 }, { "epoch": 1.917059585175775, "grad_norm": 0.3817993402481079, "learning_rate": 4.5228930708679504e-08, "loss": 0.0804, "num_input_tokens_seen": 50865856, "step": 39235 }, { "epoch": 1.9173038868394694, "grad_norm": 0.180390864610672, "learning_rate": 4.464057698144175e-08, "loss": 0.0928, "num_input_tokens_seen": 50872192, "step": 39240 }, { "epoch": 1.9175481885031638, "grad_norm": 0.36266979575157166, "learning_rate": 4.4056071684236974e-08, "loss": 0.1077, "num_input_tokens_seen": 50878752, "step": 39245 }, { "epoch": 1.9177924901668582, "grad_norm": 0.2605377435684204, "learning_rate": 4.347541490719864e-08, "loss": 0.0981, "num_input_tokens_seen": 50885280, "step": 39250 }, { "epoch": 1.9180367918305523, "grad_norm": 0.31373000144958496, "learning_rate": 4.2898606739877336e-08, "loss": 0.117, "num_input_tokens_seen": 50891616, "step": 39255 }, { "epoch": 1.9182810934942467, "grad_norm": 0.40155449509620667, "learning_rate": 4.232564727122135e-08, "loss": 0.0843, "num_input_tokens_seen": 50898048, "step": 39260 }, { "epoch": 1.918525395157941, "grad_norm": 0.1271088421344757, "learning_rate": 4.1756536589585004e-08, "loss": 0.0976, "num_input_tokens_seen": 50904768, "step": 39265 }, { "epoch": 1.9187696968216352, "grad_norm": 0.18099653720855713, "learning_rate": 4.119127478273976e-08, "loss": 0.0902, "num_input_tokens_seen": 50911360, "step": 39270 }, { "epoch": 1.9190139984853296, "grad_norm": 0.18802395462989807, "learning_rate": 4.062986193784923e-08, "loss": 0.1122, "num_input_tokens_seen": 50917856, "step": 39275 }, { "epoch": 1.919258300149024, "grad_norm": 0.2758532166481018, "learning_rate": 4.007229814149416e-08, "loss": 0.0867, "num_input_tokens_seen": 50924416, "step": 39280 }, { "epoch": 1.9195026018127184, "grad_norm": 0.36057600378990173, "learning_rate": 3.951858347965576e-08, "loss": 0.0955, "num_input_tokens_seen": 50930560, "step": 39285 }, { "epoch": 1.9197469034764127, "grad_norm": 0.1455266773700714, "learning_rate": 3.896871803772684e-08, "loss": 0.0555, "num_input_tokens_seen": 50936928, "step": 39290 }, { "epoch": 1.9199912051401071, "grad_norm": 0.47721290588378906, "learning_rate": 3.842270190050068e-08, "loss": 0.0905, "num_input_tokens_seen": 50943392, "step": 39295 }, { "epoch": 1.9202355068038013, "grad_norm": 0.4839467704296112, "learning_rate": 3.7880535152179376e-08, "loss": 0.086, "num_input_tokens_seen": 50950240, "step": 39300 }, { "epoch": 1.9204798084674957, "grad_norm": 0.4953743517398834, "learning_rate": 3.734221787637382e-08, "loss": 0.0758, "num_input_tokens_seen": 50956960, "step": 39305 }, { "epoch": 1.92072411013119, "grad_norm": 0.2723410725593567, "learning_rate": 3.680775015609817e-08, "loss": 0.0707, "num_input_tokens_seen": 50963648, "step": 39310 }, { "epoch": 1.9209684117948842, "grad_norm": 0.2099589705467224, "learning_rate": 3.627713207377537e-08, "loss": 0.0781, "num_input_tokens_seen": 50969824, "step": 39315 }, { "epoch": 1.9212127134585786, "grad_norm": 0.32724863290786743, "learning_rate": 3.575036371123164e-08, "loss": 0.0931, "num_input_tokens_seen": 50976448, "step": 39320 }, { "epoch": 1.921457015122273, "grad_norm": 0.5048016905784607, "learning_rate": 3.5227445149704776e-08, "loss": 0.0874, "num_input_tokens_seen": 50982784, "step": 39325 }, { "epoch": 1.9217013167859673, "grad_norm": 0.34137141704559326, "learning_rate": 3.470837646983027e-08, "loss": 0.1033, "num_input_tokens_seen": 50989088, "step": 39330 }, { "epoch": 1.9219456184496617, "grad_norm": 0.22617271542549133, "learning_rate": 3.419315775165799e-08, "loss": 0.0949, "num_input_tokens_seen": 50995744, "step": 39335 }, { "epoch": 1.922189920113356, "grad_norm": 0.341232568025589, "learning_rate": 3.368178907464103e-08, "loss": 0.0638, "num_input_tokens_seen": 51001984, "step": 39340 }, { "epoch": 1.9224342217770503, "grad_norm": 0.14139008522033691, "learning_rate": 3.317427051763855e-08, "loss": 0.0894, "num_input_tokens_seen": 51008416, "step": 39345 }, { "epoch": 1.9226785234407446, "grad_norm": 0.12648577988147736, "learning_rate": 3.267060215891571e-08, "loss": 0.0671, "num_input_tokens_seen": 51014880, "step": 39350 }, { "epoch": 1.922922825104439, "grad_norm": 0.3684879541397095, "learning_rate": 3.217078407614649e-08, "loss": 0.0627, "num_input_tokens_seen": 51021504, "step": 39355 }, { "epoch": 1.9231671267681332, "grad_norm": 0.3695577383041382, "learning_rate": 3.1674816346405345e-08, "loss": 0.0653, "num_input_tokens_seen": 51028736, "step": 39360 }, { "epoch": 1.9234114284318276, "grad_norm": 0.17767414450645447, "learning_rate": 3.11826990461811e-08, "loss": 0.0875, "num_input_tokens_seen": 51034944, "step": 39365 }, { "epoch": 1.923655730095522, "grad_norm": 0.37930506467819214, "learning_rate": 3.069443225136304e-08, "loss": 0.0778, "num_input_tokens_seen": 51041376, "step": 39370 }, { "epoch": 1.9239000317592163, "grad_norm": 0.19008009135723114, "learning_rate": 3.021001603724372e-08, "loss": 0.0823, "num_input_tokens_seen": 51047488, "step": 39375 }, { "epoch": 1.9241443334229107, "grad_norm": 0.4790518283843994, "learning_rate": 2.9729450478532818e-08, "loss": 0.0921, "num_input_tokens_seen": 51054112, "step": 39380 }, { "epoch": 1.924388635086605, "grad_norm": 1.002664566040039, "learning_rate": 2.9252735649337726e-08, "loss": 0.086, "num_input_tokens_seen": 51060544, "step": 39385 }, { "epoch": 1.9246329367502992, "grad_norm": 0.5474033355712891, "learning_rate": 2.8779871623171863e-08, "loss": 0.0737, "num_input_tokens_seen": 51067008, "step": 39390 }, { "epoch": 1.9248772384139936, "grad_norm": 0.11335186660289764, "learning_rate": 2.8310858472957448e-08, "loss": 0.0903, "num_input_tokens_seen": 51073696, "step": 39395 }, { "epoch": 1.9251215400776878, "grad_norm": 0.23225383460521698, "learning_rate": 2.784569627101996e-08, "loss": 0.0897, "num_input_tokens_seen": 51080832, "step": 39400 }, { "epoch": 1.9251215400776878, "eval_loss": 0.08718777447938919, "eval_runtime": 374.9304, "eval_samples_per_second": 97.045, "eval_steps_per_second": 24.263, "num_input_tokens_seen": 51080832, "step": 39400 }, { "epoch": 1.9253658417413821, "grad_norm": 0.214286208152771, "learning_rate": 2.738438508909924e-08, "loss": 0.084, "num_input_tokens_seen": 51087200, "step": 39405 }, { "epoch": 1.9256101434050765, "grad_norm": 0.21833884716033936, "learning_rate": 2.692692499833005e-08, "loss": 0.0511, "num_input_tokens_seen": 51093600, "step": 39410 }, { "epoch": 1.925854445068771, "grad_norm": 0.6320027709007263, "learning_rate": 2.647331606926151e-08, "loss": 0.0975, "num_input_tokens_seen": 51099936, "step": 39415 }, { "epoch": 1.9260987467324653, "grad_norm": 0.24637684226036072, "learning_rate": 2.6023558371843225e-08, "loss": 0.0792, "num_input_tokens_seen": 51106336, "step": 39420 }, { "epoch": 1.9263430483961597, "grad_norm": 0.6314916610717773, "learning_rate": 2.557765197543638e-08, "loss": 0.1085, "num_input_tokens_seen": 51112864, "step": 39425 }, { "epoch": 1.926587350059854, "grad_norm": 0.2062954306602478, "learning_rate": 2.513559694880263e-08, "loss": 0.0883, "num_input_tokens_seen": 51119552, "step": 39430 }, { "epoch": 1.9268316517235482, "grad_norm": 0.30912312865257263, "learning_rate": 2.469739336011523e-08, "loss": 0.0667, "num_input_tokens_seen": 51125696, "step": 39435 }, { "epoch": 1.9270759533872426, "grad_norm": 0.8337470889091492, "learning_rate": 2.4263041276947894e-08, "loss": 0.0802, "num_input_tokens_seen": 51132192, "step": 39440 }, { "epoch": 1.9273202550509367, "grad_norm": 0.4673660397529602, "learning_rate": 2.3832540766283164e-08, "loss": 0.0814, "num_input_tokens_seen": 51138880, "step": 39445 }, { "epoch": 1.9275645567146311, "grad_norm": 0.5733474493026733, "learning_rate": 2.3405891894512366e-08, "loss": 0.1203, "num_input_tokens_seen": 51145312, "step": 39450 }, { "epoch": 1.9278088583783255, "grad_norm": 0.3498592674732208, "learning_rate": 2.29830947274301e-08, "loss": 0.0811, "num_input_tokens_seen": 51151808, "step": 39455 }, { "epoch": 1.9280531600420199, "grad_norm": 0.17321285605430603, "learning_rate": 2.2564149330231432e-08, "loss": 0.0878, "num_input_tokens_seen": 51158592, "step": 39460 }, { "epoch": 1.9282974617057143, "grad_norm": 0.35083264112472534, "learning_rate": 2.2149055767528572e-08, "loss": 0.0887, "num_input_tokens_seen": 51165024, "step": 39465 }, { "epoch": 1.9285417633694086, "grad_norm": 0.20198187232017517, "learning_rate": 2.1737814103334197e-08, "loss": 0.0817, "num_input_tokens_seen": 51171680, "step": 39470 }, { "epoch": 1.928786065033103, "grad_norm": 0.8043622374534607, "learning_rate": 2.1330424401064253e-08, "loss": 0.1056, "num_input_tokens_seen": 51178176, "step": 39475 }, { "epoch": 1.9290303666967972, "grad_norm": 0.5110102891921997, "learning_rate": 2.092688672354348e-08, "loss": 0.0967, "num_input_tokens_seen": 51184736, "step": 39480 }, { "epoch": 1.9292746683604916, "grad_norm": 0.45259329676628113, "learning_rate": 2.0527201133005435e-08, "loss": 0.0946, "num_input_tokens_seen": 51191328, "step": 39485 }, { "epoch": 1.9295189700241857, "grad_norm": 0.24718967080116272, "learning_rate": 2.0131367691084148e-08, "loss": 0.0935, "num_input_tokens_seen": 51197792, "step": 39490 }, { "epoch": 1.92976327168788, "grad_norm": 0.478089839220047, "learning_rate": 1.9739386458819675e-08, "loss": 0.0967, "num_input_tokens_seen": 51204416, "step": 39495 }, { "epoch": 1.9300075733515745, "grad_norm": 0.24600552022457123, "learning_rate": 1.9351257496666442e-08, "loss": 0.0851, "num_input_tokens_seen": 51210976, "step": 39500 }, { "epoch": 1.9302518750152688, "grad_norm": 0.6340338587760925, "learning_rate": 1.896698086447657e-08, "loss": 0.1117, "num_input_tokens_seen": 51217120, "step": 39505 }, { "epoch": 1.9304961766789632, "grad_norm": 0.6978753805160522, "learning_rate": 1.8586556621505436e-08, "loss": 0.0885, "num_input_tokens_seen": 51223488, "step": 39510 }, { "epoch": 1.9307404783426576, "grad_norm": 0.15306347608566284, "learning_rate": 1.820998482642833e-08, "loss": 0.0899, "num_input_tokens_seen": 51229504, "step": 39515 }, { "epoch": 1.930984780006352, "grad_norm": 0.16414600610733032, "learning_rate": 1.7837265537309912e-08, "loss": 0.0963, "num_input_tokens_seen": 51235904, "step": 39520 }, { "epoch": 1.9312290816700461, "grad_norm": 0.3533482551574707, "learning_rate": 1.7468398811629206e-08, "loss": 0.0746, "num_input_tokens_seen": 51242528, "step": 39525 }, { "epoch": 1.9314733833337405, "grad_norm": 0.6152332425117493, "learning_rate": 1.710338470627404e-08, "loss": 0.0925, "num_input_tokens_seen": 51249536, "step": 39530 }, { "epoch": 1.9317176849974347, "grad_norm": 0.22661958634853363, "learning_rate": 1.6742223277529945e-08, "loss": 0.0859, "num_input_tokens_seen": 51255936, "step": 39535 }, { "epoch": 1.931961986661129, "grad_norm": 0.44696447253227234, "learning_rate": 1.6384914581094036e-08, "loss": 0.0826, "num_input_tokens_seen": 51262624, "step": 39540 }, { "epoch": 1.9322062883248234, "grad_norm": 0.408503919839859, "learning_rate": 1.6031458672069455e-08, "loss": 0.0874, "num_input_tokens_seen": 51269568, "step": 39545 }, { "epoch": 1.9324505899885178, "grad_norm": 0.20423145592212677, "learning_rate": 1.5681855604962602e-08, "loss": 0.0748, "num_input_tokens_seen": 51275616, "step": 39550 }, { "epoch": 1.9326948916522122, "grad_norm": 0.3819074332714081, "learning_rate": 1.5336105433683135e-08, "loss": 0.0661, "num_input_tokens_seen": 51282080, "step": 39555 }, { "epoch": 1.9329391933159066, "grad_norm": 0.3668416142463684, "learning_rate": 1.499420821155506e-08, "loss": 0.1323, "num_input_tokens_seen": 51288928, "step": 39560 }, { "epoch": 1.933183494979601, "grad_norm": 0.4885484576225281, "learning_rate": 1.4656163991302874e-08, "loss": 0.0585, "num_input_tokens_seen": 51295168, "step": 39565 }, { "epoch": 1.9334277966432951, "grad_norm": 0.194198340177536, "learning_rate": 1.4321972825051544e-08, "loss": 0.1107, "num_input_tokens_seen": 51301568, "step": 39570 }, { "epoch": 1.9336720983069895, "grad_norm": 0.16971175372600555, "learning_rate": 1.3991634764345951e-08, "loss": 0.0871, "num_input_tokens_seen": 51308000, "step": 39575 }, { "epoch": 1.9339163999706837, "grad_norm": 0.5198916792869568, "learning_rate": 1.3665149860120352e-08, "loss": 0.0815, "num_input_tokens_seen": 51314208, "step": 39580 }, { "epoch": 1.934160701634378, "grad_norm": 0.6944020986557007, "learning_rate": 1.3342518162728912e-08, "loss": 0.0944, "num_input_tokens_seen": 51320384, "step": 39585 }, { "epoch": 1.9344050032980724, "grad_norm": 0.22157490253448486, "learning_rate": 1.30237397219235e-08, "loss": 0.0827, "num_input_tokens_seen": 51326816, "step": 39590 }, { "epoch": 1.9346493049617668, "grad_norm": 0.7168896198272705, "learning_rate": 1.2708814586862016e-08, "loss": 0.0979, "num_input_tokens_seen": 51332960, "step": 39595 }, { "epoch": 1.9348936066254612, "grad_norm": 0.4577750265598297, "learning_rate": 1.2397742806111168e-08, "loss": 0.0761, "num_input_tokens_seen": 51339424, "step": 39600 }, { "epoch": 1.9348936066254612, "eval_loss": 0.08725996315479279, "eval_runtime": 375.1701, "eval_samples_per_second": 96.983, "eval_steps_per_second": 24.248, "num_input_tokens_seen": 51339424, "step": 39600 }, { "epoch": 1.9351379082891556, "grad_norm": 0.19772803783416748, "learning_rate": 1.209052442764369e-08, "loss": 0.0726, "num_input_tokens_seen": 51346080, "step": 39605 }, { "epoch": 1.93538220995285, "grad_norm": 0.15421828627586365, "learning_rate": 1.17871594988328e-08, "loss": 0.0778, "num_input_tokens_seen": 51352640, "step": 39610 }, { "epoch": 1.935626511616544, "grad_norm": 0.4504114091396332, "learning_rate": 1.1487648066466072e-08, "loss": 0.0923, "num_input_tokens_seen": 51358912, "step": 39615 }, { "epoch": 1.9358708132802385, "grad_norm": 0.2988828122615814, "learning_rate": 1.1191990176728784e-08, "loss": 0.0727, "num_input_tokens_seen": 51365760, "step": 39620 }, { "epoch": 1.9361151149439326, "grad_norm": 0.2055889070034027, "learning_rate": 1.0900185875215018e-08, "loss": 0.1062, "num_input_tokens_seen": 51372128, "step": 39625 }, { "epoch": 1.936359416607627, "grad_norm": 0.19187863171100616, "learning_rate": 1.0612235206924891e-08, "loss": 0.0976, "num_input_tokens_seen": 51378656, "step": 39630 }, { "epoch": 1.9366037182713214, "grad_norm": 0.4169623553752899, "learning_rate": 1.0328138216264549e-08, "loss": 0.0547, "num_input_tokens_seen": 51385120, "step": 39635 }, { "epoch": 1.9368480199350158, "grad_norm": 0.18671023845672607, "learning_rate": 1.004789494704339e-08, "loss": 0.0831, "num_input_tokens_seen": 51391328, "step": 39640 }, { "epoch": 1.9370923215987101, "grad_norm": 0.8174832463264465, "learning_rate": 9.771505442482397e-09, "loss": 0.1041, "num_input_tokens_seen": 51397472, "step": 39645 }, { "epoch": 1.9373366232624045, "grad_norm": 0.206880122423172, "learning_rate": 9.498969745200259e-09, "loss": 0.0952, "num_input_tokens_seen": 51403296, "step": 39650 }, { "epoch": 1.937580924926099, "grad_norm": 0.5434656143188477, "learning_rate": 9.230287897230017e-09, "loss": 0.0851, "num_input_tokens_seen": 51409408, "step": 39655 }, { "epoch": 1.937825226589793, "grad_norm": 0.5509332418441772, "learning_rate": 8.965459940002419e-09, "loss": 0.0936, "num_input_tokens_seen": 51416480, "step": 39660 }, { "epoch": 1.9380695282534874, "grad_norm": 0.20928403735160828, "learning_rate": 8.704485914357019e-09, "loss": 0.0986, "num_input_tokens_seen": 51423296, "step": 39665 }, { "epoch": 1.9383138299171816, "grad_norm": 0.40717384219169617, "learning_rate": 8.447365860539402e-09, "loss": 0.1012, "num_input_tokens_seen": 51429760, "step": 39670 }, { "epoch": 1.938558131580876, "grad_norm": 0.18343065679073334, "learning_rate": 8.194099818201184e-09, "loss": 0.0869, "num_input_tokens_seen": 51436160, "step": 39675 }, { "epoch": 1.9388024332445704, "grad_norm": 0.20102864503860474, "learning_rate": 7.944687826400011e-09, "loss": 0.0772, "num_input_tokens_seen": 51442848, "step": 39680 }, { "epoch": 1.9390467349082647, "grad_norm": 0.4244809150695801, "learning_rate": 7.699129923599557e-09, "loss": 0.0847, "num_input_tokens_seen": 51448896, "step": 39685 }, { "epoch": 1.9392910365719591, "grad_norm": 0.5609836578369141, "learning_rate": 7.457426147663982e-09, "loss": 0.073, "num_input_tokens_seen": 51455648, "step": 39690 }, { "epoch": 1.9395353382356535, "grad_norm": 0.7190710306167603, "learning_rate": 7.219576535871797e-09, "loss": 0.0906, "num_input_tokens_seen": 51462528, "step": 39695 }, { "epoch": 1.9397796398993479, "grad_norm": 0.17212232947349548, "learning_rate": 6.985581124896445e-09, "loss": 0.0624, "num_input_tokens_seen": 51469248, "step": 39700 }, { "epoch": 1.940023941563042, "grad_norm": 0.15520502626895905, "learning_rate": 6.755439950828501e-09, "loss": 0.071, "num_input_tokens_seen": 51476384, "step": 39705 }, { "epoch": 1.9402682432267364, "grad_norm": 0.6535587310791016, "learning_rate": 6.5291530491562444e-09, "loss": 0.0727, "num_input_tokens_seen": 51482624, "step": 39710 }, { "epoch": 1.9405125448904306, "grad_norm": 0.5450043678283691, "learning_rate": 6.3067204547739845e-09, "loss": 0.1097, "num_input_tokens_seen": 51488928, "step": 39715 }, { "epoch": 1.940756846554125, "grad_norm": 0.14579935371875763, "learning_rate": 6.088142201987612e-09, "loss": 0.1202, "num_input_tokens_seen": 51494944, "step": 39720 }, { "epoch": 1.9410011482178193, "grad_norm": 0.2087937444448471, "learning_rate": 5.873418324503499e-09, "loss": 0.0738, "num_input_tokens_seen": 51501408, "step": 39725 }, { "epoch": 1.9412454498815137, "grad_norm": 0.6106967926025391, "learning_rate": 5.6625488554340465e-09, "loss": 0.0768, "num_input_tokens_seen": 51507616, "step": 39730 }, { "epoch": 1.941489751545208, "grad_norm": 0.18360203504562378, "learning_rate": 5.455533827297688e-09, "loss": 0.0735, "num_input_tokens_seen": 51514144, "step": 39735 }, { "epoch": 1.9417340532089025, "grad_norm": 0.18340735137462616, "learning_rate": 5.252373272018885e-09, "loss": 0.1011, "num_input_tokens_seen": 51520128, "step": 39740 }, { "epoch": 1.9419783548725968, "grad_norm": 0.15575405955314636, "learning_rate": 5.053067220925356e-09, "loss": 0.085, "num_input_tokens_seen": 51526656, "step": 39745 }, { "epoch": 1.942222656536291, "grad_norm": 0.17905490100383759, "learning_rate": 4.857615704759177e-09, "loss": 0.0927, "num_input_tokens_seen": 51532992, "step": 39750 }, { "epoch": 1.9424669581999854, "grad_norm": 0.33199936151504517, "learning_rate": 4.666018753654577e-09, "loss": 0.0846, "num_input_tokens_seen": 51539456, "step": 39755 }, { "epoch": 1.9427112598636795, "grad_norm": 0.2021709382534027, "learning_rate": 4.478276397162917e-09, "loss": 0.0829, "num_input_tokens_seen": 51545600, "step": 39760 }, { "epoch": 1.942955561527374, "grad_norm": 0.1875518262386322, "learning_rate": 4.294388664233262e-09, "loss": 0.0834, "num_input_tokens_seen": 51551808, "step": 39765 }, { "epoch": 1.9431998631910683, "grad_norm": 0.2440664917230606, "learning_rate": 4.114355583223484e-09, "loss": 0.0987, "num_input_tokens_seen": 51558208, "step": 39770 }, { "epoch": 1.9434441648547627, "grad_norm": 0.09659475088119507, "learning_rate": 3.9381771818974845e-09, "loss": 0.0783, "num_input_tokens_seen": 51564832, "step": 39775 }, { "epoch": 1.943688466518457, "grad_norm": 0.8055664300918579, "learning_rate": 3.765853487427973e-09, "loss": 0.0917, "num_input_tokens_seen": 51570944, "step": 39780 }, { "epoch": 1.9439327681821514, "grad_norm": 0.7074999213218689, "learning_rate": 3.5973845263825857e-09, "loss": 0.0907, "num_input_tokens_seen": 51577248, "step": 39785 }, { "epoch": 1.9441770698458456, "grad_norm": 0.24796298146247864, "learning_rate": 3.4327703247488684e-09, "loss": 0.0614, "num_input_tokens_seen": 51584000, "step": 39790 }, { "epoch": 1.94442137150954, "grad_norm": 0.5671608448028564, "learning_rate": 3.2720109079037443e-09, "loss": 0.0923, "num_input_tokens_seen": 51590464, "step": 39795 }, { "epoch": 1.9446656731732344, "grad_norm": 0.6156516671180725, "learning_rate": 3.1151063006468193e-09, "loss": 0.077, "num_input_tokens_seen": 51597120, "step": 39800 }, { "epoch": 1.9446656731732344, "eval_loss": 0.08723117411136627, "eval_runtime": 374.7869, "eval_samples_per_second": 97.082, "eval_steps_per_second": 24.272, "num_input_tokens_seen": 51597120, "step": 39800 }, { "epoch": 1.9449099748369285, "grad_norm": 0.12288206815719604, "learning_rate": 2.962056527169854e-09, "loss": 0.0683, "num_input_tokens_seen": 51604032, "step": 39805 }, { "epoch": 1.945154276500623, "grad_norm": 0.4241328835487366, "learning_rate": 2.8128616110761898e-09, "loss": 0.085, "num_input_tokens_seen": 51610464, "step": 39810 }, { "epoch": 1.9453985781643173, "grad_norm": 0.5323235392570496, "learning_rate": 2.6675215753724223e-09, "loss": 0.0949, "num_input_tokens_seen": 51616928, "step": 39815 }, { "epoch": 1.9456428798280117, "grad_norm": 0.380386084318161, "learning_rate": 2.5260364424739557e-09, "loss": 0.0874, "num_input_tokens_seen": 51623520, "step": 39820 }, { "epoch": 1.945887181491706, "grad_norm": 0.4714759290218353, "learning_rate": 2.3884062341994475e-09, "loss": 0.0902, "num_input_tokens_seen": 51630240, "step": 39825 }, { "epoch": 1.9461314831554004, "grad_norm": 0.3644449710845947, "learning_rate": 2.25463097177081e-09, "loss": 0.125, "num_input_tokens_seen": 51637152, "step": 39830 }, { "epoch": 1.9463757848190946, "grad_norm": 0.5102726221084595, "learning_rate": 2.1247106758215397e-09, "loss": 0.0593, "num_input_tokens_seen": 51643648, "step": 39835 }, { "epoch": 1.946620086482789, "grad_norm": 0.3707234859466553, "learning_rate": 1.998645366382834e-09, "loss": 0.0884, "num_input_tokens_seen": 51650272, "step": 39840 }, { "epoch": 1.9468643881464833, "grad_norm": 0.37544187903404236, "learning_rate": 1.876435062897475e-09, "loss": 0.1198, "num_input_tokens_seen": 51657024, "step": 39845 }, { "epoch": 1.9471086898101775, "grad_norm": 0.40341466665267944, "learning_rate": 1.758079784211497e-09, "loss": 0.0912, "num_input_tokens_seen": 51663424, "step": 39850 }, { "epoch": 1.9473529914738719, "grad_norm": 0.33903464674949646, "learning_rate": 1.6435795485797434e-09, "loss": 0.0806, "num_input_tokens_seen": 51669824, "step": 39855 }, { "epoch": 1.9475972931375662, "grad_norm": 0.6147236227989197, "learning_rate": 1.5329343736547596e-09, "loss": 0.1017, "num_input_tokens_seen": 51676448, "step": 39860 }, { "epoch": 1.9478415948012606, "grad_norm": 0.2427082508802414, "learning_rate": 1.4261442765006739e-09, "loss": 0.0756, "num_input_tokens_seen": 51683328, "step": 39865 }, { "epoch": 1.948085896464955, "grad_norm": 0.2424759864807129, "learning_rate": 1.3232092735876445e-09, "loss": 0.0855, "num_input_tokens_seen": 51689248, "step": 39870 }, { "epoch": 1.9483301981286494, "grad_norm": 0.23103713989257812, "learning_rate": 1.2241293807918607e-09, "loss": 0.0927, "num_input_tokens_seen": 51695968, "step": 39875 }, { "epoch": 1.9485744997923435, "grad_norm": 0.3399740755558014, "learning_rate": 1.128904613387216e-09, "loss": 0.0616, "num_input_tokens_seen": 51702816, "step": 39880 }, { "epoch": 1.948818801456038, "grad_norm": 0.962172269821167, "learning_rate": 1.0375349860591853e-09, "loss": 0.0802, "num_input_tokens_seen": 51709184, "step": 39885 }, { "epoch": 1.9490631031197323, "grad_norm": 0.6676482558250427, "learning_rate": 9.5002051290205e-10, "loss": 0.1086, "num_input_tokens_seen": 51715904, "step": 39890 }, { "epoch": 1.9493074047834265, "grad_norm": 1.4402109384536743, "learning_rate": 8.663612074077954e-10, "loss": 0.1014, "num_input_tokens_seen": 51722016, "step": 39895 }, { "epoch": 1.9495517064471208, "grad_norm": 0.49563494324684143, "learning_rate": 7.865570824799884e-10, "loss": 0.0606, "num_input_tokens_seen": 51728128, "step": 39900 }, { "epoch": 1.9497960081108152, "grad_norm": 0.6127286553382874, "learning_rate": 7.106081504254514e-10, "loss": 0.0649, "num_input_tokens_seen": 51734368, "step": 39905 }, { "epoch": 1.9500403097745096, "grad_norm": 0.2564631700515747, "learning_rate": 6.385144229570372e-10, "loss": 0.081, "num_input_tokens_seen": 51740768, "step": 39910 }, { "epoch": 1.950284611438204, "grad_norm": 1.5994642972946167, "learning_rate": 5.70275911190854e-10, "loss": 0.0991, "num_input_tokens_seen": 51748000, "step": 39915 }, { "epoch": 1.9505289131018984, "grad_norm": 0.28941357135772705, "learning_rate": 5.058926256490403e-10, "loss": 0.0834, "num_input_tokens_seen": 51754432, "step": 39920 }, { "epoch": 1.9507732147655925, "grad_norm": 0.4454386234283447, "learning_rate": 4.4536457626254134e-10, "loss": 0.0551, "num_input_tokens_seen": 51761120, "step": 39925 }, { "epoch": 1.951017516429287, "grad_norm": 0.224330335855484, "learning_rate": 3.88691772365557e-10, "loss": 0.1012, "num_input_tokens_seen": 51767712, "step": 39930 }, { "epoch": 1.951261818092981, "grad_norm": 0.4696376323699951, "learning_rate": 3.358742226955425e-10, "loss": 0.0916, "num_input_tokens_seen": 51773824, "step": 39935 }, { "epoch": 1.9515061197566754, "grad_norm": 0.2139567732810974, "learning_rate": 2.8691193539875925e-10, "loss": 0.086, "num_input_tokens_seen": 51780064, "step": 39940 }, { "epoch": 1.9517504214203698, "grad_norm": 0.7822788953781128, "learning_rate": 2.418049180274995e-10, "loss": 0.0958, "num_input_tokens_seen": 51786752, "step": 39945 }, { "epoch": 1.9519947230840642, "grad_norm": 0.6027117967605591, "learning_rate": 2.005531775373104e-10, "loss": 0.0896, "num_input_tokens_seen": 51793056, "step": 39950 }, { "epoch": 1.9522390247477586, "grad_norm": 0.43978872895240784, "learning_rate": 1.6315672028699435e-10, "loss": 0.0908, "num_input_tokens_seen": 51799552, "step": 39955 }, { "epoch": 1.952483326411453, "grad_norm": 0.4490887224674225, "learning_rate": 1.2961555204693555e-10, "loss": 0.0827, "num_input_tokens_seen": 51805824, "step": 39960 }, { "epoch": 1.9527276280751473, "grad_norm": 0.49677330255508423, "learning_rate": 9.992967798799768e-11, "loss": 0.0925, "num_input_tokens_seen": 51812544, "step": 39965 }, { "epoch": 1.9529719297388415, "grad_norm": 0.22032031416893005, "learning_rate": 7.409910268707521e-11, "loss": 0.0544, "num_input_tokens_seen": 51819520, "step": 39970 }, { "epoch": 1.9532162314025359, "grad_norm": 0.245782732963562, "learning_rate": 5.212383012986877e-11, "loss": 0.0668, "num_input_tokens_seen": 51826336, "step": 39975 }, { "epoch": 1.95346053306623, "grad_norm": 0.28575921058654785, "learning_rate": 3.400386370533415e-11, "loss": 0.1011, "num_input_tokens_seen": 51833216, "step": 39980 }, { "epoch": 1.9537048347299244, "grad_norm": 0.34739577770233154, "learning_rate": 1.9739206205682258e-11, "loss": 0.1078, "num_input_tokens_seen": 51840096, "step": 39985 }, { "epoch": 1.9539491363936188, "grad_norm": 0.17574380338191986, "learning_rate": 9.329859829154685e-12, "loss": 0.0734, "num_input_tokens_seen": 51846688, "step": 39990 }, { "epoch": 1.9541934380573132, "grad_norm": 0.19698283076286316, "learning_rate": 2.7758261855748148e-12, "loss": 0.0811, "num_input_tokens_seen": 51852640, "step": 39995 }, { "epoch": 1.9544377397210075, "grad_norm": 0.5411603450775146, "learning_rate": 7.710628524559838e-14, "loss": 0.071, "num_input_tokens_seen": 51858816, "step": 40000 }, { "epoch": 1.9544377397210075, "eval_loss": 0.08723115921020508, "eval_runtime": 374.5085, "eval_samples_per_second": 97.154, "eval_steps_per_second": 24.291, "num_input_tokens_seen": 51858816, "step": 40000 }, { "epoch": 1.9544377397210075, "num_input_tokens_seen": 51858816, "step": 40000, "total_flos": 2.171900184749015e+17, "train_loss": 0.09543592471405864, "train_runtime": 100838.3205, "train_samples_per_second": 6.347, "train_steps_per_second": 0.397 } ], "logging_steps": 5, "max_steps": 40000, "num_input_tokens_seen": 51858816, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.171900184749015e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }