{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.975144705481783, "eval_steps": 1000, "global_step": 3670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027238678924072182, "grad_norm": 94.36308714522585, "learning_rate": 2.4523160762942784e-07, "loss": 3.2941, "step": 10 }, { "epoch": 0.054477357848144364, "grad_norm": 72.09113030659383, "learning_rate": 5.177111716621253e-07, "loss": 2.9481, "step": 20 }, { "epoch": 0.08171603677221655, "grad_norm": 18.50935923305202, "learning_rate": 7.90190735694823e-07, "loss": 1.921, "step": 30 }, { "epoch": 0.10895471569628873, "grad_norm": 8.59280831709028, "learning_rate": 1.0626702997275206e-06, "loss": 1.5096, "step": 40 }, { "epoch": 0.13619339462036092, "grad_norm": 5.941916510585534, "learning_rate": 1.335149863760218e-06, "loss": 1.1101, "step": 50 }, { "epoch": 0.1634320735444331, "grad_norm": 4.39962525080334, "learning_rate": 1.6076294277929156e-06, "loss": 0.9091, "step": 60 }, { "epoch": 0.1906707524685053, "grad_norm": 4.489642566752767, "learning_rate": 1.8801089918256133e-06, "loss": 0.7886, "step": 70 }, { "epoch": 0.21790943139257746, "grad_norm": 3.8997170693605914, "learning_rate": 2.152588555858311e-06, "loss": 0.7292, "step": 80 }, { "epoch": 0.24514811031664965, "grad_norm": 4.2337949046127115, "learning_rate": 2.4250681198910083e-06, "loss": 0.6295, "step": 90 }, { "epoch": 0.27238678924072185, "grad_norm": 3.887748571626415, "learning_rate": 2.697547683923706e-06, "loss": 0.5892, "step": 100 }, { "epoch": 0.299625468164794, "grad_norm": 4.071563747798551, "learning_rate": 2.9700272479564033e-06, "loss": 0.4879, "step": 110 }, { "epoch": 0.3268641470888662, "grad_norm": 4.396940499759756, "learning_rate": 3.2425068119891012e-06, "loss": 0.4542, "step": 120 }, { "epoch": 0.35410282601293835, "grad_norm": 3.784810689454143, "learning_rate": 3.5149863760217988e-06, "loss": 0.3211, "step": 130 }, { "epoch": 0.3813415049370106, "grad_norm": 3.7737741184600595, "learning_rate": 3.7874659400544963e-06, "loss": 0.2891, "step": 140 }, { "epoch": 0.40858018386108275, "grad_norm": 3.361771664086218, "learning_rate": 4.059945504087194e-06, "loss": 0.1996, "step": 150 }, { "epoch": 0.4358188627851549, "grad_norm": 3.2907408057054237, "learning_rate": 4.332425068119892e-06, "loss": 0.133, "step": 160 }, { "epoch": 0.4630575417092271, "grad_norm": 3.040778638316543, "learning_rate": 4.604904632152589e-06, "loss": 0.1101, "step": 170 }, { "epoch": 0.4902962206332993, "grad_norm": 3.5750949211792156, "learning_rate": 4.877384196185287e-06, "loss": 0.0936, "step": 180 }, { "epoch": 0.5175348995573714, "grad_norm": 3.857830728644118, "learning_rate": 5.149863760217984e-06, "loss": 0.0757, "step": 190 }, { "epoch": 0.5447735784814437, "grad_norm": 4.5053152903816915, "learning_rate": 5.422343324250682e-06, "loss": 0.0677, "step": 200 }, { "epoch": 0.5720122574055159, "grad_norm": 2.0169258478044423, "learning_rate": 5.694822888283379e-06, "loss": 0.0607, "step": 210 }, { "epoch": 0.599250936329588, "grad_norm": 2.7354337720763566, "learning_rate": 5.9673024523160776e-06, "loss": 0.057, "step": 220 }, { "epoch": 0.6264896152536602, "grad_norm": 2.41187775653569, "learning_rate": 6.239782016348774e-06, "loss": 0.056, "step": 230 }, { "epoch": 0.6537282941777324, "grad_norm": 2.7992598675350258, "learning_rate": 6.512261580381472e-06, "loss": 0.0515, "step": 240 }, { "epoch": 0.6809669731018045, "grad_norm": 3.885137671280064, "learning_rate": 6.78474114441417e-06, "loss": 0.0517, "step": 250 }, { "epoch": 0.7082056520258767, "grad_norm": 2.47654168655763, "learning_rate": 7.057220708446867e-06, "loss": 0.0462, "step": 260 }, { "epoch": 0.7354443309499489, "grad_norm": 2.5720242451467454, "learning_rate": 7.329700272479565e-06, "loss": 0.0454, "step": 270 }, { "epoch": 0.7626830098740212, "grad_norm": 4.695199938458052, "learning_rate": 7.602179836512263e-06, "loss": 0.0464, "step": 280 }, { "epoch": 0.7899216887980933, "grad_norm": 2.080104008186552, "learning_rate": 7.87465940054496e-06, "loss": 0.0416, "step": 290 }, { "epoch": 0.8171603677221655, "grad_norm": 2.1981255385062517, "learning_rate": 8.147138964577658e-06, "loss": 0.0446, "step": 300 }, { "epoch": 0.8443990466462377, "grad_norm": 1.7239470049599528, "learning_rate": 8.419618528610354e-06, "loss": 0.0452, "step": 310 }, { "epoch": 0.8716377255703098, "grad_norm": 1.8811480620279255, "learning_rate": 8.692098092643052e-06, "loss": 0.033, "step": 320 }, { "epoch": 0.898876404494382, "grad_norm": 2.4779574516049556, "learning_rate": 8.964577656675751e-06, "loss": 0.0438, "step": 330 }, { "epoch": 0.9261150834184542, "grad_norm": 2.42289553821888, "learning_rate": 9.237057220708447e-06, "loss": 0.0392, "step": 340 }, { "epoch": 0.9533537623425263, "grad_norm": 2.3174600464823256, "learning_rate": 9.509536784741146e-06, "loss": 0.0394, "step": 350 }, { "epoch": 0.9805924412665986, "grad_norm": 2.076316899586314, "learning_rate": 9.782016348773843e-06, "loss": 0.0387, "step": 360 }, { "epoch": 1.0054477357848144, "grad_norm": 1.7952293400096322, "learning_rate": 9.99999095346085e-06, "loss": 0.0332, "step": 370 }, { "epoch": 1.0326864147088866, "grad_norm": 2.3966331090661925, "learning_rate": 9.999674328027824e-06, "loss": 0.0349, "step": 380 }, { "epoch": 1.0599250936329587, "grad_norm": 1.8578054931433101, "learning_rate": 9.998905408372662e-06, "loss": 0.0332, "step": 390 }, { "epoch": 1.087163772557031, "grad_norm": 2.067301033604286, "learning_rate": 9.997684264055478e-06, "loss": 0.036, "step": 400 }, { "epoch": 1.1144024514811033, "grad_norm": 1.656723292655721, "learning_rate": 9.99601100554677e-06, "loss": 0.0315, "step": 410 }, { "epoch": 1.1416411304051755, "grad_norm": 1.4305496511087652, "learning_rate": 9.99388578421743e-06, "loss": 0.0307, "step": 420 }, { "epoch": 1.1688798093292476, "grad_norm": 1.4756680048744966, "learning_rate": 9.991308792325045e-06, "loss": 0.0279, "step": 430 }, { "epoch": 1.1961184882533198, "grad_norm": 2.488649223393654, "learning_rate": 9.988280262996507e-06, "loss": 0.0263, "step": 440 }, { "epoch": 1.223357167177392, "grad_norm": 1.4666750141420732, "learning_rate": 9.98480047020693e-06, "loss": 0.0268, "step": 450 }, { "epoch": 1.2505958461014641, "grad_norm": 1.320119597330066, "learning_rate": 9.980869728754847e-06, "loss": 0.0298, "step": 460 }, { "epoch": 1.2778345250255363, "grad_norm": 1.390841137051498, "learning_rate": 9.976488394233752e-06, "loss": 0.021, "step": 470 }, { "epoch": 1.3050732039496085, "grad_norm": 1.4228820783227014, "learning_rate": 9.971656862999917e-06, "loss": 0.0276, "step": 480 }, { "epoch": 1.3323118828736806, "grad_norm": 1.1183882912895564, "learning_rate": 9.966375572136546e-06, "loss": 0.0282, "step": 490 }, { "epoch": 1.3595505617977528, "grad_norm": 1.1616366909466755, "learning_rate": 9.960644999414226e-06, "loss": 0.0214, "step": 500 }, { "epoch": 1.386789240721825, "grad_norm": 1.409120984041759, "learning_rate": 9.954465663247708e-06, "loss": 0.0178, "step": 510 }, { "epoch": 1.4140279196458971, "grad_norm": 0.9803319101168316, "learning_rate": 9.947838122649014e-06, "loss": 0.0202, "step": 520 }, { "epoch": 1.4412665985699693, "grad_norm": 1.8131277665703773, "learning_rate": 9.94076297717686e-06, "loss": 0.0208, "step": 530 }, { "epoch": 1.4685052774940415, "grad_norm": 1.1662863071032854, "learning_rate": 9.933240866882418e-06, "loss": 0.0169, "step": 540 }, { "epoch": 1.4957439564181136, "grad_norm": 1.1465063039077388, "learning_rate": 9.925272472251415e-06, "loss": 0.0211, "step": 550 }, { "epoch": 1.5229826353421858, "grad_norm": 1.193468165618273, "learning_rate": 9.916858514142575e-06, "loss": 0.0198, "step": 560 }, { "epoch": 1.550221314266258, "grad_norm": 1.109324425932322, "learning_rate": 9.907999753722407e-06, "loss": 0.021, "step": 570 }, { "epoch": 1.5774599931903301, "grad_norm": 1.290613695650256, "learning_rate": 9.898696992396333e-06, "loss": 0.0171, "step": 580 }, { "epoch": 1.6046986721144023, "grad_norm": 1.6974083987042456, "learning_rate": 9.888951071736215e-06, "loss": 0.0192, "step": 590 }, { "epoch": 1.6319373510384745, "grad_norm": 1.2163159155887682, "learning_rate": 9.878762873404197e-06, "loss": 0.013, "step": 600 }, { "epoch": 1.6591760299625467, "grad_norm": 1.0193875076300951, "learning_rate": 9.86813331907296e-06, "loss": 0.0163, "step": 610 }, { "epoch": 1.686414708886619, "grad_norm": 1.1034023564810655, "learning_rate": 9.857063370342338e-06, "loss": 0.0174, "step": 620 }, { "epoch": 1.7136533878106912, "grad_norm": 1.5309168123244021, "learning_rate": 9.845554028652331e-06, "loss": 0.0158, "step": 630 }, { "epoch": 1.7408920667347634, "grad_norm": 0.5532697619897224, "learning_rate": 9.833606335192506e-06, "loss": 0.0108, "step": 640 }, { "epoch": 1.7681307456588355, "grad_norm": 1.0576521653821709, "learning_rate": 9.821221370807805e-06, "loss": 0.0154, "step": 650 }, { "epoch": 1.7953694245829077, "grad_norm": 1.0783675700881836, "learning_rate": 9.808400255900772e-06, "loss": 0.0114, "step": 660 }, { "epoch": 1.8226081035069799, "grad_norm": 0.599102289097004, "learning_rate": 9.795144150330194e-06, "loss": 0.0126, "step": 670 }, { "epoch": 1.849846782431052, "grad_norm": 1.1075249779167584, "learning_rate": 9.781454253306169e-06, "loss": 0.0141, "step": 680 }, { "epoch": 1.8770854613551244, "grad_norm": 0.69017494598675, "learning_rate": 9.76733180328163e-06, "loss": 0.0146, "step": 690 }, { "epoch": 1.9043241402791966, "grad_norm": 0.9541591518502084, "learning_rate": 9.752778077840302e-06, "loss": 0.0097, "step": 700 }, { "epoch": 1.9315628192032688, "grad_norm": 1.515252524842471, "learning_rate": 9.737794393581125e-06, "loss": 0.0102, "step": 710 }, { "epoch": 1.958801498127341, "grad_norm": 0.6520067186874438, "learning_rate": 9.722382105999156e-06, "loss": 0.0121, "step": 720 }, { "epoch": 1.9860401770514131, "grad_norm": 1.111673947257393, "learning_rate": 9.706542609362928e-06, "loss": 0.0101, "step": 730 }, { "epoch": 2.010895471569629, "grad_norm": 1.2740352672206354, "learning_rate": 9.690277336588338e-06, "loss": 0.0093, "step": 740 }, { "epoch": 2.038134150493701, "grad_norm": 1.2314399048766502, "learning_rate": 9.673587759109007e-06, "loss": 0.0082, "step": 750 }, { "epoch": 2.065372829417773, "grad_norm": 0.6515664977184386, "learning_rate": 9.656475386743166e-06, "loss": 0.0078, "step": 760 }, { "epoch": 2.0926115083418453, "grad_norm": 0.6076512914812847, "learning_rate": 9.638941767557085e-06, "loss": 0.0086, "step": 770 }, { "epoch": 2.1198501872659175, "grad_norm": 0.49805773777686746, "learning_rate": 9.620988487724999e-06, "loss": 0.0094, "step": 780 }, { "epoch": 2.1470888661899896, "grad_norm": 1.1149194861607004, "learning_rate": 9.602617171385646e-06, "loss": 0.0075, "step": 790 }, { "epoch": 2.174327545114062, "grad_norm": 0.33357565062562405, "learning_rate": 9.583829480495325e-06, "loss": 0.0062, "step": 800 }, { "epoch": 2.201566224038134, "grad_norm": 1.2725673141069516, "learning_rate": 9.564627114677546e-06, "loss": 0.0074, "step": 810 }, { "epoch": 2.2288049029622066, "grad_norm": 0.8173124298927218, "learning_rate": 9.54501181106928e-06, "loss": 0.0102, "step": 820 }, { "epoch": 2.2560435818862787, "grad_norm": 0.6950238807140082, "learning_rate": 9.524985344163801e-06, "loss": 0.0065, "step": 830 }, { "epoch": 2.283282260810351, "grad_norm": 0.9944262630317349, "learning_rate": 9.504549525650173e-06, "loss": 0.0067, "step": 840 }, { "epoch": 2.310520939734423, "grad_norm": 0.7645573101281118, "learning_rate": 9.483706204249332e-06, "loss": 0.0072, "step": 850 }, { "epoch": 2.3377596186584952, "grad_norm": 0.5284367702467161, "learning_rate": 9.462457265546867e-06, "loss": 0.007, "step": 860 }, { "epoch": 2.3649982975825674, "grad_norm": 0.5487759594771856, "learning_rate": 9.440804631822421e-06, "loss": 0.0073, "step": 870 }, { "epoch": 2.3922369765066396, "grad_norm": 0.6876376014240485, "learning_rate": 9.418750261875811e-06, "loss": 0.006, "step": 880 }, { "epoch": 2.4194756554307117, "grad_norm": 0.5930505890719068, "learning_rate": 9.396296150849804e-06, "loss": 0.0034, "step": 890 }, { "epoch": 2.446714334354784, "grad_norm": 0.2924797737902157, "learning_rate": 9.373444330049645e-06, "loss": 0.0048, "step": 900 }, { "epoch": 2.473953013278856, "grad_norm": 0.818504316665287, "learning_rate": 9.350196866759289e-06, "loss": 0.0065, "step": 910 }, { "epoch": 2.5011916922029283, "grad_norm": 0.6173390676022147, "learning_rate": 9.326555864054383e-06, "loss": 0.0063, "step": 920 }, { "epoch": 2.5284303711270004, "grad_norm": 0.42627927587804193, "learning_rate": 9.302523460612015e-06, "loss": 0.0055, "step": 930 }, { "epoch": 2.5556690500510726, "grad_norm": 0.5539596964886677, "learning_rate": 9.278101830517234e-06, "loss": 0.0047, "step": 940 }, { "epoch": 2.5829077289751448, "grad_norm": 0.3896212375163213, "learning_rate": 9.253293183066382e-06, "loss": 0.0056, "step": 950 }, { "epoch": 2.610146407899217, "grad_norm": 0.658832029703366, "learning_rate": 9.228099762567221e-06, "loss": 0.0053, "step": 960 }, { "epoch": 2.637385086823289, "grad_norm": 0.2940172656833637, "learning_rate": 9.202523848135903e-06, "loss": 0.005, "step": 970 }, { "epoch": 2.6646237657473613, "grad_norm": 0.8264261097315277, "learning_rate": 9.176567753490795e-06, "loss": 0.0088, "step": 980 }, { "epoch": 2.6918624446714334, "grad_norm": 0.47332662322314495, "learning_rate": 9.15023382674317e-06, "loss": 0.0049, "step": 990 }, { "epoch": 2.7191011235955056, "grad_norm": 0.6043816603050725, "learning_rate": 9.12352445018478e-06, "loss": 0.0054, "step": 1000 }, { "epoch": 2.7191011235955056, "eval_loss": 0.009971115738153458, "eval_runtime": 149.6934, "eval_samples_per_second": 1.336, "eval_steps_per_second": 0.167, "step": 1000 }, { "epoch": 2.7463398025195778, "grad_norm": 0.18838425652727245, "learning_rate": 9.096442040072342e-06, "loss": 0.0036, "step": 1010 }, { "epoch": 2.77357848144365, "grad_norm": 0.33857822247434793, "learning_rate": 9.06898904640896e-06, "loss": 0.0038, "step": 1020 }, { "epoch": 2.800817160367722, "grad_norm": 0.56645910713706, "learning_rate": 9.04116795272248e-06, "loss": 0.0055, "step": 1030 }, { "epoch": 2.8280558392917943, "grad_norm": 0.28109810510949723, "learning_rate": 9.01298127584082e-06, "loss": 0.0036, "step": 1040 }, { "epoch": 2.8552945182158664, "grad_norm": 0.542296858675033, "learning_rate": 8.984431565664287e-06, "loss": 0.0048, "step": 1050 }, { "epoch": 2.8825331971399386, "grad_norm": 0.5373500607763515, "learning_rate": 8.955521404934895e-06, "loss": 0.0043, "step": 1060 }, { "epoch": 2.9097718760640108, "grad_norm": 0.5880158628582186, "learning_rate": 8.926253409002724e-06, "loss": 0.0045, "step": 1070 }, { "epoch": 2.937010554988083, "grad_norm": 1.0360372633008765, "learning_rate": 8.896630225589325e-06, "loss": 0.003, "step": 1080 }, { "epoch": 2.964249233912155, "grad_norm": 0.6664430992854442, "learning_rate": 8.866654534548188e-06, "loss": 0.0035, "step": 1090 }, { "epoch": 2.9914879128362273, "grad_norm": 0.3828974814299205, "learning_rate": 8.836329047622315e-06, "loss": 0.0051, "step": 1100 }, { "epoch": 3.0163432073544434, "grad_norm": 0.5237696754640224, "learning_rate": 8.805656508198893e-06, "loss": 0.0025, "step": 1110 }, { "epoch": 3.0435818862785156, "grad_norm": 0.31976718993311765, "learning_rate": 8.774639691061133e-06, "loss": 0.0027, "step": 1120 }, { "epoch": 3.0708205652025877, "grad_norm": 0.8311789315891146, "learning_rate": 8.743281402137234e-06, "loss": 0.0043, "step": 1130 }, { "epoch": 3.09805924412666, "grad_norm": 0.5395564063408896, "learning_rate": 8.711584478246545e-06, "loss": 0.0037, "step": 1140 }, { "epoch": 3.125297923050732, "grad_norm": 0.4868560419941669, "learning_rate": 8.679551786842947e-06, "loss": 0.003, "step": 1150 }, { "epoch": 3.1525366019748042, "grad_norm": 0.09982986237608656, "learning_rate": 8.647186225755435e-06, "loss": 0.004, "step": 1160 }, { "epoch": 3.1797752808988764, "grad_norm": 0.2992508205991956, "learning_rate": 8.614490722925976e-06, "loss": 0.0025, "step": 1170 }, { "epoch": 3.2070139598229486, "grad_norm": 0.7674760077175568, "learning_rate": 8.581468236144624e-06, "loss": 0.0029, "step": 1180 }, { "epoch": 3.2342526387470207, "grad_norm": 0.688415374822658, "learning_rate": 8.548121752781958e-06, "loss": 0.0024, "step": 1190 }, { "epoch": 3.261491317671093, "grad_norm": 0.9052988126796538, "learning_rate": 8.514454289518815e-06, "loss": 0.0034, "step": 1200 }, { "epoch": 3.288729996595165, "grad_norm": 0.40120185903679884, "learning_rate": 8.480468892073396e-06, "loss": 0.002, "step": 1210 }, { "epoch": 3.3159686755192372, "grad_norm": 0.5944020494018145, "learning_rate": 8.446168634925744e-06, "loss": 0.0027, "step": 1220 }, { "epoch": 3.3432073544433094, "grad_norm": 0.4472237168192407, "learning_rate": 8.411556621039587e-06, "loss": 0.0076, "step": 1230 }, { "epoch": 3.3704460333673816, "grad_norm": 0.28045654193358444, "learning_rate": 8.376635981581652e-06, "loss": 0.0035, "step": 1240 }, { "epoch": 3.3976847122914537, "grad_norm": 0.17536506030522428, "learning_rate": 8.341409875638396e-06, "loss": 0.0038, "step": 1250 }, { "epoch": 3.424923391215526, "grad_norm": 5.902141227200851, "learning_rate": 8.305881489930224e-06, "loss": 0.0075, "step": 1260 }, { "epoch": 3.452162070139598, "grad_norm": 1.1226970845212438, "learning_rate": 8.270054038523194e-06, "loss": 0.0039, "step": 1270 }, { "epoch": 3.4794007490636703, "grad_norm": 0.2305609441856671, "learning_rate": 8.233930762538271e-06, "loss": 0.0038, "step": 1280 }, { "epoch": 3.506639427987743, "grad_norm": 0.17972249159415393, "learning_rate": 8.197514929858108e-06, "loss": 0.0033, "step": 1290 }, { "epoch": 3.533878106911815, "grad_norm": 0.22572705207090887, "learning_rate": 8.160809834831422e-06, "loss": 0.0018, "step": 1300 }, { "epoch": 3.561116785835887, "grad_norm": 0.48294663787369996, "learning_rate": 8.123818797974973e-06, "loss": 0.001, "step": 1310 }, { "epoch": 3.5883554647599594, "grad_norm": 0.0324163008144768, "learning_rate": 8.08654516567318e-06, "loss": 0.0017, "step": 1320 }, { "epoch": 3.6155941436840315, "grad_norm": 0.2702928606649647, "learning_rate": 8.04899230987537e-06, "loss": 0.0007, "step": 1330 }, { "epoch": 3.6428328226081037, "grad_norm": 0.02733106163122347, "learning_rate": 8.011163627790765e-06, "loss": 0.0015, "step": 1340 }, { "epoch": 3.670071501532176, "grad_norm": 0.17653493830017525, "learning_rate": 7.97306254158113e-06, "loss": 0.0033, "step": 1350 }, { "epoch": 3.697310180456248, "grad_norm": 0.06779788652309977, "learning_rate": 7.934692498051202e-06, "loss": 0.0015, "step": 1360 }, { "epoch": 3.72454885938032, "grad_norm": 0.04283617369672373, "learning_rate": 7.896056968336868e-06, "loss": 0.0016, "step": 1370 }, { "epoch": 3.7517875383043924, "grad_norm": 0.4528535210316289, "learning_rate": 7.857159447591153e-06, "loss": 0.0015, "step": 1380 }, { "epoch": 3.7790262172284645, "grad_norm": 0.03703542904667366, "learning_rate": 7.81800345466804e-06, "loss": 0.0007, "step": 1390 }, { "epoch": 3.8062648961525367, "grad_norm": 0.22254648728704393, "learning_rate": 7.778592531804115e-06, "loss": 0.0018, "step": 1400 }, { "epoch": 3.833503575076609, "grad_norm": 0.1551875795468263, "learning_rate": 7.738930244298146e-06, "loss": 0.003, "step": 1410 }, { "epoch": 3.860742254000681, "grad_norm": 0.4498866311661179, "learning_rate": 7.699020180188533e-06, "loss": 0.0017, "step": 1420 }, { "epoch": 3.887980932924753, "grad_norm": 1.4505936210625598, "learning_rate": 7.658865949928717e-06, "loss": 0.0021, "step": 1430 }, { "epoch": 3.9152196118488254, "grad_norm": 0.2597247781008665, "learning_rate": 7.618471186060574e-06, "loss": 0.0009, "step": 1440 }, { "epoch": 3.9424582907728976, "grad_norm": 0.05461306630022223, "learning_rate": 7.577839542885783e-06, "loss": 0.0012, "step": 1450 }, { "epoch": 3.9696969696969697, "grad_norm": 0.2510446146562708, "learning_rate": 7.5369746961352505e-06, "loss": 0.0012, "step": 1460 }, { "epoch": 3.996935648621042, "grad_norm": 0.027247865962538042, "learning_rate": 7.495880342636581e-06, "loss": 0.0016, "step": 1470 }, { "epoch": 4.021790943139258, "grad_norm": 0.3181735974801834, "learning_rate": 7.454560199979647e-06, "loss": 0.0005, "step": 1480 }, { "epoch": 4.04902962206333, "grad_norm": 0.03550322906594092, "learning_rate": 7.413018006180278e-06, "loss": 0.0006, "step": 1490 }, { "epoch": 4.076268300987402, "grad_norm": 0.7839865968633526, "learning_rate": 7.371257519342103e-06, "loss": 0.0023, "step": 1500 }, { "epoch": 4.103506979911474, "grad_norm": 0.27056209178067114, "learning_rate": 7.329282517316574e-06, "loss": 0.0013, "step": 1510 }, { "epoch": 4.130745658835546, "grad_norm": 0.025007951464925165, "learning_rate": 7.287096797361197e-06, "loss": 0.0014, "step": 1520 }, { "epoch": 4.157984337759618, "grad_norm": 0.1639293410219479, "learning_rate": 7.244704175796028e-06, "loss": 0.002, "step": 1530 }, { "epoch": 4.185223016683691, "grad_norm": 0.09539274967398963, "learning_rate": 7.202108487658416e-06, "loss": 0.0017, "step": 1540 }, { "epoch": 4.212461695607763, "grad_norm": 0.18901065014774657, "learning_rate": 7.159313586356077e-06, "loss": 0.002, "step": 1550 }, { "epoch": 4.239700374531835, "grad_norm": 0.027388119425054664, "learning_rate": 7.116323343318495e-06, "loss": 0.0017, "step": 1560 }, { "epoch": 4.266939053455907, "grad_norm": 0.13933358678308516, "learning_rate": 7.073141647646691e-06, "loss": 0.0009, "step": 1570 }, { "epoch": 4.294177732379979, "grad_norm": 0.011964487372353175, "learning_rate": 7.029772405761397e-06, "loss": 0.0005, "step": 1580 }, { "epoch": 4.321416411304051, "grad_norm": 0.044579473352322464, "learning_rate": 6.9862195410496655e-06, "loss": 0.0007, "step": 1590 }, { "epoch": 4.348655090228124, "grad_norm": 0.9421209582985217, "learning_rate": 6.942486993509941e-06, "loss": 0.0011, "step": 1600 }, { "epoch": 4.375893769152196, "grad_norm": 0.165168485428461, "learning_rate": 6.898578719395622e-06, "loss": 0.0007, "step": 1610 }, { "epoch": 4.403132448076268, "grad_norm": 0.012552169908338339, "learning_rate": 6.854498690857173e-06, "loss": 0.0024, "step": 1620 }, { "epoch": 4.43037112700034, "grad_norm": 0.33669756307886667, "learning_rate": 6.810250895582773e-06, "loss": 0.0013, "step": 1630 }, { "epoch": 4.457609805924413, "grad_norm": 0.03430019485251983, "learning_rate": 6.765839336437574e-06, "loss": 0.001, "step": 1640 }, { "epoch": 4.484848484848484, "grad_norm": 0.7568315535654873, "learning_rate": 6.721268031101586e-06, "loss": 0.0018, "step": 1650 }, { "epoch": 4.5120871637725575, "grad_norm": 0.36926846488952053, "learning_rate": 6.676541011706212e-06, "loss": 0.0032, "step": 1660 }, { "epoch": 4.539325842696629, "grad_norm": 0.30700403135022064, "learning_rate": 6.631662324469492e-06, "loss": 0.0021, "step": 1670 }, { "epoch": 4.566564521620702, "grad_norm": 0.07283695365864443, "learning_rate": 6.586636029330054e-06, "loss": 0.0015, "step": 1680 }, { "epoch": 4.593803200544773, "grad_norm": 0.5098936742183463, "learning_rate": 6.5414661995798346e-06, "loss": 0.0026, "step": 1690 }, { "epoch": 4.621041879468846, "grad_norm": 0.15016221369549917, "learning_rate": 6.496156921495594e-06, "loss": 0.0023, "step": 1700 }, { "epoch": 4.648280558392918, "grad_norm": 0.7027337621531187, "learning_rate": 6.450712293969251e-06, "loss": 0.0036, "step": 1710 }, { "epoch": 4.6755192373169905, "grad_norm": 0.48920213901161386, "learning_rate": 6.405136428137072e-06, "loss": 0.0024, "step": 1720 }, { "epoch": 4.702757916241063, "grad_norm": 0.526663059492545, "learning_rate": 6.359433447007761e-06, "loss": 0.0037, "step": 1730 }, { "epoch": 4.729996595165135, "grad_norm": 0.43485515633750277, "learning_rate": 6.313607485089479e-06, "loss": 0.002, "step": 1740 }, { "epoch": 4.757235274089207, "grad_norm": 0.1608557350260687, "learning_rate": 6.267662688015811e-06, "loss": 0.0011, "step": 1750 }, { "epoch": 4.784473953013279, "grad_norm": 0.016233665978459856, "learning_rate": 6.221603212170727e-06, "loss": 0.0016, "step": 1760 }, { "epoch": 4.811712631937351, "grad_norm": 0.3060301856403388, "learning_rate": 6.175433224312588e-06, "loss": 0.0008, "step": 1770 }, { "epoch": 4.8389513108614235, "grad_norm": 0.0535023008279656, "learning_rate": 6.129156901197195e-06, "loss": 0.0007, "step": 1780 }, { "epoch": 4.866189989785496, "grad_norm": 0.22805192659166784, "learning_rate": 6.082778429199937e-06, "loss": 0.0011, "step": 1790 }, { "epoch": 4.893428668709568, "grad_norm": 0.06801175552041476, "learning_rate": 6.036302003937076e-06, "loss": 0.0004, "step": 1800 }, { "epoch": 4.92066734763364, "grad_norm": 0.3046742023784698, "learning_rate": 5.9897318298861885e-06, "loss": 0.0007, "step": 1810 }, { "epoch": 4.947906026557712, "grad_norm": 0.033986290353038136, "learning_rate": 5.943072120005816e-06, "loss": 0.0007, "step": 1820 }, { "epoch": 4.975144705481784, "grad_norm": 0.09040671159275827, "learning_rate": 5.89632709535433e-06, "loss": 0.0019, "step": 1830 }, { "epoch": 5.0, "grad_norm": 0.2530736784412786, "learning_rate": 5.849500984708082e-06, "loss": 0.0016, "step": 1840 }, { "epoch": 5.027238678924072, "grad_norm": 0.08687153636471827, "learning_rate": 5.802598024178848e-06, "loss": 0.0004, "step": 1850 }, { "epoch": 5.054477357848144, "grad_norm": 0.45420511928877233, "learning_rate": 5.755622456830605e-06, "loss": 0.0008, "step": 1860 }, { "epoch": 5.0817160367722165, "grad_norm": 0.1542430485091151, "learning_rate": 5.708578532295691e-06, "loss": 0.0016, "step": 1870 }, { "epoch": 5.108954715696289, "grad_norm": 1.3515082865895989, "learning_rate": 5.661470506390354e-06, "loss": 0.0011, "step": 1880 }, { "epoch": 5.136193394620361, "grad_norm": 0.1596903735504535, "learning_rate": 5.61430264072976e-06, "loss": 0.0014, "step": 1890 }, { "epoch": 5.163432073544433, "grad_norm": 0.19923117752022435, "learning_rate": 5.5670792023424615e-06, "loss": 0.0015, "step": 1900 }, { "epoch": 5.190670752468505, "grad_norm": 0.302828135997675, "learning_rate": 5.519804463284382e-06, "loss": 0.0009, "step": 1910 }, { "epoch": 5.217909431392577, "grad_norm": 0.04653233725601432, "learning_rate": 5.472482700252347e-06, "loss": 0.0012, "step": 1920 }, { "epoch": 5.2451481103166495, "grad_norm": 0.5190292480319276, "learning_rate": 5.425118194197196e-06, "loss": 0.0023, "step": 1930 }, { "epoch": 5.272386789240722, "grad_norm": 0.007225303260017864, "learning_rate": 5.3777152299365e-06, "loss": 0.0005, "step": 1940 }, { "epoch": 5.299625468164794, "grad_norm": 0.08842755160666287, "learning_rate": 5.3302780957669454e-06, "loss": 0.0006, "step": 1950 }, { "epoch": 5.326864147088866, "grad_norm": 0.024822557291839333, "learning_rate": 5.282811083076388e-06, "loss": 0.0004, "step": 1960 }, { "epoch": 5.354102826012938, "grad_norm": 1.2627706166913806, "learning_rate": 5.235318485955638e-06, "loss": 0.0007, "step": 1970 }, { "epoch": 5.38134150493701, "grad_norm": 0.03392392848083845, "learning_rate": 5.187804600809995e-06, "loss": 0.0014, "step": 1980 }, { "epoch": 5.4085801838610825, "grad_norm": 0.009487721281030682, "learning_rate": 5.140273725970569e-06, "loss": 0.0019, "step": 1990 }, { "epoch": 5.435818862785155, "grad_norm": 0.03169860186889457, "learning_rate": 5.092730161305444e-06, "loss": 0.0005, "step": 2000 }, { "epoch": 5.435818862785155, "eval_loss": 0.003610835410654545, "eval_runtime": 149.9472, "eval_samples_per_second": 1.334, "eval_steps_per_second": 0.167, "step": 2000 }, { "epoch": 5.463057541709227, "grad_norm": 0.26681987017023606, "learning_rate": 5.045178207830687e-06, "loss": 0.0005, "step": 2010 }, { "epoch": 5.490296220633299, "grad_norm": 0.020589913051757485, "learning_rate": 4.997622167321246e-06, "loss": 0.0004, "step": 2020 }, { "epoch": 5.517534899557371, "grad_norm": 0.005508716128027595, "learning_rate": 4.950066341921813e-06, "loss": 0.0002, "step": 2030 }, { "epoch": 5.544773578481443, "grad_norm": 0.12421491718000478, "learning_rate": 4.902515033757617e-06, "loss": 0.0002, "step": 2040 }, { "epoch": 5.5720122574055155, "grad_norm": 0.6904314051957775, "learning_rate": 4.854972544545231e-06, "loss": 0.0011, "step": 2050 }, { "epoch": 5.599250936329588, "grad_norm": 0.020582980767102473, "learning_rate": 4.807443175203432e-06, "loss": 0.0008, "step": 2060 }, { "epoch": 5.62648961525366, "grad_norm": 0.26044403454004866, "learning_rate": 4.759931225464107e-06, "loss": 0.0011, "step": 2070 }, { "epoch": 5.653728294177732, "grad_norm": 0.13198469815528743, "learning_rate": 4.712440993483281e-06, "loss": 0.0005, "step": 2080 }, { "epoch": 5.680966973101804, "grad_norm": 0.24787019272348249, "learning_rate": 4.664976775452293e-06, "loss": 0.0016, "step": 2090 }, { "epoch": 5.708205652025876, "grad_norm": 0.06673597677280482, "learning_rate": 4.617542865209133e-06, "loss": 0.0001, "step": 2100 }, { "epoch": 5.7354443309499485, "grad_norm": 0.01989819994546987, "learning_rate": 4.5701435538500065e-06, "loss": 0.0002, "step": 2110 }, { "epoch": 5.762683009874021, "grad_norm": 0.06711337766264915, "learning_rate": 4.522783129341141e-06, "loss": 0.0011, "step": 2120 }, { "epoch": 5.789921688798094, "grad_norm": 0.0258406372816757, "learning_rate": 4.475465876130872e-06, "loss": 0.0002, "step": 2130 }, { "epoch": 5.817160367722165, "grad_norm": 0.0069848118261474626, "learning_rate": 4.428196074762057e-06, "loss": 0.0001, "step": 2140 }, { "epoch": 5.844399046646238, "grad_norm": 0.00465294693967376, "learning_rate": 4.380978001484836e-06, "loss": 0.0008, "step": 2150 }, { "epoch": 5.871637725570309, "grad_norm": 0.062291913740785465, "learning_rate": 4.33381592786978e-06, "loss": 0.0001, "step": 2160 }, { "epoch": 5.898876404494382, "grad_norm": 0.00324106094253182, "learning_rate": 4.286714120421465e-06, "loss": 0.0001, "step": 2170 }, { "epoch": 5.926115083418454, "grad_norm": 0.010405479601857403, "learning_rate": 4.2396768401925044e-06, "loss": 0.0001, "step": 2180 }, { "epoch": 5.953353762342527, "grad_norm": 0.003621476287705226, "learning_rate": 4.1927083423980755e-06, "loss": 0.0001, "step": 2190 }, { "epoch": 5.980592441266599, "grad_norm": 0.15175481500791388, "learning_rate": 4.145812876030965e-06, "loss": 0.0001, "step": 2200 }, { "epoch": 6.005447735784815, "grad_norm": 0.009474114226140813, "learning_rate": 4.098994683477197e-06, "loss": 0.0, "step": 2210 }, { "epoch": 6.032686414708887, "grad_norm": 0.010773928421478833, "learning_rate": 4.0522580001322365e-06, "loss": 0.0, "step": 2220 }, { "epoch": 6.059925093632959, "grad_norm": 0.002749306413145174, "learning_rate": 4.0056070540178425e-06, "loss": 0.0, "step": 2230 }, { "epoch": 6.087163772557031, "grad_norm": 0.001948904462067843, "learning_rate": 3.959046065399575e-06, "loss": 0.0, "step": 2240 }, { "epoch": 6.114402451481103, "grad_norm": 0.001621993315411648, "learning_rate": 3.912579246405016e-06, "loss": 0.0, "step": 2250 }, { "epoch": 6.1416411304051755, "grad_norm": 0.001255150538557096, "learning_rate": 3.8662108006427165e-06, "loss": 0.0, "step": 2260 }, { "epoch": 6.168879809329248, "grad_norm": 0.004272455750195676, "learning_rate": 3.819944922821914e-06, "loss": 0.0, "step": 2270 }, { "epoch": 6.19611848825332, "grad_norm": 0.00187243023068361, "learning_rate": 3.773785798373069e-06, "loss": 0.0, "step": 2280 }, { "epoch": 6.223357167177392, "grad_norm": 0.0009374589516518995, "learning_rate": 3.7277376030692263e-06, "loss": 0.0, "step": 2290 }, { "epoch": 6.250595846101464, "grad_norm": 0.001253252761018019, "learning_rate": 3.681804502648254e-06, "loss": 0.0, "step": 2300 }, { "epoch": 6.277834525025536, "grad_norm": 0.002173317313832616, "learning_rate": 3.6359906524359932e-06, "loss": 0.0, "step": 2310 }, { "epoch": 6.3050732039496085, "grad_norm": 0.00122096897637743, "learning_rate": 3.590300196970341e-06, "loss": 0.0, "step": 2320 }, { "epoch": 6.332311882873681, "grad_norm": 0.0008158321760346172, "learning_rate": 3.544737269626328e-06, "loss": 0.0, "step": 2330 }, { "epoch": 6.359550561797753, "grad_norm": 0.0011021236484427205, "learning_rate": 3.4993059922421835e-06, "loss": 0.0, "step": 2340 }, { "epoch": 6.386789240721825, "grad_norm": 0.0007113597066715369, "learning_rate": 3.4540104747464575e-06, "loss": 0.0, "step": 2350 }, { "epoch": 6.414027919645897, "grad_norm": 0.0007186713752736677, "learning_rate": 3.408854814786219e-06, "loss": 0.0, "step": 2360 }, { "epoch": 6.441266598569969, "grad_norm": 0.0005956363546399899, "learning_rate": 3.3638430973563597e-06, "loss": 0.0, "step": 2370 }, { "epoch": 6.4685052774940415, "grad_norm": 0.0013985565483587347, "learning_rate": 3.318979394430051e-06, "loss": 0.0, "step": 2380 }, { "epoch": 6.495743956418114, "grad_norm": 0.0008815209213564751, "learning_rate": 3.27426776459037e-06, "loss": 0.0003, "step": 2390 }, { "epoch": 6.522982635342186, "grad_norm": 0.0024178562381092818, "learning_rate": 3.22971225266314e-06, "loss": 0.0, "step": 2400 }, { "epoch": 6.550221314266258, "grad_norm": 0.0010728069443376714, "learning_rate": 3.1853168893510223e-06, "loss": 0.0, "step": 2410 }, { "epoch": 6.57745999319033, "grad_norm": 0.0010548514445552371, "learning_rate": 3.141085690868871e-06, "loss": 0.0, "step": 2420 }, { "epoch": 6.604698672114402, "grad_norm": 0.0021998928664552233, "learning_rate": 3.0970226585804175e-06, "loss": 0.0001, "step": 2430 }, { "epoch": 6.6319373510384745, "grad_norm": 0.0011062246966726207, "learning_rate": 3.053131778636278e-06, "loss": 0.0, "step": 2440 }, { "epoch": 6.659176029962547, "grad_norm": 0.0011797120379395778, "learning_rate": 3.0094170216133545e-06, "loss": 0.0, "step": 2450 }, { "epoch": 6.686414708886619, "grad_norm": 0.007225871987921568, "learning_rate": 2.965882342155637e-06, "loss": 0.0, "step": 2460 }, { "epoch": 6.713653387810691, "grad_norm": 0.0010544539477212198, "learning_rate": 2.9225316786164417e-06, "loss": 0.0, "step": 2470 }, { "epoch": 6.740892066734763, "grad_norm": 0.0006976448579587069, "learning_rate": 2.8793689527021377e-06, "loss": 0.0, "step": 2480 }, { "epoch": 6.768130745658835, "grad_norm": 0.0027697202568828954, "learning_rate": 2.836398069117362e-06, "loss": 0.0, "step": 2490 }, { "epoch": 6.7953694245829075, "grad_norm": 0.0005279000452700497, "learning_rate": 2.7936229152117896e-06, "loss": 0.0, "step": 2500 }, { "epoch": 6.82260810350698, "grad_norm": 0.000826146277516193, "learning_rate": 2.751047360628458e-06, "loss": 0.0, "step": 2510 }, { "epoch": 6.849846782431052, "grad_norm": 0.0007622689637331964, "learning_rate": 2.708675256953708e-06, "loss": 0.0, "step": 2520 }, { "epoch": 6.877085461355124, "grad_norm": 0.0011470272220122446, "learning_rate": 2.6665104373687455e-06, "loss": 0.0006, "step": 2530 }, { "epoch": 6.904324140279196, "grad_norm": 0.0009952201118956113, "learning_rate": 2.624556716302876e-06, "loss": 0.0, "step": 2540 }, { "epoch": 6.931562819203268, "grad_norm": 0.0011435928941426047, "learning_rate": 2.582817889088435e-06, "loss": 0.0, "step": 2550 }, { "epoch": 6.9588014981273405, "grad_norm": 0.01166505316182784, "learning_rate": 2.541297731617437e-06, "loss": 0.0, "step": 2560 }, { "epoch": 6.986040177051413, "grad_norm": 0.0012091660156564501, "learning_rate": 2.5000000000000015e-06, "loss": 0.0, "step": 2570 }, { "epoch": 7.010895471569629, "grad_norm": 0.0015980505913388763, "learning_rate": 2.458928430224548e-06, "loss": 0.0, "step": 2580 }, { "epoch": 7.038134150493701, "grad_norm": 0.0009393729565236982, "learning_rate": 2.4180867378198274e-06, "loss": 0.0, "step": 2590 }, { "epoch": 7.065372829417774, "grad_norm": 0.0008257129118958041, "learning_rate": 2.3774786175187932e-06, "loss": 0.0, "step": 2600 }, { "epoch": 7.092611508341846, "grad_norm": 0.0018052707906574057, "learning_rate": 2.337107742924359e-06, "loss": 0.0, "step": 2610 }, { "epoch": 7.119850187265918, "grad_norm": 0.1645764072539005, "learning_rate": 2.29697776617707e-06, "loss": 0.0, "step": 2620 }, { "epoch": 7.14708886618999, "grad_norm": 0.0007519843719597862, "learning_rate": 2.25709231762471e-06, "loss": 0.0, "step": 2630 }, { "epoch": 7.174327545114062, "grad_norm": 0.0008477596842611903, "learning_rate": 2.217455005493884e-06, "loss": 0.0, "step": 2640 }, { "epoch": 7.201566224038134, "grad_norm": 0.004509417330392989, "learning_rate": 2.1780694155636014e-06, "loss": 0.0, "step": 2650 }, { "epoch": 7.228804902962207, "grad_norm": 0.0005766378225871001, "learning_rate": 2.138939110840888e-06, "loss": 0.0, "step": 2660 }, { "epoch": 7.256043581886279, "grad_norm": 0.004100067804604992, "learning_rate": 2.100067631238464e-06, "loss": 0.0, "step": 2670 }, { "epoch": 7.283282260810351, "grad_norm": 0.003930226882351156, "learning_rate": 2.0614584932544955e-06, "loss": 0.0, "step": 2680 }, { "epoch": 7.310520939734423, "grad_norm": 0.0004427737628504764, "learning_rate": 2.023115189654491e-06, "loss": 0.0, "step": 2690 }, { "epoch": 7.337759618658495, "grad_norm": 0.0008888326172641632, "learning_rate": 1.9850411891553186e-06, "loss": 0.0, "step": 2700 }, { "epoch": 7.364998297582567, "grad_norm": 0.0006039128437105985, "learning_rate": 1.9472399361114126e-06, "loss": 0.0, "step": 2710 }, { "epoch": 7.39223697650664, "grad_norm": 0.0004893946381054215, "learning_rate": 1.909714850203177e-06, "loss": 0.0, "step": 2720 }, { "epoch": 7.419475655430712, "grad_norm": 0.0004317709133183237, "learning_rate": 1.8724693261276345e-06, "loss": 0.0, "step": 2730 }, { "epoch": 7.446714334354784, "grad_norm": 0.0006212197750363989, "learning_rate": 1.8355067332913156e-06, "loss": 0.0, "step": 2740 }, { "epoch": 7.473953013278856, "grad_norm": 0.0007456921468394779, "learning_rate": 1.7988304155054541e-06, "loss": 0.0, "step": 2750 }, { "epoch": 7.501191692202928, "grad_norm": 0.0008149478511071161, "learning_rate": 1.7624436906834842e-06, "loss": 0.0, "step": 2760 }, { "epoch": 7.528430371127, "grad_norm": 0.0007225352461383864, "learning_rate": 1.7263498505408893e-06, "loss": 0.0, "step": 2770 }, { "epoch": 7.555669050051073, "grad_norm": 0.0005959357394697677, "learning_rate": 1.6905521602974183e-06, "loss": 0.0, "step": 2780 }, { "epoch": 7.582907728975145, "grad_norm": 0.000635015561952419, "learning_rate": 1.6550538583816967e-06, "loss": 0.0, "step": 2790 }, { "epoch": 7.610146407899217, "grad_norm": 0.0005433753417863713, "learning_rate": 1.6198581561382643e-06, "loss": 0.0, "step": 2800 }, { "epoch": 7.637385086823289, "grad_norm": 0.00046276580358275586, "learning_rate": 1.5849682375370601e-06, "loss": 0.0, "step": 2810 }, { "epoch": 7.664623765747361, "grad_norm": 0.0008516896607091807, "learning_rate": 1.550387258885388e-06, "loss": 0.0, "step": 2820 }, { "epoch": 7.691862444671433, "grad_norm": 0.0005163692295676414, "learning_rate": 1.5161183485423785e-06, "loss": 0.0, "step": 2830 }, { "epoch": 7.719101123595506, "grad_norm": 0.0011709181420231178, "learning_rate": 1.482164606635989e-06, "loss": 0.0, "step": 2840 }, { "epoch": 7.746339802519578, "grad_norm": 0.0013056291673384528, "learning_rate": 1.44852910478254e-06, "loss": 0.0, "step": 2850 }, { "epoch": 7.77357848144365, "grad_norm": 0.0004465783868552864, "learning_rate": 1.4152148858088554e-06, "loss": 0.0, "step": 2860 }, { "epoch": 7.800817160367722, "grad_norm": 0.0005635784726540899, "learning_rate": 1.3822249634769864e-06, "loss": 0.0, "step": 2870 }, { "epoch": 7.828055839291794, "grad_norm": 0.0003537589230160337, "learning_rate": 1.3495623222115735e-06, "loss": 0.0, "step": 2880 }, { "epoch": 7.855294518215866, "grad_norm": 0.0004597321862407293, "learning_rate": 1.3172299168298614e-06, "loss": 0.0, "step": 2890 }, { "epoch": 7.882533197139939, "grad_norm": 0.00040170519106549137, "learning_rate": 1.2852306722743934e-06, "loss": 0.0, "step": 2900 }, { "epoch": 7.909771876064011, "grad_norm": 0.0016780869556999771, "learning_rate": 1.253567483348407e-06, "loss": 0.0, "step": 2910 }, { "epoch": 7.937010554988083, "grad_norm": 0.000593202963073429, "learning_rate": 1.222243214453951e-06, "loss": 0.0, "step": 2920 }, { "epoch": 7.964249233912155, "grad_norm": 0.0003643083864401578, "learning_rate": 1.1912606993327685e-06, "loss": 0.0, "step": 2930 }, { "epoch": 7.991487912836227, "grad_norm": 0.0007434796810202931, "learning_rate": 1.1606227408099347e-06, "loss": 0.0, "step": 2940 }, { "epoch": 8.016343207354444, "grad_norm": 0.00047702794678360533, "learning_rate": 1.1303321105403026e-06, "loss": 0.0, "step": 2950 }, { "epoch": 8.043581886278515, "grad_norm": 0.0005298663692543725, "learning_rate": 1.1003915487577683e-06, "loss": 0.0, "step": 2960 }, { "epoch": 8.070820565202588, "grad_norm": 0.004074165794804811, "learning_rate": 1.0708037640273715e-06, "loss": 0.0, "step": 2970 }, { "epoch": 8.09805924412666, "grad_norm": 0.00040511964399428683, "learning_rate": 1.0415714330002729e-06, "loss": 0.0, "step": 2980 }, { "epoch": 8.125297923050733, "grad_norm": 0.0012113673680468461, "learning_rate": 1.0126972001716007e-06, "loss": 0.0, "step": 2990 }, { "epoch": 8.152536601974804, "grad_norm": 0.00041889827396330973, "learning_rate": 9.841836776412294e-07, "loss": 0.0, "step": 3000 }, { "epoch": 8.152536601974804, "eval_loss": 1.2345339200692251e-05, "eval_runtime": 149.4558, "eval_samples_per_second": 1.338, "eval_steps_per_second": 0.167, "step": 3000 }, { "epoch": 8.179775280898877, "grad_norm": 0.00048789814951889245, "learning_rate": 9.560334448774705e-07, "loss": 0.0, "step": 3010 }, { "epoch": 8.207013959822948, "grad_norm": 0.00047638289528187925, "learning_rate": 9.282490484837215e-07, "loss": 0.0, "step": 3020 }, { "epoch": 8.234252638747021, "grad_norm": 0.000455674971676968, "learning_rate": 9.008330019680883e-07, "loss": 0.0, "step": 3030 }, { "epoch": 8.261491317671092, "grad_norm": 0.00041234844635284676, "learning_rate": 8.737877855160032e-07, "loss": 0.0, "step": 3040 }, { "epoch": 8.288729996595166, "grad_norm": 0.0004512145872248025, "learning_rate": 8.471158457658546e-07, "loss": 0.0, "step": 3050 }, { "epoch": 8.315968675519237, "grad_norm": 0.0003955500503942679, "learning_rate": 8.208195955876513e-07, "loss": 0.0, "step": 3060 }, { "epoch": 8.34320735444331, "grad_norm": 0.00037757417965770856, "learning_rate": 7.949014138647442e-07, "loss": 0.0, "step": 3070 }, { "epoch": 8.370446033367381, "grad_norm": 0.0005928754851042547, "learning_rate": 7.693636452786213e-07, "loss": 0.0, "step": 3080 }, { "epoch": 8.397684712291454, "grad_norm": 0.0011757489143008526, "learning_rate": 7.442086000967962e-07, "loss": 0.0, "step": 3090 }, { "epoch": 8.424923391215525, "grad_norm": 0.001091795641988052, "learning_rate": 7.194385539638099e-07, "loss": 0.0, "step": 3100 }, { "epoch": 8.452162070139599, "grad_norm": 0.0004490144524655851, "learning_rate": 6.950557476953674e-07, "loss": 0.0, "step": 3110 }, { "epoch": 8.47940074906367, "grad_norm": 0.0005170136229882478, "learning_rate": 6.710623870756178e-07, "loss": 0.0, "step": 3120 }, { "epoch": 8.506639427987743, "grad_norm": 0.0005606121390368469, "learning_rate": 6.474606426576157e-07, "loss": 0.0, "step": 3130 }, { "epoch": 8.533878106911814, "grad_norm": 0.0006494691276362171, "learning_rate": 6.242526495669587e-07, "loss": 0.0, "step": 3140 }, { "epoch": 8.561116785835887, "grad_norm": 0.000705761138730125, "learning_rate": 6.01440507308631e-07, "loss": 0.0, "step": 3150 }, { "epoch": 8.588355464759958, "grad_norm": 0.00037580712390274833, "learning_rate": 5.790262795770785e-07, "loss": 0.0, "step": 3160 }, { "epoch": 8.615594143684032, "grad_norm": 0.00037712299260292257, "learning_rate": 5.570119940695135e-07, "loss": 0.0, "step": 3170 }, { "epoch": 8.642832822608103, "grad_norm": 0.00038488025576517655, "learning_rate": 5.353996423024804e-07, "loss": 0.0, "step": 3180 }, { "epoch": 8.670071501532176, "grad_norm": 0.0004034621134836744, "learning_rate": 5.141911794316934e-07, "loss": 0.0, "step": 3190 }, { "epoch": 8.697310180456247, "grad_norm": 0.00038266845732824815, "learning_rate": 4.93388524075164e-07, "loss": 0.0, "step": 3200 }, { "epoch": 8.72454885938032, "grad_norm": 0.00038362011868780437, "learning_rate": 4.729935581396328e-07, "loss": 0.0, "step": 3210 }, { "epoch": 8.751787538304391, "grad_norm": 0.00035607543651581207, "learning_rate": 4.5300812665032557e-07, "loss": 0.0, "step": 3220 }, { "epoch": 8.779026217228465, "grad_norm": 0.0006068778689349405, "learning_rate": 4.334340375840418e-07, "loss": 0.0, "step": 3230 }, { "epoch": 8.806264896152536, "grad_norm": 0.00033490740677774155, "learning_rate": 4.1427306170559624e-07, "loss": 0.0, "step": 3240 }, { "epoch": 8.833503575076609, "grad_norm": 0.00046859576305887497, "learning_rate": 3.955269324076294e-07, "loss": 0.0, "step": 3250 }, { "epoch": 8.86074225400068, "grad_norm": 0.0008093853787619229, "learning_rate": 3.771973455537936e-07, "loss": 0.0, "step": 3260 }, { "epoch": 8.887980932924753, "grad_norm": 0.0010513228996015518, "learning_rate": 3.5928595932534005e-07, "loss": 0.0, "step": 3270 }, { "epoch": 8.915219611848826, "grad_norm": 0.0003623450487854711, "learning_rate": 3.4179439407110714e-07, "loss": 0.0, "step": 3280 }, { "epoch": 8.942458290772898, "grad_norm": 0.000339362832377934, "learning_rate": 3.247242321609434e-07, "loss": 0.0, "step": 3290 }, { "epoch": 8.969696969696969, "grad_norm": 0.0005334885527562191, "learning_rate": 3.0807701784255296e-07, "loss": 0.0, "step": 3300 }, { "epoch": 8.996935648621042, "grad_norm": 0.00044685136023717045, "learning_rate": 2.9185425710179737e-07, "loss": 0.0, "step": 3310 }, { "epoch": 9.021790943139258, "grad_norm": 0.0008018093014633273, "learning_rate": 2.7605741752645686e-07, "loss": 0.0, "step": 3320 }, { "epoch": 9.04902962206333, "grad_norm": 0.0003794485076320959, "learning_rate": 2.606879281734659e-07, "loss": 0.0, "step": 3330 }, { "epoch": 9.076268300987403, "grad_norm": 0.000514433360537291, "learning_rate": 2.457471794396338e-07, "loss": 0.0, "step": 3340 }, { "epoch": 9.103506979911474, "grad_norm": 0.0006110392205499266, "learning_rate": 2.3123652293586207e-07, "loss": 0.0, "step": 3350 }, { "epoch": 9.130745658835547, "grad_norm": 0.0004051883420581768, "learning_rate": 2.1715727136487174e-07, "loss": 0.0, "step": 3360 }, { "epoch": 9.157984337759618, "grad_norm": 0.0017022735873803575, "learning_rate": 2.0351069840244986e-07, "loss": 0.0, "step": 3370 }, { "epoch": 9.185223016683691, "grad_norm": 0.000833252754224706, "learning_rate": 1.9029803858222896e-07, "loss": 0.0, "step": 3380 }, { "epoch": 9.212461695607763, "grad_norm": 0.0038641354937789443, "learning_rate": 1.775204871840014e-07, "loss": 0.0, "step": 3390 }, { "epoch": 9.239700374531836, "grad_norm": 0.0003339132774971038, "learning_rate": 1.6517920012559086e-07, "loss": 0.0, "step": 3400 }, { "epoch": 9.266939053455907, "grad_norm": 0.0003633204131676538, "learning_rate": 1.5327529385828377e-07, "loss": 0.0, "step": 3410 }, { "epoch": 9.29417773237998, "grad_norm": 0.000322045602551698, "learning_rate": 1.4180984526582675e-07, "loss": 0.0, "step": 3420 }, { "epoch": 9.321416411304051, "grad_norm": 0.0010744986200642887, "learning_rate": 1.3078389156700842e-07, "loss": 0.0, "step": 3430 }, { "epoch": 9.348655090228124, "grad_norm": 0.0005973674276566756, "learning_rate": 1.2019843022182898e-07, "loss": 0.0, "step": 3440 }, { "epoch": 9.375893769152196, "grad_norm": 0.00035574829244011577, "learning_rate": 1.1005441884126278e-07, "loss": 0.0, "step": 3450 }, { "epoch": 9.403132448076269, "grad_norm": 0.0005334084677271788, "learning_rate": 1.0035277510062835e-07, "loss": 0.0, "step": 3460 }, { "epoch": 9.43037112700034, "grad_norm": 0.00032947386696114886, "learning_rate": 9.109437665657473e-08, "loss": 0.0, "step": 3470 }, { "epoch": 9.457609805924413, "grad_norm": 0.0003827344363811748, "learning_rate": 8.228006106767883e-08, "loss": 0.0, "step": 3480 }, { "epoch": 9.484848484848484, "grad_norm": 0.00030067997503277467, "learning_rate": 7.391062571868113e-08, "loss": 0.0, "step": 3490 }, { "epoch": 9.512087163772557, "grad_norm": 0.0003243561088926095, "learning_rate": 6.598682774834775e-08, "loss": 0.0, "step": 3500 }, { "epoch": 9.539325842696629, "grad_norm": 0.0003467443287273098, "learning_rate": 5.850938398097583e-08, "loss": 0.0, "step": 3510 }, { "epoch": 9.566564521620702, "grad_norm": 0.00035286356515664503, "learning_rate": 5.1478970861548185e-08, "loss": 0.0, "step": 3520 }, { "epoch": 9.593803200544773, "grad_norm": 0.0009776284074226168, "learning_rate": 4.4896224394537226e-08, "loss": 0.0, "step": 3530 }, { "epoch": 9.621041879468846, "grad_norm": 0.00039195791750442195, "learning_rate": 3.8761740086369345e-08, "loss": 0.0, "step": 3540 }, { "epoch": 9.648280558392917, "grad_norm": 0.0005306094646929393, "learning_rate": 3.307607289155301e-08, "loss": 0.0, "step": 3550 }, { "epoch": 9.67551923731699, "grad_norm": 0.0004368446632988839, "learning_rate": 2.78397371624739e-08, "loss": 0.0, "step": 3560 }, { "epoch": 9.702757916241062, "grad_norm": 0.0006477180396400463, "learning_rate": 2.305320660286603e-08, "loss": 0.0, "step": 3570 }, { "epoch": 9.729996595165135, "grad_norm": 0.0005051573397374285, "learning_rate": 1.8716914224957138e-08, "loss": 0.0, "step": 3580 }, { "epoch": 9.757235274089206, "grad_norm": 0.00035959712658455894, "learning_rate": 1.4831252310294474e-08, "loss": 0.0, "step": 3590 }, { "epoch": 9.78447395301328, "grad_norm": 0.00047611287827840345, "learning_rate": 1.1396572374261505e-08, "loss": 0.0, "step": 3600 }, { "epoch": 9.81171263193735, "grad_norm": 0.0009275444049437501, "learning_rate": 8.413185134273916e-09, "loss": 0.0, "step": 3610 }, { "epoch": 9.838951310861423, "grad_norm": 0.0010032462120567445, "learning_rate": 5.881360481673759e-09, "loss": 0.0, "step": 3620 }, { "epoch": 9.866189989785495, "grad_norm": 0.0008774671042869975, "learning_rate": 3.801327457311765e-09, "loss": 0.0, "step": 3630 }, { "epoch": 9.893428668709568, "grad_norm": 0.0007470443454518441, "learning_rate": 2.173274230827249e-09, "loss": 0.0, "step": 3640 }, { "epoch": 9.920667347633639, "grad_norm": 0.0006590459160652287, "learning_rate": 9.97348083627836e-10, "loss": 0.0, "step": 3650 }, { "epoch": 9.947906026557712, "grad_norm": 0.0008391834531967672, "learning_rate": 2.7365539556234444e-10, "loss": 0.0, "step": 3660 }, { "epoch": 9.975144705481783, "grad_norm": 0.00034623849620746836, "learning_rate": 2.261635299039e-12, "loss": 0.0, "step": 3670 }, { "epoch": 9.975144705481783, "step": 3670, "total_flos": 3659279013773312.0, "train_loss": 0.050615286758120444, "train_runtime": 386241.8476, "train_samples_per_second": 2.433, "train_steps_per_second": 0.01 } ], "logging_steps": 10, "max_steps": 3670, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3659279013773312.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }