{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998919736415686, "eval_steps": 500, "global_step": 3471, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014403514457527638, "grad_norm": 9.498580060596645, "learning_rate": 2.2988505747126437e-07, "loss": 1.9905, "step": 5 }, { "epoch": 0.0028807028915055276, "grad_norm": 9.29730840350754, "learning_rate": 5.172413793103449e-07, "loss": 1.9858, "step": 10 }, { "epoch": 0.004321054337258291, "grad_norm": 8.510677726863225, "learning_rate": 8.045977011494253e-07, "loss": 1.9545, "step": 15 }, { "epoch": 0.005761405783011055, "grad_norm": 7.429305787241992, "learning_rate": 1.0919540229885058e-06, "loss": 1.838, "step": 20 }, { "epoch": 0.007201757228763818, "grad_norm": 6.639158892472215, "learning_rate": 1.3793103448275862e-06, "loss": 1.658, "step": 25 }, { "epoch": 0.008642108674516582, "grad_norm": 7.2050289141669035, "learning_rate": 1.6666666666666667e-06, "loss": 1.3294, "step": 30 }, { "epoch": 0.010082460120269346, "grad_norm": 7.64848090123899, "learning_rate": 1.9540229885057475e-06, "loss": 1.0103, "step": 35 }, { "epoch": 0.01152281156602211, "grad_norm": 6.412853419273422, "learning_rate": 2.241379310344828e-06, "loss": 0.5327, "step": 40 }, { "epoch": 0.012963163011774873, "grad_norm": 1.776900584940695, "learning_rate": 2.5287356321839083e-06, "loss": 0.2908, "step": 45 }, { "epoch": 0.014403514457527637, "grad_norm": 1.985660623026991, "learning_rate": 2.8160919540229887e-06, "loss": 0.1841, "step": 50 }, { "epoch": 0.0158438659032804, "grad_norm": 0.7462636137858207, "learning_rate": 3.103448275862069e-06, "loss": 0.1707, "step": 55 }, { "epoch": 0.017284217349033165, "grad_norm": 0.7526761920440953, "learning_rate": 3.3908045977011496e-06, "loss": 0.1525, "step": 60 }, { "epoch": 0.01872456879478593, "grad_norm": 0.7764098707095316, "learning_rate": 3.67816091954023e-06, "loss": 0.1407, "step": 65 }, { "epoch": 0.020164920240538693, "grad_norm": 0.7414932231148509, "learning_rate": 3.96551724137931e-06, "loss": 0.1339, "step": 70 }, { "epoch": 0.021605271686291457, "grad_norm": 0.7014166187976457, "learning_rate": 4.252873563218391e-06, "loss": 0.1294, "step": 75 }, { "epoch": 0.02304562313204422, "grad_norm": 0.4877663944572015, "learning_rate": 4.540229885057471e-06, "loss": 0.1255, "step": 80 }, { "epoch": 0.02448597457779698, "grad_norm": 0.5776767722589294, "learning_rate": 4.8275862068965525e-06, "loss": 0.1214, "step": 85 }, { "epoch": 0.025926326023549745, "grad_norm": 0.6728066183845628, "learning_rate": 5.114942528735632e-06, "loss": 0.1171, "step": 90 }, { "epoch": 0.02736667746930251, "grad_norm": 0.45388962144154116, "learning_rate": 5.402298850574713e-06, "loss": 0.1189, "step": 95 }, { "epoch": 0.028807028915055273, "grad_norm": 0.4586145950809401, "learning_rate": 5.689655172413794e-06, "loss": 0.1083, "step": 100 }, { "epoch": 0.030247380360808037, "grad_norm": 0.5500926742810166, "learning_rate": 5.977011494252874e-06, "loss": 0.1041, "step": 105 }, { "epoch": 0.0316877318065608, "grad_norm": 0.45289151449461224, "learning_rate": 6.264367816091954e-06, "loss": 0.1005, "step": 110 }, { "epoch": 0.03312808325231356, "grad_norm": 1.8638824672613103, "learning_rate": 6.551724137931035e-06, "loss": 0.1004, "step": 115 }, { "epoch": 0.03456843469806633, "grad_norm": 0.4933743801153303, "learning_rate": 6.839080459770115e-06, "loss": 0.0994, "step": 120 }, { "epoch": 0.03600878614381909, "grad_norm": 0.44067596151303207, "learning_rate": 7.126436781609196e-06, "loss": 0.1001, "step": 125 }, { "epoch": 0.03744913758957186, "grad_norm": 0.6952160335299229, "learning_rate": 7.413793103448277e-06, "loss": 0.1015, "step": 130 }, { "epoch": 0.03888948903532462, "grad_norm": 0.48612969095240394, "learning_rate": 7.701149425287356e-06, "loss": 0.0957, "step": 135 }, { "epoch": 0.040329840481077385, "grad_norm": 0.4362186062208644, "learning_rate": 7.988505747126438e-06, "loss": 0.0949, "step": 140 }, { "epoch": 0.041770191926830146, "grad_norm": 0.5207325618446389, "learning_rate": 8.275862068965518e-06, "loss": 0.0912, "step": 145 }, { "epoch": 0.04321054337258291, "grad_norm": 0.3173162407799849, "learning_rate": 8.563218390804599e-06, "loss": 0.0897, "step": 150 }, { "epoch": 0.044650894818335674, "grad_norm": 0.40145266365610965, "learning_rate": 8.85057471264368e-06, "loss": 0.0904, "step": 155 }, { "epoch": 0.04609124626408844, "grad_norm": 0.3558474419241096, "learning_rate": 9.13793103448276e-06, "loss": 0.083, "step": 160 }, { "epoch": 0.0475315977098412, "grad_norm": 0.34959789012602455, "learning_rate": 9.42528735632184e-06, "loss": 0.0907, "step": 165 }, { "epoch": 0.04897194915559396, "grad_norm": 0.4329214768393735, "learning_rate": 9.71264367816092e-06, "loss": 0.0881, "step": 170 }, { "epoch": 0.05041230060134673, "grad_norm": 0.40181207129208235, "learning_rate": 1e-05, "loss": 0.0827, "step": 175 }, { "epoch": 0.05185265204709949, "grad_norm": 0.33262831961207967, "learning_rate": 1.0287356321839081e-05, "loss": 0.0856, "step": 180 }, { "epoch": 0.05329300349285226, "grad_norm": 0.4074048572040497, "learning_rate": 1.0574712643678162e-05, "loss": 0.0872, "step": 185 }, { "epoch": 0.05473335493860502, "grad_norm": 0.6269775026390093, "learning_rate": 1.0862068965517242e-05, "loss": 0.0872, "step": 190 }, { "epoch": 0.056173706384357786, "grad_norm": 0.4503563815568868, "learning_rate": 1.1149425287356324e-05, "loss": 0.0829, "step": 195 }, { "epoch": 0.057614057830110546, "grad_norm": 0.31295787939134934, "learning_rate": 1.1436781609195405e-05, "loss": 0.0882, "step": 200 }, { "epoch": 0.059054409275863314, "grad_norm": 0.4252598579032741, "learning_rate": 1.1724137931034483e-05, "loss": 0.0806, "step": 205 }, { "epoch": 0.060494760721616074, "grad_norm": 2.172621514603248, "learning_rate": 1.2011494252873564e-05, "loss": 0.0847, "step": 210 }, { "epoch": 0.061935112167368835, "grad_norm": 0.29858098908068476, "learning_rate": 1.2298850574712644e-05, "loss": 0.0814, "step": 215 }, { "epoch": 0.0633754636131216, "grad_norm": 0.30454141006022717, "learning_rate": 1.2586206896551725e-05, "loss": 0.0841, "step": 220 }, { "epoch": 0.06481581505887436, "grad_norm": 0.42812641240805016, "learning_rate": 1.2873563218390805e-05, "loss": 0.0822, "step": 225 }, { "epoch": 0.06625616650462712, "grad_norm": 0.4542902663928825, "learning_rate": 1.3160919540229885e-05, "loss": 0.0807, "step": 230 }, { "epoch": 0.0676965179503799, "grad_norm": 0.32671450237608535, "learning_rate": 1.3448275862068967e-05, "loss": 0.0842, "step": 235 }, { "epoch": 0.06913686939613266, "grad_norm": 0.3116704997352597, "learning_rate": 1.3735632183908048e-05, "loss": 0.0835, "step": 240 }, { "epoch": 0.07057722084188542, "grad_norm": 0.3849475487986608, "learning_rate": 1.4022988505747128e-05, "loss": 0.0807, "step": 245 }, { "epoch": 0.07201757228763818, "grad_norm": 0.28484088684959274, "learning_rate": 1.4310344827586209e-05, "loss": 0.0816, "step": 250 }, { "epoch": 0.07345792373339095, "grad_norm": 0.3337157396250414, "learning_rate": 1.459770114942529e-05, "loss": 0.0839, "step": 255 }, { "epoch": 0.07489827517914371, "grad_norm": 0.2664400550606567, "learning_rate": 1.4885057471264368e-05, "loss": 0.0817, "step": 260 }, { "epoch": 0.07633862662489647, "grad_norm": 0.2505155478711773, "learning_rate": 1.5172413793103448e-05, "loss": 0.0783, "step": 265 }, { "epoch": 0.07777897807064924, "grad_norm": 0.3335733996941115, "learning_rate": 1.545977011494253e-05, "loss": 0.0755, "step": 270 }, { "epoch": 0.079219329516402, "grad_norm": 0.26719459163424675, "learning_rate": 1.574712643678161e-05, "loss": 0.0789, "step": 275 }, { "epoch": 0.08065968096215477, "grad_norm": 0.5975638138236752, "learning_rate": 1.603448275862069e-05, "loss": 0.0811, "step": 280 }, { "epoch": 0.08210003240790753, "grad_norm": 0.3120443711700076, "learning_rate": 1.632183908045977e-05, "loss": 0.0783, "step": 285 }, { "epoch": 0.08354038385366029, "grad_norm": 0.5532978578404874, "learning_rate": 1.6609195402298854e-05, "loss": 0.0774, "step": 290 }, { "epoch": 0.08498073529941305, "grad_norm": 0.3019847180020645, "learning_rate": 1.6896551724137932e-05, "loss": 0.0795, "step": 295 }, { "epoch": 0.08642108674516583, "grad_norm": 0.4488280416931883, "learning_rate": 1.7183908045977015e-05, "loss": 0.0802, "step": 300 }, { "epoch": 0.08786143819091859, "grad_norm": 0.2713355446318922, "learning_rate": 1.7471264367816093e-05, "loss": 0.0825, "step": 305 }, { "epoch": 0.08930178963667135, "grad_norm": 0.2670783396564851, "learning_rate": 1.7758620689655175e-05, "loss": 0.0835, "step": 310 }, { "epoch": 0.09074214108242411, "grad_norm": 0.2708166986235282, "learning_rate": 1.8045977011494254e-05, "loss": 0.0743, "step": 315 }, { "epoch": 0.09218249252817688, "grad_norm": 0.23178489300951058, "learning_rate": 1.8333333333333333e-05, "loss": 0.0812, "step": 320 }, { "epoch": 0.09362284397392964, "grad_norm": 0.2448448154224435, "learning_rate": 1.8620689655172415e-05, "loss": 0.078, "step": 325 }, { "epoch": 0.0950631954196824, "grad_norm": 0.2184469288572962, "learning_rate": 1.8908045977011497e-05, "loss": 0.076, "step": 330 }, { "epoch": 0.09650354686543516, "grad_norm": 0.19322313487222872, "learning_rate": 1.9195402298850576e-05, "loss": 0.0759, "step": 335 }, { "epoch": 0.09794389831118792, "grad_norm": 0.19526384677265263, "learning_rate": 1.9482758620689658e-05, "loss": 0.0751, "step": 340 }, { "epoch": 0.0993842497569407, "grad_norm": 0.4273996002750209, "learning_rate": 1.9770114942528737e-05, "loss": 0.0791, "step": 345 }, { "epoch": 0.10082460120269346, "grad_norm": 0.36910225249575607, "learning_rate": 1.9999994940288617e-05, "loss": 0.0819, "step": 350 }, { "epoch": 0.10226495264844622, "grad_norm": 0.22664069834453254, "learning_rate": 1.999981785092774e-05, "loss": 0.0715, "step": 355 }, { "epoch": 0.10370530409419898, "grad_norm": 0.22035293449009333, "learning_rate": 1.9999387781117715e-05, "loss": 0.0768, "step": 360 }, { "epoch": 0.10514565553995175, "grad_norm": 0.2409697392761486, "learning_rate": 1.9998704741738657e-05, "loss": 0.0785, "step": 365 }, { "epoch": 0.10658600698570452, "grad_norm": 0.19849583830438824, "learning_rate": 1.9997768750070442e-05, "loss": 0.0789, "step": 370 }, { "epoch": 0.10802635843145728, "grad_norm": 0.21065073891564184, "learning_rate": 1.9996579829792263e-05, "loss": 0.0741, "step": 375 }, { "epoch": 0.10946670987721004, "grad_norm": 0.2322217176695769, "learning_rate": 1.9995138010982028e-05, "loss": 0.0756, "step": 380 }, { "epoch": 0.1109070613229628, "grad_norm": 0.18082957951524414, "learning_rate": 1.9993443330115592e-05, "loss": 0.0755, "step": 385 }, { "epoch": 0.11234741276871557, "grad_norm": 0.2057441714111917, "learning_rate": 1.9991495830065857e-05, "loss": 0.077, "step": 390 }, { "epoch": 0.11378776421446833, "grad_norm": 0.1989077999547036, "learning_rate": 1.9989295560101656e-05, "loss": 0.0773, "step": 395 }, { "epoch": 0.11522811566022109, "grad_norm": 0.19081663464856954, "learning_rate": 1.998684257588654e-05, "loss": 0.0741, "step": 400 }, { "epoch": 0.11666846710597385, "grad_norm": 0.19247855517384488, "learning_rate": 1.9984136939477333e-05, "loss": 0.0757, "step": 405 }, { "epoch": 0.11810881855172663, "grad_norm": 0.19993453168502634, "learning_rate": 1.9981178719322606e-05, "loss": 0.0706, "step": 410 }, { "epoch": 0.11954916999747939, "grad_norm": 0.22409961064189973, "learning_rate": 1.99779679902609e-05, "loss": 0.0802, "step": 415 }, { "epoch": 0.12098952144323215, "grad_norm": 0.1850526119008749, "learning_rate": 1.9974504833518863e-05, "loss": 0.0778, "step": 420 }, { "epoch": 0.12242987288898491, "grad_norm": 0.20831229741416568, "learning_rate": 1.9970789336709185e-05, "loss": 0.0787, "step": 425 }, { "epoch": 0.12387022433473767, "grad_norm": 0.17198184608681197, "learning_rate": 1.9966821593828393e-05, "loss": 0.0758, "step": 430 }, { "epoch": 0.12531057578049043, "grad_norm": 0.18655586258401988, "learning_rate": 1.9962601705254442e-05, "loss": 0.0713, "step": 435 }, { "epoch": 0.1267509272262432, "grad_norm": 0.20225353346040226, "learning_rate": 1.995812977774421e-05, "loss": 0.076, "step": 440 }, { "epoch": 0.12819127867199598, "grad_norm": 0.2104521490426979, "learning_rate": 1.995340592443078e-05, "loss": 0.075, "step": 445 }, { "epoch": 0.12963163011774873, "grad_norm": 0.3236324710240072, "learning_rate": 1.9948430264820588e-05, "loss": 0.0752, "step": 450 }, { "epoch": 0.1310719815635015, "grad_norm": 0.1817007734256695, "learning_rate": 1.994320292479038e-05, "loss": 0.0714, "step": 455 }, { "epoch": 0.13251233300925425, "grad_norm": 0.21258018748179575, "learning_rate": 1.993772403658405e-05, "loss": 0.0741, "step": 460 }, { "epoch": 0.13395268445500702, "grad_norm": 0.1908524637842489, "learning_rate": 1.9931993738809288e-05, "loss": 0.0706, "step": 465 }, { "epoch": 0.1353930359007598, "grad_norm": 0.20920739812381056, "learning_rate": 1.9926012176434054e-05, "loss": 0.0677, "step": 470 }, { "epoch": 0.13683338734651254, "grad_norm": 0.1748700492849436, "learning_rate": 1.991977950078295e-05, "loss": 0.07, "step": 475 }, { "epoch": 0.13827373879226532, "grad_norm": 0.20542711837724692, "learning_rate": 1.9913295869533345e-05, "loss": 0.0717, "step": 480 }, { "epoch": 0.13971409023801806, "grad_norm": 0.1976538158422665, "learning_rate": 1.990656144671143e-05, "loss": 0.0755, "step": 485 }, { "epoch": 0.14115444168377084, "grad_norm": 0.18639103707023377, "learning_rate": 1.9899576402688038e-05, "loss": 0.0712, "step": 490 }, { "epoch": 0.1425947931295236, "grad_norm": 0.19411614042491165, "learning_rate": 1.9892340914174344e-05, "loss": 0.079, "step": 495 }, { "epoch": 0.14403514457527636, "grad_norm": 0.17878733422071091, "learning_rate": 1.988485516421739e-05, "loss": 0.0724, "step": 500 }, { "epoch": 0.14403514457527636, "eval_loss": 0.0738767758011818, "eval_runtime": 203.7076, "eval_samples_per_second": 8.856, "eval_steps_per_second": 2.214, "step": 500 }, { "epoch": 0.14547549602102913, "grad_norm": 0.20095627980982742, "learning_rate": 1.9877119342195478e-05, "loss": 0.0751, "step": 505 }, { "epoch": 0.1469158474667819, "grad_norm": 0.19127062420032984, "learning_rate": 1.986913364381333e-05, "loss": 0.0768, "step": 510 }, { "epoch": 0.14835619891253465, "grad_norm": 0.17411671796743106, "learning_rate": 1.9860898271097194e-05, "loss": 0.0721, "step": 515 }, { "epoch": 0.14979655035828743, "grad_norm": 0.20042970248260944, "learning_rate": 1.9852413432389685e-05, "loss": 0.0712, "step": 520 }, { "epoch": 0.15123690180404017, "grad_norm": 0.15880253535731764, "learning_rate": 1.984367934234455e-05, "loss": 0.0705, "step": 525 }, { "epoch": 0.15267725324979295, "grad_norm": 0.18651323384428312, "learning_rate": 1.9834696221921213e-05, "loss": 0.07, "step": 530 }, { "epoch": 0.15411760469554572, "grad_norm": 0.17659413165830717, "learning_rate": 1.98254642983792e-05, "loss": 0.0706, "step": 535 }, { "epoch": 0.15555795614129847, "grad_norm": 0.18160644848664637, "learning_rate": 1.9815983805272378e-05, "loss": 0.0683, "step": 540 }, { "epoch": 0.15699830758705124, "grad_norm": 0.19445538759505177, "learning_rate": 1.980625498244306e-05, "loss": 0.0764, "step": 545 }, { "epoch": 0.158438659032804, "grad_norm": 0.18455595535196012, "learning_rate": 1.9796278076015924e-05, "loss": 0.0697, "step": 550 }, { "epoch": 0.15987901047855677, "grad_norm": 0.18184833856061972, "learning_rate": 1.9786053338391792e-05, "loss": 0.0731, "step": 555 }, { "epoch": 0.16131936192430954, "grad_norm": 0.19338517000535457, "learning_rate": 1.9775581028241253e-05, "loss": 0.0663, "step": 560 }, { "epoch": 0.1627597133700623, "grad_norm": 0.19538801377440532, "learning_rate": 1.97648614104981e-05, "loss": 0.0675, "step": 565 }, { "epoch": 0.16420006481581506, "grad_norm": 0.19062085672060544, "learning_rate": 1.9753894756352643e-05, "loss": 0.0747, "step": 570 }, { "epoch": 0.16564041626156784, "grad_norm": 0.16777348688294874, "learning_rate": 1.9742681343244853e-05, "loss": 0.0667, "step": 575 }, { "epoch": 0.16708076770732058, "grad_norm": 0.170537203370038, "learning_rate": 1.9731221454857322e-05, "loss": 0.0716, "step": 580 }, { "epoch": 0.16852111915307336, "grad_norm": 0.18822505175377233, "learning_rate": 1.9719515381108093e-05, "loss": 0.0737, "step": 585 }, { "epoch": 0.1699614705988261, "grad_norm": 0.17674366415250964, "learning_rate": 1.970756341814335e-05, "loss": 0.0717, "step": 590 }, { "epoch": 0.17140182204457888, "grad_norm": 0.15690653895690873, "learning_rate": 1.9695365868329895e-05, "loss": 0.0721, "step": 595 }, { "epoch": 0.17284217349033165, "grad_norm": 0.18017195479881226, "learning_rate": 1.9682923040247513e-05, "loss": 0.071, "step": 600 }, { "epoch": 0.1742825249360844, "grad_norm": 0.18105083556119095, "learning_rate": 1.9670235248681154e-05, "loss": 0.069, "step": 605 }, { "epoch": 0.17572287638183717, "grad_norm": 0.21301510006977412, "learning_rate": 1.965730281461299e-05, "loss": 0.0728, "step": 610 }, { "epoch": 0.17716322782758992, "grad_norm": 0.20100310691523807, "learning_rate": 1.964412606521428e-05, "loss": 0.0706, "step": 615 }, { "epoch": 0.1786035792733427, "grad_norm": 0.19291740459291176, "learning_rate": 1.9630705333837096e-05, "loss": 0.0716, "step": 620 }, { "epoch": 0.18004393071909547, "grad_norm": 0.18688635532483766, "learning_rate": 1.9617040960005883e-05, "loss": 0.0707, "step": 625 }, { "epoch": 0.18148428216484822, "grad_norm": 0.1763269342109086, "learning_rate": 1.9603133289408883e-05, "loss": 0.0703, "step": 630 }, { "epoch": 0.182924633610601, "grad_norm": 0.1814521955212879, "learning_rate": 1.9588982673889373e-05, "loss": 0.0718, "step": 635 }, { "epoch": 0.18436498505635376, "grad_norm": 0.16733655331154948, "learning_rate": 1.9574589471436794e-05, "loss": 0.0686, "step": 640 }, { "epoch": 0.1858053365021065, "grad_norm": 0.17403974789227739, "learning_rate": 1.955995404617765e-05, "loss": 0.0723, "step": 645 }, { "epoch": 0.18724568794785929, "grad_norm": 0.18173738403202522, "learning_rate": 1.9545076768366336e-05, "loss": 0.0649, "step": 650 }, { "epoch": 0.18868603939361203, "grad_norm": 0.19621468099442035, "learning_rate": 1.9529958014375748e-05, "loss": 0.0723, "step": 655 }, { "epoch": 0.1901263908393648, "grad_norm": 0.167135269028582, "learning_rate": 1.9514598166687772e-05, "loss": 0.0747, "step": 660 }, { "epoch": 0.19156674228511758, "grad_norm": 0.17552296206320767, "learning_rate": 1.9498997613883597e-05, "loss": 0.0767, "step": 665 }, { "epoch": 0.19300709373087033, "grad_norm": 0.18484213292382873, "learning_rate": 1.9483156750633906e-05, "loss": 0.0754, "step": 670 }, { "epoch": 0.1944474451766231, "grad_norm": 0.22352663539188908, "learning_rate": 1.946707597768886e-05, "loss": 0.0678, "step": 675 }, { "epoch": 0.19588779662237585, "grad_norm": 0.19471837734529257, "learning_rate": 1.9450755701867994e-05, "loss": 0.0717, "step": 680 }, { "epoch": 0.19732814806812862, "grad_norm": 0.1638183568509793, "learning_rate": 1.9434196336049897e-05, "loss": 0.0655, "step": 685 }, { "epoch": 0.1987684995138814, "grad_norm": 0.20754326959948957, "learning_rate": 1.941739829916177e-05, "loss": 0.0692, "step": 690 }, { "epoch": 0.20020885095963414, "grad_norm": 0.18228455004715827, "learning_rate": 1.940036201616886e-05, "loss": 0.0706, "step": 695 }, { "epoch": 0.20164920240538692, "grad_norm": 0.19405238266885175, "learning_rate": 1.9383087918063662e-05, "loss": 0.073, "step": 700 }, { "epoch": 0.20308955385113966, "grad_norm": 0.17969147951103687, "learning_rate": 1.9365576441855046e-05, "loss": 0.0715, "step": 705 }, { "epoch": 0.20452990529689244, "grad_norm": 0.17634958619875035, "learning_rate": 1.9347828030557196e-05, "loss": 0.0695, "step": 710 }, { "epoch": 0.2059702567426452, "grad_norm": 0.17555207523689614, "learning_rate": 1.932984313317839e-05, "loss": 0.0726, "step": 715 }, { "epoch": 0.20741060818839796, "grad_norm": 0.18762117292247804, "learning_rate": 1.931162220470967e-05, "loss": 0.0697, "step": 720 }, { "epoch": 0.20885095963415073, "grad_norm": 0.18276137401754816, "learning_rate": 1.9293165706113287e-05, "loss": 0.0703, "step": 725 }, { "epoch": 0.2102913110799035, "grad_norm": 0.1773268424439162, "learning_rate": 1.9274474104311083e-05, "loss": 0.0689, "step": 730 }, { "epoch": 0.21173166252565626, "grad_norm": 0.25970999430399005, "learning_rate": 1.925554787217265e-05, "loss": 0.0706, "step": 735 }, { "epoch": 0.21317201397140903, "grad_norm": 0.19568404818114638, "learning_rate": 1.9236387488503378e-05, "loss": 0.0685, "step": 740 }, { "epoch": 0.21461236541716178, "grad_norm": 0.17278153132178845, "learning_rate": 1.921699343803235e-05, "loss": 0.0627, "step": 745 }, { "epoch": 0.21605271686291455, "grad_norm": 0.16307291332850227, "learning_rate": 1.9197366211400058e-05, "loss": 0.0716, "step": 750 }, { "epoch": 0.21749306830866733, "grad_norm": 0.16739005016913056, "learning_rate": 1.9177506305146008e-05, "loss": 0.0735, "step": 755 }, { "epoch": 0.21893341975442007, "grad_norm": 0.1652055715511055, "learning_rate": 1.9157414221696155e-05, "loss": 0.0695, "step": 760 }, { "epoch": 0.22037377120017285, "grad_norm": 0.1536676585476364, "learning_rate": 1.9137090469350185e-05, "loss": 0.0705, "step": 765 }, { "epoch": 0.2218141226459256, "grad_norm": 0.16180623315358525, "learning_rate": 1.9116535562268658e-05, "loss": 0.0686, "step": 770 }, { "epoch": 0.22325447409167837, "grad_norm": 0.16794475897997235, "learning_rate": 1.9095750020460017e-05, "loss": 0.0642, "step": 775 }, { "epoch": 0.22469482553743114, "grad_norm": 0.19565446028768624, "learning_rate": 1.9074734369767407e-05, "loss": 0.0679, "step": 780 }, { "epoch": 0.2261351769831839, "grad_norm": 0.20795674415023988, "learning_rate": 1.9053489141855386e-05, "loss": 0.0708, "step": 785 }, { "epoch": 0.22757552842893666, "grad_norm": 0.19615918609220573, "learning_rate": 1.9032014874196476e-05, "loss": 0.0748, "step": 790 }, { "epoch": 0.22901587987468944, "grad_norm": 0.19145581797389669, "learning_rate": 1.901031211005756e-05, "loss": 0.0693, "step": 795 }, { "epoch": 0.23045623132044218, "grad_norm": 0.18628621622706582, "learning_rate": 1.898838139848614e-05, "loss": 0.0691, "step": 800 }, { "epoch": 0.23189658276619496, "grad_norm": 0.1694305297665475, "learning_rate": 1.8966223294296445e-05, "loss": 0.0697, "step": 805 }, { "epoch": 0.2333369342119477, "grad_norm": 0.16668102270136734, "learning_rate": 1.8943838358055403e-05, "loss": 0.0756, "step": 810 }, { "epoch": 0.23477728565770048, "grad_norm": 0.16726519179537463, "learning_rate": 1.892122715606846e-05, "loss": 0.0703, "step": 815 }, { "epoch": 0.23621763710345325, "grad_norm": 0.15693138947405286, "learning_rate": 1.8898390260365227e-05, "loss": 0.0666, "step": 820 }, { "epoch": 0.237657988549206, "grad_norm": 0.15236512989059178, "learning_rate": 1.8875328248685047e-05, "loss": 0.0729, "step": 825 }, { "epoch": 0.23909833999495878, "grad_norm": 0.16512723377993288, "learning_rate": 1.885204170446235e-05, "loss": 0.073, "step": 830 }, { "epoch": 0.24053869144071152, "grad_norm": 0.8508290681028176, "learning_rate": 1.8828531216811912e-05, "loss": 0.0664, "step": 835 }, { "epoch": 0.2419790428864643, "grad_norm": 0.15837795426732162, "learning_rate": 1.8804797380513944e-05, "loss": 0.065, "step": 840 }, { "epoch": 0.24341939433221707, "grad_norm": 0.17566377373799896, "learning_rate": 1.878084079599903e-05, "loss": 0.0692, "step": 845 }, { "epoch": 0.24485974577796982, "grad_norm": 0.16777496534468336, "learning_rate": 1.8756662069332966e-05, "loss": 0.0697, "step": 850 }, { "epoch": 0.2463000972237226, "grad_norm": 0.15440366173279146, "learning_rate": 1.8732261812201408e-05, "loss": 0.0662, "step": 855 }, { "epoch": 0.24774044866947534, "grad_norm": 0.2354593017395783, "learning_rate": 1.8707640641894395e-05, "loss": 0.0677, "step": 860 }, { "epoch": 0.2491808001152281, "grad_norm": 0.18206050600161866, "learning_rate": 1.8682799181290747e-05, "loss": 0.0682, "step": 865 }, { "epoch": 0.25062115156098086, "grad_norm": 0.1878723763548132, "learning_rate": 1.86577380588423e-05, "loss": 0.0672, "step": 870 }, { "epoch": 0.25206150300673363, "grad_norm": 0.17948035203695148, "learning_rate": 1.8632457908558006e-05, "loss": 0.0656, "step": 875 }, { "epoch": 0.2535018544524864, "grad_norm": 0.19008061284514347, "learning_rate": 1.8606959369987885e-05, "loss": 0.075, "step": 880 }, { "epoch": 0.2549422058982392, "grad_norm": 0.1698902353596563, "learning_rate": 1.8581243088206865e-05, "loss": 0.0731, "step": 885 }, { "epoch": 0.25638255734399196, "grad_norm": 0.1851448703226216, "learning_rate": 1.8555309713798445e-05, "loss": 0.0678, "step": 890 }, { "epoch": 0.2578229087897447, "grad_norm": 0.1722062227581134, "learning_rate": 1.8529159902838253e-05, "loss": 0.066, "step": 895 }, { "epoch": 0.25926326023549745, "grad_norm": 0.15672701800283906, "learning_rate": 1.8502794316877423e-05, "loss": 0.0663, "step": 900 }, { "epoch": 0.2607036116812502, "grad_norm": 0.2613574893910426, "learning_rate": 1.8476213622925885e-05, "loss": 0.0663, "step": 905 }, { "epoch": 0.262143963127003, "grad_norm": 0.5103888927452046, "learning_rate": 1.844941849343548e-05, "loss": 0.0675, "step": 910 }, { "epoch": 0.2635843145727558, "grad_norm": 0.2024265148274089, "learning_rate": 1.842240960628294e-05, "loss": 0.0691, "step": 915 }, { "epoch": 0.2650246660185085, "grad_norm": 0.1938252123776783, "learning_rate": 1.8395187644752756e-05, "loss": 0.0645, "step": 920 }, { "epoch": 0.26646501746426127, "grad_norm": 0.17332839893721164, "learning_rate": 1.8367753297519874e-05, "loss": 0.0656, "step": 925 }, { "epoch": 0.26790536891001404, "grad_norm": 0.16271679417725463, "learning_rate": 1.8340107258632288e-05, "loss": 0.0698, "step": 930 }, { "epoch": 0.2693457203557668, "grad_norm": 0.1948182360731314, "learning_rate": 1.831225022749347e-05, "loss": 0.0647, "step": 935 }, { "epoch": 0.2707860718015196, "grad_norm": 0.18497254720455072, "learning_rate": 1.828418290884468e-05, "loss": 0.0669, "step": 940 }, { "epoch": 0.2722264232472723, "grad_norm": 0.17841388729576735, "learning_rate": 1.8255906012747137e-05, "loss": 0.0634, "step": 945 }, { "epoch": 0.2736667746930251, "grad_norm": 0.17987207372942032, "learning_rate": 1.8227420254564066e-05, "loss": 0.062, "step": 950 }, { "epoch": 0.27510712613877786, "grad_norm": 0.15594487078267172, "learning_rate": 1.819872635494258e-05, "loss": 0.0662, "step": 955 }, { "epoch": 0.27654747758453063, "grad_norm": 0.16476402091196268, "learning_rate": 1.816982503979546e-05, "loss": 0.0698, "step": 960 }, { "epoch": 0.2779878290302834, "grad_norm": 0.17955556471513223, "learning_rate": 1.8140717040282797e-05, "loss": 0.0653, "step": 965 }, { "epoch": 0.2794281804760361, "grad_norm": 0.23723396239220879, "learning_rate": 1.811140309279348e-05, "loss": 0.0686, "step": 970 }, { "epoch": 0.2808685319217889, "grad_norm": 0.18799932816101794, "learning_rate": 1.808188393892658e-05, "loss": 0.0689, "step": 975 }, { "epoch": 0.2823088833675417, "grad_norm": 0.1730477583895621, "learning_rate": 1.805216032547258e-05, "loss": 0.0689, "step": 980 }, { "epoch": 0.28374923481329445, "grad_norm": 0.17077954636100237, "learning_rate": 1.8022233004394487e-05, "loss": 0.0705, "step": 985 }, { "epoch": 0.2851895862590472, "grad_norm": 0.2007911822516418, "learning_rate": 1.7992102732808798e-05, "loss": 0.0689, "step": 990 }, { "epoch": 0.2866299377048, "grad_norm": 0.19728704564899405, "learning_rate": 1.796177027296637e-05, "loss": 0.0706, "step": 995 }, { "epoch": 0.2880702891505527, "grad_norm": 0.17858125265730437, "learning_rate": 1.79312363922331e-05, "loss": 0.07, "step": 1000 }, { "epoch": 0.2880702891505527, "eval_loss": 0.0674939900636673, "eval_runtime": 203.1313, "eval_samples_per_second": 8.881, "eval_steps_per_second": 2.22, "step": 1000 }, { "epoch": 0.2895106405963055, "grad_norm": 0.16535683477861135, "learning_rate": 1.7900501863070552e-05, "loss": 0.0635, "step": 1005 }, { "epoch": 0.29095099204205827, "grad_norm": 0.17493397556340032, "learning_rate": 1.7869567463016394e-05, "loss": 0.0626, "step": 1010 }, { "epoch": 0.29239134348781104, "grad_norm": 0.18719256822064576, "learning_rate": 1.7838433974664714e-05, "loss": 0.0711, "step": 1015 }, { "epoch": 0.2938316949335638, "grad_norm": 0.17416319375618142, "learning_rate": 1.7807102185646247e-05, "loss": 0.0686, "step": 1020 }, { "epoch": 0.29527204637931653, "grad_norm": 0.18401267882699013, "learning_rate": 1.7775572888608438e-05, "loss": 0.0669, "step": 1025 }, { "epoch": 0.2967123978250693, "grad_norm": 0.18105211685459255, "learning_rate": 1.774384688119539e-05, "loss": 0.0714, "step": 1030 }, { "epoch": 0.2981527492708221, "grad_norm": 0.15402578107011958, "learning_rate": 1.7711924966027678e-05, "loss": 0.0662, "step": 1035 }, { "epoch": 0.29959310071657486, "grad_norm": 0.18966226185659374, "learning_rate": 1.767980795068206e-05, "loss": 0.0715, "step": 1040 }, { "epoch": 0.30103345216232763, "grad_norm": 0.22533074590736946, "learning_rate": 1.7647496647671033e-05, "loss": 0.0678, "step": 1045 }, { "epoch": 0.30247380360808035, "grad_norm": 0.14978667904322185, "learning_rate": 1.761499187442228e-05, "loss": 0.0602, "step": 1050 }, { "epoch": 0.3039141550538331, "grad_norm": 0.16009119628660812, "learning_rate": 1.7582294453257996e-05, "loss": 0.0737, "step": 1055 }, { "epoch": 0.3053545064995859, "grad_norm": 0.1547831176643596, "learning_rate": 1.7549405211374072e-05, "loss": 0.0686, "step": 1060 }, { "epoch": 0.3067948579453387, "grad_norm": 0.17560213727576401, "learning_rate": 1.7516324980819185e-05, "loss": 0.0651, "step": 1065 }, { "epoch": 0.30823520939109145, "grad_norm": 0.1667942680628308, "learning_rate": 1.7483054598473734e-05, "loss": 0.0731, "step": 1070 }, { "epoch": 0.30967556083684417, "grad_norm": 0.18465697565580333, "learning_rate": 1.7449594906028684e-05, "loss": 0.067, "step": 1075 }, { "epoch": 0.31111591228259694, "grad_norm": 0.16066322224426005, "learning_rate": 1.7415946749964252e-05, "loss": 0.0667, "step": 1080 }, { "epoch": 0.3125562637283497, "grad_norm": 0.158008871885263, "learning_rate": 1.7382110981528506e-05, "loss": 0.0643, "step": 1085 }, { "epoch": 0.3139966151741025, "grad_norm": 0.18023373354288208, "learning_rate": 1.734808845671583e-05, "loss": 0.0603, "step": 1090 }, { "epoch": 0.31543696661985526, "grad_norm": 0.16286673700727694, "learning_rate": 1.7313880036245257e-05, "loss": 0.0649, "step": 1095 }, { "epoch": 0.316877318065608, "grad_norm": 0.1673107111388145, "learning_rate": 1.7279486585538712e-05, "loss": 0.067, "step": 1100 }, { "epoch": 0.31831766951136076, "grad_norm": 0.180435927792874, "learning_rate": 1.7244908974699112e-05, "loss": 0.0683, "step": 1105 }, { "epoch": 0.31975802095711353, "grad_norm": 0.15646264704364563, "learning_rate": 1.721014807848833e-05, "loss": 0.0698, "step": 1110 }, { "epoch": 0.3211983724028663, "grad_norm": 0.19141062322094995, "learning_rate": 1.7175204776305102e-05, "loss": 0.0634, "step": 1115 }, { "epoch": 0.3226387238486191, "grad_norm": 0.1539348164012501, "learning_rate": 1.7140079952162765e-05, "loss": 0.0649, "step": 1120 }, { "epoch": 0.32407907529437185, "grad_norm": 0.16127336339051726, "learning_rate": 1.7104774494666877e-05, "loss": 0.0631, "step": 1125 }, { "epoch": 0.3255194267401246, "grad_norm": 0.1541424306237969, "learning_rate": 1.7069289296992756e-05, "loss": 0.0667, "step": 1130 }, { "epoch": 0.32695977818587735, "grad_norm": 0.15026114726792159, "learning_rate": 1.703362525686288e-05, "loss": 0.0648, "step": 1135 }, { "epoch": 0.3284001296316301, "grad_norm": 0.15367694481808417, "learning_rate": 1.6997783276524177e-05, "loss": 0.0678, "step": 1140 }, { "epoch": 0.3298404810773829, "grad_norm": 0.14648710292293746, "learning_rate": 1.6961764262725187e-05, "loss": 0.0632, "step": 1145 }, { "epoch": 0.33128083252313567, "grad_norm": 0.15611228592369236, "learning_rate": 1.6925569126693135e-05, "loss": 0.0604, "step": 1150 }, { "epoch": 0.3327211839688884, "grad_norm": 0.16896274297951375, "learning_rate": 1.6889198784110883e-05, "loss": 0.0617, "step": 1155 }, { "epoch": 0.33416153541464116, "grad_norm": 0.14046862243355399, "learning_rate": 1.6852654155093745e-05, "loss": 0.0667, "step": 1160 }, { "epoch": 0.33560188686039394, "grad_norm": 0.16033306567178615, "learning_rate": 1.681593616416623e-05, "loss": 0.0667, "step": 1165 }, { "epoch": 0.3370422383061467, "grad_norm": 0.1464089660272697, "learning_rate": 1.6779045740238643e-05, "loss": 0.0689, "step": 1170 }, { "epoch": 0.3384825897518995, "grad_norm": 0.14637802553541343, "learning_rate": 1.6741983816583583e-05, "loss": 0.0644, "step": 1175 }, { "epoch": 0.3399229411976522, "grad_norm": 0.14849896211371577, "learning_rate": 1.6704751330812342e-05, "loss": 0.0593, "step": 1180 }, { "epoch": 0.341363292643405, "grad_norm": 0.16869632868201492, "learning_rate": 1.666734922485117e-05, "loss": 0.0651, "step": 1185 }, { "epoch": 0.34280364408915776, "grad_norm": 0.16979103264214, "learning_rate": 1.662977844491746e-05, "loss": 0.0681, "step": 1190 }, { "epoch": 0.34424399553491053, "grad_norm": 0.1485954334355779, "learning_rate": 1.6592039941495803e-05, "loss": 0.0608, "step": 1195 }, { "epoch": 0.3456843469806633, "grad_norm": 0.14777980222615944, "learning_rate": 1.6554134669313943e-05, "loss": 0.0676, "step": 1200 }, { "epoch": 0.347124698426416, "grad_norm": 0.1509175328811579, "learning_rate": 1.6516063587318627e-05, "loss": 0.0629, "step": 1205 }, { "epoch": 0.3485650498721688, "grad_norm": 0.16676709581052593, "learning_rate": 1.647782765865134e-05, "loss": 0.0619, "step": 1210 }, { "epoch": 0.35000540131792157, "grad_norm": 0.15838480213664755, "learning_rate": 1.6439427850623944e-05, "loss": 0.067, "step": 1215 }, { "epoch": 0.35144575276367435, "grad_norm": 0.14839184568071354, "learning_rate": 1.64008651346942e-05, "loss": 0.0642, "step": 1220 }, { "epoch": 0.3528861042094271, "grad_norm": 0.1648207628445432, "learning_rate": 1.63621404864412e-05, "loss": 0.0658, "step": 1225 }, { "epoch": 0.35432645565517984, "grad_norm": 0.1591196777066707, "learning_rate": 1.6323254885540672e-05, "loss": 0.068, "step": 1230 }, { "epoch": 0.3557668071009326, "grad_norm": 0.14084965766747376, "learning_rate": 1.6284209315740225e-05, "loss": 0.0617, "step": 1235 }, { "epoch": 0.3572071585466854, "grad_norm": 0.15809745947459444, "learning_rate": 1.6245004764834423e-05, "loss": 0.0657, "step": 1240 }, { "epoch": 0.35864750999243816, "grad_norm": 0.15253088492537956, "learning_rate": 1.620564222463982e-05, "loss": 0.065, "step": 1245 }, { "epoch": 0.36008786143819094, "grad_norm": 0.14590704135450994, "learning_rate": 1.6166122690969872e-05, "loss": 0.0618, "step": 1250 }, { "epoch": 0.36152821288394366, "grad_norm": 0.13085111936326246, "learning_rate": 1.612644716360972e-05, "loss": 0.0568, "step": 1255 }, { "epoch": 0.36296856432969643, "grad_norm": 0.15489937634073334, "learning_rate": 1.6086616646290926e-05, "loss": 0.0704, "step": 1260 }, { "epoch": 0.3644089157754492, "grad_norm": 0.15995869543145932, "learning_rate": 1.6046632146666056e-05, "loss": 0.0639, "step": 1265 }, { "epoch": 0.365849267221202, "grad_norm": 0.14803784398219583, "learning_rate": 1.60064946762832e-05, "loss": 0.063, "step": 1270 }, { "epoch": 0.36728961866695475, "grad_norm": 0.1482802413474464, "learning_rate": 1.5966205250560393e-05, "loss": 0.0667, "step": 1275 }, { "epoch": 0.36872997011270753, "grad_norm": 0.14080015648710145, "learning_rate": 1.592576488875989e-05, "loss": 0.0657, "step": 1280 }, { "epoch": 0.37017032155846025, "grad_norm": 0.14023456289860875, "learning_rate": 1.5885174613962427e-05, "loss": 0.0661, "step": 1285 }, { "epoch": 0.371610673004213, "grad_norm": 0.1609532074245431, "learning_rate": 1.5844435453041294e-05, "loss": 0.0675, "step": 1290 }, { "epoch": 0.3730510244499658, "grad_norm": 0.13842067239979322, "learning_rate": 1.5803548436636394e-05, "loss": 0.0655, "step": 1295 }, { "epoch": 0.37449137589571857, "grad_norm": 0.16642819320304691, "learning_rate": 1.576251459912814e-05, "loss": 0.069, "step": 1300 }, { "epoch": 0.37593172734147134, "grad_norm": 0.15113047288834136, "learning_rate": 1.5721334978611307e-05, "loss": 0.0673, "step": 1305 }, { "epoch": 0.37737207878722406, "grad_norm": 0.13337077100229278, "learning_rate": 1.5680010616868762e-05, "loss": 0.0635, "step": 1310 }, { "epoch": 0.37881243023297684, "grad_norm": 0.14914710569165873, "learning_rate": 1.5638542559345106e-05, "loss": 0.0619, "step": 1315 }, { "epoch": 0.3802527816787296, "grad_norm": 0.1716264208900402, "learning_rate": 1.559693185512023e-05, "loss": 0.0671, "step": 1320 }, { "epoch": 0.3816931331244824, "grad_norm": 0.14840365175842551, "learning_rate": 1.555517955688277e-05, "loss": 0.0633, "step": 1325 }, { "epoch": 0.38313348457023516, "grad_norm": 0.16640813297411597, "learning_rate": 1.5513286720903488e-05, "loss": 0.0707, "step": 1330 }, { "epoch": 0.3845738360159879, "grad_norm": 0.14134033460133827, "learning_rate": 1.5471254407008526e-05, "loss": 0.0611, "step": 1335 }, { "epoch": 0.38601418746174065, "grad_norm": 0.1959632020773705, "learning_rate": 1.542908367855263e-05, "loss": 0.066, "step": 1340 }, { "epoch": 0.38745453890749343, "grad_norm": 0.15059928506128456, "learning_rate": 1.53867756023922e-05, "loss": 0.0626, "step": 1345 }, { "epoch": 0.3888948903532462, "grad_norm": 0.14705027197162424, "learning_rate": 1.534433124885836e-05, "loss": 0.0612, "step": 1350 }, { "epoch": 0.390335241798999, "grad_norm": 0.1712504949998861, "learning_rate": 1.530175169172982e-05, "loss": 0.0652, "step": 1355 }, { "epoch": 0.3917755932447517, "grad_norm": 0.16428240182950052, "learning_rate": 1.525903800820575e-05, "loss": 0.0638, "step": 1360 }, { "epoch": 0.39321594469050447, "grad_norm": 0.16329645975133253, "learning_rate": 1.5216191278878522e-05, "loss": 0.0636, "step": 1365 }, { "epoch": 0.39465629613625725, "grad_norm": 0.13480884471675694, "learning_rate": 1.517321258770636e-05, "loss": 0.0613, "step": 1370 }, { "epoch": 0.39609664758201, "grad_norm": 0.1669515249018309, "learning_rate": 1.5130103021985929e-05, "loss": 0.0678, "step": 1375 }, { "epoch": 0.3975369990277628, "grad_norm": 0.15183706654976248, "learning_rate": 1.5086863672324826e-05, "loss": 0.0614, "step": 1380 }, { "epoch": 0.3989773504735155, "grad_norm": 0.16444568652959443, "learning_rate": 1.5043495632613982e-05, "loss": 0.0639, "step": 1385 }, { "epoch": 0.4004177019192683, "grad_norm": 1.6562960432751872, "learning_rate": 1.5000000000000002e-05, "loss": 0.0665, "step": 1390 }, { "epoch": 0.40185805336502106, "grad_norm": 0.14191330098363916, "learning_rate": 1.4956377874857395e-05, "loss": 0.0595, "step": 1395 }, { "epoch": 0.40329840481077384, "grad_norm": 0.14989109086985877, "learning_rate": 1.4912630360760743e-05, "loss": 0.0653, "step": 1400 }, { "epoch": 0.4047387562565266, "grad_norm": 0.15488669647541722, "learning_rate": 1.4868758564456785e-05, "loss": 0.0674, "step": 1405 }, { "epoch": 0.40617910770227933, "grad_norm": 0.13853389542189282, "learning_rate": 1.4824763595836404e-05, "loss": 0.0637, "step": 1410 }, { "epoch": 0.4076194591480321, "grad_norm": 0.157569240556149, "learning_rate": 1.4780646567906571e-05, "loss": 0.0605, "step": 1415 }, { "epoch": 0.4090598105937849, "grad_norm": 0.16007008642510542, "learning_rate": 1.473640859676217e-05, "loss": 0.0632, "step": 1420 }, { "epoch": 0.41050016203953765, "grad_norm": 0.15116913790513345, "learning_rate": 1.4692050801557769e-05, "loss": 0.0632, "step": 1425 }, { "epoch": 0.4119405134852904, "grad_norm": 0.14587678441383706, "learning_rate": 1.4647574304479295e-05, "loss": 0.0675, "step": 1430 }, { "epoch": 0.4133808649310432, "grad_norm": 0.1518975910563791, "learning_rate": 1.4602980230715674e-05, "loss": 0.0655, "step": 1435 }, { "epoch": 0.4148212163767959, "grad_norm": 0.14736642886138512, "learning_rate": 1.4558269708430333e-05, "loss": 0.0628, "step": 1440 }, { "epoch": 0.4162615678225487, "grad_norm": 0.16686599891471496, "learning_rate": 1.4513443868732674e-05, "loss": 0.0667, "step": 1445 }, { "epoch": 0.41770191926830147, "grad_norm": 0.1409423422519682, "learning_rate": 1.4468503845649462e-05, "loss": 0.0644, "step": 1450 }, { "epoch": 0.41914227071405424, "grad_norm": 0.14764702695382043, "learning_rate": 1.4423450776096122e-05, "loss": 0.0612, "step": 1455 }, { "epoch": 0.420582622159807, "grad_norm": 0.14452959429650908, "learning_rate": 1.4378285799848004e-05, "loss": 0.0677, "step": 1460 }, { "epoch": 0.42202297360555974, "grad_norm": 0.1573610389293893, "learning_rate": 1.4333010059511505e-05, "loss": 0.062, "step": 1465 }, { "epoch": 0.4234633250513125, "grad_norm": 0.1480919971980055, "learning_rate": 1.4287624700495211e-05, "loss": 0.0619, "step": 1470 }, { "epoch": 0.4249036764970653, "grad_norm": 0.15894026434712646, "learning_rate": 1.4242130870980882e-05, "loss": 0.0611, "step": 1475 }, { "epoch": 0.42634402794281806, "grad_norm": 0.15069243728870851, "learning_rate": 1.4196529721894427e-05, "loss": 0.064, "step": 1480 }, { "epoch": 0.42778437938857083, "grad_norm": 0.1682063987256438, "learning_rate": 1.4150822406876774e-05, "loss": 0.0639, "step": 1485 }, { "epoch": 0.42922473083432355, "grad_norm": 0.15006120941638998, "learning_rate": 1.4105010082254697e-05, "loss": 0.0665, "step": 1490 }, { "epoch": 0.43066508228007633, "grad_norm": 0.13479491661214907, "learning_rate": 1.4059093907011552e-05, "loss": 0.0617, "step": 1495 }, { "epoch": 0.4321054337258291, "grad_norm": 0.1715681331105154, "learning_rate": 1.401307504275796e-05, "loss": 0.0638, "step": 1500 }, { "epoch": 0.4321054337258291, "eval_loss": 0.06420264393091202, "eval_runtime": 204.1795, "eval_samples_per_second": 8.835, "eval_steps_per_second": 2.209, "step": 1500 }, { "epoch": 0.4335457851715819, "grad_norm": 0.1519666716391639, "learning_rate": 1.3966954653702423e-05, "loss": 0.0648, "step": 1505 }, { "epoch": 0.43498613661733465, "grad_norm": 0.14120235921333318, "learning_rate": 1.3920733906621861e-05, "loss": 0.0625, "step": 1510 }, { "epoch": 0.43642648806308737, "grad_norm": 0.15467790310945892, "learning_rate": 1.3874413970832123e-05, "loss": 0.064, "step": 1515 }, { "epoch": 0.43786683950884014, "grad_norm": 0.13984061839469614, "learning_rate": 1.3827996018158356e-05, "loss": 0.0605, "step": 1520 }, { "epoch": 0.4393071909545929, "grad_norm": 0.145852830960354, "learning_rate": 1.378148122290541e-05, "loss": 0.0655, "step": 1525 }, { "epoch": 0.4407475424003457, "grad_norm": 0.15286178112634702, "learning_rate": 1.3734870761828095e-05, "loss": 0.063, "step": 1530 }, { "epoch": 0.44218789384609847, "grad_norm": 0.1259636985152776, "learning_rate": 1.368816581410143e-05, "loss": 0.0615, "step": 1535 }, { "epoch": 0.4436282452918512, "grad_norm": 0.15476898913684597, "learning_rate": 1.3641367561290795e-05, "loss": 0.0644, "step": 1540 }, { "epoch": 0.44506859673760396, "grad_norm": 0.1617280661787022, "learning_rate": 1.3594477187322065e-05, "loss": 0.0614, "step": 1545 }, { "epoch": 0.44650894818335674, "grad_norm": 0.14998584782237406, "learning_rate": 1.3547495878451635e-05, "loss": 0.0635, "step": 1550 }, { "epoch": 0.4479492996291095, "grad_norm": 0.15207416735601922, "learning_rate": 1.3500424823236413e-05, "loss": 0.0669, "step": 1555 }, { "epoch": 0.4493896510748623, "grad_norm": 0.13476067626565744, "learning_rate": 1.3453265212503756e-05, "loss": 0.0606, "step": 1560 }, { "epoch": 0.450830002520615, "grad_norm": 0.15284821669337595, "learning_rate": 1.340601823932135e-05, "loss": 0.0624, "step": 1565 }, { "epoch": 0.4522703539663678, "grad_norm": 0.15463620126735736, "learning_rate": 1.335868509896702e-05, "loss": 0.0651, "step": 1570 }, { "epoch": 0.45371070541212055, "grad_norm": 0.15250686045319206, "learning_rate": 1.3311266988898477e-05, "loss": 0.0646, "step": 1575 }, { "epoch": 0.4551510568578733, "grad_norm": 0.13859935456186184, "learning_rate": 1.3263765108723061e-05, "loss": 0.0623, "step": 1580 }, { "epoch": 0.4565914083036261, "grad_norm": 0.1614173179165357, "learning_rate": 1.3216180660167355e-05, "loss": 0.0611, "step": 1585 }, { "epoch": 0.4580317597493789, "grad_norm": 0.16964451095340738, "learning_rate": 1.3168514847046802e-05, "loss": 0.0612, "step": 1590 }, { "epoch": 0.4594721111951316, "grad_norm": 0.1497788074170894, "learning_rate": 1.3120768875235252e-05, "loss": 0.0593, "step": 1595 }, { "epoch": 0.46091246264088437, "grad_norm": 0.12383289401240005, "learning_rate": 1.3072943952634446e-05, "loss": 0.0592, "step": 1600 }, { "epoch": 0.46235281408663714, "grad_norm": 0.146818726530369, "learning_rate": 1.3025041289143459e-05, "loss": 0.0605, "step": 1605 }, { "epoch": 0.4637931655323899, "grad_norm": 0.14306567032337267, "learning_rate": 1.2977062096628096e-05, "loss": 0.0605, "step": 1610 }, { "epoch": 0.4652335169781427, "grad_norm": 0.14605676143356022, "learning_rate": 1.2929007588890241e-05, "loss": 0.0612, "step": 1615 }, { "epoch": 0.4666738684238954, "grad_norm": 0.15086264411486566, "learning_rate": 1.2880878981637129e-05, "loss": 0.063, "step": 1620 }, { "epoch": 0.4681142198696482, "grad_norm": 0.15203796445266168, "learning_rate": 1.2832677492450602e-05, "loss": 0.0635, "step": 1625 }, { "epoch": 0.46955457131540096, "grad_norm": 0.1644232831015843, "learning_rate": 1.2784404340756315e-05, "loss": 0.0656, "step": 1630 }, { "epoch": 0.47099492276115373, "grad_norm": 0.1417073515720032, "learning_rate": 1.2736060747792877e-05, "loss": 0.0639, "step": 1635 }, { "epoch": 0.4724352742069065, "grad_norm": 0.14017747593627589, "learning_rate": 1.268764793658094e-05, "loss": 0.0677, "step": 1640 }, { "epoch": 0.4738756256526592, "grad_norm": 0.14349670833488368, "learning_rate": 1.2639167131892294e-05, "loss": 0.0575, "step": 1645 }, { "epoch": 0.475315977098412, "grad_norm": 0.1399709068160109, "learning_rate": 1.2590619560218851e-05, "loss": 0.0667, "step": 1650 }, { "epoch": 0.4767563285441648, "grad_norm": 0.16424535413090544, "learning_rate": 1.2542006449741631e-05, "loss": 0.0638, "step": 1655 }, { "epoch": 0.47819667998991755, "grad_norm": 0.13951898085012698, "learning_rate": 1.249332903029969e-05, "loss": 0.0636, "step": 1660 }, { "epoch": 0.4796370314356703, "grad_norm": 0.14482227044886414, "learning_rate": 1.2444588533358996e-05, "loss": 0.0623, "step": 1665 }, { "epoch": 0.48107738288142304, "grad_norm": 0.149830114351691, "learning_rate": 1.23957861919813e-05, "loss": 0.0631, "step": 1670 }, { "epoch": 0.4825177343271758, "grad_norm": 0.15277720600057818, "learning_rate": 1.2346923240792907e-05, "loss": 0.0656, "step": 1675 }, { "epoch": 0.4839580857729286, "grad_norm": 0.14145469398357127, "learning_rate": 1.229800091595347e-05, "loss": 0.0602, "step": 1680 }, { "epoch": 0.48539843721868137, "grad_norm": 0.145030274869654, "learning_rate": 1.2249020455124703e-05, "loss": 0.0605, "step": 1685 }, { "epoch": 0.48683878866443414, "grad_norm": 0.13292226014635264, "learning_rate": 1.2199983097439079e-05, "loss": 0.0576, "step": 1690 }, { "epoch": 0.48827914011018686, "grad_norm": 0.14980471445078763, "learning_rate": 1.2150890083468465e-05, "loss": 0.0613, "step": 1695 }, { "epoch": 0.48971949155593963, "grad_norm": 0.15237623136710496, "learning_rate": 1.2101742655192761e-05, "loss": 0.0641, "step": 1700 }, { "epoch": 0.4911598430016924, "grad_norm": 0.14853782098493126, "learning_rate": 1.2052542055968461e-05, "loss": 0.0635, "step": 1705 }, { "epoch": 0.4926001944474452, "grad_norm": 0.16064984443815533, "learning_rate": 1.2003289530497206e-05, "loss": 0.0664, "step": 1710 }, { "epoch": 0.49404054589319796, "grad_norm": 0.14592019591217842, "learning_rate": 1.1953986324794295e-05, "loss": 0.0621, "step": 1715 }, { "epoch": 0.4954808973389507, "grad_norm": 0.14899551585302004, "learning_rate": 1.1904633686157158e-05, "loss": 0.0661, "step": 1720 }, { "epoch": 0.49692124878470345, "grad_norm": 0.1662839926677418, "learning_rate": 1.1855232863133809e-05, "loss": 0.06, "step": 1725 }, { "epoch": 0.4983616002304562, "grad_norm": 0.14234586755951342, "learning_rate": 1.1805785105491247e-05, "loss": 0.0621, "step": 1730 }, { "epoch": 0.499801951676209, "grad_norm": 0.14261214561399946, "learning_rate": 1.1756291664183858e-05, "loss": 0.0649, "step": 1735 }, { "epoch": 0.5012423031219617, "grad_norm": 0.15004211923520408, "learning_rate": 1.1706753791321748e-05, "loss": 0.0619, "step": 1740 }, { "epoch": 0.5026826545677145, "grad_norm": 0.1605343977528051, "learning_rate": 1.1657172740139074e-05, "loss": 0.0612, "step": 1745 }, { "epoch": 0.5041230060134673, "grad_norm": 0.1447253493029267, "learning_rate": 1.1607549764962342e-05, "loss": 0.0634, "step": 1750 }, { "epoch": 0.5055633574592201, "grad_norm": 0.1346139878314749, "learning_rate": 1.1557886121178683e-05, "loss": 0.0664, "step": 1755 }, { "epoch": 0.5070037089049728, "grad_norm": 0.14646185824109836, "learning_rate": 1.1508183065204066e-05, "loss": 0.0621, "step": 1760 }, { "epoch": 0.5084440603507255, "grad_norm": 0.147637884179789, "learning_rate": 1.1458441854451539e-05, "loss": 0.0598, "step": 1765 }, { "epoch": 0.5098844117964784, "grad_norm": 0.1379823925555806, "learning_rate": 1.1408663747299409e-05, "loss": 0.0609, "step": 1770 }, { "epoch": 0.5113247632422311, "grad_norm": 0.1367073920260954, "learning_rate": 1.13588500030594e-05, "loss": 0.0625, "step": 1775 }, { "epoch": 0.5127651146879839, "grad_norm": 0.1358599716163257, "learning_rate": 1.130900188194481e-05, "loss": 0.0609, "step": 1780 }, { "epoch": 0.5142054661337366, "grad_norm": 0.1490120576953094, "learning_rate": 1.1259120645038612e-05, "loss": 0.0603, "step": 1785 }, { "epoch": 0.5156458175794894, "grad_norm": 0.1387405614374518, "learning_rate": 1.1209207554261573e-05, "loss": 0.0584, "step": 1790 }, { "epoch": 0.5170861690252422, "grad_norm": 0.1279767885162475, "learning_rate": 1.1159263872340293e-05, "loss": 0.06, "step": 1795 }, { "epoch": 0.5185265204709949, "grad_norm": 0.14033610983416972, "learning_rate": 1.1109290862775307e-05, "loss": 0.0559, "step": 1800 }, { "epoch": 0.5199668719167477, "grad_norm": 0.14071210053007446, "learning_rate": 1.1059289789809071e-05, "loss": 0.0591, "step": 1805 }, { "epoch": 0.5214072233625004, "grad_norm": 0.14899424536618833, "learning_rate": 1.1009261918394028e-05, "loss": 0.0573, "step": 1810 }, { "epoch": 0.5228475748082532, "grad_norm": 0.13803763706679575, "learning_rate": 1.0959208514160561e-05, "loss": 0.0643, "step": 1815 }, { "epoch": 0.524287926254006, "grad_norm": 0.14811878913823373, "learning_rate": 1.0909130843385009e-05, "loss": 0.0581, "step": 1820 }, { "epoch": 0.5257282776997587, "grad_norm": 0.14341490101278728, "learning_rate": 1.085903017295761e-05, "loss": 0.0575, "step": 1825 }, { "epoch": 0.5271686291455115, "grad_norm": 0.13792111506735585, "learning_rate": 1.0808907770350463e-05, "loss": 0.0593, "step": 1830 }, { "epoch": 0.5286089805912643, "grad_norm": 0.14395673234387163, "learning_rate": 1.0758764903585457e-05, "loss": 0.0609, "step": 1835 }, { "epoch": 0.530049332037017, "grad_norm": 0.1405053430609704, "learning_rate": 1.070860284120219e-05, "loss": 0.06, "step": 1840 }, { "epoch": 0.5314896834827698, "grad_norm": 0.15982572109675375, "learning_rate": 1.0658422852225889e-05, "loss": 0.0702, "step": 1845 }, { "epoch": 0.5329300349285225, "grad_norm": 0.14599264911552556, "learning_rate": 1.0608226206135292e-05, "loss": 0.0604, "step": 1850 }, { "epoch": 0.5343703863742754, "grad_norm": 0.14298260555858402, "learning_rate": 1.0558014172830537e-05, "loss": 0.0619, "step": 1855 }, { "epoch": 0.5358107378200281, "grad_norm": 0.13092410126397575, "learning_rate": 1.0507788022601033e-05, "loss": 0.057, "step": 1860 }, { "epoch": 0.5372510892657808, "grad_norm": 0.1453972372171612, "learning_rate": 1.0457549026093338e-05, "loss": 0.0585, "step": 1865 }, { "epoch": 0.5386914407115336, "grad_norm": 0.12317966498614596, "learning_rate": 1.0407298454278983e-05, "loss": 0.0564, "step": 1870 }, { "epoch": 0.5401317921572864, "grad_norm": 0.14436498244704113, "learning_rate": 1.0357037578422349e-05, "loss": 0.0645, "step": 1875 }, { "epoch": 0.5415721436030392, "grad_norm": 0.12970657416072792, "learning_rate": 1.0306767670048497e-05, "loss": 0.0589, "step": 1880 }, { "epoch": 0.5430124950487919, "grad_norm": 0.1367592762940604, "learning_rate": 1.0256490000910986e-05, "loss": 0.0621, "step": 1885 }, { "epoch": 0.5444528464945446, "grad_norm": 0.13370324714287835, "learning_rate": 1.0206205842959718e-05, "loss": 0.0626, "step": 1890 }, { "epoch": 0.5458931979402974, "grad_norm": 0.13187409015415524, "learning_rate": 1.0155916468308749e-05, "loss": 0.0614, "step": 1895 }, { "epoch": 0.5473335493860502, "grad_norm": 0.16314950628212277, "learning_rate": 1.0105623149204118e-05, "loss": 0.0642, "step": 1900 }, { "epoch": 0.548773900831803, "grad_norm": 0.15059666297472246, "learning_rate": 1.0055327157991639e-05, "loss": 0.0613, "step": 1905 }, { "epoch": 0.5502142522775557, "grad_norm": 0.1479385664820797, "learning_rate": 1.0005029767084739e-05, "loss": 0.0608, "step": 1910 }, { "epoch": 0.5516546037233084, "grad_norm": 0.14109693631194206, "learning_rate": 9.954732248932243e-06, "loss": 0.064, "step": 1915 }, { "epoch": 0.5530949551690613, "grad_norm": 0.13106823837594228, "learning_rate": 9.904435875986213e-06, "loss": 0.0584, "step": 1920 }, { "epoch": 0.554535306614814, "grad_norm": 0.14125376801616402, "learning_rate": 9.85414192066972e-06, "loss": 0.0619, "step": 1925 }, { "epoch": 0.5559756580605668, "grad_norm": 0.13715317817212974, "learning_rate": 9.803851655344682e-06, "loss": 0.0557, "step": 1930 }, { "epoch": 0.5574160095063195, "grad_norm": 0.12415591514509057, "learning_rate": 9.75356635227966e-06, "loss": 0.0605, "step": 1935 }, { "epoch": 0.5588563609520723, "grad_norm": 0.1471168680809387, "learning_rate": 9.70328728361769e-06, "loss": 0.0624, "step": 1940 }, { "epoch": 0.5602967123978251, "grad_norm": 0.13972784391125845, "learning_rate": 9.653015721344073e-06, "loss": 0.0625, "step": 1945 }, { "epoch": 0.5617370638435778, "grad_norm": 0.14230983317081475, "learning_rate": 9.602752937254215e-06, "loss": 0.0595, "step": 1950 }, { "epoch": 0.5631774152893306, "grad_norm": 0.17270902776994063, "learning_rate": 9.552500202921449e-06, "loss": 0.0633, "step": 1955 }, { "epoch": 0.5646177667350833, "grad_norm": 0.16874434258709992, "learning_rate": 9.502258789664865e-06, "loss": 0.0647, "step": 1960 }, { "epoch": 0.5660581181808362, "grad_norm": 0.14698351108372149, "learning_rate": 9.45202996851714e-06, "loss": 0.0652, "step": 1965 }, { "epoch": 0.5674984696265889, "grad_norm": 0.1583270327898255, "learning_rate": 9.401815010192388e-06, "loss": 0.0646, "step": 1970 }, { "epoch": 0.5689388210723416, "grad_norm": 0.1449159304203145, "learning_rate": 9.351615185054029e-06, "loss": 0.0613, "step": 1975 }, { "epoch": 0.5703791725180944, "grad_norm": 0.1365799756032449, "learning_rate": 9.301431763082623e-06, "loss": 0.059, "step": 1980 }, { "epoch": 0.5718195239638472, "grad_norm": 0.14707584276953978, "learning_rate": 9.251266013843757e-06, "loss": 0.0588, "step": 1985 }, { "epoch": 0.5732598754096, "grad_norm": 0.16188309788542712, "learning_rate": 9.201119206455927e-06, "loss": 0.0655, "step": 1990 }, { "epoch": 0.5747002268553527, "grad_norm": 0.1524865885491688, "learning_rate": 9.150992609558425e-06, "loss": 0.0608, "step": 1995 }, { "epoch": 0.5761405783011054, "grad_norm": 0.1439363997300933, "learning_rate": 9.10088749127926e-06, "loss": 0.0602, "step": 2000 }, { "epoch": 0.5761405783011054, "eval_loss": 0.06202974170446396, "eval_runtime": 197.6148, "eval_samples_per_second": 9.129, "eval_steps_per_second": 2.282, "step": 2000 }, { "epoch": 0.5775809297468583, "grad_norm": 0.13774654558009133, "learning_rate": 9.050805119203035e-06, "loss": 0.0574, "step": 2005 }, { "epoch": 0.579021281192611, "grad_norm": 0.1598545346397363, "learning_rate": 9.000746760338938e-06, "loss": 0.0644, "step": 2010 }, { "epoch": 0.5804616326383638, "grad_norm": 0.13573728248991526, "learning_rate": 8.950713681088647e-06, "loss": 0.0587, "step": 2015 }, { "epoch": 0.5819019840841165, "grad_norm": 0.14813992842491036, "learning_rate": 8.900707147214301e-06, "loss": 0.0627, "step": 2020 }, { "epoch": 0.5833423355298692, "grad_norm": 0.1440587796894665, "learning_rate": 8.850728423806487e-06, "loss": 0.061, "step": 2025 }, { "epoch": 0.5847826869756221, "grad_norm": 0.1358163633361261, "learning_rate": 8.800778775252221e-06, "loss": 0.0582, "step": 2030 }, { "epoch": 0.5862230384213748, "grad_norm": 0.14949783609406306, "learning_rate": 8.75085946520298e-06, "loss": 0.0673, "step": 2035 }, { "epoch": 0.5876633898671276, "grad_norm": 0.14548649497172944, "learning_rate": 8.700971756542707e-06, "loss": 0.06, "step": 2040 }, { "epoch": 0.5891037413128803, "grad_norm": 0.1482887794432985, "learning_rate": 8.65111691135589e-06, "loss": 0.0605, "step": 2045 }, { "epoch": 0.5905440927586331, "grad_norm": 0.14245894718321123, "learning_rate": 8.601296190895611e-06, "loss": 0.0592, "step": 2050 }, { "epoch": 0.5919844442043859, "grad_norm": 0.1299014295335545, "learning_rate": 8.551510855551658e-06, "loss": 0.0625, "step": 2055 }, { "epoch": 0.5934247956501386, "grad_norm": 0.14990167404663077, "learning_rate": 8.501762164818615e-06, "loss": 0.0615, "step": 2060 }, { "epoch": 0.5948651470958914, "grad_norm": 0.15687154771193254, "learning_rate": 8.452051377264025e-06, "loss": 0.0676, "step": 2065 }, { "epoch": 0.5963054985416442, "grad_norm": 0.14285391504986938, "learning_rate": 8.402379750496535e-06, "loss": 0.063, "step": 2070 }, { "epoch": 0.5977458499873969, "grad_norm": 0.16481562129681976, "learning_rate": 8.35274854113407e-06, "loss": 0.062, "step": 2075 }, { "epoch": 0.5991862014331497, "grad_norm": 0.13142064433487616, "learning_rate": 8.303159004772065e-06, "loss": 0.0603, "step": 2080 }, { "epoch": 0.6006265528789024, "grad_norm": 0.1515273341700813, "learning_rate": 8.253612395951697e-06, "loss": 0.0656, "step": 2085 }, { "epoch": 0.6020669043246553, "grad_norm": 0.17295436986543897, "learning_rate": 8.204109968128128e-06, "loss": 0.0655, "step": 2090 }, { "epoch": 0.603507255770408, "grad_norm": 0.14037313951418073, "learning_rate": 8.15465297363881e-06, "loss": 0.0662, "step": 2095 }, { "epoch": 0.6049476072161607, "grad_norm": 0.13747939963648245, "learning_rate": 8.105242663671807e-06, "loss": 0.0615, "step": 2100 }, { "epoch": 0.6063879586619135, "grad_norm": 0.12848974547228537, "learning_rate": 8.055880288234127e-06, "loss": 0.0615, "step": 2105 }, { "epoch": 0.6078283101076662, "grad_norm": 0.1429108532569868, "learning_rate": 8.006567096120103e-06, "loss": 0.0606, "step": 2110 }, { "epoch": 0.6092686615534191, "grad_norm": 0.13212576027139222, "learning_rate": 7.957304334879809e-06, "loss": 0.0644, "step": 2115 }, { "epoch": 0.6107090129991718, "grad_norm": 0.14626257804070625, "learning_rate": 7.908093250787496e-06, "loss": 0.0616, "step": 2120 }, { "epoch": 0.6121493644449245, "grad_norm": 0.1369876288040746, "learning_rate": 7.858935088810054e-06, "loss": 0.0643, "step": 2125 }, { "epoch": 0.6135897158906773, "grad_norm": 0.1562204322111819, "learning_rate": 7.809831092575528e-06, "loss": 0.0626, "step": 2130 }, { "epoch": 0.6150300673364301, "grad_norm": 0.1478800925724685, "learning_rate": 7.760782504341644e-06, "loss": 0.0643, "step": 2135 }, { "epoch": 0.6164704187821829, "grad_norm": 0.14017951944044135, "learning_rate": 7.7117905649644e-06, "loss": 0.0582, "step": 2140 }, { "epoch": 0.6179107702279356, "grad_norm": 0.18625663278817955, "learning_rate": 7.662856513866643e-06, "loss": 0.0567, "step": 2145 }, { "epoch": 0.6193511216736883, "grad_norm": 0.13723738608866848, "learning_rate": 7.613981589006754e-06, "loss": 0.0595, "step": 2150 }, { "epoch": 0.6207914731194412, "grad_norm": 0.17391455144126491, "learning_rate": 7.565167026847294e-06, "loss": 0.0577, "step": 2155 }, { "epoch": 0.6222318245651939, "grad_norm": 0.13637323163266152, "learning_rate": 7.5164140623237454e-06, "loss": 0.0583, "step": 2160 }, { "epoch": 0.6236721760109467, "grad_norm": 0.13490860613290204, "learning_rate": 7.467723928813262e-06, "loss": 0.0616, "step": 2165 }, { "epoch": 0.6251125274566994, "grad_norm": 0.13694189106475327, "learning_rate": 7.419097858103464e-06, "loss": 0.0584, "step": 2170 }, { "epoch": 0.6265528789024521, "grad_norm": 0.14264218273335677, "learning_rate": 7.370537080361282e-06, "loss": 0.0584, "step": 2175 }, { "epoch": 0.627993230348205, "grad_norm": 0.14133949369553425, "learning_rate": 7.3220428241018225e-06, "loss": 0.0565, "step": 2180 }, { "epoch": 0.6294335817939577, "grad_norm": 0.13811563164931434, "learning_rate": 7.273616316157312e-06, "loss": 0.0593, "step": 2185 }, { "epoch": 0.6308739332397105, "grad_norm": 0.14172526952252731, "learning_rate": 7.225258781646036e-06, "loss": 0.0563, "step": 2190 }, { "epoch": 0.6323142846854632, "grad_norm": 0.14556762141841273, "learning_rate": 7.176971443941359e-06, "loss": 0.0581, "step": 2195 }, { "epoch": 0.633754636131216, "grad_norm": 0.14455168465042534, "learning_rate": 7.128755524640771e-06, "loss": 0.0616, "step": 2200 }, { "epoch": 0.6351949875769688, "grad_norm": 0.14914771737576238, "learning_rate": 7.080612243534981e-06, "loss": 0.0632, "step": 2205 }, { "epoch": 0.6366353390227215, "grad_norm": 0.15500897160871308, "learning_rate": 7.032542818577066e-06, "loss": 0.0574, "step": 2210 }, { "epoch": 0.6380756904684743, "grad_norm": 0.12621800276033995, "learning_rate": 6.984548465851641e-06, "loss": 0.0578, "step": 2215 }, { "epoch": 0.6395160419142271, "grad_norm": 0.14453230217275803, "learning_rate": 6.93663039954412e-06, "loss": 0.0598, "step": 2220 }, { "epoch": 0.6409563933599798, "grad_norm": 0.12835711140154188, "learning_rate": 6.888789831909972e-06, "loss": 0.0609, "step": 2225 }, { "epoch": 0.6423967448057326, "grad_norm": 0.14406333134449356, "learning_rate": 6.841027973244077e-06, "loss": 0.0573, "step": 2230 }, { "epoch": 0.6438370962514853, "grad_norm": 0.14298113650078734, "learning_rate": 6.793346031850085e-06, "loss": 0.0596, "step": 2235 }, { "epoch": 0.6452774476972382, "grad_norm": 0.1351787448129904, "learning_rate": 6.745745214009866e-06, "loss": 0.0579, "step": 2240 }, { "epoch": 0.6467177991429909, "grad_norm": 0.1416662783049815, "learning_rate": 6.698226723952985e-06, "loss": 0.0601, "step": 2245 }, { "epoch": 0.6481581505887437, "grad_norm": 0.13123613630770536, "learning_rate": 6.65079176382623e-06, "loss": 0.0622, "step": 2250 }, { "epoch": 0.6495985020344964, "grad_norm": 0.13862477589129096, "learning_rate": 6.603441533663214e-06, "loss": 0.0641, "step": 2255 }, { "epoch": 0.6510388534802491, "grad_norm": 0.13893589833870676, "learning_rate": 6.556177231354003e-06, "loss": 0.0623, "step": 2260 }, { "epoch": 0.652479204926002, "grad_norm": 0.13047767245398073, "learning_rate": 6.509000052614824e-06, "loss": 0.0625, "step": 2265 }, { "epoch": 0.6539195563717547, "grad_norm": 0.14948071139040017, "learning_rate": 6.4619111909577995e-06, "loss": 0.0568, "step": 2270 }, { "epoch": 0.6553599078175075, "grad_norm": 0.16740289210607096, "learning_rate": 6.414911837660768e-06, "loss": 0.0593, "step": 2275 }, { "epoch": 0.6568002592632602, "grad_norm": 0.1392403260751934, "learning_rate": 6.36800318173714e-06, "loss": 0.0614, "step": 2280 }, { "epoch": 0.658240610709013, "grad_norm": 0.14713068471331234, "learning_rate": 6.321186409905812e-06, "loss": 0.0591, "step": 2285 }, { "epoch": 0.6596809621547658, "grad_norm": 0.15702296926939516, "learning_rate": 6.274462706561153e-06, "loss": 0.06, "step": 2290 }, { "epoch": 0.6611213136005185, "grad_norm": 0.14577218240146522, "learning_rate": 6.227833253743045e-06, "loss": 0.0577, "step": 2295 }, { "epoch": 0.6625616650462713, "grad_norm": 0.1347810878894777, "learning_rate": 6.181299231106963e-06, "loss": 0.0595, "step": 2300 }, { "epoch": 0.6640020164920241, "grad_norm": 0.13273221655238504, "learning_rate": 6.134861815894146e-06, "loss": 0.0569, "step": 2305 }, { "epoch": 0.6654423679377768, "grad_norm": 0.13814471094934377, "learning_rate": 6.08852218290181e-06, "loss": 0.0561, "step": 2310 }, { "epoch": 0.6668827193835296, "grad_norm": 0.13189040653435405, "learning_rate": 6.0422815044534265e-06, "loss": 0.0632, "step": 2315 }, { "epoch": 0.6683230708292823, "grad_norm": 0.15479186111317447, "learning_rate": 5.9961409503690605e-06, "loss": 0.0566, "step": 2320 }, { "epoch": 0.6697634222750352, "grad_norm": 0.15426426306388247, "learning_rate": 5.950101687935783e-06, "loss": 0.0615, "step": 2325 }, { "epoch": 0.6712037737207879, "grad_norm": 0.12977972650554861, "learning_rate": 5.904164881878143e-06, "loss": 0.064, "step": 2330 }, { "epoch": 0.6726441251665406, "grad_norm": 0.12489906355375564, "learning_rate": 5.858331694328686e-06, "loss": 0.0588, "step": 2335 }, { "epoch": 0.6740844766122934, "grad_norm": 0.14177743912393773, "learning_rate": 5.812603284798575e-06, "loss": 0.0559, "step": 2340 }, { "epoch": 0.6755248280580461, "grad_norm": 0.127402859937035, "learning_rate": 5.766980810148238e-06, "loss": 0.0585, "step": 2345 }, { "epoch": 0.676965179503799, "grad_norm": 0.14721828485950575, "learning_rate": 5.721465424558111e-06, "loss": 0.0569, "step": 2350 }, { "epoch": 0.6784055309495517, "grad_norm": 0.12933895802216824, "learning_rate": 5.676058279499438e-06, "loss": 0.0583, "step": 2355 }, { "epoch": 0.6798458823953044, "grad_norm": 0.2334988683393806, "learning_rate": 5.630760523705137e-06, "loss": 0.0577, "step": 2360 }, { "epoch": 0.6812862338410572, "grad_norm": 0.13307693046592253, "learning_rate": 5.585573303140741e-06, "loss": 0.0607, "step": 2365 }, { "epoch": 0.68272658528681, "grad_norm": 0.14681808383769718, "learning_rate": 5.540497760975412e-06, "loss": 0.0596, "step": 2370 }, { "epoch": 0.6841669367325628, "grad_norm": 0.14526949538273093, "learning_rate": 5.495535037553003e-06, "loss": 0.0575, "step": 2375 }, { "epoch": 0.6856072881783155, "grad_norm": 0.15463516823806045, "learning_rate": 5.450686270363244e-06, "loss": 0.063, "step": 2380 }, { "epoch": 0.6870476396240682, "grad_norm": 0.1370433070265185, "learning_rate": 5.405952594012921e-06, "loss": 0.0574, "step": 2385 }, { "epoch": 0.6884879910698211, "grad_norm": 0.13459031723410086, "learning_rate": 5.361335140197199e-06, "loss": 0.0616, "step": 2390 }, { "epoch": 0.6899283425155738, "grad_norm": 0.15099977742892481, "learning_rate": 5.316835037670987e-06, "loss": 0.0585, "step": 2395 }, { "epoch": 0.6913686939613266, "grad_norm": 0.1475383831706844, "learning_rate": 5.272453412220389e-06, "loss": 0.0606, "step": 2400 }, { "epoch": 0.6928090454070793, "grad_norm": 0.12116703992402802, "learning_rate": 5.228191386634212e-06, "loss": 0.0552, "step": 2405 }, { "epoch": 0.694249396852832, "grad_norm": 0.18353789252461628, "learning_rate": 5.184050080675558e-06, "loss": 0.0581, "step": 2410 }, { "epoch": 0.6956897482985849, "grad_norm": 0.15256328875682043, "learning_rate": 5.140030611053512e-06, "loss": 0.0583, "step": 2415 }, { "epoch": 0.6971300997443376, "grad_norm": 0.14298118193344345, "learning_rate": 5.096134091394879e-06, "loss": 0.0625, "step": 2420 }, { "epoch": 0.6985704511900904, "grad_norm": 0.15308814352700958, "learning_rate": 5.052361632216009e-06, "loss": 0.0563, "step": 2425 }, { "epoch": 0.7000108026358431, "grad_norm": 0.12806804658727294, "learning_rate": 5.008714340894716e-06, "loss": 0.0621, "step": 2430 }, { "epoch": 0.7014511540815959, "grad_norm": 0.1283153150885506, "learning_rate": 4.965193321642245e-06, "loss": 0.0639, "step": 2435 }, { "epoch": 0.7028915055273487, "grad_norm": 0.16204924222767175, "learning_rate": 4.9217996754753505e-06, "loss": 0.0632, "step": 2440 }, { "epoch": 0.7043318569731014, "grad_norm": 0.13720075582090696, "learning_rate": 4.878534500188443e-06, "loss": 0.0579, "step": 2445 }, { "epoch": 0.7057722084188542, "grad_norm": 0.1336786402162477, "learning_rate": 4.835398890325806e-06, "loss": 0.0602, "step": 2450 }, { "epoch": 0.707212559864607, "grad_norm": 0.15250075242936512, "learning_rate": 4.792393937153914e-06, "loss": 0.0587, "step": 2455 }, { "epoch": 0.7086529113103597, "grad_norm": 0.14586936476718837, "learning_rate": 4.749520728633812e-06, "loss": 0.0599, "step": 2460 }, { "epoch": 0.7100932627561125, "grad_norm": 0.13737451021327757, "learning_rate": 4.706780349393621e-06, "loss": 0.059, "step": 2465 }, { "epoch": 0.7115336142018652, "grad_norm": 0.14534033618919, "learning_rate": 4.664173880701065e-06, "loss": 0.06, "step": 2470 }, { "epoch": 0.7129739656476181, "grad_norm": 0.14705542529286947, "learning_rate": 4.62170240043614e-06, "loss": 0.0576, "step": 2475 }, { "epoch": 0.7144143170933708, "grad_norm": 0.1599807488079511, "learning_rate": 4.579366983063829e-06, "loss": 0.0597, "step": 2480 }, { "epoch": 0.7158546685391235, "grad_norm": 0.1345238151706872, "learning_rate": 4.537168699606928e-06, "loss": 0.0618, "step": 2485 }, { "epoch": 0.7172950199848763, "grad_norm": 0.1469249032470179, "learning_rate": 4.4951086176189666e-06, "loss": 0.0633, "step": 2490 }, { "epoch": 0.718735371430629, "grad_norm": 0.13426918611168998, "learning_rate": 4.453187801157155e-06, "loss": 0.0585, "step": 2495 }, { "epoch": 0.7201757228763819, "grad_norm": 0.14672886262551482, "learning_rate": 4.411407310755513e-06, "loss": 0.0599, "step": 2500 }, { "epoch": 0.7201757228763819, "eval_loss": 0.06022631376981735, "eval_runtime": 199.5971, "eval_samples_per_second": 9.038, "eval_steps_per_second": 2.26, "step": 2500 }, { "epoch": 0.7216160743221346, "grad_norm": 0.1432101633635202, "learning_rate": 4.369768203398014e-06, "loss": 0.0554, "step": 2505 }, { "epoch": 0.7230564257678873, "grad_norm": 0.14150540798087066, "learning_rate": 4.328271532491859e-06, "loss": 0.0606, "step": 2510 }, { "epoch": 0.7244967772136401, "grad_norm": 0.14496753728614564, "learning_rate": 4.2869183478408125e-06, "loss": 0.0601, "step": 2515 }, { "epoch": 0.7259371286593929, "grad_norm": 0.13903106714425764, "learning_rate": 4.2457096956186525e-06, "loss": 0.0608, "step": 2520 }, { "epoch": 0.7273774801051457, "grad_norm": 0.15285152111447173, "learning_rate": 4.2046466183427035e-06, "loss": 0.0653, "step": 2525 }, { "epoch": 0.7288178315508984, "grad_norm": 0.1266370772861076, "learning_rate": 4.163730154847462e-06, "loss": 0.0527, "step": 2530 }, { "epoch": 0.7302581829966511, "grad_norm": 0.13117482319657733, "learning_rate": 4.122961340258312e-06, "loss": 0.0559, "step": 2535 }, { "epoch": 0.731698534442404, "grad_norm": 0.14194653004592997, "learning_rate": 4.082341205965344e-06, "loss": 0.059, "step": 2540 }, { "epoch": 0.7331388858881567, "grad_norm": 0.15066840894513114, "learning_rate": 4.0418707795972575e-06, "loss": 0.0592, "step": 2545 }, { "epoch": 0.7345792373339095, "grad_norm": 0.14684421420293026, "learning_rate": 4.001551084995363e-06, "loss": 0.0627, "step": 2550 }, { "epoch": 0.7360195887796622, "grad_norm": 0.13640481951408895, "learning_rate": 3.961383142187691e-06, "loss": 0.0608, "step": 2555 }, { "epoch": 0.7374599402254151, "grad_norm": 0.14295407920295589, "learning_rate": 3.9213679673631745e-06, "loss": 0.0614, "step": 2560 }, { "epoch": 0.7389002916711678, "grad_norm": 0.1407341748583866, "learning_rate": 3.881506572845933e-06, "loss": 0.063, "step": 2565 }, { "epoch": 0.7403406431169205, "grad_norm": 0.14190189878723156, "learning_rate": 3.841799967069686e-06, "loss": 0.0657, "step": 2570 }, { "epoch": 0.7417809945626733, "grad_norm": 0.13086316062234218, "learning_rate": 3.8022491545522346e-06, "loss": 0.0625, "step": 2575 }, { "epoch": 0.743221346008426, "grad_norm": 0.15392265951922993, "learning_rate": 3.7628551358700303e-06, "loss": 0.0594, "step": 2580 }, { "epoch": 0.7446616974541789, "grad_norm": 0.1417233830083159, "learning_rate": 3.723618907632882e-06, "loss": 0.0613, "step": 2585 }, { "epoch": 0.7461020488999316, "grad_norm": 0.1454111678080567, "learning_rate": 3.6845414624587326e-06, "loss": 0.063, "step": 2590 }, { "epoch": 0.7475424003456843, "grad_norm": 0.13443095013823156, "learning_rate": 3.6456237889485547e-06, "loss": 0.0623, "step": 2595 }, { "epoch": 0.7489827517914371, "grad_norm": 0.12036202234264778, "learning_rate": 3.606866871661333e-06, "loss": 0.0567, "step": 2600 }, { "epoch": 0.7504231032371899, "grad_norm": 0.13114907916546412, "learning_rate": 3.5682716910891613e-06, "loss": 0.0586, "step": 2605 }, { "epoch": 0.7518634546829427, "grad_norm": 0.15054577661915033, "learning_rate": 3.5298392236324365e-06, "loss": 0.0608, "step": 2610 }, { "epoch": 0.7533038061286954, "grad_norm": 0.13218839559511986, "learning_rate": 3.491570441575154e-06, "loss": 0.0587, "step": 2615 }, { "epoch": 0.7547441575744481, "grad_norm": 0.1268003731336924, "learning_rate": 3.453466313060322e-06, "loss": 0.0601, "step": 2620 }, { "epoch": 0.756184509020201, "grad_norm": 0.14035220802374737, "learning_rate": 3.4155278020654502e-06, "loss": 0.0605, "step": 2625 }, { "epoch": 0.7576248604659537, "grad_norm": 0.1426120779032921, "learning_rate": 3.3777558683781795e-06, "loss": 0.0592, "step": 2630 }, { "epoch": 0.7590652119117065, "grad_norm": 0.12614500048877936, "learning_rate": 3.3401514675719815e-06, "loss": 0.0577, "step": 2635 }, { "epoch": 0.7605055633574592, "grad_norm": 0.13720240974898895, "learning_rate": 3.302715550982014e-06, "loss": 0.0593, "step": 2640 }, { "epoch": 0.7619459148032119, "grad_norm": 0.3087564092748528, "learning_rate": 3.2654490656810256e-06, "loss": 0.0566, "step": 2645 }, { "epoch": 0.7633862662489648, "grad_norm": 0.16717331588926376, "learning_rate": 3.228352954455406e-06, "loss": 0.0564, "step": 2650 }, { "epoch": 0.7648266176947175, "grad_norm": 0.14323201872042673, "learning_rate": 3.1914281557813386e-06, "loss": 0.0567, "step": 2655 }, { "epoch": 0.7662669691404703, "grad_norm": 0.1497485160314171, "learning_rate": 3.1546756038010507e-06, "loss": 0.0588, "step": 2660 }, { "epoch": 0.767707320586223, "grad_norm": 0.13401880002666341, "learning_rate": 3.1180962282991976e-06, "loss": 0.0606, "step": 2665 }, { "epoch": 0.7691476720319758, "grad_norm": 0.13208176864571497, "learning_rate": 3.081690954679313e-06, "loss": 0.0566, "step": 2670 }, { "epoch": 0.7705880234777286, "grad_norm": 0.12464358137882478, "learning_rate": 3.0454607039404206e-06, "loss": 0.0572, "step": 2675 }, { "epoch": 0.7720283749234813, "grad_norm": 0.13502277775724228, "learning_rate": 3.0094063926537233e-06, "loss": 0.0585, "step": 2680 }, { "epoch": 0.7734687263692341, "grad_norm": 0.13081561307511916, "learning_rate": 2.973528932939429e-06, "loss": 0.0564, "step": 2685 }, { "epoch": 0.7749090778149869, "grad_norm": 0.1268910010135427, "learning_rate": 2.937829232443654e-06, "loss": 0.0555, "step": 2690 }, { "epoch": 0.7763494292607396, "grad_norm": 0.13623501931518958, "learning_rate": 2.9023081943154753e-06, "loss": 0.0601, "step": 2695 }, { "epoch": 0.7777897807064924, "grad_norm": 0.14656323058804094, "learning_rate": 2.86696671718408e-06, "loss": 0.0564, "step": 2700 }, { "epoch": 0.7792301321522451, "grad_norm": 0.15181738159360775, "learning_rate": 2.8318056951360294e-06, "loss": 0.0566, "step": 2705 }, { "epoch": 0.780670483597998, "grad_norm": 0.16379931893275942, "learning_rate": 2.7968260176926407e-06, "loss": 0.0611, "step": 2710 }, { "epoch": 0.7821108350437507, "grad_norm": 0.13279024083856536, "learning_rate": 2.762028569787485e-06, "loss": 0.0632, "step": 2715 }, { "epoch": 0.7835511864895034, "grad_norm": 0.1514977510458698, "learning_rate": 2.7274142317439956e-06, "loss": 0.0579, "step": 2720 }, { "epoch": 0.7849915379352562, "grad_norm": 0.13057048011714373, "learning_rate": 2.6929838792532035e-06, "loss": 0.062, "step": 2725 }, { "epoch": 0.7864318893810089, "grad_norm": 0.13540271426738182, "learning_rate": 2.6587383833515746e-06, "loss": 0.0561, "step": 2730 }, { "epoch": 0.7878722408267618, "grad_norm": 0.1571274616935904, "learning_rate": 2.6246786103989887e-06, "loss": 0.0565, "step": 2735 }, { "epoch": 0.7893125922725145, "grad_norm": 0.13901055115200536, "learning_rate": 2.590805422056807e-06, "loss": 0.0581, "step": 2740 }, { "epoch": 0.7907529437182672, "grad_norm": 0.1292805259799327, "learning_rate": 2.5571196752660733e-06, "loss": 0.0582, "step": 2745 }, { "epoch": 0.79219329516402, "grad_norm": 0.14166775661867112, "learning_rate": 2.5236222222258455e-06, "loss": 0.0592, "step": 2750 }, { "epoch": 0.7936336466097728, "grad_norm": 0.14804665844069495, "learning_rate": 2.4903139103716365e-06, "loss": 0.0574, "step": 2755 }, { "epoch": 0.7950739980555256, "grad_norm": 0.13663548348085658, "learning_rate": 2.4571955823539617e-06, "loss": 0.0546, "step": 2760 }, { "epoch": 0.7965143495012783, "grad_norm": 0.16386374204432605, "learning_rate": 2.424268076017032e-06, "loss": 0.0582, "step": 2765 }, { "epoch": 0.797954700947031, "grad_norm": 0.13788750638406272, "learning_rate": 2.3915322243775564e-06, "loss": 0.054, "step": 2770 }, { "epoch": 0.7993950523927839, "grad_norm": 0.13024548818008383, "learning_rate": 2.3589888556036623e-06, "loss": 0.0596, "step": 2775 }, { "epoch": 0.8008354038385366, "grad_norm": 0.1434591335226828, "learning_rate": 2.3266387929939525e-06, "loss": 0.0595, "step": 2780 }, { "epoch": 0.8022757552842894, "grad_norm": 0.1323094772826723, "learning_rate": 2.294482854956672e-06, "loss": 0.0577, "step": 2785 }, { "epoch": 0.8037161067300421, "grad_norm": 0.13191375863763416, "learning_rate": 2.2625218549890014e-06, "loss": 0.0586, "step": 2790 }, { "epoch": 0.8051564581757948, "grad_norm": 0.1436295049255119, "learning_rate": 2.230756601656481e-06, "loss": 0.0614, "step": 2795 }, { "epoch": 0.8065968096215477, "grad_norm": 0.14620897997093418, "learning_rate": 2.1991878985725566e-06, "loss": 0.0549, "step": 2800 }, { "epoch": 0.8080371610673004, "grad_norm": 0.13487749668910298, "learning_rate": 2.167816544378244e-06, "loss": 0.0604, "step": 2805 }, { "epoch": 0.8094775125130532, "grad_norm": 0.14233905408675931, "learning_rate": 2.1366433327219284e-06, "loss": 0.0609, "step": 2810 }, { "epoch": 0.8109178639588059, "grad_norm": 0.13538036372546858, "learning_rate": 2.105669052239274e-06, "loss": 0.0604, "step": 2815 }, { "epoch": 0.8123582154045587, "grad_norm": 0.14197120797805796, "learning_rate": 2.0748944865333033e-06, "loss": 0.0584, "step": 2820 }, { "epoch": 0.8137985668503115, "grad_norm": 0.14183713480867427, "learning_rate": 2.0443204141545393e-06, "loss": 0.0581, "step": 2825 }, { "epoch": 0.8152389182960642, "grad_norm": 0.15177244791728164, "learning_rate": 2.013947608581327e-06, "loss": 0.0571, "step": 2830 }, { "epoch": 0.816679269741817, "grad_norm": 0.13725362054115173, "learning_rate": 1.983776838200262e-06, "loss": 0.0555, "step": 2835 }, { "epoch": 0.8181196211875698, "grad_norm": 0.13907159884961337, "learning_rate": 1.9538088662867495e-06, "loss": 0.0584, "step": 2840 }, { "epoch": 0.8195599726333225, "grad_norm": 0.13469702777956086, "learning_rate": 1.924044450985706e-06, "loss": 0.0608, "step": 2845 }, { "epoch": 0.8210003240790753, "grad_norm": 0.13554271157894637, "learning_rate": 1.8944843452923546e-06, "loss": 0.061, "step": 2850 }, { "epoch": 0.822440675524828, "grad_norm": 0.12070016414097579, "learning_rate": 1.8651292970332003e-06, "loss": 0.057, "step": 2855 }, { "epoch": 0.8238810269705809, "grad_norm": 0.14543851459148718, "learning_rate": 1.835980048847098e-06, "loss": 0.0609, "step": 2860 }, { "epoch": 0.8253213784163336, "grad_norm": 0.12894063460571178, "learning_rate": 1.8070373381664752e-06, "loss": 0.0558, "step": 2865 }, { "epoch": 0.8267617298620864, "grad_norm": 0.13829694305046422, "learning_rate": 1.77830189719866e-06, "loss": 0.0577, "step": 2870 }, { "epoch": 0.8282020813078391, "grad_norm": 0.23328526390899323, "learning_rate": 1.7497744529073712e-06, "loss": 0.0557, "step": 2875 }, { "epoch": 0.8296424327535918, "grad_norm": 0.14339826319501542, "learning_rate": 1.721455726994321e-06, "loss": 0.0617, "step": 2880 }, { "epoch": 0.8310827841993447, "grad_norm": 0.13288242940677378, "learning_rate": 1.6933464358809593e-06, "loss": 0.0531, "step": 2885 }, { "epoch": 0.8325231356450974, "grad_norm": 0.16090987532307297, "learning_rate": 1.6654472906903486e-06, "loss": 0.0557, "step": 2890 }, { "epoch": 0.8339634870908502, "grad_norm": 0.13565187898118353, "learning_rate": 1.637758997229173e-06, "loss": 0.0551, "step": 2895 }, { "epoch": 0.8354038385366029, "grad_norm": 0.13398237328307583, "learning_rate": 1.6102822559698828e-06, "loss": 0.0562, "step": 2900 }, { "epoch": 0.8368441899823557, "grad_norm": 0.12547576724680876, "learning_rate": 1.5830177620329712e-06, "loss": 0.058, "step": 2905 }, { "epoch": 0.8382845414281085, "grad_norm": 0.21201898166629826, "learning_rate": 1.5559662051694002e-06, "loss": 0.0595, "step": 2910 }, { "epoch": 0.8397248928738612, "grad_norm": 0.13854983530846096, "learning_rate": 1.5291282697431353e-06, "loss": 0.059, "step": 2915 }, { "epoch": 0.841165244319614, "grad_norm": 0.1412910572198349, "learning_rate": 1.502504634713835e-06, "loss": 0.057, "step": 2920 }, { "epoch": 0.8426055957653668, "grad_norm": 0.13150667676384337, "learning_rate": 1.4760959736196834e-06, "loss": 0.0566, "step": 2925 }, { "epoch": 0.8440459472111195, "grad_norm": 0.12609782628169966, "learning_rate": 1.4499029545603472e-06, "loss": 0.0571, "step": 2930 }, { "epoch": 0.8454862986568723, "grad_norm": 0.1556487705171841, "learning_rate": 1.423926240180068e-06, "loss": 0.0609, "step": 2935 }, { "epoch": 0.846926650102625, "grad_norm": 0.1388357105513542, "learning_rate": 1.3981664876509028e-06, "loss": 0.0592, "step": 2940 }, { "epoch": 0.8483670015483779, "grad_norm": 0.14513822447031188, "learning_rate": 1.3726243486560975e-06, "loss": 0.062, "step": 2945 }, { "epoch": 0.8498073529941306, "grad_norm": 0.14134009678939588, "learning_rate": 1.3473004693736037e-06, "loss": 0.0605, "step": 2950 }, { "epoch": 0.8512477044398833, "grad_norm": 0.13877865523926497, "learning_rate": 1.3221954904597256e-06, "loss": 0.0567, "step": 2955 }, { "epoch": 0.8526880558856361, "grad_norm": 0.1338985016688266, "learning_rate": 1.2973100470329159e-06, "loss": 0.0592, "step": 2960 }, { "epoch": 0.8541284073313888, "grad_norm": 0.13691724800191765, "learning_rate": 1.272644768657707e-06, "loss": 0.0552, "step": 2965 }, { "epoch": 0.8555687587771417, "grad_norm": 0.12943506117621348, "learning_rate": 1.248200279328784e-06, "loss": 0.0581, "step": 2970 }, { "epoch": 0.8570091102228944, "grad_norm": 0.1350768678776799, "learning_rate": 1.223977197455204e-06, "loss": 0.0596, "step": 2975 }, { "epoch": 0.8584494616686471, "grad_norm": 0.14454784322868913, "learning_rate": 1.1999761358447403e-06, "loss": 0.0595, "step": 2980 }, { "epoch": 0.8598898131143999, "grad_norm": 0.1480760428605418, "learning_rate": 1.1761977016883897e-06, "loss": 0.0559, "step": 2985 }, { "epoch": 0.8613301645601527, "grad_norm": 0.1298300829120424, "learning_rate": 1.152642496544998e-06, "loss": 0.0556, "step": 2990 }, { "epoch": 0.8627705160059055, "grad_norm": 0.14190562141399815, "learning_rate": 1.1293111163260639e-06, "loss": 0.0592, "step": 2995 }, { "epoch": 0.8642108674516582, "grad_norm": 0.1340146209373762, "learning_rate": 1.1062041512806409e-06, "loss": 0.0624, "step": 3000 }, { "epoch": 0.8642108674516582, "eval_loss": 0.05916040763258934, "eval_runtime": 198.9831, "eval_samples_per_second": 9.066, "eval_steps_per_second": 2.267, "step": 3000 }, { "epoch": 0.8656512188974109, "grad_norm": 0.1359785326923773, "learning_rate": 1.0833221859804188e-06, "loss": 0.0582, "step": 3005 }, { "epoch": 0.8670915703431638, "grad_norm": 0.1273647539914811, "learning_rate": 1.0606657993049253e-06, "loss": 0.0574, "step": 3010 }, { "epoch": 0.8685319217889165, "grad_norm": 0.14728516217233614, "learning_rate": 1.0382355644268871e-06, "loss": 0.062, "step": 3015 }, { "epoch": 0.8699722732346693, "grad_norm": 0.19933620610664268, "learning_rate": 1.0160320487977349e-06, "loss": 0.0569, "step": 3020 }, { "epoch": 0.871412624680422, "grad_norm": 0.15209153121626354, "learning_rate": 9.940558141332323e-07, "loss": 0.0619, "step": 3025 }, { "epoch": 0.8728529761261747, "grad_norm": 0.1310096696794578, "learning_rate": 9.723074163992774e-07, "loss": 0.0582, "step": 3030 }, { "epoch": 0.8742933275719276, "grad_norm": 0.13609449965932524, "learning_rate": 9.507874057978339e-07, "loss": 0.0569, "step": 3035 }, { "epoch": 0.8757336790176803, "grad_norm": 0.13290384955473775, "learning_rate": 9.294963267530177e-07, "loss": 0.0571, "step": 3040 }, { "epoch": 0.8771740304634331, "grad_norm": 0.1343134556624474, "learning_rate": 9.084347178973107e-07, "loss": 0.0571, "step": 3045 }, { "epoch": 0.8786143819091858, "grad_norm": 0.14323917922303897, "learning_rate": 8.876031120579454e-07, "loss": 0.0568, "step": 3050 }, { "epoch": 0.8800547333549386, "grad_norm": 0.13867733291744297, "learning_rate": 8.670020362434229e-07, "loss": 0.0575, "step": 3055 }, { "epoch": 0.8814950848006914, "grad_norm": 0.13288663406354173, "learning_rate": 8.466320116301752e-07, "loss": 0.0561, "step": 3060 }, { "epoch": 0.8829354362464441, "grad_norm": 0.13259296941859702, "learning_rate": 8.264935535493879e-07, "loss": 0.0555, "step": 3065 }, { "epoch": 0.8843757876921969, "grad_norm": 0.13951091711976848, "learning_rate": 8.065871714739581e-07, "loss": 0.0607, "step": 3070 }, { "epoch": 0.8858161391379497, "grad_norm": 0.12832449227606774, "learning_rate": 7.869133690056063e-07, "loss": 0.058, "step": 3075 }, { "epoch": 0.8872564905837024, "grad_norm": 0.1321296955840568, "learning_rate": 7.67472643862136e-07, "loss": 0.0557, "step": 3080 }, { "epoch": 0.8886968420294552, "grad_norm": 0.14306664554533388, "learning_rate": 7.482654878648465e-07, "loss": 0.0642, "step": 3085 }, { "epoch": 0.8901371934752079, "grad_norm": 0.12616475566967347, "learning_rate": 7.292923869260837e-07, "loss": 0.0609, "step": 3090 }, { "epoch": 0.8915775449209608, "grad_norm": 0.13027354471908487, "learning_rate": 7.105538210369467e-07, "loss": 0.0551, "step": 3095 }, { "epoch": 0.8930178963667135, "grad_norm": 0.12728728659733196, "learning_rate": 6.920502642551519e-07, "loss": 0.0577, "step": 3100 }, { "epoch": 0.8944582478124662, "grad_norm": 0.14539496144756697, "learning_rate": 6.737821846930403e-07, "loss": 0.0594, "step": 3105 }, { "epoch": 0.895898599258219, "grad_norm": 0.142416710805929, "learning_rate": 6.557500445057252e-07, "loss": 0.0618, "step": 3110 }, { "epoch": 0.8973389507039717, "grad_norm": 0.1372415423626945, "learning_rate": 6.379542998794086e-07, "loss": 0.0547, "step": 3115 }, { "epoch": 0.8987793021497246, "grad_norm": 0.14646165458637372, "learning_rate": 6.203954010198387e-07, "loss": 0.0598, "step": 3120 }, { "epoch": 0.9002196535954773, "grad_norm": 0.12761881782441253, "learning_rate": 6.030737921409169e-07, "loss": 0.0581, "step": 3125 }, { "epoch": 0.90166000504123, "grad_norm": 0.13614699389750304, "learning_rate": 5.859899114534662e-07, "loss": 0.0556, "step": 3130 }, { "epoch": 0.9031003564869828, "grad_norm": 0.1386025331106829, "learning_rate": 5.691441911541385e-07, "loss": 0.0582, "step": 3135 }, { "epoch": 0.9045407079327356, "grad_norm": 0.13232032514165884, "learning_rate": 5.525370574144873e-07, "loss": 0.0545, "step": 3140 }, { "epoch": 0.9059810593784884, "grad_norm": 0.12527580811393457, "learning_rate": 5.361689303701767e-07, "loss": 0.0533, "step": 3145 }, { "epoch": 0.9074214108242411, "grad_norm": 0.13604485662549864, "learning_rate": 5.200402241103674e-07, "loss": 0.0563, "step": 3150 }, { "epoch": 0.9088617622699939, "grad_norm": 0.12602730701179232, "learning_rate": 5.041513466672254e-07, "loss": 0.0528, "step": 3155 }, { "epoch": 0.9103021137157467, "grad_norm": 0.1331744306571229, "learning_rate": 4.885027000056075e-07, "loss": 0.0596, "step": 3160 }, { "epoch": 0.9117424651614994, "grad_norm": 0.13237107440276355, "learning_rate": 4.730946800128888e-07, "loss": 0.0586, "step": 3165 }, { "epoch": 0.9131828166072522, "grad_norm": 0.12937474830598486, "learning_rate": 4.5792767648895396e-07, "loss": 0.061, "step": 3170 }, { "epoch": 0.9146231680530049, "grad_norm": 0.1398098738438078, "learning_rate": 4.4300207313632713e-07, "loss": 0.0569, "step": 3175 }, { "epoch": 0.9160635194987578, "grad_norm": 0.1302735721399194, "learning_rate": 4.2831824755046994e-07, "loss": 0.0559, "step": 3180 }, { "epoch": 0.9175038709445105, "grad_norm": 0.1246210367618306, "learning_rate": 4.138765712102299e-07, "loss": 0.0564, "step": 3185 }, { "epoch": 0.9189442223902632, "grad_norm": 0.13332208166529505, "learning_rate": 3.9967740946843523e-07, "loss": 0.0554, "step": 3190 }, { "epoch": 0.920384573836016, "grad_norm": 0.14610411993837782, "learning_rate": 3.8572112154266593e-07, "loss": 0.0578, "step": 3195 }, { "epoch": 0.9218249252817687, "grad_norm": 0.15226138047700943, "learning_rate": 3.7200806050614714e-07, "loss": 0.0567, "step": 3200 }, { "epoch": 0.9232652767275216, "grad_norm": 0.1308225111781015, "learning_rate": 3.585385732788327e-07, "loss": 0.0563, "step": 3205 }, { "epoch": 0.9247056281732743, "grad_norm": 0.14684465929558974, "learning_rate": 3.453130006186234e-07, "loss": 0.0581, "step": 3210 }, { "epoch": 0.926145979619027, "grad_norm": 0.13266776739181627, "learning_rate": 3.3233167711274496e-07, "loss": 0.0601, "step": 3215 }, { "epoch": 0.9275863310647798, "grad_norm": 0.1358476730566447, "learning_rate": 3.1959493116928473e-07, "loss": 0.0576, "step": 3220 }, { "epoch": 0.9290266825105326, "grad_norm": 0.17850604077128118, "learning_rate": 3.0710308500888184e-07, "loss": 0.0603, "step": 3225 }, { "epoch": 0.9304670339562854, "grad_norm": 0.14331524298007126, "learning_rate": 2.948564546565791e-07, "loss": 0.0569, "step": 3230 }, { "epoch": 0.9319073854020381, "grad_norm": 0.13970764920849238, "learning_rate": 2.828553499338227e-07, "loss": 0.0555, "step": 3235 }, { "epoch": 0.9333477368477908, "grad_norm": 0.14419134050474353, "learning_rate": 2.71100074450632e-07, "loss": 0.0555, "step": 3240 }, { "epoch": 0.9347880882935437, "grad_norm": 0.1461736889168219, "learning_rate": 2.595909255979079e-07, "loss": 0.0572, "step": 3245 }, { "epoch": 0.9362284397392964, "grad_norm": 0.13124715065443018, "learning_rate": 2.4832819453992073e-07, "loss": 0.0591, "step": 3250 }, { "epoch": 0.9376687911850492, "grad_norm": 0.1306820541079532, "learning_rate": 2.3731216620693554e-07, "loss": 0.0605, "step": 3255 }, { "epoch": 0.9391091426308019, "grad_norm": 0.13542742296333235, "learning_rate": 2.2654311928800965e-07, "loss": 0.0536, "step": 3260 }, { "epoch": 0.9405494940765546, "grad_norm": 0.14758285988575107, "learning_rate": 2.1602132622393745e-07, "loss": 0.0606, "step": 3265 }, { "epoch": 0.9419898455223075, "grad_norm": 0.14532495800717846, "learning_rate": 2.0574705320036025e-07, "loss": 0.0639, "step": 3270 }, { "epoch": 0.9434301969680602, "grad_norm": 0.12954948070786576, "learning_rate": 1.9572056014103281e-07, "loss": 0.0579, "step": 3275 }, { "epoch": 0.944870548413813, "grad_norm": 0.14435133264907077, "learning_rate": 1.8594210070124852e-07, "loss": 0.0623, "step": 3280 }, { "epoch": 0.9463108998595657, "grad_norm": 0.13182234725830524, "learning_rate": 1.7641192226141913e-07, "loss": 0.0541, "step": 3285 }, { "epoch": 0.9477512513053185, "grad_norm": 1.4633024268191004, "learning_rate": 1.671302659208185e-07, "loss": 0.0572, "step": 3290 }, { "epoch": 0.9491916027510713, "grad_norm": 0.1503610098957111, "learning_rate": 1.58097366491482e-07, "loss": 0.061, "step": 3295 }, { "epoch": 0.950631954196824, "grad_norm": 0.1443388849896849, "learning_rate": 1.4931345249226792e-07, "loss": 0.0555, "step": 3300 }, { "epoch": 0.9520723056425768, "grad_norm": 0.12701776281849797, "learning_rate": 1.407787461430743e-07, "loss": 0.0628, "step": 3305 }, { "epoch": 0.9535126570883296, "grad_norm": 0.13129256088024213, "learning_rate": 1.324934633592201e-07, "loss": 0.0541, "step": 3310 }, { "epoch": 0.9549530085340823, "grad_norm": 0.1327785363196553, "learning_rate": 1.2445781374597842e-07, "loss": 0.0588, "step": 3315 }, { "epoch": 0.9563933599798351, "grad_norm": 0.13315365290665568, "learning_rate": 1.1667200059327644e-07, "loss": 0.0558, "step": 3320 }, { "epoch": 0.9578337114255878, "grad_norm": 0.14439945965305848, "learning_rate": 1.0913622087055264e-07, "loss": 0.0597, "step": 3325 }, { "epoch": 0.9592740628713406, "grad_norm": 0.13166148371196648, "learning_rate": 1.0185066522177545e-07, "loss": 0.0533, "step": 3330 }, { "epoch": 0.9607144143170934, "grad_norm": 0.1400270351664016, "learning_rate": 9.481551796061472e-08, "loss": 0.0591, "step": 3335 }, { "epoch": 0.9621547657628461, "grad_norm": 0.14119139667530914, "learning_rate": 8.803095706578335e-08, "loss": 0.0583, "step": 3340 }, { "epoch": 0.9635951172085989, "grad_norm": 0.13763855728395058, "learning_rate": 8.149715417653414e-08, "loss": 0.0567, "step": 3345 }, { "epoch": 0.9650354686543516, "grad_norm": 0.13842628044965613, "learning_rate": 7.521427458831776e-08, "loss": 0.0588, "step": 3350 }, { "epoch": 0.9664758201001045, "grad_norm": 0.12724872379714094, "learning_rate": 6.918247724859939e-08, "loss": 0.0541, "step": 3355 }, { "epoch": 0.9679161715458572, "grad_norm": 0.14898075476210107, "learning_rate": 6.340191475283753e-08, "loss": 0.06, "step": 3360 }, { "epoch": 0.9693565229916099, "grad_norm": 0.1347911477145421, "learning_rate": 5.787273334062593e-08, "loss": 0.0596, "step": 3365 }, { "epoch": 0.9707968744373627, "grad_norm": 0.13079385477539324, "learning_rate": 5.259507289199328e-08, "loss": 0.0619, "step": 3370 }, { "epoch": 0.9722372258831155, "grad_norm": 0.14313408319905535, "learning_rate": 4.756906692386043e-08, "loss": 0.0597, "step": 3375 }, { "epoch": 0.9736775773288683, "grad_norm": 0.13044063273707465, "learning_rate": 4.2794842586670884e-08, "loss": 0.0589, "step": 3380 }, { "epoch": 0.975117928774621, "grad_norm": 0.148098221291949, "learning_rate": 3.827252066116338e-08, "loss": 0.0547, "step": 3385 }, { "epoch": 0.9765582802203737, "grad_norm": 0.1344069973111941, "learning_rate": 3.400221555532768e-08, "loss": 0.0544, "step": 3390 }, { "epoch": 0.9779986316661265, "grad_norm": 0.12133249021447175, "learning_rate": 2.998403530150018e-08, "loss": 0.06, "step": 3395 }, { "epoch": 0.9794389831118793, "grad_norm": 0.13732750082544246, "learning_rate": 2.6218081553638363e-08, "loss": 0.0605, "step": 3400 }, { "epoch": 0.9808793345576321, "grad_norm": 0.13176353353269668, "learning_rate": 2.2704449584745046e-08, "loss": 0.0592, "step": 3405 }, { "epoch": 0.9823196860033848, "grad_norm": 0.13323188939537126, "learning_rate": 1.9443228284455882e-08, "loss": 0.0568, "step": 3410 }, { "epoch": 0.9837600374491375, "grad_norm": 0.1207130237315737, "learning_rate": 1.6434500156800037e-08, "loss": 0.0566, "step": 3415 }, { "epoch": 0.9852003888948904, "grad_norm": 0.12840927154274398, "learning_rate": 1.3678341318100751e-08, "loss": 0.0547, "step": 3420 }, { "epoch": 0.9866407403406431, "grad_norm": 0.15035865147658953, "learning_rate": 1.1174821495059106e-08, "loss": 0.0553, "step": 3425 }, { "epoch": 0.9880810917863959, "grad_norm": 0.1435557916571953, "learning_rate": 8.924004022986543e-09, "loss": 0.0555, "step": 3430 }, { "epoch": 0.9895214432321486, "grad_norm": 0.12838967785145278, "learning_rate": 6.9259458442005875e-09, "loss": 0.0614, "step": 3435 }, { "epoch": 0.9909617946779014, "grad_norm": 0.14083462190226395, "learning_rate": 5.180697506587118e-09, "loss": 0.0543, "step": 3440 }, { "epoch": 0.9924021461236542, "grad_norm": 0.12037599402495351, "learning_rate": 3.688303162322493e-09, "loss": 0.0568, "step": 3445 }, { "epoch": 0.9938424975694069, "grad_norm": 0.12739736699073684, "learning_rate": 2.44880056675334e-09, "loss": 0.0538, "step": 3450 }, { "epoch": 0.9952828490151597, "grad_norm": 0.1356848847097788, "learning_rate": 1.4622210774428714e-09, "loss": 0.0581, "step": 3455 }, { "epoch": 0.9967232004609125, "grad_norm": 0.1406371102406219, "learning_rate": 7.285896533770765e-10, "loss": 0.0577, "step": 3460 }, { "epoch": 0.9981635519066653, "grad_norm": 0.13450694809907887, "learning_rate": 2.479248543363344e-10, "loss": 0.0566, "step": 3465 }, { "epoch": 0.999603903352418, "grad_norm": 0.14028964451278583, "learning_rate": 2.0238840421349382e-11, "loss": 0.0579, "step": 3470 }, { "epoch": 0.9998919736415686, "step": 3471, "total_flos": 2.0789981999005696e+16, "train_loss": 0.0, "train_runtime": 2.6266, "train_samples_per_second": 15511.09, "train_steps_per_second": 242.135 } ], "logging_steps": 5, "max_steps": 636, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0789981999005696e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }